summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMikko Rapeli <mikko.rapeli@linaro.org>2023-02-15 16:50:40 +0200
committerRichard Purdie <richard.purdie@linuxfoundation.org>2023-03-06 15:09:42 +0000
commitcdd824093f04abaeded6a28279b162a21ef510ed (patch)
tree59024934732eb382b5562054524f7dfaff1b3271
parent4a93913116c27df68ba091362e7ec0060cd520d1 (diff)
downloadpoky-cdd824093f04abaeded6a28279b162a21ef510ed.tar.gz
oeqa ssh.py: fix hangs in run()
When qemu machine hangs, the ssh commands done by tests are not timing out. do_testimage() task has last logs like this: DEBUG: time: 1673531086.3155053, endtime: 1673531686.315502 The test process is stuck for hours, or for ever if the executing command or test case did not set a timeout correctly. The default 300 second timeout is not working when target hangs. Note that timeout is really a "inactive timeout" since data returned by the process will reset the timeout. Make the process stdout non-blocking so read() will always return right away using os.set_blocking() available in python 3.5 and later. Then change from python codec reader to plain read() and make the ssh subprocess stdout non-blocking. Even with select() making sure the file had input to be read, the codec reader was trying to find more stuff and blocking for ever when process hangs. While at it, add a small timeout to read data in larger chunks if possible. This avoids reading data one or few characters at a time and makes the debug logs more readable. close() the stdout file in all cases after read loop is complete. Then make sure to wait or kill the ssh subprocess in all cases. Just reading the output stream and receiving EOF there does not mean that the process exited, and wait() needs a timeout if the process is hanging. In the end kill the process and return the return value and captured output utf-8 encoded, just like before these changes. This fixes ssh run() related deadlocks when a qemu target hangs completely. (From OE-Core rev: 3e1a4d572922eadc85ff6ac169722ad7ab118cf4) Signed-off-by: Mikko Rapeli <mikko.rapeli@linaro.org> Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com> (cherry picked from commit 9c63970fce3a3d6029745252a6ec2bf9b9da862d) Signed-off-by: Steve Sakoman <steve@sakoman.com> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
-rw-r--r--meta/lib/oeqa/core/target/ssh.py39
1 files changed, 30 insertions, 9 deletions
diff --git a/meta/lib/oeqa/core/target/ssh.py b/meta/lib/oeqa/core/target/ssh.py
index 48a463861d..4ab0cddb43 100644
--- a/meta/lib/oeqa/core/target/ssh.py
+++ b/meta/lib/oeqa/core/target/ssh.py
@@ -226,27 +226,33 @@ def SSHCall(command, logger, timeout=None, **opts):
226 def run(): 226 def run():
227 nonlocal output 227 nonlocal output
228 nonlocal process 228 nonlocal process
229 output_raw = b''
229 starttime = time.time() 230 starttime = time.time()
230 process = subprocess.Popen(command, **options) 231 process = subprocess.Popen(command, **options)
231 if timeout: 232 if timeout:
232 endtime = starttime + timeout 233 endtime = starttime + timeout
233 eof = False 234 eof = False
235 os.set_blocking(process.stdout.fileno(), False)
234 while time.time() < endtime and not eof: 236 while time.time() < endtime and not eof:
235 logger.debug('time: %s, endtime: %s' % (time.time(), endtime))
236 try: 237 try:
238 logger.debug('Waiting for process output: time: %s, endtime: %s' % (time.time(), endtime))
237 if select.select([process.stdout], [], [], 5)[0] != []: 239 if select.select([process.stdout], [], [], 5)[0] != []:
238 reader = codecs.getreader('utf-8')(process.stdout, 'ignore') 240 # wait a bit for more data, tries to avoid reading single characters
239 data = reader.read(1024, 4096) 241 time.sleep(0.2)
242 data = process.stdout.read()
240 if not data: 243 if not data:
241 process.stdout.close()
242 eof = True 244 eof = True
243 else: 245 else:
244 output += data 246 output_raw += data
245 logger.debug('Partial data from SSH call:\n%s' % data) 247 # ignore errors to capture as much as possible
248 logger.debug('Partial data from SSH call:\n%s' % data.decode('utf-8', errors='ignore'))
246 endtime = time.time() + timeout 249 endtime = time.time() + timeout
247 except InterruptedError: 250 except InterruptedError:
251 logger.debug('InterruptedError')
248 continue 252 continue
249 253
254 process.stdout.close()
255
250 # process hasn't returned yet 256 # process hasn't returned yet
251 if not eof: 257 if not eof:
252 process.terminate() 258 process.terminate()
@@ -254,6 +260,7 @@ def SSHCall(command, logger, timeout=None, **opts):
254 try: 260 try:
255 process.kill() 261 process.kill()
256 except OSError: 262 except OSError:
263 logger.debug('OSError when killing process')
257 pass 264 pass
258 endtime = time.time() - starttime 265 endtime = time.time() - starttime
259 lastline = ("\nProcess killed - no output for %d seconds. Total" 266 lastline = ("\nProcess killed - no output for %d seconds. Total"
@@ -262,8 +269,21 @@ def SSHCall(command, logger, timeout=None, **opts):
262 output += lastline 269 output += lastline
263 270
264 else: 271 else:
265 output = process.communicate()[0].decode('utf-8', errors='ignore') 272 output_raw = process.communicate()[0]
266 logger.debug('Data from SSH call:\n%s' % output.rstrip()) 273
274 output = output_raw.decode('utf-8', errors='ignore')
275 logger.debug('Data from SSH call:\n%s' % output.rstrip())
276
277 # timout or not, make sure process exits and is not hanging
278 if process.returncode == None:
279 try:
280 process.wait(timeout=5)
281 except TimeoutExpired:
282 try:
283 process.kill()
284 except OSError:
285 logger.debug('OSError')
286 pass
267 287
268 options = { 288 options = {
269 "stdout": subprocess.PIPE, 289 "stdout": subprocess.PIPE,
@@ -292,4 +312,5 @@ def SSHCall(command, logger, timeout=None, **opts):
292 process.kill() 312 process.kill()
293 logger.debug('Something went wrong, killing SSH process') 313 logger.debug('Something went wrong, killing SSH process')
294 raise 314 raise
295 return (process.wait(), output.rstrip()) 315
316 return (process.returncode, output.rstrip())