diff options
author | Mikko Rapeli <mikko.rapeli@linaro.org> | 2023-02-15 16:50:40 +0200 |
---|---|---|
committer | Richard Purdie <richard.purdie@linuxfoundation.org> | 2023-02-23 12:11:29 +0000 |
commit | c7b05d56573cb3304ba05d375a54d0484da23425 (patch) | |
tree | 0af1449be28dcd2ff16b187991021e11e53e88db /meta | |
parent | 97b258985a59fd4cfe982f694f95018dcab22c61 (diff) | |
download | poky-c7b05d56573cb3304ba05d375a54d0484da23425.tar.gz |
oeqa ssh.py: fix hangs in run()
When qemu machine hangs, the ssh commands done by tests
are not timing out. do_testimage() task has last logs like this:
DEBUG: time: 1673531086.3155053, endtime: 1673531686.315502
The test process is stuck for hours, or for ever if the
executing command or test case did not set a timeout correctly.
The default 300 second timeout is not working when target hangs.
Note that timeout is really a "inactive timeout" since data returned
by the process will reset the timeout.
Make the process stdout non-blocking so read() will always return
right away using os.set_blocking() available in python 3.5 and later.
Then change from python codec reader to plain read() and make
the ssh subprocess stdout non-blocking. Even with select()
making sure the file had input to be read, the codec reader was
trying to find more stuff and blocking for ever when process hangs.
While at it, add a small timeout to read data in larger chunks if
possible. This avoids reading data one or few characters at a time
and makes the debug logs more readable.
close() the stdout file in all cases after read loop is complete.
Then make sure to wait or kill the ssh subprocess in all cases.
Just reading the output stream and receiving EOF there does not mean
that the process exited, and wait() needs a timeout if the process
is hanging. In the end kill the process and return the return value
and captured output utf-8 encoded, just like before these changes.
This fixes ssh run() related deadlocks when a qemu target hangs
completely.
(From OE-Core rev: 9c63970fce3a3d6029745252a6ec2bf9b9da862d)
Signed-off-by: Mikko Rapeli <mikko.rapeli@linaro.org>
Signed-off-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Diffstat (limited to 'meta')
-rw-r--r-- | meta/lib/oeqa/core/target/ssh.py | 39 |
1 files changed, 30 insertions, 9 deletions
diff --git a/meta/lib/oeqa/core/target/ssh.py b/meta/lib/oeqa/core/target/ssh.py index 2597256c91..51079075b5 100644 --- a/meta/lib/oeqa/core/target/ssh.py +++ b/meta/lib/oeqa/core/target/ssh.py | |||
@@ -226,27 +226,33 @@ def SSHCall(command, logger, timeout=None, **opts): | |||
226 | def run(): | 226 | def run(): |
227 | nonlocal output | 227 | nonlocal output |
228 | nonlocal process | 228 | nonlocal process |
229 | output_raw = b'' | ||
229 | starttime = time.time() | 230 | starttime = time.time() |
230 | process = subprocess.Popen(command, **options) | 231 | process = subprocess.Popen(command, **options) |
231 | if timeout: | 232 | if timeout: |
232 | endtime = starttime + timeout | 233 | endtime = starttime + timeout |
233 | eof = False | 234 | eof = False |
235 | os.set_blocking(process.stdout.fileno(), False) | ||
234 | while time.time() < endtime and not eof: | 236 | while time.time() < endtime and not eof: |
235 | logger.debug('time: %s, endtime: %s' % (time.time(), endtime)) | ||
236 | try: | 237 | try: |
238 | logger.debug('Waiting for process output: time: %s, endtime: %s' % (time.time(), endtime)) | ||
237 | if select.select([process.stdout], [], [], 5)[0] != []: | 239 | if select.select([process.stdout], [], [], 5)[0] != []: |
238 | reader = codecs.getreader('utf-8')(process.stdout, 'ignore') | 240 | # wait a bit for more data, tries to avoid reading single characters |
239 | data = reader.read(1024, 4096) | 241 | time.sleep(0.2) |
242 | data = process.stdout.read() | ||
240 | if not data: | 243 | if not data: |
241 | process.stdout.close() | ||
242 | eof = True | 244 | eof = True |
243 | else: | 245 | else: |
244 | output += data | 246 | output_raw += data |
245 | logger.debug('Partial data from SSH call:\n%s' % data) | 247 | # ignore errors to capture as much as possible |
248 | logger.debug('Partial data from SSH call:\n%s' % data.decode('utf-8', errors='ignore')) | ||
246 | endtime = time.time() + timeout | 249 | endtime = time.time() + timeout |
247 | except InterruptedError: | 250 | except InterruptedError: |
251 | logger.debug('InterruptedError') | ||
248 | continue | 252 | continue |
249 | 253 | ||
254 | process.stdout.close() | ||
255 | |||
250 | # process hasn't returned yet | 256 | # process hasn't returned yet |
251 | if not eof: | 257 | if not eof: |
252 | process.terminate() | 258 | process.terminate() |
@@ -254,6 +260,7 @@ def SSHCall(command, logger, timeout=None, **opts): | |||
254 | try: | 260 | try: |
255 | process.kill() | 261 | process.kill() |
256 | except OSError: | 262 | except OSError: |
263 | logger.debug('OSError when killing process') | ||
257 | pass | 264 | pass |
258 | endtime = time.time() - starttime | 265 | endtime = time.time() - starttime |
259 | lastline = ("\nProcess killed - no output for %d seconds. Total" | 266 | lastline = ("\nProcess killed - no output for %d seconds. Total" |
@@ -262,8 +269,21 @@ def SSHCall(command, logger, timeout=None, **opts): | |||
262 | output += lastline | 269 | output += lastline |
263 | 270 | ||
264 | else: | 271 | else: |
265 | output = process.communicate()[0].decode('utf-8', errors='ignore') | 272 | output_raw = process.communicate()[0] |
266 | logger.debug('Data from SSH call:\n%s' % output.rstrip()) | 273 | |
274 | output = output_raw.decode('utf-8', errors='ignore') | ||
275 | logger.debug('Data from SSH call:\n%s' % output.rstrip()) | ||
276 | |||
277 | # timout or not, make sure process exits and is not hanging | ||
278 | if process.returncode == None: | ||
279 | try: | ||
280 | process.wait(timeout=5) | ||
281 | except TimeoutExpired: | ||
282 | try: | ||
283 | process.kill() | ||
284 | except OSError: | ||
285 | logger.debug('OSError') | ||
286 | pass | ||
267 | 287 | ||
268 | options = { | 288 | options = { |
269 | "stdout": subprocess.PIPE, | 289 | "stdout": subprocess.PIPE, |
@@ -292,4 +312,5 @@ def SSHCall(command, logger, timeout=None, **opts): | |||
292 | process.kill() | 312 | process.kill() |
293 | logger.debug('Something went wrong, killing SSH process') | 313 | logger.debug('Something went wrong, killing SSH process') |
294 | raise | 314 | raise |
295 | return (process.wait(), output.rstrip()) | 315 | |
316 | return (process.returncode, output.rstrip()) | ||