summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRichard Purdie <richard.purdie@linuxfoundation.org>2025-03-03 12:57:28 +0000
committerSteve Sakoman <steve@sakoman.com>2025-03-13 07:21:43 -0700
commit2449dc88a0d67a121fb0df051a6e56d114d73f2e (patch)
tree06a8b515156777bb69b4289c0c966711a4b9b1a4
parent715043743e31e518cfa74451b81e020340327482 (diff)
downloadpoky-2449dc88a0d67a121fb0df051a6e56d114d73f2e.tar.gz
bitbake: event/utils: Avoid deadlock from lock_timeout() and recursive events
We've been seeing intermittent failures on Ubuntu 22.04 in oe-selftest which were problematic to debug. The failure was inside lock_timeout and once that was identified and the backtrace obtained, the problem becomes clearer: File "X/bitbake/lib/bb/server/process.py", line 466, in idle_thread_internal retval = function(self, data, False) File "X/bitbake/lib/bb/command.py", line 123, in runAsyncCommand self.cooker.updateCache() File "X/bitbake/lib/bb/cooker.py", line 1629, in updateCache self.parser = CookerParser(self, mcfilelist, total_masked) File "X/bitbake/lib/bb/cooker.py", line 2141, in __init__ self.bb_caches = bb.cache.MulticonfigCache(self.cfgbuilder, self.cfghash, cooker.caches_array) File "X/bitbake/lib/bb/cache.py", line 772, in __init__ loaded += c.prepare_cache(progress) File "X/bitbake/lib/bb/cache.py", line 435, in prepare_cache loaded = self.load_cachefile(progress) File "X/bitbake/lib/bb/cache.py", line 516, in load_cachefile progress(cachefile.tell() + previous_progress) File "X/bitbake/lib/bb/cache.py", line 751, in progress bb.event.fire(bb.event.CacheLoadProgress(current_progress, cachesize), File "X/bitbake/lib/bb/event.py", line 234, in fire fire_ui_handlers(event, d) File "X/bitbake/lib/bb/event.py", line 210, in fire_ui_handlers _ui_handlers[h].event.send(event) File "X/bitbake/lib/bb/cooker.py", line 117, in send str_event = codecs.encode(pickle.dumps(event), \'base64\').decode(\'utf-8\') File "/usr/lib/python3.10/asyncio/sslproto.py", line 320, in __del__ _warn(f"unclosed transport {self!r}", ResourceWarning, source=self) File "/usr/lib/python3.10/warnings.py", line 109, in _showwarnmsg sw(msg.message, msg.category, msg.filename, msg.lineno, File "X/bitbake/lib/bb/main.py", line 113, in _showwarning warnlog.warning(s) File "/usr/lib/python3.10/logging/__init__.py", line 1489, in warning self._log(WARNING, msg, args, **kwargs) File "/usr/lib/python3.10/logging/__init__.py", line 1624, in _log self.handle(record) File 
"/usr/lib/python3.10/logging/__init__.py", line 1634, in handle self.callHandlers(record) File "/usr/lib/python3.10/logging/__init__.py", line 1696, in callHandlers hdlr.handle(record) File "/usr/lib/python3.10/logging/__init__.py", line 968, in handle self.emit(record) File "X/bitbake/lib/bb/event.py", line 778, in emit fire(record, None) File "X/bitbake/lib/bb/event.py", line 234, in fire fire_ui_handlers(event, d) File "X/bitbake/lib/bb/event.py", line 197, in fire_ui_handlers with bb.utils.lock_timeout(_thread_lock): File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__ return next(self.gen) File "X/bitbake/lib/bb/utils.py", line 1888, in lock_timeout bb.server.process.serverlog("Couldn\'t get the lock for 5 mins, timed out, exiting. %s" % traceback.format_stack()) or put in simpler terms, whilst sending an event(), an unrelated warning message happens to be triggered from asyncio: /usr/lib/python3.10/asyncio/sslproto.py:320: ResourceWarning: unclosed transport <asyncio.sslproto._SSLProtocolTransport object at 0x7f0e797d3100> which triggers a second event() which can't be sent as we're already in the critical section and already hold the lock. That warning is due to the version of asyncio used on Ubuntu 22.04 with python 3.10 and that combined with timing issues explains why we don't see it on other python versions or distros. We can't handle the second event as the lock is there to serialise the events. Instead, we queue the event and then process the queue later. Add a new version of lock_timeout which allows us to handle the situation more gracefully. (Bitbake rev: 82b9f42126983579da03bdbb4e3ebf07346118a7) Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org> (cherry picked from commit 2c590ff1aff89d23b25ce808650f200013a1e6af) Signed-off-by: Steve Sakoman <steve@sakoman.com>
-rw-r--r--bitbake/lib/bb/event.py10
-rw-r--r--bitbake/lib/bb/utils.py15
2 files changed, 24 insertions, 1 deletions
diff --git a/bitbake/lib/bb/event.py b/bitbake/lib/bb/event.py
index 952c85c0bd..a12adbc937 100644
--- a/bitbake/lib/bb/event.py
+++ b/bitbake/lib/bb/event.py
@@ -194,7 +194,12 @@ def fire_ui_handlers(event, d):
194 ui_queue.append(event) 194 ui_queue.append(event)
195 return 195 return
196 196
197 with bb.utils.lock_timeout(_thread_lock): 197 with bb.utils.lock_timeout_nocheck(_thread_lock) as lock:
198 if not lock:
199 # If we can't get the lock, we may be recursively called, queue and return
200 ui_queue.append(event)
201 return
202
198 errors = [] 203 errors = []
199 for h in _ui_handlers: 204 for h in _ui_handlers:
200 #print "Sending event %s" % event 205 #print "Sending event %s" % event
@@ -213,6 +218,9 @@ def fire_ui_handlers(event, d):
213 for h in errors: 218 for h in errors:
214 del _ui_handlers[h] 219 del _ui_handlers[h]
215 220
221 while ui_queue:
222 fire_ui_handlers(ui_queue.pop(), d)
223
216def fire(event, d): 224def fire(event, d):
217 """Fire off an Event""" 225 """Fire off an Event"""
218 226
diff --git a/bitbake/lib/bb/utils.py b/bitbake/lib/bb/utils.py
index da026fe5bf..67e22f4389 100644
--- a/bitbake/lib/bb/utils.py
+++ b/bitbake/lib/bb/utils.py
@@ -1857,6 +1857,9 @@ def path_is_descendant(descendant, ancestor):
1857# If we don't have a timeout of some kind and a process/thread exits badly (for example 1857# If we don't have a timeout of some kind and a process/thread exits badly (for example
1858# OOM killed) and held a lock, we'd just hang in the lock futex forever. It is better 1858# OOM killed) and held a lock, we'd just hang in the lock futex forever. It is better
1859# we exit at some point than hang. 5 minutes with no progress means we're probably deadlocked. 1859# we exit at some point than hang. 5 minutes with no progress means we're probably deadlocked.
1860# This function can still deadlock python since it can't signal the other threads to exit
1861# (signals are handled in the main thread) and even os._exit() will wait on non-daemon threads
1862# to exit.
1860@contextmanager 1863@contextmanager
1861def lock_timeout(lock): 1864def lock_timeout(lock):
1862 try: 1865 try:
@@ -1869,3 +1872,15 @@ def lock_timeout(lock):
1869 finally: 1872 finally:
1870 lock.release() 1873 lock.release()
1871 signal.pthread_sigmask(signal.SIG_SETMASK, s) 1874 signal.pthread_sigmask(signal.SIG_SETMASK, s)
1875
1876# A version of lock_timeout without the check that the lock was locked and a shorter timeout
@contextmanager
def lock_timeout_nocheck(lock):
    """Try to take *lock* for up to 10 seconds with all signals blocked.

    Yields the result of ``lock.acquire(timeout=10)``: a true value when the
    lock was obtained, a false value when the attempt timed out. The caller
    must check the yielded value, since the body of the ``with`` statement
    runs in both cases. On exit the lock is released only if it was actually
    acquired, and the signal mask that was in effect on entry is restored.
    """
    # Block signals before entering the try block: if this call itself fails
    # there is no saved mask to restore and no lock to release.
    s = signal.pthread_sigmask(signal.SIG_BLOCK, signal.valid_signals())
    # Must be bound before acquire() runs. Otherwise an exception raised by
    # acquire() would make the finally clause fail with UnboundLocalError,
    # hiding the real error and leaving every signal blocked.
    l = False
    try:
        l = lock.acquire(timeout=10)
        yield l
    finally:
        if l:
            lock.release()
        signal.pthread_sigmask(signal.SIG_SETMASK, s)