efro.debug
Utilities for debugging memory leaks or other issues.
IMPORTANT - these functions use the gc module which looks 'under the hood' at Python and sometimes returns not-fully-initialized objects, which may cause crashes or errors due to suddenly having references to them that they didn't expect, etc. See https://github.com/python/cpython/issues/59313. For this reason, these methods should NEVER be called in production code. Enable them only for debugging situations and be aware that their use may itself cause problems. The same is true for the gc module itself.
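For example, a typical interactive leak-hunting session might look like the sketch below (the 'Widget' type name is purely illustrative; do this only in debug builds):

import efro.debug as dbg

# Find live objects whose type name contains 'Widget' (hypothetical type).
objs = dbg.getobjs('Widget')

# See what is keeping the first match alive.
dbg.printrefs(objs[0], max_level=2)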
Module source:

# Released under the MIT License. See LICENSE for details.
#
"""Utilities for debugging memory leaks or other issues.

IMPORTANT - these functions use the gc module which looks 'under the hood'
at Python and sometimes returns not-fully-initialized objects, which may
cause crashes or errors due to suddenly having references to them that they
didn't expect, etc. See https://github.com/python/cpython/issues/59313.
For this reason, these methods should NEVER be called in production code.
Enable them only for debugging situations and be aware that their use may
itself cause problems. The same is true for the gc module itself.
"""
from __future__ import annotations

import gc
import sys
import time
import types
import weakref
import threading
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from typing import Any, TextIO

    from logging import Logger

ABS_MAX_LEVEL = 10

# NOTE: In general we want this toolset to allow us to explore
# which objects are holding references to others so we can diagnose
# leaks/etc. It is a bit tricky to do that, however, without
# affecting the objects we are looking at by adding temporary references
# from module dicts, function scopes, etc. So we need to try to be
# careful about cleaning up after ourselves and explicitly avoiding
# returning these temporary references wherever possible.

# A good test is running printrefs() repeatedly on some object that is
# known to be static. If the list of references or the ids of any of
# the listed references changes with each run, it's a good sign that
# we're showing some temporary objects that we should be ignoring.


def getobjs(
    cls: type | str, contains: str | None = None, expanded: bool = False
) -> list[Any]:
    """Return all garbage-collected objects matching criteria.

    'cls' can be an actual type or a string, in which case objects
    whose type names contain that string will be returned.

    If 'contains' is provided, objects will be filtered to those
    containing that in their str() representations.
    """

    # Don't wanna return stuff waiting to be garbage-collected.
    gc.collect()

    if not isinstance(cls, type | str):
        raise TypeError('Expected a type or string for cls')
    if not isinstance(contains, str | None):
        raise TypeError('Expected a string or None for contains')

    allobjs = _get_all_objects(expanded=expanded)

    if isinstance(cls, str):
        objs = [o for o in allobjs if cls in str(type(o))]
    else:
        objs = [o for o in allobjs if isinstance(o, cls)]
    if contains is not None:
        objs = [o for o in objs if contains in str(o)]

    return objs


# Recursively expand slist's objects into olist, using seen to track
# already processed objects.
def _getr(slist: list[Any], olist: list[Any], seen: set[int]) -> None:
    for obj in slist:
        if id(obj) in seen:
            continue
        seen.add(id(obj))
        olist.append(obj)
        tll = gc.get_referents(obj)
        if tll:
            _getr(tll, olist, seen)


def _get_all_objects(expanded: bool) -> list[Any]:
    """Return an expanded list of all objects.

    See https://utcc.utoronto.ca/~cks/space/blog/python/GetAllObjects
    """
    gcl = gc.get_objects()
    if not expanded:
        return gcl
    olist: list[Any] = []
    seen: set[int] = set()
    # Just in case:
    seen.add(id(gcl))
    seen.add(id(olist))
    seen.add(id(seen))
    # _getr does the real work.
    _getr(gcl, olist, seen)
    return olist


def getobj(objid: int, expanded: bool = False) -> Any:
    """Return a garbage-collected object by its id.

    Remember that this is VERY inefficient and should only ever be used
    for debugging.
    """
    if not isinstance(objid, int):
        raise TypeError(f'Expected an int for objid; got a {type(objid)}.')

    # Don't wanna return stuff waiting to be garbage-collected.
    gc.collect()

    allobjs = _get_all_objects(expanded=expanded)
    for obj in allobjs:
        if id(obj) == objid:
            return obj
    raise RuntimeError(f'Object with id {objid} not found.')


def getrefs(obj: Any) -> list[Any]:
    """Given an object, return things referencing it."""
    v = vars()  # Ignore ref coming from locals.
    return [o for o in gc.get_referrers(obj) if o is not v]


def printfiles(file: TextIO | None = None) -> None:
    """Print info about open files in the current app."""
    import io

    file = sys.stderr if file is None else file
    try:
        import psutil
    except ImportError:
        print(
            "Error: printfiles requires the 'psutil' module to be installed.",
            file=file,
        )
        return

    proc = psutil.Process()

    # Let's grab all Python file handles so we can associate raw files
    # with their Python objects when possible.
    fileio_ids = {obj.fileno(): obj for obj in getobjs(io.FileIO)}
    textio_ids = {obj.fileno(): obj for obj in getobjs(io.TextIOWrapper)}

    # FIXME: we could do a more limited version of this when psutil is
    # not present that simply includes Python's files.
    print('Files open by this app (not limited to Python\'s):', file=file)
    for i, ofile in enumerate(proc.open_files()):
        # Mypy doesn't know about mode apparently.
        # (and can't use type: ignore because we don't require psutil
        # and then mypy complains about unused ignore comment when it's
        # not present)
        mode = getattr(ofile, 'mode')
        assert isinstance(mode, str)
        textio = textio_ids.get(ofile.fd)
        textio_s = id(textio) if textio is not None else '<not found>'
        fileio = fileio_ids.get(ofile.fd)
        fileio_s = id(fileio) if fileio is not None else '<not found>'
        print(
            f'#{i+1}: path={ofile.path!r},'
            f' fd={ofile.fd}, mode={mode!r}, TextIOWrapper={textio_s},'
            f' FileIO={fileio_s}',
            file=file,
        )


def printrefs(
    obj: Any,
    max_level: int = 2,
    exclude_objs: list[Any] | None = None,
    expand_ids: list[int] | None = None,
    file: TextIO | None = None,
) -> None:
    """Print a human readable list of objects referring to an object.

    'max_level' specifies how many levels of recursion are printed.
    'exclude_objs' can be a list of exact objects to skip if found in the
    referrers list. This can be useful to avoid printing the local context
    where the object was passed in from (locals(), etc).
    'expand_ids' can be a list of object ids; if that particular object is
    found, it will always be expanded even if max_level has been reached.
    """
    _printrefs(
        obj,
        level=0,
        max_level=max_level,
        exclude_objs=[] if exclude_objs is None else exclude_objs,
        expand_ids=[] if expand_ids is None else expand_ids,
        file=sys.stderr if file is None else file,
    )


def printtypes(
    limit: int = 50, file: TextIO | None = None, expanded: bool = False
) -> None:
    """Print a human readable list of which types have the most instances."""
    assert limit > 0
    objtypes: dict[str, int] = {}
    gc.collect()  # Recommended before get_objects().
    allobjs = _get_all_objects(expanded=expanded)
    allobjc = len(allobjs)
    for obj in allobjs:
        modname = type(obj).__module__
        tpname = type(obj).__qualname__
        if modname != 'builtins':
            tpname = f'{modname}.{tpname}'
        objtypes[tpname] = objtypes.get(tpname, 0) + 1

    # Presumably allobjs contains stack-frame/dict type stuff
    # from this function call which in turn contain refs to allobjs.
    # Let's try to prevent these huge lists from accumulating until
    # the cyclical collector (hopefully) gets to them.
    allobjs.clear()
    del allobjs

    print(f'Types most allocated ({allobjc} total objects):', file=file)
    for i, tpitem in enumerate(
        sorted(objtypes.items(), key=lambda x: x[1], reverse=True)[:limit]
    ):
        tpname, tpval = tpitem
        percent = tpval / allobjc * 100.0
        print(f'{i+1}: {tpname}: {tpval} ({percent:.2f}%)', file=file)


def printsizes(
    limit: int = 50, file: TextIO | None = None, expanded: bool = False
) -> None:
    """Print total allocated sizes of different types."""
    assert limit > 0
    objsizes: dict[str, int] = {}
    gc.collect()  # Recommended before get_objects().
    allobjs = _get_all_objects(expanded=expanded)
    totalobjsize = 0

    for obj in allobjs:
        modname = type(obj).__module__
        tpname = type(obj).__qualname__
        if modname != 'builtins':
            tpname = f'{modname}.{tpname}'
        objsize = sys.getsizeof(obj)
        objsizes[tpname] = objsizes.get(tpname, 0) + objsize
        totalobjsize += objsize

    totalobjmb = totalobjsize / (1024 * 1024)
    print(
        f'Types with most allocated bytes ({totalobjmb:.2f} mb total):',
        file=file,
    )
    for i, tpitem in enumerate(
        sorted(objsizes.items(), key=lambda x: x[1], reverse=True)[:limit]
    ):
        tpname, tpval = tpitem
        percent = tpval / totalobjsize * 100.0
        print(f'{i+1}: {tpname}: {tpval} ({percent:.2f}%)', file=file)


def _desctype(obj: Any) -> str:
    cls = type(obj)
    # noinspection PyPep8
    if cls is types.ModuleType:
        return f'{type(obj).__name__} {obj.__name__}'
    # noinspection PyPep8
    if cls is types.MethodType:
        bnd = 'bound' if hasattr(obj, '__self__') else 'unbound'
        return f'{bnd} {type(obj).__name__} {obj.__name__}'
    return f'{type(obj).__name__}'


def _desc(obj: Any) -> str:
    extra: str | None = None
    if isinstance(obj, list | tuple):
        # Print length and the first few types.
        tps = [_desctype(i) for i in obj[:3]]
        tpsj = ', '.join(tps)
        tpss = (
            f', contains [{tpsj}, ...]'
            if len(obj) > 3
            else f', contains [{tpsj}]' if tps else ''
        )
        extra = f' (len {len(obj)}{tpss})'
    elif isinstance(obj, dict):
        # If it seems to be the vars() for a type or module,
        # try to identify what.
        for ref in getrefs(obj):
            if hasattr(ref, '__dict__') and vars(ref) is obj:
                extra = f' (vars for {_desctype(ref)} @ {id(ref)})'

        # Generic dict: print length and the first few key:type pairs.
        if extra is None:
            pairs = [
                f'{repr(n)}: {_desctype(v)}' for n, v in list(obj.items())[:3]
            ]
            pairsj = ', '.join(pairs)
            pairss = (
                f', contains {{{pairsj}, ...}}'
                if len(obj) > 3
                else f', contains {{{pairsj}}}' if pairs else ''
            )
            extra = f' (len {len(obj)}{pairss})'
    if extra is None:
        extra = ''
    return f'{_desctype(obj)} @ {id(obj)}{extra}'


def _printrefs(
    obj: Any,
    *,
    level: int,
    max_level: int,
    exclude_objs: list,
    expand_ids: list[int],
    file: TextIO,
) -> None:
    ind = ' ' * level
    print(ind + _desc(obj), file=file)
    v = vars()
    if level < max_level or (id(obj) in expand_ids and level < ABS_MAX_LEVEL):
        refs = getrefs(obj)
        for ref in refs:
            # It seems we tend to get a transient cell object with contents
            # set to obj. Would be nice to understand why that happens
            # but just ignoring it for now.
            if isinstance(ref, types.CellType) and ref.cell_contents is obj:
                continue

            # Ignore anything we were asked to ignore.
            if exclude_objs is not None:
                if any(ref is eobj for eobj in exclude_objs):
                    continue

            # Ignore references from our locals.
            if ref is v:
                continue

            # The 'refs' list we just made will be listed as a referrer
            # of this obj, so explicitly exclude it from the obj's listing.
            _printrefs(
                ref,
                level=level + 1,
                max_level=max_level,
                exclude_objs=exclude_objs + [refs],
                expand_ids=expand_ids,
                file=file,
            )


class DeadlockDumper:
    """Dumps thread states if still around after timeout seconds.

    This uses low level Python functionality so should still fire
    even in the case of deadlock.
    """

    # faulthandler has a single traceback-dump-later state, so only
    # one of us can control it at a time.
    lock = threading.Lock()
    watch_in_progress = False

    def __init__(self, timeout: float) -> None:
        import faulthandler

        cls = type(self)

        with cls.lock:
            if cls.watch_in_progress:
                print(
                    'Existing DeadlockDumper found; new one will be a no-op.',
                    file=sys.stderr,
                )
                self.active = False
                return

            # Ok; no watch is in progress; we can be the active one.
            cls.watch_in_progress = True
            self.active = True
            faulthandler.dump_traceback_later(timeout=timeout)

    def __del__(self) -> None:
        import faulthandler

        cls = type(self)

        # If we made it to here, call off the dump. But only if we are
        # the active dump.
        if self.active:
            faulthandler.cancel_dump_traceback_later()
            cls.watch_in_progress = False


class DeadlockWatcher:
    """Individual watcher for deadlock conditions.

    Use enable_deadlock_watchers() to enable this system.

    Next, use these wrapped in a with statement around some operation
    that may deadlock. If the with statement does not complete within the
    timeout period, a traceback of all threads will be dumped.

    Note that the checker thread runs a cycle every ~5 seconds, so
    something stuck needs to remain stuck for 5 seconds or so to be
    caught for sure.
    """

    watchers_lock: threading.Lock | None = None
    watchers: list[weakref.ref[DeadlockWatcher]] | None = None

    def __init__(
        self,
        timeout: float = 10.0,
        logger: Logger | None = None,
        logextra: dict | None = None,
    ) -> None:
        from efro.util import caller_source_location

        # pylint: disable=not-context-manager
        cls = type(self)
        if cls.watchers_lock is None or cls.watchers is None:
            print(
                'DeadlockWatcher created without watchers enabled.',
                file=sys.stderr,
            )
            return

        # All we do is record when we were made and how long till we
        # expire.
        self.create_time = time.monotonic()
        self.timeout = timeout
        self.noted_expire = False
        self.logger = logger
        self.logextra = logextra
        self.caller_source_loc = caller_source_location()
        curthread = threading.current_thread()
        self.thread_id = (
            '<unknown>'
            if curthread.ident is None
            else hex(curthread.ident).removeprefix('0x')
        )
        self.active = False

        with cls.watchers_lock:
            cls.watchers.append(weakref.ref(self))

    # Support the with statement.
    def __enter__(self) -> Any:
        self.active = True
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None:
        self.active = False

        # Print if we lived past our deadline. This is just an extra
        # data point. The watcher thread should be doing the actual
        # stack dumps/etc.
        if self.logger is None or self.logextra is None:
            return

        duration = time.monotonic() - self.create_time
        if duration > self.timeout:
            self.logger.error(
                'DeadlockWatcher %s at %s in thread %s lived %.2fs,'
                ' past timeout %.2fs. This should have triggered'
                ' a deadlock dump.',
                id(self),
                self.caller_source_loc,
                self.thread_id,
                duration,
                self.timeout,
                extra=self.logextra,
            )

    @classmethod
    def enable_deadlock_watchers(cls) -> None:
        """Spins up deadlock-watcher functionality.

        Must be explicitly called before any DeadlockWatchers are
        created.
        """
        assert cls.watchers_lock is None
        cls.watchers_lock = threading.Lock()
        assert cls.watchers is None
        cls.watchers = []

        threading.Thread(
            target=cls._deadlock_watcher_thread_main, daemon=True
        ).start()

    @classmethod
    def _deadlock_watcher_thread_main(cls) -> None:
        # pylint: disable=not-context-manager
        # pylint: disable=not-an-iterable
        assert cls.watchers_lock is not None and cls.watchers is not None

        # Spin in a loop checking our watchers periodically and dumping
        # state if any have timed out. The trick here is that we don't
        # explicitly dump state, but rather we set up a "dead man's
        # switch" that does so after some amount of time if we don't
        # explicitly cancel it. This way we'll hopefully get state dumps
        # even for things like total GIL deadlocks.
        while True:

            # Set a dead man's switch for this pass.
            dumper = DeadlockDumper(timeout=5.171)

            # Sleep most of the way through it but give ourselves time
            # to turn it off if we're still responsive.
            time.sleep(4.129)
            now = time.monotonic()

            found_fresh_expired = False

            # If any watcher is still active and expired, sleep past the
            # timeout to force the dumper to do its thing.
            with cls.watchers_lock:

                for wref in cls.watchers:
                    w = wref()
                    if (
                        w is not None
                        and now - w.create_time > w.timeout
                        and not w.noted_expire
                        and w.active
                    ):
                        # If they supplied a logger, let them know they
                        # should check stderr for a dump.
                        if w.logger is not None:
                            w.logger.error(
                                'DeadlockWatcher %s at %s in thread %s'
                                ' with time %.2f expired;'
                                ' check stderr for stack traces.',
                                id(w),
                                w.caller_source_loc,
                                w.thread_id,
                                w.timeout,
                                extra=w.logextra,
                            )
                        found_fresh_expired = True
                        w.noted_expire = True

                    # Important to clear this ref; otherwise we can keep
                    # a random watcher alive until our next time through.
                    w = None

                # Prune dead watchers and reset for the next pass.
                cls.watchers = [w for w in cls.watchers if w() is not None]

            if found_fresh_expired:
                # Push us over the dumper time limit which gives us a
                # lovely dump. Technically we could just do an immediate
                # dump here instead, but that would give us two paths to
                # maintain instead of this single one.
                time.sleep(2.0)

            # Call off the dump if it hasn't happened yet.
            del dumper
getobjs(cls: type | str, contains: str | None = None, expanded: bool = False) -> list[Any]
Return all garbage-collected objects matching criteria.
'cls' can be an actual type or a string, in which case objects whose type names contain that string will be returned.
If 'contains' is provided, objects will be filtered to those containing that in their str() representations.
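For instance (a hypothetical session; 'Widget' is an illustrative type name):

widgets = getobjs('Widget')               # objects whose type name contains 'Widget'
nodes = getobjs(dict, contains='player')  # dicts whose str() contains 'player'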
getobj(objid: int, expanded: bool = False) -> Any
Return a garbage-collected object by its id.
Remember that this is VERY inefficient and should only ever be used for debugging.
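A minimal sketch (the id value here is made up; in practice use an id printed by printrefs() or returned by id()):

obj = getobj(140039619204176)  # hypothetical id from an earlier printrefs() dump
print(type(obj), obj)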
getrefs(obj: Any) -> list[Any]
Given an object, return things referencing it.
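For example, to see what kinds of objects reference something you are investigating ('suspect' is an illustrative stand-in for any object):

suspect = getobjs('Widget')[0]  # hypothetical object under investigation
for ref in getrefs(suspect):
    print(type(ref))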
printfiles(file: TextIO | None = None) -> None
Print info about open files in the current app.
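Usage is simply the following; note that psutil must be installed or an error message is printed instead:

printfiles()            # report goes to stderr by default
printfiles(sys.stdout)  # or direct the report elsewhere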
printrefs(obj: Any, max_level: int = 2, exclude_objs: list[Any] | None = None, expand_ids: list[int] | None = None, file: TextIO | None = None) -> None
Print a human readable list of objects referring to an object.
'max_level' specifies how many levels of recursion are printed.
'exclude_objs' can be a list of exact objects to skip if found in the referrers list. This can be useful to avoid printing the local context where the object was passed in from (locals(), etc).
'expand_ids' can be a list of object ids; if that particular object is found, it will always be expanded even if max_level has been reached.
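A sketch of how these arguments combine (the 'suspect' object and the expand id are hypothetical):

printrefs(
    suspect,
    max_level=3,
    exclude_objs=[locals()],        # don't list the calling context's locals
    expand_ids=[140039619204176],   # made-up id to always expand if seen
)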
printtypes(limit: int = 50, file: TextIO | None = None, expanded: bool = False) -> None
Print a human readable list of which types have the most instances.
printsizes(limit: int = 50, file: TextIO | None = None, expanded: bool = False) -> None
Print total allocated sizes of different types.
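For example, a quick memory census (both functions default to the top 50 types; here we trim to 10):

printtypes(limit=10)  # which types have the most instances
printsizes(limit=10)  # which types account for the most bytes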
class DeadlockDumper(timeout: float)
Dumps thread states if still around after timeout seconds.
This uses low level Python functionality so should still fire even in the case of deadlock.
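A minimal direct-use sketch (the slow call is an illustrative placeholder; in practice DeadlockWatcher drives this for you):

dumper = DeadlockDumper(timeout=10.0)
possibly_hanging_call()  # hypothetical; if this hangs >10s, all thread stacks get dumped
del dumper  # finished in time; __del__ cancels the pending dump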
class DeadlockWatcher(timeout: float = 10.0, logger: Logger | None = None, logextra: dict | None = None)
Individual watcher for deadlock conditions.
Use enable_deadlock_watchers() to enable this system.
Next, use these wrapped in a with statement around some operation that may deadlock. If the with statement does not complete within the timeout period, a traceback of all threads will be dumped.
Note that the checker thread runs a cycle every ~5 seconds, so something stuck needs to remain stuck for 5 seconds or so to be caught for sure.
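A sketch of typical use (the blocking call is an illustrative stand-in for any operation that might deadlock):

import logging

logger = logging.getLogger(__name__)

# Once, at app startup:
DeadlockWatcher.enable_deadlock_watchers()

# Then, around suspect operations:
with DeadlockWatcher(timeout=10.0, logger=logger, logextra={}):
    some_lock.acquire()  # hypothetical call that could hang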
enable_deadlock_watchers() -> None  (classmethod)
Spins up deadlock-watcher functionality.
Must be explicitly called before any DeadlockWatchers are created.