efro.debug

Utilities for debugging memory leaks or other issues.

IMPORTANT - these functions use the gc module which looks 'under the hood' at Python and sometimes returns not-fully-initialized objects, which may cause crashes or errors due to suddenly having references to them that they didn't expect, etc. See https://github.com/python/cpython/issues/59313. For this reason, these methods should NEVER be called in production code. Enable them only for debugging situations and be aware that their use may itself cause problems. The same is true for the gc module itself.

  1# Released under the MIT License. See LICENSE for details.
  2#
  3"""Utilities for debugging memory leaks or other issues.
  4
  5IMPORTANT - these functions use the gc module which looks 'under the hood'
  6at Python and sometimes returns not-fully-initialized objects, which may
  7cause crashes or errors due to suddenly having references to them that they
  8didn't expect, etc. See https://github.com/python/cpython/issues/59313.
  9For this reason, these methods should NEVER be called in production code.
 10Enable them only for debugging situations and be aware that their use may
 11itself cause problems. The same is true for the gc module itself.
 12"""
 13from __future__ import annotations
 14
 15import gc
 16import sys
 17import time
 18import types
 19import weakref
 20import threading
 21from typing import TYPE_CHECKING
 22
 23if TYPE_CHECKING:
 24    from typing import Any, TextIO
 25
 26    from logging import Logger
 27
 28ABS_MAX_LEVEL = 10
 29
 30# NOTE: In general we want this toolset to allow us to explore
 31# which objects are holding references to others so we can diagnose
 32# leaks/etc. It is a bit tricky to do that, however, without
 33# affecting the objects we are looking at by adding temporary references
 34# from module dicts, function scopes, etc. So we need to try to be
 35# careful about cleaning up after ourselves and explicitly avoiding
 36# returning these temporary references wherever possible.
 37
 38# A good test is running printrefs() repeatedly on some object that is
 39# known to be static. If the list of references or the ids or any
 40# the listed references changes with each run, it's a good sign that
 41# we're showing some temporary objects that we should be ignoring.
 42
 43
 44def getobjs(
 45    cls: type | str, contains: str | None = None, expanded: bool = False
 46) -> list[Any]:
 47    """Return all garbage-collected objects matching criteria.
 48
 49    'type' can be an actual type or a string in which case objects
 50    whose types contain that string will be returned.
 51
 52    If 'contains' is provided, objects will be filtered to those
 53    containing that in their str() representations.
 54    """
 55
 56    # Don't wanna return stuff waiting to be garbage-collected.
 57    gc.collect()
 58
 59    if not isinstance(cls, type | str):
 60        raise TypeError('Expected a type or string for cls')
 61    if not isinstance(contains, str | None):
 62        raise TypeError('Expected a string or None for contains')
 63
 64    allobjs = _get_all_objects(expanded=expanded)
 65
 66    if isinstance(cls, str):
 67        objs = [o for o in allobjs if cls in str(type(o))]
 68    else:
 69        objs = [o for o in allobjs if isinstance(o, cls)]
 70    if contains is not None:
 71        objs = [o for o in objs if contains in str(o)]
 72
 73    return objs
 74
 75
 76# Recursively expand slists objects into olist, using seen to track
 77# already processed objects.
 78def _getr(slist: list[Any], olist: list[Any], seen: set[int]) -> None:
 79    for obj in slist:
 80        if id(obj) in seen:
 81            continue
 82        seen.add(id(obj))
 83        olist.append(obj)
 84        tll = gc.get_referents(obj)
 85        if tll:
 86            _getr(tll, olist, seen)
 87
 88
 89def _get_all_objects(expanded: bool) -> list[Any]:
 90    """Return an expanded list of all objects.
 91
 92    See https://utcc.utoronto.ca/~cks/space/blog/python/GetAllObjects
 93    """
 94    gcl = gc.get_objects()
 95    if not expanded:
 96        return gcl
 97    olist: list[Any] = []
 98    seen: set[int] = set()
 99    # Just in case:
100    seen.add(id(gcl))
101    seen.add(id(olist))
102    seen.add(id(seen))
103    # _getr does the real work.
104    _getr(gcl, olist, seen)
105    return olist
106
107
108def getobj(objid: int, expanded: bool = False) -> Any:
109    """Return a garbage-collected object by its id.
110
111    Remember that this is VERY inefficient and should only ever be used
112    for debugging.
113    """
114    if not isinstance(objid, int):
115        raise TypeError(f'Expected an int for objid; got a {type(objid)}.')
116
117    # Don't wanna return stuff waiting to be garbage-collected.
118    gc.collect()
119
120    allobjs = _get_all_objects(expanded=expanded)
121    for obj in allobjs:
122        if id(obj) == objid:
123            return obj
124    raise RuntimeError(f'Object with id {objid} not found.')
125
126
127def getrefs(obj: Any) -> list[Any]:
128    """Given an object, return things referencing it."""
129    v = vars()  # Ignore ref coming from locals.
130    return [o for o in gc.get_referrers(obj) if o is not v]
131
132
133def printfiles(file: TextIO | None = None) -> None:
134    """Print info about open files in the current app."""
135    import io
136
137    file = sys.stderr if file is None else file
138    try:
139        import psutil
140    except ImportError:
141        print(
142            "Error: printfiles requires the 'psutil' module to be installed.",
143            file=file,
144        )
145        return
146
147    proc = psutil.Process()
148
149    # Let's grab all Python file handles so we can associate raw files
150    # with their Python objects when possible.
151    fileio_ids = {obj.fileno(): obj for obj in getobjs(io.FileIO)}
152    textio_ids = {obj.fileno(): obj for obj in getobjs(io.TextIOWrapper)}
153
154    # FIXME: we could do a more limited version of this when psutil is
155    # not present that simply includes Python's files.
156    print('Files open by this app (not limited to Python\'s):', file=file)
157    for i, ofile in enumerate(proc.open_files()):
158        # Mypy doesn't know about mode apparently.
159        # (and can't use type: ignore because we don't require psutil
160        # and then mypy complains about unused ignore comment when its
161        # not present)
162        mode = getattr(ofile, 'mode')
163        assert isinstance(mode, str)
164        textio = textio_ids.get(ofile.fd)
165        textio_s = id(textio) if textio is not None else '<not found>'
166        fileio = fileio_ids.get(ofile.fd)
167        fileio_s = id(fileio) if fileio is not None else '<not found>'
168        print(
169            f'#{i+1}: path={ofile.path!r},'
170            f' fd={ofile.fd}, mode={mode!r}, TextIOWrapper={textio_s},'
171            f' FileIO={fileio_s}'
172        )
173
174
175def printrefs(
176    obj: Any,
177    max_level: int = 2,
178    exclude_objs: list[Any] | None = None,
179    expand_ids: list[int] | None = None,
180    file: TextIO | None = None,
181) -> None:
182    """Print human readable list of objects referring to an object.
183
184    'max_level' specifies how many levels of recursion are printed.
185    'exclude_objs' can be a list of exact objects to skip if found in the
186      referrers list. This can be useful to avoid printing the local context
187      where the object was passed in from (locals(), etc).
188    'expand_ids' can be a list of object ids; if that particular object is
189      found, it will always be expanded even if max_level has been reached.
190    """
191    _printrefs(
192        obj,
193        level=0,
194        max_level=max_level,
195        exclude_objs=[] if exclude_objs is None else exclude_objs,
196        expand_ids=[] if expand_ids is None else expand_ids,
197        file=sys.stderr if file is None else file,
198    )
199
200
201def printtypes(
202    limit: int = 50, file: TextIO | None = None, expanded: bool = False
203) -> None:
204    """Print a human readable list of which types have the most instances."""
205    assert limit > 0
206    objtypes: dict[str, int] = {}
207    gc.collect()  # Recommended before get_objects().
208    allobjs = _get_all_objects(expanded=expanded)
209    allobjc = len(allobjs)
210    for obj in allobjs:
211        modname = type(obj).__module__
212        tpname = type(obj).__qualname__
213        if modname != 'builtins':
214            tpname = f'{modname}.{tpname}'
215        objtypes[tpname] = objtypes.get(tpname, 0) + 1
216
217    # Presumably allobjs contains stack-frame/dict type stuff
218    # from this function call which in turn contain refs to allobjs.
219    # Let's try to prevent these huge lists from accumulating until
220    # the cyclical collector (hopefully) gets to them.
221    allobjs.clear()
222    del allobjs
223
224    print(f'Types most allocated ({allobjc} total objects):', file=file)
225    for i, tpitem in enumerate(
226        sorted(objtypes.items(), key=lambda x: x[1], reverse=True)[:limit]
227    ):
228        tpname, tpval = tpitem
229        percent = tpval / allobjc * 100.0
230        print(f'{i+1}: {tpname}: {tpval} ({percent:.2f}%)', file=file)
231
232
233def printsizes(
234    limit: int = 50, file: TextIO | None = None, expanded: bool = False
235) -> None:
236    """Print total allocated sizes of different types."""
237    assert limit > 0
238    objsizes: dict[str, int] = {}
239    gc.collect()  # Recommended before get_objects().
240    allobjs = _get_all_objects(expanded=expanded)
241    totalobjsize = 0
242
243    for obj in allobjs:
244        modname = type(obj).__module__
245        tpname = type(obj).__qualname__
246        if modname != 'builtins':
247            tpname = f'{modname}.{tpname}'
248        objsize = sys.getsizeof(obj)
249        objsizes[tpname] = objsizes.get(tpname, 0) + objsize
250        totalobjsize += objsize
251
252    totalobjmb = totalobjsize / (1024 * 1024)
253    print(
254        f'Types with most allocated bytes ({totalobjmb:.2f} mb total):',
255        file=file,
256    )
257    for i, tpitem in enumerate(
258        sorted(objsizes.items(), key=lambda x: x[1], reverse=True)[:limit]
259    ):
260        tpname, tpval = tpitem
261        percent = tpval / totalobjsize * 100.0
262        print(f'{i+1}: {tpname}: {tpval} ({percent:.2f}%)', file=file)
263
264
265def _desctype(obj: Any) -> str:
266    cls = type(obj)
267    # noinspection PyPep8
268    if cls is types.ModuleType:
269        return f'{type(obj).__name__} {obj.__name__}'
270    # noinspection PyPep8
271    if cls is types.MethodType:
272        bnd = 'bound' if hasattr(obj, '__self__') else 'unbound'
273        return f'{bnd} {type(obj).__name__} {obj.__name__}'
274    return f'{type(obj).__name__}'
275
276
277def _desc(obj: Any) -> str:
278    extra: str | None = None
279    if isinstance(obj, list | tuple):
280        # Print length and the first few types.
281        tps = [_desctype(i) for i in obj[:3]]
282        tpsj = ', '.join(tps)
283        tpss = (
284            f', contains [{tpsj}, ...]'
285            if len(obj) > 3
286            else f', contains [{tpsj}]' if tps else ''
287        )
288        extra = f' (len {len(obj)}{tpss})'
289    elif isinstance(obj, dict):
290        # If it seems to be the vars() for a type or module,
291        # try to identify what.
292        for ref in getrefs(obj):
293            if hasattr(ref, '__dict__') and vars(ref) is obj:
294                extra = f' (vars for {_desctype(ref)} @ {id(ref)})'
295
296        # Generic dict: print length and the first few key:type pairs.
297        if extra is None:
298            pairs = [
299                f'{repr(n)}: {_desctype(v)}' for n, v in list(obj.items())[:3]
300            ]
301            pairsj = ', '.join(pairs)
302            pairss = (
303                f', contains {{{pairsj}, ...}}'
304                if len(obj) > 3
305                else f', contains {{{pairsj}}}' if pairs else ''
306            )
307            extra = f' (len {len(obj)}{pairss})'
308    if extra is None:
309        extra = ''
310    return f'{_desctype(obj)} @ {id(obj)}{extra}'
311
312
313def _printrefs(
314    obj: Any,
315    *,
316    level: int,
317    max_level: int,
318    exclude_objs: list,
319    expand_ids: list[int],
320    file: TextIO,
321) -> None:
322    ind = '  ' * level
323    print(ind + _desc(obj), file=file)
324    v = vars()
325    if level < max_level or (id(obj) in expand_ids and level < ABS_MAX_LEVEL):
326        refs = getrefs(obj)
327        for ref in refs:
328            # It seems we tend to get a transient cell object with contents
329            # set to obj. Would be nice to understand why that happens
330            # but just ignoring it for now.
331            if isinstance(ref, types.CellType) and ref.cell_contents is obj:
332                continue
333
334            # Ignore anything we were asked to ignore.
335            if exclude_objs is not None:
336                if any(ref is eobj for eobj in exclude_objs):
337                    continue
338
339            # Ignore references from our locals.
340            if ref is v:
341                continue
342
343            # The 'refs' list we just made will be listed as a referrer
344            # of this obj, so explicitly exclude it from the obj's listing.
345            _printrefs(
346                ref,
347                level=level + 1,
348                max_level=max_level,
349                exclude_objs=exclude_objs + [refs],
350                expand_ids=expand_ids,
351                file=file,
352            )
353
354
355class DeadlockDumper:
356    """Dumps thread states if still around after timeout seconds.
357
358    This uses low level Python functionality so should still fire
359    even in the case of deadlock.
360    """
361
362    # faulthandler has a single traceback-dump-later state, so only
363    # one of us can control it at a time.
364    lock = threading.Lock()
365    watch_in_progress = False
366
367    def __init__(self, timeout: float) -> None:
368        import faulthandler
369
370        cls = type(self)
371
372        with cls.lock:
373            if cls.watch_in_progress:
374                print(
375                    'Existing DeadlockDumper found; new one will be a no-op.',
376                    file=sys.stderr,
377                )
378                self.active = False
379                return
380
381            # Ok; no watch is in progress; we can be the active one.
382            cls.watch_in_progress = True
383            self.active = True
384            faulthandler.dump_traceback_later(timeout=timeout)
385
386    def __del__(self) -> None:
387        import faulthandler
388
389        cls = type(self)
390
391        # If we made it to here, call off the dump. But only if we are
392        # the active dump.
393        if self.active:
394            faulthandler.cancel_dump_traceback_later()
395            cls.watch_in_progress = False
396
397
398class DeadlockWatcher:
399    """Individual watcher for deadlock conditions.
400
401    Use the enable_deadlock_watchers() to enable this system.
402
403    Next, use these wrapped in a with statement around some operation
404    that may deadlock. If the with statement does not complete within the
405    timeout period, a traceback of all threads will be dumped.
406
407    Note that the checker thread runs a cycle every ~5 seconds, so
408    something stuck needs to remain stuck for 5 seconds or so to be
409    caught for sure.
410    """
411
412    watchers_lock: threading.Lock | None = None
413    watchers: list[weakref.ref[DeadlockWatcher]] | None = None
414
415    def __init__(
416        self,
417        timeout: float = 10.0,
418        logger: Logger | None = None,
419        logextra: dict | None = None,
420    ) -> None:
421        from efro.util import caller_source_location
422
423        # pylint: disable=not-context-manager
424        cls = type(self)
425        if cls.watchers_lock is None or cls.watchers is None:
426            print(
427                'DeadlockWatcher created without watchers enabled.',
428                file=sys.stderr,
429            )
430            return
431
432        # All we do is record when we were made and how long till we
433        # expire.
434        self.create_time = time.monotonic()
435        self.timeout = timeout
436        self.noted_expire = False
437        self.logger = logger
438        self.logextra = logextra
439        self.caller_source_loc = caller_source_location()
440        curthread = threading.current_thread()
441        self.thread_id = (
442            '<unknown>'
443            if curthread.ident is None
444            else hex(curthread.ident).removeprefix('0x')
445        )
446        self.active = False
447
448        with cls.watchers_lock:
449            cls.watchers.append(weakref.ref(self))
450
451    # Support the with statement.
452    def __enter__(self) -> Any:
453        self.active = True
454        return self
455
456    def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None:
457        self.active = False
458
459        # Print if we lived past our deadline. This is just an extra
460        # data point. The watcher thread should be doing the actual
461        # stack dumps/etc.
462        if self.logger is None or self.logextra is None:
463            return
464
465        duration = time.monotonic() - self.create_time
466        if duration > self.timeout:
467            self.logger.error(
468                'DeadlockWatcher %s at %s in thread %s lived %.2fs,'
469                ' past timeout %.2fs. This should have triggered'
470                ' a deadlock dump.',
471                id(self),
472                self.caller_source_loc,
473                self.thread_id,
474                duration,
475                self.timeout,
476                extra=self.logextra,
477            )
478
479    @classmethod
480    def enable_deadlock_watchers(cls) -> None:
481        """Spins up deadlock-watcher functionality.
482
483        Must be explicitly called before any DeadlockWatchers are
484        created.
485        """
486        assert cls.watchers_lock is None
487        cls.watchers_lock = threading.Lock()
488        assert cls.watchers is None
489        cls.watchers = []
490
491        threading.Thread(
492            target=cls._deadlock_watcher_thread_main, daemon=True
493        ).start()
494
495    @classmethod
496    def _deadlock_watcher_thread_main(cls) -> None:
497        # pylint: disable=not-context-manager
498        # pylint: disable=not-an-iterable
499        assert cls.watchers_lock is not None and cls.watchers is not None
500
501        # Spin in a loop checking our watchers periodically and dumping
502        # state if any have timed out. The trick here is that we don't
503        # explicitly dump state, but rather we set up a "dead man's
504        # switch" that does so after some amount of time if we don't
505        # explicitly cancel it. This way we'll hopefully get state dumps
506        # even for things like total GIL deadlocks.
507        while True:
508
509            # Set a dead man's switch for this pass.
510            dumper = DeadlockDumper(timeout=5.171)
511
512            # Sleep most of the way through it but give ourselves time
513            # to turn it off if we're still responsive.
514            time.sleep(4.129)
515            now = time.monotonic()
516
517            found_fresh_expired = False
518
519            # If any watcher is still active and expired, sleep past the
520            # timeout to force the dumper to do its thing.
521            with cls.watchers_lock:
522
523                for wref in cls.watchers:
524                    w = wref()
525                    if (
526                        w is not None
527                        and now - w.create_time > w.timeout
528                        and not w.noted_expire
529                        and w.active
530                    ):
531                        # If they supplied a logger, let them know they
532                        # should check stderr for a dump.
533                        if w.logger is not None:
534                            w.logger.error(
535                                'DeadlockWatcher %s at %s in thread %s'
536                                ' with time %.2f expired;'
537                                ' check stderr for stack traces.',
538                                id(w),
539                                w.caller_source_loc,
540                                w.thread_id,
541                                w.timeout,
542                                extra=w.logextra,
543                            )
544                        found_fresh_expired = True
545                        w.noted_expire = True
546
547                    # Important to clear this ref; otherwise we can keep
548                    # a random watcher alive until our next time through.
549                    w = None
550
551                # Prune dead watchers and reset for the next pass.
552                cls.watchers = [w for w in cls.watchers if w() is not None]
553
554            if found_fresh_expired:
555                # Push us over the dumper time limit which give us a
556                # lovely dump. Technically we could just do an immediate
557                # dump here instead, but that would give us two paths to
558                # maintain instead of this single one.
559                time.sleep(2.0)
560
561            # Call off the dump if it hasn't happened yet.
562            del dumper
ABS_MAX_LEVEL = 10
def getobjs( cls: type | str, contains: str | None = None, expanded: bool = False) -> list[typing.Any]:
45def getobjs(
46    cls: type | str, contains: str | None = None, expanded: bool = False
47) -> list[Any]:
48    """Return all garbage-collected objects matching criteria.
49
50    'type' can be an actual type or a string in which case objects
51    whose types contain that string will be returned.
52
53    If 'contains' is provided, objects will be filtered to those
54    containing that in their str() representations.
55    """
56
57    # Don't wanna return stuff waiting to be garbage-collected.
58    gc.collect()
59
60    if not isinstance(cls, type | str):
61        raise TypeError('Expected a type or string for cls')
62    if not isinstance(contains, str | None):
63        raise TypeError('Expected a string or None for contains')
64
65    allobjs = _get_all_objects(expanded=expanded)
66
67    if isinstance(cls, str):
68        objs = [o for o in allobjs if cls in str(type(o))]
69    else:
70        objs = [o for o in allobjs if isinstance(o, cls)]
71    if contains is not None:
72        objs = [o for o in objs if contains in str(o)]
73
74    return objs

Return all garbage-collected objects matching criteria.

'type' can be an actual type or a string in which case objects whose types contain that string will be returned.

If 'contains' is provided, objects will be filtered to those containing that in their str() representations.

def getobj(objid: int, expanded: bool = False) -> Any:
109def getobj(objid: int, expanded: bool = False) -> Any:
110    """Return a garbage-collected object by its id.
111
112    Remember that this is VERY inefficient and should only ever be used
113    for debugging.
114    """
115    if not isinstance(objid, int):
116        raise TypeError(f'Expected an int for objid; got a {type(objid)}.')
117
118    # Don't wanna return stuff waiting to be garbage-collected.
119    gc.collect()
120
121    allobjs = _get_all_objects(expanded=expanded)
122    for obj in allobjs:
123        if id(obj) == objid:
124            return obj
125    raise RuntimeError(f'Object with id {objid} not found.')

Return a garbage-collected object by its id.

Remember that this is VERY inefficient and should only ever be used for debugging.

def getrefs(obj: Any) -> list[typing.Any]:
128def getrefs(obj: Any) -> list[Any]:
129    """Given an object, return things referencing it."""
130    v = vars()  # Ignore ref coming from locals.
131    return [o for o in gc.get_referrers(obj) if o is not v]

Given an object, return things referencing it.

def printfiles(file: typing.TextIO | None = None) -> None:
134def printfiles(file: TextIO | None = None) -> None:
135    """Print info about open files in the current app."""
136    import io
137
138    file = sys.stderr if file is None else file
139    try:
140        import psutil
141    except ImportError:
142        print(
143            "Error: printfiles requires the 'psutil' module to be installed.",
144            file=file,
145        )
146        return
147
148    proc = psutil.Process()
149
150    # Let's grab all Python file handles so we can associate raw files
151    # with their Python objects when possible.
152    fileio_ids = {obj.fileno(): obj for obj in getobjs(io.FileIO)}
153    textio_ids = {obj.fileno(): obj for obj in getobjs(io.TextIOWrapper)}
154
155    # FIXME: we could do a more limited version of this when psutil is
156    # not present that simply includes Python's files.
157    print('Files open by this app (not limited to Python\'s):', file=file)
158    for i, ofile in enumerate(proc.open_files()):
159        # Mypy doesn't know about mode apparently.
160        # (and can't use type: ignore because we don't require psutil
161        # and then mypy complains about unused ignore comment when its
162        # not present)
163        mode = getattr(ofile, 'mode')
164        assert isinstance(mode, str)
165        textio = textio_ids.get(ofile.fd)
166        textio_s = id(textio) if textio is not None else '<not found>'
167        fileio = fileio_ids.get(ofile.fd)
168        fileio_s = id(fileio) if fileio is not None else '<not found>'
169        print(
170            f'#{i+1}: path={ofile.path!r},'
171            f' fd={ofile.fd}, mode={mode!r}, TextIOWrapper={textio_s},'
172            f' FileIO={fileio_s}'
173        )

Print info about open files in the current app.

def printrefs( obj: Any, max_level: int = 2, exclude_objs: list[typing.Any] | None = None, expand_ids: list[int] | None = None, file: typing.TextIO | None = None) -> None:
176def printrefs(
177    obj: Any,
178    max_level: int = 2,
179    exclude_objs: list[Any] | None = None,
180    expand_ids: list[int] | None = None,
181    file: TextIO | None = None,
182) -> None:
183    """Print human readable list of objects referring to an object.
184
185    'max_level' specifies how many levels of recursion are printed.
186    'exclude_objs' can be a list of exact objects to skip if found in the
187      referrers list. This can be useful to avoid printing the local context
188      where the object was passed in from (locals(), etc).
189    'expand_ids' can be a list of object ids; if that particular object is
190      found, it will always be expanded even if max_level has been reached.
191    """
192    _printrefs(
193        obj,
194        level=0,
195        max_level=max_level,
196        exclude_objs=[] if exclude_objs is None else exclude_objs,
197        expand_ids=[] if expand_ids is None else expand_ids,
198        file=sys.stderr if file is None else file,
199    )

Print human readable list of objects referring to an object.

'max_level' specifies how many levels of recursion are printed. 'exclude_objs' can be a list of exact objects to skip if found in the referrers list. This can be useful to avoid printing the local context where the object was passed in from (locals(), etc). 'expand_ids' can be a list of object ids; if that particular object is found, it will always be expanded even if max_level has been reached.

def printtypes( limit: int = 50, file: typing.TextIO | None = None, expanded: bool = False) -> None:
202def printtypes(
203    limit: int = 50, file: TextIO | None = None, expanded: bool = False
204) -> None:
205    """Print a human readable list of which types have the most instances."""
206    assert limit > 0
207    objtypes: dict[str, int] = {}
208    gc.collect()  # Recommended before get_objects().
209    allobjs = _get_all_objects(expanded=expanded)
210    allobjc = len(allobjs)
211    for obj in allobjs:
212        modname = type(obj).__module__
213        tpname = type(obj).__qualname__
214        if modname != 'builtins':
215            tpname = f'{modname}.{tpname}'
216        objtypes[tpname] = objtypes.get(tpname, 0) + 1
217
218    # Presumably allobjs contains stack-frame/dict type stuff
219    # from this function call which in turn contain refs to allobjs.
220    # Let's try to prevent these huge lists from accumulating until
221    # the cyclical collector (hopefully) gets to them.
222    allobjs.clear()
223    del allobjs
224
225    print(f'Types most allocated ({allobjc} total objects):', file=file)
226    for i, tpitem in enumerate(
227        sorted(objtypes.items(), key=lambda x: x[1], reverse=True)[:limit]
228    ):
229        tpname, tpval = tpitem
230        percent = tpval / allobjc * 100.0
231        print(f'{i+1}: {tpname}: {tpval} ({percent:.2f}%)', file=file)

Print a human readable list of which types have the most instances.

def printsizes( limit: int = 50, file: typing.TextIO | None = None, expanded: bool = False) -> None:
234def printsizes(
235    limit: int = 50, file: TextIO | None = None, expanded: bool = False
236) -> None:
237    """Print total allocated sizes of different types."""
238    assert limit > 0
239    objsizes: dict[str, int] = {}
240    gc.collect()  # Recommended before get_objects().
241    allobjs = _get_all_objects(expanded=expanded)
242    totalobjsize = 0
243
244    for obj in allobjs:
245        modname = type(obj).__module__
246        tpname = type(obj).__qualname__
247        if modname != 'builtins':
248            tpname = f'{modname}.{tpname}'
249        objsize = sys.getsizeof(obj)
250        objsizes[tpname] = objsizes.get(tpname, 0) + objsize
251        totalobjsize += objsize
252
253    totalobjmb = totalobjsize / (1024 * 1024)
254    print(
255        f'Types with most allocated bytes ({totalobjmb:.2f} mb total):',
256        file=file,
257    )
258    for i, tpitem in enumerate(
259        sorted(objsizes.items(), key=lambda x: x[1], reverse=True)[:limit]
260    ):
261        tpname, tpval = tpitem
262        percent = tpval / totalobjsize * 100.0
263        print(f'{i+1}: {tpname}: {tpval} ({percent:.2f}%)', file=file)

Print total allocated sizes of different types.

class DeadlockDumper:
356class DeadlockDumper:
357    """Dumps thread states if still around after timeout seconds.
358
359    This uses low level Python functionality so should still fire
360    even in the case of deadlock.
361    """
362
363    # faulthandler has a single traceback-dump-later state, so only
364    # one of us can control it at a time.
365    lock = threading.Lock()
366    watch_in_progress = False
367
368    def __init__(self, timeout: float) -> None:
369        import faulthandler
370
371        cls = type(self)
372
373        with cls.lock:
374            if cls.watch_in_progress:
375                print(
376                    'Existing DeadlockDumper found; new one will be a no-op.',
377                    file=sys.stderr,
378                )
379                self.active = False
380                return
381
382            # Ok; no watch is in progress; we can be the active one.
383            cls.watch_in_progress = True
384            self.active = True
385            faulthandler.dump_traceback_later(timeout=timeout)
386
387    def __del__(self) -> None:
388        import faulthandler
389
390        cls = type(self)
391
392        # If we made it to here, call off the dump. But only if we are
393        # the active dump.
394        if self.active:
395            faulthandler.cancel_dump_traceback_later()
396            cls.watch_in_progress = False

Dumps thread states if still around after timeout seconds.

This uses low level Python functionality so should still fire even in the case of deadlock.

DeadlockDumper(timeout: float)
368    def __init__(self, timeout: float) -> None:
369        import faulthandler
370
371        cls = type(self)
372
373        with cls.lock:
374            if cls.watch_in_progress:
375                print(
376                    'Existing DeadlockDumper found; new one will be a no-op.',
377                    file=sys.stderr,
378                )
379                self.active = False
380                return
381
382            # Ok; no watch is in progress; we can be the active one.
383            cls.watch_in_progress = True
384            self.active = True
385            faulthandler.dump_traceback_later(timeout=timeout)
lock = <unlocked _thread.lock object>
watch_in_progress = False
class DeadlockWatcher:
399class DeadlockWatcher:
400    """Individual watcher for deadlock conditions.
401
402    Use the enable_deadlock_watchers() to enable this system.
403
404    Next, use these wrapped in a with statement around some operation
405    that may deadlock. If the with statement does not complete within the
406    timeout period, a traceback of all threads will be dumped.
407
408    Note that the checker thread runs a cycle every ~5 seconds, so
409    something stuck needs to remain stuck for 5 seconds or so to be
410    caught for sure.
411    """
412
413    watchers_lock: threading.Lock | None = None
414    watchers: list[weakref.ref[DeadlockWatcher]] | None = None
415
416    def __init__(
417        self,
418        timeout: float = 10.0,
419        logger: Logger | None = None,
420        logextra: dict | None = None,
421    ) -> None:
422        from efro.util import caller_source_location
423
424        # pylint: disable=not-context-manager
425        cls = type(self)
426        if cls.watchers_lock is None or cls.watchers is None:
427            print(
428                'DeadlockWatcher created without watchers enabled.',
429                file=sys.stderr,
430            )
431            return
432
433        # All we do is record when we were made and how long till we
434        # expire.
435        self.create_time = time.monotonic()
436        self.timeout = timeout
437        self.noted_expire = False
438        self.logger = logger
439        self.logextra = logextra
440        self.caller_source_loc = caller_source_location()
441        curthread = threading.current_thread()
442        self.thread_id = (
443            '<unknown>'
444            if curthread.ident is None
445            else hex(curthread.ident).removeprefix('0x')
446        )
447        self.active = False
448
449        with cls.watchers_lock:
450            cls.watchers.append(weakref.ref(self))
451
452    # Support the with statement.
453    def __enter__(self) -> Any:
454        self.active = True
455        return self
456
457    def __exit__(self, exc_type: Any, exc_value: Any, exc_tb: Any) -> None:
458        self.active = False
459
460        # Print if we lived past our deadline. This is just an extra
461        # data point. The watcher thread should be doing the actual
462        # stack dumps/etc.
463        if self.logger is None or self.logextra is None:
464            return
465
466        duration = time.monotonic() - self.create_time
467        if duration > self.timeout:
468            self.logger.error(
469                'DeadlockWatcher %s at %s in thread %s lived %.2fs,'
470                ' past timeout %.2fs. This should have triggered'
471                ' a deadlock dump.',
472                id(self),
473                self.caller_source_loc,
474                self.thread_id,
475                duration,
476                self.timeout,
477                extra=self.logextra,
478            )
479
480    @classmethod
481    def enable_deadlock_watchers(cls) -> None:
482        """Spins up deadlock-watcher functionality.
483
484        Must be explicitly called before any DeadlockWatchers are
485        created.
486        """
487        assert cls.watchers_lock is None
488        cls.watchers_lock = threading.Lock()
489        assert cls.watchers is None
490        cls.watchers = []
491
492        threading.Thread(
493            target=cls._deadlock_watcher_thread_main, daemon=True
494        ).start()
495
496    @classmethod
497    def _deadlock_watcher_thread_main(cls) -> None:
498        # pylint: disable=not-context-manager
499        # pylint: disable=not-an-iterable
500        assert cls.watchers_lock is not None and cls.watchers is not None
501
502        # Spin in a loop checking our watchers periodically and dumping
503        # state if any have timed out. The trick here is that we don't
504        # explicitly dump state, but rather we set up a "dead man's
505        # switch" that does so after some amount of time if we don't
506        # explicitly cancel it. This way we'll hopefully get state dumps
507        # even for things like total GIL deadlocks.
508        while True:
509
510            # Set a dead man's switch for this pass.
511            dumper = DeadlockDumper(timeout=5.171)
512
513            # Sleep most of the way through it but give ourselves time
514            # to turn it off if we're still responsive.
515            time.sleep(4.129)
516            now = time.monotonic()
517
518            found_fresh_expired = False
519
520            # If any watcher is still active and expired, sleep past the
521            # timeout to force the dumper to do its thing.
522            with cls.watchers_lock:
523
524                for wref in cls.watchers:
525                    w = wref()
526                    if (
527                        w is not None
528                        and now - w.create_time > w.timeout
529                        and not w.noted_expire
530                        and w.active
531                    ):
532                        # If they supplied a logger, let them know they
533                        # should check stderr for a dump.
534                        if w.logger is not None:
535                            w.logger.error(
536                                'DeadlockWatcher %s at %s in thread %s'
537                                ' with time %.2f expired;'
538                                ' check stderr for stack traces.',
539                                id(w),
540                                w.caller_source_loc,
541                                w.thread_id,
542                                w.timeout,
543                                extra=w.logextra,
544                            )
545                        found_fresh_expired = True
546                        w.noted_expire = True
547
548                    # Important to clear this ref; otherwise we can keep
549                    # a random watcher alive until our next time through.
550                    w = None
551
552                # Prune dead watchers and reset for the next pass.
553                cls.watchers = [w for w in cls.watchers if w() is not None]
554
555            if found_fresh_expired:
556                # Push us over the dumper time limit which give us a
557                # lovely dump. Technically we could just do an immediate
558                # dump here instead, but that would give us two paths to
559                # maintain instead of this single one.
560                time.sleep(2.0)
561
562            # Call off the dump if it hasn't happened yet.
563            del dumper

Individual watcher for deadlock conditions.

Use the enable_deadlock_watchers() to enable this system.

Next, use these wrapped in a with statement around some operation that may deadlock. If the with statement does not complete within the timeout period, a traceback of all threads will be dumped.

Note that the checker thread runs a cycle every ~5 seconds, so something stuck needs to remain stuck for 5 seconds or so to be caught for sure.

DeadlockWatcher( timeout: float = 10.0, logger: logging.Logger | None = None, logextra: dict | None = None)
416    def __init__(
417        self,
418        timeout: float = 10.0,
419        logger: Logger | None = None,
420        logextra: dict | None = None,
421    ) -> None:
422        from efro.util import caller_source_location
423
424        # pylint: disable=not-context-manager
425        cls = type(self)
426        if cls.watchers_lock is None or cls.watchers is None:
427            print(
428                'DeadlockWatcher created without watchers enabled.',
429                file=sys.stderr,
430            )
431            return
432
433        # All we do is record when we were made and how long till we
434        # expire.
435        self.create_time = time.monotonic()
436        self.timeout = timeout
437        self.noted_expire = False
438        self.logger = logger
439        self.logextra = logextra
440        self.caller_source_loc = caller_source_location()
441        curthread = threading.current_thread()
442        self.thread_id = (
443            '<unknown>'
444            if curthread.ident is None
445            else hex(curthread.ident).removeprefix('0x')
446        )
447        self.active = False
448
449        with cls.watchers_lock:
450            cls.watchers.append(weakref.ref(self))
watchers_lock: 'threading.Lock | None' = None
watchers: list[weakref.ReferenceType[DeadlockWatcher]] | None = None
create_time
timeout
noted_expire
logger
logextra
caller_source_loc
thread_id
active
@classmethod
def enable_deadlock_watchers(cls) -> None:
480    @classmethod
481    def enable_deadlock_watchers(cls) -> None:
482        """Spins up deadlock-watcher functionality.
483
484        Must be explicitly called before any DeadlockWatchers are
485        created.
486        """
487        assert cls.watchers_lock is None
488        cls.watchers_lock = threading.Lock()
489        assert cls.watchers is None
490        cls.watchers = []
491
492        threading.Thread(
493            target=cls._deadlock_watcher_thread_main, daemon=True
494        ).start()

Spins up deadlock-watcher functionality.

Must be explicitly called before any DeadlockWatchers are created.