tqchen commented on code in PR #593:
URL: https://github.com/apache/tvm-ffi/pull/593#discussion_r3324215780


##########
python/tvm_ffi/cython/object.pxi:
##########
@@ -469,22 +475,37 @@ cdef inline object 
make_fallback_cls_for_type_index(int32_t type_index):
 
 
 cdef inline object make_ret_object(TVMFFIAny result):
-    cdef int32_t type_index
-    cdef object cls, obj
-    type_index = result.type_index
+    """Wrap a returned chandle into its canonical Python wrapper.
+
+    Caller must own +1 on ``result.v_obj``; ownership transfers to the
+    returned wrapper.
+    """
+    cdef int32_t type_index = result.type_index
+    cdef object cls, obj, cached
+    cdef PyObject* cached_pyobj
 
     if type_index < len(TYPE_INDEX_TO_CLS) and (cls := 
TYPE_INDEX_TO_CLS[type_index]) is not None:
         if issubclass(cls, PyNativeObject):
+            # Value-typed: the transient Object wrapper is discarded after
+            # __from_tvm_ffi_object__. No identity stability needed.
             obj = Object.__new__(Object)
             (<CObject>obj).chandle = result.v_obj
             return cls.__from_tvm_ffi_object__(cls, obj)
+        cached_pyobj = TVMFFIPyTryGetAttachedPyObject(result.v_obj)
+        if cached_pyobj != NULL:
+            cached = <object>cached_pyobj
+            if (<CObject>cached).chandle == NULL:
+                # Inactive -> Active: rebind, transferring caller's +1.
+                (<CObject>cached).chandle = result.v_obj
+            else:
+                # Active: wrapper already holds its own +1 on chandle.
+                CHECK_CALL(TVMFFIObjectDecRef(result.v_obj))
+            return cached
     else:
-        # Slow path: object is not found in registered entry
-        # In this case create a dummy stub class for future usage.
-        # For every unregistered class, this slow path will be triggered only 
once.
         cls = make_fallback_cls_for_type_index(type_index)
     obj = cls.__new__(cls)
     (<CObject>obj).chandle = result.v_obj
+    TVMFFIPyAttachPyObject(result.v_obj, <PyObject*>obj)

Review Comment:
   Add a comment here: this is a new class, so it must be in the Detached state; attach it.



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +73,327 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   |   cached_pyobj    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ ptr = malloc + 16
+//
+// The single ``cached_pyobj`` field is the canonical Python wrapper
+// pointer (or NULL). The Active vs Inactive distinction lives on the
+// wrapper's own ``chandle`` field, not on the header.
+//
+// States
+// ------
+//   Detached: ``header.cached_pyobj == NULL``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``header.cached_pyobj == self`` AND ``self.chandle == chandle``
+//             ``self`` is the canonical Python wrapper for the chandle.
+//             Wrapper holds +1 on chandle (released by ``tp_finalize``).
+//   Inactive: ``header.cached_pyobj == self`` AND ``self.chandle == NULL``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``). The wrapper does NOT
+//             hold +1 on chandle; the chandle stays alive only via other
+//             C++ holders.
+//
+// Invariants
+// ----------
+//   I1. When a PyObject goes out of scope (no Python var refers to it),
+//       its +1 on chandle is always released.
+//   I2. When a chandle is destroyed, its cached PyObject (if any) is

Review Comment:
   When a chandle is destroyed, we can only be in the Detached or Inactive state; 
if it is in the Inactive state, the PyObject is reclaimed.



##########
python/tvm_ffi/cython/tensor.pxi:
##########
@@ -497,7 +497,11 @@ cdef inline object make_tensor_from_chandle(
                 # call the deleter to free the memory since we will continue 
to use the chandle
                 dlpack.deleter(dlpack)
                 pass
-    # default return the tensor
+    # default return the tensor.

Review Comment:
   NOTE: we don't try to attach a PyObject for tensor types, as they may appear in 
callbacks and there may be translation depending on the context.



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +73,327 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   |   cached_pyobj    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ ptr = malloc + 16
+//
+// The single ``cached_pyobj`` field is the canonical Python wrapper
+// pointer (or NULL). The Active vs Inactive distinction lives on the
+// wrapper's own ``chandle`` field, not on the header.
+//
+// States
+// ------
+//   Detached: ``header.cached_pyobj == NULL``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``header.cached_pyobj == self`` AND ``self.chandle == chandle``
+//             ``self`` is the canonical Python wrapper for the chandle.
+//             Wrapper holds +1 on chandle (released by ``tp_finalize``).
+//   Inactive: ``header.cached_pyobj == self`` AND ``self.chandle == NULL``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``). The wrapper does NOT
+//             hold +1 on chandle; the chandle stays alive only via other
+//             C++ holders.
+//
+// Invariants
+// ----------
+//   I1. When a PyObject goes out of scope (no Python var refers to it),
+//       its +1 on chandle is always released.
+//   I2. When a chandle is destroyed, its cached PyObject (if any) is
+//       reclaimed.
+//   I3. ``self.chandle != NULL`` implies ``self`` owns +1 on that chandle.
+//       Any reader of ``self.chandle`` that follows the pointer must
+//       therefore observe a live object.
+//
+// Where transitions happen
+// ------------------------
+// ``make_ret_object`` (object.pxi)
+//     Detached -> Active : fresh ``cls.__new__(cls)`` then
+//                          ``TVMFFIPyAttachPyObject`` records the binding.
+//     Active   -> Active : returns the cached wrapper, DecRef the caller's
+//                          redundant +1.
+//     Inactive -> Active : returns the cached wrapper, rebinds
+//                          ``self.chandle`` to transfer the caller's +1.
+//                          No header write needed: ``cached_pyobj`` already
+//                          points at this wrapper.
+//
+// ``TVMFFIPyTPFinalize`` (CObject.__del__)
+//     Active   -> Inactive : other C++ holders keep the chandle alive.
+//                            Null ``self.chandle`` BEFORE DecRef (preserves
+//                            I3), DecRef chandle, then resurrect the
+//                            PyObject with Py_IncRef so a future re-fetch
+//                            can rebind. ``cached_pyobj`` unchanged.
+//     Active   -> Detached : last C++ holder. Clear ``cached_pyobj`` first
+//                            (so the chandle deleter, which fires inside
+//                            DecRef, does not GC_Del our bytes), then null
+//                            ``self.chandle`` and DecRef.
+//     Detached -> Detached : ``self`` is not the canonical wrapper for
+//                            this chandle (eager-detach via move already
+//                            cleared the binding). Just DecRef.

Review Comment:
   Need clarification here, since Detached means chandle == NULL.



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +73,327 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   |   cached_pyobj    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ ptr = malloc + 16
+//
+// The single ``cached_pyobj`` field is the canonical Python wrapper
+// pointer (or NULL). The Active vs Inactive distinction lives on the
+// wrapper's own ``chandle`` field, not on the header.
+//
+// States
+// ------
+//   Detached: ``header.cached_pyobj == NULL``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``header.cached_pyobj == self`` AND ``self.chandle == chandle``
+//             ``self`` is the canonical Python wrapper for the chandle.
+//             Wrapper holds +1 on chandle (released by ``tp_finalize``).
+//   Inactive: ``header.cached_pyobj == self`` AND ``self.chandle == NULL``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``). The wrapper does NOT
+//             hold +1 on chandle; the chandle stays alive only via other
+//             C++ holders.
+//
+// Invariants
+// ----------
+//   I1. When a PyObject goes out of scope (no Python var refers to it),
+//       its +1 on chandle is always released.
+//   I2. When a chandle is destroyed, its cached PyObject (if any) is
+//       reclaimed.
+//   I3. ``self.chandle != NULL`` implies ``self`` owns +1 on that chandle.
+//       Any reader of ``self.chandle`` that follows the pointer must
+//       therefore observe a live object.
+//
+// Where transitions happen
+// ------------------------
+// ``make_ret_object`` (object.pxi)
+//     Detached -> Active : fresh ``cls.__new__(cls)`` then
+//                          ``TVMFFIPyAttachPyObject`` records the binding.
+//     Active   -> Active : returns the cached wrapper, DecRef the caller's
+//                          redundant +1.
+//     Inactive -> Active : returns the cached wrapper, rebinds
+//                          ``self.chandle`` to transfer the caller's +1.
+//                          No header write needed: ``cached_pyobj`` already
+//                          points at this wrapper.
+//
+// ``TVMFFIPyTPFinalize`` (CObject.__del__)
+//     Active   -> Inactive : other C++ holders keep the chandle alive.
+//                            Null ``self.chandle`` BEFORE DecRef (preserves
+//                            I3), DecRef chandle, then resurrect the
+//                            PyObject with Py_IncRef so a future re-fetch
+//                            can rebind. ``cached_pyobj`` unchanged.
+//     Active   -> Detached : last C++ holder. Clear ``cached_pyobj`` first
+//                            (so the chandle deleter, which fires inside
+//                            DecRef, does not GC_Del our bytes), then null
+//                            ``self.chandle`` and DecRef.
+//     Detached -> Detached : ``self`` is not the canonical wrapper for
+//                            this chandle (eager-detach via move already
+//                            cleared the binding). Just DecRef.
+//
+// ``TVMFFIPyArgSetterObjectRValueRef_`` (function.pxi),
+// ``__move_handle_from__`` (object.pxi)
+//     Active   -> Detached : call ``TVMFFIPyDetachPyObject`` BEFORE the
+//                            wrapper's ``chandle`` is nulled during a
+//                            move; clears the header binding so
+//                            subsequent code paths do not mistake the
+//                            moved-from wrapper for the canonical one.
+//                            ``__move_handle_from__`` then rebinds via
+//                            ``TVMFFIPyAttachPyObject`` if the chandle
+//                            still has no canonical wrapper.
+//
+// ``TVMFFIPyDeleteSpace`` (Weak deleter)
+//     Inactive -> (gone)   : chandle refcount finally hit 0 while a
+//                            preserved wrapper was sitting in Inactive.
+//                            GC_Del the wrapper (under the GIL) then free
+//                            the malloc block. Active never reaches here
+//                            with ``cached_pyobj != NULL`` because
+//                            ``TVMFFIPyTPFinalize`` always runs first.
+//
+// Shutdown guard
+// --------------
+// ``TVMFFIPyMarkPythonFinalizing`` is wired to atexit from Cython module
+// init. After it fires, Inactive wrapper bytes are intentionally leaked
+// (process exiting; OS reclaims) rather than reaching for
+// ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. ``base.delete_space`` sits at
+ *        ``ptr - sizeof(TVMFFIObjectAllocHeader)`` so the generic C++
+ *        deleter (which knows nothing about Python) can find it.
+ */
+struct PyCustomAllocHeader {
+  PyObject* cached_pyobj;
+  TVMFFIObjectAllocHeader base;
+};
+
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at ptr = malloc + 16 is naturally "
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at ptr - sizeof(TVMFFIObjectAllocHeader) for the "
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* ptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(ptr) - 
kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* ptr);
+
+// Atexit-driven shutdown guard. ``TVMFFIPyMarkPythonFinalizing`` flips
+// the flag to false from an atexit hook registered in Cython module init;
+// ``TVMFFIPyDeleteSpace`` reads it via ``TVMFFIPyIsPythonAlive`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+inline std::atomic<bool>& TVMFFIPyAliveFlagStorage() {
+  static std::atomic<bool> flag{true};
+  return flag;
+}
+
+extern "C" inline bool TVMFFIPyIsPythonAlive() noexcept {
+  return TVMFFIPyAliveFlagStorage().load(std::memory_order_acquire);
+}
+
+extern "C" inline void TVMFFIPyMarkPythonFinalizing() noexcept {
+  TVMFFIPyAliveFlagStorage().store(false, std::memory_order_release);
+}
+
+/*!
+ * \brief True iff ``chandle`` was allocated through the Python custom
+ *        allocator (full ``PyCustomAllocHeader`` ahead of it). False for
+ *        allocations that came through libtvm_ffi's builtin default
+ *        (only the base ``TVMFFIObjectAllocHeader``).
+ *
+ * Detection is by comparing ``base.delete_space`` against
+ * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter
+ * pointer, so multiple frontends can coexist without a flag bit on
+ * ``TVMFFIObject``.
+ */
+TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) {
+  if (chandle == nullptr) return false;
+  TVMFFIObjectAllocHeader* base = reinterpret_cast<TVMFFIObjectAllocHeader*>(
+      static_cast<char*>(chandle) - sizeof(TVMFFIObjectAllocHeader));
+  return base->delete_space == &TVMFFIPyDeleteSpace;
+}
+
+//---------------------------------------------------------------
+// State-machine helpers.
+//---------------------------------------------------------------
+
+extern "C" {
+
+/*!
+ * \brief Return the cached PyObject bound to ``chandle``, or NULL.
+ *
+ * Active -> returns ``cached_pyobj`` (caller can re-use as canonical
+ * wrapper).
+ * Inactive -> returns ``cached_pyobj`` (caller rebinds
+ * ``cached_pyobj.chandle = chandle`` to revive). Caller discriminates
+ * Active vs Inactive by inspecting ``cached_pyobj.chandle``.
+ * Detached or non-Python chandle -> returns NULL.
+ */
+TVM_FFI_INLINE PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) {
+  if (!TVMFFIPyIsCanonical(chandle)) return nullptr;
+  return TVMFFIPyHeader(chandle)->cached_pyobj;
+}
+
+/*!
+ * \brief Bind ``obj`` to ``chandle`` as the canonical PyObject
+ *        (Detached -> Active). Used for the fresh-allocation path; the
+ *        Inactive -> Active revival path does not call this because
+ *        ``cached_pyobj`` already points at the surviving wrapper.
+ *        No-op for chandles without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  TVMFFIPyHeader(chandle)->cached_pyobj = obj;
+}
+
+/*!
+ * \brief Clear the canonical-PyObject binding (Active -> Detached).
+ *        No-op when ``obj`` is not the bound PyObject, or for chandles
+ *        without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  if (h->cached_pyobj == obj) {
+    h->cached_pyobj = nullptr;
+  }
+}
+
+/*!
+ * \brief tp_finalize hook. Releases the PyObject's +1 on chandle and
+ *        establishes ``wrapper.chandle == NULL`` for ``__dealloc__``.
+ *
+ *  Active -> Inactive: other C++ holders keep the chandle alive.
+ *    Null ``wrapper.chandle`` BEFORE DecRef (preserves I3), DecRef
+ *    chandle, then Py_IncRef wrapper to resurrect the PyObject
+ *    (PEP-442). The phantom +1 is later reclaimed by
+ *    ``TVMFFIPyDeleteSpace``.
+ *
+ *  Active -> Detached: last C++ holder. Detach the binding first so
+ *    the chandle deleter (fires inside DecRef when strong_count reaches
+ *    0) does not try to GC_Del our still-live PyObject. Then null
+ *    ``wrapper.chandle`` and DecRef.
+ *
+ *  Detached -> Detached: ``cached_pyobj`` was already cleared by an
+ *    eager move, or chandle was not allocated through the Python custom
+ *    allocator. Just null ``wrapper.chandle`` and DecRef.

Review Comment:
   no need to DecRef here? just return and pass to dealloc



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +73,327 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   |   cached_pyobj    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ ptr = malloc + 16
+//
+// The single ``cached_pyobj`` field is the canonical Python wrapper
+// pointer (or NULL). The Active vs Inactive distinction lives on the
+// wrapper's own ``chandle`` field, not on the header.
+//
+// States
+// ------
+//   Detached: ``header.cached_pyobj == NULL``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``header.cached_pyobj == self`` AND ``self.chandle == chandle``
+//             ``self`` is the canonical Python wrapper for the chandle.
+//             Wrapper holds +1 on chandle (released by ``tp_finalize``).
+//   Inactive: ``header.cached_pyobj == self`` AND ``self.chandle == NULL``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``). The wrapper does NOT
+//             hold +1 on chandle; the chandle stays alive only via other
+//             C++ holders.
+//
+// Invariants
+// ----------
+//   I1. When a PyObject goes out of scope (no Python var refers to it),
+//       its +1 on chandle is always released.
+//   I2. When a chandle is destroyed, its cached PyObject (if any) is
+//       reclaimed.
+//   I3. ``self.chandle != NULL`` implies ``self`` owns +1 on that chandle.
+//       Any reader of ``self.chandle`` that follows the pointer must
+//       therefore observe a live object.
+//
+// Where transitions happen
+// ------------------------
+// ``make_ret_object`` (object.pxi)
+//     Detached -> Active : fresh ``cls.__new__(cls)`` then
+//                          ``TVMFFIPyAttachPyObject`` records the binding.
+//     Active   -> Active : returns the cached wrapper, DecRef the caller's
+//                          redundant +1.
+//     Inactive -> Active : returns the cached wrapper, rebinds
+//                          ``self.chandle`` to transfer the caller's +1.
+//                          No header write needed: ``cached_pyobj`` already
+//                          points at this wrapper.
+//
+// ``TVMFFIPyTPFinalize`` (CObject.__del__)
+//     Active   -> Inactive : other C++ holders keep the chandle alive.
+//                            Null ``self.chandle`` BEFORE DecRef (preserves
+//                            I3), DecRef chandle, then resurrect the
+//                            PyObject with Py_IncRef so a future re-fetch
+//                            can rebind. ``cached_pyobj`` unchanged.
+//     Active   -> Detached : last C++ holder. Clear ``cached_pyobj`` first
+//                            (so the chandle deleter, which fires inside
+//                            DecRef, does not GC_Del our bytes), then null
+//                            ``self.chandle`` and DecRef.
+//     Detached -> Detached : ``self`` is not the canonical wrapper for
+//                            this chandle (eager-detach via move already
+//                            cleared the binding). Just DecRef.
+//
+// ``TVMFFIPyArgSetterObjectRValueRef_`` (function.pxi),
+// ``__move_handle_from__`` (object.pxi)
+//     Active   -> Detached : call ``TVMFFIPyDetachPyObject`` BEFORE the
+//                            wrapper's ``chandle`` is nulled during a
+//                            move; clears the header binding so
+//                            subsequent code paths do not mistake the
+//                            moved-from wrapper for the canonical one.
+//                            ``__move_handle_from__`` then rebinds via
+//                            ``TVMFFIPyAttachPyObject`` if the chandle
+//                            still has no canonical wrapper.
+//
+// ``TVMFFIPyDeleteSpace`` (Weak deleter)
+//     Inactive -> (gone)   : chandle refcount finally hit 0 while a
+//                            preserved wrapper was sitting in Inactive.
+//                            GC_Del the wrapper (under the GIL) then free
+//                            the malloc block. Active never reaches here
+//                            with ``cached_pyobj != NULL`` because
+//                            ``TVMFFIPyTPFinalize`` always runs first.
+//
+// Shutdown guard
+// --------------
+// ``TVMFFIPyMarkPythonFinalizing`` is wired to atexit from Cython module
+// init. After it fires, Inactive wrapper bytes are intentionally leaked
+// (process exiting; OS reclaims) rather than reaching for
+// ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. ``base.delete_space`` sits at
+ *        ``ptr - sizeof(TVMFFIObjectAllocHeader)`` so the generic C++
+ *        deleter (which knows nothing about Python) can find it.
+ */
+struct PyCustomAllocHeader {
+  PyObject* cached_pyobj;
+  TVMFFIObjectAllocHeader base;
+};
+
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at ptr = malloc + 16 is naturally "
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at ptr - sizeof(TVMFFIObjectAllocHeader) for the "
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* ptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(ptr) - 
kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* ptr);
+
+// Atexit-driven shutdown guard. ``TVMFFIPyMarkPythonFinalizing`` flips
+// the flag to false from an atexit hook registered in Cython module init;
+// ``TVMFFIPyDeleteSpace`` reads it via ``TVMFFIPyIsPythonAlive`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+inline std::atomic<bool>& TVMFFIPyAliveFlagStorage() {
+  static std::atomic<bool> flag{true};
+  return flag;
+}
+
+extern "C" inline bool TVMFFIPyIsPythonAlive() noexcept {
+  return TVMFFIPyAliveFlagStorage().load(std::memory_order_acquire);
+}
+
+extern "C" inline void TVMFFIPyMarkPythonFinalizing() noexcept {
+  TVMFFIPyAliveFlagStorage().store(false, std::memory_order_release);
+}
+
+/*!
+ * \brief True iff ``chandle`` was allocated through the Python custom
+ *        allocator (full ``PyCustomAllocHeader`` ahead of it). False for
+ *        allocations that came through libtvm_ffi's builtin default
+ *        (only the base ``TVMFFIObjectAllocHeader``).
+ *
+ * Detection is by comparing ``base.delete_space`` against
+ * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter
+ * pointer, so multiple frontends can coexist without a flag bit on
+ * ``TVMFFIObject``.
+ */
+TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) {
+  if (chandle == nullptr) return false;
+  TVMFFIObjectAllocHeader* base = reinterpret_cast<TVMFFIObjectAllocHeader*>(
+      static_cast<char*>(chandle) - sizeof(TVMFFIObjectAllocHeader));
+  return base->delete_space == &TVMFFIPyDeleteSpace;
+}
+
+//---------------------------------------------------------------
+// State-machine helpers.
+//---------------------------------------------------------------
+
+extern "C" {
+
+/*!
+ * \brief Return the cached PyObject bound to ``chandle``, or NULL.
+ *
+ * Active -> returns ``cached_pyobj`` (caller can re-use as canonical
+ * wrapper).
+ * Inactive -> returns ``cached_pyobj`` (caller rebinds
+ * ``cached_pyobj.chandle = chandle`` to revive). Caller discriminates
+ * Active vs Inactive by inspecting ``cached_pyobj.chandle``.
+ * Detached or non-Python chandle -> returns NULL.
+ */
+TVM_FFI_INLINE PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) {
+  if (!TVMFFIPyIsCanonical(chandle)) return nullptr;
+  return TVMFFIPyHeader(chandle)->cached_pyobj;
+}
+
+/*!
+ * \brief Bind ``obj`` to ``chandle`` as the canonical PyObject
+ *        (Detached -> Active). Used for the fresh-allocation path; the
+ *        Inactive -> Active revival path does not call this because
+ *        ``cached_pyobj`` already points at the surviving wrapper.
+ *        No-op for chandles without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  TVMFFIPyHeader(chandle)->cached_pyobj = obj;
+}
+
+/*!
+ * \brief Clear the canonical-PyObject binding (Active -> Detached).
+ *        No-op when ``obj`` is not the bound PyObject, or for chandles
+ *        without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  if (h->cached_pyobj == obj) {
+    h->cached_pyobj = nullptr;
+  }
+}
+
+/*!
+ * \brief tp_finalize hook. Releases the PyObject's +1 on chandle and
+ *        establishes ``wrapper.chandle == NULL`` for ``__dealloc__``.
+ *
+ *  Active -> Inactive: other C++ holders keep the chandle alive.
+ *    Null ``wrapper.chandle`` BEFORE DecRef (preserves I3), DecRef
+ *    chandle, then Py_IncRef wrapper to resurrect the PyObject
+ *    (PEP-442). The phantom +1 is later reclaimed by
+ *    ``TVMFFIPyDeleteSpace``.
+ *
+ *  Active -> Detached: last C++ holder. Detach the binding first so
+ *    the chandle deleter (fires inside DecRef when strong_count reaches
+ *    0) does not try to GC_Del our still-live PyObject. Then null
+ *    ``wrapper.chandle`` and DecRef.
+ *
+ *  Detached -> Detached: ``cached_pyobj`` was already cleared by an
+ *    eager move, or chandle was not allocated through the Python custom
+ *    allocator. Just null ``wrapper.chandle`` and DecRef.
+ */
+TVM_FFI_INLINE void TVMFFIPyTPFinalize(void** ptr_to_chandle, PyObject* 
wrapper) {
+  void* chandle = *ptr_to_chandle;
+  if (chandle == nullptr) return;

Review Comment:
   Add a comment noting that this is the Detached case.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to