tqchen commented on code in PR #593:
URL: https://github.com/apache/tvm-ffi/pull/593#discussion_r3311618979


##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``tagged_wrapper == (uintptr_t)wrapper | 1``
+//             ``wrapper`` is the canonical Python wrapper for the chandle.
+//             ``wrapper.chandle == chandle``. Wrapper holds +1 on chandle
+//             (released by ``__dealloc__``).
+//   Inactive: ``tagged_wrapper == (uintptr_t)wrapper``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``), but ``wrapper.chandle``
+//             has been cleared. The wrapper does NOT hold +1 on chandle;
+//             the chandle stays alive only via other C++ holders.
+//
+// Transitions (exposed as the four ``TVMFFIPy*`` helpers below)
+// --------------------------------------------------------------
+//   Init   -> Active    ``TVMFFIPyAttachPyObject`` after a fresh
+//                       ``cls.__new__(cls)`` in ``make_ret_object``.
+//   Active -> Active    ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       cached wrapper; caller INCREFs and DecRefs the
+//                       redundant +1 on chandle.
+//   Inactive -> Active  ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       preserved wrapper. Caller re-binds
+//                       ``wrapper.chandle = chandle`` (transferring its
+//                       +1) and calls ``TVMFFIPyAttachPyObject`` to set
+//                       the live bit.
+//   Active -> Inactive  ``TVMFFIPyTPFinalize`` when the wrapper's Python
+//                       refcount hits 0 while other C++ holders keep the
+//                       chandle alive: clears the live bit, DECREFs
+//                       chandle, Py_INCREFs self (PEP-442 resurrection).
+//   Active -> Init      ``TVMFFIPyTPFinalize`` when this wrapper is the
+//                       last C++ holder: clears tagged_wrapper, falls
+//                       through to ``__dealloc__`` for normal cleanup.
+//   Active -> Init      ``TVMFFIPyDetachPyObject`` from ``__dealloc__``
+//                       and from the rvalue-ref move setter (eager-detach
+//                       before the C++ side nulls the source chandle).
+//
+// Chandle deleter (Inactive reclamation)
+// --------------------------------------
+// When chandle's refcount finally hits 0 in the Inactive state, the C++
+// Weak deleter calls ``base.delete_space`` -> ``TVMFFIPyDeleteSpace``.
+// The deleter notices the preserved wrapper (``tagged_wrapper != 0``),
+// acquires the GIL, and frees the wrapper memory with ``PyObject_GC_Del``
+// before releasing the malloc block.
+//
+// Shutdown guard
+// --------------
+// ``g_tvm_ffi_python_alive`` is flipped to 0 by an atexit hook registered
+// from Cython module init. After that point, Inactive wrapper bytes are
+// intentionally leaked (process exiting; OS reclaims) rather than
+// reaching for ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. Single tagged-pointer field encodes
+ *        the Init / Active / Inactive states. ``base.delete_space`` sits
+ *        at exactly ``tptr - sizeof(TVMFFIObjectAllocHeader)`` so the
+ *        generic C++ deleter (which knows nothing about Python) can find
+ *        it.
+ */
+struct PyCustomAllocHeader {
+  uintptr_t tagged_wrapper;        // see state encoding
+  TVMFFIObjectAllocHeader base;    // delete_space; at tptr - 8
+};
+
+constexpr uintptr_t kPyHeaderLiveTag = 1;
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at tptr = malloc + 16 is naturally "
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at tptr - sizeof(TVMFFIObjectAllocHeader) for the "
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* tptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(tptr) - kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* tptr);
+
+// Atexit-driven shutdown guard. Flipped to 0 by an atexit hook registered
+// from Cython module init; read by ``TVMFFIPyDeleteSpace`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+static volatile int g_tvm_ffi_python_alive = 1;
+
+inline void TVMFFIPyMarkPythonFinalizing() noexcept { g_tvm_ffi_python_alive = 0; }
+
+/*!
+ * \brief True iff ``chandle`` was allocated through the Python custom
+ *        allocator (full ``PyCustomAllocHeader`` ahead of it). False for
+ *        allocations that came through libtvm_ffi's builtin default
+ *        (only the base ``TVMFFIObjectAllocHeader``).
+ *
+ * Detection is by comparing ``base.delete_space`` against
+ * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter
+ * pointer, so multiple frontends can coexist without a flag bit on
+ * ``TVMFFIObject``.
+ */
+TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) {
+  if (chandle == nullptr) return false;
+  TVMFFIObjectAllocHeader* base = reinterpret_cast<TVMFFIObjectAllocHeader*>(
+      static_cast<char*>(chandle) - sizeof(TVMFFIObjectAllocHeader));
+  return base->delete_space == &TVMFFIPyDeleteSpace;
+}
+
+//---------------------------------------------------------------
+// The four state-machine transitions.
+//---------------------------------------------------------------
+
+/*!
+ * \brief Try to fetch the canonical wrapper bound to ``chandle``.
+ *
+ * Returns a borrowed PyObject* in the Active or Inactive state; returns
+ * NULL in the Init state or for chandles without a Python alloc header.
+ * Does NOT mutate state; the Inactive -> Active revival's
+ * ``tagged_wrapper`` re-tagging happens via ``TVMFFIPyAttachPyObject``
+ * after the caller rebinds ``wrapper.chandle``.
+ */
+TVM_FFI_INLINE PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) {
+  if (!TVMFFIPyIsCanonical(chandle)) return nullptr;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  if (h->tagged_wrapper == 0) return nullptr;
+  return reinterpret_cast<PyObject*>(h->tagged_wrapper & ~kPyHeaderLiveTag);
+}
+
+/*!
+ * \brief Bind ``obj`` to ``chandle`` as the canonical wrapper (-> Active).
+ *
+ * Used both for fresh allocation (Init -> Active) and for Inactive ->
+ * Active revival after the caller has re-set ``obj.chandle = chandle``.
+ *
+ * No-op for chandles without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  TVMFFIPyHeader(chandle)->tagged_wrapper = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+}
+
+/*!
+ * \brief Clear the canonical-wrapper binding (Active -> Init).
+ *
+ * No-op when ``obj`` is not the live canonical wrapper for ``chandle``
+ * (Init, Inactive, stale post-move wrapper, or non-Python alloc header).
+ * Used by ``__dealloc__`` and by the rvalue-ref move setter to
+ * eager-detach before the C++ side nulls the source chandle.
+ */
+TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper == expected) {
+    h->tagged_wrapper = 0;
+  }
+}
+
+/*!
+ * \brief tp_finalize hook. Picks one of two transitions:
+ *
+ *   Active -> Inactive: when other C++ holders keep the chandle alive,
+ *     resurrect the wrapper so a future re-fetch can rebind. Clears the
+ *     live bit, nulls ``*chandle_field``, releases the wrapper's +1 on
+ *     chandle, and Py_INCREFs ``self`` (PEP-442 resurrection). The
+ *     phantom +1 is later reclaimed by ``TVMFFIPyDeleteSpace`` when
+ *     chandle dies.
+ *
+ *   Active -> Init: when the wrapper is the last C++ holder, do nothing
+ *     here and let ``__dealloc__`` run normally. ``__dealloc__`` calls
+ *     ``TVMFFIPyDetachPyObject`` and DECREFs the chandle.
+ *
+ * Why null ``*chandle_field`` BEFORE the DECREF on the Inactive path:
+ *   - It is the Inactive sentinel that ``make_ret_object`` reads on
+ *     revival (``cached.chandle == NULL`` -> rebind & transfer caller's
+ *     +1). A stale non-NULL slot would mis-route to the Active branch
+ *     and double-drop the caller's +1.
+ *   - It keeps the invariant ``wrapper.chandle != NULL implies wrapper
+ *     owns +1 on chandle`` true at every observable point: any reader
+ *     of ``self.chandle`` (hash, same_as, repr, ...) sees NULL before
+ *     any deleter the DECREF may trigger gets a chance to run.
+ */
+TVM_FFI_INLINE void TVMFFIPyTPFinalize(void** chandle_field, PyObject* self) {
+  void* chandle = *chandle_field;
+  if (chandle == nullptr) return;
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(self) | kPyHeaderLiveTag;

Review Comment:
   Is this an invariant? When would `expected` not match the attached wrapper?



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``tagged_wrapper == (uintptr_t)wrapper | 1``
+//             ``wrapper`` is the canonical Python wrapper for the chandle.
+//             ``wrapper.chandle == chandle``. Wrapper holds +1 on chandle
+//             (released by ``__dealloc__``).
+//   Inactive: ``tagged_wrapper == (uintptr_t)wrapper``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``), but ``wrapper.chandle``
+//             has been cleared. The wrapper does NOT hold +1 on chandle;
+//             the chandle stays alive only via other C++ holders.
+//
+// Transitions (exposed as the four ``TVMFFIPy*`` helpers below)
+// --------------------------------------------------------------
+//   Init   -> Active    ``TVMFFIPyAttachPyObject`` after a fresh
+//                       ``cls.__new__(cls)`` in ``make_ret_object``.
+//   Active -> Active    ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       cached wrapper; caller INCREFs and DecRefs the
+//                       redundant +1 on chandle.
+//   Inactive -> Active  ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       preserved wrapper. Caller re-binds
+//                       ``wrapper.chandle = chandle`` (transferring its
+//                       +1) and calls ``TVMFFIPyAttachPyObject`` to set
+//                       the live bit.
+//   Active -> Inactive  ``TVMFFIPyTPFinalize`` when the wrapper's Python
+//                       refcount hits 0 while other C++ holders keep the
+//                       chandle alive: clears the live bit, DECREFs
+//                       chandle, Py_INCREFs self (PEP-442 resurrection).
+//   Active -> Init      ``TVMFFIPyTPFinalize`` when this wrapper is the
+//                       last C++ holder: clears tagged_wrapper, falls
+//                       through to ``__dealloc__`` for normal cleanup.
+//   Active -> Init      ``TVMFFIPyDetachPyObject`` from ``__dealloc__``
+//                       and from the rvalue-ref move setter (eager-detach
+//                       before the C++ side nulls the source chandle).
+//
+// Chandle deleter (Inactive reclamation)
+// --------------------------------------
+// When chandle's refcount finally hits 0 in the Inactive state, the C++
+// Weak deleter calls ``base.delete_space`` -> ``TVMFFIPyDeleteSpace``.
+// The deleter notices the preserved wrapper (``tagged_wrapper != 0``),
+// acquires the GIL, and frees the wrapper memory with ``PyObject_GC_Del``
+// before releasing the malloc block.
+//
+// Shutdown guard
+// --------------
+// ``g_tvm_ffi_python_alive`` is flipped to 0 by an atexit hook registered
+// from Cython module init. After that point, Inactive wrapper bytes are
+// intentionally leaked (process exiting; OS reclaims) rather than
+// reaching for ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. Single tagged-pointer field encodes
+ *        the Init / Active / Inactive states. ``base.delete_space`` sits
+ *        at exactly ``tptr - sizeof(TVMFFIObjectAllocHeader)`` so the
+ *        generic C++ deleter (which knows nothing about Python) can find
+ *        it.
+ */
+struct PyCustomAllocHeader {
+  uintptr_t tagged_wrapper;        // see state encoding
+  TVMFFIObjectAllocHeader base;    // delete_space; at tptr - 8
+};
+
+constexpr uintptr_t kPyHeaderLiveTag = 1;
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at tptr = malloc + 16 is naturally "
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at tptr - sizeof(TVMFFIObjectAllocHeader) for the "
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* tptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(tptr) - kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* tptr);
+
+// Atexit-driven shutdown guard. Flipped to 0 by an atexit hook registered
+// from Cython module init; read by ``TVMFFIPyDeleteSpace`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+static volatile int g_tvm_ffi_python_alive = 1;
+
+inline void TVMFFIPyMarkPythonFinalizing() noexcept { g_tvm_ffi_python_alive = 0; }
+
+/*!
+ * \brief True iff ``chandle`` was allocated through the Python custom
+ *        allocator (full ``PyCustomAllocHeader`` ahead of it). False for
+ *        allocations that came through libtvm_ffi's builtin default
+ *        (only the base ``TVMFFIObjectAllocHeader``).
+ *
+ * Detection is by comparing ``base.delete_space`` against
+ * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter
+ * pointer, so multiple frontends can coexist without a flag bit on
+ * ``TVMFFIObject``.
+ */
+TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) {
+  if (chandle == nullptr) return false;
+  TVMFFIObjectAllocHeader* base = reinterpret_cast<TVMFFIObjectAllocHeader*>(
+      static_cast<char*>(chandle) - sizeof(TVMFFIObjectAllocHeader));
+  return base->delete_space == &TVMFFIPyDeleteSpace;
+}
+
+//---------------------------------------------------------------
+// The four state-machine transitions.
+//---------------------------------------------------------------
+
+/*!
+ * \brief Try to fetch the canonical wrapper bound to ``chandle``.
+ *
+ * Returns a borrowed PyObject* in the Active or Inactive state; returns
+ * NULL in the Init state or for chandles without a Python alloc header.
+ * Does NOT mutate state; the Inactive -> Active revival's
+ * ``tagged_wrapper`` re-tagging happens via ``TVMFFIPyAttachPyObject``
+ * after the caller rebinds ``wrapper.chandle``.
+ */
+TVM_FFI_INLINE PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) {
+  if (!TVMFFIPyIsCanonical(chandle)) return nullptr;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  if (h->tagged_wrapper == 0) return nullptr;
+  return reinterpret_cast<PyObject*>(h->tagged_wrapper & ~kPyHeaderLiveTag);
+}
+
+/*!
+ * \brief Bind ``obj`` to ``chandle`` as the canonical wrapper (-> Active).
+ *
+ * Used both for fresh allocation (Init -> Active) and for Inactive ->
+ * Active revival after the caller has re-set ``obj.chandle = chandle``.
+ *
+ * No-op for chandles without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  TVMFFIPyHeader(chandle)->tagged_wrapper = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+}
+
+/*!
+ * \brief Clear the canonical-wrapper binding (Active -> Init).
+ *
+ * No-op when ``obj`` is not the live canonical wrapper for ``chandle``
+ * (Init, Inactive, stale post-move wrapper, or non-Python alloc header).
+ * Used by ``__dealloc__`` and by the rvalue-ref move setter to
+ * eager-detach before the C++ side nulls the source chandle.
+ */
+TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper == expected) {
+    h->tagged_wrapper = 0;
+  }
+}
+
+/*!
+ * \brief tp_finalize hook. Picks one of two transitions:
+ *
+ *   Active -> Inactive: when other C++ holders keep the chandle alive,
+ *     resurrect the wrapper so a future re-fetch can rebind. Clears the
+ *     live bit, nulls ``*chandle_field``, releases the wrapper's +1 on
+ *     chandle, and Py_INCREFs ``self`` (PEP-442 resurrection). The
+ *     phantom +1 is later reclaimed by ``TVMFFIPyDeleteSpace`` when
+ *     chandle dies.
+ *
+ *   Active -> Init: when the wrapper is the last C++ holder, do nothing
+ *     here and let ``__dealloc__`` run normally. ``__dealloc__`` calls
+ *     ``TVMFFIPyDetachPyObject`` and DECREFs the chandle.
+ *
+ * Why null ``*chandle_field`` BEFORE the DECREF on the Inactive path:
+ *   - It is the Inactive sentinel that ``make_ret_object`` reads on
+ *     revival (``cached.chandle == NULL`` -> rebind & transfer caller's
+ *     +1). A stale non-NULL slot would mis-route to the Active branch
+ *     and double-drop the caller's +1.
+ *   - It keeps the invariant ``wrapper.chandle != NULL implies wrapper
+ *     owns +1 on chandle`` true at every observable point: any reader
+ *     of ``self.chandle`` (hash, same_as, repr, ...) sees NULL before
+ *     any deleter the DECREF may trigger gets a chance to run.
+ */
+TVM_FFI_INLINE void TVMFFIPyTPFinalize(void** chandle_field, PyObject* self) {
+  void* chandle = *chandle_field;
+  if (chandle == nullptr) return;
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(self) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper != expected) return;
+  // Read strong count without atomic — under classic CPython the GIL
+  // serializes Python-side access, and C++-side DecRefs cross the FFI
+  // boundary which cannot run while we hold the GIL here.
+  uint64_t strong_count =
+      reinterpret_cast<TVMFFIObject*>(chandle)->combined_ref_count & 0xFFFFFFFFu;
+  if (strong_count <= 1) {
+    // Active -> Init: we hold the last strong ref; let __dealloc__ run
+    // normally and detach the binding there.

Review Comment:
   Favor locality: do the detach here.



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``tagged_wrapper == (uintptr_t)wrapper | 1``
+//             ``wrapper`` is the canonical Python wrapper for the chandle.
+//             ``wrapper.chandle == chandle``. Wrapper holds +1 on chandle
+//             (released by ``__dealloc__``).
+//   Inactive: ``tagged_wrapper == (uintptr_t)wrapper``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``), but ``wrapper.chandle``
+//             has been cleared. The wrapper does NOT hold +1 on chandle;
+//             the chandle stays alive only via other C++ holders.
+//
+// Transitions (exposed as the four ``TVMFFIPy*`` helpers below)
+// --------------------------------------------------------------
+//   Init   -> Active    ``TVMFFIPyAttachPyObject`` after a fresh
+//                       ``cls.__new__(cls)`` in ``make_ret_object``.
+//   Active -> Active    ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       cached wrapper; caller INCREFs and DecRefs the
+//                       redundant +1 on chandle.
+//   Inactive -> Active  ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       preserved wrapper. Caller re-binds
+//                       ``wrapper.chandle = chandle`` (transferring its
+//                       +1) and calls ``TVMFFIPyAttachPyObject`` to set
+//                       the live bit.
+//   Active -> Inactive  ``TVMFFIPyTPFinalize`` when the wrapper's Python
+//                       refcount hits 0 while other C++ holders keep the
+//                       chandle alive: clears the live bit, DECREFs
+//                       chandle, Py_INCREFs self (PEP-442 resurrection).
+//   Active -> Init      ``TVMFFIPyTPFinalize`` when this wrapper is the
+//                       last C++ holder: clears tagged_wrapper, falls
+//                       through to ``__dealloc__`` for normal cleanup.
+//   Active -> Init      ``TVMFFIPyDetachPyObject`` from ``__dealloc__``
+//                       and from the rvalue-ref move setter (eager-detach
+//                       before the C++ side nulls the source chandle).
+//
+// Chandle deleter (Inactive reclamation)
+// --------------------------------------
+// When chandle's refcount finally hits 0 in the Inactive state, the C++
+// Weak deleter calls ``base.delete_space`` -> ``TVMFFIPyDeleteSpace``.
+// The deleter notices the preserved wrapper (``tagged_wrapper != 0``),
+// acquires the GIL, and frees the wrapper memory with ``PyObject_GC_Del``
+// before releasing the malloc block.
+//
+// Shutdown guard
+// --------------
+// ``g_tvm_ffi_python_alive`` is flipped to 0 by an atexit hook registered
+// from Cython module init. After that point, Inactive wrapper bytes are
+// intentionally leaked (process exiting; OS reclaims) rather than
+// reaching for ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. Single tagged-pointer field encodes
+ *        the Init / Active / Inactive states. ``base.delete_space`` sits
+ *        at exactly ``tptr - sizeof(TVMFFIObjectAllocHeader)`` so the
+ *        generic C++ deleter (which knows nothing about Python) can find
+ *        it.
+ */
+struct PyCustomAllocHeader {
+  uintptr_t tagged_wrapper;        // see state encoding
+  TVMFFIObjectAllocHeader base;    // delete_space; at tptr - 8
+};
+
+constexpr uintptr_t kPyHeaderLiveTag = 1;
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at tptr = malloc + 16 is naturally "
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at tptr - sizeof(TVMFFIObjectAllocHeader) for the "
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* tptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(tptr) - kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* tptr);
+
+// Atexit-driven shutdown guard. Flipped to 0 by an atexit hook registered
+// from Cython module init; read by ``TVMFFIPyDeleteSpace`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+static volatile int g_tvm_ffi_python_alive = 1;
+
+inline void TVMFFIPyMarkPythonFinalizing() noexcept { g_tvm_ffi_python_alive = 0; }
+
+/*!
+ * \brief True iff ``chandle`` was allocated through the Python custom
+ *        allocator (full ``PyCustomAllocHeader`` ahead of it). False for
+ *        allocations that came through libtvm_ffi's builtin default
+ *        (only the base ``TVMFFIObjectAllocHeader``).
+ *
+ * Detection is by comparing ``base.delete_space`` against
+ * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter
+ * pointer, so multiple frontends can coexist without a flag bit on
+ * ``TVMFFIObject``.
+ */
+TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) {
+  if (chandle == nullptr) return false;
+  TVMFFIObjectAllocHeader* base = reinterpret_cast<TVMFFIObjectAllocHeader*>(
+      static_cast<char*>(chandle) - sizeof(TVMFFIObjectAllocHeader));
+  return base->delete_space == &TVMFFIPyDeleteSpace;
+}
+
+//---------------------------------------------------------------
+// The four state-machine transitions.
+//---------------------------------------------------------------
+
+/*!
+ * \brief Try to fetch the canonical wrapper bound to ``chandle``.
+ *
+ * Returns a borrowed PyObject* in the Active or Inactive state; returns
+ * NULL in the Init state or for chandles without a Python alloc header.
+ * Does NOT mutate state; the Inactive -> Active revival's
+ * ``tagged_wrapper`` re-tagging happens via ``TVMFFIPyAttachPyObject``
+ * after the caller rebinds ``wrapper.chandle``.
+ */
+TVM_FFI_INLINE PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) {
+  if (!TVMFFIPyIsCanonical(chandle)) return nullptr;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  if (h->tagged_wrapper == 0) return nullptr;
+  return reinterpret_cast<PyObject*>(h->tagged_wrapper & ~kPyHeaderLiveTag);
+}
+
+/*!
+ * \brief Bind ``obj`` to ``chandle`` as the canonical wrapper (-> Active).
+ *
+ * Used both for fresh allocation (Init -> Active) and for Inactive ->
+ * Active revival after the caller has re-set ``obj.chandle = chandle``.
+ *
+ * No-op for chandles without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  TVMFFIPyHeader(chandle)->tagged_wrapper = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+}
+
+/*!
+ * \brief Clear the canonical-wrapper binding (Active -> Init).
+ *
+ * No-op when ``obj`` is not the live canonical wrapper for ``chandle``
+ * (Init, Inactive, stale post-move wrapper, or non-Python alloc header).
+ * Used by ``__dealloc__`` and by the rvalue-ref move setter to
+ * eager-detach before the C++ side nulls the source chandle.
+ */
+TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper == expected) {
+    h->tagged_wrapper = 0;
+  }
+}
+
+/*!
+ * \brief tp_finalize hook. Picks one of two transitions:
+ *
+ *   Active -> Inactive: when other C++ holders keep the chandle alive,
+ *     resurrect the wrapper so a future re-fetch can rebind. Clears the

Review Comment:
   Suggested wording: "resurrect the wrapper" => "resurrect the PyObject".



##########
include/tvm/ffi/c_api.h:
##########
@@ -580,6 +580,80 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj);
 TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index,
                                          void (*deleter)(void* handle), 
TVMFFIObjectHandle* out);
 
+//-----------------------------------------------------------------------
+// Section: Custom allocator hook for Object allocations
+//
+// Every allocation of a ref-counted Object goes through the registered
+// TVMFFICustomAllocator. The allocator places a TVMFFIObjectAllocHeader
+// immediately before the returned T pointer; the C++ Weak deleter recovers
+// the header and invokes ``delete_space`` to release the whole block.
+//
+// libtvm_ffi installs a builtin default allocator at registry init so every
+// Object always has at least the base header. Frontends (Python via the
+// Cython extension, Rust binding, etc.) override the global allocator with
+// TVMFFISetCustomAllocator when they need richer per-allocation bookkeeping
+// (e.g. Python's wrapper back-pointer).
+//-----------------------------------------------------------------------
+/*!
+ * \brief Mandatory header placed immediately before each TVMFFIObject body.
+ *
+ * The C++ deleter recovers this header by walking back exactly
+ * ``sizeof(TVMFFIObjectAllocHeader)`` bytes from the T pointer and invokes
+ * ``delete_space``. Frontends with a derived layout add extra fields ahead
+ * of the base; the derived ``delete_space`` callback recovers its enclosing
+ * struct from ``tptr``.
+ */
+typedef struct {
+  /*!
+   * \brief Free the allocation that contains ``tptr``.
+   * \param tptr The pointer originally returned by
+   *             ``TVMFFICustomAllocator::allocate`` (i.e. the address of T,
+   *             not the start of the underlying malloc block).
+   */
+  void (*delete_space)(void* tptr);
+} TVMFFIObjectAllocHeader;
+
+/*!
+ * \brief Custom allocator entry registered with TVMFFISetCustomAllocator.
+ */
+typedef struct {
+  /*!
+   * \brief Allocate ``size`` bytes for an Object of ``type_index`` with the
+   *        requested ``alignment``. Implementations must place a
+   *        TVMFFIObjectAllocHeader immediately before the returned pointer

Review Comment:
   \note Implementation ...



##########
include/tvm/ffi/c_api.h:
##########
@@ -580,6 +580,80 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj);
 TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index,
                                          void (*deleter)(void* handle), 
TVMFFIObjectHandle* out);
 
+//-----------------------------------------------------------------------
+// Section: Custom allocator hook for Object allocations
+//
+// Every allocation of a ref-counted Object goes through the registered
+// TVMFFICustomAllocator. The allocator places a TVMFFIObjectAllocHeader
+// immediately before the returned T pointer; the C++ Weak deleter recovers
+// the header and invokes ``delete_space`` to release the whole block.
+//
+// libtvm_ffi installs a builtin default allocator at registry init so every
+// Object always has at least the base header. Frontends (Python via the
+// Cython extension, Rust binding, etc.) override the global allocator with
+// TVMFFISetCustomAllocator when they need richer per-allocation bookkeeping
+// (e.g. Python's wrapper back-pointer).
+//-----------------------------------------------------------------------
+/*!
+ * \brief Mandatory header placed immediately before each TVMFFIObject body.
+ *
+ * The C++ deleter recovers this header by walking back exactly
+ * ``sizeof(TVMFFIObjectAllocHeader)`` bytes from the T pointer and invokes
+ * ``delete_space``. Frontends with a derived layout add extra fields ahead
+ * of the base; the derived ``delete_space`` callback recovers its enclosing
+ * struct from ``tptr``.
+ */
+typedef struct {
+  /*!
+   * \brief Free the allocation that contains ``tptr``.
+   * \param tptr The pointer originally returned by
+   *             ``TVMFFICustomAllocator::allocate`` (i.e. the address of T,
+   *             not the start of the underlying malloc block).
+   */
+  void (*delete_space)(void* tptr);
+} TVMFFIObjectAllocHeader;
+
+/*!
+ * \brief Custom allocator entry registered with TVMFFISetCustomAllocator.
+ */
+typedef struct {
+  /*!
+   * \brief Allocate ``size`` bytes for an Object of ``type_index`` with the

Review Comment:
   Allocate the space



##########
include/tvm/ffi/memory.h:
##########
@@ -92,6 +93,32 @@ TVM_FFI_INLINE void AlignedFree(void* data) {
 #endif
 }
 
+/*!
+ * \brief Fixed offset between the base TVMFFIObjectAllocHeader and the
+ *        embedded T (= Object = TVMFFIObject).
+ *
+ * The C++ deleter recovers the base header from a chandle by walking back
+ * exactly this many bytes: ``tptr - kObjectAllocHeaderOffset``. Frontends
+ * with a derived layout add their extra fields ahead of the base.
+ */
+constexpr size_t kObjectAllocHeaderOffset = sizeof(TVMFFIObjectAllocHeader);

Review Comment:
   move the constant and static assert into GetObjectAllocHeader, needs 
GetObjectAllocHeader



##########
include/tvm/ffi/c_api.h:
##########
@@ -580,6 +580,80 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj);
 TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index,
                                          void (*deleter)(void* handle), 
TVMFFIObjectHandle* out);
 
+//-----------------------------------------------------------------------
+// Section: Custom allocator hook for Object allocations
+//
+// Every allocation of a ref-counted Object goes through the registered
+// TVMFFICustomAllocator. The allocator places a TVMFFIObjectAllocHeader
+// immediately before the returned T pointer; the C++ Weak deleter recovers

Review Comment:
   Section: ObjectAllocHeader and CustomAllocator



##########
include/tvm/ffi/memory.h:
##########


Review Comment:
   Deleter and DeleterWithDelete space



##########
python/tvm_ffi/cython/function.pxi:
##########
@@ -544,10 +561,19 @@ cdef int TVMFFIPyArgSetterObjectRValueRef_(
     TVMFFIPyArgSetter* handle, TVMFFIPyCallContext* ctx,
     PyObject* py_arg, TVMFFIAny* out
 ) except -1:
-    """Setter for ObjectRValueRef"""
-    cdef object arg = <object>py_arg
+    """Setter for ObjectRValueRef.
+
+    The C++ side (TVMFFIAnyViewToOwnedAny) reads ``*v_ptr`` and writes NULL
+    back, transferring the +1 strong ref to the FFI side and leaving the
+    source wrapper with ``chandle == NULL``. After that point ``__dealloc__``
+    can no longer reach the header to clear the canonical-wrapper binding,
+    so if the source is the canonical wrapper for this chandle we eager-
+    detach now.
+    """
+    cdef CObject src = (<object>py_arg).obj
+    TVMFFIPyDetachPyObject(src.chandle, <PyObject*>src)

Review Comment:
   note: TVMFFIPyDetachPyObject is robust to cases where Object is not 
allocated by custom PyAttachField



##########
python/tvm_ffi/cython/function.pxi:
##########
@@ -544,10 +561,19 @@ cdef int TVMFFIPyArgSetterObjectRValueRef_(
     TVMFFIPyArgSetter* handle, TVMFFIPyCallContext* ctx,
     PyObject* py_arg, TVMFFIAny* out
 ) except -1:
-    """Setter for ObjectRValueRef"""
-    cdef object arg = <object>py_arg
+    """Setter for ObjectRValueRef.
+
+    The C++ side (TVMFFIAnyViewToOwnedAny) reads ``*v_ptr`` and writes NULL
+    back, transferring the +1 strong ref to the FFI side and leaving the
+    source wrapper with ``chandle == NULL``. After that point ``__dealloc__``
+    can no longer reach the header to clear the canonical-wrapper binding,
+    so if the source is the canonical wrapper for this chandle we eager-
+    detach now.
+    """
+    cdef CObject src = (<object>py_arg).obj
+    TVMFFIPyDetachPyObject(src.chandle, <PyObject*>src)

Review Comment:
   comment in here, 
   ```
   # need to detach from chandle
   # there are two possible outcomes after all:
   # chandle gets moved so it is set to NULL
   # callee did not move chandle; in such a case, src.chandle is valid but 
chandle is not attached to PyObject
   # we need to carefully handle chandle and PyObject recycling in both cases
   # This logic is implemented in TVMFFIPyTPFinalize
   ```



##########
python/tvm_ffi/cython/function.pxi:
##########
@@ -62,8 +62,25 @@ cdef inline object make_ret_small_bytes(TVMFFIAny result):
     return bytearray_to_bytes(&bytes)
 
 
-cdef inline object make_ret(TVMFFIAny result, const DLPackExchangeAPI* 
c_ctx_dlpack_api = NULL):
-    """convert result to return value."""
+cdef inline object make_ret(
+    TVMFFIAny result,
+    const DLPackExchangeAPI* c_ctx_dlpack_api = NULL,
+):
+    """Convert ``result`` to a Python return value.

Review Comment:
   no need for doc change



##########
src/ffi/custom_allocator.cc:
##########
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * \file src/ffi/custom_allocator.cc
+ * \brief Process-wide registry for the custom Object allocator and the
+ *        builtin default allocator behind it. See ObjAllocatorBase /
+ *        Handler::New in <tvm/ffi/memory.h> for the allocator contract.
+ */
+#include <tvm/ffi/base_details.h>
+#include <tvm/ffi/c_api.h>
+#include <tvm/ffi/error.h>
+#include <tvm/ffi/function.h>
+#include <tvm/ffi/memory.h>
+
+#include <atomic>
+
+namespace tvm {
+namespace ffi {
+namespace {
+
+void BuiltinDefaultDeleteSpace(void* tptr) {
+  details::AlignedFree(static_cast<char*>(tptr) - 
details::kObjectAllocHeaderOffset);
+}
+
+void* BuiltinDefaultAllocate(size_t size, size_t alignment, int32_t 
/*type_index*/,
+                             void* /*context*/) {
+  // Total bytes between malloc start and T must be a multiple of `alignment`
+  // (so T is aligned). The header is `kObjectAllocHeaderOffset` bytes; if
+  // that's already enough, no leading pad. Otherwise round up.
+  // ``alignment`` is bounded by ``alignof(max_align_t)`` per Handler::New's
+  // static_assert, so AlignedAlloc<alignof(max_align_t)> (plain malloc)
+  // returns memory aligned for any T we allocate here.
+  const size_t total_offset =
+      (details::kObjectAllocHeaderOffset + alignment - 1) & ~(alignment - 1);
+  const size_t total_size = total_offset + size;
+  void* base_alloc = 
details::AlignedAlloc<alignof(::std::max_align_t)>(total_size);
+  void* tptr = static_cast<char*>(base_alloc) + total_offset;
+  details::GetObjectAllocHeader(tptr)->delete_space = 
&BuiltinDefaultDeleteSpace;
+  return tptr;
+}
+
+class CustomAllocatorRegistry {
+ public:
+  CustomAllocatorRegistry() {
+    current_.store(BuiltinDefault(), std::memory_order_release);
+  }
+
+  TVMFFICustomAllocator* Get() const {
+    return current_.load(std::memory_order_acquire);
+  }
+
+  void Set(TVMFFICustomAllocator* allocator) {
+    current_.store(allocator != nullptr ? allocator : BuiltinDefault(), 
std::memory_order_release);
+  }
+
+  static CustomAllocatorRegistry* Global() {
+    static CustomAllocatorRegistry inst;
+    return &inst;
+  }
+
+ private:
+  static TVMFFICustomAllocator* BuiltinDefault() {
+    static TVMFFICustomAllocator builtin{&BuiltinDefaultAllocate, 
/*context=*/nullptr};
+    return &builtin;
+  }
+
+  std::atomic<TVMFFICustomAllocator*> current_{nullptr};

Review Comment:
   assume sync in other places, do not use atomic



##########
include/tvm/ffi/memory.h:
##########
@@ -92,6 +93,32 @@ TVM_FFI_INLINE void AlignedFree(void* data) {
 #endif
 }
 
+/*!
+ * \brief Fixed offset between the base TVMFFIObjectAllocHeader and the
+ *        embedded T (= Object = TVMFFIObject).
+ *
+ * The C++ deleter recovers the base header from a chandle by walking back
+ * exactly this many bytes: ``tptr - kObjectAllocHeaderOffset``. Frontends
+ * with a derived layout add their extra fields ahead of the base.
+ */
+constexpr size_t kObjectAllocHeaderOffset = sizeof(TVMFFIObjectAllocHeader);

Review Comment:
   inline plus comment on use site



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``tagged_wrapper == (uintptr_t)wrapper | 1``
+//             ``wrapper`` is the canonical Python wrapper for the chandle.
+//             ``wrapper.chandle == chandle``. Wrapper holds +1 on chandle
+//             (released by ``__dealloc__``).
+//   Inactive: ``tagged_wrapper == (uintptr_t)wrapper``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``), but ``wrapper.chandle``
+//             has been cleared. The wrapper does NOT hold +1 on chandle;
+//             the chandle stays alive only via other C++ holders.
+//
+// Transitions (exposed as the four ``TVMFFIPy*`` helpers below)
+// --------------------------------------------------------------
+//   Init   -> Active    ``TVMFFIPyAttachPyObject`` after a fresh
+//                       ``cls.__new__(cls)`` in ``make_ret_object``.
+//   Active -> Active    ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       cached wrapper; caller INCREFs and DecRefs the
+//                       redundant +1 on chandle.
+//   Inactive -> Active  ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       preserved wrapper. Caller re-binds
+//                       ``wrapper.chandle = chandle`` (transferring its
+//                       +1) and calls ``TVMFFIPyAttachPyObject`` to set
+//                       the live bit.
+//   Active -> Inactive  ``TVMFFIPyTPFinalize`` when the wrapper's Python
+//                       refcount hits 0 while other C++ holders keep the
+//                       chandle alive: clears the live bit, DECREFs
+//                       chandle, Py_INCREFs self (PEP-442 resurrection).
+//   Active -> Init      ``TVMFFIPyTPFinalize`` when this wrapper is the
+//                       last C++ holder: clears tagged_wrapper, falls
+//                       through to ``__dealloc__`` for normal cleanup.
+//   Active -> Init      ``TVMFFIPyDetachPyObject`` from ``__dealloc__``
+//                       and from the rvalue-ref move setter (eager-detach
+//                       before the C++ side nulls the source chandle).
+//
+// Chandle deleter (Inactive reclamation)
+// --------------------------------------
+// When chandle's refcount finally hits 0 in the Inactive state, the C++
+// Weak deleter calls ``base.delete_space`` -> ``TVMFFIPyDeleteSpace``.
+// The deleter notices the preserved wrapper (``tagged_wrapper != 0``),
+// acquires the GIL, and frees the wrapper memory with ``PyObject_GC_Del``
+// before releasing the malloc block.
+//
+// Shutdown guard
+// --------------
+// ``g_tvm_ffi_python_alive`` is flipped to 0 by an atexit hook registered
+// from Cython module init. After that point, Inactive wrapper bytes are
+// intentionally leaked (process exiting; OS reclaims) rather than
+// reaching for ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. Single tagged-pointer field encodes
+ *        the Init / Active / Inactive states. ``base.delete_space`` sits
+ *        at exactly ``tptr - sizeof(TVMFFIObjectAllocHeader)`` so the
+ *        generic C++ deleter (which knows nothing about Python) can find
+ *        it.
+ */
+struct PyCustomAllocHeader {
+  uintptr_t tagged_wrapper;        // see state encoding
+  TVMFFIObjectAllocHeader base;    // delete_space; at tptr - 8
+};
+
+constexpr uintptr_t kPyHeaderLiveTag = 1;
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at tptr = malloc + 16 is naturally 
"
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at tptr - sizeof(TVMFFIObjectAllocHeader) for the 
"
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* tptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(tptr) - 
kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* tptr);
+
+// Atexit-driven shutdown guard. Flipped to 0 by an atexit hook registered
+// from Cython module init; read by ``TVMFFIPyDeleteSpace`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+static volatile int g_tvm_ffi_python_alive = 1;
+
+inline void TVMFFIPyMarkPythonFinalizing() noexcept { g_tvm_ffi_python_alive = 
0; }
+
+/*!
+ * \brief True iff ``chandle`` was allocated through the Python custom
+ *        allocator (full ``PyCustomAllocHeader`` ahead of it). False for
+ *        allocations that came through libtvm_ffi's builtin default
+ *        (only the base ``TVMFFIObjectAllocHeader``).
+ *
+ * Detection is by comparing ``base.delete_space`` against
+ * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter
+ * pointer, so multiple frontends can coexist without a flag bit on
+ * ``TVMFFIObject``.
+ */
+TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) {
+  if (chandle == nullptr) return false;
+  TVMFFIObjectAllocHeader* base = reinterpret_cast<TVMFFIObjectAllocHeader*>(
+      static_cast<char*>(chandle) - sizeof(TVMFFIObjectAllocHeader));
+  return base->delete_space == &TVMFFIPyDeleteSpace;
+}
+
+//---------------------------------------------------------------
+// The four state-machine transitions.
+//---------------------------------------------------------------
+
+/*!
+ * \brief Try to fetch the canonical wrapper bound to ``chandle``.
+ *
+ * Returns a borrowed PyObject* in the Active or Inactive state; returns
+ * NULL in the Init state or for chandles without a Python alloc header.
+ * Does NOT mutate state; the Inactive -> Active revival's
+ * ``tagged_wrapper`` re-tagging happens via ``TVMFFIPyAttachPyObject``
+ * after the caller rebinds ``wrapper.chandle``.
+ */
+TVM_FFI_INLINE PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) {
+  if (!TVMFFIPyIsCanonical(chandle)) return nullptr;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  if (h->tagged_wrapper == 0) return nullptr;
+  return reinterpret_cast<PyObject*>(h->tagged_wrapper & ~kPyHeaderLiveTag);
+}
+
+/*!
+ * \brief Bind ``obj`` to ``chandle`` as the canonical wrapper (-> Active).
+ *
+ * Used both for fresh allocation (Init -> Active) and for Inactive ->
+ * Active revival after the caller has re-set ``obj.chandle = chandle``.
+ *
+ * No-op for chandles without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  TVMFFIPyHeader(chandle)->tagged_wrapper = reinterpret_cast<uintptr_t>(obj) | 
kPyHeaderLiveTag;
+}
+
+/*!
+ * \brief Clear the canonical-wrapper binding (Active -> Init).
+ *
+ * No-op when ``obj`` is not the live canonical wrapper for ``chandle``
+ * (Init, Inactive, stale post-move wrapper, or non-Python alloc header).
+ * Used by ``__dealloc__`` and by the rvalue-ref move setter to
+ * eager-detach before the C++ side nulls the source chandle.
+ */
+TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper == expected) {
+    h->tagged_wrapper = 0;
+  }
+}
+
+/*!
+ * \brief tp_finalize hook. Picks one of two transitions:
+ *
+ *   Active -> Inactive: when other C++ holders keep the chandle alive,
+ *     resurrect the wrapper so a future re-fetch can rebind. Clears the
+ *     live bit, nulls ``*chandle_field``, releases the wrapper's +1 on
+ *     chandle, and Py_INCREFs ``self`` (PEP-442 resurrection). The
+ *     phantom +1 is later reclaimed by ``TVMFFIPyDeleteSpace`` when
+ *     chandle dies.
+ *
+ *   Active -> Init: when the wrapper is the last C++ holder, do nothing
+ *     here and let ``__dealloc__`` run normally. ``__dealloc__`` calls
+ *     ``TVMFFIPyDetachPyObject`` and DECREFs the chandle.
+ *
+ * Why null ``*chandle_field`` BEFORE the DECREF on the Inactive path:
+ *   - It is the Inactive sentinel that ``make_ret_object`` reads on
+ *     revival (``cached.chandle == NULL`` -> rebind & transfer caller's
+ *     +1). A stale non-NULL slot would mis-route to the Active branch
+ *     and double-drop the caller's +1.
+ *   - It keeps the invariant ``wrapper.chandle != NULL implies wrapper
+ *     owns +1 on chandle`` true at every observable point: any reader
+ *     of ``self.chandle`` (hash, same_as, repr, ...) sees NULL before
+ *     any deleter the DECREF may trigger gets a chance to run.
+ */
+TVM_FFI_INLINE void TVMFFIPyTPFinalize(void** chandle_field, PyObject* self) {
+  void* chandle = *chandle_field;
+  if (chandle == nullptr) return;
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(self) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper != expected) return;
+  // Read strong count without atomic — under classic CPython the GIL
+  // serializes Python-side access, and C++-side DecRefs cross the FFI
+  // boundary which cannot run while we hold the GIL here.
+  uint64_t strong_count =
+      reinterpret_cast<TVMFFIObject*>(chandle)->combined_ref_count & 
0xFFFFFFFFu;
+  if (strong_count <= 1) {
+    // Active -> Init: we hold the last strong ref; let __dealloc__ run
+    // normally and detach the binding there.

Review Comment:
   Call detach here then go directly to `__dealloc__`



##########
include/tvm/ffi/c_api.h:
##########
@@ -580,6 +580,80 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj);
 TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index,
                                          void (*deleter)(void* handle), 
TVMFFIObjectHandle* out);
 
+//-----------------------------------------------------------------------
+// Section: Custom allocator hook for Object allocations
+//
+// Every allocation of a ref-counted Object goes through the registered
+// TVMFFICustomAllocator. The allocator places a TVMFFIObjectAllocHeader
+// immediately before the returned T pointer; the C++ Weak deleter recovers
+// the header and invokes ``delete_space`` to release the whole block.
+//
+// libtvm_ffi installs a builtin default allocator at registry init so every
+// Object always has at least the base header. Frontends (Python via the
+// Cython extension, Rust binding, etc.) override the global allocator with
+// TVMFFISetCustomAllocator when they need richer per-allocation bookkeeping
+// (e.g. Python's wrapper back-pointer).
+//-----------------------------------------------------------------------
+/*!
+ * \brief Mandatory header placed immediately before each TVMFFIObject body.

Review Comment:
   Mandatory header placed immediately before each TVMFFIObject body.
   
   This header may be used by TVMFFIObject::deleter to reclaim space when a custom 
allocator is present. It can also be set to NULL if TVMFFIObject::deleter 
directly calls system free. This section must be available for each Object so 
frontends can rely on this field to confirm whether the object comes from a certain 
allocator.



##########
include/tvm/ffi/c_api.h:
##########
@@ -580,6 +580,80 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj);
 TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index,
                                          void (*deleter)(void* handle), 
TVMFFIObjectHandle* out);
 
+//-----------------------------------------------------------------------
+// Section: Custom allocator hook for Object allocations
+//
+// Every allocation of a ref-counted Object goes through the registered
+// TVMFFICustomAllocator. The allocator places a TVMFFIObjectAllocHeader
+// immediately before the returned T pointer; the C++ Weak deleter recovers
+// the header and invokes ``delete_space`` to release the whole block.
+//
+// libtvm_ffi installs a builtin default allocator at registry init so every
+// Object always has at least the base header. Frontends (Python via the
+// Cython extension, Rust binding, etc.) override the global allocator with
+// TVMFFISetCustomAllocator when they need richer per-allocation bookkeeping
+// (e.g. Python's wrapper back-pointer).
+//-----------------------------------------------------------------------
+/*!
+ * \brief Mandatory header placed immediately before each TVMFFIObject body.
+ *
+ * The C++ deleter recovers this header by walking back exactly
+ * ``sizeof(TVMFFIObjectAllocHeader)`` bytes from the T pointer and invokes
+ * ``delete_space``. Frontends with a derived layout add extra fields ahead
+ * of the base; the derived ``delete_space`` callback recovers its enclosing
+ * struct from ``tptr``.
+ */
+typedef struct {
+  /*!
+   * \brief Free the allocation that contains ``tptr``.
+   * \param tptr The pointer originally returned by

Review Comment:
   ptr, the pointer space to object.
   
   \note ptr points to space of TVMFFIObject and does not include 
TVMFFIObjectAllocHeader



##########
include/tvm/ffi/c_api.h:
##########
@@ -580,6 +580,80 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj);
 TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index,
                                          void (*deleter)(void* handle), 
TVMFFIObjectHandle* out);
 
+//-----------------------------------------------------------------------
+// Section: Custom allocator hook for Object allocations
+//
+// Every allocation of a ref-counted Object goes through the registered
+// TVMFFICustomAllocator. The allocator places a TVMFFIObjectAllocHeader
+// immediately before the returned T pointer; the C++ Weak deleter recovers
+// the header and invokes ``delete_space`` to release the whole block.
+//
+// libtvm_ffi installs a builtin default allocator at registry init so every
+// Object always has at least the base header. Frontends (Python via the
+// Cython extension, Rust binding, etc.) override the global allocator with
+// TVMFFISetCustomAllocator when they need richer per-allocation bookkeeping
+// (e.g. Python's wrapper back-pointer).
+//-----------------------------------------------------------------------
+/*!
+ * \brief Mandatory header placed immediately before each TVMFFIObject body.
+ *
+ * The C++ deleter recovers this header by walking back exactly
+ * ``sizeof(TVMFFIObjectAllocHeader)`` bytes from the T pointer and invokes
+ * ``delete_space``. Frontends with a derived layout add extra fields ahead
+ * of the base; the derived ``delete_space`` callback recovers its enclosing
+ * struct from ``tptr``.
+ */
+typedef struct {
+  /*!
+   * \brief Free the allocation that contains ``tptr``.
+   * \param tptr The pointer originally returned by
+   *             ``TVMFFICustomAllocator::allocate`` (i.e. the address of T,
+   *             not the start of the underlying malloc block).
+   */
+  void (*delete_space)(void* tptr);
+} TVMFFIObjectAllocHeader;
+
+/*!
+ * \brief Custom allocator entry registered with TVMFFISetCustomAllocator.
+ */
+typedef struct {
+  /*!
+   * \brief Allocate ``size`` bytes for an Object of ``type_index`` with the
+   *        requested ``alignment``. Implementations must place a
+   *        TVMFFIObjectAllocHeader immediately before the returned pointer
+   *        with a non-NULL ``delete_space``.
+   * \param size The total size requested for the object body.
+   * \param alignment The alignment requirement for the object body.
+   * \param type_index Type index of the object.
+   * \param context The ``context`` field of the registered allocator.
+   * \return Pointer to T's location (treated as TVMFFIObject* by the caller).
+   */
+  void* (*allocate)(size_t size, size_t alignment, int32_t type_index, void* 
context);
+  /*! \brief Allocator context passed unmodified to ``allocate``. */
+  void* context;
+} TVMFFICustomAllocator;
+
+/*!
+ * \brief Get the process-wide custom allocator.
+ *
+ * libtvm_ffi installs a builtin default at registry init, so the result is

Review Comment:
   \note TVMFFIGetCustomAllocator always returns a valid allocator and can be 
overridden by TVMFFISetCustomAllocator.
   
   



##########
include/tvm/ffi/memory.h:
##########
@@ -168,7 +195,19 @@ class SimpleObjAllocator : public 
ObjAllocatorBase<SimpleObjAllocator> {
       // class with non-virtual destructor.
       // We are fine here as we captured the right deleter during construction.
       // This is also the right way to get storage type for an object pool.
-      void* data = AlignedAlloc<alignof(T)>(sizeof(T));
+
+      // Route through the custom-allocator hook. libtvm_ffi installs a

Review Comment:
   If TVMFFIGetCustomAllocator() is NULL, go with the original AlignedAlloc but 
prepend a TVMFFIObjectAllocHeader whose delete_space is set to NULL



##########
include/tvm/ffi/c_api.h:
##########
@@ -580,6 +580,80 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj);
 TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index,
                                          void (*deleter)(void* handle), 
TVMFFIObjectHandle* out);
 
+//-----------------------------------------------------------------------
+// Section: Custom allocator hook for Object allocations
+//
+// Every allocation of a ref-counted Object goes through the registered
+// TVMFFICustomAllocator. The allocator places a TVMFFIObjectAllocHeader
+// immediately before the returned T pointer; the C++ Weak deleter recovers
+// the header and invokes ``delete_space`` to release the whole block.
+//
+// libtvm_ffi installs a builtin default allocator at registry init so every
+// Object always has at least the base header. Frontends (Python via the
+// Cython extension, Rust binding, etc.) override the global allocator with
+// TVMFFISetCustomAllocator when they need richer per-allocation bookkeeping
+// (e.g. Python's wrapper back-pointer).
+//-----------------------------------------------------------------------
+/*!
+ * \brief Mandatory header placed immediately before each TVMFFIObject body.
+ *
+ * The C++ deleter recovers this header by walking back exactly
+ * ``sizeof(TVMFFIObjectAllocHeader)`` bytes from the T pointer and invokes
+ * ``delete_space``. Frontends with a derived layout add extra fields ahead
+ * of the base; the derived ``delete_space`` callback recovers its enclosing
+ * struct from ``tptr``.
+ */
+typedef struct {
+  /*!
+   * \brief Free the allocation that contains ``tptr``.
+   * \param tptr The pointer originally returned by
+   *             ``TVMFFICustomAllocator::allocate`` (i.e. the address of T,
+   *             not the start of the underlying malloc block).
+   */
+  void (*delete_space)(void* tptr);
+} TVMFFIObjectAllocHeader;
+
+/*!
+ * \brief Custom allocator entry registered with TVMFFISetCustomAllocator.
+ */
+typedef struct {
+  /*!
+   * \brief Allocate ``size`` bytes for an Object of ``type_index`` with the
+   *        requested ``alignment``. Implementations must place a
+   *        TVMFFIObjectAllocHeader immediately before the returned pointer
+   *        with a non-NULL ``delete_space``.
+   * \param size The total size requested for the object body.
+   * \param alignment The alignment requirement for the object body.
+   * \param type_index Type index of the object.
+   * \param context The ``context`` field of the registered allocator.
+   * \return Pointer to T's location (treated as TVMFFIObject* by the caller).
+   */
+  void* (*allocate)(size_t size, size_t alignment, int32_t type_index, void* 
context);
+  /*! \brief Allocator context passed unmodified to ``allocate``. */
+  void* context;
+} TVMFFICustomAllocator;
+
+/*!
+ * \brief Get the process-wide custom allocator.
+ *
+ * libtvm_ffi installs a builtin default at registry init, so the result is
+ * never NULL in practice. Frontends can override the global allocator via
+ * TVMFFISetCustomAllocator.
+ *
+ * \return The currently registered allocator.
+ */
+TVM_FFI_DLL TVMFFICustomAllocator* TVMFFIGetCustomAllocator(void);
+
+/*!
+ * \brief Register the process-wide custom allocator. Pass ``allocator==NULL``
+ *        to restore the builtin default.

Review Comment:
   \note The allocator must remain alive for the lifetime of the process.



##########
include/tvm/ffi/memory.h:
##########
@@ -92,6 +93,32 @@ TVM_FFI_INLINE void AlignedFree(void* data) {
 #endif
 }
 
+/*!
+ * \brief Fixed offset between the base TVMFFIObjectAllocHeader and the
+ *        embedded T (= Object = TVMFFIObject).
+ *
+ * The C++ deleter recovers the base header from a chandle by walking back
+ * exactly this many bytes: ``tptr - kObjectAllocHeaderOffset``. Frontends
+ * with a derived layout add their extra fields ahead of the base.
+ */
+constexpr size_t kObjectAllocHeaderOffset = sizeof(TVMFFIObjectAllocHeader);
+static_assert(alignof(::std::max_align_t) % alignof(TVMFFIObjectAllocHeader) 
== 0,
+              "base header alignment must divide max_align_t so it remains "
+              "aligned when placed just before a max-aligned T");
+
+/*!
+ * \brief Recover the base TVMFFIObjectAllocHeader from a TVMFFIObject pointer.
+ *
+ * Every Object allocation is routed through the custom-allocator registry
+ * (libtvm_ffi installs a builtin default at registry init), so this is
+ * always valid for chandles produced by ``make_object`` /
+ * ``make_inplace_array_object``.
+ */
+TVM_FFI_INLINE TVMFFIObjectAllocHeader* GetObjectAllocHeader(void* tptr) {
+  return reinterpret_cast<TVMFFIObjectAllocHeader*>(static_cast<char*>(tptr) -

Review Comment:
   Go to ObjectUnsafe::GetObjectAllocHeaderFromPtr(void*)



##########
python/tvm_ffi/cython/base.pxi:
##########
@@ -15,12 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 import ctypes
-from libc.stdint cimport int32_t, int64_t, uint64_t, uint32_t, uint8_t, int16_t
+from libc.stdint cimport int32_t, int64_t, uint64_t, uint32_t, uint8_t, 
int16_t, uintptr_t
 from libc.string cimport memcpy
 from libcpp.vector cimport vector
 from cpython.bytes cimport PyBytes_AsStringAndSize, PyBytes_FromStringAndSize, 
PyBytes_AsString
 from cpython cimport Py_INCREF, Py_DECREF, Py_REFCNT
+from cpython.object cimport PyTypeObject
 from cpython cimport PyErr_CheckSignals, PyGILState_Ensure, 
PyGILState_Release, PyObject
+
+cdef extern from "Python.h":
+    void Py_IncRef(PyObject*) nogil

Review Comment:
   likely not needed



##########
python/tvm_ffi/cython/function.pxi:
##########
@@ -544,10 +561,19 @@ cdef int TVMFFIPyArgSetterObjectRValueRef_(
     TVMFFIPyArgSetter* handle, TVMFFIPyCallContext* ctx,
     PyObject* py_arg, TVMFFIAny* out
 ) except -1:
-    """Setter for ObjectRValueRef"""
-    cdef object arg = <object>py_arg
+    """Setter for ObjectRValueRef.
+
+    The C++ side (TVMFFIAnyViewToOwnedAny) reads ``*v_ptr`` and writes NULL
+    back, transferring the +1 strong ref to the FFI side and leaving the

Review Comment:
   remove



##########
python/tvm_ffi/cython/base.pxi:
##########
@@ -162,6 +167,16 @@ cdef extern from "tvm/ffi/c_api.h":
         uint32_t __padding
         void (*deleter)(void* self, int flags)
 
+    ctypedef struct TVMFFIObjectAllocHeader:

Review Comment:
   Not needed; they are only needed in tvm_ffi_python_helpers.h.



##########
include/tvm/ffi/c_api.h:
##########
@@ -580,6 +580,80 @@ TVM_FFI_DLL int TVMFFIObjectDecRef(TVMFFIObjectHandle obj);
 TVM_FFI_DLL int TVMFFIObjectCreateOpaque(void* handle, int32_t type_index,
                                          void (*deleter)(void* handle), 
TVMFFIObjectHandle* out);
 
+//-----------------------------------------------------------------------
+// Section: Custom allocator hook for Object allocations
+//
+// Every allocation of a ref-counted Object goes through the registered
+// TVMFFICustomAllocator. The allocator places a TVMFFIObjectAllocHeader
+// immediately before the returned T pointer; the C++ Weak deleter recovers
+// the header and invokes ``delete_space`` to release the whole block.
+//
+// libtvm_ffi installs a builtin default allocator at registry init so every
+// Object always has at least the base header. Frontends (Python via the
+// Cython extension, Rust binding, etc.) override the global allocator with
+// TVMFFISetCustomAllocator when they need richer per-allocation bookkeeping
+// (e.g. Python's wrapper back-pointer).
+//-----------------------------------------------------------------------
+/*!
+ * \brief Mandatory header placed immediately before each TVMFFIObject body.
+ *
+ * The C++ deleter recovers this header by walking back exactly
+ * ``sizeof(TVMFFIObjectAllocHeader)`` bytes from the T pointer and invokes
+ * ``delete_space``. Frontends with a derived layout add extra fields ahead
+ * of the base; the derived ``delete_space`` callback recovers its enclosing
+ * struct from ``tptr``.
+ */
+typedef struct {
+  /*!
+   * \brief Free the allocation that contains ``tptr``.

Review Comment:
   Free the allocation



##########
python/tvm_ffi/cython/base.pxi:
##########
@@ -361,6 +376,17 @@ def _env_get_current_stream(int device_type, int 
device_id):
 
 
 cdef extern from "tvm_ffi_python_helpers.h":
+    # PyObject-tying state machine, fully implemented in helpers.h.
+    # See the comment block in tvm_ffi_python_helpers.h for the layout
+    # and the Init / Active / Inactive state semantics.
+    int TVMFFIPyRegisterDefaultAllocator() noexcept
+    void TVMFFIPyMarkPythonFinalizing() noexcept
+
+    PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) noexcept
+    void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) noexcept
+    void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) noexcept
+    void TVMFFIPyTPFinalize(void** chandle_field, PyObject* self) noexcept

Review Comment:
   avoid keyword self, ptr_to_chandle



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``tagged_wrapper == (uintptr_t)wrapper | 1``
+//             ``wrapper`` is the canonical Python wrapper for the chandle.
+//             ``wrapper.chandle == chandle``. Wrapper holds +1 on chandle
+//             (released by ``__dealloc__``).
+//   Inactive: ``tagged_wrapper == (uintptr_t)wrapper``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``), but ``wrapper.chandle``
+//             has been cleared. The wrapper does NOT hold +1 on chandle;
+//             the chandle stays alive only via other C++ holders.
+//
+// Transitions (exposed as the four ``TVMFFIPy*`` helpers below)
+// --------------------------------------------------------------
+//   Init   -> Active    ``TVMFFIPyAttachPyObject`` after a fresh
+//                       ``cls.__new__(cls)`` in ``make_ret_object``.
+//   Active -> Active    ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       cached wrapper; caller INCREFs and DecRefs the
+//                       redundant +1 on chandle.
+//   Inactive -> Active  ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       preserved wrapper. Caller re-binds
+//                       ``wrapper.chandle = chandle`` (transferring its
+//                       +1) and calls ``TVMFFIPyAttachPyObject`` to set
+//                       the live bit.
+//   Active -> Inactive  ``TVMFFIPyTPFinalize`` when the wrapper's Python
+//                       refcount hits 0 while other C++ holders keep the
+//                       chandle alive: clears the live bit, DECREFs
+//                       chandle, Py_INCREFs self (PEP-442 resurrection).
+//   Active -> Init      ``TVMFFIPyTPFinalize`` when this wrapper is the
+//                       last C++ holder: clears tagged_wrapper, falls
+//                       through to ``__dealloc__`` for normal cleanup.
+//   Active -> Init      ``TVMFFIPyDetachPyObject`` from ``__dealloc__``
+//                       and from the rvalue-ref move setter (eager-detach
+//                       before the C++ side nulls the source chandle).
+//
+// Chandle deleter (Inactive reclamation)
+// --------------------------------------
+// When chandle's refcount finally hits 0 in the Inactive state, the C++
+// Weak deleter calls ``base.delete_space`` -> ``TVMFFIPyDeleteSpace``.
+// The deleter notices the preserved wrapper (``tagged_wrapper != 0``),
+// acquires the GIL, and frees the wrapper memory with ``PyObject_GC_Del``
+// before releasing the malloc block.
+//
+// Shutdown guard
+// --------------
+// ``g_tvm_ffi_python_alive`` is flipped to 0 by an atexit hook registered
+// from Cython module init. After that point, Inactive wrapper bytes are
+// intentionally leaked (process exiting; OS reclaims) rather than
+// reaching for ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. Single tagged-pointer field encodes
+ *        the Init / Active / Inactive states. ``base.delete_space`` sits
+ *        at exactly ``tptr - sizeof(TVMFFIObjectAllocHeader)`` so the
+ *        generic C++ deleter (which knows nothing about Python) can find
+ *        it.
+ */
+struct PyCustomAllocHeader {
+  uintptr_t tagged_wrapper;        // see state encoding
+  TVMFFIObjectAllocHeader base;    // delete_space; at tptr - 8
+};
+
+constexpr uintptr_t kPyHeaderLiveTag = 1;
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at tptr = malloc + 16 is naturally 
"
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at tptr - sizeof(TVMFFIObjectAllocHeader) for the 
"
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* tptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(tptr) - 
kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* tptr);
+
+// Atexit-driven shutdown guard. Flipped to 0 by an atexit hook registered
+// from Cython module init; read by ``TVMFFIPyDeleteSpace`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+static volatile int g_tvm_ffi_python_alive = 1;

Review Comment:
   Use std::atomic<bool>



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``tagged_wrapper == (uintptr_t)wrapper | 1``
+//             ``wrapper`` is the canonical Python wrapper for the chandle.
+//             ``wrapper.chandle == chandle``. Wrapper holds +1 on chandle
+//             (released by ``__dealloc__``).
+//   Inactive: ``tagged_wrapper == (uintptr_t)wrapper``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``), but ``wrapper.chandle``
+//             has been cleared. The wrapper does NOT hold +1 on chandle;
+//             the chandle stays alive only via other C++ holders.
+//
+// Transitions (exposed as the four ``TVMFFIPy*`` helpers below)
+// --------------------------------------------------------------
+//   Init   -> Active    ``TVMFFIPyAttachPyObject`` after a fresh
+//                       ``cls.__new__(cls)`` in ``make_ret_object``.
+//   Active -> Active    ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       cached wrapper; caller INCREFs and DecRefs the
+//                       redundant +1 on chandle.
+//   Inactive -> Active  ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       preserved wrapper. Caller re-binds
+//                       ``wrapper.chandle = chandle`` (transferring its
+//                       +1) and calls ``TVMFFIPyAttachPyObject`` to set
+//                       the live bit.
+//   Active -> Inactive  ``TVMFFIPyTPFinalize`` when the wrapper's Python
+//                       refcount hits 0 while other C++ holders keep the
+//                       chandle alive: clears the live bit, DECREFs
+//                       chandle, Py_INCREFs self (PEP-442 resurrection).
+//   Active -> Init      ``TVMFFIPyTPFinalize`` when this wrapper is the
+//                       last C++ holder: clears tagged_wrapper, falls
+//                       through to ``__dealloc__`` for normal cleanup.
+//   Active -> Init      ``TVMFFIPyDetachPyObject`` from ``__dealloc__``
+//                       and from the rvalue-ref move setter (eager-detach
+//                       before the C++ side nulls the source chandle).
+//
+// Chandle deleter (Inactive reclamation)
+// --------------------------------------
+// When chandle's refcount finally hits 0 in the Inactive state, the C++
+// Weak deleter calls ``base.delete_space`` -> ``TVMFFIPyDeleteSpace``.
+// The deleter notices the preserved wrapper (``tagged_wrapper != 0``),
+// acquires the GIL, and frees the wrapper memory with ``PyObject_GC_Del``
+// before releasing the malloc block.
+//
+// Shutdown guard
+// --------------
+// ``g_tvm_ffi_python_alive`` is flipped to 0 by an atexit hook registered
+// from Cython module init. After that point, Inactive wrapper bytes are
+// intentionally leaked (process exiting; OS reclaims) rather than
+// reaching for ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. Single tagged-pointer field encodes
+ *        the Init / Active / Inactive states. ``base.delete_space`` sits
+ *        at exactly ``tptr - sizeof(TVMFFIObjectAllocHeader)`` so the
+ *        generic C++ deleter (which knows nothing about Python) can find
+ *        it.
+ */
+struct PyCustomAllocHeader {
+  uintptr_t tagged_wrapper;        // see state encoding
+  TVMFFIObjectAllocHeader base;    // delete_space; at tptr - 8
+};
+
+constexpr uintptr_t kPyHeaderLiveTag = 1;
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at tptr = malloc + 16 is naturally 
"
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at tptr - sizeof(TVMFFIObjectAllocHeader) for the 
"
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* tptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(tptr) - 
kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* tptr);
+
+// Atexit-driven shutdown guard. Flipped to 0 by an atexit hook registered
+// from Cython module init; read by ``TVMFFIPyDeleteSpace`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+static volatile int g_tvm_ffi_python_alive = 1;
+
+inline void TVMFFIPyMarkPythonFinalizing() noexcept { g_tvm_ffi_python_alive = 
0; }
+
+/*!
+ * \brief True iff ``chandle`` was allocated through the Python custom
+ *        allocator (full ``PyCustomAllocHeader`` ahead of it). False for
+ *        allocations that came through libtvm_ffi's builtin default
+ *        (only the base ``TVMFFIObjectAllocHeader``).
+ *
+ * Detection is by comparing ``base.delete_space`` against
+ * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter
+ * pointer, so multiple frontends can coexist without a flag bit on
+ * ``TVMFFIObject``.
+ */
+TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) {
+  if (chandle == nullptr) return false;
+  TVMFFIObjectAllocHeader* base = reinterpret_cast<TVMFFIObjectAllocHeader*>(
+      static_cast<char*>(chandle) - sizeof(TVMFFIObjectAllocHeader));
+  return base->delete_space == &TVMFFIPyDeleteSpace;
+}
+
+//---------------------------------------------------------------
+// The four state-machine transitions.
+//---------------------------------------------------------------
+
+/*!
+ * \brief Try to fetch the canonical wrapper bound to ``chandle``.
+ *
+ * Returns a borrowed PyObject* in the Active or Inactive state; returns
+ * NULL in the Init state or for chandles without a Python alloc header.
+ * Does NOT mutate state; the Inactive -> Active revival's
+ * ``tagged_wrapper`` re-tagging happens via ``TVMFFIPyAttachPyObject``
+ * after the caller rebinds ``wrapper.chandle``.
+ */
+TVM_FFI_INLINE PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) {
+  if (!TVMFFIPyIsCanonical(chandle)) return nullptr;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  if (h->tagged_wrapper == 0) return nullptr;
+  return reinterpret_cast<PyObject*>(h->tagged_wrapper & ~kPyHeaderLiveTag);
+}
+
+/*!
+ * \brief Bind ``obj`` to ``chandle`` as the canonical wrapper (-> Active).
+ *
+ * Used both for fresh allocation (Init -> Active) and for Inactive ->
+ * Active revival after the caller has re-set ``obj.chandle = chandle``.
+ *
+ * No-op for chandles without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  TVMFFIPyHeader(chandle)->tagged_wrapper = reinterpret_cast<uintptr_t>(obj) | 
kPyHeaderLiveTag;
+}
+
+/*!
+ * \brief Clear the canonical-wrapper binding (Active -> Init).
+ *
+ * No-op when ``obj`` is not the live canonical wrapper for ``chandle``
+ * (Init, Inactive, stale post-move wrapper, or non-Python alloc header).
+ * Used by ``__dealloc__`` and by the rvalue-ref move setter to
+ * eager-detach before the C++ side nulls the source chandle.
+ */
+TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper == expected) {
+    h->tagged_wrapper = 0;
+  }
+}
+
+/*!
+ * \brief tp_finalize hook. Picks one of two transitions:
+ *
+ *   Active -> Inactive: when other C++ holders keep the chandle alive,
+ *     resurrect the wrapper so a future re-fetch can rebind. Clears the
+ *     live bit, nulls ``*chandle_field``, releases the wrapper's +1 on
+ *     chandle, and Py_INCREFs ``self`` (PEP-442 resurrection). The
+ *     phantom +1 is later reclaimed by ``TVMFFIPyDeleteSpace`` when
+ *     chandle dies.
+ *
+ *   Active -> Init: when the wrapper is the last C++ holder, do nothing
+ *     here and let ``__dealloc__`` run normally. ``__dealloc__`` calls
+ *     ``TVMFFIPyDetachPyObject`` and DECREFs the chandle.
+ *
+ * Why null ``*chandle_field`` BEFORE the DECREF on the Inactive path:
+ *   - It is the Inactive sentinel that ``make_ret_object`` reads on
+ *     revival (``cached.chandle == NULL`` -> rebind & transfer caller's
+ *     +1). A stale non-NULL slot would mis-route to the Active branch
+ *     and double-drop the caller's +1.
+ *   - It keeps the invariant ``wrapper.chandle != NULL implies wrapper
+ *     owns +1 on chandle`` true at every observable point: any reader
+ *     of ``self.chandle`` (hash, same_as, repr, ...) sees NULL before
+ *     any deleter the DECREF may trigger gets a chance to run.
+ */
+TVM_FFI_INLINE void TVMFFIPyTPFinalize(void** chandle_field, PyObject* self) {
+  void* chandle = *chandle_field;
+  if (chandle == nullptr) return;
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(self) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper != expected) return;

Review Comment:
   need explicit comment
   
   // It is possible that the PyObject has already been eagerly detached 
(by a move); in that case we go directly into the normal delete process.



##########
python/tvm_ffi/cython/object.pxi:
##########
@@ -102,8 +102,23 @@ cdef class CObject:
         # case of error before chandle is set
         self.chandle = NULL
 
+    def __del__(self):
+        """tp_finalize hook. Delegates the Active->Inactive revive vs
+        Active->Init cleanup decision to ``TVMFFIPyTPFinalize``. The
+        helper takes ``&self.chandle`` so it can null the field in-line
+        on the Active->Inactive path before DECREFing the chandle,
+        matching the original ordering.
+        """
+        TVMFFIPyTPFinalize(<void**>&self.chandle, <PyObject*>self)
+
     def __dealloc__(self):
+        """Detach the canonical-wrapper binding and DecRef the chandle.
+
+        Reached only when ``__del__`` did NOT enter the Inactive state
+        (NULL chandle, non-canonical wrapper, or last C++ ref).
+        """
         if self.chandle != NULL:
+            TVMFFIPyDetachPyObject(self.chandle, <PyObject*>self)

Review Comment:
   Invariant: when we get here, the chandle is always in the Init state.



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``

Review Comment:
   tagged_wrapper => cached_pyobj



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``

Review Comment:
   Init => Detached



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``tagged_wrapper == (uintptr_t)wrapper | 1``
+//             ``wrapper`` is the canonical Python wrapper for the chandle.
+//             ``wrapper.chandle == chandle``. Wrapper holds +1 on chandle
+//             (released by ``__dealloc__``).
+//   Inactive: ``tagged_wrapper == (uintptr_t)wrapper``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``), but ``wrapper.chandle``
+//             has been cleared. The wrapper does NOT hold +1 on chandle;
+//             the chandle stays alive only via other C++ holders.
+//
+// Transitions (exposed as the four ``TVMFFIPy*`` helpers below)
+// --------------------------------------------------------------
+//   Init   -> Active    ``TVMFFIPyAttachPyObject`` after a fresh

Review Comment:
   At a high level, there are several places that may examine the state:
   
   - `make_ret_object` and other constructor calls 
   - `TVMFFIPyTPFinalize` and possibly dealloc
   - TVMFFIDeleter
   - move
   
   We hope to achieve the following things:
   
   - object identity
   - safe deletion of PyObject
   
   We need to clearly prove that:
   - When a PyObject goes out of scope (not referenced by any Python variable),
     the refcount of pyobj.chandle is always decremented.
   - When a chandle goes out of scope, its attached PyObject (if any) gets
     recycled.
   
   
   
   



##########
python/tvm_ffi/cython/tvm_ffi_python_helpers.h:
##########
@@ -71,6 +72,303 @@
 //  prefixed with TVMFFIPy so they can be easily invoked through Cython.
 
///--------------------------------------------------------------------------------
 
+//================================================================================
+// PyObject-tying state machine.
+//
+// Ties one Python wrapper to one C++ chandle so that
+//   - ``a.x is a.x`` while the wrapper is live;
+//   - ``id(a.x)`` is stable across drop+refetch (when other C++ holders keep
+//     the chandle alive);
+//   - ``f(x) is x`` whenever an FFI function returns a chandle that already
+//     has a canonical wrapper.
+//
+// Layout
+// ------
+// Every Object allocated through the registered Python allocator
+// (`TVMFFIPyAllocate`) is preceded by a 16-byte ``PyCustomAllocHeader``:
+//
+//   malloc start
+//   +-------------------+--------------------------+--------+
+//   | tagged_wrapper    | TVMFFIObjectAllocHeader  |   T    |
+//   |   (offset 0..8)   |   delete_space (8..16)   |        |
+//   +-------------------+--------------------------+--------+
+//                                                  ^ tptr = malloc + 16
+//
+// The single ``tagged_wrapper`` field encodes a 3-state machine via the
+// low bit of the canonical wrapper pointer. CPython PyObjects are at least
+// 8-byte aligned (PyObject_HEAD is 16 B), so the low bit is always free.
+//
+// States
+// ------
+//   Init:     ``tagged_wrapper == 0``
+//             No Python wrapper bound to this chandle.
+//   Active:   ``tagged_wrapper == (uintptr_t)wrapper | 1``
+//             ``wrapper`` is the canonical Python wrapper for the chandle.
+//             ``wrapper.chandle == chandle``. Wrapper holds +1 on chandle
+//             (released by ``__dealloc__``).
+//   Inactive: ``tagged_wrapper == (uintptr_t)wrapper``
+//             Wrapper memory is preserved (Python refcount kept alive by
+//             a phantom +1 from ``tp_finalize``), but ``wrapper.chandle``
+//             has been cleared. The wrapper does NOT hold +1 on chandle;
+//             the chandle stays alive only via other C++ holders.
+//
+// Transitions (exposed as the four ``TVMFFIPy*`` helpers below)
+// --------------------------------------------------------------
+//   Init   -> Active    ``TVMFFIPyAttachPyObject`` after a fresh
+//                       ``cls.__new__(cls)`` in ``make_ret_object``.
+//   Active -> Active    ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       cached wrapper; caller INCREFs and DecRefs the
+//                       redundant +1 on chandle.
+//   Inactive -> Active  ``TVMFFIPyTryGetAttachedPyObject`` returns the
+//                       preserved wrapper. Caller re-binds
+//                       ``wrapper.chandle = chandle`` (transferring its
+//                       +1) and calls ``TVMFFIPyAttachPyObject`` to set
+//                       the live bit.
+//   Active -> Inactive  ``TVMFFIPyTPFinalize`` when the wrapper's Python
+//                       refcount hits 0 while other C++ holders keep the
+//                       chandle alive: clears the live bit, DECREFs
+//                       chandle, Py_INCREFs self (PEP-442 resurrection).
+//   Active -> Init      ``TVMFFIPyTPFinalize`` when this wrapper is the
+//                       last C++ holder: clears tagged_wrapper, falls
+//                       through to ``__dealloc__`` for normal cleanup.
+//   Active -> Init      ``TVMFFIPyDetachPyObject`` from ``__dealloc__``
+//                       and from the rvalue-ref move setter (eager-detach
+//                       before the C++ side nulls the source chandle).
+//
+// Chandle deleter (Inactive reclamation)
+// --------------------------------------
+// When chandle's refcount finally hits 0 in the Inactive state, the C++
+// Weak deleter calls ``base.delete_space`` -> ``TVMFFIPyDeleteSpace``.
+// The deleter notices the preserved wrapper (``tagged_wrapper != 0``),
+// acquires the GIL, and frees the wrapper memory with ``PyObject_GC_Del``
+// before releasing the malloc block.
+//
+// Shutdown guard
+// --------------
+// ``g_tvm_ffi_python_alive`` is flipped to 0 by an atexit hook registered
+// from Cython module init. After that point, Inactive wrapper bytes are
+// intentionally leaked (process exiting; OS reclaims) rather than
+// reaching for ``PyGILState_Ensure`` on a teardown interpreter.
+//================================================================================
+
+/*!
+ * \brief Python-side derived header. Single tagged-pointer field encodes
+ *        the Init / Active / Inactive states. ``base.delete_space`` sits
+ *        at exactly ``tptr - sizeof(TVMFFIObjectAllocHeader)`` so the
+ *        generic C++ deleter (which knows nothing about Python) can find
+ *        it.
+ */
+struct PyCustomAllocHeader {
+  uintptr_t tagged_wrapper;        // see state encoding
+  TVMFFIObjectAllocHeader base;    // delete_space; at tptr - 8
+};
+
+constexpr uintptr_t kPyHeaderLiveTag = 1;
+constexpr size_t kPyHeaderOffset = sizeof(PyCustomAllocHeader);  // 16
+static_assert(kPyHeaderOffset == 16,
+              "header must be 16 bytes so T at tptr = malloc + 16 is naturally 
"
+              "aligned for alignof(T) up to alignof(max_align_t)");
+static_assert(offsetof(PyCustomAllocHeader, base) ==
+                  kPyHeaderOffset - sizeof(TVMFFIObjectAllocHeader),
+              "base must sit at tptr - sizeof(TVMFFIObjectAllocHeader) for the 
"
+              "C++ deleter to find it");
+
+TVM_FFI_INLINE PyCustomAllocHeader* TVMFFIPyHeader(void* tptr) {
+  return reinterpret_cast<PyCustomAllocHeader*>(static_cast<char*>(tptr) - 
kPyHeaderOffset);
+}
+
+// Forward decl; defined below.
+//
+// NOTE: deliberately *not* TVM_FFI_INLINE. TVM_FFI_INLINE expands to
+// [[gnu::always_inline]] which forbids taking the function's address as
+// a stable, callable pointer — and we hand the address to the C++ side
+// (stored in PyCustomAllocHeader::base.delete_space at allocate time).
+inline void TVMFFIPyDeleteSpace(void* tptr);
+
+// Atexit-driven shutdown guard. Flipped to 0 by an atexit hook registered
+// from Cython module init; read by ``TVMFFIPyDeleteSpace`` before
+// ``PyGILState_Ensure`` to avoid touching a teardown interpreter.
+static volatile int g_tvm_ffi_python_alive = 1;
+
+inline void TVMFFIPyMarkPythonFinalizing() noexcept { g_tvm_ffi_python_alive = 
0; }
+
+/*!
+ * \brief True iff ``chandle`` was allocated through the Python custom
+ *        allocator (full ``PyCustomAllocHeader`` ahead of it). False for
+ *        allocations that came through libtvm_ffi's builtin default
+ *        (only the base ``TVMFFIObjectAllocHeader``).
+ *
+ * Detection is by comparing ``base.delete_space`` against
+ * ``TVMFFIPyDeleteSpace``: each frontend recognizes its own deleter
+ * pointer, so multiple frontends can coexist without a flag bit on
+ * ``TVMFFIObject``.
+ */
+TVM_FFI_INLINE bool TVMFFIPyIsCanonical(void* chandle) {
+  if (chandle == nullptr) return false;
+  TVMFFIObjectAllocHeader* base = reinterpret_cast<TVMFFIObjectAllocHeader*>(
+      static_cast<char*>(chandle) - sizeof(TVMFFIObjectAllocHeader));
+  return base->delete_space == &TVMFFIPyDeleteSpace;
+}
+
+//---------------------------------------------------------------
+// The four state-machine transitions.
+//---------------------------------------------------------------
+
+/*!
+ * \brief Try to fetch the canonical wrapper bound to ``chandle``.
+ *
+ * Returns a borrowed PyObject* in the Active or Inactive state; returns
+ * NULL in the Init state or for chandles without a Python alloc header.
+ * Does NOT mutate state; the Inactive -> Active revival's
+ * ``tagged_wrapper`` re-tagging happens via ``TVMFFIPyAttachPyObject``
+ * after the caller rebinds ``wrapper.chandle``.
+ */
+TVM_FFI_INLINE PyObject* TVMFFIPyTryGetAttachedPyObject(void* chandle) {
+  if (!TVMFFIPyIsCanonical(chandle)) return nullptr;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  if (h->tagged_wrapper == 0) return nullptr;
+  return reinterpret_cast<PyObject*>(h->tagged_wrapper & ~kPyHeaderLiveTag);
+}
+
+/*!
+ * \brief Bind ``obj`` to ``chandle`` as the canonical wrapper (-> Active).
+ *
+ * Used both for fresh allocation (Init -> Active) and for Inactive ->
+ * Active revival after the caller has re-set ``obj.chandle = chandle``.
+ *
+ * No-op for chandles without a Python alloc header.
+ */
+TVM_FFI_INLINE void TVMFFIPyAttachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  TVMFFIPyHeader(chandle)->tagged_wrapper = reinterpret_cast<uintptr_t>(obj) | 
kPyHeaderLiveTag;
+}
+
+/*!
+ * \brief Clear the canonical-wrapper binding (Active -> Init).
+ *
+ * No-op when ``obj`` is not the live canonical wrapper for ``chandle``
+ * (Init, Inactive, stale post-move wrapper, or non-Python alloc header).
+ * Used by ``__dealloc__`` and by the rvalue-ref move setter to
+ * eager-detach before the C++ side nulls the source chandle.
+ */
+TVM_FFI_INLINE void TVMFFIPyDetachPyObject(void* chandle, PyObject* obj) {
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(obj) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper == expected) {
+    h->tagged_wrapper = 0;
+  }
+}
+
+/*!
+ * \brief tp_finalize hook. Picks one of two transitions:
+ *
+ *   Active -> Inactive: when other C++ holders keep the chandle alive,
+ *     resurrect the wrapper so a future re-fetch can rebind. Clears the
+ *     live bit, nulls ``*chandle_field``, releases the wrapper's +1 on
+ *     chandle, and Py_INCREFs ``self`` (PEP-442 resurrection). The
+ *     phantom +1 is later reclaimed by ``TVMFFIPyDeleteSpace`` when
+ *     chandle dies.
+ *
+ *   Active -> Init: when the wrapper is the last C++ holder, do nothing
+ *     here and let ``__dealloc__`` run normally. ``__dealloc__`` calls
+ *     ``TVMFFIPyDetachPyObject`` and DECREFs the chandle.
+ *
+ * Why null ``*chandle_field`` BEFORE the DECREF on the Inactive path:
+ *   - It is the Inactive sentinel that ``make_ret_object`` reads on
+ *     revival (``cached.chandle == NULL`` -> rebind & transfer caller's
+ *     +1). A stale non-NULL slot would mis-route to the Active branch
+ *     and double-drop the caller's +1.
+ *   - It keeps the invariant ``wrapper.chandle != NULL implies wrapper
+ *     owns +1 on chandle`` true at every observable point: any reader
+ *     of ``self.chandle`` (hash, same_as, repr, ...) sees NULL before
+ *     any deleter the DECREF may trigger gets a chance to run.
+ */
+TVM_FFI_INLINE void TVMFFIPyTPFinalize(void** chandle_field, PyObject* self) {
+  void* chandle = *chandle_field;
+  if (chandle == nullptr) return;
+  if (!TVMFFIPyIsCanonical(chandle)) return;
+  PyCustomAllocHeader* h = TVMFFIPyHeader(chandle);
+  uintptr_t expected = reinterpret_cast<uintptr_t>(self) | kPyHeaderLiveTag;
+  if (h->tagged_wrapper != expected) return;
+  // Read strong count without atomic — under classic CPython the GIL
+  // serializes Python-side access, and C++-side DecRefs cross the FFI
+  // boundary which cannot run while we hold the GIL here.
+  uint64_t strong_count =
+      reinterpret_cast<TVMFFIObject*>(chandle)->combined_ref_count & 
0xFFFFFFFFu;
+  if (strong_count <= 1) {
+    // Active -> Init: we hold the last strong ref; let __dealloc__ run
+    // normally and detach the binding there.
+    return;
+  }
+  // Active -> Inactive: clear the live bit (pointer kept as the Inactive
+  // sentinel), null self.chandle BEFORE releasing the wrapper's +1, then
+  // resurrect via Py_IncRef.
+  h->tagged_wrapper = reinterpret_cast<uintptr_t>(self);
+  *chandle_field = nullptr;
+  TVMFFIObjectDecRef(chandle);
+  Py_IncRef(self);
+}
+
+//---------------------------------------------------------------
+// Custom allocator hook.
+//---------------------------------------------------------------
+
+/*!
+ * \brief Allocator entry registered with TVMFFISetCustomAllocator at
+ *        Cython module init. Allocates ``kPyHeaderOffset + size`` bytes,
+ *        zero-inits the header to the Init state, wires
+ *        ``base.delete_space = &TVMFFIPyDeleteSpace``, and returns the T
+ *        location.
+ *
+ * Handler::New static_asserts ``alignof(T) <= alignof(max_align_t)``, so
+ * the runtime ``alignment`` is bounded and we can reuse the shared
+ * ``AlignedAlloc<alignof(max_align_t)>`` (plain malloc on POSIX,
+ * ``_aligned_malloc`` on MSVC).
+ */
+inline void* TVMFFIPyAllocate(size_t size, size_t /*alignment*/, int32_t 
/*type_index*/,
+                              void* /*context*/) {
+  void* base_alloc = 
::tvm::ffi::details::AlignedAlloc<alignof(::std::max_align_t)>(
+      kPyHeaderOffset + size);
+  auto* h = static_cast<PyCustomAllocHeader*>(base_alloc);
+  h->tagged_wrapper = 0;  // Init
+  h->base.delete_space = &TVMFFIPyDeleteSpace;
+  return static_cast<char*>(base_alloc) + kPyHeaderOffset;
+}
+
+/*!
+ * \brief delete_space callback installed by TVMFFIPyAllocate. Invoked from
+ *        the C++ Weak deleter when the chandle is freed.
+ *
+ * In the Inactive state, reclaims the preserved wrapper bytes via
+ * PyObject_GC_Del under the GIL before freeing the malloc block. After
+ * Python finalize starts (g_tvm_ffi_python_alive == 0) the wrapper bytes
+ * are intentionally leaked and only the malloc block is freed — process
+ * is exiting.
+ */
+inline void TVMFFIPyDeleteSpace(void* tptr) {

Review Comment:
   extern "C"



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to