This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm-ffi.git
The following commit(s) were added to refs/heads/main by this push:
new d73a267 [BUILD] TVM_FFI_COLD_CODE / TVM_FFI_PREDICT_FALSE macros and
cold-marking of error paths (#589)
d73a267 is described below
commit d73a26783488430986fd855ae67ff7a9fb016413
Author: Tianqi Chen <[email protected]>
AuthorDate: Wed May 13 22:21:33 2026 -0400
[BUILD] TVM_FFI_COLD_CODE / TVM_FFI_PREDICT_FALSE macros and cold-marking
of error paths (#589)
## Summary
Adds three header-only macros (`TVM_FFI_COLD_CODE`,
`TVM_FFI_PREDICT_FALSE`, `TVM_FFI_PREDICT_TRUE`) in
`tvm/ffi/base_details.h` and applies them to a small audited set of
error-only helpers. No CMake changes.
## What
`TVM_FFI_COLD_CODE`: `[[gnu::cold]]` on GCC/Clang, no-op on MSVC.
`TVM_FFI_PREDICT_FALSE` / `TVM_FFI_PREDICT_TRUE`: `__builtin_expect` on
GCC/Clang, no-op on MSVC.
Cold-marked (error / setup / teardown only):
- `details::ErrorBuilder` ctors and the `[[noreturn]]` destructor
- `TVMFFISegFaultHandler`
- `TVMFFIInstallSignalHandler`
- `ForwardPyErrorToFFI`
C ABI exports stay hot per cross-DSO surface hygiene — `TVMFFIError*`
family, `TVMFFIBacktrace`, and `SafeCallContext` setters all remain
ordinary entry points. Deleters are not cold-marked: they run on every
callback destruction during normal program operation, not on an error
path.
`TVM_FFI_PREDICT_FALSE` is applied to `TVM_FFI_CHECK_SAFE_CALL`,
`TVM_FFI_CHECK`, and ~17 error-check branches in the Python→FFI
dispatchers in `tvm_ffi_python_helpers.h`.
## Mechanism
GCC and Clang emit cold-marked functions into per-TU `.text.unlikely`.
The default GNU linker script's `*(.text.unlikely .text.*_unlikely
.text.unlikely.*)` rule gathers them into a contiguous slot inside
`.text`. No `-ffunction-sections` flag required.
Detailed measurements (binary-size matrix, isolation study, perf table,
cold-cluster bounds) posted as a follow-up comment.
---
include/tvm/ffi/base_details.h | 38 +++++++++++++++
include/tvm/ffi/error.h | 7 ++-
include/tvm/ffi/function.h | 2 +-
python/tvm_ffi/cython/tvm_ffi_python_helpers.h | 65 ++++++++++++++++++--------
src/ffi/backtrace.cc | 2 +
src/ffi/error.cc | 1 +
src/ffi/function.cc | 4 +-
7 files changed, 94 insertions(+), 25 deletions(-)
diff --git a/include/tvm/ffi/base_details.h b/include/tvm/ffi/base_details.h
index acbd652..c2ac2b8 100644
--- a/include/tvm/ffi/base_details.h
+++ b/include/tvm/ffi/base_details.h
@@ -76,6 +76,44 @@
#define TVM_FFI_UNREACHABLE() __builtin_unreachable()
#endif
+/*!
+ * \brief Mark a function as cold so the toolchain places it in a
+ * separate cold region of `.text`. Apply to functions that only
+ * run on error / setup / teardown paths.
+ *
+ * On GCC and Clang, expands to `[[gnu::cold]]`, which emits the
+ * function into a per-TU `.text.unlikely` section. The default GNU
+ * linker script gathers `.text.unlikely.*` into a contiguous slot
+ * inside `.text`, so cold-marked functions cluster away from hot
+ * code without any additional CMake flags. On MSVC the attribute
+ * does not exist and the macro is a no-op.
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#define TVM_FFI_COLD_CODE [[gnu::cold]]
+#else
+#define TVM_FFI_COLD_CODE
+#endif
+
+/*!
+ * \brief Branch-prediction / layout hint that the condition is unlikely
+ * to be true. Use on error-checking branches to keep the hot
+ * fall-through contiguous and push the error-handling block to
+ * the function tail.
+ *
+ * if (TVM_FFI_PREDICT_FALSE(rc != 0)) { ...error... }
+ *
+ * On GCC/Clang, expands to `__builtin_expect((cond), 0)`. On MSVC,
+ * expands to `(cond)` (no equivalent builtin; modern MSVC does its own
+ * profile-driven block reordering).
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#define TVM_FFI_PREDICT_FALSE(cond) (__builtin_expect(static_cast<bool>(cond),
0))
+#define TVM_FFI_PREDICT_TRUE(cond) (__builtin_expect(static_cast<bool>(cond),
1))
+#else
+#define TVM_FFI_PREDICT_FALSE(cond) (cond)
+#define TVM_FFI_PREDICT_TRUE(cond) (cond)
+#endif
+
#define TVM_FFI_STR_CONCAT_(__x, __y) __x##__y
#define TVM_FFI_STR_CONCAT(__x, __y) TVM_FFI_STR_CONCAT_(__x, __y)
diff --git a/include/tvm/ffi/error.h b/include/tvm/ffi/error.h
index 1b6cb47..4931703 100644
--- a/include/tvm/ffi/error.h
+++ b/include/tvm/ffi/error.h
@@ -338,11 +338,13 @@ TVM_FFI_INLINE void SetSafeCallRaised(const Error& error)
{
class ErrorBuilder {
public:
+ TVM_FFI_COLD_CODE
explicit ErrorBuilder(std::string kind, std::string backtrace, bool
log_before_throw)
: kind_(std::move(kind)),
backtrace_(std::move(backtrace)),
log_before_throw_(log_before_throw) {}
+ TVM_FFI_COLD_CODE
explicit ErrorBuilder(std::string kind, const TVMFFIByteArray* backtrace,
bool log_before_throw)
: ErrorBuilder(std::move(kind), std::string(backtrace->data,
backtrace->size),
log_before_throw) {}
@@ -353,7 +355,7 @@ class ErrorBuilder {
#pragma warning(disable : 4722)
#endif
// avoid inline to reduce binary size, error throw path do not need to be
fast
- [[noreturn]] ~ErrorBuilder() noexcept(false) {
+ [[noreturn]] TVM_FFI_COLD_CODE ~ErrorBuilder() noexcept(false) {
::tvm::ffi::Error error(std::move(kind_), stream_.str(),
std::move(backtrace_));
if (log_before_throw_) {
std::cerr << error.FullMessage();
@@ -456,7 +458,8 @@ TVM_FFI_CHECK_FUNC(_NE, !=)
TVM_FFI_THROW(ErrorKind) << "Check failed: " << #x " " #op " " #y <<
*__tvm_ffi_log_err << ": "
#define TVM_FFI_CHECK(cond, ErrorKind) \
- if (!(cond)) TVM_FFI_THROW(ErrorKind) << "Check failed: (" #cond << ") is
false: "
+ if (TVM_FFI_PREDICT_FALSE(!(cond))) \
+ TVM_FFI_THROW(ErrorKind) << "Check failed: (" #cond << ") is false: "
#define TVM_FFI_CHECK_LT(x, y, ErrorKind) TVM_FFI_CHECK_BINARY_OP(_LT, <, x,
y, ErrorKind)
#define TVM_FFI_CHECK_GT(x, y, ErrorKind) TVM_FFI_CHECK_BINARY_OP(_GT, >, x,
y, ErrorKind)
diff --git a/include/tvm/ffi/function.h b/include/tvm/ffi/function.h
index 4ec0e00..1100603 100644
--- a/include/tvm/ffi/function.h
+++ b/include/tvm/ffi/function.h
@@ -101,7 +101,7 @@ namespace ffi {
#define TVM_FFI_CHECK_SAFE_CALL(func) \
{ \
int ret_code = (func); \
- if (ret_code != 0) { \
+ if (TVM_FFI_PREDICT_FALSE(ret_code != 0)) { \
throw ::tvm::ffi::details::MoveFromSafeCallRaised(); \
} \
}
diff --git a/python/tvm_ffi/cython/tvm_ffi_python_helpers.h
b/python/tvm_ffi/cython/tvm_ffi_python_helpers.h
index 79494d5..c82b09c 100644
--- a/python/tvm_ffi/cython/tvm_ffi_python_helpers.h
+++ b/python/tvm_ffi/cython/tvm_ffi_python_helpers.h
@@ -36,6 +36,29 @@
#endif
#endif
+// Local mirror of TVM_FFI_COLD_CODE / TVM_FFI_PREDICT_* from
+// <tvm/ffi/base_details.h>. The Cython helper deliberately avoids that header
+// (keeps the include surface c-headers-only), so we duplicate the macro
+// definitions here. Keep these in sync with base_details.h: same expansion on
+// GCC/Clang, no-op on MSVC.
+#ifndef TVM_FFI_COLD_CODE
+#if defined(__GNUC__) || defined(__clang__)
+#define TVM_FFI_COLD_CODE [[gnu::cold]]
+#else
+#define TVM_FFI_COLD_CODE
+#endif
+#endif
+
+#ifndef TVM_FFI_PREDICT_FALSE
+#if defined(__GNUC__) || defined(__clang__)
+#define TVM_FFI_PREDICT_FALSE(cond) (__builtin_expect(static_cast<bool>(cond),
0))
+#define TVM_FFI_PREDICT_TRUE(cond) (__builtin_expect(static_cast<bool>(cond),
1))
+#else
+#define TVM_FFI_PREDICT_FALSE(cond) (cond)
+#define TVM_FFI_PREDICT_TRUE(cond) (cond)
+#endif
+#endif
+
#include <cstring>
#include <exception>
#include <iostream>
@@ -252,7 +275,7 @@ int TVMFFIPyArgSetterInt_(TVMFFIPyArgSetter*,
TVMFFIPyCallContext*, PyObject* ar
out->type_index = kTVMFFIInt;
out->v_int64 = PyLong_AsLongLongAndOverflow(arg, &overflow);
- if (overflow != 0) {
+ if (TVM_FFI_PREDICT_FALSE(overflow != 0)) {
PyErr_SetString(PyExc_OverflowError, "Python int too large to convert to
int64_t");
return -1;
}
@@ -454,7 +477,7 @@ class TVMFFIPyCallManager {
int* c_api_ret_code, bool release_gil,
const DLPackExchangeAPI**
optional_out_ctx_dlpack_api) {
int64_t num_args = PyTuple_Size(py_arg_tuple);
- if (num_args == -1) return -1;
+ if (TVM_FFI_PREDICT_FALSE(num_args == -1)) return -1;
try {
// allocate a call stack
TVMFFIPyCallContext ctx(&call_stack_, num_args);
@@ -462,7 +485,7 @@ class TVMFFIPyCallManager {
for (int64_t i = 0; i < num_args; ++i) {
PyObject* py_arg = PyTuple_GetItem(py_arg_tuple, i);
TVMFFIAny* c_arg = ctx.packed_args + i;
- if (SetArgument(&ctx, py_arg, c_arg) != 0) return -1;
+ if (TVM_FFI_PREDICT_FALSE(SetArgument(&ctx, py_arg, c_arg) != 0))
return -1;
}
TVMFFIStreamHandle prev_stream = nullptr;
DLPackManagedTensorAllocator prev_tensor_allocator = nullptr;
@@ -471,13 +494,13 @@ class TVMFFIPyCallManager {
c_api_ret_code[0] =
TVMFFIEnvSetStream(ctx.device_type, ctx.device_id, ctx.stream,
&prev_stream);
// setting failed, directly return
- if (c_api_ret_code[0] != 0) return 0;
+ if (TVM_FFI_PREDICT_FALSE(c_api_ret_code[0] != 0)) return 0;
}
if (ctx.dlpack_c_exchange_api != nullptr &&
ctx.dlpack_c_exchange_api->managed_tensor_allocator != nullptr) {
c_api_ret_code[0] = TVMFFIEnvSetDLPackManagedTensorAllocator(
ctx.dlpack_c_exchange_api->managed_tensor_allocator, 0,
&prev_tensor_allocator);
- if (c_api_ret_code[0] != 0) return 0;
+ if (TVM_FFI_PREDICT_FALSE(c_api_ret_code[0] != 0)) return 0;
}
// call the function
if (release_gil) {
@@ -491,7 +514,8 @@ class TVMFFIPyCallManager {
// restore the original stream
if (ctx.device_type != -1 && prev_stream != ctx.stream) {
// always try recover first, even if error happens
- if (TVMFFIEnvSetStream(ctx.device_type, ctx.device_id, prev_stream,
nullptr) != 0) {
+ if (TVM_FFI_PREDICT_FALSE(
+ TVMFFIEnvSetStream(ctx.device_type, ctx.device_id,
prev_stream, nullptr) != 0)) {
// recover failed, set python error
PyErr_SetString(PyExc_RuntimeError, "Failed to recover stream");
return -1;
@@ -502,12 +526,13 @@ class TVMFFIPyCallManager {
prev_tensor_allocator !=
ctx.dlpack_c_exchange_api->managed_tensor_allocator) {
// note: we cannot set the error value to c_api_ret_code[0] here
because it
// will be overwritten by the error value from the function call
- if (TVMFFIEnvSetDLPackManagedTensorAllocator(prev_tensor_allocator, 0,
nullptr) != 0) {
+ if (TVM_FFI_PREDICT_FALSE(
+
TVMFFIEnvSetDLPackManagedTensorAllocator(prev_tensor_allocator, 0, nullptr) !=
0)) {
PyErr_SetString(PyExc_RuntimeError, "Failed to recover DLPack
managed tensor allocator");
return -1;
}
// return error after
- if (c_api_ret_code[0] != 0) return 0;
+ if (TVM_FFI_PREDICT_FALSE(c_api_ret_code[0] != 0)) return 0;
}
if (optional_out_ctx_dlpack_api != nullptr && ctx.dlpack_c_exchange_api
!= nullptr) {
*optional_out_ctx_dlpack_api = ctx.dlpack_c_exchange_api;
@@ -540,7 +565,7 @@ class TVMFFIPyCallManager {
TVM_FFI_INLINE int ConstructorCall(void* func_handle, PyObject*
py_arg_tuple, TVMFFIAny* result,
int* c_api_ret_code, TVMFFIPyCallContext*
parent_ctx) {
int64_t num_args = PyTuple_Size(py_arg_tuple);
- if (num_args == -1) return -1;
+ if (TVM_FFI_PREDICT_FALSE(num_args == -1)) return -1;
try {
// allocate a call stack
TVMFFIPyCallContext ctx(&call_stack_, num_args);
@@ -548,7 +573,7 @@ class TVMFFIPyCallManager {
for (int64_t i = 0; i < num_args; ++i) {
PyObject* py_arg = PyTuple_GetItem(py_arg_tuple, i);
TVMFFIAny* c_arg = ctx.packed_args + i;
- if (SetArgument(&ctx, py_arg, c_arg) != 0) return -1;
+ if (TVM_FFI_PREDICT_FALSE(SetArgument(&ctx, py_arg, c_arg) != 0))
return -1;
}
c_api_ret_code[0] = TVMFFIFunctionCall(func_handle, ctx.packed_args,
num_args, result);
// propagate the call context to the parent context
@@ -577,7 +602,7 @@ class TVMFFIPyCallManager {
try {
TVMFFIPyCallContext ctx(&call_stack_, 1);
TVMFFIAny* c_arg = ctx.packed_args;
- if (SetArgument(&ctx, py_arg, c_arg) != 0) return -1;
+ if (TVM_FFI_PREDICT_FALSE(SetArgument(&ctx, py_arg, c_arg) != 0)) return
-1;
if (!(field_flags & kTVMFFIFieldFlagBitSetterIsFunctionObj)) {
auto setter = reinterpret_cast<TVMFFIFieldSetter>(field_setter);
c_api_ret_code[0] = (*setter)(field_ptr, c_arg);
@@ -603,7 +628,7 @@ class TVMFFIPyCallManager {
try {
TVMFFIPyCallContext ctx(&call_stack_, 1);
TVMFFIAny* c_arg = ctx.packed_args;
- if (SetArgument(&ctx, py_arg, c_arg) != 0) return -1;
+ if (TVM_FFI_PREDICT_FALSE(SetArgument(&ctx, py_arg, c_arg) != 0)) return
-1;
c_api_ret_code[0] = TVMFFIAnyViewToOwnedAny(c_arg, out);
return 0;
} catch (const std::exception& ex) {
@@ -629,20 +654,20 @@ class TVMFFIPyCallManager {
// find the pre-cached setter
// This class is thread-local, so we don't need to worry about race
condition
auto it = arg_dispatch_map_.find(py_type);
- if (it != arg_dispatch_map_.end()) {
+ if (TVM_FFI_PREDICT_TRUE(it != arg_dispatch_map_.end())) {
TVMFFIPyArgSetter setter = it->second;
// if error happens, propagate it back
- if (setter(ctx, py_arg, out) != 0) return -1;
+ if (TVM_FFI_PREDICT_FALSE(setter(ctx, py_arg, out) != 0)) return -1;
} else {
// no dispatch found, query and create a new one.
TVMFFIPyArgSetter setter;
// propagate python error back
- if (TVMFFICyArgSetterFactory(py_arg, &setter) != 0) {
+ if (TVM_FFI_PREDICT_FALSE(TVMFFICyArgSetterFactory(py_arg, &setter) !=
0)) {
return -1;
}
// update dispatch table
arg_dispatch_map_.emplace(py_type, setter);
- if (setter(ctx, py_arg, out) != 0) return -1;
+ if (TVM_FFI_PREDICT_FALSE(setter(ctx, py_arg, out) != 0)) return -1;
}
return 0;
}
@@ -706,8 +731,8 @@ class TVMFFIPyCallManager {
TVMFFIPyCallbackContext cb_ctx(&call_stack_, num_args);
// Step 1: Convert each packed arg (borrowed AnyView) to a PyObject*
for (int32_t i = 0; i < num_args; ++i) {
- if (SetPyCallbackArg(closure->dlpack_exchange_api, &packed_args[i],
&cb_ctx.py_args[i]) !=
- 0) {
+ if
(TVM_FFI_PREDICT_FALSE(SetPyCallbackArg(closure->dlpack_exchange_api,
&packed_args[i],
+ &cb_ctx.py_args[i]) != 0)) {
ForwardPyErrorToFFI();
return -1;
}
@@ -749,7 +774,7 @@ class TVMFFIPyCallManager {
// The guard's destructor runs AFTER the return value is computed.
TVMFFIPyCallContext ret_ctx(&call_stack_, 1);
TVMFFIAny* view = ret_ctx.packed_args;
- if (SetArgument(&ret_ctx, py_result.p, view) != 0) {
+ if (TVM_FFI_PREDICT_FALSE(SetArgument(&ret_ctx, py_result.p, view) !=
0)) {
ForwardPyErrorToFFI();
return -1;
}
@@ -776,7 +801,7 @@ class TVMFFIPyCallManager {
* returned by PyErr_Occurred()) so that set_last_ffi_error can access the
* message and traceback.
*/
- static void ForwardPyErrorToFFI() noexcept {
+ TVM_FFI_COLD_CODE static void ForwardPyErrorToFFI() noexcept {
#if PY_VERSION_HEX >= 0x030C0000
// Python 3.12+: PyErr_Fetch / PyErr_NormalizeException are deprecated.
// PyErr_GetRaisedException returns an already-normalized exception
diff --git a/src/ffi/backtrace.cc b/src/ffi/backtrace.cc
index 0e2e9b4..8a349d2 100644
--- a/src/ffi/backtrace.cc
+++ b/src/ffi/backtrace.cc
@@ -147,6 +147,7 @@ const TVMFFIByteArray* TVMFFIBacktrace(const char*
filename, int lineno, const c
}
#if TVM_FFI_BACKTRACE_ON_SEGFAULT
+TVM_FFI_COLD_CODE
void TVMFFISegFaultHandler(int sig) {
// Technically we shouldn't do any allocation in a signal handler, but
// Backtrace may allocate. What's the worst it could do? We're already
@@ -163,6 +164,7 @@ void TVMFFISegFaultHandler(int sig) {
raise(sig);
}
+TVM_FFI_COLD_CODE
__attribute__((constructor)) void TVMFFIInstallSignalHandler() {
// this may override already installed signal handlers
std::signal(SIGSEGV, TVMFFISegFaultHandler);
diff --git a/src/ffi/error.cc b/src/ffi/error.cc
index 8dd209c..058002f 100644
--- a/src/ffi/error.cc
+++ b/src/ffi/error.cc
@@ -20,6 +20,7 @@
* \file src/ffi/error.cc
* \brief Error handling implementation
*/
+#include <tvm/ffi/base_details.h>
#include <tvm/ffi/c_api.h>
#include <tvm/ffi/error.h>
diff --git a/src/ffi/function.cc b/src/ffi/function.cc
index 4b378f7..15ffd80 100644
--- a/src/ffi/function.cc
+++ b/src/ffi/function.cc
@@ -83,7 +83,7 @@ class GlobalFunctionTable {
};
void Update(const String& name, Function func, bool can_override) {
- if (table_.count(name)) {
+ if (TVM_FFI_PREDICT_FALSE(table_.count(name) != 0)) {
if (!can_override) {
TVM_FFI_THROW(RuntimeError) << "Global Function `" << name << "` is
already registered";
}
@@ -93,7 +93,7 @@ class GlobalFunctionTable {
void Update(const TVMFFIMethodInfo* method_info, bool can_override) {
String name(method_info->name.data, method_info->name.size);
- if (table_.count(name)) {
+ if (TVM_FFI_PREDICT_FALSE(table_.count(name) != 0)) {
if (!can_override) {
TVM_FFI_LOG_AND_THROW(RuntimeError)
<< "Global Function `" << name << "` is already registered,
possible causes:\n"