(tvm) 06/22: refactor(codegen): remove tirx.entry_cluster_sync codegen attribute (#634)

bohan Thu, 28 May 2026 21:45:39 -0700

This is an automated email from the ASF dual-hosted git repository.

spectrometerHBH pushed a commit to branch tir-bench
in repository https://gitbox.apache.org/repos/asf/tvm.git


commit d4b9e91923df3e707ab95f9ca9a134878fc88286
Author: Bohan Hou <[email protected]>
AuthorDate: Mon May 18 19:53:24 2026 -0400

    refactor(codegen): remove tirx.entry_cluster_sync codegen attribute (#634)
    
    The attribute caused codegen_cuda to inject a ``cluster_sync()`` at the
    very entry of any kernel whose ``Tx.func_attr`` carried it. Kernels that
    need a cluster sync at the start of their body do it explicitly via
    ``Tx.cuda.cluster_sync()``, so the entry-injected one was redundant
    (fp8_blockwise_gemm.py was the only user and called both).
---
 src/target/cuda/codegen_cuda.cc         | 18 ------------------
 src/target/cuda/codegen_cuda.h          |  1 -
 src/tirx/transform/split_host_device.cc | 10 ----------
 3 files changed, 29 deletions(-)

diff --git a/src/target/cuda/codegen_cuda.cc b/src/target/cuda/codegen_cuda.cc
index d34565c2c5..64832963ed 100644
--- a/src/target/cuda/codegen_cuda.cc
+++ b/src/target/cuda/codegen_cuda.cc
@@ -45,12 +45,6 @@
 namespace tvm {
 namespace codegen {
 
-namespace {
-
-constexpr const char* kEntryClusterSyncAttr = "tirx.entry_cluster_sync";
-
-}  // namespace
-
 std::string GetFP8Type(DataType type) {
   std::stringstream stream;
   int32_t lanes = type.lanes();
@@ -278,18 +272,6 @@ void CodeGenCUDA::VisitStmt_(const WhileNode* op) {
   stream << "}\n";
 }
 
-void CodeGenCUDA::PreFunctionBody(const PrimFunc& f) {
-  if (!f->HasNonzeroAttr(kEntryClusterSyncAttr)) {
-    return;
-  }
-  AddUtilFunction("tvm_builtin_cuda_cluster_sync",
-                  "\n__forceinline__ __device__ void 
tvm_builtin_cuda_cluster_sync() {\n"
-                  "    asm(\"barrier.cluster.arrive.aligned;\");\n"
-                  "    asm(\"barrier.cluster.wait.aligned;\");\n"
-                  "}\n");
-  stream << "  tvm_builtin_cuda_cluster_sync();\n";
-}
-
 void CodeGenCUDA::BindThreadIndex(const IterVar& iv) {
   TVM_FFI_ICHECK(!var_idmap_.count(iv->var.get()));
   const auto& scope = runtime::ThreadScope::Create(iv->thread_tag);
diff --git a/src/target/cuda/codegen_cuda.h b/src/target/cuda/codegen_cuda.h
index 714c070767..91d640ee5d 100644
--- a/src/target/cuda/codegen_cuda.h
+++ b/src/target/cuda/codegen_cuda.h
@@ -54,7 +54,6 @@ class CodeGenCUDA final : public CodeGenC {
   void PrintExtraAttrs(const PrimFunc& f, std::ostream& os) final;  // 
NOLINT(*)
   void VisitStmt_(const ForNode* op) final;
   void VisitStmt_(const WhileNode* op) final;
-  void PreFunctionBody(const PrimFunc& f) final;
   void PrintStorageSync(const CallNode* op) final;
   void PrintStorageScope(const std::string& scope, std::ostream& os) final;  
// NOLINT(*)
   void PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, 
PrimExpr rhs,
diff --git a/src/tirx/transform/split_host_device.cc 
b/src/tirx/transform/split_host_device.cc
index 520ffa2268..3a91cf3e13 100644
--- a/src/tirx/transform/split_host_device.cc
+++ b/src/tirx/transform/split_host_device.cc
@@ -38,12 +38,6 @@
 namespace tvm {
 namespace tirx {
 
-namespace {
-
-constexpr const char* kEntryClusterSyncAttr = "tirx.entry_cluster_sync";
-
-}  // namespace
-
 class HostDeviceSplitter : public StmtMutator {
  public:
   explicit HostDeviceSplitter(IRModule* device_mod, std::function<GlobalVar()> 
var_supply,
@@ -123,10 +117,6 @@ class HostDeviceSplitter : public StmtMutator {
     if (persistent.defined()) {
       device_func = WithAttr(std::move(device_func), 
tirx::attr::kPersistentKernel, persistent);
     }
-    auto entry_cluster_sync = cur_func_->GetAttr<Bool>(kEntryClusterSyncAttr);
-    if (entry_cluster_sync.defined()) {
-      device_func = WithAttr(std::move(device_func), kEntryClusterSyncAttr, 
entry_cluster_sync);
-    }
     GlobalVar kernel_symbol_global = var_supply_();
     (*device_mod_)->Add(kernel_symbol_global, device_func);
     ffi::Array<PrimExpr> args = params.Map([](const Var& var) -> PrimExpr { 
return var; });

(tvm) 06/22: refactor(codegen): remove tirx.entry_cluster_sync codegen attribute (#634)

Reply via email to