This is an automated email from the ASF dual-hosted git repository. spectrometerHBH pushed a commit to branch tir-bench in repository https://gitbox.apache.org/repos/asf/tvm.git
commit d4b9e91923df3e707ab95f9ca9a134878fc88286 Author: Bohan Hou <[email protected]> AuthorDate: Mon May 18 19:53:24 2026 -0400 refactor(codegen): remove tirx.entry_cluster_sync codegen attribute (#634) The attribute caused codegen_cuda to inject a ``cluster_sync()`` at the very entry of any kernel whose ``Tx.func_attr`` carried it. Kernels that need a cluster sync at the start of their body do it explicitly via ``Tx.cuda.cluster_sync()``, so the entry-injected one was redundant (fp8_blockwise_gemm.py was the only user and called both). --- src/target/cuda/codegen_cuda.cc | 18 ------------------ src/target/cuda/codegen_cuda.h | 1 - src/tirx/transform/split_host_device.cc | 10 ---------- 3 files changed, 29 deletions(-) diff --git a/src/target/cuda/codegen_cuda.cc b/src/target/cuda/codegen_cuda.cc index d34565c2c5..64832963ed 100644 --- a/src/target/cuda/codegen_cuda.cc +++ b/src/target/cuda/codegen_cuda.cc @@ -45,12 +45,6 @@ namespace tvm { namespace codegen { -namespace { - -constexpr const char* kEntryClusterSyncAttr = "tirx.entry_cluster_sync"; - -} // namespace - std::string GetFP8Type(DataType type) { std::stringstream stream; int32_t lanes = type.lanes(); @@ -278,18 +272,6 @@ void CodeGenCUDA::VisitStmt_(const WhileNode* op) { stream << "}\n"; } -void CodeGenCUDA::PreFunctionBody(const PrimFunc& f) { - if (!f->HasNonzeroAttr(kEntryClusterSyncAttr)) { - return; - } - AddUtilFunction("tvm_builtin_cuda_cluster_sync", - "\n__forceinline__ __device__ void tvm_builtin_cuda_cluster_sync() {\n" - " asm(\"barrier.cluster.arrive.aligned;\");\n" - " asm(\"barrier.cluster.wait.aligned;\");\n" - "}\n"); - stream << " tvm_builtin_cuda_cluster_sync();\n"; -} - void CodeGenCUDA::BindThreadIndex(const IterVar& iv) { TVM_FFI_ICHECK(!var_idmap_.count(iv->var.get())); const auto& scope = runtime::ThreadScope::Create(iv->thread_tag); diff --git a/src/target/cuda/codegen_cuda.h b/src/target/cuda/codegen_cuda.h index 714c070767..91d640ee5d 100644 --- a/src/target/cuda/codegen_cuda.h +++ b/src/target/cuda/codegen_cuda.h @@ -54,7 +54,6 @@ class CodeGenCUDA final : public CodeGenC { void PrintExtraAttrs(const PrimFunc& f, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const ForNode* op) final; void VisitStmt_(const WhileNode* op) final; - void PreFunctionBody(const PrimFunc& f) final; void PrintStorageSync(const CallNode* op) final; void PrintStorageScope(const std::string& scope, std::ostream& os) final; // NOLINT(*) void PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr lhs, PrimExpr rhs, diff --git a/src/tirx/transform/split_host_device.cc b/src/tirx/transform/split_host_device.cc index 520ffa2268..3a91cf3e13 100644 --- a/src/tirx/transform/split_host_device.cc +++ b/src/tirx/transform/split_host_device.cc @@ -38,12 +38,6 @@ namespace tvm { namespace tirx { -namespace { - -constexpr const char* kEntryClusterSyncAttr = "tirx.entry_cluster_sync"; - -} // namespace - class HostDeviceSplitter : public StmtMutator { public: explicit HostDeviceSplitter(IRModule* device_mod, std::function<GlobalVar()> var_supply, @@ -123,10 +117,6 @@ class HostDeviceSplitter : public StmtMutator { if (persistent.defined()) { device_func = WithAttr(std::move(device_func), tirx::attr::kPersistentKernel, persistent); } - auto entry_cluster_sync = cur_func_->GetAttr<Bool>(kEntryClusterSyncAttr); - if (entry_cluster_sync.defined()) { - device_func = WithAttr(std::move(device_func), kEntryClusterSyncAttr, entry_cluster_sync); - } GlobalVar kernel_symbol_global = var_supply_(); (*device_mod_)->Add(kernel_symbol_global, device_func); ffi::Array<PrimExpr> args = params.Map([](const Var& var) -> PrimExpr { return var; });
