This is an automated email from the ASF dual-hosted git repository.
tlopex pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new e78fbd8a33 [Attn] Fix calling FlashInfer attention plan function (#18557)
e78fbd8a33 is described below
commit e78fbd8a330fb2fb7512919a5bf4fbe5219ea177
Author: Ruihang Lai <[email protected]>
AuthorDate: Mon Dec 8 14:08:25 2025 -0500
[Attn] Fix calling FlashInfer attention plan function (#18557)
The FlashInfer attention plan function introduced a new parameter of
`num_colocated_ctas`. This commit updates the TVM caller side
accordingly.
---
src/runtime/vm/attn_backend.h | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/runtime/vm/attn_backend.h b/src/runtime/vm/attn_backend.h
index 1fd22a97ab..31f1ce9f4a 100644
--- a/src/runtime/vm/attn_backend.h
+++ b/src/runtime/vm/attn_backend.h
@@ -251,7 +251,8 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc {
qo_indptr->as_tensor(), page_indptr->as_tensor(), kv_len_arr, total_qo_len,
batch_size, num_qo_heads, num_kv_heads, page_size,
/*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal,
- /*window_left=*/-1, /*fixed_split_size=*/-1, /*disable_split_kv=*/false)
+ /*window_left=*/-1, /*fixed_split_size=*/-1, /*disable_split_kv=*/false,
+ /*num_colocated_ctas=*/0)
.cast<ffi::Array<int64_t>>();
} else if (attn_kind == AttnKind::kMLA) {
plan_info_vec =
} else if (attn_kind == AttnKind::kMLA) {
plan_info_vec =
@@ -375,7 +376,8 @@ class FlashInferRaggedPrefillFunc : public RaggedPrefillFunc {
qo_indptr->as_tensor(), kv_indptr->as_tensor(), kv_len_arr, total_qo_len,
batch_size, num_qo_heads, num_kv_heads, /*page_size=*/1,
/*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal,
- /*window_left=*/-1, /*fixed_split_size=*/-1, /*disable_split_kv=*/false)
+ /*window_left=*/-1, /*fixed_split_size=*/-1, /*disable_split_kv=*/false,
+ /*num_colocated_ctas=*/0)
.cast<ffi::Array<int64_t>>();
DeviceAPI::Get(device)->SetStream(device, original_stream);
}