This is an automated email from the ASF dual-hosted git repository.
tlopex pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new e78fbd8a33 [Attn] Fix calling FlashInfer attention plan function (#18557)
e78fbd8a33 is described below
commit e78fbd8a330fb2fb7512919a5bf4fbe5219ea177
Author: Ruihang Lai <[email protected]>
AuthorDate: Mon Dec 8 14:08:25 2025 -0500
[Attn] Fix calling FlashInfer attention plan function (#18557)
The FlashInfer attention plan function introduced a new parameter of
`num_colocated_ctas`. This commit updates the TVM caller side
accordingly.
---
src/runtime/vm/attn_backend.h | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/runtime/vm/attn_backend.h b/src/runtime/vm/attn_backend.h
index 1fd22a97ab..31f1ce9f4a 100644
--- a/src/runtime/vm/attn_backend.h
+++ b/src/runtime/vm/attn_backend.h
@@ -251,7 +251,8 @@ class FlashInferPagedPrefillFunc : public PagedPrefillFunc {
qo_indptr->as_tensor(), page_indptr->as_tensor(), kv_len_arr, total_qo_len,
batch_size, num_qo_heads, num_kv_heads, page_size,
/*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal,
- /*window_left=*/-1, /*fixed_split_size=*/-1, /*disable_split_kv=*/false)
+ /*window_left=*/-1, /*fixed_split_size=*/-1, /*disable_split_kv=*/false,
+ /*num_colocated_ctas=*/0)
.cast<ffi::Array<int64_t>>();
} else if (attn_kind == AttnKind::kMLA) {
plan_info_vec =
} else if (attn_kind == AttnKind::kMLA) {
plan_info_vec =
@@ -375,7 +376,8 @@ class FlashInferRaggedPrefillFunc : public RaggedPrefillFunc {
qo_indptr->as_tensor(), kv_indptr->as_tensor(), kv_len_arr, total_qo_len,
batch_size, num_qo_heads, num_kv_heads, /*page_size=*/1,
/*enable_cuda_graph=*/false, qk_head_dim, v_head_dim, causal,
- /*window_left=*/-1, /*fixed_split_size=*/-1, /*disable_split_kv=*/false)
+ /*window_left=*/-1, /*fixed_split_size=*/-1, /*disable_split_kv=*/false,
+ /*num_colocated_ctas=*/0)
.cast<ffi::Array<int64_t>>();
DeviceAPI::Get(device)->SetStream(device, original_stream);
}