This is an automated email from the ASF dual-hosted git repository.
ruihangl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 4a37f64167 [KVCache] Increase coalesce threshold (#17280)
4a37f64167 is described below
commit 4a37f64167ce80552719cf9975c5ff8e4a053538
Author: Yaxing Cai <[email protected]>
AuthorDate: Sat Aug 17 10:22:28 2024 -0700
[KVCache] Increase coalesce threshold (#17280)
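This PR raises the coalesce-ratio threshold in the paged KV cache from 1.1 to 32, so that decode requests skip coalescing and use the batch decode kernel whenever the coalesce ratio is below 32, for better performance.
It also makes the boolean argument passed to f_split_rotary_ an explicit int via static_cast.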
---
src/runtime/relax_vm/paged_kv_cache.cc | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/runtime/relax_vm/paged_kv_cache.cc b/src/runtime/relax_vm/paged_kv_cache.cc
index cf5de97202..6bf3dc7ce6 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -1727,7 +1727,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
qkv_data->dtype);
// Part 2. Split fused qkv and apply rotary embedding to q/k data.
f_split_rotary_(qkv_data, q_rope_position_map_view_, q_data, k_data,
v_data,
- rope_mode_ == RoPEMode::kNormal);
+ static_cast<int>(rope_mode_ == RoPEMode::kNormal));
// Part 3. Append k/v data to kv-cache if flag "append_before_attn" is set.
if (append_before_attn_) {
@@ -2202,7 +2202,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
}
double coalesce_ratio = 1.0 * page_counter_uncoalesced /
page_counter_coalesced;
// Do not coalesce and use batch decode kernel when coalesce ratio is small.
- bool use_decode_kernel = is_decode_request_ && coalesce_ratio < 1.1;
+ bool use_decode_kernel = is_decode_request_ && coalesce_ratio < 32;
return {use_decode_kernel || !enable_coalesce ? uncoalesced_block_ids :
coalesced_block_ids,
use_decode_kernel};
}