This is an automated email from the ASF dual-hosted git repository.

ruihangl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 4a37f64167 [KVCache] Increase coalesce threshold (#17280)
4a37f64167 is described below

commit 4a37f64167ce80552719cf9975c5ff8e4a053538
Author: Yaxing Cai <[email protected]>
AuthorDate: Sat Aug 17 10:22:28 2024 -0700

    [KVCache] Increase coalesce threshold (#17280)
    
    This PR increases the coalesce threshold in the KV cache for better performance.
---
 src/runtime/relax_vm/paged_kv_cache.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/runtime/relax_vm/paged_kv_cache.cc b/src/runtime/relax_vm/paged_kv_cache.cc
index cf5de97202..6bf3dc7ce6 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -1727,7 +1727,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
                                                     qkv_data->dtype);
     // Part 2. Split fused qkv and apply rotary embedding to q/k data.
     f_split_rotary_(qkv_data, q_rope_position_map_view_, q_data, k_data, v_data,
-                    rope_mode_ == RoPEMode::kNormal);
+                    static_cast<int>(rope_mode_ == RoPEMode::kNormal));
 
     // Part 3. Append k/v data to kv-cache if flag "append_before_attn" is set.
     if (append_before_attn_) {
@@ -2202,7 +2202,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     }
     double coalesce_ratio = 1.0 * page_counter_uncoalesced / page_counter_coalesced;
     // Do not coalesce and use batch decode kernel when coalesce ratio is small.
-    bool use_decode_kernel = is_decode_request_ && coalesce_ratio < 1.1;
+    bool use_decode_kernel = is_decode_request_ && coalesce_ratio < 32;
     return {use_decode_kernel || !enable_coalesce ? uncoalesced_block_ids : coalesced_block_ids,
             use_decode_kernel};
   }

Reply via email to