tqchen commented on code in PR #15910: URL: https://github.com/apache/tvm/pull/15910#discussion_r1356857176
########## src/runtime/relax_vm/paged_kv_cache.cc: ########## @@ -0,0 +1,633 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*! + * \file src/runtime/relax_vm/paged_kv_cache.cc + * \brief Runtime paged KV cache object for language models. + */ +#include <tvm/runtime/device_api.h> +#include <tvm/runtime/logging.h> +#include <tvm/runtime/ndarray.h> +#include <tvm/runtime/registry.h> + +namespace tvm { +namespace runtime { +namespace relax_vm { + +//------------------------------------------- +// We keep the implementation private as +// they may subject to future changes. +// +// Users can interact with it through the +// runtime API function calls +//------------------------------------------- + +/*! + * \brief The paged KV cache for attention. + * - It supports managing the K/V data of **multiple sequences**. + * - It manages K/V values by doing paging along the sequence-length + * dimension with a configured page size. + * - The basic example use of the paged KV cache after initialization + * in each round of model forwarding is the following: + * - step 1. use `ResetAppendLengths` to reset the appending information + * for preparation, + * - step 2. 
use `ReserveExtraLengthForAppend` to specify the length + * of K/V data to be appended for each sequence, + * - step 3. use `SyncAuxArrayToDevice` to synchronize auxiliary arrays + * to device for append/attention computation, + * - step 4. for each layer, use `Append` to append the K/V data to the + * cache, and then use `Attention` to compute attention results with + * Q data. + */ +class PagedAttentionKVCacheObj : public Object { + private: + /*! \brief The total number of sequences managed in the KV cache. */ + int64_t num_total_seqs_ = 0; + /*! \brief The number of pages that are in use by the sequences. */ + int64_t num_pages_in_use_ = 0; + /*! + * \brief The number of allocated pages, including the in-use pages + * and the pages released due to sequence removal. + */ + int64_t num_pages_allocated_ = 0; + + /********************* Configuration *********************/ + + /*! \brief The page size (the sequence length each page manages) of the cache. */ + const int64_t page_size_; + /*! \brief The number of layers in the model. */ + const int64_t num_layers_; + /*! \brief The number of heads in the model. */ + const int64_t num_heads_; + /*! \brief The number of features each head has. */ + const int64_t head_dim_; + + /*! \brief We fix int32 to be the index dtype of auxiliary data. */ + const DLDataType dtype_aux_ = DataType::Int(32, 1).operator DLDataType(); Review Comment: We can store as DataType and auto-converts to DLDataType later -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
