This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a change to branch nightly
in repository https://gitbox.apache.org/repos/asf/tvm.git
from 48cedc7d2e [Arith][Fixup] Require feature flag for tighter inequality bounds (#16735)
add a9436b8154 [Fix][Builtin] Fix "GetQueryPosition" of PagedKVCache (#16746)
add 89e9028849 [Cutlass] Add group gemm kernels (#16751)
add 7683bc23b1 [Fix] Lazy import of "psutil" in disco process pool (#16752)
add 0f38ef2d6e [Bugfix][TIR] Fix cache_read update buffer region (#16742)
add e257fb8a41 [Runtime] CUDA IPC Memory support and custom allreduce kernels (#16750)
No new revisions were added by this update.
Summary of changes:
3rdparty/cutlass | 2 +-
3rdparty/tensorrt_llm/custom_allreduce_kernels.cu | 400 +++++++++++++++++++++
3rdparty/tensorrt_llm/custom_allreduce_kernels.h | 48 +++
CMakeLists.txt | 29 +-
LICENSE | 1 +
cmake/modules/contrib/CUTLASS.cmake | 49 ++-
include/tvm/runtime/disco/cuda_ipc_memory.h | 102 ++++++
include/tvm/runtime/memory/memory_manager.h | 13 +-
licenses/{LICENSE.cutlass_fpA_intB_gemm.txt => LICENSE.tensorrt_llm.txt} | 0
python/tvm/runtime/disco/process_pool.py | 4 +-
python/tvm/runtime/disco/session.py | 13 +-
src/runtime/contrib/cutlass/fp16_group_gemm.cu | 70 ++++
src/runtime/contrib/cutlass/fp8_group_gemm.cu | 83 +++++
src/runtime/contrib/cutlass/group_gemm_runner.cuh | 209 +++++++++++
src/runtime/contrib/cutlass/weight_preprocess.cc | 2 +-
src/runtime/disco/cuda_ipc/cuda_ipc_memory.cc | 227 ++++++++++++
src/runtime/disco/cuda_ipc/custom_allreduce.cc | 112 ++++++
src/runtime/disco/nccl/nccl.cc | 117 +-----
src/runtime/disco/nccl/nccl_context.h | 147 ++++++++
src/runtime/memory/memory_manager.cc | 9 +-
src/runtime/memory/naive_allocator.h | 2 +-
src/runtime/memory/pooled_allocator.h | 25 +-
src/runtime/relax_vm/builtin.cc | 1 +
src/runtime/relax_vm/kv_state.h | 2 +-
src/runtime/relax_vm/paged_kv_cache.cc | 9 +-
src/runtime/vm/vm.cc | 2 +
src/tir/schedule/primitive/cache_read_write.cc | 7 +-
tests/python/contrib/test_cutlass.py | 98 +++++
tests/python/disco/test_custom_allreduce.py | 78 ++++
.../test_tir_schedule_cache_read_write.py | 41 +++
30 files changed, 1749 insertions(+), 153 deletions(-)
create mode 100644 3rdparty/tensorrt_llm/custom_allreduce_kernels.cu
create mode 100644 3rdparty/tensorrt_llm/custom_allreduce_kernels.h
create mode 100644 include/tvm/runtime/disco/cuda_ipc_memory.h
copy licenses/{LICENSE.cutlass_fpA_intB_gemm.txt => LICENSE.tensorrt_llm.txt} (100%)
create mode 100644 src/runtime/contrib/cutlass/fp16_group_gemm.cu
create mode 100644 src/runtime/contrib/cutlass/fp8_group_gemm.cu
create mode 100644 src/runtime/contrib/cutlass/group_gemm_runner.cuh
create mode 100644 src/runtime/disco/cuda_ipc/cuda_ipc_memory.cc
create mode 100644 src/runtime/disco/cuda_ipc/custom_allreduce.cc
create mode 100644 src/runtime/disco/nccl/nccl_context.h
create mode 100644 tests/python/disco/test_custom_allreduce.py