This is an automated email from the ASF dual-hosted git repository. tqchen pushed a commit to branch refactor-s2 in repository https://gitbox.apache.org/repos/asf/tvm.git
commit 1e0a870827a2b1bfbbc3bb1677b9767fd8a43e5b Author: tqchen <[email protected]> AuthorDate: Mon Apr 28 20:51:07 2025 -0400 Update to fix gpu and ffi --- 3rdparty/cutlass_fpA_intB_gemm | 2 +- apps/hexagon_launcher/launcher_main.cc | 2 +- ffi/include/tvm/ffi/container/ndarray.h | 6 +++-- ffi/include/tvm/ffi/function.h | 1 + include/tvm/ir/module.h | 2 -- include/tvm/relax/tuning_api.h | 2 ++ src/node/structural_hash.cc | 3 +-- src/runtime/contrib/thrust/thrust.cu | 3 ++- src/runtime/hexagon/ops/conv2d.h | 2 +- src/runtime/hexagon/ops/conv2d_fp16_hvx.cc | 3 ++- src/runtime/hexagon/ops/conv2d_quant_hvx.cc | 2 +- src/runtime/hexagon/ops/conv_utils.cc | 4 ++-- src/runtime/memory/memory_manager.cc | 10 ++++---- src/runtime/ndarray.cc | 2 +- src/runtime/rpc/rpc_module.cc | 2 +- src/support/ffi_testing.cc | 2 -- src/target/llvm/codegen_llvm.h | 2 +- .../cpp-runtime/hexagon/hexagon_conv_utils_test.h | 2 +- tests/python/codegen/test_target_codegen_cuda.py | 28 ---------------------- 19 files changed, 27 insertions(+), 53 deletions(-) diff --git a/3rdparty/cutlass_fpA_intB_gemm b/3rdparty/cutlass_fpA_intB_gemm index fdef230791..d463617c21 160000 --- a/3rdparty/cutlass_fpA_intB_gemm +++ b/3rdparty/cutlass_fpA_intB_gemm @@ -1 +1 @@ -Subproject commit fdef2307917ec2c7cc5becc29fb95d77498484bd +Subproject commit d463617c215969d9d2333a216042ddb963949d0d diff --git a/apps/hexagon_launcher/launcher_main.cc b/apps/hexagon_launcher/launcher_main.cc index 087b2a1b00..8690996684 100644 --- a/apps/hexagon_launcher/launcher_main.cc +++ b/apps/hexagon_launcher/launcher_main.cc @@ -102,7 +102,7 @@ int main(int argc, char* argv[]) { for (int i = 0, e = config.inputs.size(); i != e; ++i) { const TensorConfig& tc = config.inputs[i]; input_meta->ndim = tc.shape.size(); - input_meta->dtype = tvm::runtime::StringToDLDataType(tc.dtype); + input_meta->dtype = tvm::ffi::StringToDLDataType(tc.dtype); std::copy(tc.shape.begin(), tc.shape.end(), input_meta->shape); auto* input_data = session.alloc<unsigned char>(input_meta->data_size()); diff --git a/ffi/include/tvm/ffi/container/ndarray.h b/ffi/include/tvm/ffi/container/ndarray.h index 66bdbb6d70..065c8db2cd 100644 --- a/ffi/include/tvm/ffi/container/ndarray.h +++ b/ffi/include/tvm/ffi/container/ndarray.h @@ -29,6 +29,8 @@ #include <tvm/ffi/error.h> #include <tvm/ffi/type_traits.h> +#include <utility> + namespace tvm { namespace ffi { @@ -85,7 +87,7 @@ inline bool IsAligned(const DLTensor& arr, size_t alignment) { */ inline size_t GetDataSize(int64_t numel, DLDataType dtype) { // compatible handling sub-byte uint1(bool), which usually stored as uint8_t - // TODO: revisit and switch to kDLBool + // TODO(tqchen): revisit and switch to kDLBool if (dtype.code == kDLUInt && dtype.bits == 1 && dtype.lanes == 1) { return numel; } @@ -197,7 +199,7 @@ class NDArrayObjFromNDAlloc : public NDArrayObj { template <typename TDLPackManagedTensor> class NDArrayObjFromDLPack : public NDArrayObj { public: - NDArrayObjFromDLPack(TDLPackManagedTensor* tensor) : tensor_(tensor) { + explicit NDArrayObjFromDLPack(TDLPackManagedTensor* tensor) : tensor_(tensor) { *static_cast<DLTensor*>(this) = tensor_->dl_tensor; // set strides to nullptr if the tensor is contiguous. if (IsContiguous(tensor->dl_tensor)) { diff --git a/ffi/include/tvm/ffi/function.h b/ffi/include/tvm/ffi/function.h index d53bb01934..d5e31372c6 100644 --- a/ffi/include/tvm/ffi/function.h +++ b/ffi/include/tvm/ffi/function.h @@ -32,6 +32,7 @@ #include <functional> #include <string> #include <utility> +#include <vector> namespace tvm { namespace ffi { diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index b41725e0c0..66637f67d9 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -302,8 +302,6 @@ namespace attr { * \brief Name of the module * * Type: String - * - * \sa tvm::runtime::String */ constexpr const char* kModuleName = "mod_name"; diff --git a/include/tvm/relax/tuning_api.h b/include/tvm/relax/tuning_api.h index 1e9a74cbfb..6d72c56025 100644 --- a/include/tvm/relax/tuning_api.h +++ b/include/tvm/relax/tuning_api.h @@ -27,7 +27,9 @@ #include <tvm/ir/transform.h> #include <tvm/meta_schedule/database.h> +#include <utility> #include <vector> + namespace tvm { namespace relax { diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index 4835518e10..982c915759 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -609,8 +609,7 @@ struct MapObjTrait { } } - static bool SEqualReduceTraced(const MapObj* lhs, const MapObj* rhs, - const SEqualReducer& equal) { + static bool SEqualReduceTraced(const MapObj* lhs, const MapObj* rhs, const SEqualReducer& equal) { const ObjectPathPair& map_paths = equal.GetCurrentObjectPaths(); // First, check that every key from `lhs` is also in `rhs`, // and their values are mapped to each other. diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu index 754a5942fb..03517c366d 100644 --- a/src/runtime/contrib/thrust/thrust.cu +++ b/src/runtime/contrib/thrust/thrust.cu @@ -393,7 +393,8 @@ void thrust_scan(DLTensor* data, DLTensor* output, bool exclusive, DLTensor* wor } } -TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sum_scan").set_body_packed([](TVMArgs args, TVMRetValue* ret) { +TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sum_scan") +.set_body_packed([](TVMArgs args, TVMRetValue* ret) { ICHECK(args.num_args == 2 || args.num_args == 3 || args.num_args == 4); DLTensor* data = args[0]; DLTensor* output = args[1]; diff --git a/src/runtime/hexagon/ops/conv2d.h b/src/runtime/hexagon/ops/conv2d.h index 76c6cccff7..79bd021717 100644 --- a/src/runtime/hexagon/ops/conv2d.h +++ b/src/runtime/hexagon/ops/conv2d.h @@ -286,7 +286,7 @@ void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, void* out, void* inp, in template <typename T, int block_height, int block_width, int block_depth> SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* nhwc_flat, bool copy_data) { - tvm::runtime::String vtcm_scope = "global.vtcm"; + tvm::ffi::String vtcm_scope = "global.vtcm"; // Allocate blocks for activations. We will use the block pointers // directly from the allocated area. diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc index 6d4a4839fb..e37a7f3fb1 100644 --- a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc +++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc @@ -22,6 +22,7 @@ #include <hvx_hexagon_protos.h> #include <tvm/runtime/c_runtime_api.h> #include <tvm/runtime/device_api.h> +#include <tvm/runtime/data_type.h> #include <algorithm> #include <cassert> @@ -442,7 +443,7 @@ int conv2d_packed_fp16(TVMValue* args, int* type_codes, int num_args, TVMValue* auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); ICHECK(device_api != nullptr); - tvm::runtime::String vtcm_scope = "global.vtcm"; + tvm::ffi::String vtcm_scope = "global.vtcm"; auto act_vtcm = conv_utils::prepare_nhwc<uint16_t, 8, 4, 32>(device_api, act_flat, /*copy_data=*/true); diff --git a/src/runtime/hexagon/ops/conv2d_quant_hvx.cc b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc index 682eebb137..99f7c245f5 100644 --- a/src/runtime/hexagon/ops/conv2d_quant_hvx.cc +++ b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc @@ -283,7 +283,7 @@ int conv2d_packed_quant(TVMValue* args, int* type_codes, int num_args, TVMValue* auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, false); ICHECK(device_api != nullptr); - tvm::runtime::String vtcm_scope = "global.vtcm"; + tvm::ffi::String vtcm_scope = "global.vtcm"; auto act_vtcm = conv_utils::prepare_nhwc<uint8_t, 8, 8, 32>(device_api, act_flat, /*copy_data=*/true); diff --git a/src/runtime/hexagon/ops/conv_utils.cc b/src/runtime/hexagon/ops/conv_utils.cc index a40e23e463..6cad8c472f 100644 --- a/src/runtime/hexagon/ops/conv_utils.cc +++ b/src/runtime/hexagon/ops/conv_utils.cc @@ -167,7 +167,7 @@ std::tuple<int, int, int, int> getHWIO(const DLTensor* hwio_flat) { SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, int num_chunks, void** ptr_table, int wgt_zp) { - tvm::runtime::String vtcm_scope = "global.vtcm"; + tvm::ffi::String vtcm_scope = "global.vtcm"; auto [h, w, i, o] = getHWIO(hwio_flat); int64_t shape_1d[] = {h * w * i * o}; @@ -182,7 +182,7 @@ SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const DLTensor SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* hwio_flat, int num_chunks, void** ptr_table) { - tvm::runtime::String vtcm_scope = "global.vtcm"; + tvm::ffi::String vtcm_scope = "global.vtcm"; // Allocate one block for filter data. We will need to create our own // pointer table. The reason is that filter chunks cannot be padded diff --git a/src/runtime/memory/memory_manager.cc b/src/runtime/memory/memory_manager.cc index 9ca438d6b0..ed53b04840 100644 --- a/src/runtime/memory/memory_manager.cc +++ b/src/runtime/memory/memory_manager.cc @@ -68,7 +68,7 @@ NDArray StorageObj::AllocNDArrayScoped(int64_t offset, ShapeTuple shape, DLDataT struct StorageScopedAlloc { public: - StorageScopedAlloc(Storage storage) : storage_(storage) {} + explicit StorageScopedAlloc(Storage storage) : storage_(storage) {} void AllocData(DLTensor* tensor, const ffi::Shape& shape, const String& scope, int64_t byte_offset) { @@ -97,9 +97,9 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dt ICHECK(offset + needed_size <= this->buffer.size) << "storage allocation failure, attempted to allocate " << needed_size << " at offset " << offset << " in region that is " << this->buffer.size << "bytes"; - struct StorageAlloc { + class StorageAlloc { public: - StorageAlloc(Storage storage) : storage_(storage) {} + explicit StorageAlloc(Storage storage) : storage_(storage) {} void AllocData(DLTensor* tensor, int64_t offset) { if (storage_->buffer.device.device_type == kDLHexagon) { @@ -216,9 +216,9 @@ NDArray Allocator::Empty(ShapeTuple shape, DLDataType dtype, DLDevice dev, Optional<String> mem_scope) { VerifyDataType(dtype); - struct BufferAlloc { + class BufferAlloc { public: - BufferAlloc(Buffer buffer) : buffer_(buffer) {} + explicit BufferAlloc(Buffer buffer) : buffer_(buffer) {} void AllocData(DLTensor* tensor) { tensor->data = buffer_.data; } void FreeData(DLTensor* tensor) { diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index b1fdca15ed..8911533977 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -158,7 +158,7 @@ NDArray NDArray::CreateView(ShapeTuple shape, DLDataType dtype, // helper allocator class that retains ref count of original NDArray class ViewBasedAlloc { public: - ViewBasedAlloc(NDArray source) : source_(source) {} + explicit ViewBasedAlloc(NDArray source) : source_(source) {} void AllocData(DLTensor* tensor, int64_t byte_offset) { tensor->data = source_.get_mutable()->data; tensor->byte_offset = byte_offset; diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 9e60e18172..b7be8ef098 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -56,7 +56,7 @@ NDArray NDArrayFromRemoteOpaqueHandle(std::shared_ptr<RPCSession> sess, void* ha << "The Device given does not belong to the given session"; class RemoteSpaceAlloc { public: - RemoteSpaceAlloc(RemoteSpace space) : space_(space) {} + explicit RemoteSpaceAlloc(RemoteSpace space) : space_(space) {} void AllocData(DLTensor* tensor) { // the pointer to the remote space is passed in as the data pointer tensor->data = &(space_); diff --git a/src/support/ffi_testing.cc b/src/support/ffi_testing.cc index fc7315eb83..5c59c08792 100644 --- a/src/support/ffi_testing.cc +++ b/src/support/ffi_testing.cc @@ -53,7 +53,6 @@ struct TestAttrs : public AttrsNode<TestAttrs> { TVM_REGISTER_NODE_TYPE(TestAttrs); - TVM_REGISTER_GLOBAL("testing.test_wrap_callback") .set_body_packed([](TVMArgs args, TVMRetValue* ret) { PackedFunc pf = args[0]; @@ -167,7 +166,6 @@ TVM_REGISTER_GLOBAL("testing.sleep_in_ffi").set_body_typed([](double timeout) { std::this_thread::sleep_for(duration); }); - TVM_REGISTER_GLOBAL("testing.ReturnsVariant").set_body_typed([](int x) -> Variant<String, IntImm> { if (x % 2 == 0) { return IntImm(DataType::Int(64), x / 2); diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index c9cbc16b11..530d1772df 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -264,7 +264,7 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const PrimExpr&)>, int alignment{0}; }; /*! - * \brief Convert tvm::runtime::String into llvm::StringRef + * \brief Convert tvm::ffi::String into llvm::StringRef */ static llvm::StringRef MakeStringRef(const String& string) { return llvm::StringRef(string.c_str(), string.size()); diff --git a/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h index 07e1596686..e6840d88bc 100644 --- a/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h +++ b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h @@ -93,7 +93,7 @@ class HexagonUtilsTest : public ::testing::Test { void* flat_mem; T* flat_mem_data; tvm::runtime::DeviceAPI* device_api; - tvm::runtime::String vtcm_scope; + tvm::ffi::String vtcm_scope; DLDataType float16; DLDataType int8, uint8; int64_t tensor_shape[4]; diff --git a/tests/python/codegen/test_target_codegen_cuda.py b/tests/python/codegen/test_target_codegen_cuda.py index e0e37660fc..e96217034f 100644 --- a/tests/python/codegen/test_target_codegen_cuda.py +++ b/tests/python/codegen/test_target_codegen_cuda.py @@ -211,34 +211,6 @@ def test_cuda_make_int8(): check_cuda(64, -3, 2) [email protected]_gpu [email protected]_cuda -def test_cuda_make_int4(): - def check_cuda(n, value, lanes): - dtype = "int4" - dev = tvm.cuda(0) - A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype), name="A") - sch = tvm.tir.Schedule(te.create_prim_func([A])) - y, x = sch.get_loops("A") - sch.vectorize(x) - sch.bind(y, "blockIdx.x") - fun = tvm.compile(sch.mod, target="cuda") - - np_a = np.full((n, lanes), value, dtype="int8") - a = tvm.nd.empty((n, lanes), dtype, dev) - fun(a) - np.testing.assert_equal(a.numpy(), np_a) - - check_cuda(64, 1, 4) - check_cuda(64, 7, 4) - check_cuda(64, 1, 8) - check_cuda(64, 7, 8) - check_cuda(64, 1, 16) - check_cuda(64, 7, 16) - check_cuda(64, 1, 32) - check_cuda(64, 7, 32) - - @tvm.testing.requires_gpu @tvm.testing.requires_cuda def test_cuda_inf_nan():
