This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch refactor-s2
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 1e0a870827a2b1bfbbc3bb1677b9767fd8a43e5b
Author: tqchen <[email protected]>
AuthorDate: Mon Apr 28 20:51:07 2025 -0400

    Update to fix gpu and ffi
---
 3rdparty/cutlass_fpA_intB_gemm                     |  2 +-
 apps/hexagon_launcher/launcher_main.cc             |  2 +-
 ffi/include/tvm/ffi/container/ndarray.h            |  6 +++--
 ffi/include/tvm/ffi/function.h                     |  1 +
 include/tvm/ir/module.h                            |  2 --
 include/tvm/relax/tuning_api.h                     |  2 ++
 src/node/structural_hash.cc                        |  3 +--
 src/runtime/contrib/thrust/thrust.cu               |  3 ++-
 src/runtime/hexagon/ops/conv2d.h                   |  2 +-
 src/runtime/hexagon/ops/conv2d_fp16_hvx.cc         |  3 ++-
 src/runtime/hexagon/ops/conv2d_quant_hvx.cc        |  2 +-
 src/runtime/hexagon/ops/conv_utils.cc              |  4 ++--
 src/runtime/memory/memory_manager.cc               | 10 ++++----
 src/runtime/ndarray.cc                             |  2 +-
 src/runtime/rpc/rpc_module.cc                      |  2 +-
 src/support/ffi_testing.cc                         |  2 --
 src/target/llvm/codegen_llvm.h                     |  2 +-
 .../cpp-runtime/hexagon/hexagon_conv_utils_test.h  |  2 +-
 tests/python/codegen/test_target_codegen_cuda.py   | 28 ----------------------
 19 files changed, 27 insertions(+), 53 deletions(-)

diff --git a/3rdparty/cutlass_fpA_intB_gemm b/3rdparty/cutlass_fpA_intB_gemm
index fdef230791..d463617c21 160000
--- a/3rdparty/cutlass_fpA_intB_gemm
+++ b/3rdparty/cutlass_fpA_intB_gemm
@@ -1 +1 @@
-Subproject commit fdef2307917ec2c7cc5becc29fb95d77498484bd
+Subproject commit d463617c215969d9d2333a216042ddb963949d0d
diff --git a/apps/hexagon_launcher/launcher_main.cc 
b/apps/hexagon_launcher/launcher_main.cc
index 087b2a1b00..8690996684 100644
--- a/apps/hexagon_launcher/launcher_main.cc
+++ b/apps/hexagon_launcher/launcher_main.cc
@@ -102,7 +102,7 @@ int main(int argc, char* argv[]) {
   for (int i = 0, e = config.inputs.size(); i != e; ++i) {
     const TensorConfig& tc = config.inputs[i];
     input_meta->ndim = tc.shape.size();
-    input_meta->dtype = tvm::runtime::StringToDLDataType(tc.dtype);
+    input_meta->dtype = tvm::ffi::StringToDLDataType(tc.dtype);
     std::copy(tc.shape.begin(), tc.shape.end(), input_meta->shape);
 
     auto* input_data = session.alloc<unsigned char>(input_meta->data_size());
diff --git a/ffi/include/tvm/ffi/container/ndarray.h 
b/ffi/include/tvm/ffi/container/ndarray.h
index 66bdbb6d70..065c8db2cd 100644
--- a/ffi/include/tvm/ffi/container/ndarray.h
+++ b/ffi/include/tvm/ffi/container/ndarray.h
@@ -29,6 +29,8 @@
 #include <tvm/ffi/error.h>
 #include <tvm/ffi/type_traits.h>
 
+#include <utility>
+
 namespace tvm {
 namespace ffi {
 
@@ -85,7 +87,7 @@ inline bool IsAligned(const DLTensor& arr, size_t alignment) {
  */
 inline size_t GetDataSize(int64_t numel, DLDataType dtype) {
   // compatible handling sub-byte uint1(bool), which usually stored as uint8_t
-  // TODO: revisit and switch to kDLBool
+  // TODO(tqchen): revisit and switch to kDLBool
   if (dtype.code == kDLUInt && dtype.bits == 1 && dtype.lanes == 1) {
     return numel;
   }
@@ -197,7 +199,7 @@ class NDArrayObjFromNDAlloc : public NDArrayObj {
 template <typename TDLPackManagedTensor>
 class NDArrayObjFromDLPack : public NDArrayObj {
  public:
-  NDArrayObjFromDLPack(TDLPackManagedTensor* tensor) : tensor_(tensor) {
+  explicit NDArrayObjFromDLPack(TDLPackManagedTensor* tensor) : 
tensor_(tensor) {
     *static_cast<DLTensor*>(this) = tensor_->dl_tensor;
     // set strides to nullptr if the tensor is contiguous.
     if (IsContiguous(tensor->dl_tensor)) {
diff --git a/ffi/include/tvm/ffi/function.h b/ffi/include/tvm/ffi/function.h
index d53bb01934..d5e31372c6 100644
--- a/ffi/include/tvm/ffi/function.h
+++ b/ffi/include/tvm/ffi/function.h
@@ -32,6 +32,7 @@
 #include <functional>
 #include <string>
 #include <utility>
+#include <vector>
 
 namespace tvm {
 namespace ffi {
diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h
index b41725e0c0..66637f67d9 100644
--- a/include/tvm/ir/module.h
+++ b/include/tvm/ir/module.h
@@ -302,8 +302,6 @@ namespace attr {
  * \brief Name of the module
  *
  * Type: String
- *
- * \sa tvm::runtime::String
  */
 constexpr const char* kModuleName = "mod_name";
 
diff --git a/include/tvm/relax/tuning_api.h b/include/tvm/relax/tuning_api.h
index 1e9a74cbfb..6d72c56025 100644
--- a/include/tvm/relax/tuning_api.h
+++ b/include/tvm/relax/tuning_api.h
@@ -27,7 +27,9 @@
 #include <tvm/ir/transform.h>
 #include <tvm/meta_schedule/database.h>
 
+#include <utility>
 #include <vector>
+
 namespace tvm {
 namespace relax {
 
diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc
index 4835518e10..982c915759 100644
--- a/src/node/structural_hash.cc
+++ b/src/node/structural_hash.cc
@@ -609,8 +609,7 @@ struct MapObjTrait {
     }
   }
 
-  static bool SEqualReduceTraced(const MapObj* lhs, const MapObj* rhs,
-                                 const SEqualReducer& equal) {
+  static bool SEqualReduceTraced(const MapObj* lhs, const MapObj* rhs, const 
SEqualReducer& equal) {
     const ObjectPathPair& map_paths = equal.GetCurrentObjectPaths();
     // First, check that every key from `lhs` is also in `rhs`,
     // and their values are mapped to each other.
diff --git a/src/runtime/contrib/thrust/thrust.cu 
b/src/runtime/contrib/thrust/thrust.cu
index 754a5942fb..03517c366d 100644
--- a/src/runtime/contrib/thrust/thrust.cu
+++ b/src/runtime/contrib/thrust/thrust.cu
@@ -393,7 +393,8 @@ void thrust_scan(DLTensor* data, DLTensor* output, bool 
exclusive, DLTensor* wor
   }
 }
 
-TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sum_scan").set_body_packed([](TVMArgs 
args, TVMRetValue* ret) {
+TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sum_scan")
+.set_body_packed([](TVMArgs args, TVMRetValue* ret) {
   ICHECK(args.num_args == 2 || args.num_args == 3 || args.num_args == 4);
   DLTensor* data = args[0];
   DLTensor* output = args[1];
diff --git a/src/runtime/hexagon/ops/conv2d.h b/src/runtime/hexagon/ops/conv2d.h
index 76c6cccff7..79bd021717 100644
--- a/src/runtime/hexagon/ops/conv2d.h
+++ b/src/runtime/hexagon/ops/conv2d.h
@@ -286,7 +286,7 @@ void chunkify_hwio_8b(void** out_ptr, int out_ptr_size, 
void* out, void* inp, in
 template <typename T, int block_height, int block_width, int block_depth>
 SDLTensor<4> prepare_nhwc(tvm::runtime::DeviceAPI* device_api, const DLTensor* 
nhwc_flat,
                           bool copy_data) {
-  tvm::runtime::String vtcm_scope = "global.vtcm";
+  tvm::ffi::String vtcm_scope = "global.vtcm";
 
   // Allocate blocks for activations. We will use the block pointers
   // directly from the allocated area.
diff --git a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc 
b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
index 6d4a4839fb..e37a7f3fb1 100644
--- a/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
+++ b/src/runtime/hexagon/ops/conv2d_fp16_hvx.cc
@@ -22,6 +22,7 @@
 #include <hvx_hexagon_protos.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
+#include <tvm/runtime/data_type.h>
 
 #include <algorithm>
 #include <cassert>
@@ -442,7 +443,7 @@ int conv2d_packed_fp16(TVMValue* args, int* type_codes, int 
num_args, TVMValue*
 
   auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, 
false);
   ICHECK(device_api != nullptr);
-  tvm::runtime::String vtcm_scope = "global.vtcm";
+  tvm::ffi::String vtcm_scope = "global.vtcm";
 
   auto act_vtcm =
       conv_utils::prepare_nhwc<uint16_t, 8, 4, 32>(device_api, act_flat, 
/*copy_data=*/true);
diff --git a/src/runtime/hexagon/ops/conv2d_quant_hvx.cc 
b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc
index 682eebb137..99f7c245f5 100644
--- a/src/runtime/hexagon/ops/conv2d_quant_hvx.cc
+++ b/src/runtime/hexagon/ops/conv2d_quant_hvx.cc
@@ -283,7 +283,7 @@ int conv2d_packed_quant(TVMValue* args, int* type_codes, 
int num_args, TVMValue*
 
   auto* device_api = tvm::runtime::DeviceAPI::Get(conv_utils::hexagon_device, 
false);
   ICHECK(device_api != nullptr);
-  tvm::runtime::String vtcm_scope = "global.vtcm";
+  tvm::ffi::String vtcm_scope = "global.vtcm";
 
   auto act_vtcm =
       conv_utils::prepare_nhwc<uint8_t, 8, 8, 32>(device_api, act_flat, 
/*copy_data=*/true);
diff --git a/src/runtime/hexagon/ops/conv_utils.cc 
b/src/runtime/hexagon/ops/conv_utils.cc
index a40e23e463..6cad8c472f 100644
--- a/src/runtime/hexagon/ops/conv_utils.cc
+++ b/src/runtime/hexagon/ops/conv_utils.cc
@@ -167,7 +167,7 @@ std::tuple<int, int, int, int> getHWIO(const DLTensor* 
hwio_flat) {
 
 SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* device_api, const 
DLTensor* hwio_flat,
                              int num_chunks, void** ptr_table, int wgt_zp) {
-  tvm::runtime::String vtcm_scope = "global.vtcm";
+  tvm::ffi::String vtcm_scope = "global.vtcm";
 
   auto [h, w, i, o] = getHWIO(hwio_flat);
   int64_t shape_1d[] = {h * w * i * o};
@@ -182,7 +182,7 @@ SDLTensor<4> prepare_hwio_8b(tvm::runtime::DeviceAPI* 
device_api, const DLTensor
 
 SDLTensor<4> prepare_hwio(tvm::runtime::DeviceAPI* device_api, const DLTensor* 
hwio_flat,
                           int num_chunks, void** ptr_table) {
-  tvm::runtime::String vtcm_scope = "global.vtcm";
+  tvm::ffi::String vtcm_scope = "global.vtcm";
 
   // Allocate one block for filter data. We will need to create our own
   // pointer table. The reason is that filter chunks cannot be padded
diff --git a/src/runtime/memory/memory_manager.cc 
b/src/runtime/memory/memory_manager.cc
index 9ca438d6b0..ed53b04840 100644
--- a/src/runtime/memory/memory_manager.cc
+++ b/src/runtime/memory/memory_manager.cc
@@ -68,7 +68,7 @@ NDArray StorageObj::AllocNDArrayScoped(int64_t offset, 
ShapeTuple shape, DLDataT
 
   struct StorageScopedAlloc {
    public:
-    StorageScopedAlloc(Storage storage) : storage_(storage) {}
+    explicit StorageScopedAlloc(Storage storage) : storage_(storage) {}
 
     void AllocData(DLTensor* tensor, const ffi::Shape& shape, const String& 
scope,
                    int64_t byte_offset) {
@@ -97,9 +97,9 @@ NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple 
shape, DLDataType dt
   ICHECK(offset + needed_size <= this->buffer.size)
       << "storage allocation failure, attempted to allocate " << needed_size 
<< " at offset "
       << offset << " in region that is " << this->buffer.size << "bytes";
-  struct StorageAlloc {
+  class StorageAlloc {
    public:
-    StorageAlloc(Storage storage) : storage_(storage) {}
+    explicit StorageAlloc(Storage storage) : storage_(storage) {}
 
     void AllocData(DLTensor* tensor, int64_t offset) {
       if (storage_->buffer.device.device_type == kDLHexagon) {
@@ -216,9 +216,9 @@ NDArray Allocator::Empty(ShapeTuple shape, DLDataType 
dtype, DLDevice dev,
                          Optional<String> mem_scope) {
   VerifyDataType(dtype);
 
-  struct BufferAlloc {
+  class BufferAlloc {
    public:
-    BufferAlloc(Buffer buffer) : buffer_(buffer) {}
+    explicit BufferAlloc(Buffer buffer) : buffer_(buffer) {}
 
     void AllocData(DLTensor* tensor) { tensor->data = buffer_.data; }
     void FreeData(DLTensor* tensor) {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index b1fdca15ed..8911533977 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -158,7 +158,7 @@ NDArray NDArray::CreateView(ShapeTuple shape, DLDataType 
dtype,
   // helper allocator class that retains ref count of original NDArray
   class ViewBasedAlloc {
    public:
-    ViewBasedAlloc(NDArray source) : source_(source) {}
+    explicit ViewBasedAlloc(NDArray source) : source_(source) {}
     void AllocData(DLTensor* tensor, int64_t byte_offset) {
       tensor->data = source_.get_mutable()->data;
       tensor->byte_offset = byte_offset;
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 9e60e18172..b7be8ef098 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -56,7 +56,7 @@ NDArray 
NDArrayFromRemoteOpaqueHandle(std::shared_ptr<RPCSession> sess, void* ha
       << "The Device given does not belong to the given session";
   class RemoteSpaceAlloc {
    public:
-    RemoteSpaceAlloc(RemoteSpace space) : space_(space) {}
+    explicit RemoteSpaceAlloc(RemoteSpace space) : space_(space) {}
     void AllocData(DLTensor* tensor) {
       // the pointer to the remote space is passed in as the data pointer
       tensor->data = &(space_);
diff --git a/src/support/ffi_testing.cc b/src/support/ffi_testing.cc
index fc7315eb83..5c59c08792 100644
--- a/src/support/ffi_testing.cc
+++ b/src/support/ffi_testing.cc
@@ -53,7 +53,6 @@ struct TestAttrs : public AttrsNode<TestAttrs> {
 
 TVM_REGISTER_NODE_TYPE(TestAttrs);
 
-
 TVM_REGISTER_GLOBAL("testing.test_wrap_callback")
     .set_body_packed([](TVMArgs args, TVMRetValue* ret) {
       PackedFunc pf = args[0];
@@ -167,7 +166,6 @@ 
TVM_REGISTER_GLOBAL("testing.sleep_in_ffi").set_body_typed([](double timeout) {
   std::this_thread::sleep_for(duration);
 });
 
-
 TVM_REGISTER_GLOBAL("testing.ReturnsVariant").set_body_typed([](int x) -> 
Variant<String, IntImm> {
   if (x % 2 == 0) {
     return IntImm(DataType::Int(64), x / 2);
diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h
index c9cbc16b11..530d1772df 100644
--- a/src/target/llvm/codegen_llvm.h
+++ b/src/target/llvm/codegen_llvm.h
@@ -264,7 +264,7 @@ class CodeGenLLVM : public ExprFunctor<llvm::Value*(const 
PrimExpr&)>,
     int alignment{0};
   };
   /*!
-   * \brief Convert tvm::runtime::String into llvm::StringRef
+   * \brief Convert tvm::ffi::String into llvm::StringRef
    */
   static llvm::StringRef MakeStringRef(const String& string) {
     return llvm::StringRef(string.c_str(), string.size());
diff --git a/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h 
b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h
index 07e1596686..e6840d88bc 100644
--- a/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h
+++ b/tests/cpp-runtime/hexagon/hexagon_conv_utils_test.h
@@ -93,7 +93,7 @@ class HexagonUtilsTest : public ::testing::Test {
   void* flat_mem;
   T* flat_mem_data;
   tvm::runtime::DeviceAPI* device_api;
-  tvm::runtime::String vtcm_scope;
+  tvm::ffi::String vtcm_scope;
   DLDataType float16;
   DLDataType int8, uint8;
   int64_t tensor_shape[4];
diff --git a/tests/python/codegen/test_target_codegen_cuda.py 
b/tests/python/codegen/test_target_codegen_cuda.py
index e0e37660fc..e96217034f 100644
--- a/tests/python/codegen/test_target_codegen_cuda.py
+++ b/tests/python/codegen/test_target_codegen_cuda.py
@@ -211,34 +211,6 @@ def test_cuda_make_int8():
     check_cuda(64, -3, 2)
 
 
[email protected]_gpu
[email protected]_cuda
-def test_cuda_make_int4():
-    def check_cuda(n, value, lanes):
-        dtype = "int4"
-        dev = tvm.cuda(0)
-        A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, 
dtype=dtype), name="A")
-        sch = tvm.tir.Schedule(te.create_prim_func([A]))
-        y, x = sch.get_loops("A")
-        sch.vectorize(x)
-        sch.bind(y, "blockIdx.x")
-        fun = tvm.compile(sch.mod, target="cuda")
-
-        np_a = np.full((n, lanes), value, dtype="int8")
-        a = tvm.nd.empty((n, lanes), dtype, dev)
-        fun(a)
-        np.testing.assert_equal(a.numpy(), np_a)
-
-    check_cuda(64, 1, 4)
-    check_cuda(64, 7, 4)
-    check_cuda(64, 1, 8)
-    check_cuda(64, 7, 8)
-    check_cuda(64, 1, 16)
-    check_cuda(64, 7, 16)
-    check_cuda(64, 1, 32)
-    check_cuda(64, 7, 32)
-
-
 @tvm.testing.requires_gpu
 @tvm.testing.requires_cuda
 def test_cuda_inf_nan():

Reply via email to