(tvm) branch main updated (593a4bd9d7 -> b98bc66584)

2024-01-24 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 593a4bd9d7 [Relax] NDArray Cache Update with DLTensor Support (#16464)
 add b98bc66584 [Relax] Memory planning for "partially dynamic" shapes (#16466)

No new revisions were added by this update.

Summary of changes:
 src/relax/transform/static_plan_block_memory.cc| 12 ++--
 .../test_transform_static_plan_block_memory.py | 64 ++
 2 files changed, 72 insertions(+), 4 deletions(-)



(tvm) branch main updated: [ROCm][WebGPU] Intrin Dispatch: `tanh`, `erf`, `log` (#16441)

2024-01-21 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new ed5c46aebc [ROCm][WebGPU] Intrin Dispatch: `tanh`, `erf`, `log` (#16441)
ed5c46aebc is described below

commit ed5c46aebcfc89a6d13c33272572f2be5d9575d7
Author: Junru Shao 
AuthorDate: Sun Jan 21 06:43:27 2024 -0800

[ROCm][WebGPU] Intrin Dispatch: `tanh`, `erf`, `log` (#16441)

This commit fixes a few minor intrinsic dispatch issues in the ROCm
and WebGPU backends that affect LLM compilation, including Mixtral,
RedPajama (GPT-NeoX), and GPT-BigCode.
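
One of the touched dispatch decisions is `ceil_log2` in `python/tvm/topi/math.py`. As a rough plain-Python sketch of the per-target choice after this change (hypothetical helper, not a TVM API; see the diff below):

```python
def ceil_log2_lowering(target_kind: str, device_name: str = "") -> str:
    """Which lowering the patched ceil_log2 picks for integer inputs."""
    if target_kind == "vulkan":
        return "clz"       # count-leading-zeros path, avoids floating point
    if "adreno" in device_name or target_kind in ("metal", "rocm", "webgpu"):
        return "float32"   # targets routed through the float32 path by this commit
    return "float64"       # default, highest-precision path

print(ceil_log2_lowering("rocm"))  # float32
print(ceil_log2_lowering("llvm"))  # float64
```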
---
 python/tvm/autotvm/tuner/droplet_tuner.py |  6 +-
 python/tvm/topi/math.py   | 11 +-
 src/target/llvm/intrin_rule_rocm.cc   | 34 +--
 3 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/python/tvm/autotvm/tuner/droplet_tuner.py 
b/python/tvm/autotvm/tuner/droplet_tuner.py
index d58bfa4989..d115353d77 100644
--- a/python/tvm/autotvm/tuner/droplet_tuner.py
+++ b/python/tvm/autotvm/tuner/droplet_tuner.py
@@ -18,8 +18,9 @@
 
 import logging
 import os
+
 import numpy as np
-from scipy import stats
+
 from .tuner import Tuner
 
 LOGGER = logging.getLogger("autotvm")
@@ -85,6 +86,9 @@ class DropletTuner(Tuner):
 def p_value(self, elem_1, elem_2):
 if len(elem_1) <= 1 or len(elem_2) <= 1:
 return True
+
+from scipy import stats  # pylint: disable=import-outside-toplevel
+
 return stats.ttest_ind(np.array(elem_1), np.array(elem_2)).pvalue <= self.pvalue
 
 def next_batch(self, batch_size):
diff --git a/python/tvm/topi/math.py b/python/tvm/topi/math.py
index 8b66ca2cc9..63a1e48c2b 100644
--- a/python/tvm/topi/math.py
+++ b/python/tvm/topi/math.py
@@ -20,8 +20,7 @@ import tvm
 from tvm import te
 from tvm.tir import PrimExpr
 
-from . import tag
-from . import cpp
+from . import cpp, tag
 from .utils import get_const_tuple
 
 
@@ -855,17 +854,17 @@ def ceil_log2(x):
 if "float" in x.dtype:
 return tvm.tir.ceil(tvm.tir.log2(x))
 
-if "vulkan" in tvm.target.Target.current().kind.name:
+target = tvm.target.Target.current()
+
+if "vulkan" in target.kind.name:
 clz = tvm.tir.clz(x)
 bits = int(x.dtype[-2:])
 res = tvm.tir.if_then_else(x & (x - 1) == 0, bits - clz - 1, bits - clz)
-
 if res.dtype != x.dtype:
 return cast(res, x.dtype)
-
 return res
 
-if "adreno" in tvm.target.Target.current().device_name:
+if "adreno" in target.device_name or target.kind.name in ["metal", "rocm", 
"webgpu"]:
 return cast(tvm.tir.ceil(tvm.tir.log2(cast(x, "float32"))), x.dtype)
 
 return cast(tvm.tir.ceil(tvm.tir.log2(cast(x, "float64"))), x.dtype)
diff --git a/src/target/llvm/intrin_rule_rocm.cc 
b/src/target/llvm/intrin_rule_rocm.cc
index 0fbfade335..c80d8388da 100644
--- a/src/target/llvm/intrin_rule_rocm.cc
+++ b/src/target/llvm/intrin_rule_rocm.cc
@@ -31,12 +31,15 @@
 
 #include 
 
+#include "../intrin_rule.h"
 #include "intrin_rule_llvm.h"
 
 namespace tvm {
 namespace codegen {
 
 inline PrimExpr DispatchPureExternOCML(const PrimExpr& e) {
+  // NOTE: OCML dispatch fails to work properly with vectorization, and thus should be used with
+  // extreme caution.
   using namespace tir;
   const CallNode* call = e.as();
   ICHECK(call != nullptr);
@@ -150,13 +153,6 @@ TVM_REGISTER_OP("tir.exp2")
 .set_attr("rocm.FLowerIntrinsic",
DispatchLLVMPureIntrin<::llvm::Intrinsic::exp2, 
1>);
 
-// TVM_REGISTER_OP("tir.exp10")
-// .set_attr("rocm.FLowerIntrinsic",
-//
DispatchLLVMPureIntrin<::llvm::Intrinsic::exp10, 1>);
-
-// TVM_REGISTER_OP("tir.erf").set_attr("rocm.FLowerIntrinsic",
-//  
DispatchPureExternOCML);
-
 TVM_REGISTER_OP("tir.fma").set_attr(
 "rocm.FLowerIntrinsic", DispatchLLVMPureIntrin<::llvm::Intrinsic::fmuladd, 
3>);
 
@@ -178,27 +174,35 @@ TVM_REGISTER_OP("tir.sqrt")
 TVM_REGISTER_OP("tir.pow").set_attr(
 "rocm.FLowerIntrinsic", DispatchLLVMPureIntrin<::llvm::Intrinsic::pow, 2>);
 
-// TVM_REGISTER_OP("tir.tanh")
-// .set_attr("rocm.FLowerIntrinsic", 
DispatchPureExternOCML);
+TVM_REGISTER_OP("tir.cos").set_attr(
+"rocm.FLowerIntrinsic", DispatchLLVMPureIntrin<::llvm::Intrinsic::cos, 1>);
+
+TVM_REGISTER_OP("tir.sin").set_attr(
+"rocm.FLowerIntrinsic", DispatchLLVMPureIntrin<::llvm::Intrinsic::sin, 1>);
+
+TV

(tvm) branch main updated: [TIR] Fix of inter thread reduction with shared memory prefetch (#16406)

2024-01-20 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 81f8690aaa [TIR] Fix of inter thread reduction with shared memory prefetch (#16406)
81f8690aaa is described below

commit 81f8690aaa15d957acdf29bf85624513161c3936
Author: Lei Wang <34334180+leiwang1...@users.noreply.github.com>
AuthorDate: Sun Jan 21 15:44:33 2024 +0800

[TIR] Fix of inter thread reduction with shared memory prefetch (#16406)

This is a fix for `LowerCrossThreadReduction`: the pass used to remove every loop with a thread binding under the inter-thread reduction block, which introduces issues when other, non-reduction blocks also live under that reduction thread.

Before removing a thread-bound loop, we now check whether the block(s) under the loop carry a reduction block var. If they do not, the block is not a reduction block and the thread-bound loop should be kept. Otherwise, we remove the thread-bound loop as usual.

Related discussion: https://discuss.tvm.apache.org/t/missing-thread-bind-loops-under-block-reduction-when-transformed-with-tir/16232/6
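
As an editor's illustration, here is a minimal Python sketch of the new check (toy data structures, not TVM's C++ classes): a thread-bound loop is only stripped when the blocks beneath it actually carry a reduction block var.

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Block:
    # iter_types uses "S" for spatial and "R" for reduction block vars
    iter_types: List[str] = field(default_factory=list)

@dataclass
class ThreadBoundLoop:
    blocks: List[Block] = field(default_factory=list)

def has_reduction_block_vars(loop: ThreadBoundLoop) -> bool:
    return any("R" in b.iter_types for b in loop.blocks)

def lower(loop: ThreadBoundLoop) -> str:
    # Only strip the thread-bound loop when it really wraps a reduction;
    # otherwise keep it so non-reduction blocks stay correctly bound.
    return "remove loop, keep body" if has_reduction_block_vars(loop) else "keep loop"

print(lower(ThreadBoundLoop([Block(["S", "R"])])))  # remove loop, keep body
print(lower(ThreadBoundLoop([Block(["S", "S"])])))  # keep loop
```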
---
 src/tir/transforms/lower_cross_thread_reduction.cc |  37 +++-
 ...t_tir_transform_lower_cross_thread_reduction.py | 226 +
 2 files changed, 262 insertions(+), 1 deletion(-)

diff --git a/src/tir/transforms/lower_cross_thread_reduction.cc 
b/src/tir/transforms/lower_cross_thread_reduction.cc
index 249555ad6e..0146e2aebf 100644
--- a/src/tir/transforms/lower_cross_thread_reduction.cc
+++ b/src/tir/transforms/lower_cross_thread_reduction.cc
@@ -198,6 +198,37 @@ class BufferReplacer : private StmtExprMutator {
  */
 class InThreadReducerMaker : private StmtMutator {
  public:
+  /*!
+   * \brief Visitor class to collect all reduction block variables under a 
loop.
+   */
+  class UnderLoopReductionBlockVarCollector : public StmtVisitor {
+   public:
+/*!
+ * \brief Check if the given statement has any reduction blocks.
+ * \param stmt The statement to check.
+ * \return True if the statement has reduction blocks, false otherwise.
+ */
+static bool CheckHasReductionBlocks(const Stmt& stmt) {
+  UnderLoopReductionBlockVarCollector collector;
+  collector(stmt);
+  return collector.reduction_block_vars_.size() > 0;
+}
+
+   private:
+void VisitStmt_(const BlockNode* block) final {
+  Array iter_vars = block->iter_vars;
+  for (const IterVar& iter_var : block->iter_vars) {
+if (iter_var->iter_type == kCommReduce) {
+  reduction_block_vars_.push_back(iter_var);
+}
+  }
+  StmtVisitor::VisitStmt_(block);
+}
+
+/*! \brief the map from thread tag to its extent */
+Array reduction_block_vars_;
+  };
+
   static Optional Make(const BlockRealizeNode* src_realize,
  Optional tgt_realize, Stmt stmt) {
 return InThreadReducerMaker(src_realize, 
std::move(tgt_realize))(std::move(stmt));
@@ -220,7 +251,11 @@ class InThreadReducerMaker : private StmtMutator {
 if (Optional opt_res = 
Downcast>(StmtMutator::VisitStmt_(loop))) {
   For res = opt_res.value();
   if (res->thread_binding.defined()) {
-return res->body;
+UnderLoopReductionBlockVarCollector collector;
+if (!res->body.defined() || collector.CheckHasReductionBlocks(res)) {
+  return res->body;
+}
+return std::move(res);
   } else {
 return std::move(res);
   }
diff --git 
a/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py 
b/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
index f42f8ca85f..aa55b25f16 100644
--- 
a/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
+++ 
b/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py
@@ -496,6 +496,225 @@ def lowered_single_reduction_loop_with_block_predicate(
 )
 
 
+@T.prim_func
+def spatial_reduction_with_shared_prefetch(
+A: T.Buffer((128, 150528), "float32"),
+B: T.Buffer((128, 150528), "float32"),
+C: T.Buffer((128, 128), "float32"),
+):
+C_local = T.alloc_buffer((128, 128), scope="local")
+A_shared = T.alloc_buffer((128, 150528), scope="shared")
+B_shared = T.alloc_buffer((128, 150528), scope="shared")
+for ax0_0_ax1_0_fused in T.thread_binding(256, thread="blockIdx.x"):
+for ax0_1_ax1_1_fused in T.thread_binding(64, thread="threadIdx.y"):
+for ax2_1_1_fused in T.thread_binding(2, thread="threadIdx.x"):
+for ax2_0 in range(392):
+   

(tvm) branch unity updated: [Unity] Improve buffer allocation for handling duplicated buffer names. (#16437)

2024-01-20 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 8f2e820434 [Unity] Improve buffer allocation for handling duplicated buffer names. (#16437)
8f2e820434 is described below

commit 8f2e82043492423de96bb2a73193424dfa7418f1
Author: Lei Wang <34334180+leiwang1...@users.noreply.github.com>
AuthorDate: Sun Jan 21 15:44:06 2024 +0800

[Unity] Improve buffer allocation for handling duplicated buffer names. (#16437)

As described in https://github.com/apache/tvm/issues/16433, the Relax pass FuseTIR creates intermediate buffer names with

```c++
n->name = param->name_hint + "_intermediate";
```

which may lead to bugs when two fused blocks share the same param name. For example, in the conv+add+multiply+add pattern I encountered in ResNet-18, the two fused add operators' blocks may end up pointing to the same buffer.

So we should provide a method that generates a unique name for each duplicated buffer.

```cpp
auto unify_name_hints = [this, &buffer, &param]() {
  String base_name = buffer->name;
  String unique_name = base_name + "_intermediate";
  size_t unique_id = 0;
  std::unordered_set<String> names;

  for (auto& _buffer : func_info_.alloc_buffers) {
    names.insert(_buffer->name);
  }

  while (names.find(unique_name) != names.end()) {
    unique_name = unique_name + "_" + std::to_string(++unique_id);
  }
  return unique_name;
};
```
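
For illustration, the same uniquing scheme as a small standalone Python sketch (hypothetical helper, not the TVM API):

```python
def unique_intermediate_name(base_name: str, existing: set) -> str:
    """Mirror of unify_name_hints: append a counter until the name is unused."""
    unique_name = base_name + "_intermediate"
    unique_id = 0
    while unique_name in existing:
        unique_id += 1
        unique_name = unique_name + "_" + str(unique_id)
    return unique_name

names = set()
for _ in range(2):  # two fused "add" blocks whose params share the name "lv"
    names.add(unique_intermediate_name("lv", names))
print(sorted(names))  # ['lv_intermediate', 'lv_intermediate_1']
```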
---
 src/relax/transform/fuse_tir.cc   | 17 +-
 tests/python/relax/test_transform_fuse_tir.py | 86 +++
 2 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/src/relax/transform/fuse_tir.cc b/src/relax/transform/fuse_tir.cc
index e5e2883a29..fc3d79ae62 100644
--- a/src/relax/transform/fuse_tir.cc
+++ b/src/relax/transform/fuse_tir.cc
@@ -682,10 +682,25 @@ class FusedTIRConstructor : public ExprVisitor {
   const tir::Var& param = output_params[i];
   const tir::Buffer& buffer = func->buffer_map.at(param);
 
+  auto unify_name_hints = [this, , ]() {
+String base_name = buffer->name;
+String unique_name = base_name + "_intermediate";
+size_t unique_id = 0;
+std::unordered_set names;
+
+for (auto& _buffer : func_info_.alloc_buffers) {
+  names.insert(_buffer->name);
+}
+
+while (names.find(unique_name) != names.end()) {
+  unique_name = unique_name + "_" + std::to_string(++unique_id);
+}
+return unique_name;
+  };
   // Update buffer with new symbolic shape according to the sinfo
   auto n = make_object(*buffer.get());
   n->shape = output_shapes[i];
-  n->name = param->name_hint + "_intermediate";
+  n->name = unify_name_hints();
   tir::Buffer new_buffer(n);
   func_info_.alloc_buffers.push_back(new_buffer);
   alloc_buffers.push_back(new_buffer);
diff --git a/tests/python/relax/test_transform_fuse_tir.py 
b/tests/python/relax/test_transform_fuse_tir.py
index c8f98e8724..143670c701 100644
--- a/tests/python/relax/test_transform_fuse_tir.py
+++ b/tests/python/relax/test_transform_fuse_tir.py
@@ -1351,6 +1351,92 @@ def test_tuple_input_unused_field():
 _check(Module, Expected)
 
 
+def test_unique_duplicated_buffer_allocation():
+@I.ir_module
+class Module:
+@T.prim_func(private=True)
+def add(
+A: T.Buffer((T.int64(4096), T.int64(4096)), "float16"),
+Out: T.Buffer((T.int64(4096), T.int64(4096)), "float16"),
+):
+for i, j in T.grid(T.int64(4096), T.int64(4096)):
+with T.block("add"):
+vi, vj = T.axis.remap("SS", [i, j])
+Out[vi, vj] = A[vi, vj] + T.float16(1.0)
+
+@T.prim_func(private=True)
+def add1(
+A: T.Buffer((T.int64(4096), T.int64(4096)), "float16"),
+Out: T.Buffer((T.int64(4096), T.int64(4096)), "float16"),
+):
+for i, j in T.grid(T.int64(4096), T.int64(4096)):
+with T.block("add"):
+vi, vj = T.axis.remap("SS", [i, j])
+Out[vi, vj] = A[vi, vj] + T.float16(2.0)
+
+@R.function
+def main(
+input_embeds: R.Tensor((4096, 4096), dtype="float16"),
+) -> R.Tensor((4096, 4096), dtype="float16"):
+cls = Module
+with R.dataflow():
+gv: R.Tensor((4096, 4096), dtype="float16") = 
cls.fused_func(input_embeds)
+ 

(tvm) branch main updated: [Metal] Dispatch numerically stable tanh for metal (#16438)

2024-01-19 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new ffa404fb48 [Metal] Dispatch numerically stable tanh for metal (#16438)
ffa404fb48 is described below

commit ffa404fb48b9445cc3490d343f76442c01aef46d
Author: Charlie Ruan <53290280+charliefr...@users.noreply.github.com>
AuthorDate: Sat Jan 20 01:30:43 2024 -0500

[Metal] Dispatch numerically stable tanh for metal (#16438)

Prior to this PR, `tanh(x)` returns `NaN` on Metal when `x > 45.0`.

Metal's built-in tanh is implemented as `(t - 1.0) / (t + 1.0)`, where `t = exp(2.0 * x)`. Hence for large `x`, `t` becomes `inf`, causing `tanh(x)` to be `NaN`.

A numerically stable `tanh` is already implemented for `llvm`; this PR lifts it into `src/target/intrin_rule.cc` and applies the same rule to Metal as well.
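
A small NumPy check makes the failure mode concrete (illustrative only; float32 here stands in for Metal's built-in precision):

```python
import numpy as np

x = np.float32(50.0)

# Naive form used by Metal's builtin: (t - 1) / (t + 1) with t = exp(2x)
t = np.exp(np.float32(2.0) * x)                    # overflows to inf in float32
naive = (t - np.float32(1)) / (t + np.float32(1))  # inf / inf -> nan

# Stable form for x >= 0: (1 - e) / (1 + e) with e = exp(-2x)
e = np.exp(np.float32(-2.0) * x)                   # underflows harmlessly to 0
stable = (np.float32(1) - e) / (np.float32(1) + e)

print(naive, stable)  # nan 1.0
```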
---
 src/target/intrin_rule.cc   | 18 ++
 src/target/intrin_rule.h|  3 +++
 src/target/llvm/intrin_rule_llvm.cc | 25 ++---
 src/target/source/intrin_rule_metal.cc  |  2 +-
 src/target/source/intrin_rule_webgpu.cc |  2 +-
 5 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc
index 398e24d251..d9fc73cb56 100644
--- a/src/target/intrin_rule.cc
+++ b/src/target/intrin_rule.cc
@@ -134,6 +134,24 @@ PrimExpr DispatchFastErf(const PrimExpr& e) {
   return res;
 }
 
+PrimExpr DispatchNumericalStableTanh(const PrimExpr& e) {
+  using tir::make_const;
+  using tir::make_zero;
+  const tir::CallNode* call = e.as();
+  ICHECK(call != nullptr);
+  const PrimExpr& x = call->args[0];
+  PrimExpr one = make_const(x.dtype(), 1);
+  PrimExpr two = make_const(x.dtype(), 2);
+  PrimExpr neg_two = make_const(x.dtype(), -2);
+
+  PrimExpr exp_neg2x = exp(neg_two * x);
+  PrimExpr exp_pos2x = exp(two * x);
+
+  PrimExpr tanh_pos = (one - exp_neg2x) / (one + exp_neg2x);
+  PrimExpr tanh_neg = (exp_pos2x - one) / (exp_pos2x + one);
+  return tir::Select(x >= make_zero(x.dtype()), tanh_pos, tanh_neg);
+}
+
 }  // namespace intrin
 
 namespace legalize {
diff --git a/src/target/intrin_rule.h b/src/target/intrin_rule.h
index b7f5881b3a..2695c43173 100644
--- a/src/target/intrin_rule.h
+++ b/src/target/intrin_rule.h
@@ -80,6 +80,9 @@ inline PrimExpr DispatchPureExtern(const PrimExpr& e) {
 // Dispatch ERF to fast erf when it is not available.
 PrimExpr DispatchFastErf(const PrimExpr& e);
 
+// Dispatch numerically stable tanh such that tanh(large_num) does not result 
in NaN
+PrimExpr DispatchNumericalStableTanh(const PrimExpr& e);
+
 }  // namespace intrin
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/target/llvm/intrin_rule_llvm.cc 
b/src/target/llvm/intrin_rule_llvm.cc
index 9ef494fd2a..2730c0a34d 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -28,6 +28,8 @@
 #include 
 #include 
 
+#include "../intrin_rule.h"
+
 namespace tvm {
 namespace codegen {
 namespace llvm {
@@ -99,6 +101,10 @@ TVM_REGISTER_OP("tir.cos").set_attr(
 
 TVM_REGISTER_OP("tir.sin").set_attr(
 "llvm.FLowerIntrinsic", DispatchLLVMPureIntrin<::llvm::Intrinsic::sin, 1>);
+
+TVM_REGISTER_OP("tir.tanh")
+.set_attr("llvm.FLowerIntrinsic",
+   
::tvm::codegen::intrin::DispatchNumericalStableTanh);
 }  // namespace intrin
 
 namespace legalize {
@@ -116,25 +122,6 @@ TVM_REGISTER_OP("tir.exp10")
   return ret;
 });
 
-TVM_REGISTER_OP("tir.tanh")
-.set_attr("llvm.FLegalize", [](const PrimExpr& e) -> PrimExpr {
-  using tir::make_const;
-  using tir::make_zero;
-  const tir::CallNode* call = e.as();
-  ICHECK(call != nullptr);
-  const PrimExpr& x = call->args[0];
-  PrimExpr one = make_const(x.dtype(), 1);
-  PrimExpr two = make_const(x.dtype(), 2);
-  PrimExpr neg_two = make_const(x.dtype(), -2);
-
-  PrimExpr exp_neg2x = exp(neg_two * x);
-  PrimExpr exp_pos2x = exp(two * x);
-
-  PrimExpr tanh_pos = (one - exp_neg2x) / (one + exp_neg2x);
-  PrimExpr tanh_neg = (exp_pos2x - one) / (exp_pos2x + one);
-  return tir::Select(x >= make_zero(x.dtype()), tanh_pos, tanh_neg);
-});
-
 TVM_REGISTER_OP("tir.tan").set_attr("llvm.FLegalize", [](const 
PrimExpr& e) -> PrimExpr {
   const tir::CallNode* call = e.as();
   ICHECK(call != nullptr);
diff --git a/src/target/source/intrin_rule_metal.cc 
b/src/target/source/intrin_rule_metal.cc
index cc83eb1462..50685f6ef2 100644
--- a/src/target/source/intrin_rule_metal.cc
+++ b/src/target/source/intrin_rule_metal.cc
@@ -89,7 +89,7 @@ TVM_REGISTER_OP("tir.log10")
 .set_attr("metal.FLowerI

(tvm) branch unity updated: [Unity] Support cumsum with pure int32 (#16439)

2024-01-19 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 35551d4fdf [Unity]  Support cumsum with pure int32 (#16439)
35551d4fdf is described below

commit 35551d4fdf5eff4616ba6ebdfaaf49f9583b58d6
Author: Hongyi Jin 
AuthorDate: Sat Jan 20 01:29:55 2024 -0500

[Unity]  Support cumsum with pure int32 (#16439)

This PR fixes a bug in attribute handling in the data type rewriter and enforces int32 buffers in the cumsum function definition, which ensures that cumsum can run on machines that support int32 but not int64.
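
For context, the kernel being changed (`exclusive_scan_ir`) computes an exclusive scan; a NumPy reference for its semantics (not the GPU kernel itself, and independent of the int32/int64 choice):

```python
import numpy as np

def exclusive_scan_ref(data, axis=-1):
    """Exclusive cumulative sum: out[..., i] = sum(data[..., :i]), so out[..., 0] == 0."""
    inclusive = np.cumsum(data, axis=axis)
    shifted = np.roll(inclusive, 1, axis=axis)
    # zero out the wrapped-around first element along the scan axis
    idx = [slice(None)] * data.ndim
    idx[axis] = 0
    shifted[tuple(idx)] = 0
    return shifted

print(exclusive_scan_ref(np.array([1, 2, 3, 4], dtype="int32")))  # [0 1 3 6]
```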
---
 include/tvm/tir/data_type_rewriter.h |  1 +
 python/tvm/topi/cuda/scan.py | 24 
 src/tir/ir/data_type_rewriter.cc | 11 +++
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/include/tvm/tir/data_type_rewriter.h 
b/include/tvm/tir/data_type_rewriter.h
index 8bdcc097a2..846cda74c6 100644
--- a/include/tvm/tir/data_type_rewriter.h
+++ b/include/tvm/tir/data_type_rewriter.h
@@ -104,6 +104,7 @@ class IndexDataTypeRewriter : public DataTypeLegalizer {
   Stmt VisitStmt_(const BlockRealizeNode* op) override;
   Stmt VisitStmt_(const BlockNode* op) override;
   Stmt VisitStmt_(const BufferStoreNode* op) override;
+  Stmt VisitStmt_(const AttrStmtNode* op) override;
   PrimExpr VisitExpr_(const BufferLoadNode* op) override;
   Array VisitIndices(Array indices);
   Stmt VisitStmt_(const IfThenElseNode* op) override;
diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py
index f697302961..238163722f 100644
--- a/python/tvm/topi/cuda/scan.py
+++ b/python/tvm/topi/cuda/scan.py
@@ -60,8 +60,8 @@ def exclusive_scan_ir(data, output, reduction=None, 
binop=tvm.tir.generic.add, i
 your operation.
 """
 
-batch_size = prod(data.shape[:-1])
-scan_axis_size = data.shape[-1]
+batch_size = cast(prod(data.shape[:-1]), "int32")
+scan_axis_size = cast(data.shape[-1], "int32")
 
 ib = tvm.tir.ir_builder.create()
 
@@ -105,7 +105,7 @@ def exclusive_scan_ir(data, output, reduction=None, 
binop=tvm.tir.generic.add, i
 # Up Sweep of exclusive scan
 lim = ceil_log2(scan_axis_size)
 
-with ib.for_range(0, cast(lim, "int64"), dtype="int64") as l2_width:
+with ib.for_range(0, cast(lim, "int32"), dtype="int32") as l2_width:
 width = 2 << l2_width
 
 with ib.new_scope():
@@ -121,9 +121,9 @@ def exclusive_scan_ir(data, output, reduction=None, 
binop=tvm.tir.generic.add, i
 
 by = te.thread_axis("blockIdx.y")
 ib.scope_attr(by, "thread_extent", nthread_by)
-start = ib.allocate("int64", (1,), name="start", scope="local")
-middle = ib.allocate("int64", (1,), name="middle", 
scope="local")
-end = ib.allocate("int64", (1,), name="end", scope="local")
+start = ib.allocate("int32", (1,), name="start", scope="local")
+middle = ib.allocate("int32", (1,), name="middle", 
scope="local")
+end = ib.allocate("int32", (1,), name="end", scope="local")
 start[0] = width * tid
 with ib.if_scope(start[0] < scan_axis_size):
 middle[0] = start[0] + tvm.tir.indexdiv(width, 2)
@@ -143,7 +143,7 @@ def exclusive_scan_ir(data, output, reduction=None, 
binop=tvm.tir.generic.add, i
 reduction[bx] = output[(bx + 1) * scan_axis_size - 1]
 output[(bx + 1) * scan_axis_size - 1] = cast(identity_value, 
out_dtype)
 
-with ib.for_range(0, cast(lim, "int64"), dtype="int64") as l2_width:
+with ib.for_range(0, cast(lim, "int32"), dtype="int32") as l2_width:
 width = 2 << (lim - l2_width - 1)
 
 with ib.new_scope():
@@ -159,9 +159,9 @@ def exclusive_scan_ir(data, output, reduction=None, 
binop=tvm.tir.generic.add, i
 
 by = te.thread_axis("blockIdx.y")
 ib.scope_attr(by, "thread_extent", nthread_by)
-start = ib.allocate("int64", (1,), name="start", scope="local")
-middle = ib.allocate("int64", (1,), name="middle", 
scope="local")
-end = ib.allocate("int64", (1,), name="end", scope="local")
+start = ib.allocate("int32", (1,), name="start", scope="local")
+middle = ib.allocate("int32", (1,),

(tvm) branch unity updated: [Unity][Dlight][Fix] Reduction rule support dyn-shape epilogue (#16429)

2024-01-19 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new a42ca7f189 [Unity][Dlight][Fix] Reduction rule support dyn-shape epilogue (#16429)
a42ca7f189 is described below

commit a42ca7f189f81c900aeab9ef9c4c7f2deada4bd7
Author: Ruihang Lai 
AuthorDate: Fri Jan 19 03:46:18 2024 -0500

[Unity][Dlight][Fix] Reduction rule support dyn-shape epilogue (#16429)

This PR fixes a bug in the dlight reduction rule, which fails
to recognize the pattern of a dynamic-shape epilogue block
and therefore errors out when transforming such blocks.

One regression test is provided.
---
 python/tvm/dlight/gpu/reduction.py|  2 +-
 tests/python/dlight/test_gpu_reduction.py | 83 +++
 2 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/python/tvm/dlight/gpu/reduction.py 
b/python/tvm/dlight/gpu/reduction.py
index f07ee45f37..651e09dc52 100644
--- a/python/tvm/dlight/gpu/reduction.py
+++ b/python/tvm/dlight/gpu/reduction.py
@@ -281,7 +281,7 @@ class Reduction(GPUScheduleRule):
 # Schedule epilogue
 if epilogue_info is not None:
 epilogue = epilogue_info.block_rv
-sch.reverse_compute_at(epilogue, bx)
+sch.reverse_compute_at(epilogue, bx, preserve_unit_loops=True)
 if is_broadcast_epilogue(sch, block, epilogue):
 sch.set_scope(block, 0, "shared")
 _, *s = sch.get_loops(epilogue)  # pylint: disable=invalid-name
diff --git a/tests/python/dlight/test_gpu_reduction.py 
b/tests/python/dlight/test_gpu_reduction.py
index 75d2eeeb07..def124a9b2 100644
--- a/tests/python/dlight/test_gpu_reduction.py
+++ b/tests/python/dlight/test_gpu_reduction.py
@@ -1006,5 +1006,88 @@ def test_repeat_transpose_gemv():
 assert_structural_equal(mod, Expected)
 
 
+def test_gemv_dyn_shape_epilogue():
+@I.ir_module
+class Module:
+@T.prim_func(private=True)
+def main(
+var_A: T.handle,
+B: T.Buffer((T.int64(1), T.int64(1), T.int64(4096)), "float16"),
+var_C: T.handle,
+):
+T.func_attr({"tir.noalias": T.bool(True)})
+vocab_size = T.int64()
+A = T.match_buffer(var_A, (T.int64(4096), vocab_size), "float16")
+C = T.match_buffer(var_C, (T.int64(1), T.int64(1), vocab_size))
+C_temp = T.alloc_buffer((T.int64(1), T.int64(1), vocab_size), 
"float16")
+for i0, i1, i2, k in T.grid(T.int64(1), T.int64(1), vocab_size, 
T.int64(4096)):
+with T.block("matmul"):
+v_i0, v_i1, v_i2, v_k = T.axis.remap("SSSR", [i0, i1, i2, 
k])
+T.reads(B[v_i0, v_i1, v_k], A[v_k, v_i2])
+T.writes(C_temp[v_i0, v_i1, v_i2])
+with T.init():
+C_temp[v_i0, v_i1, v_i2] = T.float16(0)
+C_temp[v_i0, v_i1, v_i2] = (
+C_temp[v_i0, v_i1, v_i2] + B[v_i0, v_i1, v_k] * A[v_k, 
v_i2]
+)
+for i0, i1, i2 in T.grid(T.int64(1), T.int64(1), vocab_size):
+with T.block("epilogue"):
+v_i0, v_i1, v_i2 = T.axis.remap("SSS", [i0, i1, i2])
+T.reads(C_temp[v_i0, v_i1, v_i2])
+T.writes(C[v_i0, v_i1, v_i2])
+C[v_i0, v_i1, v_i2] = T.Cast("float32", C_temp[v_i0, v_i1, 
v_i2])
+
+# fmt: off
+@I.ir_module
+class Expected:
+@T.prim_func(private=True)
+def main(var_A: T.handle, B: T.Buffer((T.int64(1), T.int64(1), 
T.int64(4096)), "float16"), var_C: T.handle):
+T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
+vocab_size = T.int64()
+A = T.match_buffer(var_A, (T.int64(4096), vocab_size), "float16")
+C = T.match_buffer(var_C, (T.int64(1), T.int64(1), vocab_size))
+# with T.block("root"):
+C_temp_local = T.alloc_buffer((T.int64(1), T.int64(1), 
vocab_size), "float16", scope="local")
+C_temp_rf_local = T.alloc_buffer((T.int64(16), T.int64(1), 
T.int64(1), vocab_size), "float16", scope="local")
+for ax0_fused_0 in T.thread_binding(vocab_size, 
thread="blockIdx.x"):
+for ax0_fused_1 in T.thread_binding(T.int64(1), 
thread="threadIdx.x"):
+for ax1_fused_1 in T.thread_binding(T.int64(16), 
thread="threadIdx.y"):
+with T.block("matmul_rf_init"):
+vax1_fused_1 = T.axis.spatial(T.int64(16), 
ax1_fu

(tvm) branch main updated: [CI][WASM] Update emsdk and nodejs version (#16420)

2024-01-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 614a7a9e31 [CI][WASM] Update emsdk and nodejs version (#16420)
614a7a9e31 is described below

commit 614a7a9e31e00ddd9442b218a5b7042f3a49e9b1
Author: Tianqi Chen 
AuthorDate: Thu Jan 18 19:36:16 2024 -0500

[CI][WASM] Update emsdk and nodejs version (#16420)

This PR updates the emsdk and nodejs versions used in the Docker images.
---
 docker/install/ubuntu_install_emscripten.sh | 4 ++--
 docker/install/ubuntu_install_nodejs.sh | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/install/ubuntu_install_emscripten.sh 
b/docker/install/ubuntu_install_emscripten.sh
index 87c95f2936..98331d2cbd 100755
--- a/docker/install/ubuntu_install_emscripten.sh
+++ b/docker/install/ubuntu_install_emscripten.sh
@@ -23,5 +23,5 @@ set -o pipefail
 cd /
 git clone https://github.com/emscripten-core/emsdk.git
 cd emsdk
-./emsdk install 3.1.30
-./emsdk activate 3.1.30
+./emsdk install 3.1.51
+./emsdk activate 3.1.51
diff --git a/docker/install/ubuntu_install_nodejs.sh 
b/docker/install/ubuntu_install_nodejs.sh
index b295d9e3e4..6d9ef3f5de 100755
--- a/docker/install/ubuntu_install_nodejs.sh
+++ b/docker/install/ubuntu_install_nodejs.sh
@@ -28,5 +28,5 @@ apt-install-and-clear -y curl
 
 # The node install script fetched and executed here will update the
 # apt source list, hence the second apt-get update --fix-missing is necessary.
-curl -s -S -L https://deb.nodesource.com/setup_14.x | bash -
+curl -s -S -L https://deb.nodesource.com/setup_16.x | bash -
 apt-install-and-clear -y nodejs



(tvm) branch main updated (a7dd32cc16 -> 68be158d35)

2024-01-16 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from a7dd32cc16 [DeviceAPI] Support querying total global memory (#16398)
 add 68be158d35 [ROCm] Some fixes of ROCm codegen (#16404)

No new revisions were added by this update.

Summary of changes:
 src/target/llvm/codegen_llvm.cc  |  2 +
 src/target/llvm/intrin_rule_rocm.cc  | 87 ++--
 src/tir/transforms/lower_thread_allreduce.cc |  2 +-
 tests/python/codegen/test_target_codegen_rocm.py | 53 +++
 4 files changed, 108 insertions(+), 36 deletions(-)



(tvm) branch unity updated: [Unity] Set CMAKE_CUDA_ARCHITECTURES default to native (#16335)

2024-01-10 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 474c06b8b3 [Unity] Set CMAKE_CUDA_ARCHITECTURES default to native (#16335)
474c06b8b3 is described below

commit 474c06b8b3e9c0f751a69a7aaf718af156edcd32
Author: Wuwei Lin 
AuthorDate: Wed Jan 10 15:01:43 2024 -0800

[Unity] Set CMAKE_CUDA_ARCHITECTURES default to native (#16335)

CI images should also be updated to install cmake 3.24
---
 3rdparty/cutlass_fpA_intB_gemm |  2 +-
 CMakeLists.txt |  2 +-
 cmake/modules/CUDA.cmake   | 25 -
 tests/scripts/task_config_build_gpu.sh |  1 -
 4 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/3rdparty/cutlass_fpA_intB_gemm b/3rdparty/cutlass_fpA_intB_gemm
index 1243f910a4..12e0cc70d7 16
--- a/3rdparty/cutlass_fpA_intB_gemm
+++ b/3rdparty/cutlass_fpA_intB_gemm
@@ -1 +1 @@
-Subproject commit 1243f910a4afd49b7983c087e4f610b81e45f71c
+Subproject commit 12e0cc70d7e5c6baeb5fa7e41406e4a08e32e1e6
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 058f477dbd..f7fd92e25a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.24)
 project(tvm C CXX)
 
 # Utility functions
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 03a79326c8..1284f85bec 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -38,27 +38,10 @@ if(USE_CUDA)
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDA_LIBRARY})
   list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_NVRTC_LIBRARY})
 
-  # Compatibility with cmake 3.18+
-  #
-  # The updates to the cutlass kernels made in TVM PR#16244 require
-  # symbols provided in cuda 7.5+.  While the cuda architecture is
-  # specified by setting `NVCC_FLAGS` in the `CMakeLists.txt` for each
-  # kernel, cmake 3.18+ also sets it based on the
-  # `CMAKE_CUDA_ARCHITECTURES` value.  If not set, cmake will explicitly
-  # pass the compute capability as nvidia's default of 5.2, *EVEN IF* it
-  # has already been specified in `NVCC_FLAGS`.  Because the kernels
-  # cannot compile with compute capability of 5.2, this causes
-  # compilation errors.
-  #
-  # By setting `CMAKE_CUDA_ARCHITECTURES` to `OFF`, cmake does not add
-  # 5.2 as a target architecture.
-  #
-  # See https://cmake.org/cmake/help/latest/policy/CMP0104.html for
-  # details on CMake's policy for CUDA architecture flags.
-  #
-  # See https://cmake.org/cmake/help/latest/policy/CMP0104.html for the
-  # default CUDA architecture for each version of CUDA.
-  set(CMAKE_CUDA_ARCHITECTURES OFF)
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+message(STATUS "CMAKE_CUDA_ARCHITECTURES not set, using native")
+set(CMAKE_CUDA_ARCHITECTURES native)
+  endif()
 
   if(USE_CUDNN)
 message(STATUS "Build with cuDNN support")
diff --git a/tests/scripts/task_config_build_gpu.sh 
b/tests/scripts/task_config_build_gpu.sh
index e68e646ce1..37ab0a87f1 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -54,4 +54,3 @@ echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
 echo set\(USE_CUTLASS ON\) >> config.cmake
 echo set\(USE_CMSISNN ON\) >> config.cmake
 echo set\(USE_MSC ON\) >> config.cmake
-echo set\(CMAKE_CUDA_ARCHITECTURES 75\) >> config.cmake



(tvm) branch unity updated (2d53e6ac63 -> a796023342)

2024-01-08 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 2d53e6ac63 [Unity][Transform] Handle replacement at both var binding and usage (#16367)
 add a796023342 [Unity][Fix] Memory planning check value type of 'tir_var_upper_bound' (#16362)

No new revisions were added by this update.

Summary of changes:
 src/relax/transform/static_plan_block_memory.cc | 21 +++--
 .../test_transform_static_plan_block_memory.py  | 16 +++-
 2 files changed, 34 insertions(+), 3 deletions(-)



(tvm) branch unity updated: [Unity][NN] Use Linear name for nn.op.permute_dims (#16303)

2024-01-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new f215a417bf [Unity][NN] Use Linear name for nn.op.permute_dims (#16303)
f215a417bf is described below

commit f215a417bf86ad8d5eb2c74b3a98719c86a915ed
Author: Eric Lunderberg 
AuthorDate: Fri Jan 5 12:19:05 2024 -0600

[Unity][NN] Use Linear name for nn.op.permute_dims (#16303)

The `relax::op::linear` op is implemented as `permute_dims` followed by `matmul`. In this case, readability can be improved by deriving the `permute_dims` binding name from the weight being transposed.
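
As a plain-Python sketch of the naming heuristic added here (mirroring the diff below; `permute_dims_binding_name` is a hypothetical helper, not a public API):

```python
def permute_dims_binding_name(x_name):
    """x_name is the name_hint of the input relax.Var, or None if unnamed."""
    if x_name is not None and "linear" in x_name:
        return x_name.replace("linear", "matmul")
    return "permute_dims"

print(permute_dims_binding_name("linear_1_weight"))  # matmul_1_weight
print(permute_dims_binding_name(None))               # permute_dims
```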
---
 python/tvm/relax/frontend/nn/op.py |  9 -
 tests/python/relax/test_frontend_nn_packing.py | 51 +-
 2 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/op.py 
b/python/tvm/relax/frontend/nn/op.py
index 1d3454fc88..ac5858d5cd 100644
--- a/python/tvm/relax/frontend/nn/op.py
+++ b/python/tvm/relax/frontend/nn/op.py
@@ -577,7 +577,7 @@ def broadcast_to(x: Tensor, shape: Sequence[IntExpr], name: 
str = "broadcast_to"
 return wrap_nested(_op.broadcast_to(x._expr, shape), name)
 
 
-def permute_dims(x: Tensor, axes: Optional[List[int]] = None, name: str = 
"permute_dims") -> Tensor:
+def permute_dims(x: Tensor, axes: Optional[List[int]] = None, name: str = 
None) -> Tensor:
 """Permutes the dimensions of an array.
 
 Parameters
@@ -596,6 +596,13 @@ def permute_dims(x: Tensor, axes: Optional[List[int]] = 
None, name: str = "permu
 result : Tensor
 The transposed result.
 """
+if name is None:
+x_name = getattr(getattr(x, "_expr", None), "name_hint", None)
+if x_name is not None and "linear" in x_name:
+name = x_name.replace("linear", "matmul")
+else:
+name = "permute_dims"
+
 return wrap_nested(_op.permute_dims(x._expr, axes=axes), name)
 
 
diff --git a/tests/python/relax/test_frontend_nn_packing.py 
b/tests/python/relax/test_frontend_nn_packing.py
index 00f981d1d4..56b614a807 100644
--- a/tests/python/relax/test_frontend_nn_packing.py
+++ b/tests/python/relax/test_frontend_nn_packing.py
@@ -21,7 +21,14 @@ from tvm.script import ir as I
 from tvm.script import relax as R
 
 
-def main():
+def _iter_binding_names(mod):
+"""Helper function to compare the names of relax variables"""
+for block in mod["forward"].body.blocks:
+for binding in block.bindings:
+yield binding.var.name_hint
+
+
+def test_nn_export_to_relax():
 class TestModule(nn.Module):
 def __init__(self, in_features: int, out_features: int):
 super().__init__()
@@ -35,39 +42,28 @@ def main():
 x2 = self.linear_2(x)
 return x1 + x2
 
-# pylint: disable=line-too-long
 @I.ir_module
-class ExpectedModule:  # pylint: disable=too-few-public-methods
+class ExpectedModule:
 @R.function
 def forward(
 x: R.Tensor((1, 10), dtype="float32"),
 packed_params: R.Tuple(
 R.Tensor((20, 10), dtype="float32"), R.Tensor((20, 10), 
dtype="float32")
 ),
-) -> R.Tensor((1, 20), dtype="float32"):
-R.func_attr({"num_input": 1})  # type: ignore[attr-defined]
-with R.dataflow():  # type: ignore[attr-defined]
-linear_1_weight: R.Tensor((20, 10), dtype="float32") = 
packed_params[0]  # type: ignore[valid-type]
-linear_2_weight: R.Tensor((20, 10), dtype="float32") = 
packed_params[1]  # type: ignore[valid-type]
-permute_dims: R.Tensor((10, 20), dtype="float32") = 
R.permute_dims(  # type: ignore[attr-defined,valid-type]
-linear_1_weight, axes=None
-)
-matmul: R.Tensor((1, 20), dtype="float32") = R.matmul(  # 
type: ignore[attr-defined,valid-type]
-x, permute_dims, out_dtype="void"
-)
-permute_dims1: R.Tensor((10, 20), dtype="float32") = 
R.permute_dims(  # type: ignore[attr-defined,valid-type]
-linear_2_weight, axes=None
-)
-matmul1: R.Tensor((1, 20), dtype="float32") = R.matmul(  # 
type: ignore[attr-defined,valid-type]
-x, permute_dims1, out_dtype="void"
-)
-add: R.Tensor((1, 20), dtype="float32") = R.add(matmul, 
matmul1)  # type: ignore[attr-defined,valid-type]
-gv: R.Tensor((1, 20), dtype="float32") = add  # type: 
ignore[attr-defined,valid-type]
-

(tvm) branch unity updated: [DLight] Skip rule if target is not suitable (#16321)

2024-01-04 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 8d72091b27 [DLight] Skip rule if target is not suitable (#16321)
8d72091b27 is described below

commit 8d72091b279d19744c4305abaf1af4fd48e25050
Author: Siyuan Feng 
AuthorDate: Fri Jan 5 03:22:50 2024 +0800

[DLight] Skip rule if target is not suitable (#16321)

This PR adds a check for GPU rules to skip if the target is not suitable
for the rule.
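
A hedged usage sketch of the new hook (class and method come from the diff below; the behavior assumes the usual TVM target keys, where CUDA targets carry "gpu" and LLVM targets carry "cpu"):

```python
from tvm.target import Target
from tvm.dlight.gpu.fallback import Fallback

rule = Fallback()
assert rule.is_target_available(Target("cuda"))      # "gpu" in target.keys
assert not rule.is_target_available(Target("llvm"))  # CPU target: rule is skipped
```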
---
 python/tvm/dlight/base/schedule_rule.py| 15 +++
 python/tvm/dlight/gpu/base.py  | 40 ++
 python/tvm/dlight/gpu/fallback.py  |  7 --
 python/tvm/dlight/gpu/gemv.py  |  6 ++---
 python/tvm/dlight/gpu/general_reduction.py |  7 +++---
 python/tvm/dlight/gpu/matmul.py| 15 ---
 python/tvm/dlight/gpu/reduction.py |  6 ++---
 python/tvm/dlight/gpu/transpose.py | 11 
 8 files changed, 87 insertions(+), 20 deletions(-)

diff --git a/python/tvm/dlight/base/schedule_rule.py 
b/python/tvm/dlight/base/schedule_rule.py
index 3bb7e5c1a9..dda66b7cfe 100644
--- a/python/tvm/dlight/base/schedule_rule.py
+++ b/python/tvm/dlight/base/schedule_rule.py
@@ -103,3 +103,18 @@ class ScheduleRule:  # pylint: 
disable=too-few-public-methods
 return _Rule()
 
 return decorator
+
+def is_target_available(self, target: Target) -> bool:  # pylint: 
disable=unused-argument
+"""Check whether the rule is available for the given target.
+
+Parameters
+--
+target : Target
+The compilation target the schedule is supposed to be built for.
+
+Returns
+---
+available : bool
+Whether the rule is available for the given target.
+"""
+return True
diff --git a/python/tvm/dlight/gpu/base.py b/python/tvm/dlight/gpu/base.py
new file mode 100644
index 00..b5cf0bb7a9
--- /dev/null
+++ b/python/tvm/dlight/gpu/base.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Base schedule rule for GPU operators."""
+
+from tvm.target import Target
+
+from ..base import ScheduleRule
+
+
+class GPUScheduleRule(ScheduleRule):  # pylint: disable=too-few-public-methods
+"""The Schedule Rule specific to GPU targets, will return None if the 
target is not GPU."""
+
+def is_target_available(self, target: Target) -> bool:
+"""Check whether the target is available for gpu rule.
+
+Parameters
+--
+target : Target
+The compilation target to check.
+
+Returns
+---
+available : bool
+Whether the target is available for this rule.
+"""
+return super().is_target_available(target) and "gpu" in target.keys
diff --git a/python/tvm/dlight/gpu/fallback.py 
b/python/tvm/dlight/gpu/fallback.py
index 2c1e7424dc..7139c7ea41 100644
--- a/python/tvm/dlight/gpu/fallback.py
+++ b/python/tvm/dlight/gpu/fallback.py
@@ -21,11 +21,12 @@ from typing import List, Tuple
 from tvm import tir
 from tvm.target import Target
 
-from ..base import ScheduleRule, normalize_prim_func, try_inline
+from ..base import normalize_prim_func, try_inline
 from . import utils
+from .base import GPUScheduleRule
 
 
-class Fallback(ScheduleRule):
+class Fallback(GPUScheduleRule):
 """
 A fallback schedule rule for all GPU operators. It will try to inline all 
the blocks first,
 and then apply a simple block/grid mapping to the spatial loops on top of 
the remaining blocks.
@@ -37,6 +38,8 @@ class Fallback(ScheduleRule):
 target: Target,
 _: bool,
 ) -> tir.Schedule:
+if not isinstance(func, tir.PrimFunc) or not 
self.is_target_available(target):
+return None
 max_threads_per_block = utils.max_threads_per_block(target)
 
 sch = tir.Schedule(func)

(tvm) branch main updated: [CMake][MSVC] Disable permissive mode for MSVC builds (#16343)

2024-01-04 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new e3d031bc7c [CMake][MSVC] Disable permissive mode for MSVC builds (#16343)
e3d031bc7c is described below

commit e3d031bc7cef6f61c287b1f642c0c928612c018c
Author: Eric Lunderberg 
AuthorDate: Thu Jan 4 13:21:58 2024 -0600

[CMake][MSVC] Disable permissive mode for MSVC builds (#16343)

[CMake][MSVC] Use /permissive- flag for MSVC builds

The C++ standard requires two-phase name resolution for templates. By
default, MSVC uses a non-standard name resolution, in which all names
are looked up when a template is instantiated. This has caused
MSVC-specific compilation errors
([example](https://github.com/apache/tvm/actions/runs/7400684492/job/20134841480?pr=16183)),
which are quite difficult to debug.

This commit adds the `/permissive-` flag when building TVM
with MSVC, disabling the non-standard name resolution.
---
 CMakeLists.txt | 5 +
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a55a629bd..09c656f8cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,6 +166,11 @@ if(MSVC)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP")
   add_compile_options(/bigobj)
 
+  # Use standard-conforming two-phase name resolution for templates.
+  # This minimizes the differences between g++/clang builds on Linux,
+  # and MSVC builds on Windows.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /permissive-")
+
   # MSVC already errors on undefined symbols, no additional flag needed.
   set(TVM_NO_UNDEFINED_SYMBOLS "")
 



(tvm) branch unity updated: [Unity][WEBGPU] Enable wasm exception propagation (#16330)

2024-01-04 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 49fc613a3c [Unity][WEBGPU] Enable wasm exception propagation (#16330)
49fc613a3c is described below

commit 49fc613a3c55c679bcd98508828f5ec6bc6a13e3
Author: Tianqi Chen 
AuthorDate: Thu Jan 4 12:54:50 2024 -0500

[Unity][WEBGPU] Enable wasm exception propagation (#16330)

This PR enables wasm exception propagation between the C++-runtime-generated wasm and JavaScript.

Right now the error.message is passed back, which allows us to do some handling of WebGPU-related exceptions raised through FFI boundaries.

Note that this requires the latest Emscripten and, on Node.js, `--experimental-wasm-eh` support.
---
 python/tvm/contrib/emcc.py |  1 +
 web/Makefile   |  2 +-
 web/emcc/tvmjs_support.cc  | 11 +--
 web/package.json   |  2 +-
 web/src/ctypes.ts  |  5 +
 web/src/runtime.ts | 31 ---
 web/tests/node/test_packed_func.js | 15 +++
 7 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/python/tvm/contrib/emcc.py b/python/tvm/contrib/emcc.py
index d62b22f1e8..73cb33dfcc 100644
--- a/python/tvm/contrib/emcc.py
+++ b/python/tvm/contrib/emcc.py
@@ -42,6 +42,7 @@ def create_tvmjs_wasm(output, objects, options=None, 
cc="emcc"):
 cmd += ["-O3"]
 cmd += ["-std=c++17"]
 cmd += ["--no-entry"]
+cmd += ["-fwasm-exception"]
 cmd += ["-s", "WASM_BIGINT=1"]
 cmd += ["-s", "ERROR_ON_UNDEFINED_SYMBOLS=0"]
 cmd += ["-s", "STANDALONE_WASM=1"]
diff --git a/web/Makefile b/web/Makefile
index 8fccf9636c..bd5e6cbf2b 100644
--- a/web/Makefile
+++ b/web/Makefile
@@ -27,7 +27,7 @@ all: dist/wasm/tvmjs_runtime.wasm 
dist/wasm/tvmjs_runtime.wasi.js src/tvmjs_runt
 
 EMCC = emcc
 
-EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++17 -Wno-ignored-attributes
+EMCC_CFLAGS = $(INCLUDE_FLAGS) -O3 -std=c++17 -Wno-ignored-attributes 
-fwasm-exceptions
 
 EMCC_LDFLAGS = --no-entry -s WASM_BIGINT=1 -s ALLOW_MEMORY_GROWTH=1 -s 
STANDALONE_WASM=1\
  -s ERROR_ON_UNDEFINED_SYMBOLS=0 --pre-js emcc/preload.js
diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc
index a314f08fb4..324dcf7fd0 100644
--- a/web/emcc/tvmjs_support.cc
+++ b/web/emcc/tvmjs_support.cc
@@ -148,8 +148,15 @@ class AsyncLocalSession : public LocalSession {
 int code = args[0];
 TVMRetValue rv;
 rv = args[1];
-this->EncodeReturn(std::move(rv),
-   [&](TVMArgs encoded_args) { 
callback(RPCCode::kReturn, encoded_args); });
+if (code == static_cast(RPCCode::kReturn)) {
+  this->EncodeReturn(std::move(rv), [&](TVMArgs encoded_args) {
+callback(RPCCode::kReturn, encoded_args);
+  });
+} else {
+  // for exception, we can pass through as since this is just normal 
encoding.
+  ICHECK_EQ(code, static_cast(RPCCode::kException));
+  callback(RPCCode::kException, args);
+}
   });
 
   TVMRetValue temp;
diff --git a/web/package.json b/web/package.json
index 95a5c8efd7..779152ede2 100644
--- a/web/package.json
+++ b/web/package.json
@@ -13,7 +13,7 @@
 "build": "rollup -c",
 "lint": "eslint -c .eslintrc.json .",
 "typedoc": "typedoc src/index.ts --plugin typedoc-plugin-missing-exports",
-"test": "jest",
+"test": "node --experimental-wasm-eh node_modules/.bin/jest",
 "bundle": "npm run build && cp lib/index.js dist/index.js && cp 
lib/index.js dist/tvmjs.bundle.js",
 "example": "npm run bundle && node apps/node/example.js",
 "example:wasi": "npm run bundle && node 
--experimental-wasi-unstable-preview1 --experimental-wasm-bigint 
apps/node/wasi_example.js",
diff --git a/web/src/ctypes.ts b/web/src/ctypes.ts
index 282679fc02..cb2a0e1097 100644
--- a/web/src/ctypes.ts
+++ b/web/src/ctypes.ts
@@ -33,6 +33,11 @@ export type PtrOffset = number;
  */
 export type FTVMGetLastError = () => Pointer;
 
+/**
+ * void TVMAPISetLastError(const char* msg);
+ */
+export type FTVMAPISetLastError = (msg: Pointer) => void;
+
 /**
  * int TVMModGetFunction(TVMModuleHandle mod,
  *   const char* func_name,
diff --git a/web/src/runtime.ts b/web/src/runtime.ts
index 5aa38dee39..4c56005261 100644
--- a/web/src/runtime.ts
+++ b/web/src/runtime.ts
@@ -78,6 +78,7 @@ class FFILibrary implements Disposable {
 if (code != 0) {
   const

(tvm) branch main updated: [TIR] Allow sync threads inside condition (#16345)

2024-01-04 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 5308739741 [TIR] Allow sync threads inside condition (#16345)
5308739741 is described below

commit 5308739741bc9962ecbbcd7d58182f0874508c19
Author: Bohan Hou 
AuthorDate: Thu Jan 4 12:53:25 2024 -0500

[TIR] Allow sync threads inside condition (#16345)

Originally, it was not allowed to sync threads inside a condition (`while`, `if`).

This PR introduces the `tvm_thread_invariant` op to annotate a condition as thread-id invariant and get around the check.
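
A minimal sketch of constructing the new intrinsic from Python, assuming this commit (the helper lives in `python/tvm/tir/op.py` per the diff; `T.tvm_thread_invariant` is also exported for TVMScript):

```python
from tvm import tir
from tvm.tir.op import tvm_thread_invariant  # added by this commit

cond = tir.Var("cond", "bool")
# Wrap the condition to assert it evaluates identically for every thread,
# so the storage-access checker can accept a sync inside the guarded body.
marked = tvm_thread_invariant(cond)
print(marked)
```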
---
 include/tvm/tir/builtin.h|  6 
 python/tvm/script/ir_builder/tir/ir.py   |  2 ++
 python/tvm/tir/op.py | 17 +
 src/target/source/codegen_c.cc   |  4 +++
 src/tir/op/builtin.cc|  4 +++
 src/tir/transforms/storage_access.cc | 30 +---
 tests/python/codegen/test_target_codegen_cuda.py | 46 
 7 files changed, 105 insertions(+), 4 deletions(-)

diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
index 65012c6c0f..96222e03a9 100644
--- a/include/tvm/tir/builtin.h
+++ b/include/tvm/tir/builtin.h
@@ -411,6 +411,12 @@ TVM_DLL const Op& tvm_check_return();
  */
 TVM_DLL const Op& tvm_thread_context();
 
+/*!
+ * \brief Mark a condition to be thread invariant.
+ *  This means the condition must be the same for all threads.
+ */
+TVM_DLL const Op& tvm_thread_invariant();
+
 /*!
  * \brief Lowered version of call packed, the space of value and
  *  type codes are explicitly allocated.
diff --git a/python/tvm/script/ir_builder/tir/ir.py 
b/python/tvm/script/ir_builder/tir/ir.py
index d4a7445b7d..b5f427c34c 100644
--- a/python/tvm/script/ir_builder/tir/ir.py
+++ b/python/tvm/script/ir_builder/tir/ir.py
@@ -1832,6 +1832,7 @@ call_cpacked_lowered = 
_op_wrapper(_tir_op.call_cpacked_lowered)
 tvm_tuple = _op_wrapper(_tir_op.tvm_tuple)
 tvm_struct_set = _op_wrapper(_tir_op.tvm_struct_set)
 tvm_struct_get = _tir_op.tvm_struct_get
+tvm_thread_invariant = _op_wrapper(_tir_op.tvm_thread_invariant)
 tvm_thread_allreduce = _op_wrapper(_tir_op.tvm_thread_allreduce)
 tvm_load_matrix_sync = _op_wrapper(_tir_op.tvm_load_matrix_sync)
 tvm_mma_sync = _op_wrapper(_tir_op.tvm_mma_sync)
@@ -2104,6 +2105,7 @@ __all__ = [
 "tvm_tuple",
 "tvm_struct_set",
 "tvm_struct_get",
+"tvm_thread_invariant",
 "tvm_thread_allreduce",
 "tvm_load_matrix_sync",
 "tvm_mma_sync",
diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
index bb2530b125..d7478645c5 100644
--- a/python/tvm/tir/op.py
+++ b/python/tvm/tir/op.py
@@ -602,6 +602,23 @@ def tvm_thread_allreduce(*freduce_args):
 return call_intrin("handle", "tir.tvm_thread_allreduce", *freduce_args)
 
 
+def tvm_thread_invariant(cond):
+"""Mark condition as thread invariant.
+
+Parameters
+--
+cond : Expr
+The condition.
+
+Returns
+---
+call : PrimExpr
+The call expression.
+"""
+assert isinstance(cond, PrimExpr)
+return call_intrin(cond.dtype, "tir.tvm_thread_invariant", cond)
+
+
 def tvm_storage_sync(storage_scope):
 """Perform synchronization in specified scope.
 
diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc
index 0ff0531b5c..8380971249 100644
--- a/src/target/source/codegen_c.cc
+++ b/src/target/source/codegen_c.cc
@@ -669,6 +669,10 @@ void CodeGenC::VisitExpr_(const CallNode* op, 
std::ostream& os) {  // NOLINT(*)
   const StringImmNode* str = op->args[0].as();
   ICHECK(str != nullptr);
   os << "__tvm_param__" << str->value;
+} else if (op->op.same_as(builtin::tvm_thread_invariant())) {
+  os << "(";
+  this->PrintExpr(op->args[0], os);
+  os << ")";
 } else {
   LOG(FATAL) << "Unresolved call " << op->op;
 }
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
index 1b80959b57..a5089e2566 100644
--- a/src/tir/op/builtin.cc
+++ b/src/tir/op/builtin.cc
@@ -211,6 +211,10 @@ TIR_DEFINE_BUILTIN_FUNC(tvm_thread_context)
 .set_num_inputs(1)
 .set_attr("TCallEffectKind", 
Integer(CallEffectKind::kOpaque));
 
+TIR_DEFINE_BUILTIN_FUNC(tvm_thread_invariant)
+.set_num_inputs(1)
+.set_attr("TCallEffectKind", 
Integer(CallEffectKind::kPure));
+
 TIR_DEFINE_BUILTIN_FUNC(tvm_call_packed_lowered)
 .set_attr("TCallEffectKind", 
Integer(CallEffectKind::kOpaque))
 .set_attr("TScriptPrinterName", 
String("call_packed

(tvm) branch main updated: [Hexagon][UnitTest] Disable flaky quantization test (#16337)

2024-01-03 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 42b4f213a7 [Hexagon][UnitTest] Disable flaky quantization test (#16337)
42b4f213a7 is described below

commit 42b4f213a75f74440a44f2d106f6321eafae6466
Author: Eric Lunderberg 
AuthorDate: Wed Jan 3 11:18:18 2024 -0600

[Hexagon][UnitTest] Disable flaky quantization test (#16337)

* [Hexagon][UnitTest] Disable flaky quantization test

The `test_pass_fq2i_avg_pool2d.py::test_avgpool_conv2d` test is
sensitive to rounding errors, and failed about a third of the time (42
/ 100 runs). This was first noticed as CI failures in unrelated
PRs (e.g.
https://ci.tlcpack.ai/blue/organizations/jenkins/tvm-hexagon/detail/PR-16184/6/tests).
This commit marks the flaky portions of the test with
`pytest.mark.xfail`, to avoid breaking CI for other PRs.

To minimize the extent of the disabled test cases, this commit breaks
up each of the unit tests.  Where previously a single test performed
both hardware/simulation tests and relay graph comparisons, these are
now done in separate test functions.  The hardware/simulation tests
use `tvm.testing.assert_allclose` and
have a tolerance of `1e-02`, while the graph-comparison tests use
`tvm.ir.structural_equal`, and require identical floating-point
values.  Only the graph-comparison test is disabled here.

The other two test cases in `test_pass_fq2i_avg_pool2d.py` do not show
this same sensitivity, with no failures seen in 100 executions.

* Disable pylint for pytest fixture names
---
 .../test_hexagon/test_pass_fq2i_avg_pool2d.py  | 115 -
 1 file changed, 69 insertions(+), 46 deletions(-)

diff --git a/tests/python/contrib/test_hexagon/test_pass_fq2i_avg_pool2d.py 
b/tests/python/contrib/test_hexagon/test_pass_fq2i_avg_pool2d.py
index 34f356a015..e45f56ba17 100644
--- a/tests/python/contrib/test_hexagon/test_pass_fq2i_avg_pool2d.py
+++ b/tests/python/contrib/test_hexagon/test_pass_fq2i_avg_pool2d.py
@@ -15,53 +15,24 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# pylint: disable=redefined-outer-name
+
 """ Tests for avg_pool2d fake quantization to integer """
 
 import numpy as np
+import pytest
+
 import tvm
 import tvm.testing
 import tvm.topi.testing
 from tvm import relay
 from tvm.contrib.hexagon.session import Session
 from tvm.contrib.hexagon.pytest_plugin import HEXAGON_AOT_LLVM_TARGET
-from .infrastructure import quantize_np, build_module, run_module
-
-
-def compare_graphs(expr, ref_expr):
-"""Compares the given graph with the expected graph"""
-mod = tvm.IRModule.from_expr(expr)
-mod = tvm.relay.transform.InferType()(mod)
-mod_int = tvm.relay.transform.FakeQuantizationToInteger()(mod)
-ref_mod = tvm.IRModule.from_expr(ref_expr)
-ref_mod = tvm.relay.transform.InferType()(ref_mod)
-assert tvm.ir.structural_equal(mod_int["main"], ref_mod["main"], 
map_free_vars=True)
-
-
-def compare_fq_to_int(hexagon_session, expr, inputs):
-"""Compares the float module output with the integer module output"""
-mod = tvm.IRModule.from_expr(expr)
-mod = tvm.relay.transform.InferType()(mod)
-mod_int = tvm.relay.transform.FakeQuantizationToInteger()(mod)
-assert not tvm.ir.structural_equal(mod, mod_int)
-
-mod = build_module(
-mod, tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, 
host=HEXAGON_AOT_LLVM_TARGET)
-)
-mod_int = build_module(
-mod_int, tvm.target.Target(HEXAGON_AOT_LLVM_TARGET, 
host=HEXAGON_AOT_LLVM_TARGET)
-)
-
-hexagon_mod = hexagon_session.get_executor_from_factory(mod)
-result = run_module(hexagon_mod, inputs)
-
-hexagon_mod = hexagon_session.get_executor_from_factory(mod_int)
-result_int = run_module(hexagon_mod, inputs)
 
-tvm.testing.assert_allclose(result, result_int, rtol=1e-02, atol=1e-02)
+from .infrastructure import quantize_np, build_module, run_module
 
 
-@tvm.testing.requires_hexagon
-def test_avgpool_conv2d(hexagon_session: Session):
+def _make_avgpool_conv2d():
 """Test case with avg_pool2d followed by a conv2d"""
 dtype = "int8"
 shape_x = [1, 2, 9, 9]
@@ -112,8 +83,6 @@ def test_avgpool_conv2d(hexagon_session: Session):
 expr = relay.qnn.op.dequantize(expr, out_sc, out_zp)
 args = {"input": input_quant, "weight": weight_quant}
 
-compare_fq_to_int(hexagon_session, expr, args)
-
 # Expected graph
 op0 = relay.qnn.op.avg_pool2d(
 inp,
@@ -148,11 +117,11 @@ def test_avgpool_conv2d(hexagon_session: Session):
 out_dtype="int8&quo

(tvm) branch main updated (e69363ab1e -> 97f6e6507f)

2024-01-02 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


from e69363ab1e [release] Update version to 0.16.dev0 on main branch
 add 97f6e6507f [CI] Upgrade cmake version to 3.24.0 (#16336)

No new revisions were added by this update.

Summary of changes:
 docker/install/ubuntu_install_cmake_source.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)



(tvm) branch unity updated: [Unity] Upgrade flashinfer 3rdparty submodule (#16323)

2024-01-01 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 09c44e6a93 [Unity] Upgrade flashinfer 3rdparty submodule (#16323)
09c44e6a93 is described below

commit 09c44e6a936602b4876179b348cfa14d8a10de8b
Author: Zihao Ye 
AuthorDate: Mon Jan 1 23:01:18 2024 -0800

[Unity] Upgrade flashinfer 3rdparty submodule (#16323)

Mixtral requires `group_size = 4` in Grouped Query Attention, which
FlashInfer does not enable by default (to reduce compilation time).
This PR incorporates the hotfix
(https://github.com/flashinfer-ai/flashinfer/pull/35) from the flashinfer
3rdparty submodule to support Mixtral.
---
 3rdparty/flashinfer | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index ae1a6501ca..7d3a47310a 16
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit ae1a6501ca69e59c982bd196fc87514ab3f1773e
+Subproject commit 7d3a47310af1ac0795e0d8e8435e42c882c96a13



(tvm) branch unity updated: [Unity] Fix PagedKVCache per FlashInfer update (#16317)

2024-01-01 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new ac568eb30a [Unity] Fix PagedKVCache per FlashInfer update (#16317)
ac568eb30a is described below

commit ac568eb30a4e19d51fc9ef9b7ca5642a4f589fde
Author: Ruihang Lai 
AuthorDate: Tue Jan 2 01:15:37 2024 -0500

[Unity] Fix PagedKVCache per FlashInfer update (#16317)

This PR fixes PagedKVCache due to recent FlashInfer interface
change, and also bumps FlashInfer to the latest.
---
 3rdparty/flashinfer|  2 +-
 src/runtime/relax_vm/paged_kv_cache.cc | 10 --
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index 8d987b98f7..ae1a6501ca 16
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit 8d987b98f7f7b9381097566643a7f53c99cf312d
+Subproject commit ae1a6501ca69e59c982bd196fc87514ab3f1773e
diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index fc7d351e5b..e941908dbc 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -848,9 +848,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCache {
 if (num_depths_ == 1) {
   if (use_decode_kernel_[0]) {
 f_attention_decode_begin_forward_(
-/*depth=*/0, page_indptr_on_depths_view_[0], 
page_indices_on_depths_view_[0],
-last_page_len_on_depths_view_[0], /*return_lse=*/true, 
num_qo_heads_, num_kv_heads_,
-head_dim_, page_size_, /*rotary_mode=*/true);
+/*depth=*/0, page_indptr_on_depths_view_[0], 
last_page_len_on_depths_view_[0],
+num_qo_heads_, num_kv_heads_, head_dim_, page_size_, 
/*rotary_mode=*/true);
   } else {
 f_attention_prefill_begin_forward_(/*depth=*/0, 
qo_indptr_on_depths_view_[0],
cur_batch_size_, num_qo_heads_, 
num_kv_heads_);
@@ -864,9 +863,8 @@ class PagedAttentionKVCacheObj : public AttentionKVCache {
 }
 if (use_decode_kernel_[d]) {
   f_attention_decode_begin_forward_(
-  d, page_indptr_on_depths_view_[d], 
page_indices_on_depths_view_[d],
-  last_page_len_on_depths_view_[d], /*rotary_mode=*/false, 
num_qo_heads_, num_kv_heads_,
-  head_dim_, page_size_, /*return_lse=*/true);
+  d, page_indptr_on_depths_view_[d], 
last_page_len_on_depths_view_[d], num_qo_heads_,
+  num_kv_heads_, head_dim_, page_size_, /*rotary_mode=*/false);
 } else {
   f_attention_prefill_begin_forward_(/*depth=*/d, 
qo_indptr_on_depths_view_[d],
  
last_page_len_on_depths_view_[d]->shape[0],



(tvm) branch unity updated: [Unity][nn.Module] Introduce operator `empty` (#16327)

2024-01-01 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new faa8a0ad46 [Unity][nn.Module] Introduce operator `empty` (#16327)
faa8a0ad46 is described below

commit faa8a0ad46d2e3159680df0e09a84e5d6376b1fd
Author: Junru Shao 
AuthorDate: Mon Jan 1 20:30:57 2024 -0800

[Unity][nn.Module] Introduce operator `empty` (#16327)

This PR introduces an operator `op.empty` in the `nn.Module` frontend.
It lets us create an uninitialized tensor from the memory pool, which
can be used as temporary scratchpad memory for handcrafted operators.
---
 python/tvm/relax/frontend/nn/op.py| 59 +++
 tests/python/relax/test_frontend_nn_op.py | 27 --
 2 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/op.py 
b/python/tvm/relax/frontend/nn/op.py
index 3197145289..66f023ef9d 100644
--- a/python/tvm/relax/frontend/nn/op.py
+++ b/python/tvm/relax/frontend/nn/op.py
@@ -1142,6 +1142,65 @@ def zeros(
 return wrap_nested(_op.zeros(shape, dtype), name)
 
 
+def ones(
+shape: Sequence[IntExpr],
+dtype: str = "float32",
+name: str = "ones",
+) -> Tensor:
+"""Construct a tensor of all zeros, with the input shape and dtype.
+
+Parameters
+--
+shape : Sequence[IntExpr]
+The shape of the created tensor.
+
+dtype : str
+The data type of the created tensor.
+
+name : str
+Name hint.
+
+Returns
+---
+result : Tensor
+The result tensor.
+"""
+return wrap_nested(_op.ones(shape, dtype), name)
+
+
+def empty(
+shape: Sequence[IntExpr],
+dtype: str = "float32",
+name: str = "empty",
+) -> Tensor:
+"""Construct an uninitialized tensor, with the input shape and dtype.
+
+Parameters
+--
+shape : Sequence[IntExpr]
+The shape of the created tensor.
+
+dtype : str
+The data type of the created tensor.
+
+name : str
+Name hint.
+
+Returns
+---
+result : Tensor
+The result tensor.
+"""
+return wrap_nested(  # type: ignore
+_op.builtin.alloc_tensor(
+rx.ShapeExpr(shape),  # type: ignore
+dtype,
+runtime_device_index=0,
+),
+name,
+)
+
+
 def split(
 ary: Tensor,
 indices_or_sections: Union[int, Sequence[int]],
diff --git a/tests/python/relax/test_frontend_nn_op.py 
b/tests/python/relax/test_frontend_nn_op.py
index 55870426e4..43f4a9efc0 100644
--- a/tests/python/relax/test_frontend_nn_op.py
+++ b/tests/python/relax/test_frontend_nn_op.py
@@ -17,12 +17,14 @@
 # pylint: disable=missing-docstring, invalid-name
 import tvm
 import tvm.testing
-from tvm import tir
+from tvm import relax, tir
 from tvm.relax.frontend.nn import Module, Tensor, op, spec
 from tvm.script import ir as I
 from tvm.script import relax as R
 from tvm.script import tir as T
 
+# mypy: disable-error-code="attr-defined,valid-type,name-defined"
+
 
 def test_binary():
 class Model(Module):
@@ -174,7 +176,7 @@ def test_image():
 def test(self, x: Tensor, weight: Tensor, bias: Tensor):
 padded = op.pad(x, [0, 0, 0, 0, 1, 1, 1, 1])
 conv2d = op.conv2d(padded, weight, bias)
-interpolate = op.interpolate(x, size=[40, 40])
+interpolate = op.interpolate(x, size=[40, 40])  # type: ignore
 return (conv2d, interpolate)
 
 @R.function
@@ -347,7 +349,7 @@ def test_create():
 class Model(Module):
 def test(self, x: Tensor):
 triu_out = op.triu(x)
-full_with_scalar_out = op.full([10, 10], fill_value=10)
+full_with_scalar_out = op.full([10, 10], fill_value=10)  # type: 
ignore
 full_with_FloatImm_out = op.full(
 [10, 10], fill_value=tir.FloatImm(dtype="float32", value=10)
 )
@@ -638,5 +640,24 @@ def test_extern():
 tvm.ir.assert_structural_equal(irmodule, Expected)
 
 
+def test_empty():
+@tvm.register_func("test_empty_assert", override=True)
+def test_empty_assert(_lineo, x):
+assert x.shape == (10, 10)
+assert x.dtype == "float32"
+
+class Model(Module):
+def test(self):
+result = op.empty([10, 10], dtype="float32")
+op.debug_func("test_empty_assert", result)
+return result
+
+irmodule, _ = Model().export_tvm(spec={"test": {}}, debug=True)
+ex = relax.build(irmodule, "llvm")
+vm = relax.VirtualMachine(ex, tvm.cpu())
+effects = vm["_initialize_effect"]()
+vm["test"](*effects)
+
+
 if __name__ == "__main__":
 tvm.testing.main()
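
A minimal usage sketch of the new operator (assuming the same
`export_tvm` flow as the test above; the method name, shapes, and the
`spec.Tensor` argument spec are illustrative):

    from tvm.relax.frontend.nn import Module, Tensor, op, spec


    class ScratchModel(Module):
        def forward(self, x: Tensor):
            # Allocate an uninitialized scratch buffer from the memory pool;
            # its contents are undefined until a kernel writes into it.
            scratch = op.empty([10, 10], dtype="float32")
            return scratch


    irmodule, _ = ScratchModel().export_tvm(
        spec={"forward": {"x": spec.Tensor([10, 10], "float32")}}
    )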



(tvm) branch unity updated: [Unity][Web][Fix] Fix fetchNDArray for f32-to-bf16 (#16294)

2024-01-01 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new b1df4b0856 [Unity][Web][Fix] Fix fetchNDArray for f32-to-bf16 (#16294)
b1df4b0856 is described below

commit b1df4b085608158b451d443d63f9fab3e0e3
Author: Charlie Ruan <53290280+charliefr...@users.noreply.github.com>
AuthorDate: Tue Jan 2 06:10:59 2024 +0800

[Unity][Web][Fix] Fix fetchNDArray for f32-to-bf16 (#16294)

Currently, when loading the params, we try to decode from bf16 to f32
regardless of each param's dtype, since all params are stored with
"format=f32-to-bf16" in the record.

We fix this by also checking the dtype, just like the C++ counterpart:

https://github.com/apache/tvm/blob/4e66690a4d033af912f5051c0e5a16c9c10691d9/src/runtime/relax_vm/ndarray_cache_support.cc#L168-L172
---
 web/emcc/wasm_runtime.cc | 4 ++--
 web/src/runtime.ts   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
index 60f40adbf4..311bbd9971 100644
--- a/web/emcc/wasm_runtime.cc
+++ b/web/emcc/wasm_runtime.cc
@@ -126,8 +126,8 @@ 
TVM_REGISTER_GLOBAL("testing.object_use_count").set_body([](TVMArgs args, TVMRet
   *ret = (obj.use_count() - 1);
 });
 
-void ArrayDecodeStorage(NDArray cpu_arr, std::string bytes, std::string 
format) {
-  if (format == "f32-to-bf16") {
+void ArrayDecodeStorage(NDArray cpu_arr, std::string bytes, std::string 
format, std::string dtype) {
+  if (format == "f32-to-bf16" && dtype == "float32") {
 std::vector buffer(bytes.length() / 2);
 std::memcpy(buffer.data(), bytes.data(), buffer.size() * 2);
 // decode bf16 to f32
diff --git a/web/src/runtime.ts b/web/src/runtime.ts
index f842b2723f..5aa38dee39 100644
--- a/web/src/runtime.ts
+++ b/web/src/runtime.ts
@@ -1556,7 +1556,7 @@ export class Instance implements Disposable {
 });
 const recSource = buffer.slice(rec.byteOffset, rec.byteOffset + 
rec.nbytes);
 // first sync copy to cpu.
-this.ctx.arrayDecodeStorage(cpu_arr, new Uint8Array(recSource), 
rec.format);
+this.ctx.arrayDecodeStorage(cpu_arr, new Uint8Array(recSource), 
rec.format, rec.dtype);
 // then async stream into GPU if needed
 if (device.deviceType === DeviceStrToEnum.cpu) {
   this.ndarrayCacheUpdate(rec.name, cpu_arr, false);
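
For reference, the "f32-to-bf16" format keeps only the upper 16 bits of
each float32 value, and decoding (as in `ArrayDecodeStorage`) amounts to
shifting those bits back into the high half of a 32-bit word.  A NumPy
sketch of the idea (a truncating encode for illustration only; this is
not the runtime's actual code path, which may round when encoding):

    import numpy as np


    def encode_f32_to_bf16(x: np.ndarray) -> np.ndarray:
        # Keep only the upper 16 bits (sign, exponent, top 7 mantissa bits).
        assert x.dtype == np.float32
        return (x.view(np.uint32) >> 16).astype(np.uint16)


    def decode_bf16_to_f32(b: np.ndarray) -> np.ndarray:
        # Shift the stored 16 bits back into the high half of a 32-bit word.
        return (b.astype(np.uint32) << 16).view(np.float32)


    x = np.array([1.5, -2.25, 3.1415926], dtype="float32")
    roundtrip = decode_bf16_to_f32(encode_f32_to_bf16(x))
    print(roundtrip)  # close to x, with reduced mantissa precision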



(tvm) branch main updated: [Relay][Frontend][Torch] add aten:broadcast_to (#16319)

2023-12-31 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 2da3798dd1 [Relay][Frontend][Torch] add aten:broadcast_to  (#16319)
2da3798dd1 is described below

commit 2da3798dd150d15d4d560b495d22422c9eb23194
Author: Huan Mei 
AuthorDate: Mon Jan 1 07:41:56 2024 +0800

[Relay][Frontend][Torch] add aten:broadcast_to  (#16319)

Recently I worked with the Stable Video Diffusion model, which contains
the `aten::broadcast_to` op that TVM did not support.

This commit adds support for it.
---
 python/tvm/relay/frontend/pytorch.py  | 16 
 tests/python/frontend/pytorch/test_forward.py | 25 +
 2 files changed, 41 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py 
b/python/tvm/relay/frontend/pytorch.py
index 54004c379d..0213dcc488 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2328,6 +2328,21 @@ class PyTorchOpConverter:
 res_shape = list(torch.broadcast_tensors(*map(torch.empty, 
infer_shape_value))[0].shape)
 return [_op.broadcast_to(tensor, res_shape) for tensor in tensor_list]
 
+def broadcast_to(self, inputs, input_types):
+tensor = inputs[0]
+new_shape = inputs[1]
+import torch
+
+if not isinstance(new_shape, (list, tuple, torch.Size)):
+msg = f"Data type {type(new_shape)} could not be parsed in 
broadcast_to op"
+raise AssertionError(msg)
+
+for i, dim in enumerate(new_shape):
+if not isinstance(dim, int):
+new_shape[i] = int(_infer_value(dim, {}).numpy())
+
+return _op.broadcast_to(tensor, new_shape)
+
 def Bool(self, inputs, input_types):
 assert len(inputs) == 1
 return inputs[0]
@@ -4190,6 +4205,7 @@ class PyTorchOpConverter:
 "aten::upsample_nearest3d": 
self.make_upsample3d("nearest_neighbor"),
 "aten::expand_as": self.expand_as,
 "aten::broadcast_tensors": self.broadcast_tensors,
+"aten::broadcast_to": self.broadcast_to,
 "aten::lt": self.make_elemwise("less"),
 "aten::gt": self.make_elemwise("greater"),
 "aten::le": self.make_elemwise("less_equal"),
diff --git a/tests/python/frontend/pytorch/test_forward.py 
b/tests/python/frontend/pytorch/test_forward.py
index 56afe72ecd..6178a58b6d 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -2162,6 +2162,31 @@ def test_forward_broadcast_tensors():
 verify_model(BroadCastTensors2().float().eval(), input_data=[x, y, z])
 
 
+@tvm.testing.uses_gpu
+def test_forward_broadcast_to():
+"""test_forward_broadcast_to"""
+torch.set_grad_enabled(False)
+
+class BroadCastTo1(Module):
+def forward(self, x):
+return torch.broadcast_to(x, (3, 3))
+
+x = torch.tensor([1, 2, 3])
+verify_model(BroadCastTo1().float().eval(), input_data=[x])
+
+class BroadCastTo2(Module):
+def __init__(self):
+super().__init__()
+self.y = torch.tensor(1)
+self.z = torch.tensor(2)
+
+def forward(self, x):
+return torch.broadcast_to(x, (self.y + self.z, 3))
+
+x = torch.tensor([1, 2, 3])
+verify_model(BroadCastTo2().float().eval(), input_data=[x])
+
+
 @tvm.testing.uses_gpu
 def test_forward_pow():
 """test_forward_pow"""



(tvm) branch unity updated: [Unity][Frontend] Introducing Object (#16316)

2023-12-31 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 9030522960 [Unity][Frontend] Introducing Object (#16316)
9030522960 is described below

commit 90305229604b0ca4cce34bd6de5b6b21925b55d4
Author: Ruihang Lai 
AuthorDate: Sun Dec 31 18:33:10 2023 -0500

[Unity][Frontend] Introducing Object (#16316)

This PR supports `Object` as a new spec/frontend type in
nn.Module, so that non-tensor opaque objects (such as
PagedKVCache) can be effectively represented.
---
 python/tvm/relax/frontend/nn/__init__.py |  2 +-
 python/tvm/relax/frontend/nn/core.py | 23 ++-
 python/tvm/relax/frontend/nn/exporter.py |  8 +---
 python/tvm/relax/frontend/nn/spec.py | 16 ++--
 4 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/__init__.py 
b/python/tvm/relax/frontend/nn/__init__.py
index 5723e3d9ff..61d1001ea8 100644
--- a/python/tvm/relax/frontend/nn/__init__.py
+++ b/python/tvm/relax/frontend/nn/__init__.py
@@ -17,7 +17,7 @@
 """A PyTorch-like API to build IRModules."""
 # pylint: disable=redefined-builtin
 from . import op, spec
-from .core import Effect, Module, ModuleList, Parameter, Tensor
+from .core import Effect, Module, ModuleList, Object, Parameter, Tensor
 from .exporter import add_extern
 from .extern import ExternModule, ObjectModule, SourceModule
 from .modules import (
diff --git a/python/tvm/relax/frontend/nn/core.py 
b/python/tvm/relax/frontend/nn/core.py
index 8ed0efe2cd..9c99ba6177 100644
--- a/python/tvm/relax/frontend/nn/core.py
+++ b/python/tvm/relax/frontend/nn/core.py
@@ -50,7 +50,12 @@ from tvm.target import Target
 
 from ... import expr as rx
 from ...block_builder import BlockBuilder
-from ...struct_info import ShapeStructInfo, TensorStructInfo, TupleStructInfo
+from ...struct_info import (
+ObjectStructInfo,
+ShapeStructInfo,
+TensorStructInfo,
+TupleStructInfo,
+)
 from ._tensor_op import _TensorOp
 from .subroutine import SubroutineMixin
 
@@ -274,6 +279,22 @@ class Parameter(Tensor):
 )._expr
 
 
+class Object:
+"""A wrapper on top of relax.Expr whose struct_info is the base
+ObjectStructInfo (rather than any of its subclasses). Object effectively
+represents non-tensor frontend components such as KV caches.
+"""
+
+_expr: rx.Var
+
+def __init__(self, *, _expr: rx.Expr, _name: str) -> None:
+"""Private constructor. Object is never supposed to be constructed 
directly by users."""
+if not isinstance(_expr, rx.Var):
+_expr = BlockBuilder.current().emit(_expr, _name)
+self._expr = _expr
+assert isinstance(self._expr.struct_info, ObjectStructInfo)
+
+
 class Effect:
 """Effect is a special non-user facing type that is used to represent 
operations with side
 effects, for example, print. It is used to represent the output of a 
computation.
diff --git a/python/tvm/relax/frontend/nn/exporter.py 
b/python/tvm/relax/frontend/nn/exporter.py
index 416913def4..99591c8a3e 100644
--- a/python/tvm/relax/frontend/nn/exporter.py
+++ b/python/tvm/relax/frontend/nn/exporter.py
@@ -23,7 +23,7 @@ from tvm.ir import IRModule
 
 from ... import expr as rx
 from ...block_builder import BlockBuilder
-from ...struct_info import ShapeStructInfo, TupleStructInfo
+from ...struct_info import ObjectStructInfo, ShapeStructInfo, TupleStructInfo
 from . import core, extern
 from . import spec as _spec
 from .modules import IOEffect
@@ -160,7 +160,7 @@ def _emit_method(  # pylint: 
disable=too-many-locals,too-many-branches,too-many-
 ):
 # pylint: disable=protected-access
 def _unwrap_ret(expr: typing.Any) -> typing.Any:
-if isinstance(expr, core.Tensor):
+if isinstance(expr, (core.Tensor, core.Object)):
 return expr._expr
 if isinstance(expr, tuple):
 return rx.Tuple([_unwrap_ret(x) for x in expr])
@@ -171,7 +171,7 @@ def _emit_method(  # pylint: 
disable=too-many-locals,too-many-branches,too-many-
 def _convert_input(arg):
 if isinstance(arg, tir.Var):
 return rx.Var(arg.name, struct_info=ShapeStructInfo(values=[arg]))
-if isinstance(arg, core.Tensor):
+if isinstance(arg, (core.Tensor, core.Object)):
 return arg._expr  # pylint: disable=protected-access
 if isinstance(arg, _spec.Tuple):
 return rx.Var(
@@ -292,6 +292,8 @@ def _method_spec_to_inputs(
 dtype=arg_spec.dtype,
 name=arg_name,
 )
+elif isinstance(arg_spec, _spec.Object):
+arg = arg_spec.object_type(_expr=rx.Var(arg_name, 
ObjectStructInfo()), _name=arg_name)
 elif is

(tvm) branch unity updated (beb832616d -> 8867de843d)

2023-12-31 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


from beb832616d [Unity] Update cutlass FpA IntB GeMM submodule (#16320)
 add 8867de843d [Unity][MSC][Bugfix] Use random workspace for test (#16322)

No new revisions were added by this update.

Summary of changes:
 tests/python/contrib/test_msc/test_manager.py | 1 +
 tests/python/contrib/test_msc/test_runner.py  | 4 ++--
 tests/python/contrib/test_msc/test_tools.py   | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)



(tvm) branch unity updated (2f7e0d578f -> beb832616d)

2023-12-31 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 2f7e0d578f [Unity] Ensure memory planning cross-function independence 
(#16318)
 add beb832616d [Unity] Update cutlass FpA IntB GeMM submodule (#16320)

No new revisions were added by this update.

Summary of changes:
 3rdparty/cutlass_fpA_intB_gemm  | 2 +-
 CMakeLists.txt  | 2 ++
 cmake/modules/contrib/CUTLASS.cmake | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)



(tvm) branch unity updated: [Unity] Ensure memory planning cross-function independence (#16318)

2023-12-31 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 2f7e0d578f [Unity] Ensure memory planning cross-function independence 
(#16318)
2f7e0d578f is described below

commit 2f7e0d578f777a630bb6e4a79d9c2ec52b7be461
Author: Ruihang Lai 
AuthorDate: Sun Dec 31 13:10:02 2023 -0500

[Unity] Ensure memory planning cross-function independence (#16318)

Prior to this PR, the memory planning for different Relax functions
are not independent -- storage tokens are shared across different
Relax functions.

This can sometimes cause memory overuse. For example, tensor `A`
in `func1` has 128 bytes, tensor `B` in `func2` has 2048 bytes.
If the memory planning decides to share the storage token for `A`
and `B`, the shared token will have size 2048 bytes.

Consider the case when at runtime only `func1` is executed, and
`func2` is never invoked. In this case, only 128 bytes are needed for
tensor `A`, while a 2048-byte chunk is allocated, which is a 16x
memory overuse.

This PR makes the memory planning across different Relax function
independent. That means in the example above, when only `func1`
is executed, only 128 bytes will be allocated.
---
 src/relax/transform/static_plan_block_memory.cc|  8 +++
 .../test_transform_static_plan_block_memory.py | 73 +-
 2 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/src/relax/transform/static_plan_block_memory.cc 
b/src/relax/transform/static_plan_block_memory.cc
index 4a2a1555ff..3873f624ef 100644
--- a/src/relax/transform/static_plan_block_memory.cc
+++ b/src/relax/transform/static_plan_block_memory.cc
@@ -219,6 +219,12 @@ class TokenAllocator1D {
 available_pool_[token->dtype].insert({token->bytes, token});
   }
 
+  /*! \brief Clear the allocator. */
+  void Clear() {
+available_pool_.clear();
+full_pool_.clear();
+  }
+
  private:
   /*! \brief A constant scale representing the token search range. */
   const int match_range_{16};
@@ -569,6 +575,8 @@ class StorageAllocator : public StorageAllocatorBaseVisitor 
{
   if (func == nullptr) {
 continue;
   }
+  // Clear the allocator to make the planning of different functions 
independent.
+  allocator_.Clear();
   this->VisitExpr_(func);
 }
   }
diff --git a/tests/python/relax/test_transform_static_plan_block_memory.py 
b/tests/python/relax/test_transform_static_plan_block_memory.py
index 0c24f90efc..f12b5b9fc1 100644
--- a/tests/python/relax/test_transform_static_plan_block_memory.py
+++ b/tests/python/relax/test_transform_static_plan_block_memory.py
@@ -18,7 +18,9 @@
 import tvm
 import tvm.testing
 from tvm import relax
-from tvm.script import ir as I, relax as R, tir as T
+from tvm.script import ir as I
+from tvm.script import relax as R
+from tvm.script import tir as T
 
 
 def test_basic():
@@ -1105,5 +1107,74 @@ def test_call_tir_dyn():
 tvm.ir.assert_structural_equal(mod, Expected)
 
 
+def test_function_independence():
+# fmt: off
+@tvm.script.ir_module
+class Module:
+@T.prim_func
+def exp(A: T.handle, B: T.handle):
+T.evaluate(0)
+
+@R.function
+def func1(x: R.Tensor((8,), dtype="float32")) -> R.Tensor((8,), 
dtype="float32"):
+R.func_attr({"relax.force_pure": 1})
+cls = Module
+alloc: R.Tensor((8,), dtype="float32") = 
R.builtin.alloc_tensor(R.shape([8,]), dtype="float32", runtime_device_index=0)
+_: R.Tuple() = cls.exp(x, alloc)
+lv: R.Tensor((8,), dtype="float32") = alloc
+alloc1: R.Tensor((8,), dtype="float32") = 
R.builtin.alloc_tensor(R.shape([8,]), dtype="float32", runtime_device_index=0)
+_1: R.Tuple() = cls.exp(lv, alloc1)
+gv: R.Tensor((8,), dtype="float32") = alloc1
+return gv
+
+@R.function
+def func2(x: R.Tensor((10,), dtype="float32")) -> R.Tensor((10,), 
dtype="float32"):
+R.func_attr({"relax.force_pure": 1})
+cls = Module
+alloc: R.Tensor((10,), dtype="float32") = 
R.builtin.alloc_tensor(R.shape([10,]), dtype="float32", runtime_device_index=0)
+_: R.Tuple() = cls.exp(x, alloc)
+lv: R.Tensor((10,), dtype="float32") = alloc
+alloc1: R.Tensor((10,), dtype="float32") = 
R.builtin.alloc_tensor(R.shape([10,]), dtype="float32", runtime_device_index=0)
+_1: R.Tuple() = cls.exp(lv, alloc1)
+gv: R.Tensor((10,), dtype="float32") = alloc1
+return gv
+
+@I.ir_module
+class 
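
A short sketch of applying the pass to a two-function module, adapted
from the test above (the `exp` body and shapes are placeholders; only
`func1` is reproduced here for brevity):

    from tvm import relax
    from tvm.script import ir as I
    from tvm.script import relax as R
    from tvm.script import tir as T


    @I.ir_module
    class Module:
        @T.prim_func
        def exp(A: T.handle, B: T.handle):
            T.evaluate(0)

        @R.function
        def func1(x: R.Tensor((8,), dtype="float32")) -> R.Tensor((8,), dtype="float32"):
            R.func_attr({"relax.force_pure": 1})
            cls = Module
            alloc: R.Tensor((8,), dtype="float32") = R.builtin.alloc_tensor(R.shape([8]), dtype="float32", runtime_device_index=0)
            _: R.Tuple() = cls.exp(x, alloc)
            gv: R.Tensor((8,), dtype="float32") = alloc
            return gv


    # The planner now clears its storage-token pool per function, so each
    # Relax function only reserves the storage it needs itself.
    planned = relax.transform.StaticPlanBlockMemory()(Module)
    planned.show()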

(tvm) branch unity updated: Update FlashInfer (#16292)

2023-12-28 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 58daeb4905 Update FlashInfer (#16292)
58daeb4905 is described below

commit 58daeb49058f17249d84e0a24f32bfb6ece6
Author: Junru Shao 
AuthorDate: Thu Dec 28 17:45:22 2023 -0500

Update FlashInfer (#16292)
---
 3rdparty/flashinfer | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index 65db90e4a7..8d987b98f7 16
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit 65db90e4a755af5c6f3aee1607041f6adcb3accc
+Subproject commit 8d987b98f7f7b9381097566643a7f53c99cf312d



(tvm) branch main updated: [BugFix] Fixed Inappropriate Logical Expression (#16272)

2023-12-26 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 1c4538947b [BugFix] Fixed Inappropriate Logical Expression (#16272)
1c4538947b is described below

commit 1c4538947bed5b987530730a3a7be5632725c965
Author: Ataf Fazledin Ahamed 
AuthorDate: Wed Dec 27 03:19:17 2023 +0600

[BugFix] Fixed Inappropriate Logical Expression (#16272)

[BugFix] Fixed a comparison for splitting tensor

In the `tensor_split` method, there is a comparison
that checks whether the input tensor is a zero-dimensional
or one-dimensional long tensor.

In that comparison, a typo converts the shape of the
tensor to a list and compares it against an integer,
so the check never matches.

This commit fixes the bug by comparing the length of
the tensor's shape against the integer instead.

Signed-off-by: fazledyn-or 
---
 python/tvm/relay/frontend/pytorch.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/python/tvm/relay/frontend/pytorch.py 
b/python/tvm/relay/frontend/pytorch.py
index b02e59b265..54004c379d 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -595,9 +595,7 @@ class PyTorchOpConverter:
 )
 raise AssertionError(msg)
 
-if isinstance(inputs[1], torch.Tensor) and not (
-list(inputs[1].shape) == [] or list(inputs[1].shape) == 1
-):
+if isinstance(inputs[1], torch.Tensor) and len(inputs[1].shape) not in 
[0, 1]:
 msg = "indices_or_sections must be a zero-dimensional or 
one-dimensional long tensor"
 raise AssertionError(msg)
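
A small illustration (independent of TVM) of why the original check
never triggered and what the corrected one accepts, assuming PyTorch is
available:

    import torch

    t0 = torch.tensor(3)          # zero-dimensional
    t1 = torch.tensor([1, 2, 3])  # one-dimensional
    t2 = torch.ones(2, 2)         # two-dimensional

    # Old (buggy) check: a list is never equal to the integer 1, so the
    # second clause is always False and only scalars pass.
    for t in (t0, t1, t2):
        print(list(t.shape) == [] or list(t.shape) == 1)   # True, False, False

    # Fixed check: accept 0-d and 1-d tensors, reject everything else.
    for t in (t0, t1, t2):
        print(len(t.shape) in [0, 1])                      # True, True, False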
 



(tvm) branch unity updated: Update FlashInfer (#16281)

2023-12-26 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 8946efa62e Update FlashInfer (#16281)
8946efa62e is described below

commit 8946efa62e30ec7f71ddd835adc92b171951ed40
Author: Junru Shao 
AuthorDate: Tue Dec 26 16:09:05 2023 -0500

Update FlashInfer (#16281)
---
 3rdparty/flashinfer | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index f77482a346..65db90e4a7 16
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit f77482a3466453d9c8d72f09c437ecaecd0ef096
+Subproject commit 65db90e4a755af5c6f3aee1607041f6adcb3accc



(tvm) branch unity updated: [Unity][Frontend] NNModule `tensor_ir_op` support (#16278)

2023-12-26 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 889d2f6cef [Unity][Frontend] NNModule `tensor_ir_op` support (#16278)
889d2f6cef is described below

commit 889d2f6cef5a0a533f48e763b626367d9c36ccff
Author: Siyuan Feng 
AuthorDate: Wed Dec 27 05:05:47 2023 +0800

[Unity][Frontend] NNModule `tensor_ir_op` support (#16278)

This PR adds support for `tensor_ir_op` in NNModule, which enables us to
call a TensorIR function from NNModule.

This PR also adds a test case for the extern op.
---
 python/tvm/relax/frontend/nn/_tensor_op.py |  12 +++
 python/tvm/relax/frontend/nn/modules.py|   4 +-
 python/tvm/relax/frontend/nn/op.py |  76 -
 tests/python/relax/test_frontend_nn_op.py  | 130 +
 4 files changed, 219 insertions(+), 3 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/_tensor_op.py 
b/python/tvm/relax/frontend/nn/_tensor_op.py
index a653c9fa29..627b8b626c 100644
--- a/python/tvm/relax/frontend/nn/_tensor_op.py
+++ b/python/tvm/relax/frontend/nn/_tensor_op.py
@@ -47,10 +47,22 @@ class _TensorOp:
 other = _convert_scalar(other, self)
 return _op().add(self, other)
 
+def __sub__(self, other):
+other = _convert_scalar(other, self)
+return _op().subtract(self, other)
+
+def __rsub__(self, other):
+other = _convert_scalar(other, self)
+return _op().subtract(other, self)
+
 def __mul__(self, other):
 other = _convert_scalar(other, self)
 return _op().multiply(self, other)
 
+def __rmul__(self, other):
+other = _convert_scalar(other, self)
+return _op().multiply(self, other)
+
 def __truediv__(self, other):
 other = _convert_scalar(other, self)
 return _op().divide(self, other)
diff --git a/python/tvm/relax/frontend/nn/modules.py 
b/python/tvm/relax/frontend/nn/modules.py
index b2c97a567a..03d6a06994 100644
--- a/python/tvm/relax/frontend/nn/modules.py
+++ b/python/tvm/relax/frontend/nn/modules.py
@@ -311,7 +311,7 @@ class ConvTranspose1D(Module):
 
 def forward(self, x: Tensor) -> Tensor:
 """
-Forward method for convtranspose1d layer.
+Forward method for conv transpose 1d layer.
 
 Parameters
 --
@@ -321,7 +321,7 @@ class ConvTranspose1D(Module):
 Returns
 ---
 ret : Tensor
-The output tensor for the convtranspose1d layer.
+The output tensor for the conv transpose 1d layer.
 """
 return op.conv1d_transpose(
 x,
diff --git a/python/tvm/relax/frontend/nn/op.py 
b/python/tvm/relax/frontend/nn/op.py
index 2369451ac9..3197145289 100644
--- a/python/tvm/relax/frontend/nn/op.py
+++ b/python/tvm/relax/frontend/nn/op.py
@@ -1461,13 +1461,87 @@ def tensor_expr_op(
 OutType = TypeVar("OutType", bound=Union[Tensor, Sequence[Tensor]])
 
 
+def tensor_ir_op(
+func: _tir.PrimFunc,
+name_hint: str,
+args: Union[Tensor, Sequence[Union[Tensor, _tir.Var]]],
+out: OutType,
+) -> OutType:
+"""Create a `call_tir` binding with given PrimFunc
+
+Parameters
+--
+func : _tir.PrimFunc
+The PrimFunc to call.
+
+name_hint : str
+Name hint.
+
+args : Union[Tensor, Sequence[Union[Tensor, _tir.Var]]]
+The arguments to pass to the PrimFunc.
+
+out : Union[Tensor, List[Tensor]]
+The output tensors.
+
+Returns
+---
+result : Tensor
+The result tensor
+"""
+from tvm import relax as rx  # pylint: disable=import-outside-toplevel
+
+call_tir_args, tir_vars = [], []
+if not isinstance(args, (tuple, list)):
+args = [args]
+
+for arg in args:
+if isinstance(arg, Tensor):
+call_tir_args.append(arg._expr)
+elif isinstance(arg, _tir.Var):
+tir_vars.append(arg)
+else:
+raise TypeError(
+f"Unsupported type: tensor_ir_op args expect Tensor or 
tir.Var, but got {type(arg)}"
+)
+
+if isinstance(out, Tensor):
+out_sinfo = [out._expr.struct_info]
+else:
+out_sinfo = [x._expr.struct_info for x in out]
+
+bb = BlockBuilder.current()
+global_var = bb.add_func(func, name_hint)
+
+return wrap_nested(
+bb.emit(rx.call_tir(global_var, call_tir_args, out_sinfo, 
tir_vars=tir_vars)),
+name=name_hint,
+)
+
+
 def extern(
 name: str,
 args: Sequence[Union[Tensor, _tir.PrimExpr, int, float, str]],
 out: OutType,
 ) -> OutType:
 """Invoke an extern function during runtime. The extern function must be 
registered with the "
-TVM runtime using `TVM
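
A hedged usage sketch of `tensor_ir_op` based on the signature above
(the PrimFunc, shapes, and the use of `Tensor.placeholder` to describe
the output struct info are assumptions for illustration):

    from tvm.relax.frontend.nn import Module, Tensor, op, spec
    from tvm.script import tir as T


    @T.prim_func
    def add_one(A: T.Buffer((8,), "float32"), B: T.Buffer((8,), "float32")):
        for i in range(8):
            with T.block("add"):
                vi = T.axis.spatial(8, i)
                B[vi] = A[vi] + T.float32(1)


    class Model(Module):
        def forward(self, x: Tensor):
            # Emits a call_tir binding to the PrimFunc under the hood.
            return op.tensor_ir_op(
                add_one,
                "add_one",
                args=[x],
                out=Tensor.placeholder([8], "float32"),  # assumed output helper
            )


    irmodule, _ = Model().export_tvm(
        spec={"forward": {"x": spec.Tensor([8], "float32")}}
    )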

(tvm) branch main updated: replace deprecated np.int with int to avoid crash (#16279)

2023-12-26 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new a050696ca5 replace deprecated np.int with int to avoid crash (#16279)
a050696ca5 is described below

commit a050696ca54a823f7f6072de5ea89f1b900520ba
Author: Qingchao Shen 
AuthorDate: Wed Dec 27 05:04:43 2023 +0800

replace deprecated np.int with int to avoid crash (#16279)

replace deprecated np.int with int
---
 python/tvm/relay/frontend/pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/pytorch.py 
b/python/tvm/relay/frontend/pytorch.py
index 32a7d9c5d8..b02e59b265 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -2756,7 +2756,7 @@ class PyTorchOpConverter:
 for i in [0, 1]:
 size, _ = try_infer_value(
 inputs[1][i],
-lambda ret: ret.astype(np.int),
+lambda ret: ret.astype(int),
 lambda: _op.expand_dims(inputs[1][i], axis=0),
 )
 out_size.append(size)
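
For context, `np.int` was a deprecated alias for the builtin `int` and
was removed in NumPy 1.24, so `ret.astype(np.int)` raises
`AttributeError` on recent NumPy.  The replacement is the builtin `int`
(or an explicit width such as `np.int64`):

    import numpy as np

    ret = np.array([3.7, 5.2])

    # Raises AttributeError on NumPy >= 1.24:
    #   ret.astype(np.int)
    print(ret.astype(int))       # [3 5]
    print(ret.astype(np.int64))  # [3 5] with an explicit width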



(tvm) branch feature/2023-12-24/extern-op deleted (was 0896465b87)

2023-12-26 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch feature/2023-12-24/extern-op
in repository https://gitbox.apache.org/repos/asf/tvm.git


 was 0896465b87 [Unity][nn.Module] Support Runtime-Calling Any PackedFunc 
via `op.extern`

The revisions that were on this branch are still contained in
other references; therefore, this change does not discard any commits
from the repository.



(tvm) 01/01: [Unity][nn.Module] Support Runtime-Calling Any PackedFunc via `op.extern`

2023-12-24 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch feature/2023-12-24/extern-op
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 0896465b8747aa95a38b4c9272b7984cd1231613
Author: Junru Shao 
AuthorDate: Sun Dec 24 21:19:04 2023 -0800

[Unity][nn.Module] Support Runtime-Calling Any PackedFunc via `op.extern`
---
 python/tvm/relax/frontend/nn/op.py | 41 +-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relax/frontend/nn/op.py 
b/python/tvm/relax/frontend/nn/op.py
index 75bc4574fc..2369451ac9 100644
--- a/python/tvm/relax/frontend/nn/op.py
+++ b/python/tvm/relax/frontend/nn/op.py
@@ -19,7 +19,7 @@
 """nn.Tensor operators."""
 import inspect
 import math
-from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, 
TypeVar, Union
 
 import numpy as np
 
@@ -1458,6 +1458,45 @@ def tensor_expr_op(
 )
 
 
+OutType = TypeVar("OutType", bound=Union[Tensor, Sequence[Tensor]])
+
+
+def extern(
+name: str,
+args: Sequence[Union[Tensor, _tir.PrimExpr, int, float, str]],
+out: OutType,
+) -> OutType:
+"""Invoke an extern function during runtime. The extern function must be 
registered with the "
+TVM runtime using `TVM_REGISTER_GLOBAL` (C++), or `tvm.register_func` 
(Python)."""
+from tvm import relax as rx  # pylint: disable=import-outside-toplevel
+
+def _convert(arg, name: str):
+if isinstance(arg, Tensor):
+return arg._expr  # pylint: disable=protected-access
+if isinstance(arg, int):
+return rx.PrimValue(_tir.IntImm("int64", arg))
+if isinstance(arg, float):
+return rx.PrimValue(_tir.FloatImm("float64", arg))
+if isinstance(arg, str):
+return rx.StringImm(arg)
+if isinstance(arg, _tir.PrimExpr):
+return rx.PrimValue(arg)
+if isinstance(arg, (tuple, list)):
+return rx.Tuple([_convert(e, f"{name}_{i}") for i, e in 
enumerate(arg)])
+raise TypeError(f"Unsupported input type: {type(arg)}")
+
+rx_inputs = _convert(args, "input")
+rx_outputs_sinfo = _convert(out, "dummy").struct_info
+return wrap_nested(
+_op.call_dps_packed(
+name,
+args=rx_inputs,
+out_sinfo=rx_outputs_sinfo,
+),
+name,
+)  # type: ignore
+
+
 def debug_func(
 name: str,
 *args: Union[Tensor, _tir.PrimExpr, int, float, str],
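
A hedged sketch of calling a Python packed function through
`op.extern`, based on the code above (the function name and the use of
`Tensor.placeholder` as the output description are assumptions for
illustration; the output buffer is passed last, per the
destination-passing convention of `call_dps_packed`):

    import tvm
    from tvm.relax.frontend.nn import Module, Tensor, op


    @tvm.register_func("demo.scale_by_two", override=True)
    def scale_by_two(x, out):
        # The runtime passes a preallocated output NDArray as the last arg.
        out.copyfrom(x.numpy() * 2)


    class Model(Module):
        def forward(self, x: Tensor):
            return op.extern(
                "demo.scale_by_two",
                [x],
                out=Tensor.placeholder([8], "float32"),  # assumed output helper
            )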



(tvm) branch feature/2023-12-24/extern-op created (now 0896465b87)

2023-12-24 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch feature/2023-12-24/extern-op
in repository https://gitbox.apache.org/repos/asf/tvm.git


  at 0896465b87 [Unity][nn.Module] Support Runtime-Calling Any PackedFunc 
via `op.extern`

This branch includes the following new commits:

 new 0896465b87 [Unity][nn.Module] Support Runtime-Calling Any PackedFunc 
via `op.extern`

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




(tvm) 01/03: [Unity] Bump fpA_intB_gemm (#16244)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit e98fdea65460512b97ccc87be1f43e6e37486814
Author: Wuwei Lin 
AuthorDate: Thu Dec 14 11:38:00 2023 -0800

[Unity] Bump fpA_intB_gemm (#16244)

Updated the preprocessing and the submodule to support 3D weights for MoE.

* update

* update

* update
---
 3rdparty/cutlass_fpA_intB_gemm   |  2 +-
 src/runtime/contrib/cutlass/weight_preprocess.cc | 15 +--
 tests/scripts/task_config_build_gpu.sh   |  1 +
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/3rdparty/cutlass_fpA_intB_gemm b/3rdparty/cutlass_fpA_intB_gemm
index ed951b046f..74ee6cb468 16
--- a/3rdparty/cutlass_fpA_intB_gemm
+++ b/3rdparty/cutlass_fpA_intB_gemm
@@ -1 +1 @@
-Subproject commit ed951b046f89ddfd990af8d2482e3350bda2fec6
+Subproject commit 74ee6cb46816267515c08eb78755d2b9b8db0bb4
diff --git a/src/runtime/contrib/cutlass/weight_preprocess.cc 
b/src/runtime/contrib/cutlass/weight_preprocess.cc
index ef80627cc7..4b378fa4a7 100644
--- a/src/runtime/contrib/cutlass/weight_preprocess.cc
+++ b/src/runtime/contrib/cutlass/weight_preprocess.cc
@@ -37,18 +37,21 @@ namespace runtime {
 // The preprocessing functions are defined in C++, so we need to copy the 
input weight to CPU.
 TVM_REGISTER_GLOBAL("cutlass.ft_preprocess_weight")
 .set_body_typed([](NDArray packed_weight, int sm, bool is_int4) {
-  int rows = packed_weight->shape[0];
-  int cols = packed_weight->shape[1];
-  std::vector input_cpu(rows * cols);
-  std::vector output_cpu(rows * cols);
+  bool is_2d = packed_weight->ndim == 2;
+  int num_experts = is_2d ? 1 : packed_weight->shape[0];
+  int rows = packed_weight->shape[is_2d ? 0 : 1];
+  int cols = packed_weight->shape[is_2d ? 1 : 2];
+
+  std::vector input_cpu(num_experts * rows * cols);
+  std::vector output_cpu(num_experts * rows * cols);
   packed_weight.CopyToBytes(input_cpu.data(), input_cpu.size());
   // multiply cols by 2 since the "col" params in preprocess_weights 
refers to the column of
   // the unpacked weight.
   if (is_int4) {
 cols *= 2;
   }
-  fastertransformer::preprocess_weights(output_cpu.data(), 
input_cpu.data(), rows, cols,
-is_int4, sm);
+  fastertransformer::preprocess_weights(output_cpu.data(), 
input_cpu.data(), num_experts, rows,
+cols, is_int4, sm);
   auto out = NDArray::Empty(packed_weight.Shape(), packed_weight->dtype, 
packed_weight->device);
   out.CopyFromBytes(output_cpu.data(), output_cpu.size());
   return out;
diff --git a/tests/scripts/task_config_build_gpu.sh 
b/tests/scripts/task_config_build_gpu.sh
index 37ab0a87f1..e68e646ce1 100755
--- a/tests/scripts/task_config_build_gpu.sh
+++ b/tests/scripts/task_config_build_gpu.sh
@@ -54,3 +54,4 @@ echo set\(USE_PIPELINE_EXECUTOR ON\) >> config.cmake
 echo set\(USE_CUTLASS ON\) >> config.cmake
 echo set\(USE_CMSISNN ON\) >> config.cmake
 echo set\(USE_MSC ON\) >> config.cmake
+echo set\(CMAKE_CUDA_ARCHITECTURES 75\) >> config.cmake
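
A hedged sketch of exercising the updated packed function from Python,
assuming a TVM build with the CUTLASS contrib enabled (shapes and the
`sm` value are illustrative; the function now accepts both 2-D weights
and 3-D `(num_experts, rows, cols)` weights):

    import numpy as np
    import tvm

    preprocess = tvm.get_global_func("cutlass.ft_preprocess_weight")

    # 3-D int8 weight for a hypothetical MoE layer: (num_experts, rows, cols).
    weight = tvm.nd.array(np.zeros((4, 128, 64), dtype="int8"))

    # sm=80 targets Ampere; is_int4=False keeps the int8 layout.
    out = preprocess(weight, 80, False)
    print(out.shape)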



(tvm) 03/03: [Unity] Fix ccache env for `nn.SourceModule` (#16257)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 45eeb8c83857b1ff01bcc3930fe7d83e94626c10
Author: Yaxing Cai 
AuthorDate: Mon Dec 18 12:44:45 2023 -0800

[Unity] Fix ccache env for `nn.SourceModule` (#16257)

This PR refactors the compilation of `nn.SourceModule` to enable ccache by
using relative paths instead of absolute paths. It also sets the ccache
environment so that the build directory is not hashed.
---
 python/tvm/relax/frontend/nn/extern.py | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/extern.py 
b/python/tvm/relax/frontend/nn/extern.py
index 2d20809d23..8c0491eaec 100644
--- a/python/tvm/relax/frontend/nn/extern.py
+++ b/python/tvm/relax/frontend/nn/extern.py
@@ -371,17 +371,24 @@ class SourceModule(ExternModule):  # pylint: 
disable=too-few-public-methods
 """Compiles the source code in a provided directory and returns the 
compiled artifact."""
 with tempfile.TemporaryDirectory() as temp_dir_str:
 temp_dir = Path(temp_dir_str)
-source_path = temp_dir / f"main{self.source_suffix}"
-object_path = temp_dir / f"main{self.output_suffix}"
+source_filename = f"main{self.source_suffix}"
+object_filename = f"main{self.output_suffix}"
+source_path = temp_dir / source_filename
+object_path = temp_dir / object_filename
 with source_path.open("w", encoding="utf-8") as file:
 file.write(self.source_code)
 _cc.create_shared(
-output=str(object_path),
-objects=[str(source_path)],
+output=object_filename,
+objects=[source_filename],
 options=self.compile_options,
 cc=self.compiler,
 cwd=temp_dir,
-ccache_env={"CCACHE_COMPILERCHECK": "content"} if 
shutil.which("ccache") else None,
+ccache_env={
+"CCACHE_COMPILERCHECK": "content",
+"CCACHE_NOHASHDIR": "1",
+}
+if shutil.which("ccache")
+else None,
 )
 shutil.move(str(object_path), str(output_path))
 



(tvm) branch unity updated (c95d45f4bd -> 45eeb8c838)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


 discard c95d45f4bd [Unity] Fix ccache env for `nn.SourceModule` (#16257)
 discard 1bf4437bc9 [Fix] add TVM_DLL to disco functions (#16258)
 new e98fdea654 [Unity] Bump fpA_intB_gemm (#16244)
 new 4e66690a4d [Fix] add TVM_DLL to disco functions (#16258)
 new 45eeb8c838 [Unity] Fix ccache env for `nn.SourceModule` (#16257)

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (c95d45f4bd)
\
 N -- N -- N   refs/heads/unity (45eeb8c838)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 3rdparty/cutlass_fpA_intB_gemm   |  2 +-
 src/runtime/contrib/cutlass/weight_preprocess.cc | 15 +--
 tests/scripts/task_config_build_gpu.sh   |  1 +
 3 files changed, 11 insertions(+), 7 deletions(-)



(tvm) 02/03: [Fix] add TVM_DLL to disco functions (#16258)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 4e66690a4d033af912f5051c0e5a16c9c10691d9
Author: Lesheng Jin 
AuthorDate: Sun Dec 17 18:08:58 2023 +

[Fix] add TVM_DLL to disco functions (#16258)
---
 include/tvm/runtime/disco/builtin.h  |  4 ++--
 include/tvm/runtime/disco/disco_worker.h |  2 +-
 include/tvm/runtime/relax_vm/ndarray_cache_support.h | 10 +-
 src/runtime/disco/builtin.cc |  4 ++--
 src/runtime/disco/disco_worker.cc|  2 +-
 src/runtime/relax_vm/ndarray_cache_support.cc| 11 ++-
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/include/tvm/runtime/disco/builtin.h 
b/include/tvm/runtime/disco/builtin.h
index 3847aef3f2..512059b31b 100644
--- a/include/tvm/runtime/disco/builtin.h
+++ b/include/tvm/runtime/disco/builtin.h
@@ -89,14 +89,14 @@ void AllGather(NDArray send, NDArray recv);
  * \param send The buffer to be broadcasted
  * \param recv The buffer receives the broadcasted array
  */
-void BroadcastFromWorker0(NDArray send, NDArray recv);
+TVM_DLL void BroadcastFromWorker0(NDArray send, NDArray recv);
 /*!
  * \brief Perform a scatter operation from worker-0, chunking the given buffer 
into equal parts.
  * \param send For worker-0, it must be provided, and otherwise, the buffer 
must be None.
  * The buffer will be divided into equal parts and sent to each worker 
accordingly.
  * \param recv The receiving buffer, which must not be None.
  */
-void ScatterFromWorker0(Optional send, NDArray recv);
+TVM_DLL void ScatterFromWorker0(Optional send, NDArray recv);
 /*!
  * \brief Perform a gather operation to worker-0.
  * \param send The sending buffer, which must not be None.
diff --git a/include/tvm/runtime/disco/disco_worker.h 
b/include/tvm/runtime/disco/disco_worker.h
index 0c666150d4..14f8f23807 100644
--- a/include/tvm/runtime/disco/disco_worker.h
+++ b/include/tvm/runtime/disco/disco_worker.h
@@ -60,7 +60,7 @@ class DiscoWorker {
   /*! \brief Main loop of the worker */
   void MainLoop();
   /*! \brief Get the worker instance on the current thread */
-  static DiscoWorker* ThreadLocal();
+  TVM_DLL static DiscoWorker* ThreadLocal();
   /*! \brief Set the specific register to a specific value */
   void SetRegister(int reg_id, TVMArgValue value);
 
diff --git a/include/tvm/runtime/relax_vm/ndarray_cache_support.h 
b/include/tvm/runtime/relax_vm/ndarray_cache_support.h
index 3d8b639ee4..584da8f0ca 100644
--- a/include/tvm/runtime/relax_vm/ndarray_cache_support.h
+++ b/include/tvm/runtime/relax_vm/ndarray_cache_support.h
@@ -63,10 +63,10 @@ struct NDArrayCacheMetadata {
 };
 
 /*! \brief Load a FileRecord into memory */
-Array Load(Device device,   //
-const std::string& path_prefix,  //
-std::string* raw_data_buffer,//
-Optional* staging_buffer = nullptr) const;
+TVM_DLL Array Load(Device device,   //
+const std::string& path_prefix,  //
+std::string* raw_data_buffer,//
+Optional* staging_buffer = nullptr) 
const;
 
 /*! \brief Relative path to the bin file */
 std::string data_path;
@@ -83,7 +83,7 @@ struct NDArrayCacheMetadata {
   std::string path;
 
   /*! \brief Load the metadata from a specific directory */
-  static NDArrayCacheMetadata Load(const std::string& path);
+  TVM_DLL static NDArrayCacheMetadata Load(const std::string& path);
   /*! \brief Load the metadata from a given JSON string */
   static NDArrayCacheMetadata LoadFromStr(const std::string& json_str, const 
std::string& path);
 };
diff --git a/src/runtime/disco/builtin.cc b/src/runtime/disco/builtin.cc
index 51fe4c13fc..911fdaae3d 100644
--- a/src/runtime/disco/builtin.cc
+++ b/src/runtime/disco/builtin.cc
@@ -85,11 +85,11 @@ void AllReduce(NDArray send, ReduceKind reduce_kind, 
NDArray recv) {
 
 void AllGather(NDArray send, NDArray recv) { GetCCLFunc("allgather")(send, 
recv); }
 
-void BroadcastFromWorker0(NDArray send, NDArray recv) {
+TVM_DLL void BroadcastFromWorker0(NDArray send, NDArray recv) {
   GetCCLFunc("broadcast_from_worker0")(send, recv);
 }
 
-void ScatterFromWorker0(Optional send, NDArray recv) {
+TVM_DLL void ScatterFromWorker0(Optional send, NDArray recv) {
   GetCCLFunc("scatter_from_worker0")(send, recv);
 }
 
diff --git a/src/runtime/disco/disco_worker.cc 
b/src/runtime/disco/disco_worker.cc
index d3c6d6a383..e8ba351e79 100644
--- a/src/runtime/disco/disco_worker.cc
+++ b/src/runtime/disco/disco_worker.cc
@@ -37,7 +37,7 @@ struct ThreadLocalDiscoWorker {
   }
 };
 
-DiscoWorker* DiscoWorker::ThreadLocal() {
+TVM_DLL DiscoWorker* DiscoWorker::ThreadLocal() {
   Di

(tvm) branch revert-16235-fix-dtype-rewrite deleted (was a1920d6c48)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch revert-16235-fix-dtype-rewrite
in repository https://gitbox.apache.org/repos/asf/tvm.git


 was a1920d6c48 Revert "[Unity] Fix IndexDataTypeNormalizer so that it 
correctly handles corner case (#16235)"

The revisions that were on this branch are still contained in
other references; therefore, this change does not discard any commits
from the repository.



(tvm) branch revert-16258-fix/add_tvm_dll_for_disco deleted (was 541e221efe)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch revert-16258-fix/add_tvm_dll_for_disco
in repository https://gitbox.apache.org/repos/asf/tvm.git


 was 541e221efe Revert "[Fix] add TVM_DLL to disco functions (#16258)"

This change permanently discards the following revisions:

 discard 541e221efe Revert "[Fix] add TVM_DLL to disco functions (#16258)"
 discard 056a84457b [Unity] Fix ccache env for `nn.SourceModule` (#16257)



(tvm) branch unity updated (056a84457b -> c95d45f4bd)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


omit 056a84457b [Unity] Fix ccache env for `nn.SourceModule` (#16257)
 new 1bf4437bc9 [Fix] add TVM_DLL to disco functions (#16258)
 new c95d45f4bd [Unity] Fix ccache env for `nn.SourceModule` (#16257)

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (056a84457b)
\
 N -- N -- N   refs/heads/unity (c95d45f4bd)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 include/tvm/runtime/disco/builtin.h  |  4 ++--
 include/tvm/runtime/disco/disco_worker.h |  2 +-
 include/tvm/runtime/relax_vm/ndarray_cache_support.h | 10 +-
 src/runtime/disco/builtin.cc |  4 ++--
 src/runtime/disco/disco_worker.cc|  2 +-
 src/runtime/relax_vm/ndarray_cache_support.cc| 11 ++-
 6 files changed, 17 insertions(+), 16 deletions(-)



(tvm) 02/02: [Unity] Fix ccache env for `nn.SourceModule` (#16257)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit c95d45f4bd4b29cd3fb2950dfc5f8d53a4030bce
Author: Yaxing Cai 
AuthorDate: Mon Dec 18 12:44:45 2023 -0800

[Unity] Fix ccache env for `nn.SourceModule` (#16257)

This PR refactors the compilation of `nn.SourceModule` to enable ccache by
using relative paths instead of absolute paths. It also sets the ccache
environment so that the build directory is not hashed.
---
 python/tvm/relax/frontend/nn/extern.py | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/extern.py 
b/python/tvm/relax/frontend/nn/extern.py
index 2d20809d23..8c0491eaec 100644
--- a/python/tvm/relax/frontend/nn/extern.py
+++ b/python/tvm/relax/frontend/nn/extern.py
@@ -371,17 +371,24 @@ class SourceModule(ExternModule):  # pylint: 
disable=too-few-public-methods
 """Compiles the source code in a provided directory and returns the 
compiled artifact."""
 with tempfile.TemporaryDirectory() as temp_dir_str:
 temp_dir = Path(temp_dir_str)
-source_path = temp_dir / f"main{self.source_suffix}"
-object_path = temp_dir / f"main{self.output_suffix}"
+source_filename = f"main{self.source_suffix}"
+object_filename = f"main{self.output_suffix}"
+source_path = temp_dir / source_filename
+object_path = temp_dir / object_filename
 with source_path.open("w", encoding="utf-8") as file:
 file.write(self.source_code)
 _cc.create_shared(
-output=str(object_path),
-objects=[str(source_path)],
+output=object_filename,
+objects=[source_filename],
 options=self.compile_options,
 cc=self.compiler,
 cwd=temp_dir,
-ccache_env={"CCACHE_COMPILERCHECK": "content"} if 
shutil.which("ccache") else None,
+ccache_env={
+"CCACHE_COMPILERCHECK": "content",
+"CCACHE_NOHASHDIR": "1",
+}
+if shutil.which("ccache")
+else None,
 )
 shutil.move(str(object_path), str(output_path))
 



(tvm) 01/02: [Fix] add TVM_DLL to disco functions (#16258)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 1bf4437bc90ab62529a6ab86d5c00a04109f2a01
Author: Lesheng Jin 
AuthorDate: Sun Dec 17 18:08:58 2023 +

[Fix] add TVM_DLL to disco functions (#16258)
---
 include/tvm/runtime/disco/builtin.h  |  4 ++--
 include/tvm/runtime/disco/disco_worker.h |  2 +-
 include/tvm/runtime/relax_vm/ndarray_cache_support.h | 10 +-
 src/runtime/disco/builtin.cc |  4 ++--
 src/runtime/disco/disco_worker.cc|  2 +-
 src/runtime/relax_vm/ndarray_cache_support.cc| 11 ++-
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/include/tvm/runtime/disco/builtin.h 
b/include/tvm/runtime/disco/builtin.h
index 3847aef3f2..512059b31b 100644
--- a/include/tvm/runtime/disco/builtin.h
+++ b/include/tvm/runtime/disco/builtin.h
@@ -89,14 +89,14 @@ void AllGather(NDArray send, NDArray recv);
  * \param send The buffer to be broadcasted
  * \param recv The buffer receives the broadcasted array
  */
-void BroadcastFromWorker0(NDArray send, NDArray recv);
+TVM_DLL void BroadcastFromWorker0(NDArray send, NDArray recv);
 /*!
  * \brief Perform a scatter operation from worker-0, chunking the given buffer 
into equal parts.
  * \param send For worker-0, it must be provided, and otherwise, the buffer 
must be None.
  * The buffer will be divided into equal parts and sent to each worker 
accordingly.
  * \param recv The receiving buffer, which must not be None.
  */
-void ScatterFromWorker0(Optional send, NDArray recv);
+TVM_DLL void ScatterFromWorker0(Optional send, NDArray recv);
 /*!
  * \brief Perform a gather operation to worker-0.
  * \param send The sending buffer, which must not be None.
diff --git a/include/tvm/runtime/disco/disco_worker.h 
b/include/tvm/runtime/disco/disco_worker.h
index 0c666150d4..14f8f23807 100644
--- a/include/tvm/runtime/disco/disco_worker.h
+++ b/include/tvm/runtime/disco/disco_worker.h
@@ -60,7 +60,7 @@ class DiscoWorker {
   /*! \brief Main loop of the worker */
   void MainLoop();
   /*! \brief Get the worker instance on the current thread */
-  static DiscoWorker* ThreadLocal();
+  TVM_DLL static DiscoWorker* ThreadLocal();
   /*! \brief Set the specific register to a specific value */
   void SetRegister(int reg_id, TVMArgValue value);
 
diff --git a/include/tvm/runtime/relax_vm/ndarray_cache_support.h 
b/include/tvm/runtime/relax_vm/ndarray_cache_support.h
index 3d8b639ee4..584da8f0ca 100644
--- a/include/tvm/runtime/relax_vm/ndarray_cache_support.h
+++ b/include/tvm/runtime/relax_vm/ndarray_cache_support.h
@@ -63,10 +63,10 @@ struct NDArrayCacheMetadata {
 };
 
 /*! \brief Load a FileRecord into memory */
-Array<NDArray> Load(Device device,                   //
-                    const std::string& path_prefix,  //
-                    std::string* raw_data_buffer,    //
-                    Optional<NDArray>* staging_buffer = nullptr) const;
+TVM_DLL Array<NDArray> Load(Device device,                   //
+                            const std::string& path_prefix,  //
+                            std::string* raw_data_buffer,    //
+                            Optional<NDArray>* staging_buffer = nullptr) const;
 
 /*! \brief Relative path to the bin file */
 std::string data_path;
@@ -83,7 +83,7 @@ struct NDArrayCacheMetadata {
   std::string path;
 
   /*! \brief Load the metadata from a specific directory */
-  static NDArrayCacheMetadata Load(const std::string& path);
+  TVM_DLL static NDArrayCacheMetadata Load(const std::string& path);
   /*! \brief Load the metadata from a given JSON string */
   static NDArrayCacheMetadata LoadFromStr(const std::string& json_str, const 
std::string& path);
 };
diff --git a/src/runtime/disco/builtin.cc b/src/runtime/disco/builtin.cc
index 51fe4c13fc..911fdaae3d 100644
--- a/src/runtime/disco/builtin.cc
+++ b/src/runtime/disco/builtin.cc
@@ -85,11 +85,11 @@ void AllReduce(NDArray send, ReduceKind reduce_kind, 
NDArray recv) {
 
 void AllGather(NDArray send, NDArray recv) { GetCCLFunc("allgather")(send, 
recv); }
 
-void BroadcastFromWorker0(NDArray send, NDArray recv) {
+TVM_DLL void BroadcastFromWorker0(NDArray send, NDArray recv) {
   GetCCLFunc("broadcast_from_worker0")(send, recv);
 }
 
-void ScatterFromWorker0(Optional<NDArray> send, NDArray recv) {
+TVM_DLL void ScatterFromWorker0(Optional<NDArray> send, NDArray recv) {
   GetCCLFunc("scatter_from_worker0")(send, recv);
 }
 
diff --git a/src/runtime/disco/disco_worker.cc 
b/src/runtime/disco/disco_worker.cc
index d3c6d6a383..e8ba351e79 100644
--- a/src/runtime/disco/disco_worker.cc
+++ b/src/runtime/disco/disco_worker.cc
@@ -37,7 +37,7 @@ struct ThreadLocalDiscoWorker {
   }
 };
 
-DiscoWorker* DiscoWorker::ThreadLocal() {
+TVM_DLL DiscoWorker* DiscoWorker::ThreadLocal() {
   Di

(tvm) 01/01: Revert "[Fix] add TVM_DLL to disco functions (#16258)"

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch revert-16258-fix/add_tvm_dll_for_disco
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 541e221efe10eb3b5ec92b4d6d8130013b9509e8
Author: Junru Shao 
AuthorDate: Mon Dec 18 12:45:11 2023 -0800

Revert "[Fix] add TVM_DLL to disco functions (#16258)"

This reverts commit 7c352677568df0f12c49a4b5b8864b11fb37701f.



(tvm) branch revert-16258-fix/add_tvm_dll_for_disco created (now 541e221efe)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch revert-16258-fix/add_tvm_dll_for_disco
in repository https://gitbox.apache.org/repos/asf/tvm.git


  at 541e221efe Revert "[Fix] add TVM_DLL to disco functions (#16258)"

This branch includes the following new commits:

 new 541e221efe Revert "[Fix] add TVM_DLL to disco functions (#16258)"

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




(tvm) branch unity updated: [Unity] Fix ccache env for `nn.SourceModule` (#16257)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 056a84457b [Unity] Fix ccache env for `nn.SourceModule` (#16257)
056a84457b is described below

commit 056a84457b87fabec4bc05259ca207fd585d61b7
Author: Yaxing Cai 
AuthorDate: Mon Dec 18 12:44:45 2023 -0800

[Unity] Fix ccache env for `nn.SourceModule` (#16257)

This PR refactors the compilation of `nn.SourceModule` to enable ccache by 
using relative paths instead of absolute paths. It also sets the ccache 
environment variables so that the working directory is not included in the hash.
---
 python/tvm/relax/frontend/nn/extern.py | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/extern.py 
b/python/tvm/relax/frontend/nn/extern.py
index 2d20809d23..8c0491eaec 100644
--- a/python/tvm/relax/frontend/nn/extern.py
+++ b/python/tvm/relax/frontend/nn/extern.py
@@ -371,17 +371,24 @@ class SourceModule(ExternModule):  # pylint: 
disable=too-few-public-methods
 """Compiles the source code in a provided directory and returns the 
compiled artifact."""
 with tempfile.TemporaryDirectory() as temp_dir_str:
 temp_dir = Path(temp_dir_str)
-source_path = temp_dir / f"main{self.source_suffix}"
-object_path = temp_dir / f"main{self.output_suffix}"
+source_filename = f"main{self.source_suffix}"
+object_filename = f"main{self.output_suffix}"
+source_path = temp_dir / source_filename
+object_path = temp_dir / object_filename
 with source_path.open("w", encoding="utf-8") as file:
 file.write(self.source_code)
 _cc.create_shared(
-output=str(object_path),
-objects=[str(source_path)],
+output=object_filename,
+objects=[source_filename],
 options=self.compile_options,
 cc=self.compiler,
 cwd=temp_dir,
-ccache_env={"CCACHE_COMPILERCHECK": "content"} if 
shutil.which("ccache") else None,
+ccache_env={
+"CCACHE_COMPILERCHECK": "content",
+"CCACHE_NOHASHDIR": "1",
+}
+if shutil.which("ccache")
+else None,
 )
 shutil.move(str(object_path), str(output_path))
 



(tvm) branch unity updated (c796f47f95 -> 95f1b5c0e8)

2023-12-18 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


omit c796f47f95 fix
omit 0760ab0929 cleanup
omit f238c3b264 fix
omit fffbd9b79c fix
omit 976a951728 fix
omit 02f6f7daf2 fix
omit c606a069c3 kernel
omit 33989b48d9 moe update
omit bbb8f40d25 add kernel for copying cache blocks
omit f19ab37142 update FT rev to support H100
omit d0634f26ac enable sm90 codegen for thrust and vllm
omit e73558e41d WA for H100
omit 8e448d8f29 window_size_left should be max_seqlen_k, not num_key for 
the var_len case
omit abde85edbc fix window_size_left param for var len attention
omit 1fda9ad03e fix fp32 accum not getting enabled
omit 605a9ad8ce Add vllm kernels
omit 6e3c8d34cd use fp32 accum in cublas
omit adb9edfff0 (Hacky) VM allocator changes to support batched serving 
with memory profiling
omit 2f7691e245 disable sinfo check
 add aae1112a65 [Unity] Support constant args in `nn.ExternModule` (#16130)
 add 756ce9917f [Unity][3rdparty] Remove TVM in 3rdparty of FlashInfer 
(#16155)
 add 1de8b347d1 [Unity][DistIR] LowerGlobalViewToLocalView (#16095)
 add 2dcb8716e8 [Unity][BlockBuilder] Depracate `BlockBuilder.get()` and 
change it to `BlockBuilder.finalize()` (#16090)
 add 8f24a272a0 [Unity][MSC][M2.1] Add Manager for compile pipeline (#16163)
 add af803cf7b4 [Unity][DLight] Fix `general_reduction` for GroupNorm 
(#16161)
 add 64fe5a8a89 [Unity][DistIR] Add DTensor struct info propagation rule 
for stop_lift_params (#16170)
 add c640d0a3c9 [Unity][Web] Fix missing function NVTXScopedRange for web 
(#16177)
 add 8a6184ccfa [Unity, BYOC] Add check for leaking intemediate variables 
for cublas and cudnn (#16175)
 add a6adaae5ef [Unity][DistIR] LowerDistIR (#16169)
 add 85389efa2c [Unity][BYOC] Fix Flash var_len attention with sliding 
window (#16185)
 add 68443482c9 [Unity][Bugfix] Handle symbolic matching with 
non-structural match (#15994)
 add d52a9bf388 [Unity][Transform] Implement RemoveUnusedOutputs (#16117)
 add d6015c5643 [Unity][BugFix] Fix a bug in relax gelu_tanh computation 
(#16188)
 add fe9d2fe57d [Unity][Transform] Implement ExpandTupleArguments (#16115)
 add fc324d0f2c [Unity][Transform] Implement RemoveUnusedParameters (#16116)
 add 74667b97f0 [Unity] Enable ccache for `nn.SourceModule` (#16189)
 add ed2772f9c8 [Unity][MSC][M2.1] Add pruner for model pruning (#16186)
 add 9e4e17ca88 [Unity][WebGPU] Get params from cache by name (#16198)
 add a2f55a8812 [WEBGPU] Update to latest compilationHints API (#16197)
 add 8f95f6147a [Unity] [Transform] Remove iteration over functions in 
function pass (#16173)
 add 3c7067d6ed [Unity] Minor: Remove debug logging (#16200)
 add 34fd234f55 [Unity] Check usage location when canonicalizing trivial 
bindings (#16193)
 add 4e8c975700 [Unity][Bugfix] Fix 
`tests/python/topi/test_topi_transform.py::test_relax_dynamic_strided_slice` 
(#16205)
 add d0504027bb [Unity] Update FlashInfer (#16208)
 add ebbad09cd5 [Unity] Upgrade cutlass_fpA_intB_gemm (#16206)
 add 03fc4f6f03 [Dlight] Change max_threads on CUDA (#16203)
 add 58e622b74d [Unity][Transform] Implement Relax function inlining 
(#16194)
 add e0518da2a5 [Unity][MSC][M2.3] Add tracker for track layer datas 
(#16207)
 add 35e8404f17 [Disco] Expose `DiscoWorker` and `ndarray_cache_support` in 
header (#16153)
 add f18d186559 [Unity] Speed up NormalizeGlobalVar (#16219)
 add b5b980e33a [Unity] Support out dtype for nn.Linear and nn.MultiLinear 
(#16220)
 add 8241385f59 [Unity] De-duplicate calls to TensorStructInfo constructor 
(#16209)
 add 2772fb072a [Unity] Fix upstream tests that fail on unity branch 
(#16196)
 add c6d4926529 [Dlight] Fix NormalizePrimFunc with scalar block (#16156)
 add af14fbbbe1 [Relax] Fix to enable emit_te of topi scan/sort kernels 
(#16226)
 add 943508a295 [Unity] Fix typo in dlight fallback rule (#16230)
 add cbcb67c047 [Unity][Frontend] Add the `sum` op to frontend ops (#16225)
 add fe89ccc360 [Unity][Transform] Pass for automatically extracting 
DataflowBlocks (#16204)
 add f7b0193f9d [Unity] Fix IndexDataTypeNormalizer so that it correctly 
handles corner case (#16235)
 add e100a13737 [Unity] Fix legalizing strided slice (#16232)
 add 674167805c Revert "[Unity] Fix IndexDataTypeNormalizer so that it 
correctly handles corner case" (#16241)
 add 6118b770b1 [Unity] Improved error checking for DataflowBlock in nested 
SeqExpr (#16195)
 add e1964eceb5 [Unity] Add runtime debugging method to RelaxVM (#16238)
 add cd9445d63b [Unity][lm_support] window kvcache sink (#16240)
 add a2e19d21eb [Unity] Fix IndexDataTypeNormalizer so that it correctly 
handles corner case (#16245)
 add 8edfee8574 [Unity][MSC][M2.4] Add

(tvm) branch unity updated: [Fix] add TVM_DLL to disco functions (#16258)

2023-12-17 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 7c35267756 [Fix] add TVM_DLL to disco functions (#16258)
7c35267756 is described below

commit 7c352677568df0f12c49a4b5b8864b11fb37701f
Author: Lesheng Jin <34279105+lesheng...@users.noreply.github.com>
AuthorDate: Mon Dec 18 15:32:52 2023 +0800

[Fix] add TVM_DLL to disco functions (#16258)
---
 include/tvm/runtime/disco/builtin.h  |  4 ++--
 include/tvm/runtime/disco/disco_worker.h |  2 +-
 include/tvm/runtime/relax_vm/ndarray_cache_support.h | 10 +-
 src/runtime/disco/builtin.cc |  4 ++--
 src/runtime/disco/disco_worker.cc|  2 +-
 src/runtime/relax_vm/ndarray_cache_support.cc| 11 ++-
 6 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/include/tvm/runtime/disco/builtin.h 
b/include/tvm/runtime/disco/builtin.h
index 3847aef3f2..512059b31b 100644
--- a/include/tvm/runtime/disco/builtin.h
+++ b/include/tvm/runtime/disco/builtin.h
@@ -89,14 +89,14 @@ void AllGather(NDArray send, NDArray recv);
  * \param send The buffer to be broadcasted
  * \param recv The buffer receives the broadcasted array
  */
-void BroadcastFromWorker0(NDArray send, NDArray recv);
+TVM_DLL void BroadcastFromWorker0(NDArray send, NDArray recv);
 /*!
  * \brief Perform a scatter operation from worker-0, chunking the given buffer 
into equal parts.
  * \param send For worker-0, it must be provided, and otherwise, the buffer 
must be None.
  * The buffer will be divided into equal parts and sent to each worker 
accordingly.
  * \param recv The receiving buffer, which must not be None.
  */
-void ScatterFromWorker0(Optional<NDArray> send, NDArray recv);
+TVM_DLL void ScatterFromWorker0(Optional<NDArray> send, NDArray recv);
 /*!
  * \brief Perform a gather operation to worker-0.
  * \param send The sending buffer, which must not be None.
diff --git a/include/tvm/runtime/disco/disco_worker.h 
b/include/tvm/runtime/disco/disco_worker.h
index 0c666150d4..14f8f23807 100644
--- a/include/tvm/runtime/disco/disco_worker.h
+++ b/include/tvm/runtime/disco/disco_worker.h
@@ -60,7 +60,7 @@ class DiscoWorker {
   /*! \brief Main loop of the worker */
   void MainLoop();
   /*! \brief Get the worker instance on the current thread */
-  static DiscoWorker* ThreadLocal();
+  TVM_DLL static DiscoWorker* ThreadLocal();
   /*! \brief Set the specific register to a specific value */
   void SetRegister(int reg_id, TVMArgValue value);
 
diff --git a/include/tvm/runtime/relax_vm/ndarray_cache_support.h 
b/include/tvm/runtime/relax_vm/ndarray_cache_support.h
index 3d8b639ee4..584da8f0ca 100644
--- a/include/tvm/runtime/relax_vm/ndarray_cache_support.h
+++ b/include/tvm/runtime/relax_vm/ndarray_cache_support.h
@@ -63,10 +63,10 @@ struct NDArrayCacheMetadata {
 };
 
 /*! \brief Load a FileRecord into memory */
-Array<NDArray> Load(Device device,                   //
-                    const std::string& path_prefix,  //
-                    std::string* raw_data_buffer,    //
-                    Optional<NDArray>* staging_buffer = nullptr) const;
+TVM_DLL Array<NDArray> Load(Device device,                   //
+                            const std::string& path_prefix,  //
+                            std::string* raw_data_buffer,    //
+                            Optional<NDArray>* staging_buffer = nullptr) const;
 
 /*! \brief Relative path to the bin file */
 std::string data_path;
@@ -83,7 +83,7 @@ struct NDArrayCacheMetadata {
   std::string path;
 
   /*! \brief Load the metadata from a specific directory */
-  static NDArrayCacheMetadata Load(const std::string& path);
+  TVM_DLL static NDArrayCacheMetadata Load(const std::string& path);
   /*! \brief Load the metadata from a given JSON string */
   static NDArrayCacheMetadata LoadFromStr(const std::string& json_str, const 
std::string& path);
 };
diff --git a/src/runtime/disco/builtin.cc b/src/runtime/disco/builtin.cc
index 51fe4c13fc..911fdaae3d 100644
--- a/src/runtime/disco/builtin.cc
+++ b/src/runtime/disco/builtin.cc
@@ -85,11 +85,11 @@ void AllReduce(NDArray send, ReduceKind reduce_kind, 
NDArray recv) {
 
 void AllGather(NDArray send, NDArray recv) { GetCCLFunc("allgather")(send, 
recv); }
 
-void BroadcastFromWorker0(NDArray send, NDArray recv) {
+TVM_DLL void BroadcastFromWorker0(NDArray send, NDArray recv) {
   GetCCLFunc("broadcast_from_worker0")(send, recv);
 }
 
-void ScatterFromWorker0(Optional<NDArray> send, NDArray recv) {
+TVM_DLL void ScatterFromWorker0(Optional<NDArray> send, NDArray recv) {
   GetCCLFunc("scatter_from_worker0")(send, recv);
 }
 
diff --git a/src/runtime/disco/disco_worker.cc 
b/src/runtime/disco/disco_worker.cc
index d3c6d6a383..e8ba351e79 100644
--- a/src/runtime/d

(tvm) branch unity updated: [Unity][nn.Module] Refactor `ExternModule` (#16247)

2023-12-16 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new a0e58987b0 [Unity][nn.Module] Refactor `ExternModule` (#16247)
a0e58987b0 is described below

commit a0e58987b07743c6880e3d73a6b15cb7ea995783
Author: Junru Shao 
AuthorDate: Sat Dec 16 13:59:51 2023 -0800

[Unity][nn.Module] Refactor `ExternModule` (#16247)

`nn.ExternModule` allows handcrafted kernels to be incorporated into the
compilation stack and invoked from Relax just like TIR or any other
ordinary operator. This PR simplifies its workflow.

The system consists of the abstract base class `ExternModule` and its
two derivatives:
- `.o` (object files) can be linked using `ObjectModule`.
- `.cpp` (C++ files) and `.cu` (CUDA files) can be compiled and linked
  into the system using `SourceModule`.

**Symbols, and shape/dtype inference.**
To provide the system with sufficient information about the kernels, it
is required to provide all symbols of an external module, as well as a
method for each symbol that tells the system about the output dtype/shape
of this symbol.

Consider a case where function `my_func` accepts two tensors, `a` of
shape `(x, y, 1)`, `b` of shape `(y, z, 5)`, and then produces a tensor
`c` of shape `(x, y, z, 9)`, the shape/dtype inference function should
look like:

```python
def shape_dtype_inference(a, b):
  x, y, _ = a.shape
  _, z, _ = b.shape
  return nn.Tensor.placeholder((x, y, z, 9), dtype="float32")
```

Regarding the interface, the symbols and their corresponding shape/dtype
inference function should be provided as a Python dictionary that maps
each symbol to the function as below:

```python
symbols={
  "my_func": shape_dtype_inference,
}
```

**Calling convention.**
All external modules now follow "destination-passing-style" (DPS)
calling convention, which means the returned tensors are pre-allocated
by the system already and passed in as an argument of the external
function.

Reusing the example above, the implementation of `my_func` should include
three parameters in its signature, where tensors are represented using
DLTensor from DLPack, the de facto standard of in-memory representation
of tensors. More info on DLPack:
https://github.com/dmlc/dlpack/blob/v0.8/include/dlpack/dlpack.h#L163-L206.

To expose the symbol, `TVM_DLL_EXPORT_TYPED_FUNC(symbol, function)` is
guaranteed to be available:

```C++
// those headers are guaranteed to be available
#include 
#include 
#include 
namespace {
// anonymous namespace hides the symbol `_my_func_impl` from other TUs
int _my_func_impl(DLTensor* a, DLTensor* b, DLTensor* c) {
// `a` and `b` are inputs, and `c` is the output
}
}
// expose symbol `my_func` instead of `_my_func_impl`
TVM_DLL_EXPORT_TYPED_FUNC(my_func, _my_func_impl);
```

**A compiler pass `AttachExternModules`.**
It is introduced to attach a list of `nn.ExternModule`s into an IRModule
at any stage of the compilation pipeline, and attach the compiled external
modules as `runtime.Module`s into IRModule's `external_mods` attribute.
It is required for linking in `relax.build`, but with this pass in
place, source compilation can be deferred to an arbitrary stage of TVM
compilation.

**Caveats.**
It is required to call `nn.add_extern` to register external modules exactly
once during `export_tvm`. Each symbol should be registered exactly once to
avoid potential conflicts; otherwise an error will be raised. This
programming model might be a bit of a constraint, and we will consider
loosening it slightly in the future.

Also, for backward compatibility, `ExternModule`s are exported from
`export_tvm` only when the `allow_extern` flag is turned on. Otherwise, any
external module will cause an exception asking to turn on the flag.
---
 3rdparty/flashinfer|   2 +-
 python/tvm/contrib/cc.py   |   2 +-
 python/tvm/relax/frontend/nn/__init__.py   |  13 +-
 python/tvm/relax/frontend/nn/core.py   | 381 +++-
 python/tvm/relax/frontend/nn/exporter.py   | 314 
 python/tvm/relax/frontend/nn/extern.py | 392 
 python/tvm/relax/frontend/nn/modules.py|  66 
 python/tvm/relax/frontend/nn/op.py | 119 +++
 python/tvm/relax/frontend/nn/spec.py   | 393 +
 python/tvm/relax/transform/__init__.py |   1 +
 .../tvm/relax/transform/attach_external_modules.

(tvm) branch unity updated: [Unity] Avoid to use `std::regex` (#16249)

2023-12-15 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new f794db4373 [Unity] Avoid to use `std::regex` (#16249)
f794db4373 is described below

commit f794db4373d48ee8a5c0b6d47e9a70019a1613bd
Author: Siyuan Feng 
AuthorDate: Sat Dec 16 08:15:38 2023 +0800

[Unity] Avoid to use `std::regex` (#16249)

`std::regex` in the TVM codebase may cause a symbol conflict with PyTorch.
We temporarily disable it until we find a better solution; meanwhile,
the current usage of `std::regex` is not necessary.
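
For reference, a minimal sketch (not part of this commit) of the new
substring-based matching semantics for `is_gv`, assuming `is_gv` and
`rx.GlobalVar` are importable as in the updated test file below:

```python
# Sketch only: regex patterns are no longer interpreted; plain substring search is used.
from tvm import relax as rx
from tvm.relax.dpl import is_gv

assert is_gv("x").match(rx.GlobalVar("x"))          # exact name still matches
assert is_gv("x").match(rx.GlobalVar("x_2"))        # "x" is a substring of "x_2"
assert not is_gv("x.*").match(rx.GlobalVar("x_2"))  # "x.*" is treated literally, not as a regex
assert not is_gv("x").match(rx.GlobalVar("y"))
```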
---
 src/node/script_printer.cc| 15 ---
 src/relax/ir/dataflow_matcher.cc  |  7 ---
 src/runtime/contrib/cublas/cublas_json_runtime.cc |  1 -
 src/runtime/contrib/cudnn/cudnn_json_runtime.cc   |  3 +--
 tests/python/relax/test_dataflow_pattern.py   |  4 +++-
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/node/script_printer.cc b/src/node/script_printer.cc
index f2d985279f..38334de357 100644
--- a/src/node/script_printer.cc
+++ b/src/node/script_printer.cc
@@ -21,7 +21,7 @@
 #include 
 #include 
 
-#include 
+#include 
 
 namespace tvm {
 
@@ -38,8 +38,17 @@ std::string TVMScriptPrinter::Script(const ObjectRef& node, 
const Optional 0 &&//
+ (std::isalpha(name[0]) || name[0] == '_') &&  //
+ std::all_of(name.begin() + 1, name.end(),
+ [](char c) { return std::isalnum(c) || c == '_'; });
 }
 
 PrinterConfig::PrinterConfig(Map config_dict) {
diff --git a/src/relax/ir/dataflow_matcher.cc b/src/relax/ir/dataflow_matcher.cc
index 9524c90b57..7fb67d9376 100644
--- a/src/relax/ir/dataflow_matcher.cc
+++ b/src/relax/ir/dataflow_matcher.cc
@@ -36,7 +36,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -557,8 +557,9 @@ bool DFPatternMatcher::VisitDFPattern_(const 
DataflowVarPatternNode* op, const E
 bool DFPatternMatcher::VisitDFPattern_(const GlobalVarPatternNode* op, const 
Expr& expr) {
   // GlobalVarPattern is not inherited from Var, so we need to handle it 
separately.
   if (const auto* var_node = expr.as<GlobalVarNode>()) {
-std::regex pat{std::string(op->name_hint())};
-return "" == op->name_hint() || 
std::regex_search(std::string(var_node->name_hint), pat);
+std::string pat = std::string(op->name_hint());
+std::string var_name = std::string(var_node->name_hint);
+return pat.empty() || var_name.find(pat) != std::string::npos;
   }
   return false;
 }
diff --git a/src/runtime/contrib/cublas/cublas_json_runtime.cc 
b/src/runtime/contrib/cublas/cublas_json_runtime.cc
index c6916d4f86..23e35d2f71 100644
--- a/src/runtime/contrib/cublas/cublas_json_runtime.cc
+++ b/src/runtime/contrib/cublas/cublas_json_runtime.cc
@@ -26,7 +26,6 @@
 #include 
 
 #include 
-#include 
 #include 
 #include 
 
diff --git a/src/runtime/contrib/cudnn/cudnn_json_runtime.cc 
b/src/runtime/contrib/cudnn/cudnn_json_runtime.cc
index 58e4e59afc..7d701396d0 100644
--- a/src/runtime/contrib/cudnn/cudnn_json_runtime.cc
+++ b/src/runtime/contrib/cudnn/cudnn_json_runtime.cc
@@ -26,7 +26,6 @@
 #include 
 
 #include 
-#include 
 #include 
 #include 
 
@@ -54,7 +53,7 @@ class cuDNNJSONRuntime : public JSONRuntimeBase {
 stream = static_cast((*func)().operator void*());
 
 auto attr_in_name = [](const std::string& op_name, const std::string& 
attr_name) {
-  return std::regex_search(op_name, std::regex(attr_name));
+  return op_name.find(attr_name) != std::string::npos;
 };
 
 auto vstr2vint = [](const JSONGraphNode& node, const std::string& attrStr) 
{
diff --git a/tests/python/relax/test_dataflow_pattern.py 
b/tests/python/relax/test_dataflow_pattern.py
index 685a382ad7..edd3bd1610 100644
--- a/tests/python/relax/test_dataflow_pattern.py
+++ b/tests/python/relax/test_dataflow_pattern.py
@@ -97,7 +97,9 @@ def test_dataflow_var_pattern():
 
 def test_global_var_pattern():
 assert is_gv("x").match(rx.GlobalVar("x"))
-assert is_gv("x.*").match(rx.GlobalVar("x_2"))
+# TODO: disabled as regex is not supported due to
+# symbol conflict with PyTorch
+# assert is_gv("x.*").match(rx.GlobalVar("x_2"))
 assert is_gv().match(rx.GlobalVar("x"))
 assert not is_gv("x").match(rx.GlobalVar("y"))
 assert not is_gv("x").match(rx.Var("x"))



(tvm) branch unity updated: [Disco] Expose `DiscoWorker` and `ndarray_cache_support` in header (#16153)

2023-12-10 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 35e8404f17 [Disco] Expose `DiscoWorker` and `ndarray_cache_support` in 
header (#16153)
35e8404f17 is described below

commit 35e8404f17a2aae2e00110748ecce515395a5428
Author: Lesheng Jin <34279105+lesheng...@users.noreply.github.com>
AuthorDate: Sun Dec 10 18:25:40 2023 +0800

[Disco] Expose `DiscoWorker` and `ndarray_cache_support` in header (#16153)
---
 {src => include/tvm}/runtime/disco/builtin.h   |  38 -
 .../tvm/runtime/disco/disco_worker.h   |  71 +
 include/tvm/runtime/disco/session.h|  20 ++-
 .../tvm}/runtime/relax_vm/ndarray_cache_support.h  |  39 ++---
 python/tvm/relax/frontend/nn/core.py   |   2 +
 python/tvm/relax/frontend/nn/op.py |  39 +
 python/tvm/runtime/disco/process_pool.py   |  16 +-
 python/tvm/runtime/disco/session.py|   3 +-
 src/runtime/disco/bcast_session.h  |   3 +-
 src/runtime/disco/builtin.cc   |   5 +-
 src/runtime/disco/{worker.cc => disco_worker.cc}   |   8 +-
 src/runtime/disco/disco_worker_thread.h|  83 +++
 src/runtime/disco/loader.cc|  89 ++--
 src/runtime/disco/nccl/nccl.cc |   5 +-
 src/runtime/disco/process_session.cc   |   9 +-
 src/runtime/disco/session.cc   |   3 +-
 src/runtime/disco/threaded_session.cc  |   3 +-
 src/runtime/disco/utils.h  |  32 +---
 src/runtime/relax_vm/ndarray_cache_support.cc  | 161 ++---
 19 files changed, 379 insertions(+), 250 deletions(-)

diff --git a/src/runtime/disco/builtin.h b/include/tvm/runtime/disco/builtin.h
similarity index 81%
rename from src/runtime/disco/builtin.h
rename to include/tvm/runtime/disco/builtin.h
index cfbf2e2477..3847aef3f2 100644
--- a/src/runtime/disco/builtin.h
+++ b/include/tvm/runtime/disco/builtin.h
@@ -25,11 +25,37 @@
 
 #include 
 
-#include "./utils.h"
-
 namespace tvm {
 namespace runtime {
 
+/*!
+ * \brief Possible kinds of reduction operations.
+ */
+enum class ReduceKind : int32_t {
+  kSum = 0,
+  kProd = 1,
+  kMin = 2,
+  kMax = 3,
+  kAvg = 4,
+};
+
+/*! \brief Converts `ReduceKind` to string */
+inline std::string ReduceKind2String(ReduceKind kind) {
+  switch (kind) {
+case ReduceKind::kSum:
+  return "kSum";
+case ReduceKind::kProd:
+  return "kProd";
+case ReduceKind::kMin:
+  return "kMin";
+case ReduceKind::kMax:
+  return "kMax";
+case ReduceKind::kAvg:
+  return "kAvg";
+  }
+  LOG(FATAL) << "ValueError: Unknown ReduceKind: " << static_cast<int>(kind);
+}
+
 /*!
  * \brief Load a runtime Module, then create and initialize a RelaxVM
  * \param path The path to the runtime Module (a DSO file) to be loaded
@@ -49,19 +75,19 @@ NDArray DiscoEmptyNDArray(ShapeTuple shape, DataType dtype, 
Device device);
  * \brief Perform an allreduce operation using the underlying communication 
library
  * \param send The array send to perform allreduce on
  * \param reduce_kind The kind of reduction operation (e.g. sum, avg, min, max)
- * \return The outcome of allreduce
+ * \param recv The array receives the outcome of allreduce
  */
 void AllReduce(NDArray send, ReduceKind reduce_kind, NDArray recv);
 /*!
  * \brief Perform an allgather operation using the underlying communication 
library
  * \param send The array send to perform allgather on
- * \return The outcome of allgather
+ * \param recv The array receives the outcome of allgather
  */
 void AllGather(NDArray send, NDArray recv);
 /*!
  * \brief Perform a broadcast operation from worker-0
- * \param buffer The buffer to be broadcasted
- * \return The result buffer
+ * \param send The buffer to be broadcasted
+ * \param recv The buffer receives the broadcasted array
  */
 void BroadcastFromWorker0(NDArray send, NDArray recv);
 /*!
diff --git a/src/runtime/disco/worker.h 
b/include/tvm/runtime/disco/disco_worker.h
similarity index 60%
rename from src/runtime/disco/worker.h
rename to include/tvm/runtime/disco/disco_worker.h
index e948fa1668..0c666150d4 100644
--- a/src/runtime/disco/worker.h
+++ b/include/tvm/runtime/disco/disco_worker.h
@@ -17,44 +17,22 @@
  * under the License.
  */
 /*!
- * \file worker.h
+ * \file disco_worker.h
  * \brief This file defines a worker in Disco. A worker can be launched in a 
separate thread or
  * process as long as the channel supports bi-directional communication 
in-between the worker and
  * the controler.
  */
-#ifndef TVM_RUNTIME_DISCO_WORKER_H_
-#define TVM_RUNTIME_DISCO_WORKER_H_
+#ifndef TVM_RUNTIME_DISCO_DISCO_WORKER_H_
+#defi

(tvm) branch main updated: [Device][Metal] Fix metal warp size (#16192)

2023-12-02 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 2eb17fa87f [Device][Metal] Fix metal warp size (#16192)
2eb17fa87f is described below

commit 2eb17fa87f5661cf25ebf516cd405ef7be34fc40
Author: Hongyi Jin 
AuthorDate: Sat Dec 2 03:29:42 2023 -0500

[Device][Metal] Fix metal warp size (#16192)

Metal warp size should be 1 on x86 devices, but 32 on M1/M2 devices.
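
For context, a minimal sketch (not from this commit) of how the attribute
surfaces on the Python side; it assumes a build with the Metal runtime
enabled and a Metal device at index 0:

```python
# Sketch only: query the warp-size attribute that this patch adjusts.
import tvm

dev = tvm.metal(0)
if dev.exist:
    # Expected to be 32 on Apple silicon (M1/M2) and 1 on x86 after this change.
    print("Metal warp size:", dev.warp_size)
```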
---
 src/runtime/metal/metal_device_api.mm | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/runtime/metal/metal_device_api.mm 
b/src/runtime/metal/metal_device_api.mm
index c0fe0b76a0..f7c2976d22 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -55,8 +55,14 @@ void MetalWorkspace::GetAttr(Device dev, DeviceAttrKind 
kind, TVMRetValue* rv) {
 break;
   }
   case kWarpSize: {
-// Set warp size to be 1 for safty reason.
+#if defined(__x86_64__)
 *rv = 1;
+#elif defined(__aarch64__)
+*rv = 32;
+#else
+LOG(WARNING) << "The CPU architecture is neither x86 nor aarch64. 
Fallback to warp size 1.";
+*rv = 1;
+#endif
 break;
   }
   case kMaxSharedMemoryPerBlock:



(tvm) branch unity updated: [Unity] Enable ccache for `nn.SourceModule` (#16189)

2023-12-01 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 74667b97f0 [Unity] Enable ccache for `nn.SourceModule` (#16189)
74667b97f0 is described below

commit 74667b97f06fbd7ffa5f43378b92168437d2d521
Author: Yaxing Cai 
AuthorDate: Fri Dec 1 16:49:28 2023 -0800

[Unity] Enable ccache for `nn.SourceModule` (#16189)

This PR is a mirror PR of #16176, and enables ccache in `nn.SourceModule` 
compilation.
---
 python/tvm/contrib/cc.py | 52 +++-
 python/tvm/relax/frontend/nn/core.py | 19 +++--
 tests/python/contrib/test_ccache.py  | 79 
 3 files changed, 136 insertions(+), 14 deletions(-)

diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
index 561ce0134d..8ad70dc254 100644
--- a/python/tvm/contrib/cc.py
+++ b/python/tvm/contrib/cc.py
@@ -64,7 +64,7 @@ def get_cc():
 return None
 
 
-def create_shared(output, objects, options=None, cc=None):
+def create_shared(output, objects, options=None, cc=None, cwd=None, 
ccache_env=None):
 """Create shared library.
 
 Parameters
@@ -80,13 +80,19 @@ def create_shared(output, objects, options=None, cc=None):
 
 cc : Optional[str]
 The compiler command.
+
+cwd : Optional[str]
+The current working directory.
+
+ccache_env : Optional[Dict[str, str]]
+The environment variable for ccache. Set `None` to disable ccache by 
default.
 """
 cc = cc or get_cc()
 
 if _is_linux_like():
-_linux_compile(output, objects, options, cc, compile_shared=True)
+_linux_compile(output, objects, options, cc, cwd, ccache_env, 
compile_shared=True)
 elif _is_windows_like():
-_windows_compile(output, objects, options)
+_windows_compile(output, objects, options, cwd, ccache_env)
 else:
 raise ValueError("Unsupported platform")
 
@@ -139,7 +145,7 @@ def create_staticlib(output, inputs, ar=None):
 raise ValueError("Unsupported platform")
 
 
-def create_executable(output, objects, options=None, cc=None):
+def create_executable(output, objects, options=None, cc=None, cwd=None, 
ccache_env=None):
 """Create executable binary.
 
 Parameters
@@ -155,13 +161,19 @@ def create_executable(output, objects, options=None, 
cc=None):
 
 cc : Optional[str]
 The compiler command.
+
+cwd : Optional[str]
+The current working directory.
+
+ccache_env : Optional[Dict[str, str]]
+The environment variable for ccache. Set `None` to disable ccache by 
default.
 """
 cc = cc or get_cc()
 
 if _is_linux_like():
-_linux_compile(output, objects, options, cc)
+_linux_compile(output, objects, options, cc, cwd, ccache_env)
 elif _is_windows_like():
-_windows_compile(output, objects, options)
+_windows_compile(output, objects, options, cwd, ccache_env)
 else:
 raise ValueError("Unsupported platform")
 
@@ -275,7 +287,9 @@ def cross_compiler(
 return _fcompile
 
 
-def _linux_compile(output, objects, options, compile_cmd, 
compile_shared=False):
+def _linux_compile(
+output, objects, options, compile_cmd, cwd=None, ccache_env=None, 
compile_shared=False
+):
 cmd = [compile_cmd]
 if compile_cmd != "nvcc":
 if compile_shared or output.endswith(".so") or 
output.endswith(".dylib"):
@@ -294,7 +308,15 @@ def _linux_compile(output, objects, options, compile_cmd, 
compile_shared=False):
 cmd += objects
 if options:
 cmd += options
-proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, 
stderr=subprocess.STDOUT)
+env = None
+if ccache_env is not None:
+if shutil.which("ccache"):
+cmd.insert(0, "ccache")
+env = os.environ.copy()
+env.update(ccache_env)
+else:
+raise ValueError("ccache not found")
+proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, 
stderr=subprocess.STDOUT, cwd=cwd, env=env)
 (out, _) = proc.communicate()
 if proc.returncode != 0:
 msg = "Compilation error:\n"
@@ -303,7 +325,7 @@ def _linux_compile(output, objects, options, compile_cmd, 
compile_shared=False):
 raise RuntimeError(msg)
 
 
-def _windows_compile(output, objects, options):
+def _windows_compile(output, objects, options, cwd=None, ccache_env=None):
 cmd = ["clang"]
 cmd += ["-O2"]
 
@@ -318,9 +340,19 @@ def _windows_compile(output, objects, options):
 cmd += objects
 if options:
 cmd += options
+env = None
+if ccache_env is not None:
+if shutil.which("ccache"):
+cmd.insert(0, "

(tvm) branch main updated: Enable ccache to accelerate contrib compilation (#16176)

2023-11-30 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 1994f402e6 Enable ccache to accelerate contrib compilation (#16176)
1994f402e6 is described below

commit 1994f402e69281a53df37a77acf935798eb856bb
Author: Yaxing Cai 
AuthorDate: Thu Nov 30 14:25:22 2023 -0800

Enable ccache to accelerate contrib compilation (#16176)

This PR adds a ccache interface to `contrib.cc` so that ccache can be used 
when creating shared libraries or executables.
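
For reference, a minimal usage sketch of the new parameters (not part of the
diff below); the file names, directory, and compiler are illustrative, and
ccache must be installed for `ccache_env` to take effect:

```python
# Sketch only: build a shared library through tvm.contrib.cc with ccache enabled.
from tvm.contrib import cc

cc.create_shared(
    output="libexample.so",      # hypothetical output name, resolved relative to cwd
    objects=["example.cc"],      # hypothetical source/object file
    cc="g++",
    cwd="/tmp/example_build",    # hypothetical working directory
    ccache_env={"CCACHE_COMPILERCHECK": "content"},  # pass None to disable ccache
)
```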
---
 python/tvm/contrib/cc.py| 58 +--
 tests/python/contrib/test_ccache.py | 79 +
 2 files changed, 126 insertions(+), 11 deletions(-)

diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py
index ad6a82c49c..918e3c8f72 100644
--- a/python/tvm/contrib/cc.py
+++ b/python/tvm/contrib/cc.py
@@ -33,6 +33,10 @@ def _is_linux_like():
 )
 
 
+def _is_windows_like():
+return sys.platform == "win32"
+
+
 def get_cc():
 """Return the path to the default C/C++ compiler.
 
@@ -58,7 +62,7 @@ def get_cc():
 return None
 
 
-def create_shared(output, objects, options=None, cc=None):
+def create_shared(output, objects, options=None, cc=None, cwd=None, 
ccache_env=None):
 """Create shared library.
 
 Parameters
@@ -74,13 +78,19 @@ def create_shared(output, objects, options=None, cc=None):
 
 cc : Optional[str]
 The compiler command.
+
+cwd : Optional[str]
+The current working directory.
+
+ccache_env : Optional[Dict[str, str]]
+The environment variable for ccache. Set `None` to disable ccache by 
default.
 """
 cc = cc or get_cc()
 
 if _is_linux_like():
-_linux_compile(output, objects, options, cc, compile_shared=True)
-elif sys.platform == "win32":
-_windows_compile(output, objects, options)
+_linux_compile(output, objects, options, cc, cwd, ccache_env, 
compile_shared=True)
+elif _is_windows_like():
+_windows_compile(output, objects, options, cwd, ccache_env)
 else:
 raise ValueError("Unsupported platform")
 
@@ -133,7 +143,7 @@ def create_staticlib(output, inputs, ar=None):
 raise ValueError("Unsupported platform")
 
 
-def create_executable(output, objects, options=None, cc=None):
+def create_executable(output, objects, options=None, cc=None, cwd=None, 
ccache_env=None):
 """Create executable binary.
 
 Parameters
@@ -149,13 +159,19 @@ def create_executable(output, objects, options=None, 
cc=None):
 
 cc : Optional[str]
 The compiler command.
+
+cwd : Optional[str]
+The current working directory.
+
+ccache_env : Optional[Dict[str, str]]
+The environment variable for ccache. Set `None` to disable ccache by 
default.
 """
 cc = cc or get_cc()
 
 if _is_linux_like():
-_linux_compile(output, objects, options, cc)
+_linux_compile(output, objects, options, cc, cwd, ccache_env)
 elif sys.platform == "win32":
-_windows_compile(output, objects, options)
+_windows_compile(output, objects, options, cwd, ccache_env)
 else:
 raise ValueError("Unsupported platform")
 
@@ -269,7 +285,9 @@ def cross_compiler(
 return _fcompile
 
 
-def _linux_compile(output, objects, options, compile_cmd, 
compile_shared=False):
+def _linux_compile(
+output, objects, options, compile_cmd, cwd=None, ccache_env=None, 
compile_shared=False
+):
 cmd = [compile_cmd]
 if compile_cmd != "nvcc":
 if compile_shared or output.endswith(".so") or 
output.endswith(".dylib"):
@@ -288,7 +306,15 @@ def _linux_compile(output, objects, options, compile_cmd, 
compile_shared=False):
 cmd += objects
 if options:
 cmd += options
-proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, 
stderr=subprocess.STDOUT)
+env = None
+if ccache_env is not None:
+if shutil.which("ccache"):
+cmd.insert(0, "ccache")
+env = os.environ.copy()
+env.update(ccache_env)
+else:
+raise ValueError("ccache not found")
+proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, 
stderr=subprocess.STDOUT, cwd=cwd, env=env)
 (out, _) = proc.communicate()
 if proc.returncode != 0:
 msg = "Compilation error:\n"
@@ -297,7 +323,7 @@ def _linux_compile(output, objects, options, compile_cmd, 
compile_shared=False):
 raise RuntimeError(msg)
 
 
-def _windows_compile(output, objects, options):
+def _windows_compile(output, objects, options, cwd=None, ccache_env=None):
 cmd = ["clang"]
 cmd += ["

(tvm) branch unity updated: [Unity][3rdparty] Remove TVM in 3rdparty of FlashInfer (#16155)

2023-11-21 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 756ce9917f [Unity][3rdparty] Remove TVM in 3rdparty of FlashInfer 
(#16155)
756ce9917f is described below

commit 756ce9917f39d0f4efbaeefc9f3ee4ffc3ec1af0
Author: Ruihang Lai 
AuthorDate: Wed Nov 22 02:56:37 2023 -0500

[Unity][3rdparty] Remove TVM in 3rdparty of FlashInfer (#16155)

This PR updates the FlashInfer version in 3rdparty, which
has removed TVM from its 3rdparty to avoid recursive TVM
dependency.
---
 3rdparty/flashinfer | 2 +-
 CMakeLists.txt  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/3rdparty/flashinfer b/3rdparty/flashinfer
index b78ea385de..4aa8130e05 160000
--- a/3rdparty/flashinfer
+++ b/3rdparty/flashinfer
@@ -1 +1 @@
-Subproject commit b78ea385de26137979025c585e378a294e70e014
+Subproject commit 4aa8130e05a2cd9e1f4077979535de8c1c8c71cd
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29aa865a59..b09ad9e542 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -912,6 +912,7 @@ option(USE_FLASHINFER "Build TVM with FlashInfer" OFF)
 if (USE_FLASHINFER STREQUAL "ON")
   message(STATUS "Build with FlashInfer")
   set(FLASHINFER_TVM_BINDING ON)
+  set(FLASHINFER_TVM_HOME ${PROJECT_SOURCE_DIR})
   set(FLASHINFER_ENABLE_FP8 OFF)
   set(FLASHINFER_PREFILL OFF)
   set(FLASHINFER_DECODE OFF)



(tvm) branch unity updated: [Unity] Support constant args in `nn.ExternModule` (#16130)

2023-11-21 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new aae1112a65 [Unity] Support constant args in `nn.ExternModule` (#16130)
aae1112a65 is described below

commit aae1112a6565816dd60e95f6cc4e3bd4fd9c965a
Author: Yaxing Cai 
AuthorDate: Tue Nov 21 23:55:29 2023 -0800

[Unity] Support constant args in `nn.ExternModule` (#16130)

This PR introduces `spec.ConstInt`, `spec.ConstFloat` and 
`spec.ConstString` for `nn.ExternModule` args, enabling constant inputs 
for extern module functions.
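
For reference, a minimal sketch (not part of the diff below) of constructing
the new spec objects; the dtype strings are illustrative:

```python
# Sketch only: the constant argument specs introduced by this PR.
from tvm.relax.frontend.nn import spec

const_i = spec.ConstInt("int64")  # repr: const.int(int64)
const_f = spec.ConstFloat()       # repr: const.float
const_s = spec.ConstString()      # a string constant spec
print(repr(const_i), repr(const_f), repr(const_s))
```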
---
 python/tvm/relax/frontend/nn/core.py   | 27 -
 python/tvm/relax/frontend/nn/spec.py   | 42 +-
 .../python/relax/test_frontend_nn_extern_module.py | 64 +-
 3 files changed, 127 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relax/frontend/nn/core.py 
b/python/tvm/relax/frontend/nn/core.py
index ffda5c7901..e6c9a64f92 100644
--- a/python/tvm/relax/frontend/nn/core.py
+++ b/python/tvm/relax/frontend/nn/core.py
@@ -474,9 +474,15 @@ class ExternModule(Module):
 from . import spec as _spec
 from .op import _wrap_nested
 
-def extern_func(*args: Tensor) -> Tensor:
+def extern_func(
+*args: List[
+Union[_spec.Tensor, _spec.ConstInt, _spec.ConstFloat, 
_spec.ConstString]
+]
+) -> Tensor:
 spec2var = {}
 for arg, arg_spec in zip(args, function_spec.args):
+if not isinstance(arg_spec, _spec.Tensor):
+continue
 for value, value_spec in zip(arg.shape, 
arg_spec.shape):
 if isinstance(value_spec, str):
 if not value_spec in spec2var:
@@ -503,10 +509,27 @@ class ExternModule(Module):
 out_shape,  # type: ignore[arg-type]
 func_spec_ret.dtype,
 )
+relax_args = []
+for arg, arg_spec in zip(args, function_spec.args):
+if isinstance(arg_spec, _spec.Tensor):
+relax_args.append(arg._expr)
+elif isinstance(arg_spec, _spec.ConstInt):
+if arg_spec.dtype is None:
+relax_args.append(rx.PrimValue(int(arg)))
+else:
+
relax_args.append(rx.PrimValue(tir.IntImm(arg_spec.dtype, arg)))
+elif isinstance(arg_spec, _spec.ConstFloat):
+if arg_spec.dtype is None:
+relax_args.append(rx.PrimValue(float(arg)))
+else:
+
relax_args.append(rx.PrimValue(tir.FloatImm(arg_spec.dtype, arg)))
+elif isinstance(arg_spec, _spec.ConstString):
+relax_args.append(rx.StringImm(arg))
+
 ret_tensor = _wrap_nested(
 call_dps_packed(
 func_name,
-args=RxTuple([tensor._expr for tensor in args]),
+args=RxTuple(relax_args),
 out_sinfo=out_sinfo,
 ),
 func_name,
diff --git a/python/tvm/relax/frontend/nn/spec.py 
b/python/tvm/relax/frontend/nn/spec.py
index fb7f2588fa..07d21cff1b 100644
--- a/python/tvm/relax/frontend/nn/spec.py
+++ b/python/tvm/relax/frontend/nn/spec.py
@@ -78,6 +78,44 @@ class Tuple:  # pylint: disable=too-few-public-methods
 return self.elements.__repr__()
 
 
+class ConstInt:  # pylint: disable=too-few-public-methods
+"""An integer constant"""
+
+dtype: typing.Optional[str]
+
+def __init__(self, dtype: str = None) -> None:
+self.dtype = dtype
+
+def __repr__(self) -> str:
+if self.dtype is None:
+return "const.int"
+return f"const.int({self.dtype})"
+
+
+class ConstFloat:  # pylint: disable=too-few-public-methods
+"""A float constant"""
+
+dtype: typing.Optional[str]
+
+def __init__(self, dtype: str = None) -> None:
+self.dtype = dtype
+
+def __repr__(self) -> str:
+if self.dtype is None:
+return "const.float"
+return f"const.float({self.dtype})"
+
+
+class ConstString:  # pylint: disable=too-few-public-methods
+"""A string constant"""
+
+def __init__(self) -> None:
+pass
+
+def __repr

(tvm) branch unity updated: [Runtime] Allowing Packed Arguments in TVM Module VTable (#16148)

2023-11-19 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 4e70c28217 [Runtime] Allowing Packed Arguments in TVM Module VTable 
(#16148)
4e70c28217 is described below

commit 4e70c282171c26fe081f3773252143ad687170f6
Author: Junru Shao 
AuthorDate: Sun Nov 19 12:05:04 2023 -0800

[Runtime] Allowing Packed Arguments in TVM Module VTable (#16148)

Prior to this PR, the `TVM_MODULE_VTABLE_*` macros work by spelling out the
correspondence between string names and module methods, thereby declaring
the calling convention of a TVM Module.

For example, in the `Executable` of `RelayVM`, the convention is
specified by 
[link](https://github.com/apache/tvm/blob/748882aae7be1435f042e22b0fc67cb236705b6c/include/tvm/runtime/vm/executable.h#L60-L76):

```C++
 # Name of the Module
TVM_MODULE_VTABLE_BEGIN("VMExecutable");
 # Bind member methods to string name as the calling convention
TVM_MODULE_VTABLE_ENTRY("get_lib", ::GetLib);
...
TVM_MODULE_VTABLE_END();
```

Note that it supports only "unpacked" member methods so far, i.e.,
ordinary C++ methods with a fixed number of arguments, each of which has
a compile-time known type, e.g.

```C++
class MyModule : public tvm::runtime::Module {
  ...
  int UnpackedMethod(int a, double b, std::string c);
};
```

However, TVM's calling convention is actually much more powerful and
covers the case for variadic arguments and runtime type dynamism via
type erasure and tagged union, e.g.

```C++
class MyModule : public tvm::runtime::Module {
  ...
  int PackedMethod(TVMArgs args, TVMRetValue* rv);
};
```

This PR introduces support for this scenario by a new macro
`TVM_MODULE_VTABLE_PACKED`. Example:

```C++
class MLCServingEngine : public tvm::runtime::Module {
  ...
  TVM_MODULE_VTABLE_BEGIN("mlc.serve.engine");
  TVM_MODULE_VTABLE_ENTRY_PACKED("init", &MLCServingEngine::InitPacked);
  TVM_MODULE_VTABLE_ENTRY("add_request", &MLCServingEngine::AddRequest);
  ...
  TVM_MODULE_VTABLE_END();

  void InitPacked(TVMArgs args, TVMRetValue* rv) {}
};
```
---
 include/tvm/runtime/packed_func.h | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/tvm/runtime/packed_func.h 
b/include/tvm/runtime/packed_func.h
index 5ede2f953a..eebdb288d1 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -1171,11 +1171,11 @@ struct PackedFuncValueConverter {
   Helper::Call(rv, self, MemFunc, args, Helper::IndexSeq{});   
   \
 });
   \
   }
-#define TVM_MODULE_VTABLE_ENTRY_PACKED(Name, Func)                                 \
-  if (_name == Name) {                                                             \
-    auto f = (Func);                                                               \
-    using FType = ::tvm::runtime::detail::function_signature<decltype(f)>::FType;  \
-    return TypedPackedFunc<FType>(std::move(f)).packed();                          \
+#define TVM_MODULE_VTABLE_ENTRY_PACKED(Name, MemFunc)                  \
+  if (_name == Name) {                                                 \
+    return PackedFunc([_self](TVMArgs args, TVMRetValue* rv) -> void { \
+      (static_cast<SelfPtr>(_self.get())->*(MemFunc))(args, rv);       \
+    });                                                                \
   }
 
 /*!



(tvm) branch unity updated (165b84bc7f -> 4c07f6af43)

2023-11-19 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 165b84bc7f Always use int64 in JSON parser (#16145)
 add 4c07f6af43 [Runtime] Introduce Type-Checked `TVMArgs::At(i)` 
(#16147)

No new revisions were added by this update.

Summary of changes:
 include/tvm/runtime/packed_func.h | 21 +
 1 file changed, 21 insertions(+)



(tvm) branch unity updated: [Unity][LLM] Add NaN checks during sampling for better error reporting (#16141)

2023-11-17 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 90edf76716 [Unity][LLM] Add NaN checks during sampling for better 
error reporting (#16141)
90edf76716 is described below

commit 90edf767167224a5367ff24eba894fe3d5e502a4
Author: Siyuan Feng 
AuthorDate: Sat Nov 18 01:36:28 2023 +0800

[Unity][LLM] Add NaN checks during sampling for better error reporting 
(#16141)

The current error message would be confusing:
```
mlc-llm/3rdparty/tvm/src/runtime/relax_vm/lm_support.cc:421: InternalError: 
Check failed: sampled_index >= 0 (-1 vs. 0)
```
But in most cases it is caused by NaN errors. This PR improves the error
message.
---
 src/runtime/relax_vm/lm_support.cc | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/runtime/relax_vm/lm_support.cc 
b/src/runtime/relax_vm/lm_support.cc
index fbff8ff029..6301245dac 100644
--- a/src/runtime/relax_vm/lm_support.cc
+++ b/src/runtime/relax_vm/lm_support.cc
@@ -452,6 +452,10 @@ int SampleTopPFromProb(NDArray prob, double top_p, double 
uniform_sample) {
 return data[data.size() - 1].second;
   };
 
+  auto is_all_nan = [&]() -> bool {
+return std::all_of(p_prob, p_prob + ndata, [](float x) { return 
std::isnan(x); });
+  };
+
   if (top_p < 1) {
 // sample through cutoff by a number
 // by pigeonhole principle we will get at most 1024 elements
@@ -463,7 +467,11 @@ int SampleTopPFromProb(NDArray prob, double top_p, double 
uniform_sample) {
   // fallback via full prob, rare case
   data.reserve(ndata);
   int64_t sampled_index = sample_top_p_with_filter(0.0f);
-  ICHECK_GE(sampled_index, 0);
+  if (sampled_index < 0 && is_all_nan()) {
+LOG(FATAL) << "The output probabilities are all NaNs, can not sample from 
it";
+  } else if (sampled_index < 0) {
+LOG(FATAL) << "Cannot sample from the given probability distribution due 
to unknown reason";
+  }
   return sampled_index;
 }
 



(tvm) branch main updated: [Runtime] Parallel-for with threading backend (#16133)

2023-11-16 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 748882aae7 [Runtime] Parallel-for with threading backend (#16133)
748882aae7 is described below

commit 748882aae7be1435f042e22b0fc67cb236705b6c
Author: Ruihang Lai 
AuthorDate: Thu Nov 16 05:20:44 2023 -0500

[Runtime] Parallel-for with threading backend (#16133)

This PR introduces the runtime parallel-for helper function
in C++ with the threading backend in TVM.

Right now the existing 
[parallel-for](https://github.com/apache/tvm/blob/bd67d2e5ebde1aec18bcfa74c087516579bda1ae/include/tvm/support/parallel_for.h#L48-L68)
in TVM is not thread-persistent, so we cannot
get persistent TLS for each thread.

The introduced parallel-for-with-threading-backend function
leverages the threading backend in TVM and persists threads.
---
 include/tvm/runtime/threading_backend.h | 70 +
 tests/cpp/threading_backend_test.cc |  9 +
 2 files changed, 79 insertions(+)

diff --git a/include/tvm/runtime/threading_backend.h 
b/include/tvm/runtime/threading_backend.h
index 77d6730c09..3122b000e0 100644
--- a/include/tvm/runtime/threading_backend.h
+++ b/include/tvm/runtime/threading_backend.h
@@ -24,6 +24,9 @@
 #ifndef TVM_RUNTIME_THREADING_BACKEND_H_
 #define TVM_RUNTIME_THREADING_BACKEND_H_
 
+#include 
+
+#include 
 #include 
 #include 
 #include 
@@ -147,6 +150,73 @@ TVM_DLL void 
Configure(tvm::runtime::threading::ThreadGroup::AffinityMode mode,
 int32_t NumThreads();
 
 }  // namespace threading
+
+/*!
+ * \brief Execute the given lambda function in parallel with
+ * threading backend in TVM.
+ * \tparam T The type of the lambda: "void (int i)".
+ * \param flambda The lambda to be executed in parallel.
+ * It should have the signature "void (int i)".
+ * \param begin The start index of this parallel loop (inclusive).
+ * \param end The end index of this parallel loop (exclusive).
+ * \example
+ *
+ * The for loop
+ *   for (int i = 0; i < 10; i++) {
+ * a[i] = i;
+ *   }
+ * should work the same as:
+ *   parallel_for_with_threading_backend([](int i) {
+ * a[i] = i;
+ *   }, 0, 10);
+ */
+template <typename T>
+inline void parallel_for_with_threading_backend(T flambda, int64_t begin, 
int64_t end);
+
+namespace detail {
+
+// The detailed implementation of `parallel_for_with_threading_backend`.
+// To avoid template expansion, the implementation cannot be placed
+// in .cc files.
+
+template <typename T>
+struct ParallelForWithThreadingBackendLambdaInvoker {
+  static int TVMParallelLambdaInvoke(int task_id, TVMParallelGroupEnv* penv, 
void* cdata) {
+int num_task = penv->num_task;
+// Convert void* back to lambda type.
+T* lambda_ptr = static_cast<T*>(cdata);
+// Invoke the lambda with the task id (thread id).
+(*lambda_ptr)(task_id, num_task);
+return 0;
+  }
+};
+
+template <typename T>
+inline void parallel_launch_with_threading_backend(T flambda) {
+  // Launch the lambda by passing its address.
+  void* cdata = &flambda;
+  TVMBackendParallelLaunch(ParallelForWithThreadingBackendLambdaInvoker<T>::TVMParallelLambdaInvoke,
+                           cdata, /*num_task=*/0);
+}
+
+}  // namespace detail
+
+template <typename T>
+inline void parallel_for_with_threading_backend(T flambda, int64_t begin, 
int64_t end) {
+  auto flaunch = [begin, end, flambda](int task_id, int num_task) {
+// For each thread, do static division and call into flambda.
+int64_t total_len = end - begin;
+int64_t step = (total_len + num_task - 1) / num_task;
+int64_t local_begin = std::min(begin + step * task_id, end);
+int64_t local_end = std::min(local_begin + step, end);
+for (int64_t i = local_begin; i < local_end; ++i) {
+  flambda(i);
+}
+  };
+  // Launch with all threads.
+  detail::parallel_launch_with_threading_backend(flaunch);
+}
+
 }  // namespace runtime
 }  // namespace tvm
 
diff --git a/tests/cpp/threading_backend_test.cc 
b/tests/cpp/threading_backend_test.cc
index 5adf1f9ae3..b156eec8ab 100644
--- a/tests/cpp/threading_backend_test.cc
+++ b/tests/cpp/threading_backend_test.cc
@@ -185,3 +185,12 @@ TEST(ThreadingBackend, TVMBackendAffinityConfigure) {
 t->join();
   }
 }
+
+TEST(ThreadingBackend, TVMBackendParallelForWithThreadingBackend) {
+  int n = 100;
+  std::vector<int> vec(/*size=*/n, /*value=*/0);
+  tvm::runtime::parallel_for_with_threading_backend([&vec](int i) { vec[i] = i; }, 0, n);
+  for (int i = 0; i < n; ++i) {
+EXPECT_EQ(vec[i], i);
+  }
+}



(tvm) branch unity updated: [Unity][DLight] Enhance the inline consumer rule (#16124)

2023-11-14 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 684a8ca6a4 [Unity][DLight] Enhance the inline consumer rule (#16124)
684a8ca6a4 is described below

commit 684a8ca6a41c984a2431405e29b82ba862f70f82
Author: Siyuan Feng 
AuthorDate: Wed Nov 15 14:38:33 2023 +0800

[Unity][DLight] Enhance the inline consumer rule (#16124)

The current inline consumer rule fails on the following case, because
the producers of the output stage are not inlined:

```
 A   B   D
  \ /    |
 matmul  C
    \   /
     out
```
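
For context, a minimal sketch (not part of this commit) of applying the GPU
matmul rule to a small module; the module and the plain "cuda" target below
are illustrative:

```python
# Sketch only: schedule a simple matmul with the dlight GPU rule touched by this fix.
import tvm
from tvm import dlight as dl
from tvm.script import ir as I, tir as T
from tvm.target import Target


@I.ir_module
class Mod:
    @T.prim_func
    def matmul(A: T.Buffer((128, 128), "float16"),
               B: T.Buffer((128, 128), "float16"),
               C: T.Buffer((128, 128), "float16")):
        for i, j, k in T.grid(128, 128, 128):
            with T.block("matmul"):
                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
                with T.init():
                    C[vi, vj] = T.float16(0)
                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]


with Target("cuda"):
    scheduled = dl.ApplyDefaultSchedule(dl.gpu.Matmul())(Mod)  # returns a scheduled IRModule
```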
---
 python/tvm/dlight/gpu/matmul.py|   1 +
 tests/python/dlight/test_gpu_matmul.py | 130 -
 2 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/python/tvm/dlight/gpu/matmul.py b/python/tvm/dlight/gpu/matmul.py
index 703f9c151f..7d5d6489cb 100644
--- a/python/tvm/dlight/gpu/matmul.py
+++ b/python/tvm/dlight/gpu/matmul.py
@@ -99,6 +99,7 @@ def auto_inline_consumer_chain(
 for c in remaining_consumers:
 for p in sch.get_producers(c):
 if sch.get(p) != sch.get(block):
+auto_inline_producers(sch, p)
 sch.compute_inline(p)
 
 # Try inlining into the cache-write stage again, this time it should 
succeed.
diff --git a/tests/python/dlight/test_gpu_matmul.py 
b/tests/python/dlight/test_gpu_matmul.py
index 550e30e6e7..82f481da46 100644
--- a/tests/python/dlight/test_gpu_matmul.py
+++ b/tests/python/dlight/test_gpu_matmul.py
@@ -19,7 +19,6 @@ import pytest
 
 import tvm.testing
 from tvm import dlight as dl
-from tvm.script import ir as I
 from tvm.script import tir as T
 from tvm.target import Target
 
@@ -476,6 +475,135 @@ class TestOutputFP32(BaseBeforeAfter):
 # fmt: on
 
 
+class TestInlineConsumerChain(BaseBeforeAfter):
+# fmt: off
+@T.prim_func(private=True)
+def before(p_lv26: T.handle, lv9: T.Buffer((T.int64(2048), T.int64(2048)), 
"float16"), p_lv52: T.handle, p_output0: T.handle):
+T.func_attr({"tir.noalias": T.bool(True)})
+n = T.int64()
+lv26 = T.match_buffer(p_lv26, (n, T.int64(2048)), "float16")
+lv52 = T.match_buffer(p_lv52, (T.int64(1), n, T.int64(2048)))
+var_T_multiply_intermediate = T.match_buffer(p_output0, (n, 
T.int64(2048)), "float16")
+# with T.block("root"):
+var_NT_matmul_intermediate = T.alloc_buffer((n, T.int64(2048)), 
"float16")
+compute = T.alloc_buffer((n, T.int64(2048)), "float16")
+var_T_multiply_intermediate_1 = T.alloc_buffer((n, T.int64(2048)), 
"float16")
+var_T_squeeze_intermediate = T.alloc_buffer((n, T.int64(2048)))
+var_compute_intermediate = T.alloc_buffer((n, T.int64(2048)), 
"float16")
+for i0, i1, k in T.grid(n, T.int64(2048), T.int64(2048)):
+with T.block("NT_matmul"):
+v_i0, v_i1, v_k = T.axis.remap("SSR", [i0, i1, k])
+T.reads(lv26[v_i0, v_k], lv9[v_i1, v_k])
+T.writes(var_NT_matmul_intermediate[v_i0, v_i1])
+with T.init():
+var_NT_matmul_intermediate[v_i0, v_i1] = T.float16(0)
+var_NT_matmul_intermediate[v_i0, v_i1] = 
var_NT_matmul_intermediate[v_i0, v_i1] + lv26[v_i0, v_k] * lv9[v_i1, v_k]
+for i0, i1 in T.grid(n, T.int64(2048)):
+with T.block("compute"):
+v_i0, v_i1 = T.axis.remap("SS", [i0, i1])
+T.reads(var_NT_matmul_intermediate[v_i0, v_i1])
+T.writes(compute[v_i0, v_i1])
+compute[v_i0, v_i1] = 
T.sigmoid(var_NT_matmul_intermediate[v_i0, v_i1])
+for ax0, ax1 in T.grid(n, T.int64(2048)):
+with T.block("T_multiply"):
+v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1])
+T.reads(var_NT_matmul_intermediate[v_ax0, v_ax1], 
compute[v_ax0, v_ax1])
+T.writes(var_T_multiply_intermediate_1[v_ax0, v_ax1])
+var_T_multiply_intermediate_1[v_ax0, v_ax1] = 
var_NT_matmul_intermediate[v_ax0, v_ax1] * compute[v_ax0, v_ax1]
+for ax0, ax1 in T.grid(n, T.int64(2048)):
+with T.block("T_squeeze"):
+v_ax0, v_ax1 = T.axis.remap("SS", [ax0, ax1])
+T.reads(lv52[T.int64(0), v_ax0, v_ax1])
+T.writes(var_T_squeeze_intermediate[v_ax0, v_ax1])
+var_T_squeeze_intermediate[v_ax0, v_ax1] = lv52[T.int64(0), 
v_ax0, v_ax1]
+for i0, i1 in T.grid(n, T.int64(2048)):
+with T.block("compute_1"):
+v_i0, v_i1 = T.axis.remap("SS", [i0, i1])
+

(tvm) branch unity updated: [Unity] [Transform] Skip constants in CSE pass (#16125)

2023-11-14 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new c9de001490 [Unity] [Transform] Skip constants in CSE pass (#16125)
c9de001490 is described below

commit c9de0014905c69b67d7e36488af12466d7d9a940
Author: Anirudh Sundar Subramaniam 
AuthorDate: Wed Nov 15 01:38:03 2023 +0530

[Unity] [Transform] Skip constants in CSE pass (#16125)

This patch modifies the CSE pass to skip all constants as
[discussed 
here](https://discuss.tvm.apache.org/t/common-subexpr-elimination-pass-replaces-constant-args-with-vars/15971)
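
A minimal, hedged sketch of invoking the pass on a module with duplicated
non-scalar constants (a trimmed variant of the updated test below; after this
patch the two `R.const` tensors are left in place instead of being bound to a
shared variable):

```
import numpy as np
import tvm
from tvm import relax
from tvm.script import ir as I, relax as R


@I.ir_module
class Module:
    @R.function
    def foo() -> R.Tensor((2, 2), dtype="int32"):
        with R.dataflow():
            lv = R.add(
                R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32"))),
                R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32"))),
            )
            R.output(lv)
        return lv


after = relax.transform.EliminateCommonSubexpr()(Module)
after.show()
```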
---
 src/relax/transform/eliminate_common_subexpr.cc | 2 +-
 tests/python/relax/test_transform_cse.py| 8 +---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/relax/transform/eliminate_common_subexpr.cc 
b/src/relax/transform/eliminate_common_subexpr.cc
index 2addb60697..095274b0f8 100644
--- a/src/relax/transform/eliminate_common_subexpr.cc
+++ b/src/relax/transform/eliminate_common_subexpr.cc
@@ -95,7 +95,7 @@ class SubexprCounter : public ExprVisitor {
   e->IsInstance() || e->IsInstance() ||
   e->IsInstance() || e->IsInstance() ||
   e->IsInstance() || e->IsInstance() ||
-  (e.as<ConstantNode>() && (e.as<ConstantNode>()->is_scalar())))) {
+  e->IsInstance<ConstantNode>())) {
   // also if e has an impure subexpression, we will not deduplicate it
   if (!impurity_detector_.Detect(e)) {
 int count = 0;
diff --git a/tests/python/relax/test_transform_cse.py 
b/tests/python/relax/test_transform_cse.py
index d69ec61b5c..92cf4349d4 100644
--- a/tests/python/relax/test_transform_cse.py
+++ b/tests/python/relax/test_transform_cse.py
@@ -78,9 +78,11 @@ def test_constants():
 def foo() -> R.Tuple(R.Tensor((), dtype="int32"), R.Tensor((2, 2), 
dtype="int32")):
 with R.dataflow():
 lv0 = R.add(R.const(1, dtype="int32"), R.const(1, 
dtype="int32"))
-lv1 = R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32")))
-lv2 = R.add(lv1, lv1)
-gv = (lv0, lv2)
+lv1 = R.add(
+R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32"))),
+R.const(tvm.nd.array(np.zeros((2, 2), dtype="int32"))),
+)
+gv = (lv0, lv1)
 R.output(gv)
 return gv
 



(tvm) branch unity updated: [Unity] Allow Customized Pipeline in `relax.build` (#16121)

2023-11-14 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new d5daa9806d [Unity] Allow Customized Pipeline in `relax.build` (#16121)
d5daa9806d is described below

commit d5daa9806dc6dd1d2d8cdedcd142cfa89f22eaef
Author: Junru Shao 
AuthorDate: Tue Nov 14 00:20:05 2023 -0800

[Unity] Allow Customized Pipeline in `relax.build` (#16121)

The existing `relax.build` method assumes compilation follows a fixed
set of passes. With the introduction of the Relax pipeline system, one
can effectively manage which passes to use during lowering. This PR
generalizes that approach by letting callers choose which pipeline to
use during compilation.
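
A hedged usage sketch of the new `pipeline` argument; the toy module is an
assumption for illustration, while the pipeline names are the ones registered
in the diff below:

```
import tvm
from tvm import relax
from tvm.script import ir as I, relax as R


@I.ir_module
class Module:
    @R.function
    def main(x: R.Tensor((4,), "float32")) -> R.Tensor((4,), "float32"):
        gv = R.add(x, x)
        return gv


# Select any registered pipeline by name; "default_build" keeps the previous behavior.
ex = relax.build(Module, target="llvm", pipeline="default_build")
vm = relax.VirtualMachine(ex, tvm.cpu())
```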
---
 python/tvm/relax/pipeline.py | 32 +++-
 python/tvm/relax/vm_build.py | 30 +++---
 2 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/python/tvm/relax/pipeline.py b/python/tvm/relax/pipeline.py
index 367c1ede0e..a4ba3315b8 100644
--- a/python/tvm/relax/pipeline.py
+++ b/python/tvm/relax/pipeline.py
@@ -74,8 +74,38 @@ def zero_pipeline(*, enable_warning: bool = False):
 return f_zero_pipeline
 
 
+def default_build_pipeline():
+"""The default compilation pipeline used in relax.build"""
+
+@tvm.transform.module_pass(opt_level=0)
+def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> 
tvm.ir.IRModule:
+seq = tvm.transform.Sequential(
+[
+transform.LegalizeOps(),
+transform.RewriteDataflowReshape(),
+transform.ToNonDataflow(),
+transform.RemovePurityChecking(),
+transform.CallTIRRewrite(),
+transform.StaticPlanBlockMemory(),
+transform.RewriteCUDAGraph(),
+transform.LowerAllocTensor(),
+transform.KillAfterLastUse(),
+transform.VMBuiltinLower(),
+transform.VMShapeLower(),
+transform.AttachGlobalSymbol(),
+],
+)
+mod = seq(mod._move())  # pylint: disable=protected-access
+return mod
+
+return _pipeline
+
+
 # global map of pre-built pipelines
-PIPELINE_MAP = {"zero": zero_pipeline}
+PIPELINE_MAP = {
+"zero": zero_pipeline,
+"default_build": default_build_pipeline,
+}
 
 
 def get_pipeline(name: str = "zero", **kwargs) -> tvm.transform.Pass:
diff --git a/python/tvm/relax/vm_build.py b/python/tvm/relax/vm_build.py
index a54c0154fc..7a7649c449 100644
--- a/python/tvm/relax/vm_build.py
+++ b/python/tvm/relax/vm_build.py
@@ -16,13 +16,11 @@
 # under the License.
 # pylint: disable=invalid-name, no-member
 """VM build logics"""
-from typing import List, Optional, Union, Dict, Any
+from typing import Any, Dict, List, Optional, Union
 
 import tvm
 from tvm import relax
-
 from tvm.contrib import utils as _utils
-
 from tvm.ir.module import IRModule
 from tvm.tir.function import PrimFunc
 
@@ -80,6 +78,7 @@ class Executable:
 rt_mod = ex.jit()
 vm = tvm.relax.VirtualMachine(rt_mod, tvm.cuda())
 """
+
 # TODO(tvm-team): Update runtime.Module interfac
 # to query these properties as bitmask.
 def _not_runnable(x):
@@ -249,6 +248,7 @@ def build(
 mod: tvm.IRModule,
 target: Union[str, tvm.target.Target],
 params: Optional[Dict[str, list]] = None,
+pipeline: str = "default_build",
 exec_mode: str = "bytecode",
 *,
 system_lib: Optional[bool] = None,
@@ -274,6 +274,9 @@ def build(
 params: Optional[Dict[str, list]]
 Parameters for the input IRModule that will be bound.
 
+pipeline : str = "default_build"
+The compilation pipeline to use.
+
 exec_mode: {"bytecode", "compiled"}
 The execution mode.
 
@@ -305,26 +308,7 @@ def build(
 if isinstance(target, str):
 target = tvm.target.Target(target)
 
-lowering_passes = tvm.transform.Sequential(
-[
-relax.transform.LegalizeOps(),
-relax.transform.RewriteDataflowReshape(),
-relax.transform.ToNonDataflow(),
-relax.transform.RemovePurityChecking(),
-relax.transform.CallTIRRewrite(),
-relax.transform.StaticPlanBlockMemory(),
-relax.transform.RewriteCUDAGraph(),
-relax.transform.LowerAllocTensor(),
-relax.transform.KillAfterLastUse(),
-relax.transform.VMBuiltinLower(),
-relax.transform.VMShapeLower(),
-relax.transform.AttachGlobalSymbol(),
-],
-name="relax.lower",
-)
-
-new_mod = lowering_passes(mod)
-
+new_mod = relax.get_pipeline(pipeline)(mod)
 # Extract external runtime modules if exist.
 attrs = dict(mod.attrs) if mod.attrs else {}
 



(tvm) branch unity updated (7a0c3f9e05 -> 835bc82665)

2023-11-13 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 7a0c3f9e05 [Unity][Support] PagedKVCache support growth control 
(#16112)
 add 835bc82665 [Unity][TVMJS] Add md5sum to weight shards (#16122)

No new revisions were added by this update.

Summary of changes:
 python/tvm/contrib/tvmjs.py | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)



(tvm) branch unity updated: [Unity][Support] PagedKVCache support growth control (#16112)

2023-11-12 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 7a0c3f9e05 [Unity][Support] PagedKVCache support growth control 
(#16112)
7a0c3f9e05 is described below

commit 7a0c3f9e056a771b6854d91be6656b31e871e622
Author: Ruihang Lai 
AuthorDate: Sun Nov 12 12:54:19 2023 -0500

[Unity][Support] PagedKVCache support growth control (#16112)

This PR adds a constructor parameter that controls whether the KV
cache is allowed to grow automatically. Previously the KV cache
always grew whenever it was full and more capacity was demanded.

Although automatic growth can be convenient, in practice we often
want the pre-allocated memory to be static and large enough, which
makes memory management more predictable. Hence, this PR lets callers
specify whether growth is allowed, and throws an error when growth is
attempted but not permitted.

This PR also adds an auxiliary function to KV cache to query
the number of available pages.
---
 src/runtime/relax_vm/paged_kv_cache.cc | 46 --
 ...est_runtime_builtin_paged_attention_kv_cache.py |  5 +++
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/src/runtime/relax_vm/paged_kv_cache.cc 
b/src/runtime/relax_vm/paged_kv_cache.cc
index 4c61af9018..6d2444ea64 100644
--- a/src/runtime/relax_vm/paged_kv_cache.cc
+++ b/src/runtime/relax_vm/paged_kv_cache.cc
@@ -77,6 +77,8 @@ class PagedAttentionKVCacheObj : public Object {
   const int64_t num_heads_;
   /*! \brief The number of features each head has. */
   const int64_t head_dim_;
+  /*! \brief A boolean denoting if cache automatic growth is allowed. */
+  const bool allow_growth_;
 
   /*! \brief We fix int32 to be the index dtype of auxiliary data. */
   const DLDataType dtype_aux_ = DLDataType(DataType::Int(32, 1));
@@ -145,8 +147,6 @@ class PagedAttentionKVCacheObj : public Object {
* length dimension of K/V data. It is used for efficient computation.
*/
   NDArray cur_pos2seqid_device_;
-  /*! \brief A temporary buffer for efficient attention computation. */
-  NDArray attn_tmp_buffer_;
 
   //---
   // For efficient memory management, the actual sizes of the arrays
@@ -165,8 +165,13 @@ class PagedAttentionKVCacheObj : public Object {
   /*! \brief Constructor. Take the cache configuration and initialize the 
NDArrays. */
   explicit PagedAttentionKVCacheObj(int64_t page_size, int64_t num_layers, 
int64_t num_heads,
 int64_t head_dim, int64_t 
reserved_num_seqs,
-int64_t reserved_num_pages, DLDataType 
dtype, DLDevice device)
-  : page_size_(page_size), num_layers_(num_layers), num_heads_(num_heads), 
head_dim_(head_dim) {
+int64_t reserved_num_pages, DLDataType 
dtype, DLDevice device,
+bool allow_growth)
+  : page_size_(page_size),
+num_layers_(num_layers),
+num_heads_(num_heads),
+head_dim_(head_dim),
+allow_growth_(allow_growth) {
 pages_ = NDArray::Empty({reserved_num_pages, num_layers, 2, num_heads, 
page_size, head_dim},
 dtype, device);
 page_table_indptr_device_ = NDArray::Empty({reserved_num_seqs + 1}, 
dtype_aux_, device);
@@ -174,7 +179,6 @@ class PagedAttentionKVCacheObj : public Object {
 last_page_offset_device_ = NDArray::Empty({reserved_num_seqs}, dtype_aux_, 
device);
 cur_append_length_indptr_device_ = NDArray::Empty({reserved_num_seqs + 1}, 
dtype_aux_, device);
 cur_pos2seqid_device_ = NDArray::Empty({reserved_num_pages * page_size}, 
dtype_aux_, device);
-attn_tmp_buffer_ = NDArray::Empty({8 * 1024 * 1024}, 
DLDataType(DataType::Float(32)), device);
   }
 
   /*!
@@ -272,7 +276,7 @@ class PagedAttentionKVCacheObj : public Object {
 f_attention(q_data, pages_,  //
 page_table_indptr_view_, page_table_values_view_,//
 last_page_offset_view_, cur_append_length_indptr_view_,  //
-layer_id, attn_tmp_buffer_, output, apply_rotary, 
rotary_scale, rotary_theta);
+layer_id, output, apply_rotary, rotary_scale, rotary_theta);
   }
 
   /*!
@@ -486,6 +490,12 @@ class PagedAttentionKVCacheObj : public Object {
 dirty_aux_data_device_ = false;
   }
 
+  /*! \brief Return the number of remaining pages. */
+  int GetNumAvailablePages() {
+ICHECK_EQ(num_pages_allocated_, free_page_ids_.size() + num_pages_in_use_);
+return pages_->shape[0] - num_pages_in_use_;
+  }
+
   /*! \brief Reset the KV cache. */
   void Clear() {
 num_total_seqs_ = 0;
@@ -530,6 +540,9 @@ class PagedAttentionKVCacheObj : public Obj

(tvm) branch unity updated: [Unity][WebGPU] Allow lower max storage buffer binding size (#16108)

2023-11-11 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 7892af0d46 [Unity][WebGPU] Allow lower max storage buffer binding size 
(#16108)
7892af0d46 is described below

commit 7892af0d46f38c031aea61b1ffbc4e0e21041df5
Author: Charlie Ruan <53290280+charliefr...@users.noreply.github.com>
AuthorDate: Sat Nov 11 16:03:59 2023 -0500

[Unity][WebGPU] Allow lower max storage buffer binding size (#16108)

This PR lowers the required `maxStorageBufferBindingSize` from 1GB to 128MB 
when necessary.

Previously, we required 1GB, which led to errors on some devices
(e.g. Chrome on Android, since Android caps this limit at 128MB).

Now, if the device does not allow 1GB, we lower it to 128MB. We only throw 
an error when 128MB still exceeds the limit.
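
The size arithmetic involved, as a tiny illustrative sketch (not TVM code;
only the two constants come from the diff below):

```
ONE_GB = 1 << 30    # the previous hard requirement
FALLBACK = 1 << 27  # 128MB, the default limit on Android-class devices


def pick_binding_size(device_limit: int) -> int:
    # Mirror of the fallback policy described above.
    if ONE_GB <= device_limit:
        return ONE_GB
    if FALLBACK <= device_limit:
        return FALLBACK  # warn and fall back
    raise RuntimeError(f"maxStorageBufferBindingSize limit too small: {device_limit} bytes")
```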
---
 web/src/webgpu.ts | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index 5ffed62cf9..fb4d868f45 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -55,13 +55,25 @@ export async function detectGPUDevice(): Promise<GPUDeviceDetectOutput | undefined> {
   if (requiredMaxStorageBufferBindingSize > adapter.limits.maxStorageBufferBindingSize) {
-  throw Error(
-`Cannot initialize runtime because of requested 
maxStorageBufferBindingSize ` +
-`exceeds limit. 
requested=${computeMB(requiredMaxStorageBufferBindingSize)}, ` +
-`limit=${computeMB(adapter.limits.maxStorageBufferBindingSize)}. `
+  // If 1GB is too large, try 128MB (default size for Android)
+  const backupRequiredMaxStorageBufferBindingSize = 1 << 27;  // 128MB
+  console.log(
+`Requested maxStorageBufferBindingSize exceeds limit. \n` +
+`requested=${computeMB(requiredMaxStorageBufferBindingSize)}, \n` +
+`limit=${computeMB(adapter.limits.maxStorageBufferBindingSize)}. \n` +
+`WARNING: Falling back to 
${computeMB(backupRequiredMaxStorageBufferBindingSize)}...`
   );
+  requiredMaxStorageBufferBindingSize = 
backupRequiredMaxStorageBufferBindingSize;
+  if (backupRequiredMaxStorageBufferBindingSize > 
adapter.limits.maxStorageBufferBindingSize) {
+// Fail if 128MB is still too big
+throw Error(
+  `Cannot initialize runtime because of requested 
maxStorageBufferBindingSize ` +
+  `exceeds limit. 
requested=${computeMB(backupRequiredMaxStorageBufferBindingSize)}, ` +
+  `limit=${computeMB(adapter.limits.maxStorageBufferBindingSize)}. `
+);
+  }
 }
 
 const requiredMaxComputeWorkgroupStorageSize = 32 << 10;



(tvm) branch unity updated: [Smallfix][WEB] Change memory manager import for web (#16107)

2023-11-10 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new a3d9108050 [Smallfix][WEB] Change memory manager import for web 
(#16107)
a3d9108050 is described below

commit a3d9108050a49fc557ebe3e7f8034cef1ccd94c2
Author: Charlie Ruan <53290280+charliefr...@users.noreply.github.com>
AuthorDate: Fri Nov 10 23:56:21 2023 -0500

[Smallfix][WEB] Change memory manager import for web (#16107)

Change import in wasm runtime to reflect new memory manager location.
---
 web/emcc/wasm_runtime.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
index 618e230282..786227b5b1 100644
--- a/web/emcc/wasm_runtime.cc
+++ b/web/emcc/wasm_runtime.cc
@@ -53,11 +53,11 @@
 #include "src/runtime/system_library.cc"
 #include "src/runtime/workspace_pool.cc"
 // relax setup
+#include "src/runtime/memory/memory_manager.cc"
 #include "src/runtime/relax_vm/builtin.cc"
 #include "src/runtime/relax_vm/bytecode.cc"
 #include "src/runtime/relax_vm/executable.cc"
 #include "src/runtime/relax_vm/lm_support.cc"
-#include "src/runtime/relax_vm/memory_manager.cc"
 #include "src/runtime/relax_vm/ndarray_cache_support.cc"
 #include "src/runtime/relax_vm/vm.cc"
 



(tvm) branch unity updated: [Unity] Add LoadParamOnWorker0 function in shard loader (#16093)

2023-11-10 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new bc10f769d8 [Unity] Add LoadParamOnWorker0 function in shard loader 
(#16093)
bc10f769d8 is described below

commit bc10f769d8e1410acc07eb549429ed260e250c46
Author: Hongyi Jin 
AuthorDate: Fri Nov 10 08:48:20 2023 -0800

[Unity] Add LoadParamOnWorker0 function in shard loader (#16093)

In the DistIR compilation flow, shard loading is implemented in a Relax
function, for example:

```
a = LoadParamOnWorker0(loader, index=0)
b = broadcast(a)
c = LoadParamOnWorker0(loader, index=1)
d = scatter_from_worker0(c)
```

LoadParamOnWorker0 loads the unsharded param on worker 0 and returns an
empty array on the other workers.

This PR implements LoadParamOnWorker0.
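
A hedged sketch of reaching the new entry point from Python. The global name
comes from the registration in this diff; the `loader` object itself has to be
created through the existing shard-loader machinery, which is not shown here:

```
import tvm

# Resolves only when TVM is built with the disco runtime.
load_on_worker0 = tvm.get_global_func("runtime.disco.ShardLoaderLoadParamOnWorker0")
# param0 = load_on_worker0(loader, 0)  # `loader` is a ShardLoader object (assumed available)
```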
---
 src/runtime/disco/loader.cc | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/src/runtime/disco/loader.cc b/src/runtime/disco/loader.cc
index c8d7eeb2a4..c931baa942 100644
--- a/src/runtime/disco/loader.cc
+++ b/src/runtime/disco/loader.cc
@@ -48,6 +48,8 @@ class ShardLoaderObj : public Object {
   /*! \brief Load the i-th parameter */
   NDArray Load(int weight_index) const;
 
+  NDArray LoadParamOnWorker0(int weight_index) const;
+
   /*! \brief Load all the parameters */
   Array<NDArray> LoadAll() const;
 
@@ -164,6 +166,35 @@ std::string GetSiblingPath(const std::string& path, const 
std::string& filename)
   LOG(FATAL) << "ValueError: Cannot find the parent directory: " << path;
 }
 
+NDArray ShardLoaderObj::LoadParamOnWorker0(int weight_index) const {
+  DiscoWorker* worker = DiscoWorker::ThreadLocal();
+  int worker_id = worker->worker_id;
+  Device device = worker->default_device;
+  int param_index = param_name_to_index_.at("param_" + 
std::to_string(weight_index));
+  const ParamInfo& param_info = param_info_.at(param_index);
+  const ParamRecord* param = param_info.param;
+  const FileRecord* file = param_info.file;
+
+  auto load = [this, param, device, file]() {
+if (file != current_file_) {
+  current_file_ = file;
+  std::string file_name = GetSiblingPath(this->metadata_.path, 
file->data_path);
+  LoadBinaryFromFile(file_name, &this->current_file_stream_);
+}
+return param->Load(
+device, &this->current_file_stream_,
+[](NDArray param, const void* data, size_t nbytes) { 
param.CopyFromBytes(data, nbytes); });
+  };
+
+  if (worker_id == 0) {
+NDArray w = load();
+return w;
+  } else {
+NDArray w = NDArray::Empty(param->shape, param->dtype, device);
+return w;
+  }
+}
+
 std::tuple ParseParamShardingInfo(const ParamRecord* param) {
   // Given a name "param_shard-X-of-Y", return the integer values
   // rank=(X-1) and world_size=Y.
@@ -337,5 +368,13 @@ 
TVM_REGISTER_GLOBAL("runtime.disco.ShardLoaderLoadAllPresharded")
   return loader->LoadAllPresharded();
 });
 
+TVM_REGISTER_GLOBAL("runtime.disco.ShardLoaderLoadParamOnWorker0")
+.set_body_typed([](ObjectRef loader_obj, int param_index) {
+  const auto* loader = loader_obj.as<ShardLoaderObj>();
+  CHECK(loader != nullptr) << "TypeError: Expected ShardLoaderObj, but 
gets: "
+   << loader_obj->GetTypeKey();
+  return loader->LoadParamOnWorker0(param_index);
+});
+
 }  // namespace runtime
 }  // namespace tvm



(tvm) branch main updated: [Relay][Pytorch] Add support for aten::swapaxes operator (#16079)

2023-11-09 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new f9ac3b98b1 [Relay][Pytorch] Add support for aten::swapaxes operator 
(#16079)
f9ac3b98b1 is described below

commit f9ac3b98b12badb727215097cdf380809cb01309
Author: Duc-Nhat Luong <54342094+ducnhat-lu...@users.noreply.github.com>
AuthorDate: Fri Nov 10 11:18:36 2023 +0900

[Relay][Pytorch] Add support for aten::swapaxes operator (#16079)

Support PyTorch's MaxViT model by adding support for the aten::swapaxes
operator.

Co-authored-by: Masahiro Hiramori 
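
A minimal, hedged sketch (mirroring the new test below) of importing a
swapaxes module through the Relay PyTorch frontend; shapes are illustrative:

```
import torch
import tvm
from tvm import relay


class Swapaxes(torch.nn.Module):
    def forward(self, x):
        return x.swapaxes(2, 3)


inp = torch.rand(2, 3, 10, 5).float()
traced = torch.jit.trace(Swapaxes().eval(), inp)
mod, params = relay.frontend.from_pytorch(traced, [("x", list(inp.shape))])
```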

---
 python/tvm/relay/frontend/pytorch.py  |  1 +
 tests/python/frontend/pytorch/test_forward.py | 24 
 2 files changed, 25 insertions(+)

diff --git a/python/tvm/relay/frontend/pytorch.py 
b/python/tvm/relay/frontend/pytorch.py
index 81392a08ec..402ab59202 100644
--- a/python/tvm/relay/frontend/pytorch.py
+++ b/python/tvm/relay/frontend/pytorch.py
@@ -4108,6 +4108,7 @@ class PyTorchOpConverter:
 "aten::multinomial": self.multinomial,
 "aten::_weight_norm": self.weight_norm,
 "aten::copy_": self.inplace_copy,
+"aten::swapaxes": self.transpose,
 }
 
 def update_convert_map(self, custom_map):
diff --git a/tests/python/frontend/pytorch/test_forward.py 
b/tests/python/frontend/pytorch/test_forward.py
index abdbda8e40..b9c1b6ce9c 100644
--- a/tests/python/frontend/pytorch/test_forward.py
+++ b/tests/python/frontend/pytorch/test_forward.py
@@ -5381,6 +5381,30 @@ def test_inplace_copy():
 verify_model(PartialDimensionInplaceCopy(), [inputs])
 
 
+@tvm.testing.uses_gpu
+def test_swapaxes():
+"""test_swapaxes"""
+torch.set_grad_enabled(False)
+input_shape = [2, 3, 10, 5]
+
+class Swapaxes1(Module):
+def forward(self, *args):
+return args[0].swapaxes(2, 3)
+
+class Swapaxes2(Module):
+def forward(self, *args):
+return args[0].swapaxes(-2, -1)
+
+class Swapaxes3(Module):
+def forward(self, *args):
+return args[0].swapaxes(1, 1)
+
+input_data = torch.rand(input_shape).float()
+verify_model(Swapaxes1().float().eval(), input_data=input_data)
+verify_model(Swapaxes2().float().eval(), input_data=input_data)
+verify_model(Swapaxes3().float().eval(), input_data=input_data)
+
+
 class TestSetSpan:
 """test structural equal between translated / hand-crafted relay IR with 
span tagged."""
 



(tvm) branch unity updated: [Unity][Fix] Fix `topi.rms_norm` with float32 upscale (#16099)

2023-11-09 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 276b4cedbd [Unity][Fix] Fix `topi.rms_norm` with float32 upscale 
(#16099)
276b4cedbd is described below

commit 276b4cedbd71308ed7d43f65375bf42a080c8a01
Author: Yaxing Cai 
AuthorDate: Thu Nov 9 11:23:06 2023 -0800

[Unity][Fix] Fix `topi.rms_norm` with float32 upscale (#16099)

This PR is a mirror PR for #16091
---
 include/tvm/topi/nn/rms_norm.h | 17 ++---
 python/tvm/topi/testing/rms_norm_python.py |  9 +
 tests/python/topi/python/test_topi_rms_norm.py | 14 ++
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/include/tvm/topi/nn/rms_norm.h b/include/tvm/topi/nn/rms_norm.h
index 55dac39b71..ba2f7e49ac 100644
--- a/include/tvm/topi/nn/rms_norm.h
+++ b/include/tvm/topi/nn/rms_norm.h
@@ -54,15 +54,18 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const Array<Integer>& axis,
   const auto& data_type = data->dtype;
   const auto& weight_type = weight.defined() ? weight->dtype : data_type;
   ICHECK(data_type == weight_type) << "rms_norm: data and weight must have the same type";
 
-  auto square = multiply(data, data);
+  const auto& data_fp32 = cast(data, DataType::Float(32));
+  const auto& weight_fp32 = cast(weight, DataType::Float(32));
+
+  auto square = multiply(data_fp32, data_fp32);
   auto square_sum = sum(square, axis, /*keepdims=*/false, /*atleast1d=*/true);
 
-  auto ndim = data->shape.size();
+  auto ndim = data_fp32->shape.size();
   ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
   auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
-  auto reduce_extent = make_const(data->dtype, 1);
+  auto reduce_extent = make_const(data_fp32->dtype, 1);
   for (int i : real_axis) {
-reduce_extent *= data->shape[i];
+reduce_extent *= data_fp32->shape[i];
   }
   auto rms_norm_func = [&](const Array& indices) {
 Array reduce_indices, non_reduce_indices;
@@ -74,12 +77,12 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& 
weight, const Arrayshape, rms_norm_func, name, tag);
-  return rms_norm;
+  auto rms_norm = tvm::te::compute(data_fp32->shape, rms_norm_func, name, tag);
+  return cast(rms_norm, data_type);
 }
 
 }  // namespace nn
diff --git a/python/tvm/topi/testing/rms_norm_python.py 
b/python/tvm/topi/testing/rms_norm_python.py
index 7fad5d57ce..651f6f8843 100644
--- a/python/tvm/topi/testing/rms_norm_python.py
+++ b/python/tvm/topi/testing/rms_norm_python.py
@@ -19,7 +19,7 @@
 import numpy as np
 
 
-def rms_norm_python(data, weight, bias, axis, epsilon=1e-5):
+def rms_norm_python(data, weight, axis, epsilon=1e-5):
 """Root mean square normalization operator in Python.
 
 Parameters
@@ -44,8 +44,9 @@ def rms_norm_python(data, weight, bias, axis, epsilon=1e-5):
 result : np.ndarray
 N-D with shape (d_0, d_1, ..., d_{N-1})
 """
+dtype = data.dtype
+data = data.astype("float32")
+weight = weight.astype("float32")
 square_mean = np.mean(np.square(data), axis, keepdims=True)
 result = data * weight / np.sqrt(square_mean + epsilon)
-if bias is not None:
-result += bias
-return result
+return result.astype(dtype)
diff --git a/tests/python/topi/python/test_topi_rms_norm.py 
b/tests/python/topi/python/test_topi_rms_norm.py
index 35a1485afa..c8c1b8795f 100644
--- a/tests/python/topi/python/test_topi_rms_norm.py
+++ b/tests/python/topi/python/test_topi_rms_norm.py
@@ -34,7 +34,8 @@ _rms_norm_schedule = {
 # only test on llvm because schedule is missing
 @tvm.testing.parametrize_targets("llvm")
 @pytest.mark.parametrize(
-"shape,axis", [([4, 16], (1,)), ([4, 16, 16], (1, 2)), ([("a", 4), ("b", 
16)], (1,))]
+"shape,axis",
+[([4, 16], (1,)), ([4, 16, 16], (1, 2)), ([("a", 4), ("b", 16)], (1,)), 
([2, 8192], (1,))],
 )
 @pytest.mark.parametrize("dtype", ["float32", "float16"])
 def test_rms_norm(target, dev, shape, axis, dtype, episilon=1e-5, rtol=5e-3, 
atol=1e-4):
@@ -42,25 +43,22 @@ def test_rms_norm(target, dev, shape, axis, dtype, 
episilon=1e-5, rtol=5e-3, ato
 scale_shape_te = [shape_te[dim] for dim in axis]
 data = te.placeholder(shape_te, dtype=dtype, name="data")
 weight = te.placeholder(scale_shape_te, dtype=dtype, name="weight")
-bias = te.placeholder(scale_shape_te, dtype=dtype, name="weight")
-B = topi.nn.rms_norm(data, weight, bias, axis, episilon)
+B = topi.nn.rms_norm(data, weight, axis, episilon)
 
 shape_np = [v[1] if isinstance(v, tuple) else v for v in shape]
 scale_shape_np = [shape_np[dim] for dim in axis]
 data_np = np.random.uniform(si

(tvm) branch main updated: Add missing backtick (#15968)

2023-11-09 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new f6634afc16 Add missing backtick (#15968)
f6634afc16 is described below

commit f6634afc16185d64df06d6296c62d50968f828f3
Author: maurice 
AuthorDate: Thu Nov 9 17:07:55 2023 +0100

Add missing backtick (#15968)

Add missing backtick



(tvm) branch main updated: [Fix] Fix `topi.rms_norm` with float32 upscale (#16091)

2023-11-09 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 42de91ff45 [Fix] Fix `topi.rms_norm` with float32 upscale (#16091)
42de91ff45 is described below

commit 42de91ff458eac951c3a8ac7020ca246e0442563
Author: Yaxing Cai 
AuthorDate: Thu Nov 9 08:07:00 2023 -0800

[Fix] Fix `topi.rms_norm` with float32 upscale (#16091)

This PR fixes `topi.rms_norm` by upscaling the computation to float32,
which matters for float16 inputs with a large reduction dimension.
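
Restated as a small numpy reference (mirroring the updated `rms_norm_python`
in this diff): the reduction is carried out in float32 and the result is cast
back to the input dtype.

```
import numpy as np


def rms_norm_ref(data, weight, axis, epsilon=1e-5):
    # Accumulate in float32, then cast back to the original dtype.
    dtype = data.dtype
    data = data.astype("float32")
    weight = weight.astype("float32")
    square_mean = np.mean(np.square(data), axis, keepdims=True)
    return (data * weight / np.sqrt(square_mean + epsilon)).astype(dtype)


x = np.random.uniform(size=(2, 8192)).astype("float16")
w = np.random.uniform(size=(8192,)).astype("float16")
out = rms_norm_ref(x, w, axis=(1,))
```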
---
 include/tvm/topi/nn/rms_norm.h | 28 +++---
 python/tvm/topi/nn/rms_norm.py |  7 ++-
 python/tvm/topi/testing/rms_norm_python.py |  9 +
 src/topi/nn.cc |  2 +-
 tests/python/topi/python/test_topi_rms_norm.py | 14 ++---
 5 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/include/tvm/topi/nn/rms_norm.h b/include/tvm/topi/nn/rms_norm.h
index 44d38bae6d..ba2f7e49ac 100644
--- a/include/tvm/topi/nn/rms_norm.h
+++ b/include/tvm/topi/nn/rms_norm.h
@@ -41,32 +41,31 @@ using namespace tvm::te;
  * \param data N-D tensor with shape [d_0, d_1, ..., d_{N-1}]
  * \param weight K-D tensor with shape [r_0, r_1, ..., r_{K-1}] where K == 
len(axis) and
  *   d_{axis_k} == r_k
- * \param bias Optional, K-D tensor with shape [r_0, r_1, ..., r_{K-1}] where
- * d_{axis_k} == r_k
  * \param axis The axis to normalize over.
  * \param epsilon The epsilon value to avoid division by zero.
  * \param name The name of the operation.
  * \param tag The tag to mark the operation.
  * \return The normalized tensor, with the same shape as data.
  */
-inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const Tensor& bias,
-   const Array<Integer>& axis, double epsilon, std::string name = "T_rms_norm",
+inline Tensor rms_norm(const Tensor& data, const Tensor& weight, const Array<Integer>& axis,
+   double epsilon, std::string name = "T_rms_norm",
std::string tag = kInjective) {
   const auto& data_type = data->dtype;
   const auto& weight_type = weight.defined() ? weight->dtype : data_type;
   ICHECK(data_type == weight_type) << "rms_norm: data and weight must have the 
same type";
-  const auto& bias_type = bias.defined() ? bias->dtype : data_type;
-  ICHECK(data_type == bias_type) << "rms_norm: data and bias must have the 
same type";
 
-  auto square = multiply(data, data);
+  const auto& data_fp32 = cast(data, DataType::Float(32));
+  const auto& weight_fp32 = cast(weight, DataType::Float(32));
+
+  auto square = multiply(data_fp32, data_fp32);
   auto square_sum = sum(square, axis, /*keepdims=*/false, /*atleast1d=*/true);
 
-  auto ndim = data->shape.size();
+  auto ndim = data_fp32->shape.size();
   ICHECK_NE(ndim, 0) << "Cannot reduce a 0 dim Tensor";
   auto real_axis = GetRealAxis(static_cast<int>(ndim), axis);
-  auto reduce_extent = make_const(data->dtype, 1);
+  auto reduce_extent = make_const(data_fp32->dtype, 1);
   for (int i : real_axis) {
-reduce_extent *= data->shape[i];
+reduce_extent *= data_fp32->shape[i];
   }
   auto rms_norm_func = [&](const Array& indices) {
 Array reduce_indices, non_reduce_indices;
@@ -78,15 +77,12 @@ inline Tensor rms_norm(const Tensor& data, const Tensor& 
weight, const Tensor& b
   }
 }
 auto output =
-data(indices) * weight(reduce_indices) *
+data_fp32(indices) * weight_fp32(reduce_indices) *
 tvm::rsqrt(square_sum(non_reduce_indices) / reduce_extent + 
make_const(data_type, epsilon));
-if (bias.defined()) {
-  output += bias(reduce_indices);
-}
 return output;
   };
-  auto rms_norm = tvm::te::compute(data->shape, rms_norm_func, name, tag);
-  return rms_norm;
+  auto rms_norm = tvm::te::compute(data_fp32->shape, rms_norm_func, name, tag);
+  return cast(rms_norm, data_type);
 }
 
 }  // namespace nn
diff --git a/python/tvm/topi/nn/rms_norm.py b/python/tvm/topi/nn/rms_norm.py
index f2f5a7e674..9284517468 100644
--- a/python/tvm/topi/nn/rms_norm.py
+++ b/python/tvm/topi/nn/rms_norm.py
@@ -18,7 +18,7 @@
 from .. import cpp
 
 
-def rms_norm(data, weight, bias, axis, epsilon=1e-5):
+def rms_norm(data, weight, axis, epsilon=1e-5):
 """Root mean square normalization operator. The output will have the same 
data type as input.
 
 Parameters
@@ -29,9 +29,6 @@ def rms_norm(data, weight, bias, axis, epsilon=1e-5):
 weight: tvm.te.Tensor
 K-D with shape (r_0, r_1, ..., r_{K-1}) where K == len(axis) and 
d_{axis_k} == r_k
 
-bias: tvm.te.Tensor
-Optional, K-D with shape (r_0, r_1, ..., r_{K-1

(tvm) branch unity updated: [Unity] Add `axis` field to scatter_from_worker0 (#16092)

2023-11-09 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 384f9b69ea [Unity] Add `axis` field to scatter_from_worker0 (#16092)
384f9b69ea is described below

commit 384f9b69eae44b8331572f2c43be6a7b87913d10
Author: Hongyi Jin 
AuthorDate: Thu Nov 9 07:39:29 2023 -0800

[Unity] Add `axis` field to scatter_from_worker0 (#16092)

This PR adds an `axis` field to scatter_from_worker0, which specifies the
tensor axis along which the tensor is scattered. legalize_ops automatically
generates reshape and transpose ops to preserve the ccl constraint that
collective communication ops must operate on contiguous memory. For
example, if x has shape [10, 20] and we call
`scatter_from_worker0(x, num_workers=2, axis=1)`, then after legalization it

```
x = reshape(x, [10, 2, 10]) # shape: [10, 2, 10]
x = permute_dims(x, [1, 0, 2]) # shape: [2, 10, 10]
x = call_dps_packed("scatter_from_worker0", x) # shape: [10, 10]
```

When axis=0, the behavior is the same as before.

Also, this PR renames ScatterFromWorker0Attrs to ScatterCollectiveAttrs to
enable reuse by other ops like worker-id-aware slicing (scatter_from_worker0 =
broadcast_from_worker0 + worker-id-aware slicing).
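
A hedged sketch of constructing the op with the new argument; the struct-info
helper below is an assumption for illustration, and setting up an actual
multi-worker disco session is out of scope here:

```
from tvm import relax

x = relax.Var("x", relax.TensorStructInfo([10, 20], "float32"))
y = relax.op.ccl.scatter_from_worker0(x, num_workers=2, axis=1)
```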
---
 include/tvm/relax/attrs/ccl.h  | 12 --
 python/tvm/relax/op/ccl/ccl.py |  7 +++-
 python/tvm/relax/transform/legalize_ops/ccl.py | 45 ++
 src/relax/op/ccl/ccl.cc| 17 
 src/relax/op/ccl/ccl.h |  2 +-
 .../relax/test_transform_legalize_ops_ccl.py   | 32 +--
 6 files changed, 82 insertions(+), 33 deletions(-)

diff --git a/include/tvm/relax/attrs/ccl.h b/include/tvm/relax/attrs/ccl.h
index b4b3880384..42cec88de6 100644
--- a/include/tvm/relax/attrs/ccl.h
+++ b/include/tvm/relax/attrs/ccl.h
@@ -40,17 +40,21 @@ struct AllReduceAttrs : public tvm::AttrsNode<AllReduceAttrs> {
   }
 };  // struct AllReduceAttrs
 
-/*! \brief Attributes used in scatter_from_worker0 operators */
-struct ScatterFromWorker0Attrs : public tvm::AttrsNode<ScatterFromWorker0Attrs> {
+/*! \brief Attributes used in scatter operators */
+struct ScatterCollectiveAttrs : public tvm::AttrsNode<ScatterCollectiveAttrs> {
   int num_workers;
+  int axis;
 
-  TVM_DECLARE_ATTRS(ScatterFromWorker0Attrs, 
"relax.attrs.ScatterFromWorker0Attrs") {
+  TVM_DECLARE_ATTRS(ScatterCollectiveAttrs, 
"relax.attrs.ScatterCollectiveAttrs") {
 TVM_ATTR_FIELD(num_workers)
 .describe(
 "The number of workers, also the number of parts the given buffer 
should be chunked "
 "into.");
+TVM_ATTR_FIELD(axis).describe(
+"The axis of the tensor to be scattered. The tensor will be chunked 
along "
+"this axis.");
   }
-};  // struct ScatterFromWorker0Attrs
+};  // struct ScatterCollectiveAttrs
 
 }  // namespace relax
 }  // namespace tvm
diff --git a/python/tvm/relax/op/ccl/ccl.py b/python/tvm/relax/op/ccl/ccl.py
index 4829bac761..21c7946120 100644
--- a/python/tvm/relax/op/ccl/ccl.py
+++ b/python/tvm/relax/op/ccl/ccl.py
@@ -84,7 +84,7 @@ def broadcast_from_worker0(x: Expr) -> Expr:
 return _ffi_api.broadcast_from_worker0(x)
 
 
-def scatter_from_worker0(x: Expr, num_workers: int) -> Expr:
+def scatter_from_worker0(x: Expr, num_workers: int, axis: int = 0) -> Expr:
 """Perform a scatter operation from worker-0, chunking the given buffer 
into equal parts.
 
 Parameters
@@ -95,9 +95,12 @@ def scatter_from_worker0(x: Expr, num_workers: int) -> Expr:
 num_worker : int
   The number of workers, i.e. the number of parts the given buffer should 
be chunked into.
 
+axis : int
+  The dimension of the tensor to be scattered. Default is 0.
+
 Returns
 ---
 result : relax.Expr
   Chunked Tensor received by different workers.
 """
-return _ffi_api.scatter_from_worker0(x, num_workers)
+return _ffi_api.scatter_from_worker0(x, num_workers, axis)
diff --git a/python/tvm/relax/transform/legalize_ops/ccl.py 
b/python/tvm/relax/transform/legalize_ops/ccl.py
index 9b13d1be7c..ae0be3c228 100644
--- a/python/tvm/relax/transform/legalize_ops/ccl.py
+++ b/python/tvm/relax/transform/legalize_ops/ccl.py
@@ -16,7 +16,7 @@
 # under the License.
 # pylint: disable=invalid-name
 """Default legalization function for ccl operators."""
-from tvm import tir, arith
+from tvm import tir, arith, topi
 from ...block_builder import BlockBuilder
 from ...expr import Call, Expr, ShapeExpr
 from ...op import call_dps_packed
@@ -80,28 +80,43 @@ def _broadcast_from_worker0(_bb: BlockBuilder, call: Call) 
-> Expr:
 )
 
 
-@register_legalize("relax.

(tvm) branch main updated: Add _ffi_api.py under script folder (#16087)

2023-11-08 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 47a0b4558e Add _ffi_api.py under script folder (#16087)
47a0b4558e is described below

commit 47a0b4558ef52d229e11f51ab525472dcc13bf1c
Author: Hongyi Jin 
AuthorDate: Wed Nov 8 09:01:47 2023 -0800

Add _ffi_api.py under script folder (#16087)

Add _ffi_api.py under the script folder so that users can call
script.Complete from Python.
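
A minimal, hedged sketch of what the new module exposes; the exact argument
list of `Complete` is not shown in this diff:

```
from tvm.script import _ffi_api

# Every global registered under the "script." prefix becomes a module attribute,
# e.g. the completion helper mentioned above.
complete = _ffi_api.Complete  # a PackedFunc; its call signature is not shown in this diff
```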
---
 python/tvm/script/_ffi_api.py | 20 
 1 file changed, 20 insertions(+)

diff --git a/python/tvm/script/_ffi_api.py b/python/tvm/script/_ffi_api.py
new file mode 100644
index 00..ebc638f3fd
--- /dev/null
+++ b/python/tvm/script/_ffi_api.py
@@ -0,0 +1,20 @@
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""FFI APIs for tvm.script"""
+import tvm._ffi
+
+
+tvm._ffi._init_api("script", __name__)



(tvm) branch unity updated (3f1347cbd4 -> 3de77f8def)

2023-11-06 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 3f1347cbd4 [Unity] Enhance Python Annotations for Relax Expr (#16075)
 add 57597f62b4 [Fix][TIR]fix symbolic strides lower (#16000)
 add c6f2816b58 [FFI][Python] Handle error propagation when line number is 
missing (#15955)
 add 5ac2d1a219 [Pylint] fix pylint issues for cblas (#16015)
 add c4c0a492af [Tests] Fix str vs. int comparison in test_num_threads 
(#16017)
 add 5b561a6059 [Tests] Check int8+int32 testcases in 
test_estimate_peak_flops_cpu (#16019)
 add ba0179f2f8 [Tests] Fix work_dir location used by 
test_micro_tuning_with_meta_schedule (#16018)
 add d83cd217a5 [microNPU][ETHOSU] Fix ConcatRewriter args processing 
(#16003)
 add 878a61105e [Fix][TIR]fix mul dtype mismatch (#16010)
 add 7a50c36f1c [Codegen] Add shuffle for cuda and metal (#15998)
 add bd3e8bb9cf [Runtime] Introduce `TVM_MODULE_VTABLE` Macros (#16030)
 add c3ce474dc1 [Pylint] fix pylint issues for thrust_runtime 
(#16023)
 add 043f147328 [Codegen][Metal] Disable cross-function call in Metal 
codegen (#16033)
 add 4b29f25613 [TVMScript] Fix mismatched dtype of IterVar in 
`T.thread_binding` (#16041)
 add c9fb87fd0e [TIR] Fix software pipeline with dynamic loop extent 
(#16027)
 add 3b8d1a831d [CMake] Use llvm-config to locate Findzstd.cmake (#16032)
 add 9df0683701 [TIR] Fix pass RenewDefs error in gather/take case (#16063)
 add 134e73d8fd [CI] Use LLVM 17 for tests on `ci_arm` (#16062)
 add b144145c4b [CUDA] Add an option for profiling cuda kernels (#16061)
 add 1de5aa551d [TIR] Fix the thread binding iter_var dtype in `Bind` 
primitive (#16074)
 add 7294a1e934 [TRT] fix outdated module building method in tensorrt 
(#16031)
 add 02d4df781e [Runtime] Support clear global memory allocators (#16066)
 add ffa00332ad [Bugfix][FFI] Typo fix of IncRef to DecRef (#16021)
 add 3de77f8def Merge remote-tracking branch 'main' into unity

No new revisions were added by this update.

Summary of changes:
 .github/workflows/main.yml |   3 +-
 cmake/utils/FindLLVM.cmake |   9 ++
 docs/how_to/deploy/tensorrt.rst|   4 +-
 include/tvm/runtime/memory/memory_manager.h|   4 +
 include/tvm/runtime/packed_func.h  |  80 ++
 include/tvm/runtime/vm/executable.h|  39 +++--
 python/tvm/_ffi/_ctypes/packed_func.py |   2 +
 python/tvm/_ffi/_cython/packed_func.pxi|   4 +-
 python/tvm/_ffi/base.py|  15 +-
 python/tvm/contrib/nvcc.py |  19 ++-
 .../tvm/relay/backend/contrib/ethosu/legalize.py   |   2 +-
 src/runtime/memory/memory_manager.cc   |  19 +++
 src/runtime/memory/pooled_allocator.h  |   2 +
 src/runtime/registry.cc|  44 +-
 src/runtime/vm/executable.cc   |  81 ++
 src/script/ir_builder/tir/ir.cc|   9 +-
 src/target/opt/build_cuda_on.cc|   1 +
 src/target/source/codegen_c.cc |  47 +-
 src/target/source/codegen_c.h  |   2 +
 src/target/source/codegen_cuda.cc  |  49 ++
 src/target/source/codegen_cuda.h   |   2 +-
 src/target/source/codegen_metal.cc |  93 +++-
 src/target/source/codegen_metal.h  |   3 +-
 src/tir/schedule/concrete_schedule.cc  |   4 +-
 src/tir/schedule/primitive.h   |   2 +-
 src/tir/schedule/primitive/for_kind.cc |  19 ++-
 src/tir/transforms/inject_ptx_async_copy.cc|   7 +-
 src/tir/transforms/inject_software_pipeline.cc |   2 +-
 src/tir/transforms/ir_utils.cc |   3 +-
 src/tir/transforms/lower_cross_thread_reduction.cc |   2 +-
 tests/lint/pylint.sh   |   4 +
 tests/python/contrib/test_cblas.py | 165 +
 tests/python/contrib/test_ethosu/test_codegen.py   |  16 ++
 tests/python/contrib/test_tflite_runtime.py|  18 ++-
 tests/python/contrib/test_thrust.py|   6 +-
 tests/python/contrib/test_util.py  |   4 +-
 tests/python/unittest/test_micro_ms_tuning.py  |   2 +-
 tests/python/unittest/test_roofline.py |   2 +-
 .../test_runtime_module_based_interface.py |   4 +-
 tests/python/unittest/test_target_codegen_cuda.py  |  24 +++
 tests/python/unittest/test_target_codegen_metal.py |  23 +++
 .../python/unittest/test_tir_schedule_for_kind.py  |  29 
 .../test_tir_transform_inject_ptx_async_copy.py|  34 +
 .../test_tir_transform_inject_software_pipeline.py |  72 +
 .../test_tir_transform_lower_opaque_block.py   |  48 ++
 tests

(tvm) 01/01: Merge remote-tracking branch 'main' into unity

2023-11-06 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity-staging
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 3de77f8defb541d8d168d9268d796d5c5b7655b2
Merge: 3f1347cbd4 ffa00332ad
Author: Junru Shao 
AuthorDate: Mon Nov 6 06:48:58 2023 -0800

Merge remote-tracking branch 'main' into unity

 .github/workflows/main.yml |   3 +-
 cmake/utils/FindLLVM.cmake |   9 ++
 docs/how_to/deploy/tensorrt.rst|   4 +-
 include/tvm/runtime/memory/memory_manager.h|   4 +
 include/tvm/runtime/packed_func.h  |  80 ++
 include/tvm/runtime/vm/executable.h|  39 +++--
 python/tvm/_ffi/_ctypes/packed_func.py |   2 +
 python/tvm/_ffi/_cython/packed_func.pxi|   4 +-
 python/tvm/_ffi/base.py|  15 +-
 python/tvm/contrib/nvcc.py |  19 ++-
 .../tvm/relay/backend/contrib/ethosu/legalize.py   |   2 +-
 src/runtime/memory/memory_manager.cc   |  19 +++
 src/runtime/memory/pooled_allocator.h  |   2 +
 src/runtime/registry.cc|  44 +-
 src/runtime/vm/executable.cc   |  81 ++
 src/script/ir_builder/tir/ir.cc|   9 +-
 src/target/opt/build_cuda_on.cc|   1 +
 src/target/source/codegen_c.cc |  47 +-
 src/target/source/codegen_c.h  |   2 +
 src/target/source/codegen_cuda.cc  |  49 ++
 src/target/source/codegen_cuda.h   |   2 +-
 src/target/source/codegen_metal.cc |  93 +++-
 src/target/source/codegen_metal.h  |   3 +-
 src/tir/schedule/concrete_schedule.cc  |   4 +-
 src/tir/schedule/primitive.h   |   2 +-
 src/tir/schedule/primitive/for_kind.cc |  19 ++-
 src/tir/transforms/inject_ptx_async_copy.cc|   7 +-
 src/tir/transforms/inject_software_pipeline.cc |   2 +-
 src/tir/transforms/ir_utils.cc |   3 +-
 src/tir/transforms/lower_cross_thread_reduction.cc |   2 +-
 tests/lint/pylint.sh   |   4 +
 tests/python/contrib/test_cblas.py | 165 +
 tests/python/contrib/test_ethosu/test_codegen.py   |  16 ++
 tests/python/contrib/test_tflite_runtime.py|  18 ++-
 tests/python/contrib/test_thrust.py|   6 +-
 tests/python/contrib/test_util.py  |   4 +-
 tests/python/unittest/test_micro_ms_tuning.py  |   2 +-
 tests/python/unittest/test_roofline.py |   2 +-
 .../test_runtime_module_based_interface.py |   4 +-
 tests/python/unittest/test_target_codegen_cuda.py  |  24 +++
 tests/python/unittest/test_target_codegen_metal.py |  23 +++
 .../python/unittest/test_tir_schedule_for_kind.py  |  29 
 .../test_tir_transform_inject_ptx_async_copy.py|  34 +
 .../test_tir_transform_inject_software_pipeline.py |  72 +
 .../test_tir_transform_lower_opaque_block.py   |  48 ++
 tests/python/unittest/test_tvmscript_parser_tir.py |  15 ++
 tests/scripts/task_config_build_arm.sh |   2 +-
 47 files changed, 768 insertions(+), 272 deletions(-)




(tvm) branch unity-staging updated (be8cab10e1 -> 3de77f8def)

2023-11-06 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity-staging
in repository https://gitbox.apache.org/repos/asf/tvm.git


 discard be8cab10e1 Merge remote-tracking branch 'main' into unity
 add 021f31b737 [Unity] Fix FuseTIR pass for gather/take cases  (#16064)
 add 7d0e60ad0e Revert "[Unity][Support] Sample from top-p supports offset" 
(#16077)
 add 664beaaf1a [Unity] Loading NDArrayCache by parameter names (#16078)
 add cf013c2f6b [Unity] Handle duplicate outputs in LazyTransformParams 
(#15942)
 add 3f1347cbd4 [Unity] Enhance Python Annotations for Relax Expr (#16075)
 add 7294a1e934 [TRT] fix outdated module building method in tensorrt 
(#16031)
 add 02d4df781e [Runtime] Support clear global memory allocators (#16066)
 add ffa00332ad [Bugfix][FFI] Typo fix of IncRef to DecRef (#16021)
 new 3de77f8def Merge remote-tracking branch 'main' into unity

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (be8cab10e1)
\
 N -- N -- N   refs/heads/unity-staging (3de77f8def)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 docs/how_to/deploy/tensorrt.rst|   4 +-
 include/tvm/relax/expr.h   |   5 +-
 include/tvm/runtime/memory/memory_manager.h|   4 +
 python/tvm/_ffi/_ctypes/packed_func.py |   2 +
 python/tvm/_ffi/_cython/packed_func.pxi|   4 +-
 python/tvm/ir/expr.py  |   3 +-
 python/tvm/relax/__init__.py   |   1 -
 python/tvm/relax/expr.py   | 180 +
 .../tvm/relax/transform/lazy_transform_params.py   |  26 +--
 src/relax/ir/expr.cc   |   4 +-
 src/relax/transform/fuse_tir.cc|  28 +++-
 src/runtime/memory/memory_manager.cc   |  19 +++
 src/runtime/memory/pooled_allocator.h  |   2 +
 src/runtime/registry.cc|  44 -
 src/runtime/relax_vm/lm_support.cc |  16 +-
 src/runtime/relax_vm/ndarray_cache_support.cc  |  15 ++
 tests/python/relax/test_transform_fuse_tir.py  |  90 ++-
 .../relax/test_transform_lazy_transform_params.py  |  52 ++
 web/src/runtime.ts |   2 +-
 19 files changed, 427 insertions(+), 74 deletions(-)



(tvm) branch main updated: [Bugfix][FFI] Typo fix of IncRef to DecRef (#16021)

2023-11-06 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new ffa00332ad [Bugfix][FFI] Typo fix of IncRef to DecRef (#16021)
ffa00332ad is described below

commit ffa00332ad12c8610984a216dbebd318f531033d
Author: Eric Lunderberg 
AuthorDate: Mon Nov 6 02:16:41 2023 -0600

[Bugfix][FFI] Typo fix of IncRef to DecRef (#16021)

Propagation of Python exceptions across C++ stack frames was
introduced in https://github.com/apache/tvm/pull/15596.  This commit
primarily fixes a typo in the initial implementation, where
`Py_IncRef` was used instead of `Py_DecRef`.

In addition, this PR resolves errors that were exposed by this typo
fix, which caused test failures in
`tests/python/unittest/test_crt.py::test_compile_runtime`.  These were
due to use of the `Py_IncRef` and `Py_DecRef` functions on threads
that hadn't acquired the GIL.  This usage error has been corrected for
both the ctypes and cython FFI handling.
---
 python/tvm/_ffi/_ctypes/packed_func.py  |  2 ++
 python/tvm/_ffi/_cython/packed_func.pxi |  4 ++-
 src/runtime/registry.cc | 44 ++---
 3 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/python/tvm/_ffi/_ctypes/packed_func.py 
b/python/tvm/_ffi/_ctypes/packed_func.py
index e8680afcdf..6465e0335d 100644
--- a/python/tvm/_ffi/_ctypes/packed_func.py
+++ b/python/tvm/_ffi/_ctypes/packed_func.py
@@ -340,6 +340,8 @@ def _init_pythonapi_inc_def_ref():
 register_func = _LIB.TVMBackendRegisterEnvCAPI
 register_func(c_str("Py_IncRef"), ctypes.pythonapi.Py_IncRef)
 register_func(c_str("Py_DecRef"), ctypes.pythonapi.Py_DecRef)
+register_func(c_str("PyGILState_Ensure"), 
ctypes.pythonapi.PyGILState_Ensure)
+register_func(c_str("PyGILState_Release"), 
ctypes.pythonapi.PyGILState_Release)
 
 
 _init_pythonapi_inc_def_ref()
diff --git a/python/tvm/_ffi/_cython/packed_func.pxi 
b/python/tvm/_ffi/_cython/packed_func.pxi
index ae528bcb78..3d1e87bf56 100644
--- a/python/tvm/_ffi/_cython/packed_func.pxi
+++ b/python/tvm/_ffi/_cython/packed_func.pxi
@@ -17,7 +17,7 @@
 
 import ctypes
 import traceback
-from cpython cimport Py_INCREF, Py_DECREF
+from cpython cimport Py_INCREF, Py_DECREF, PyGILState_Ensure, 
PyGILState_Release
 from numbers import Number, Integral
 from ..base import string_types, py2cerror
 from ..runtime_ctypes import DataType, Device, TVMByteArray, ObjectRValueRef
@@ -381,5 +381,7 @@ def _init_pythonapi_inc_def_ref():
 register_func = TVMBackendRegisterEnvCAPI
 register_func(c_str("Py_IncRef"), _py_incref_wrapper)
 register_func(c_str("Py_DecRef"), _py_decref_wrapper)
+register_func(c_str("PyGILState_Ensure"), PyGILState_Ensure)
+register_func(c_str("PyGILState_Release"), PyGILState_Release)
 
 _init_pythonapi_inc_def_ref()
diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc
index 0db8786145..0a034a7b58 100644
--- a/src/runtime/registry.cc
+++ b/src/runtime/registry.cc
@@ -148,6 +148,16 @@ class EnvCAPIRegistry {
*/
   F_Py_IncDefRef py_dec_ref = nullptr;
 
+  /*!
+\brief PyGILState_Ensure function
+   */
+  void* (*py_gil_state_ensure)() = nullptr;
+
+  /*!
+\brief PyGILState_Release function
+   */
+  void (*py_gil_state_release)(void*) = nullptr;
+
   static EnvCAPIRegistry* Global() {
 static EnvCAPIRegistry* inst = new EnvCAPIRegistry();
 return inst;
@@ -161,6 +171,10 @@ class EnvCAPIRegistry {
   Update(symbol_name, _inc_ref, fptr);
 } else if (symbol_name == "Py_DecRef") {
   Update(symbol_name, _dec_ref, fptr);
+} else if (symbol_name == "PyGILState_Ensure") {
+  Update(symbol_name, _gil_state_ensure, fptr);
+} else if (symbol_name == "PyGILState_Release") {
+  Update(symbol_name, _gil_state_release, fptr);
 } else {
   LOG(FATAL) << "Unknown env API " << symbol_name;
 }
@@ -177,15 +191,17 @@ class EnvCAPIRegistry {
   }
 
   void IncRef(void* python_obj) {
+WithGIL context(this);
 ICHECK(py_inc_ref) << "Attempted to call Py_IncRef through 
EnvCAPIRegistry, "
<< "but Py_IncRef wasn't registered";
 (*py_inc_ref)(python_obj);
   }
 
   void DecRef(void* python_obj) {
-ICHECK(py_inc_ref) << "Attempted to call Py_IncRef through 
EnvCAPIRegistry, "
-   << "but Py_IncRef wasn't registered";
-(*py_inc_ref)(python_obj);
+WithGIL context(this);
+ICHECK(py_dec_ref) << "Attempted to call Py_DefRef through 
EnvCAPIRegistry, "
+   << "but Py_DefRef wasn't registered";
+(*py_dec_ref)(python_obj

(tvm) branch unity updated: [Unity] Enhance Python Annotations for Relax Expr (#16075)

2023-11-06 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 3f1347cbd4 [Unity] Enhance Python Annotations for Relax Expr (#16075)
3f1347cbd4 is described below

commit 3f1347cbd40028f9798f9c5e9d614380d5467fc3
Author: Siyuan Feng 
AuthorDate: Mon Nov 6 16:15:44 2023 +0800

[Unity] Enhance Python Annotations for Relax Expr (#16075)

This PR enhances the Python annotations for the Relax Expr classes, adding
class member variable annotations and improving docstrings.
---
 include/tvm/relax/expr.h |   5 +-
 python/tvm/ir/expr.py|   3 +-
 python/tvm/relax/__init__.py |   1 -
 python/tvm/relax/expr.py | 180 +++
 src/relax/ir/expr.cc |   4 +-
 5 files changed, 153 insertions(+), 40 deletions(-)

diff --git a/include/tvm/relax/expr.h b/include/tvm/relax/expr.h
index 02d6f8d276..bb1b2c8dd7 100644
--- a/include/tvm/relax/expr.h
+++ b/include/tvm/relax/expr.h
@@ -567,8 +567,9 @@ class Constant : public LeafExpr {
   /*!
* \brief The constructor
* \param data The data of the constant tensor.
-   * \param struct_info_annotation The struct info of the constant tensor. If 
not specified, infer
-   * it from data. \param span The source span of the expression.
+   * \param struct_info_annotation The struct info of the constant tensor.
+   *If not specified, infer it from data.
+   * \param span The source span of the expression.
*/
   TVM_DLL explicit Constant(runtime::NDArray data,
 Optional struct_info_annotation = 
NullOpt,
diff --git a/python/tvm/ir/expr.py b/python/tvm/ir/expr.py
index b8b71666ec..9ca802d80e 100644
--- a/python/tvm/ir/expr.py
+++ b/python/tvm/ir/expr.py
@@ -16,6 +16,7 @@
 # under the License.
 """Common expressions data structures in the IR."""
 from numbers import Number
+from typing import Optional
 
 import tvm._ffi
 
@@ -54,7 +55,7 @@ class RelayExpr(BaseExpr):
 return ret
 
 @property
-def struct_info(self) -> "tvm.relax.StructInfo":
+def struct_info(self) -> Optional["tvm.relax.StructInfo"]:
 """Get the struct info field
 
 Returns
diff --git a/python/tvm/relax/__init__.py b/python/tvm/relax/__init__.py
index 09b5b965ea..5bc0d6c56e 100644
--- a/python/tvm/relax/__init__.py
+++ b/python/tvm/relax/__init__.py
@@ -23,7 +23,6 @@ from tvm.runtime.relax_vm import VirtualMachine, 
VMInstrumentReturnKind
 from .expr import (
 Expr,
 Span,
-SourceName,
 Id,
 GlobalVar,
 Var,
diff --git a/python/tvm/relax/expr.py b/python/tvm/relax/expr.py
index 7e43506e97..71f23577e7 100644
--- a/python/tvm/relax/expr.py
+++ b/python/tvm/relax/expr.py
@@ -14,8 +14,6 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# pylint: disable=invalid-name, unused-import, super-init-not-called
-# pylint: disable=redefined-builtin
 """The expression nodes of Relax."""
 import typing
 from numbers import Number
@@ -32,7 +30,7 @@ from tvm._ffi import base as _base
 from tvm.runtime import Object
 from tvm.runtime import ndarray as _nd
 
-from ..ir import BaseFunc, Node, SourceName, Span
+from ..ir import BaseFunc, Node, Span
 from ..runtime import Scriptable, String
 from ..tir import PrimExpr
 from . import _ffi_api
@@ -51,6 +49,8 @@ class Id(Object):
 Guaranteed to be stable across all passes.
 """
 
+name_hint: str
+
 def __init__(self):
 raise RuntimeError("Cannot directly construct Id")
 
@@ -92,7 +92,7 @@ class StructInfo(Node, Scriptable):
 
 
 # will be registered afterwards in python/tvm/relax/op/init.py
-_op_ffi_api = None
+_op_ffi_api = None  # pylint: disable=invalid-name
 
 
 def _binary_op_helper(lhs: "ExprWithOp", rhs: "ExprWithOp", op: Callable) -> 
"ExprWithOp":
@@ -273,6 +273,12 @@ class Call(ExprWithOp):
 Span that points to original source code
 """
 
+op: Expr
+args: List[Expr]
+attrs: tvm.ir.Attrs
+sinfo_args: List[StructInfo]
+span: Optional[Span]
+
 def __init__(
 self,
 op: Union[Expr, tvm.ir.Op],
@@ -302,9 +308,19 @@ class If(ExprWithOp):
 
 false_branch: Expr
 The expression evaluated when condition is false.
+
+span: Optional[Span]
+Span that points to original source code
 """
 
-def __init__(self, cond: Expr, true_branch: Expr, false_branch: Expr, 
span: Span = None):
+cond: Expr
+true_branch: Expr
+false_branch: Expr
+span: Optional[Span]
+
+def __init__(
+self, cond: Expr, true_branch: Expr, false_branch: Expr, span: 
Optional[S

(tvm) branch main updated: [Runtime] Support clear global memory allocators (#16066)

2023-11-06 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 02d4df781e [Runtime] Support clear global memory allocators (#16066)
02d4df781e is described below

commit 02d4df781ef14a928eca6ba522736b80929875ff
Author: Ruihang Lai 
AuthorDate: Mon Nov 6 03:15:29 2023 -0500

[Runtime] Support clear global memory allocators (#16066)

This PR adds support for clearing all of the allocated memory.

Prior to this PR, all allocated memory is managed in the memory
manager's pool. Memory in the pool is held and never freed, so the
pool size grows monotonically within a single run of a process.

While this saves allocation time, in some cases (e.g., on mobile
phones with a running-memory limit) we need to clear the pool and
free all the memory, to keep the pool from growing endlessly and to
avoid holding allocated memory that is not effectively utilized
(fragmentation).

Therefore, this PR introduces a PackedFunc that cleans up the
memory manager, releasing all the allocated memory. Runtime apps
can decide when to invoke this PackedFunc and clean up the pool.
Usually, this happens at some app "reset" or "reload" stage.
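
Since the cleanup entry point below is registered as an ordinary global
PackedFunc (`vm.builtin.memory_manager.clear`), a runtime app can look it up by
name. A minimal Python sketch of invoking it at such a "reset" point, assuming
nothing beyond what this commit registers:

```python
import tvm

# Look up the PackedFunc registered in this commit and release every
# buffer held by the memory manager's pooled allocators.
clear_pools = tvm.get_global_func("vm.builtin.memory_manager.clear")
clear_pools()
```
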
---
 include/tvm/runtime/memory/memory_manager.h |  4 
 src/runtime/memory/memory_manager.cc| 19 +++
 src/runtime/memory/pooled_allocator.h   |  2 ++
 3 files changed, 25 insertions(+)

diff --git a/include/tvm/runtime/memory/memory_manager.h 
b/include/tvm/runtime/memory/memory_manager.h
index 8b38fbf6f0..3f5f83fdb4 100644
--- a/include/tvm/runtime/memory/memory_manager.h
+++ b/include/tvm/runtime/memory/memory_manager.h
@@ -89,6 +89,8 @@ class Allocator {
*  \param buffer The buffer to free.
*/
   virtual void Free(const Buffer& buffer) = 0;
+  /*! \brief Clear the allocated memory. */
+  virtual void Clear();
   /*! \brief The amount of memory currently allocated.
*  \return The amount of memory currently allocated.
*/
@@ -119,6 +121,8 @@ class MemoryManager {
* \return The memory allocator.
*/
   static Allocator* GetAllocator(Device dev, AllocatorType type);
+  /*! \brief Clear the allocators. */
+  static void Clear();
 
  private:
   MemoryManager() {}
diff --git a/src/runtime/memory/memory_manager.cc 
b/src/runtime/memory/memory_manager.cc
index e72934ed2e..71d5a7d034 100644
--- a/src/runtime/memory/memory_manager.cc
+++ b/src/runtime/memory/memory_manager.cc
@@ -22,6 +22,7 @@
  * \brief Allocate and manage memory for the runtime.
  */
 #include 
+#include 
 
 #include 
 #include 
@@ -166,6 +167,16 @@ Allocator* MemoryManager::GetAllocator(Device dev, 
AllocatorType type) {
   return it->second.at(type).get();
 }
 
+void MemoryManager::Clear() {
+  MemoryManager* m = MemoryManager::Global();
+  std::lock_guard lock(m->mu_);
+  for (const auto& [device, allocators] : m->allocators_) {
+for (const auto& [allocator_type, allocator] : allocators) {
+  allocator->Clear();
+}
+  }
+}
+
 NDArray Allocator::Empty(ShapeTuple shape, DLDataType dtype, DLDevice dev,
  Optional mem_scope) {
   VerifyDataType(dtype);
@@ -198,6 +209,14 @@ Buffer Allocator::Alloc(Device dev, ShapeTuple shape, 
DLDataType type_hint,
   return {};
 }
 
+void Allocator::Clear() {
+  // This function by default does nothing.
+  // For naive allocator, no explicit manual clear is needed.
+  // Pooled allocator will override this method.
+}
+
+TVM_REGISTER_GLOBAL("vm.builtin.memory_manager.clear").set_body_typed(MemoryManager::Clear);
+
 }  // namespace memory
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/memory/pooled_allocator.h 
b/src/runtime/memory/pooled_allocator.h
index 01dded966b..826af49e5a 100644
--- a/src/runtime/memory/pooled_allocator.h
+++ b/src/runtime/memory/pooled_allocator.h
@@ -90,6 +90,8 @@ class PooledAllocator final : public Allocator {
 VLOG(1) << "reclaim buffer " << buffer.size;
   }
 
+  void Clear() override { ReleaseAll(); }
+
   size_t UsedMemory() const override { return 
used_memory_.load(std::memory_order_relaxed); }
 
  private:



(tvm) branch unity updated: [Unity] Loading NDArrayCache by parameter names (#16078)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 664beaaf1a [Unity] Loading NDArrayCache by parameter names (#16078)
664beaaf1a is described below

commit 664beaaf1a14b3ca176d5a0d6e3451bfc3e37420
Author: Junru Shao 
AuthorDate: Sun Nov 5 18:58:18 2023 -0800

[Unity] Loading NDArrayCache by parameter names (#16078)

This PR adds support for loading parameters from NDArrayCache ordered by
their names.
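
A minimal Python sketch of calling the new builtin; the parameter names here
are hypothetical, and the weights are assumed to have already been loaded into
NDArrayCache:

```python
import tvm

# Returns the cached NDArrays in the same order as the given names and
# fails with an error if any name is missing from the cache.
get_params = tvm.get_global_func("vm.builtin.param_array_from_cache_by_name")
params = get_params(["model.embed_tokens.weight", "model.norm.weight"])
```
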
---
 src/runtime/relax_vm/ndarray_cache_support.cc | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/src/runtime/relax_vm/ndarray_cache_support.cc 
b/src/runtime/relax_vm/ndarray_cache_support.cc
index b2f53bfe1e..ea90255fba 100644
--- a/src/runtime/relax_vm/ndarray_cache_support.cc
+++ b/src/runtime/relax_vm/ndarray_cache_support.cc
@@ -308,6 +308,19 @@ class ParamModuleNode : public runtime::ModuleNode {
 return params;
   }
 
+  static Array GetParamByName(const Array& names) {
+Array result;
+result.reserve(names.size());
+for (const String& name : names) {
+  if (Optional opt = NDArrayCache::Get(name)) {
+result.push_back(opt.value());
+  } else {
+LOG(FATAL) << "ValueError: Cannot find parameter in cache: " << name;
+  }
+}
+return result;
+  }
+
   static Module Create(const std::string& prefix, int num_params) {
 auto n = make_object();
 n->params_ = GetParams(prefix, num_params);
@@ -320,6 +333,8 @@ class ParamModuleNode : public runtime::ModuleNode {
 
 
TVM_REGISTER_GLOBAL("vm.builtin.param_module_from_cache").set_body_typed(ParamModuleNode::Create);
 
TVM_REGISTER_GLOBAL("vm.builtin.param_array_from_cache").set_body_typed(ParamModuleNode::GetParams);
+TVM_REGISTER_GLOBAL("vm.builtin.param_array_from_cache_by_name")
+.set_body_typed(ParamModuleNode::GetParamByName);
 
 }  // namespace relax_vm
 }  // namespace runtime



(tvm) branch dependabot/npm_and_yarn/web/babel/traverse-7.23.2 deleted (was 8d59e183d0)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch 
dependabot/npm_and_yarn/web/babel/traverse-7.23.2
in repository https://gitbox.apache.org/repos/asf/tvm.git


 was 8d59e183d0 Bump @babel/traverse from 7.20.5 to 7.23.2 in /web

The revisions that were on this branch are still contained in
other references; therefore, this change does not discard any commits
from the repository.



(tvm) branch dependabot/pip/apps/microtvm/django-4.1.13 deleted (was baed30369b)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch dependabot/pip/apps/microtvm/django-4.1.13
in repository https://gitbox.apache.org/repos/asf/tvm.git


 was baed30369b Bump django from 4.1.7 to 4.1.13 in /apps/microtvm

The revisions that were on this branch are still contained in
other references; therefore, this change does not discard any commits
from the repository.



(tvm) branch dependabot/pip/docker/python/pip-23.3 deleted (was bd080a97c5)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch dependabot/pip/docker/python/pip-23.3
in repository https://gitbox.apache.org/repos/asf/tvm.git


 was bd080a97c5 Bump pip from 22.1.1 to 23.3 in /docker/python

The revisions that were on this branch are still contained in
other references; therefore, this change does not discard any commits
from the repository.



(tvm) branch revert-16069-unity-dev/2023-11-03-sample-top-p deleted (was c97ac62aee)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch 
revert-16069-unity-dev/2023-11-03-sample-top-p
in repository https://gitbox.apache.org/repos/asf/tvm.git


 was c97ac62aee Revert "[Unity][Support] Sample from top-p supports offset 
(#16069)"

The revisions that were on this branch are still contained in
other references; therefore, this change does not discard any commits
from the repository.



(tvm) branch unity updated: Revert "[Unity][Support] Sample from top-p supports offset" (#16077)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 7d0e60ad0e Revert "[Unity][Support] Sample from top-p supports offset" 
(#16077)
7d0e60ad0e is described below

commit 7d0e60ad0e6640f0238a7a5d78104d562179ca40
Author: Ruihang Lai 
AuthorDate: Sun Nov 5 17:18:14 2023 -0500

Revert "[Unity][Support] Sample from top-p supports offset" (#16077)

Revert "[Unity][Support] Sample from top-p supports offset (#16069)"

This reverts commit 0ed1e30df51c50ce9ffcfe4e61115fb13cd5340e.

Co-authored-by: Junru Shao 
---
 src/runtime/relax_vm/lm_support.cc | 16 +++-
 web/src/runtime.ts |  2 +-
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/runtime/relax_vm/lm_support.cc 
b/src/runtime/relax_vm/lm_support.cc
index ba88820f08..e56a03fdea 100644
--- a/src/runtime/relax_vm/lm_support.cc
+++ b/src/runtime/relax_vm/lm_support.cc
@@ -379,13 +379,7 @@ int SampleTopPFromLogits(NDArray logits, double 
temperature, double top_p, doubl
 
 
TVM_REGISTER_GLOBAL("vm.builtin.sample_top_p_from_logits").set_body_typed(SampleTopPFromLogits);
 
-int SampleTopPFromProb(NDArray prob, int unit_offset, double top_p, double 
uniform_sample) {
-  // prob: (*, v)
-  // The prob array may have arbitrary ndim and shape.
-  // The last dimension corresponds to the prob distribution size.
-  // We use the `unit_offset` parameter to determine which slice
-  // of the prob array we sample from.
-
+int SampleTopPFromProb(NDArray prob, double top_p, double uniform_sample) {
   ICHECK(prob.IsContiguous());
   ICHECK(prob.DataType() == DataType::Float(32));
 
@@ -395,12 +389,16 @@ int SampleTopPFromProb(NDArray prob, int unit_offset, 
double top_p, double unifo
 
   ICHECK(prob->device.device_type == kDLCPU);
 
+  for (int i = 0; i < prob->ndim - 1; ++i) {
+ICHECK_EQ(prob->shape[i], 1) << "The leading dimensions of logits must be 
1";
+  }
+
   // Key observation: when we are doing top_p sampling
   // usually we only need to preserve some of the elements with
-  // high probabilities before we do sort
+  // high probablities before we do sort
   std::vector> data;
   int64_t ndata = prob->shape[prob->ndim - 1];
-  const float* p_prob = static_cast(prob->data) + (unit_offset * 
ndata);
+  const float* p_prob = static_cast(prob->data);
 
   auto sample_top_p_with_filter = [&](float cuttoff) -> int64_t {
 data.clear();
diff --git a/web/src/runtime.ts b/web/src/runtime.ts
index 137b3e2280..453d6240f3 100644
--- a/web/src/runtime.ts
+++ b/web/src/runtime.ts
@@ -1726,7 +1726,7 @@ export class Instance implements Disposable {
* @returns The sampled index.
*/
   sampleTopPFromLogits(logits: NDArray, temperature: number, top_p: number): 
number {
-return this.ctx.sampleTopPFromLogits(logits, /*unit_offset=*/0, 
temperature, top_p, Math.random());
+return this.ctx.sampleTopPFromLogits(logits, temperature, top_p, 
Math.random());
   }
 
   /**



(tvm) 01/01: Revert "[Unity][Support] Sample from top-p supports offset (#16069)"

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch 
revert-16069-unity-dev/2023-11-03-sample-top-p
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit c97ac62aeec63eea9d69083e90e0df8582b96c53
Author: Junru Shao 
AuthorDate: Sun Nov 5 14:16:03 2023 -0800

Revert "[Unity][Support] Sample from top-p supports offset (#16069)"

This reverts commit 0ed1e30df51c50ce9ffcfe4e61115fb13cd5340e.
---
 src/runtime/relax_vm/lm_support.cc | 16 +++-
 web/src/runtime.ts |  2 +-
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/runtime/relax_vm/lm_support.cc 
b/src/runtime/relax_vm/lm_support.cc
index ba88820f08..e56a03fdea 100644
--- a/src/runtime/relax_vm/lm_support.cc
+++ b/src/runtime/relax_vm/lm_support.cc
@@ -379,13 +379,7 @@ int SampleTopPFromLogits(NDArray logits, double 
temperature, double top_p, doubl
 
 
TVM_REGISTER_GLOBAL("vm.builtin.sample_top_p_from_logits").set_body_typed(SampleTopPFromLogits);
 
-int SampleTopPFromProb(NDArray prob, int unit_offset, double top_p, double 
uniform_sample) {
-  // prob: (*, v)
-  // The prob array may have arbitrary ndim and shape.
-  // The last dimension corresponds to the prob distribution size.
-  // We use the `unit_offset` parameter to determine which slice
-  // of the prob array we sample from.
-
+int SampleTopPFromProb(NDArray prob, double top_p, double uniform_sample) {
   ICHECK(prob.IsContiguous());
   ICHECK(prob.DataType() == DataType::Float(32));
 
@@ -395,12 +389,16 @@ int SampleTopPFromProb(NDArray prob, int unit_offset, 
double top_p, double unifo
 
   ICHECK(prob->device.device_type == kDLCPU);
 
+  for (int i = 0; i < prob->ndim - 1; ++i) {
+ICHECK_EQ(prob->shape[i], 1) << "The leading dimensions of logits must be 
1";
+  }
+
   // Key observation: when we are doing top_p sampling
   // usually we only need to preserve some of the elements with
-  // high probabilities before we do sort
+  // high probablities before we do sort
   std::vector> data;
   int64_t ndata = prob->shape[prob->ndim - 1];
-  const float* p_prob = static_cast(prob->data) + (unit_offset * 
ndata);
+  const float* p_prob = static_cast(prob->data);
 
   auto sample_top_p_with_filter = [&](float cuttoff) -> int64_t {
 data.clear();
diff --git a/web/src/runtime.ts b/web/src/runtime.ts
index 137b3e2280..453d6240f3 100644
--- a/web/src/runtime.ts
+++ b/web/src/runtime.ts
@@ -1726,7 +1726,7 @@ export class Instance implements Disposable {
* @returns The sampled index.
*/
   sampleTopPFromLogits(logits: NDArray, temperature: number, top_p: number): 
number {
-return this.ctx.sampleTopPFromLogits(logits, /*unit_offset=*/0, 
temperature, top_p, Math.random());
+return this.ctx.sampleTopPFromLogits(logits, temperature, top_p, 
Math.random());
   }
 
   /**



(tvm) branch revert-16069-unity-dev/2023-11-03-sample-top-p created (now c97ac62aee)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch 
revert-16069-unity-dev/2023-11-03-sample-top-p
in repository https://gitbox.apache.org/repos/asf/tvm.git


  at c97ac62aee Revert "[Unity][Support] Sample from top-p supports offset 
(#16069)"

This branch includes the following new commits:

 new c97ac62aee Revert "[Unity][Support] Sample from top-p supports offset 
(#16069)"

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.




(tvm) 01/01: Merge remote-tracking branch 'main' into unity

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity-staging
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit be8cab10e135c0da0148fe70c1b8632aad51a3cb
Merge: 151aa74370 1de5aa551d
Author: Junru Shao 
AuthorDate: Sun Nov 5 13:58:37 2023 -0800

Merge remote-tracking branch 'main' into unity

 .github/workflows/main.yml |   3 +-
 cmake/utils/FindLLVM.cmake |   9 ++
 include/tvm/runtime/packed_func.h  |  80 ++
 include/tvm/runtime/vm/executable.h|  39 +++--
 python/tvm/_ffi/base.py|  15 +-
 python/tvm/contrib/nvcc.py |  19 ++-
 .../tvm/relay/backend/contrib/ethosu/legalize.py   |   2 +-
 src/runtime/vm/executable.cc   |  81 ++
 src/script/ir_builder/tir/ir.cc|   9 +-
 src/target/opt/build_cuda_on.cc|   1 +
 src/target/source/codegen_c.cc |  47 +-
 src/target/source/codegen_c.h  |   2 +
 src/target/source/codegen_cuda.cc  |  49 ++
 src/target/source/codegen_cuda.h   |   2 +-
 src/target/source/codegen_metal.cc |  93 +++-
 src/target/source/codegen_metal.h  |   3 +-
 src/tir/schedule/concrete_schedule.cc  |   4 +-
 src/tir/schedule/primitive.h   |   2 +-
 src/tir/schedule/primitive/for_kind.cc |  19 ++-
 src/tir/transforms/inject_ptx_async_copy.cc|   7 +-
 src/tir/transforms/inject_software_pipeline.cc |   2 +-
 src/tir/transforms/ir_utils.cc |   3 +-
 src/tir/transforms/lower_cross_thread_reduction.cc |   2 +-
 src/tir/transforms/renew_defs.cc   |  10 +-
 tests/lint/pylint.sh   |   4 +
 tests/python/contrib/test_cblas.py | 165 +
 tests/python/contrib/test_ethosu/test_codegen.py   |  16 ++
 tests/python/contrib/test_tflite_runtime.py|  18 ++-
 tests/python/contrib/test_thrust.py|   6 +-
 tests/python/contrib/test_util.py  |   4 +-
 tests/python/unittest/test_micro_ms_tuning.py  |   2 +-
 tests/python/unittest/test_roofline.py |   2 +-
 .../test_runtime_module_based_interface.py |   4 +-
 tests/python/unittest/test_target_codegen_cuda.py  |  24 +++
 tests/python/unittest/test_target_codegen_metal.py |  23 +++
 tests/python/unittest/test_tir_renew_defs.py   |  22 ++-
 .../python/unittest/test_tir_schedule_for_kind.py  |  29 
 .../test_tir_transform_inject_ptx_async_copy.py|  34 +
 .../test_tir_transform_inject_software_pipeline.py |  72 +
 .../test_tir_transform_lower_opaque_block.py   |  48 ++
 tests/python/unittest/test_tvmscript_parser_tir.py |  15 ++
 tests/scripts/task_config_build_arm.sh |   2 +-
 42 files changed, 721 insertions(+), 272 deletions(-)




(tvm) branch unity-staging updated (1e2294de6c -> be8cab10e1)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity-staging
in repository https://gitbox.apache.org/repos/asf/tvm.git


 discard 1e2294de6c Merge remote-tracking branch 'main' into unity
 new be8cab10e1 Merge remote-tracking branch 'main' into unity

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (1e2294de6c)
\
 N -- N -- N   refs/heads/unity-staging (be8cab10e1)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:



(tvm) branch unity updated: [Unity] Fix FuseTIR pass for gather/take cases (#16064)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 021f31b737 [Unity] Fix FuseTIR pass for gather/take cases  (#16064)
021f31b737 is described below

commit 021f31b73720edcfcf28219642ec90b6e90062aa
Author: Siyuan Feng 
AuthorDate: Mon Nov 6 05:57:49 2023 +0800

[Unity] Fix FuseTIR pass for gather/take cases  (#16064)

* [TIR] Fix pass RenewDefs error in gather/take case

The previous implementation of the RenewDefs pass would fail in the
case of gather/take functions, because the pass visited and renewed
the read/write regions twice. This PR fixes it and adds a regression
test.

* [Unity] Fix FuseTIR pass for gather/take cases

The current implementation of the FuseTIR pass does not handle the
buffer access regions of the blocks, which may fail when the function
follows a gather or take pattern. This PR fixes the issue.
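
For reference, a minimal TVMScript sketch (hypothetical shapes and names) of
the "take/gather" access pattern both fixes target, where the read region of
one buffer depends on values stored in another:

```python
from tvm.script import tir as T


@T.prim_func
def take(A: T.Buffer((128,), "float32"),
         B: T.Buffer((64,), "int32"),
         C: T.Buffer((64,), "float32")):
    for i in range(64):
        with T.block("take"):
            vi = T.axis.spatial(64, i)
            # Data-dependent read: A's region cannot be derived from the
            # loop variables alone, which is the case RenewDefs and
            # FuseTIR previously mishandled.
            T.reads(A[0:128], B[vi])
            T.writes(C[vi])
            C[vi] = A[B[vi]]
```
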
---
 src/relax/transform/fuse_tir.cc   | 28 +++--
 src/tir/transforms/renew_defs.cc  | 10 ++-
 tests/python/relax/test_transform_fuse_tir.py | 90 ++-
 tests/python/unittest/test_tir_renew_defs.py  | 22 ++-
 4 files changed, 138 insertions(+), 12 deletions(-)

diff --git a/src/relax/transform/fuse_tir.cc b/src/relax/transform/fuse_tir.cc
index 2fb3f1d8ce..df3c85c05c 100644
--- a/src/relax/transform/fuse_tir.cc
+++ b/src/relax/transform/fuse_tir.cc
@@ -202,21 +202,27 @@ class FuseTIRBufferSubstitutor : private StmtExprMutator {
 auto f_mutate_match_buffers = [this](const MatchBufferRegion& 
match_buffer) {
   const Buffer& src_buffer = 
SubstituteBuffer(match_buffer->source->buffer);
   const Buffer& tgt_buffer = 
SubstituteAllocatedBuffer(match_buffer->buffer);
+  Region region = MutateRegion(match_buffer->source->region);
   if (src_buffer.same_as(match_buffer->source->buffer) &&
-  tgt_buffer.same_as(match_buffer->buffer)) {
+  tgt_buffer.same_as(match_buffer->buffer) &&
+  region.same_as(match_buffer->source->region)) {
 return match_buffer;
   } else {
 auto n = make_object(*match_buffer.get());
 n->buffer = tgt_buffer;
-n->source = BufferRegion(src_buffer, match_buffer->source->region);
+n->source = BufferRegion(src_buffer, region);
 return MatchBufferRegion(n);
   }
 };
 
 auto f_mutate_read_write_region = [this](const BufferRegion& 
buffer_region) {
-  auto it = buffer_remap_.find(buffer_region->buffer);
-  return it == buffer_remap_.end() ? buffer_region
-   : BufferRegion((*it).second, 
buffer_region->region);
+  const Buffer& buffer = SubstituteBuffer(buffer_region->buffer);
+  const Region& region = MutateRegion(buffer_region->region);
+  if (buffer.same_as(buffer_region->buffer) && 
region.same_as(buffer_region->region)) {
+return buffer_region;
+  } else {
+return BufferRegion(buffer, region);
+  }
 };
 
 // Step 1. Mutate `match_buffers`.
@@ -285,6 +291,18 @@ class FuseTIRBufferSubstitutor : private StmtExprMutator {
   return buffer;
 }
   }
+
+  inline Region MutateRegion(const Region& region) {
+return MutateArray(region, [this](const Range& range) {
+  const PrimExpr& min = this->VisitExpr(range->min);
+  const PrimExpr& extent = this->VisitExpr(range->extent);
+  if (min.same_as(range->min) && extent.same_as(range->extent)) {
+return range;
+  } else {
+return Range::FromMinExtent(min, extent);
+  }
+});
+  }
 };
 
 /*! \brief A mutator which detect block name duplication and deduplicate the 
names. */
diff --git a/src/tir/transforms/renew_defs.cc b/src/tir/transforms/renew_defs.cc
index fd2c27dcd1..8a122f8922 100644
--- a/src/tir/transforms/renew_defs.cc
+++ b/src/tir/transforms/renew_defs.cc
@@ -120,9 +120,11 @@ class RenewDefMutator : public StmtExprMutator {
 std::bind(::VisitMatchBuffer, this, 
std::placeholders::_1));
 
 // Step 3. Visit body
-Stmt stmt = StmtExprMutator::VisitStmt_(op);
-op = stmt.as();
-ICHECK(op);
+Optional init = NullOpt;
+if (op->init.defined()) {
+  init = this->VisitStmt(op->init.value());
+}
+Stmt body = this->VisitStmt(op->body);
 
 // Step 4. Revisit access region
 Array reads =
@@ -137,6 +139,8 @@ class RenewDefMutator : public StmtExprMutator {
 n->match_buffers = std::move(match_buffers);
 n->reads = std::move(reads);
 n->writes = std::move(writes);
+n->body = std::move(body);
+n->init = std::move(init);
 
 return Stmt(n);
   }
diff --git a/te

(tvm) branch unity-staging updated (54b2a074c9 -> 1e2294de6c)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity-staging
in repository https://gitbox.apache.org/repos/asf/tvm.git


 discard 54b2a074c9 Merge remote-tracking branch 'main' into unity
 add 0ed1e30df5 [Unity][Support] Sample from top-p supports offset (#16069)
 add 9202f4b8fc [Bugfix] Compilation Error with Clang (#16071)
 add 151aa74370 [Unity][Dlight] Metal Performance (#15985)
 add 134e73d8fd [CI] Use LLVM 17 for tests on `ci_arm` (#16062)
 add b144145c4b [CUDA] Add an option for profiling cuda kernels (#16061)
 add 1de5aa551d [TIR] Fix the thread binding iter_var dtype in `Bind` 
primitive (#16074)
 new 1e2294de6c Merge remote-tracking branch 'main' into unity

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (54b2a074c9)
\
 N -- N -- N   refs/heads/unity-staging (1e2294de6c)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 include/tvm/runtime/memory/memory_manager.h|  2 +-
 python/tvm/contrib/nvcc.py | 19 +-
 python/tvm/dlight/gpu/gemv.py  | 14 +++--
 python/tvm/dlight/gpu/utils.py |  2 +
 src/runtime/memory/memory_manager.cc   |  2 +-
 src/runtime/relax_vm/lm_support.cc | 16 ++---
 src/target/opt/build_cuda_on.cc|  1 +
 src/tir/schedule/concrete_schedule.cc  |  4 +-
 src/tir/schedule/primitive.h   |  2 +-
 src/tir/schedule/primitive/for_kind.cc | 19 --
 tests/python/dlight/test_gpu_gemv.py   | 70 +-
 tests/python/unittest/test_target_codegen_cuda.py  | 24 
 .../python/unittest/test_tir_schedule_for_kind.py  | 29 +
 tests/scripts/task_config_build_arm.sh |  2 +-
 web/src/runtime.ts |  2 +-
 15 files changed, 139 insertions(+), 69 deletions(-)



(tvm) 01/01: Merge remote-tracking branch 'main' into unity

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity-staging
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 1e2294de6c23741e57e9322a535ef733fb1a24bc
Merge: 151aa74370 1de5aa551d
Author: Junru Shao 
AuthorDate: Sun Nov 5 13:51:26 2023 -0800

Merge remote-tracking branch 'main' into unity

 .github/workflows/main.yml |   3 +-
 cmake/utils/FindLLVM.cmake |   9 ++
 include/tvm/runtime/packed_func.h  |  80 ++
 include/tvm/runtime/vm/executable.h|  39 +++--
 python/tvm/_ffi/base.py|  15 +-
 python/tvm/contrib/nvcc.py |  19 ++-
 .../tvm/relay/backend/contrib/ethosu/legalize.py   |   2 +-
 src/runtime/vm/executable.cc   |  81 ++
 src/script/ir_builder/tir/ir.cc|   9 +-
 src/target/opt/build_cuda_on.cc|   1 +
 src/target/source/codegen_c.cc |  47 +-
 src/target/source/codegen_c.h  |   2 +
 src/target/source/codegen_cuda.cc  |  49 ++
 src/target/source/codegen_cuda.h   |   2 +-
 src/target/source/codegen_metal.cc |  93 +++-
 src/target/source/codegen_metal.h  |   3 +-
 src/tir/schedule/concrete_schedule.cc  |   4 +-
 src/tir/schedule/primitive.h   |   2 +-
 src/tir/schedule/primitive/for_kind.cc |  19 ++-
 src/tir/transforms/inject_ptx_async_copy.cc|   7 +-
 src/tir/transforms/inject_software_pipeline.cc |   2 +-
 src/tir/transforms/ir_utils.cc |   3 +-
 src/tir/transforms/lower_cross_thread_reduction.cc |   2 +-
 src/tir/transforms/renew_defs.cc   |  10 +-
 tests/lint/pylint.sh   |   4 +
 tests/python/contrib/test_cblas.py | 165 +
 tests/python/contrib/test_ethosu/test_codegen.py   |  16 ++
 tests/python/contrib/test_tflite_runtime.py|  18 ++-
 tests/python/contrib/test_thrust.py|   6 +-
 tests/python/contrib/test_util.py  |   4 +-
 tests/python/unittest/test_micro_ms_tuning.py  |   2 +-
 tests/python/unittest/test_roofline.py |   2 +-
 .../test_runtime_module_based_interface.py |   4 +-
 tests/python/unittest/test_target_codegen_cuda.py  |  24 +++
 tests/python/unittest/test_target_codegen_metal.py |  23 +++
 tests/python/unittest/test_tir_renew_defs.py   |  22 ++-
 .../python/unittest/test_tir_schedule_for_kind.py  |  29 
 .../test_tir_transform_inject_ptx_async_copy.py|  34 +
 .../test_tir_transform_inject_software_pipeline.py |  72 +
 .../test_tir_transform_lower_opaque_block.py   |  48 ++
 tests/python/unittest/test_tvmscript_parser_tir.py |  15 ++
 tests/scripts/task_config_build_arm.sh |   2 +-
 42 files changed, 721 insertions(+), 272 deletions(-)




(tvm) branch unity updated: [Unity][Dlight] Metal Performance (#15985)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 151aa74370 [Unity][Dlight] Metal Performance (#15985)
151aa74370 is described below

commit 151aa7437082f4af3c2c191b4a093919546adb34
Author: Bohan Hou 
AuthorDate: Sun Nov 5 16:47:27 2023 -0500

[Unity][Dlight] Metal Performance (#15985)

Co-authored-by: Bohan Hou 
Co-authored-by: Junru Shao 
---
 python/tvm/dlight/gpu/gemv.py| 14 +---
 python/tvm/dlight/gpu/utils.py   |  2 ++
 tests/python/dlight/test_gpu_gemv.py | 70 +++-
 3 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/python/tvm/dlight/gpu/gemv.py b/python/tvm/dlight/gpu/gemv.py
index 3544719af0..76839d4166 100644
--- a/python/tvm/dlight/gpu/gemv.py
+++ b/python/tvm/dlight/gpu/gemv.py
@@ -418,11 +418,17 @@ class GEMV(ScheduleRule):
 else:
 TS, TR = 16, 32
 elif target.kind.name == "metal":
-VEC_C = 2
-LOAD_V_SHARED = True
-LOAD_V_VEC = 4
+# Note that the following tile size is tuned on M2 Ultra for 7B
+TAG_S, TAG_R = "threadIdx.x", "threadIdx.y"
+VEC_C = 4
+LOAD_V_SHARED = False
+LOAD_V_VEC = -1
 UNROLL = 256
-TS, TR = 64, 8
+if isinstance(len_S, int):
+if len_S > len_R:
+TS, TR = 1, 64
+else:
+TS, TR = 1, 256
 elif target.kind.name == "rocm":
 VEC_C = 4
 LOAD_V_SHARED = True
diff --git a/python/tvm/dlight/gpu/utils.py b/python/tvm/dlight/gpu/utils.py
index 9f9a9c5ae4..00d97ab7f1 100644
--- a/python/tvm/dlight/gpu/utils.py
+++ b/python/tvm/dlight/gpu/utils.py
@@ -53,6 +53,8 @@ def suggest_threads_per_block(
 threads = 256
 elif target.kind.name == "rocm":
 threads = 256
+elif target.kind.name == "metal":
+threads = 256
 else:
 threads = 64
 results: List[Optional[int]] = []
diff --git a/tests/python/dlight/test_gpu_gemv.py 
b/tests/python/dlight/test_gpu_gemv.py
index 7f60d5db32..83d2c3c06c 100644
--- a/tests/python/dlight/test_gpu_gemv.py
+++ b/tests/python/dlight/test_gpu_gemv.py
@@ -209,78 +209,64 @@ def test_decode_gemv_256_threads():
 def expected(lv571: T.Buffer((22016, 512), "uint32"), lv572: 
T.Buffer((22016, 128), "float16"), lv1654: T.Buffer((1, 1, 4096), "float16"), 
var_NT_matmul_intermediate: T.Buffer((1, 1, 22016), "float16")):
 T.func_attr({"tir.is_scheduled": 1, "tir.noalias": T.bool(True)})
 # with T.block("root"):
-var_NT_matmul_intermediate_rf_local = T.alloc_buffer((16, 1, 1, 
22016), "float16", scope="local")
-var_NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((8, 1, 1, 
22016), "float16", scope="local")
+var_NT_matmul_intermediate_rf_local = T.alloc_buffer((256, 1, 1, 
22016), "float16", scope="local")
+var_NT_matmul_intermediate_rf_local_1 = T.alloc_buffer((64, 1, 1, 
22016), "float16", scope="local")
 lv571_local = T.alloc_buffer((22016, 512), "uint32", scope="local")
-lv1654_shared = T.alloc_buffer((1, 1, 4096), "float16", scope="shared")
-for u_fused_ax0_fused_fused_0 in T.thread_binding(688, 
thread="blockIdx.x"):
-for u_fused_ax0_fused_fused_1 in T.thread_binding(32, 
thread="threadIdx.y"):
-for 
ax1_0_fused_ax1_1_fused_1_ax1_0_fused_ax1_1_fused_3_fused_0 in 
T.thread_binding(8, thread="threadIdx.x"):
-for ax0, ax1 in T.grid(1, 1):
-for ax2_0 in T.serial(4, 
annotations={"pragma_unroll_explicit": 256, "pragma_vectorize": 1}):
-for ax2_1 in T.thread_binding(32, 
thread="threadIdx.y"):
-for ax2_2 in T.thread_binding(8, 
thread="threadIdx.x"):
-for ax2_3 in T.vectorized(4):
-with T.block("lv1654_shared"):
-v0, v1 = T.axis.remap("SS", [ax0, 
ax1])
-v2 = T.axis.spatial(4096, ax2_0 * 
1024 + ax2_1 * 32 + ax2_2 * 4 + ax2_3)
-T.reads(lv1654[v0, v1, v2])
-T.writes(lv1654_shared[v0, v1, v2])
-lv1654_sha

(tvm) branch main updated: [TIR] Fix the thread binding iter_var dtype in `Bind` primitive (#16074)

2023-11-05 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 1de5aa551d [TIR] Fix the thread binding iter_var dtype in `Bind` 
primitive (#16074)
1de5aa551d is described below

commit 1de5aa551d8729994d326c431e2a27f6ebb233bc
Author: Siyuan Feng 
AuthorDate: Mon Nov 6 05:46:04 2023 +0800

[TIR] Fix the thread binding iter_var dtype in `Bind` primitive (#16074)

As a follow-up to #16041, this PR fixes the iter_var dtype generated
by the schedule primitive `bind`. Now the iter_var dtype is the same
as the loop_var's.

Note that this PR changes the internal (TIR-side) interface of the
bind primitive, but it does not change the user interface (the Python
side and concrete_schedule.cc side).
---
 src/tir/schedule/concrete_schedule.cc  |  4 +--
 src/tir/schedule/primitive.h   |  2 +-
 src/tir/schedule/primitive/for_kind.cc | 19 +-
 .../python/unittest/test_tir_schedule_for_kind.py  | 29 ++
 4 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/src/tir/schedule/concrete_schedule.cc 
b/src/tir/schedule/concrete_schedule.cc
index 77afcf8266..343fb76178 100644
--- a/src/tir/schedule/concrete_schedule.cc
+++ b/src/tir/schedule/concrete_schedule.cc
@@ -555,9 +555,7 @@ void ConcreteScheduleNode::Bind(const LoopRV& loop_rv, 
const String& thread_axis
 "`vthread.x`, `vthread.y` and `vthread.z` instead";
   }
   TVM_TIR_SCHEDULE_BEGIN();
-  tir::Bind(state_, this->GetSRef(loop_rv),
-IterVar(/*dom=*/Range(nullptr), /*var=*/Var(thread_axis), 
/*iter_type=*/kThreadIndex,
-/*thread_tag=*/thread_axis));
+  tir::Bind(state_, this->GetSRef(loop_rv), thread_axis);
   this->state_->DebugVerify();
   TVM_TIR_SCHEDULE_END("bind", this->error_render_level_);
 }
diff --git a/src/tir/schedule/primitive.h b/src/tir/schedule/primitive.h
index fe6280e1c4..02fb982f5e 100644
--- a/src/tir/schedule/primitive.h
+++ b/src/tir/schedule/primitive.h
@@ -303,7 +303,7 @@ TVM_DLL void Vectorize(ScheduleState self, const StmtSRef& 
loop_sref);
  * \param loop_sref The sref of the loop to be bound to the thread axis
  * \param thread_axis The thread axis to be bound to the loop
  */
-TVM_DLL void Bind(ScheduleState self, const StmtSRef& loop_sref, const 
IterVar& thread_axis);
+TVM_DLL void Bind(ScheduleState self, const StmtSRef& loop_sref, const String& 
thread_axis);
 /*!
  * \brief Unroll the input loop. It requires nothing
  * \param self The state of the schedule
diff --git a/src/tir/schedule/primitive/for_kind.cc 
b/src/tir/schedule/primitive/for_kind.cc
index 02d8866e8e..9690cd78c8 100644
--- a/src/tir/schedule/primitive/for_kind.cc
+++ b/src/tir/schedule/primitive/for_kind.cc
@@ -144,7 +144,7 @@ void CheckParallelizability(const ScheduleState& self, 
const For& loop, ForKind
  * `for_kind` is `kThreadBinding`
  */
 void ParallelizeComputation(const ScheduleState& self, const StmtSRef& 
loop_sref, ForKind for_kind,
-Optional thread_axis) {
+Optional thread_axis) {
   const ForNode* loop = TVM_SREF_TO_FOR(loop_sref);
 
   /*
@@ -164,14 +164,21 @@ void ParallelizeComputation(const ScheduleState& self, 
const StmtSRef& loop_sref
   // Step 2. Check whether the loop can be parallelized/vectorized/bound with 
regard to each
   // underlying block.
   CheckParallelizability(self, GetRef(loop), for_kind,
- thread_axis.defined()
- ? 
runtime::ThreadScope::Create(thread_axis.value()->thread_tag)
- : runtime::ThreadScope{-1, -1});
+ thread_axis.defined() ? 
runtime::ThreadScope::Create(thread_axis.value())
+   : runtime::ThreadScope{-1, -1});
 
   // Step 3. Loop update and IR replacement
   ObjectPtr new_loop = make_object(*loop);
   new_loop->kind = for_kind;
-  new_loop->thread_binding = std::move(thread_axis);
+  if (thread_axis.defined()) {
+const String& thread_tag = thread_axis.value();
+new_loop->thread_binding = IterVar(/*dom=*/Range(nullptr), 
   //
+   /*var=*/Var(thread_axis.value(), 
loop->loop_var.dtype()),  //
+   /*iter_type=*/kThreadIndex, 
   //
+   /*thread_tag=*/thread_axis.value());
+  } else {
+new_loop->thread_binding = NullOpt;
+  }
   self->Replace(loop_sref, For(new_loop), {});
 }
 
@@ -183,7 +190,7 @@ void Vectorize(ScheduleState self, const StmtSRef& 
loop_sref) {
   Parall

(tvm) branch unity updated: [Bugfix] Compilation Error with Clang (#16071)

2023-11-04 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 9202f4b8fc [Bugfix] Compilation Error with Clang (#16071)
9202f4b8fc is described below

commit 9202f4b8fcf3b09ccf9e7f80f1ac8f0166c1dbe6
Author: Junru Shao 
AuthorDate: Sat Nov 4 18:26:58 2023 -0700

[Bugfix] Compilation Error with Clang (#16071)

This PR fixes compilation error from macOS's clang due to recent change.

```
/Users/jshao/Projects/tvm-dev/src/runtime/relax_vm/builtin.cc:357:48: note: 
in instantiation of function template specialization 
'tvm::runtime::Registry::set_body_method' requested here
  357 | 
TVM_REGISTER_GLOBAL("vm.builtin.alloc_tensor").set_body_method(::AllocNDArray);
  |^
/Users/jshao/Projects/tvm-dev/include/tvm/runtime/packed_func.h:546:3: 
note: candidate function
  546 |   operator double() const {
  |   ^
/Users/jshao/Projects/tvm-dev/include/tvm/runtime/packed_func.h:556:3: 
note: candidate function
  556 |   operator int64_t() const {
  |   ^
/Users/jshao/Projects/tvm-dev/include/tvm/runtime/packed_func.h:560:3: 
note: candidate function
  560 |   operator uint64_t() const {
  |   ^
/Users/jshao/Projects/tvm-dev/include/tvm/runtime/packed_func.h:564:3: 
note: candidate function
  564 |   operator int() const {
  |   ^
/Users/jshao/Projects/tvm-dev/include/tvm/runtime/packed_func.h:570:3: 
note: candidate function
  570 |   operator bool() const {
  |   ^
```
---
 include/tvm/runtime/memory/memory_manager.h | 2 +-
 src/runtime/memory/memory_manager.cc| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/tvm/runtime/memory/memory_manager.h 
b/include/tvm/runtime/memory/memory_manager.h
index 8b38fbf6f0..763c0dc1dd 100644
--- a/include/tvm/runtime/memory/memory_manager.h
+++ b/include/tvm/runtime/memory/memory_manager.h
@@ -136,7 +136,7 @@ class StorageObj : public Object {
   Buffer buffer;
 
   /*! \brief Allocate an NDArray from a given piece of storage. */
-  NDArray AllocNDArray(size_t offset, ShapeTuple shape, DLDataType dtype);
+  NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype);
 
   /*! \brief The deleter for an NDArray when allocated from underlying 
storage. */
   static void Deleter(Object* ptr);
diff --git a/src/runtime/memory/memory_manager.cc 
b/src/runtime/memory/memory_manager.cc
index e72934ed2e..4ceecf99be 100644
--- a/src/runtime/memory/memory_manager.cc
+++ b/src/runtime/memory/memory_manager.cc
@@ -82,7 +82,7 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
   return align;
 }
 
-NDArray StorageObj::AllocNDArray(size_t offset, ShapeTuple shape, DLDataType 
dtype) {
+NDArray StorageObj::AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType 
dtype) {
   VerifyDataType(dtype);
 
   // crtical zone: allocate header, cannot throw



(tvm) branch main updated: [CI] Use LLVM 17 for tests on `ci_arm` (#16062)

2023-11-04 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
 new 134e73d8fd [CI] Use LLVM 17 for tests on `ci_arm` (#16062)
134e73d8fd is described below

commit 134e73d8fdf56171fbd80e9ccc7c62bb35b4e755
Author: Luke Hutton 
AuthorDate: Sat Nov 4 22:40:10 2023 +

[CI] Use LLVM 17 for tests on `ci_arm` (#16062)

Changes the config script to build TVM with LLVM 17.

Change-Id: Id7ff105881272f8bcd6e1fc9b1bfefa49b84dc07
---
 tests/scripts/task_config_build_arm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/scripts/task_config_build_arm.sh 
b/tests/scripts/task_config_build_arm.sh
index 81813e0398..6aa53f5100 100755
--- a/tests/scripts/task_config_build_arm.sh
+++ b/tests/scripts/task_config_build_arm.sh
@@ -28,7 +28,7 @@ echo set\(USE_RPC ON\) >> config.cmake
 echo set\(USE_MICRO ON\) >> config.cmake
 echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake
 echo set\(USE_PROFILER ON\) >> config.cmake
-echo -e 'find_program(LLVM_CONFIG "llvm-config")\nif (LLVM_CONFIG) 
\n\tset(USE_LLVM llvm-config) \nelse() \n\tset(USE_LLVM 
llvm-config-16)\nendif()' >> config.cmake
+echo set\(USE_LLVM llvm-config-17\) >> config.cmake
 echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake
 echo set\(USE_VTA_FSIM ON\) >> config.cmake
 echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake



(tvm) branch unity updated: [Unity][Support] Sample from top-p supports offset (#16069)

2023-11-04 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a commit to branch unity
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/unity by this push:
 new 0ed1e30df5 [Unity][Support] Sample from top-p supports offset (#16069)
0ed1e30df5 is described below

commit 0ed1e30df51c50ce9ffcfe4e61115fb13cd5340e
Author: Ruihang Lai 
AuthorDate: Sat Nov 4 18:39:19 2023 -0400

[Unity][Support] Sample from top-p supports offset (#16069)

This PR enhances the `SampleTopPFromProb` function, adding a new
`unit_offset` parameter. With this PR, the input prob array can
contain multiple probability distributions, and the offset specifies
which distribution to sample from.

Prior to this PR, this function assumed that the input prob array
contained only a single probability distribution. This assumption,
though effective, prevented efficient sampling in batching scenarios,
since it forced us to split the batched prob array before sampling.

This PR eliminates this issue by introducing the offset.
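
To make the indexing concrete, a small NumPy sketch (illustrative only; batch
and vocab sizes are made up) of how `unit_offset` selects one distribution out
of a batched prob array whose last dimension is the distribution size:

```python
import numpy as np

batch, vocab = 4, 32000
prob = np.random.rand(batch, vocab).astype("float32")
prob /= prob.sum(axis=-1, keepdims=True)

unit_offset = 2                    # which (*, v) slice to sample from
ndata = prob.shape[-1]
flat = prob.reshape(-1)
# Mirrors the C++ side: p_prob = prob->data + unit_offset * ndata
one_dist = flat[unit_offset * ndata : (unit_offset + 1) * ndata]
assert np.allclose(one_dist, prob[unit_offset])
```
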
---
 src/runtime/relax_vm/lm_support.cc | 16 +---
 web/src/runtime.ts |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/runtime/relax_vm/lm_support.cc 
b/src/runtime/relax_vm/lm_support.cc
index e56a03fdea..ba88820f08 100644
--- a/src/runtime/relax_vm/lm_support.cc
+++ b/src/runtime/relax_vm/lm_support.cc
@@ -379,7 +379,13 @@ int SampleTopPFromLogits(NDArray logits, double 
temperature, double top_p, doubl
 
 
TVM_REGISTER_GLOBAL("vm.builtin.sample_top_p_from_logits").set_body_typed(SampleTopPFromLogits);
 
-int SampleTopPFromProb(NDArray prob, double top_p, double uniform_sample) {
+int SampleTopPFromProb(NDArray prob, int unit_offset, double top_p, double 
uniform_sample) {
+  // prob: (*, v)
+  // The prob array may have arbitrary ndim and shape.
+  // The last dimension corresponds to the prob distribution size.
+  // We use the `unit_offset` parameter to determine which slice
+  // of the prob array we sample from.
+
   ICHECK(prob.IsContiguous());
   ICHECK(prob.DataType() == DataType::Float(32));
 
@@ -389,16 +395,12 @@ int SampleTopPFromProb(NDArray prob, double top_p, double 
uniform_sample) {
 
   ICHECK(prob->device.device_type == kDLCPU);
 
-  for (int i = 0; i < prob->ndim - 1; ++i) {
-ICHECK_EQ(prob->shape[i], 1) << "The leading dimensions of logits must be 
1";
-  }
-
   // Key observation: when we are doing top_p sampling
   // usually we only need to preserve some of the elements with
-  // high probablities before we do sort
+  // high probabilities before we do sort
   std::vector> data;
   int64_t ndata = prob->shape[prob->ndim - 1];
-  const float* p_prob = static_cast(prob->data);
+  const float* p_prob = static_cast(prob->data) + (unit_offset * 
ndata);
 
   auto sample_top_p_with_filter = [&](float cuttoff) -> int64_t {
 data.clear();
diff --git a/web/src/runtime.ts b/web/src/runtime.ts
index 453d6240f3..137b3e2280 100644
--- a/web/src/runtime.ts
+++ b/web/src/runtime.ts
@@ -1726,7 +1726,7 @@ export class Instance implements Disposable {
* @returns The sampled index.
*/
   sampleTopPFromLogits(logits: NDArray, temperature: number, top_p: number): 
number {
-return this.ctx.sampleTopPFromLogits(logits, temperature, top_p, 
Math.random());
+return this.ctx.sampleTopPFromLogits(logits, /*unit_offset=*/0, 
temperature, top_p, Math.random());
   }
 
   /**



(tvm) branch unity-staging updated (393aaa350f -> 54b2a074c9)

2023-11-04 Thread junrushao
This is an automated email from the ASF dual-hosted git repository.

junrushao pushed a change to branch unity-staging
in repository https://gitbox.apache.org/repos/asf/tvm.git


from 393aaa350f Fix after merge
 add 53ccf18625 [Unity] Allow Pipeline Registration (#16008)
 add 6936829aac [Unity] Remove end-of-life handling from 
StaticPlanBlockMemory (#15841)
 add d93260824b [Unity][UnitTest] Enable BindParams test for R.Prim (#15978)
 add 90bb10e2bf [Unity][nn.Module] Support Parameter Packing (#16007)
 add 16af021e3d [Unity][nn.Module] Support `nn.SourceModule` (#16006)
 add 7486476b6f [Unity] Deterministic Ordering when Iterating 
IRModule::functions (#16020)
 add 8c7aaa6a4f [Unity][UnitTest] Cleanup test_vm_build.py (#15981)
 add 49a3a51ff1 [Unity] Ensure one VM register for each relax binding 
(#15855)
 add a9c81a7cc1 [Unity] Replace relax_vm/memory_manager with 
memory/memory_manager (#15882)
 add 3c8603789e [Unity] Avoid Emitting Redandunt Bindings in TensorExpr Op 
(#16026)
 add 0f8186f7dc [Fix] Windows Build (#16028)
 add 853732e5ef [Unity] Support getting variable mapping for FunctionCopier 
(#16012)
 add 7833f4ec29 [Unity][BlockBuilder] Allow emitting nested tuple (#15993)
 add a801064b5b [Unity] Alias IntTuple <= ShapeTuple (#16035)
 add 23371cadb4 [Unity][MSC][M1.4] Add Runner and test with relax (#15997)
 add 2329b1a9a9 [Fix] Update mutator name rule (#16046)
 add 57597f62b4 [Fix][TIR]fix symbolic strides lower (#16000)
 add c6f2816b58 [FFI][Python] Handle error propagation when line number is 
missing (#15955)
 add 5ac2d1a219 [Pylint] fix pylint issues for cblas (#16015)
 add c4c0a492af [Tests] Fix str vs. int comparison in test_num_threads 
(#16017)
 add 5b561a6059 [Tests] Check int8+int32 testcases in 
test_estimate_peak_flops_cpu (#16019)
 add ba0179f2f8 [Tests] Fix work_dir location used by 
test_micro_tuning_with_meta_schedule (#16018)
 add d83cd217a5 [microNPU][ETHOSU] Fix ConcatRewriter args processing 
(#16003)
 add 878a61105e [Fix][TIR]fix mul dtype mismatch (#16010)
 add 7a50c36f1c [Codegen] Add shuffle for cuda and metal (#15998)
 add bd3e8bb9cf [Runtime] Introduce `TVM_MODULE_VTABLE` Macros (#16030)
 add c3ce474dc1 [Pylint] fix pylint issues for thrust_runtime 
(#16023)
 add 043f147328 [Codegen][Metal] Disable cross-function call in Metal 
codegen (#16033)
 add 4b29f25613 [TVMScript] Fix mismatched dtype of IterVar in 
`T.thread_binding` (#16041)
 add c9fb87fd0e [TIR] Fix software pipeline with dynamic loop extent 
(#16027)
 add 3b8d1a831d [CMake] Use llvm-config to locate Findzstd.cmake (#16032)
 add 9df0683701 [TIR] Fix pass RenewDefs error in gather/take case (#16063)
 new 54b2a074c9 Merge remote-tracking branch 'main' into unity

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .github/workflows/main.yml |   3 +-
 cmake/utils/FindLLVM.cmake |   9 +
 include/tvm/runtime/container/shape_tuple.h|   5 +
 include/tvm/runtime/disco/session.h|   2 +-
 include/tvm/runtime/packed_func.h  |  80 ++
 include/tvm/runtime/relax_vm/memory_manager.h  | 152 
 include/tvm/runtime/relax_vm/vm.h  |   9 +-
 include/tvm/runtime/vm/executable.h|  39 +-
 include/tvm/topi/transform.h   |   2 -
 python/tvm/_ffi/base.py|  15 +-
 python/tvm/contrib/cc.py   |  18 +-
 python/tvm/contrib/msc/core/codegen/codegen.py |   2 +-
 .../contrib/msc/core/{ir => frontend}/__init__.py  |   3 +-
 .../contrib/msc/core/{ir => frontend}/translate.py |  41 +-
 python/tvm/contrib/msc/core/ir/__init__.py |   1 -
 .../tvm/contrib/msc/core/runtime}/__init__.py  |   3 +-
 python/tvm/contrib/msc/core/runtime/runner.py  | 818 +
 python/tvm/contrib/msc/core/utils/__init__.py  |   2 +
 python/tvm/contrib/msc/core/utils/dataset.py   |  78 +-
 python/tvm/contrib/msc/core/utils/file.py  | 112 ++-
 python/tvm/contrib/msc/core/utils/info.py  | 166 -
 python/tvm/contrib/msc/core/utils/log.py   | 132 
 python/tvm/contrib/msc/core/utils/message.py   | 133 
 python/tvm/contrib/msc/core/utils/namespace.py |   4 +
 python/tvm/contrib/msc/framework/__init__.py   |   2 +-
 .../msc/framework/tensorflow/codegen/codegen.py|   9 +-
 .../msc/framework/tensorflow/frontend/__init__.py  |   2 +
 .../msc/framework/tensorflow/frontend/translate.py |  15 +-
 .../msc/framework/tensorrt/codegen/codegen.py  |   4 +-
 .../msc/framework/tensorrt/frontend/__init__.py|   2 +
 .../msc/framework/t
