This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a change to branch unity-staging
in repository https://gitbox.apache.org/repos/asf/tvm.git
from 5c8b7af59c [Unity] Fix lint during for upstream black
add 1527bfba04 [TVMScript][Bugfix] Tuple on the RHS of AssignDoc (#14452)
add 97ab25c33f [Unity][BYOC] Faster cutlass codegen (#14465)
add db01567b09 [Unity][Op] introduce `shape_to_tensor` op (#14447)
add ea56542f96 [Unity][CI] Update gpu and lint image (#14473)
add 751da24bce [Unity][BYOC] Fix `RunCodegen` pass on symbolic shape
(#14472)
add e54e04d520 [Unity][BYOC] Add cuBLAS backend (#14291)
add 5c2058dc3c [Unity][OP] Add `arange` op (#14463)
add 3b8aa040e4 [Unity][op] Relax rsqrt and sinh (#14479)
add aec773d690 [Unity][VM] Add Attention KV cache builtin (#14478)
add b7d3762f39 [Unity][Hexagon] Don't pass raw scalars in
hexagon/qnn/nn.py (#14474)
add 6bcd375369 [Unity] Torch-like NN module enhancement (#14499)
add 1eea30d54a [Unity] LiftTransformParams with symbolic shape robustness
(#14500)
add e93ee9fc56 [Unity][Op] vm.alloc_tensor infer struct info (#14503)
add 22878a52fa [Unity][Refactor] Use NameSupply for unique symbol
generation and remove NameTable (#14507)
add d9f3aa32ef [Unity][Analysis] Get symbolic TIR vars from struct info
(#14509)
add 8e4f94a689 [Unity][Op] Extend `relax.op.take` op to match behavior
with `topi.take`. (#14481)
add bd003e847b [Unity] Memory planning with TIR var upper bound (#14511)
add 3bd516701c [Unity] handle bf16 in dump_ndarray_cache and
load_ndarray_cache (#14514)
add a92258ec55 [Unity] Support Simple Dynamic-Shape-Aware in FuseTIR
(#14515)
add 8474255a04 [Fix] symbolic thread extent program compilation (#14516)
add 603f8bd721 [Unity][Op] Negative Log Likelihood Loss (#14517)
add bb479e66dd [Unity][Transform] Some Improvements on pass DecomposeOps
(#14512)
add cea447cf37 [Unity][Graph matching] Improved matching algorithm and
implementation (#14501)
add 34695afa5c [Unity][Op] introduce `ScatterElement` op (#14493)
add 8ea976276c [Unity] Properly handle tuple-outputting function in
`FuseOpsByPattern` (#14525)
add d5182388a5 [Unity][Op] Gradient functions for high-level Relax
operators (#14527)
add 0c447d6f9c [Unity][Transform] High-level reverse-mode automatic
differentiation pass (#14542)
add 97f4db0f91 [Unity] DefaultGPUSchedule working for targets other than
CUDA (#14540)
add b816aa26db [Unity][Pass] Enhance Dynamic-aware FuseOps (#14543)
add add45b5c1c [Unity] Make FuseOps work on a call_tir which has a
ShapeExpr arg (#14553)
add ac645b2348 [Unity] Fix emit_te with symbolic input (#14552)
add d6c8f7933b [Unity] Preserve symbolic var args when applying call_tir
(#14555)
add a6831ba9c4 [Unity] Enable pod args in WebGPU (#14560)
add 876ed385ce [Unity][Op] Symbolic shape support of take grad (#14559)
add fadf144f27 [Unity][Op] add einsum and flip in Relax (#14545)
add 8b4dcfd1f1 [Unity][TOPI] Symbolic shape support for `collapse_sum`
(#14535)
add c51bd8b76d [Unity] Enhance Dynamic-aware FuseTIR (#14577)
add ddbd3d7ec0 [Unity][TVMScript] Update struct_info for GlobalVar (#14579)
add eeae66b301 [Unity][MetaSchedule] Skip Scheduled PrimFuncs in Task
Generation (#14402)
add 88f5b8f590 [Unity][PyTorch] Disable gradient during dynamo subgraph
capture to save RAM (#14602)
add 61fbf4262d [Unity] Fix FuseTIR when the same buffer is read multiple
times with different access pattern (#14603)
add e416226bd7 [Unity][MetaSchedule] BlockCollector focusing on current
func (#14595)
add 7db0b984de [Unity][Op] Dynamic Strided Slice (#14548)
add a59c09de77 [Unity] Add pass for combining parallel matmul (#14583)
add 66e18fbe1f [Bugfix][TVMScript] Handle LetStmt for `var1 = var2`
expressions (#14320)
add 6006d25f43 [TensorIR][Schedule] New primitive `reorder_block_itervar`
(#14448)
add 8dea77a9ff [TOPI] dynamic externsion (#14450)
add 2c052b2067 [Frontend][Oneflow] Use FLOW_2_STR_DTYPE for dtype (#14454)
add 76c8e66211 [Hexagon][TOPI] Use IndexMap axis separator instead of TE
(#14459)
add 99a5734a9e [TIR] Add merge primitive for TIR schedule (#14398)
add 4d7e890407 [testing] Use tuples for numpy indexing (#14476)
add 44dd6445ef [TensorIR] Support for L2 prefetch async copy and
pred_guard enabled async in vectorized if_then_else (#14329)
add f8f7bc8946 [CI] Fix broken model link (#14458)
add f5db8b7ab5 [Bugfix] Conv1Dtranspose default kernel layout should be
IOW (#14482)
add dba987cae0 [Arith] Simplifications for floormod(x, 2) (#13936)
add 579d999653 [pytest] Don't return values from test_* functions (#14475)
add 25ec6460ce [TIR] Use same DataType of builtin::tvm_struct_set in C++
and Python (#14489)
add 9dcd40d61f [microNPU] Add support for ResizeNearestNeighbor with
half_pixel_centers=True (#14401)
add 4b6e635825 [LLVM] Add guard for #include
<llvm/Transforms/IPO/PassManagerBuilder.h> (#14469)
add deb11d384e [TIR] Use IRModuleNode::Remove to remove None in
PrimFuncPass (#14494)
add 73ca486d2d [LLVM] Add missing `override` to GetFormat and
GetPropertyMask (#14470)
add e51ba294d9 [ACL] Prevent offloading of per-channel quantized operators
(#14484)
add 287cd38651 [TIR] Improved SeqStmt::Flatten utility (#14497)
add 7a73254590 [Relay] Check if the attribute "name" exists before
accessing it (#14485)
add fd7e3643fb [QNN] Convert fake quantized take to quantized op (#14506)
add 6caf08589b [Test][Topi] Avoid depending on f32 rounding behavior for
crop_and_divide tests (#13773)
add 28206d89d7 [TIR] Merged kDeviceThreadAxis and
kUseDynamicSharedMemoryTag (#14495)
add 1113de2ce1 [relay] preserve the order of input_info of pytorch (#14462)
add 5239ec05e9 [TIR] [Schedule] Add get_output_blocks primitive (#14490)
add af39b3441d [Node] Utility methods for ObjectPathPair handling (#14498)
add 11c13ace0b [TVMScript] IRModule TVMScript Parser.
add ff5118f398 [TVMScript] Expose IRModule::attrs as I.module_attrs
add b228037a29 Expose attrs argument of "ir.IRModule" to Rust bindings
add 460374fed5 [TOPI] Support symbolic shape in einsum (#14521)
add b98d036660 [Runtime] Runtime module property mask for Metal and Vulkan
add e1b49c8cb6 [TOPI] Fix data race of batch multibox detection (#14343)
add 2eeb37eef0 [Arith][Bugfix] Simplify "x - 1 < y" into "x <= y" (#14528)
add 7e232264b3 [LLVM] Use DataLayout::getABITypeAlign instead of
getABITypeAlignment
add 4d59c959aa [Target] Add A10G gpu cuda tag (#14467)
add e8cd33b601 [TIR] Update SplitHostDevice to post-process with
ConvertSSA (#14496)
add 17bd178bfa [Docs] Fix MetaSchedule Docs (#14480)
add 2a23d5960b [CI] Pin sccache version to 0.3.3 (#14530)
add f990c0b09b [Target] Fix Jetson AGX Xavier CPU core count (#14508)
add 8e9216013c [hexagon] Hexagon inference fix (#14533)
add a84a2cbe07 [ARITH] Enhance CanProve to handle symbolic bound (#14523)
add 15f9be5449 [TOPI] Expose `topi::collapse_sum` to Python and support
symbolic shape (#14541)
add 0a0cbd69f5 [Target] Add Apple M1 GPU tag with 256-thread restriction
(#14539)
add c581fe3802 Update to v0.13.dev0 (#14544)
add 29d525bfc5 [ARITH] Enhance CanonicalSimplify to Simplify ProdDiv
(#14538)
add 4e07a8ed66 [TOPI] remove the i32 cast for output shape of pool (#14549)
add 6ef73e0cdb [TVMScript] Distinguish between void* and handle (#14488)
add a7a1980480 [CI] Update ci_cpu image and build with llvm-15 (#14466)
add f79e4ebf30 [Bugfix][Topi] Output strides in pack_buffer() utility
(#14566)
add e5fc9f6add [skip ci][COMMUNITY] Zihao Ye -> Committer (#14578)
add 515583ce28 [AutoTVM] New rank-binary loss_type for the new xgboost >=
2.0.0 behaviour (#14468)
add 51dcafb4ab [microNPU][ETHOSU] Add restrictions to convert to NHCWB16
layout in LayoutOptimization pass (#14464)
add 3a3118a754 feat: use spot instances for ci with on demand as a backup
(#14477)
add b5c71bdab1 [Docs] Fix typo in the Vitis AI Integration docs (#14585)
add 1c5442d2e9 [Codegen][LLVM] Remove cast to i8* in builtin::address_of
(#14563)
add 1db4464c08 [Fix][TIR][Analysis] Reduction block checking alloc_buffers
(#14589)
add 9fb9fd6898 [TIR] Use String instead of StringImm for
AttrStmtNode::node (#14491)
add fb2ae1a676 [Arith] Fix solve inequality of unbound var ranges (#14582)
add f622e7f180 [ARITH][BUGFIX] Fix a bug of iter map floormod(x,2)
simplify (#14571)
add 8554e7afb4 [Bugfix] [Relay] fix a bug caused by IncompleteTypeNode in
EinsumRel while doing MergeComposite (#14556)
add f28fcd1239 [TensorIR] Fix ComputeAt with perfect symbolic bound
(#14592)
add 7766f3c51e [Object] Implemented .as<T> for ObjectRef param, returns
Optional<T> (#14522)
add ca7c3d8a14 [LLVM] Expand tvm::Type to DWARF conversion (#14568)
add 40af75b61f [Fix][TIR] UnifyThreadBinding creating unit loop with
annotation (#14588)
add 1c52e633c7 [TIR][Schedule] Method returning the function being worked
on (#14593)
add 17f7db16f5 [ARITH] Enhance IterMapSimplify for symbolic (#14547)
add aee57f682f [MetaSchedule][ARM] Beautification of the function names
(#14584)
add c1d1e9ffb8 [TIR] Add CUDA int4 tensor core intrinsics (#14598)
add 3ef745c1cd [CI] Add JAX deps in Dockerfiles (#14550)
add b4c1995a98 [Node] Allow alternative root names in ObjectPath::Root()
(#14569)
add 48d9165263 [microNPU] Fix skip tests when Vela is not present (#14587)
add ab93b31d0d [ARITH][TensorIR] Improve CompactBufferRegion for symbolic
shape (#14596)
add b1ab4dc1d5 [LLVM] Validate generated LLVM module before optimization
(#14564)
add 742c5eec74 [MetaSchedule] Handle cases when no features found by
FeatureExtractor (#14591)
add 606e2b738a [Frontend][Paddle] [PaddlePaddle Hackathon 4]add attribute
support for dropout/hard_sigmoid/pixel_shuffle (#14575)
new f762b4e833 [MERGE] Bring changes from main into unity 2023-04-12
The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
CONTRIBUTORS.md | 1 +
apps/benchmark/adreno/adreno_gpu_bench_clml.py | 36 +-
apps/benchmark/adreno/adreno_gpu_bench_texture.py | 36 +-
ci/jenkins/docker-images.ini | 2 +-
ci/jenkins/generated/arm_jenkinsfile.groovy | 110 +-
ci/jenkins/generated/cortexm_jenkinsfile.groovy | 154 +-
ci/jenkins/generated/cpu_jenkinsfile.groovy | 88 +-
ci/jenkins/generated/docker_jenkinsfile.groovy | 16 +-
ci/jenkins/generated/gpu_jenkinsfile.groovy | 167 +-
ci/jenkins/generated/hexagon_jenkinsfile.groovy | 110 +-
ci/jenkins/generated/i386_jenkinsfile.groovy | 55 +-
ci/jenkins/generated/lint_jenkinsfile.groovy | 12 +-
.../generated/minimal_cross_isa_jenkinsfile.groovy | 33 +-
ci/jenkins/generated/minimal_jenkinsfile.groovy | 33 +-
ci/jenkins/generated/riscv_jenkinsfile.groovy | 33 +-
ci/jenkins/generated/wasm_jenkinsfile.groovy | 22 +-
ci/jenkins/templates/utils/Prepare.groovy.j2 | 4 +-
ci/jenkins/templates/utils/base.groovy.j2 | 6 +-
ci/jenkins/templates/utils/macros.j2 | 23 +-
ci/jenkins/unity_jenkinsfile.groovy | 4 +-
cmake/modules/CUDA.cmake | 4 +-
conda/recipe/meta.yaml | 2 +-
docker/Dockerfile.ci_cpu | 4 +
docker/Dockerfile.ci_gpu | 3 +
.../install/ubuntu_install_jax.sh | 23 +-
docker/install/ubuntu_install_sccache.sh | 2 +-
docs/how_to/deploy/vitis_ai.rst | 2 +-
docs/reference/api/python/index.rst | 1 +
.../api/python/{index.rst => meta_schedule.rst} | 39 +-
.../how_to/deploy_models/deploy_model_on_adreno.py | 38 +-
gallery/how_to/tune_with_autotvm/tune_relay_arm.py | 24 +-
.../how_to/tune_with_autotvm/tune_relay_cuda.py | 24 +-
.../tune_with_autotvm/tune_relay_mobile_gpu.py | 24 +-
gallery/how_to/tune_with_autotvm/tune_relay_x86.py | 24 +-
gallery/tutorial/autotvm_relay_x86.py | 39 +-
include/tvm/arith/analyzer.h | 40 +-
include/tvm/arith/int_set.h | 16 +
include/tvm/arith/iter_affine_map.h | 3 +-
include/tvm/ir/name_supply.h | 46 +-
include/tvm/node/object_path.h | 6 +-
include/tvm/node/structural_equal.h | 53 +-
include/tvm/relax/analysis.h | 23 +
include/tvm/relax/attrs/linear_algebra.h | 9 +
include/tvm/relax/attrs/manipulate.h | 35 +-
include/tvm/relax/attrs/nn.h | 15 +
include/tvm/relax/attrs/statistical.h | 15 +
include/tvm/relax/binding_rewrite.h | 6 +-
include/tvm/relax/block_builder.h | 7 +-
include/tvm/relax/dataflow_matcher.h | 19 +-
include/tvm/relax/dataflow_pattern.h | 10 +-
include/tvm/relax/op_attr_types.h | 12 +
include/tvm/relax/transform.h | 62 +-
include/tvm/relax/utils.h | 62 -
include/tvm/relay/attrs/nn.h | 6 +-
include/tvm/runtime/c_runtime_api.h | 2 +-
include/tvm/runtime/container/optional.h | 9 +
include/tvm/runtime/object.h | 39 +-
include/tvm/script/ir_builder/tir/ir.h | 18 +-
include/tvm/tir/function.h | 46 +-
include/tvm/tir/schedule/schedule.h | 27 +
include/tvm/tir/stmt.h | 36 +-
include/tvm/tir/transform.h | 13 +
include/tvm/topi/detail/extern.h | 2 +-
include/tvm/topi/nn/dilate.h | 3 +-
include/tvm/topi/nn/pooling.h | 58 +-
include/tvm/topi/reduction.h | 19 +-
include/tvm/topi/transform.h | 58 +-
python/tvm/_ffi/libinfo.py | 2 +-
python/tvm/arith/__init__.py | 3 +-
python/tvm/arith/analyzer.py | 27 +
python/tvm/arith/iter_affine_map.py | 43 +
python/tvm/autotvm/testing/tune_relay.py | 2 +-
python/tvm/autotvm/tuner/xgboost_cost_model.py | 45 +-
python/tvm/autotvm/tuner/xgboost_tuner.py | 4 +-
python/tvm/contrib/cutlass/build.py | 19 +-
python/tvm/contrib/torch/pytorch_tvm.py | 24 +-
python/tvm/contrib/tvmjs.py | 18 +-
python/tvm/driver/tvmc/autotuner.py | 46 +-
python/tvm/exec/rpc_proxy.py | 2 +-
python/tvm/meta_schedule/cost_model/xgb_model.py | 15 +-
python/tvm/meta_schedule/database/database.py | 7 +-
python/tvm/meta_schedule/postproc/postproc.py | 2 +-
python/tvm/meta_schedule/relax_integration.py | 4 +
.../meta_schedule/schedule_rule/schedule_rule.py | 4 +-
.../search_strategy/search_strategy.py | 6 +-
.../space_generator/space_generator.py | 4 +-
python/tvm/meta_schedule/tune.py | 15 +-
python/tvm/meta_schedule/tune_context.py | 4 +-
python/tvm/relax/analysis/analysis.py | 51 +
python/tvm/relax/analysis/estimate_memory_usage.py | 7 +-
python/tvm/relax/backend/contrib/cublas.py | 154 ++
python/tvm/relax/backend/contrib/cutlass.py | 15 +-
python/tvm/relax/block_builder.py | 5 +-
python/tvm/relax/dpl/context.py | 10 +-
python/tvm/relax/frontend/torch/dynamo.py | 5 +-
python/tvm/relax/frontend/torch/fx_translator.py | 12 +-
python/tvm/relax/op/__init__.py | 4 +
python/tvm/relax/op/_op_gradient.py | 1198 ++++++++++
python/tvm/relax/op/base.py | 54 +-
python/tvm/relax/op/create.py | 56 +-
python/tvm/relax/{testing => op/grad}/__init__.py | 6 +-
.../{testing/__init__.py => op/grad/_ffi_api.py} | 8 +-
python/tvm/relax/op/grad/grad.py | 144 ++
python/tvm/relax/op/index.py | 42 +-
python/tvm/relax/op/linear_algebra.py | 24 +-
python/tvm/relax/op/manipulate.py | 105 +-
python/tvm/relax/op/nn/nn.py | 51 +-
python/tvm/relax/op/op_attrs.py | 15 +
python/tvm/relax/op/statistical.py | 51 +
python/tvm/relax/op/unary.py | 24 +
python/tvm/relax/testing/__init__.py | 1 +
python/tvm/relax/testing/matmul.py | 66 +
python/tvm/relax/testing/nn.py | 8 +-
.../tvm/relax/transform/legalize_ops/__init__.py | 1 +
.../tvm/relax/transform/legalize_ops/creation.py | 25 +-
python/tvm/relax/transform/legalize_ops/grad.py | 219 ++
python/tvm/relax/transform/legalize_ops/index.py | 68 +-
.../relax/transform/legalize_ops/linear_algebra.py | 24 +-
.../tvm/relax/transform/legalize_ops/manipulate.py | 18 +-
python/tvm/relax/transform/legalize_ops/nn.py | 33 +
.../relax/transform/legalize_ops/statistical.py | 7 +-
python/tvm/relax/transform/legalize_ops/unary.py | 2 +
python/tvm/relax/transform/transform.py | 234 +-
python/tvm/relax/utils.py | 22 +-
python/tvm/relay/backend/contrib/ethosu/codegen.py | 90 +-
python/tvm/relay/frontend/keras.py | 12 +-
python/tvm/relay/frontend/mxnet.py | 2 +-
python/tvm/relay/frontend/oneflow.py | 4 +-
python/tvm/relay/frontend/paddlepaddle.py | 14 +-
python/tvm/relay/frontend/pytorch.py | 16 +
python/tvm/relay/op/contrib/arm_compute_lib.py | 24 +-
python/tvm/relay/op/contrib/ethosu.py | 15 +-
python/tvm/relay/op/nn/nn.py | 2 +-
python/tvm/relay/qnn/op/_qnn.py | 6 +-
.../transform/fake_quantization_to_integer.py | 11 +
python/tvm/rpc/proxy.py | 3 +-
python/tvm/runtime/object_path.py | 6 +-
python/tvm/script/ir_builder/relax/ir.py | 17 +-
python/tvm/script/ir_builder/tir/ir.py | 14 +-
python/tvm/script/parser/core/parser.py | 13 +-
python/tvm/tir/op.py | 2 +-
python/tvm/tir/schedule/schedule.py | 121 +-
python/tvm/tir/tensor_intrin/cuda.py | 220 +-
python/tvm/topi/cuda/ssd/multibox.py | 4 +-
python/tvm/topi/hexagon/qnn/nn.py | 14 +-
python/tvm/topi/hexagon/utils.py | 35 +-
python/tvm/topi/scatter_elements.py | 2 +-
python/tvm/topi/testing/poolnd_python.py | 6 +-
python/tvm/topi/transform.py | 35 +
python/tvm/topi/vision/ssd/multibox.py | 32 +-
python/tvm/topi/x86/conv2d.py | 10 +-
rust/tvm/src/ir/module.rs | 16 +-
src/arith/analyzer.cc | 64 +-
src/arith/bound_deducer.cc | 4 +
src/arith/canonical_simplify.cc | 132 +-
src/arith/int_constraints.cc | 25 +-
src/arith/int_set.cc | 29 +-
src/arith/interval_set.h | 9 +-
src/arith/ir_mutator_with_analyzer.cc | 14 +-
src/arith/ir_mutator_with_analyzer.h | 29 +-
src/arith/iter_affine_map.cc | 438 +++-
src/arith/pattern_match.h | 1 -
src/arith/product_normal_form.h | 89 +
src/arith/rewrite_simplify.cc | 113 +-
src/arith/rewrite_simplify.h | 1 -
src/contrib/hybrid/codegen_hybrid.cc | 4 +-
src/driver/driver_api.cc | 7 +-
src/ir/expr.cc | 12 +-
src/ir/module.cc | 66 +-
src/ir/name_supply.cc | 8 +-
src/ir/type_functor.cc | 8 +-
src/meta_schedule/database/database_utils.cc | 4 +-
.../feature_extractor/per_store_feature.cc | 11 +-
src/meta_schedule/postproc/verify_gpu_code.cc | 4 +-
src/meta_schedule/postproc/verify_vtcm_limit.cc | 4 +-
src/meta_schedule/schedule_rule/schedule_rule.cc | 8 +-
src/meta_schedule/space_generator/schedule_fn.cc | 8 +-
src/meta_schedule/utils.h | 30 +-
src/node/object_path.cc | 20 +-
src/node/structural_equal.cc | 51 +-
src/relax/analysis/struct_info_analysis.cc | 162 ++
src/relax/backend/contrib/cublas/codegen.cc | 110 +
src/relax/backend/contrib/cutlass/codegen.cc | 29 +-
src/relax/backend/contrib/utils.cc | 68 +
src/relax/backend/contrib/utils.h | 13 +
src/relax/ir/binding_rewrite.cc | 4 +-
src/relax/ir/block_builder.cc | 23 +-
src/relax/ir/dataflow_matcher.cc | 318 +--
src/relax/ir/dataflow_pattern.cc | 19 +-
src/relax/op/nn/nn.cc | 228 +-
src/relax/op/nn/nn.h | 6 +-
src/relax/op/op.cc | 29 +
src/relax/op/op_common.cc | 27 +-
src/relax/op/op_common.h | 11 +
src/relax/op/tensor/create.cc | 54 +
src/relax/op/tensor/create.h | 3 +
src/relax/op/tensor/grad.cc | 167 ++
src/relax/op/tensor/grad.h | 66 +
src/relax/op/tensor/index.cc | 105 +-
src/relax/op/tensor/linear_algebra.cc | 65 +
src/relax/op/tensor/linear_algebra.h | 8 +
src/relax/op/tensor/manipulate.cc | 188 +-
src/relax/op/tensor/manipulate.h | 10 +-
src/relax/op/tensor/statistical.cc | 47 +
src/relax/op/tensor/statistical.h | 12 +
src/relax/op/tensor/unary.cc | 1 +
src/relax/op/tensor/unary.h | 3 +
src/relax/transform/combine_parallel_matmul.cc | 337 +++
src/relax/transform/decompose_composite_ops.cc | 174 --
src/relax/transform/decompose_ops.cc | 269 +++
src/relax/transform/fold_constant.cc | 21 +-
src/relax/transform/fuse_ops.cc | 49 +-
src/relax/transform/fuse_tir.cc | 405 +++-
src/relax/transform/gradient.cc | 469 ++++
src/relax/transform/legalize_ops.cc | 10 +-
src/relax/transform/lift_transform_params.cc | 13 +-
src/relax/transform/run_codegen.cc | 5 +-
src/relax/transform/static_plan_block_memory.cc | 52 +-
src/relax/transform/utils.cc | 13 +
src/relax/transform/utils.h | 16 +
src/relay/analysis/call_graph.cc | 10 +-
src/relay/analysis/get_calibration_data.cc | 11 +-
src/relay/analysis/match_exhaustion.cc | 8 +-
src/relay/analysis/type_solver.cc | 6 +-
src/relay/backend/annotate_used_memory.cc | 5 +-
src/relay/backend/aot/aot_lower_main.cc | 2 +-
src/relay/backend/aot_executor_codegen.cc | 2 +-
.../backend/contrib/cmsisnn/extract_constants.cc | 8 +-
src/relay/backend/contrib/cmsisnn/relay_to_tir.cc | 16 +-
.../contrib/cmsisnn/scalar_to_tensor_constant.cc | 9 +-
src/relay/backend/contrib/ethosu/codegen.cc | 2 +-
src/relay/backend/contrib/ethosu/preprocess.cc | 8 +-
src/relay/backend/contrib/ethosu/source_module.cc | 4 +-
.../contrib/example_target_hooks/relay_to_tir.cc | 13 +-
src/relay/backend/interpreter.cc | 18 +-
src/relay/backend/te_compiler.cc | 21 +-
src/relay/backend/vm/compiler.cc | 15 +-
src/relay/backend/vm/lambda_lift.cc | 4 +-
src/relay/backend/vm/removed_unused_funcs.cc | 4 +-
src/relay/collage/mock_cost_estimator.cc | 3 +-
src/relay/collage/sub_graph.cc | 4 +-
src/relay/ir/dataflow_matcher.cc | 4 +-
src/relay/ir/expr_functor.cc | 4 +-
src/relay/ir/function.cc | 4 +-
src/relay/op/memory/memory.cc | 4 +-
src/relay/op/nn/convolution.cc | 18 +-
src/relay/op/tensor/math.cc | 4 +
src/relay/op/type_relations.cc | 4 +-
src/relay/parser/parser.cc | 4 +-
src/relay/printer/relay_text_printer.cc | 24 +-
src/relay/transforms/canonicalize_cast.cc | 4 +-
src/relay/transforms/compiler_function_utils.cc | 4 +-
src/relay/transforms/dead_code.cc | 4 +-
src/relay/transforms/device_planner.cc | 35 +-
src/relay/transforms/dynamic_to_static.cc | 4 +-
src/relay/transforms/eta_expand.cc | 8 +-
src/relay/transforms/fold_constant.cc | 4 +-
src/relay/transforms/fuse_ops.cc | 4 +-
src/relay/transforms/higher_order_gradient.cc | 16 +-
src/relay/transforms/inline.cc | 8 +-
src/relay/transforms/lazy_gradient_init.cc | 4 +-
src/relay/transforms/partial_eval.cc | 8 +-
src/relay/transforms/partition_graph.cc | 20 +-
src/relay/transforms/simplify_expr.cc | 4 +-
src/relay/transforms/type_infer.cc | 8 +-
src/runtime/contrib/cblas/gemm_common.h | 16 +-
src/runtime/contrib/cublas/cublas.cc | 118 +-
src/runtime/contrib/cublas/cublas_json_runtime.cc | 118 +
src/runtime/contrib/cublas/cublas_utils.h | 6 +
src/runtime/contrib/json/json_runtime.h | 2 +-
src/runtime/debug.cc | 8 +-
src/runtime/hexagon/hexagon_module.h | 5 +-
src/runtime/metal/metal_module.mm | 5 +
src/runtime/relax_vm/attention_kv_cache.cc | 162 ++
src/runtime/static_library.cc | 2 +-
src/runtime/vulkan/vulkan_wrapped_func.h | 5 +
src/script/ir_builder/ir/ir.cc | 21 +-
src/script/ir_builder/tir/ir.cc | 36 +-
src/script/printer/doc_printer/base_doc_printer.cc | 96 +-
.../printer/doc_printer/python_doc_printer.cc | 6 +-
src/script/printer/ir/ir.cc | 4 +-
src/script/printer/tir/block.cc | 6 +-
src/script/printer/tir/expr.cc | 39 +-
src/target/build_common.h | 12 +-
src/target/llvm/codegen_amdgpu.cc | 2 +
src/target/llvm/codegen_blob.cc | 7 +-
src/target/llvm/codegen_cpu.cc | 28 +-
src/target/llvm/codegen_llvm.cc | 34 +-
src/target/llvm/codegen_llvm.h | 6 +
src/target/llvm/codegen_nvptx.cc | 4 +-
src/target/llvm/llvm_module.cc | 20 +-
src/target/source/codegen_c.cc | 4 +-
src/target/source/codegen_cuda.cc | 11 +-
src/target/source/codegen_metal.cc | 22 +-
src/target/source/codegen_webgpu.cc | 53 +-
src/target/source/ptx.cc | 46 +-
src/target/source/source_module.cc | 10 +-
src/target/tag.cc | 4 +-
src/target/target.cc | 32 +-
src/target/target_kind.cc | 7 +-
src/te/operation/create_primfunc.cc | 17 +-
src/tir/analysis/block_access_region_detector.cc | 4 +-
src/tir/analysis/calculate_allocated_memory.cc | 5 +-
src/tir/analysis/control_flow_graph.cc | 8 +-
src/tir/analysis/estimate_flops.cc | 8 +-
src/tir/analysis/identify_memcpy.cc | 8 +-
src/tir/analysis/side_effect.cc | 4 +-
src/tir/analysis/verify_gpu_code.cc | 5 +-
src/tir/analysis/verify_memory.cc | 5 +-
src/tir/analysis/verify_ssa.cc | 5 +-
src/tir/ir/expr.cc | 2 +-
src/tir/ir/stmt_functor.cc | 16 +-
src/tir/ir/transform.cc | 11 +-
src/tir/schedule/analysis.h | 10 +
src/tir/schedule/analysis/analysis.cc | 27 +
src/tir/schedule/analysis/layout.cc | 4 +-
src/tir/schedule/analysis/reducer.cc | 22 +-
src/tir/schedule/concrete_schedule.cc | 26 +
src/tir/schedule/concrete_schedule.h | 4 +
src/tir/schedule/primitive.h | 31 +
src/tir/schedule/primitive/annotate.cc | 16 +-
src/tir/schedule/primitive/blockize_tensorize.cc | 8 +-
src/tir/schedule/primitive/cache_read_write.cc | 4 +-
src/tir/schedule/primitive/compute_at.cc | 16 +-
src/tir/schedule/primitive/decompose_padding.cc | 4 +-
src/tir/schedule/primitive/get_block_loop.cc | 38 +-
src/tir/schedule/primitive/loop_transformation.cc | 202 +-
.../schedule/primitive/reorder_block_iter_var.cc | 148 ++
src/tir/schedule/schedule.cc | 83 +-
src/tir/schedule/state.cc | 5 +-
src/tir/schedule/trace.cc | 12 +-
src/tir/schedule/traced_schedule.cc | 30 +
src/tir/schedule/traced_schedule.h | 3 +
src/tir/transforms/bf16_legalize.cc | 16 +-
src/tir/transforms/compact_buffer_region.cc | 14 +-
src/tir/transforms/default_gpu_schedule.cc | 27 +-
src/tir/transforms/extract_constants.cc | 5 +-
src/tir/transforms/flatten_buffer.cc | 40 +-
src/tir/transforms/hoist_expression.cc | 4 +-
src/tir/transforms/inject_ptx_async_copy.cc | 31 +-
src/tir/transforms/inject_rolling_buffer.cc | 6 +-
src/tir/transforms/ir_utils.cc | 186 +-
src/tir/transforms/lower_async_dma.cc | 5 +-
src/tir/transforms/lower_custom_datatypes.cc | 4 +-
src/tir/transforms/lower_opaque_block.cc | 8 +-
src/tir/transforms/make_packed_api.cc | 6 +-
src/tir/transforms/make_unpacked_api.cc | 4 +-
src/tir/transforms/memhammer_lower_auto_copy.cc | 4 +-
src/tir/transforms/profile_instrumentation.cc | 9 +-
.../reduce_branching_through_overcompute.cc | 4 +-
src/tir/transforms/remap_thread_axis.cc | 27 +-
src/tir/transforms/renew_defs.cc | 4 +-
src/tir/transforms/rewrite_unsafe_select.cc | 4 +-
src/tir/transforms/split_host_device.cc | 23 +-
src/tir/transforms/thread_storage_sync.cc | 2 +-
src/tir/transforms/unify_thread_binding.cc | 17 +-
src/tir/transforms/vectorize_loop.cc | 4 +-
src/topi/einsum.cc | 36 +-
src/topi/transform.cc | 8 +
.../contrib/test_arm_compute_lib/test_add.py | 31 +-
.../contrib/test_arm_compute_lib/test_conv2d.py | 50 +
.../contrib/test_arm_compute_lib/test_dense.py | 43 +
.../test_clml/test_adreno_collage_targets.py | 2 +-
tests/python/contrib/test_ethosu/infra.py | 18 +-
tests/python/contrib/test_ethosu/test_codegen.py | 74 +-
.../contrib/test_ethosu/test_layout_optimizer.py | 38 +
tests/python/contrib/test_ethosu/test_legalize.py | 18 +-
.../test_pass_operations_distribution.py | 8 +-
.../test_hexagon/test_2d_physical_buffers.py | 2 +-
.../test_hexagon/test_async_dma_pipeline.py | 18 -
tests/python/contrib/test_hexagon/test_autotvm.py | 24 +-
.../contrib/test_hexagon/test_maxpool2d_blocked.py | 2 +-
tests/python/driver/tvmc/conftest.py | 2 +-
tests/python/frontend/paddlepaddle/test_forward.py | 25 +-
tests/python/relax/test_analysis.py | 13 +
.../relax/test_analysis_struct_info_analysis.py | 43 +-
...t_blockbuilder.py => test_blockbuilder_core.py} | 3 +-
tests/python/relax/test_blockbuilder_emit_te.py | 71 +
tests/python/relax/test_codegen_cublas.py | 156 ++
tests/python/relax/test_codegen_cutlass.py | 49 +-
tests/python/relax/test_dataflow_pattern.py | 51 +-
tests/python/relax/test_e2e_op_dynamic.py | 104 +
tests/python/relax/test_frontend_from_fx.py | 5 +-
tests/python/relax/test_op_create.py | 73 +-
tests/python/relax/test_op_grad.py | 96 +
tests/python/relax/test_op_gradient_numeric.py | 794 +++++++
tests/python/relax/test_op_index.py | 330 ++-
tests/python/relax/test_op_linear_algebra.py | 83 +
tests/python/relax/test_op_manipulate.py | 179 +-
tests/python/relax/test_op_misc.py | 9 +
tests/python/relax/test_op_nn.py | 443 +++-
tests/python/relax/test_op_statistical.py | 62 +
tests/python/relax/test_op_unary.py | 2 +
tests/python/relax/test_pipeline.py | 58 +-
tests/python/relax/test_relax_operators.py | 40 +-
tests/python/relax/test_runtime_builtin.py | 17 +
tests/python/relax/test_testing_nn.py | 60 +
.../test_transform_combine_parallel_matmul.py | 469 ++++
.../test_transform_decompose_composite_ops.py | 174 --
tests/python/relax/test_transform_decompose_ops.py | 391 +++
tests/python/relax/test_transform_fold_constant.py | 34 +-
tests/python/relax/test_transform_fuse_ops.py | 96 +
.../relax/test_transform_fuse_ops_by_pattern.py | 79 +-
tests/python/relax/test_transform_fuse_tir.py | 305 +++
tests/python/relax/test_transform_gradient.py | 1164 +++++++++
.../relax/test_transform_gradient_numeric.py | 192 ++
.../test_transform_legalize_ops_create_datatype.py | 54 +
.../relax/test_transform_legalize_ops_grad.py | 381 +++
..._transform_legalize_ops_index_linear_algebra.py | 586 ++++-
.../test_transform_legalize_ops_manipulate.py | 351 ++-
.../python/relax/test_transform_legalize_ops_nn.py | 867 +++++--
...st_transform_legalize_ops_search_statistical.py | 312 ++-
.../relax/test_transform_legalize_ops_unary.py | 146 ++
.../relax/test_transform_lift_transform_params.py | 96 +
.../relax/test_transform_meta_schedule_tuning.py | 4 +-
.../test_transform_static_plan_block_memory.py | 173 ++
tests/python/relax/test_tvmscript_parser.py | 54 +
.../relax/test_tvmscript_parser_op_arith_cmp.py | 1 +
.../relax/test_tvmscript_parser_op_create.py | 14 +
.../python/relax/test_tvmscript_parser_op_grad.py | 142 ++
.../test_tvmscript_parser_op_linear_algebra.py | 16 +
.../relax/test_tvmscript_parser_op_manipulate.py | 6 +-
tests/python/relax/test_tvmscript_parser_op_nn.py | 44 +
.../relax/test_tvmscript_parser_op_statistical.py | 15 +
.../relay/collage/demo_collage_partitioner.py | 2 +-
tests/python/relay/test_any.py | 32 +-
tests/python/relay/test_op_grad_level2.py | 2 +-
tests/python/relay/test_op_level2.py | 10 +-
tests/python/relay/test_op_level5.py | 8 +-
.../test_pass_fake_quantization_to_integer.py | 14 +
tests/python/relay/test_pass_merge_composite.py | 47 +-
tests/python/topi/python/test_topi_einsum.py | 52 +-
tests/python/topi/python/test_topi_reduce.py | 20 +-
tests/python/topi/python/test_topi_transform.py | 57 +
tests/python/topi/python/test_topi_vision.py | 78 +-
.../unittest/test_arith_canonical_simplify.py | 36 +
tests/python/unittest/test_arith_deduce_bound.py | 10 +-
.../python/unittest/test_arith_iter_affine_map.py | 131 +-
.../python/unittest/test_arith_rewrite_simplify.py | 68 +-
tests/python/unittest/test_arith_simplify.py | 31 +
.../unittest/test_arith_solve_linear_inequality.py | 15 +
.../python/unittest/test_autotvm_xgboost_model.py | 4 +-
.../unittest/test_meta_schedule_cost_model.py | 26 +
...schedule_feature_extractor_per_store_feature.py | 22 +
.../test_meta_schedule_postproc_rewrite_layout.py | 12 +-
...meta_schedule_postproc_rewrite_unbound_block.py | 58 +-
...chedule_schedule_rule_cross_thread_reduction.py | 14 +-
.../test_meta_schedule_schedule_rule_mlt.py | 100 +-
.../test_meta_schedule_schedule_rule_mlt_intrin.py | 151 +-
.../test_meta_schedule_schedule_rule_mlt_tc.py | 118 +-
.../unittest/test_meta_schedule_space_cpu.py | 2492 ++++++++++----------
.../unittest/test_meta_schedule_space_cuda.py | 1018 ++++----
.../test_meta_schedule_space_cuda_async.py | 115 +-
.../test_meta_schedule_space_cuda_winograd.py | 402 ++--
.../unittest/test_meta_schedule_trace_apply.py | 58 +-
tests/python/unittest/test_object_path.py | 10 +
tests/python/unittest/test_target_codegen_llvm.py | 20 +
.../unittest/test_tir_reorder_block_iter_var.py | 86 +
.../unittest/test_tir_schedule_compute_at.py | 41 +
tests/python/unittest/test_tir_schedule_merge.py | 273 +++
.../python/unittest/test_tir_schedule_reduction.py | 54 +
.../python/unittest/test_tir_schedule_utilities.py | 60 +
.../unittest/test_tir_structural_equal_hash.py | 35 +-
.../test_tir_transform_compact_buffer_region.py | 85 +-
.../unittest/test_tir_transform_flatten_buffer.py | 45 +
.../python/unittest/test_tir_transform_helpers.py | 30 +-
.../test_tir_transform_inject_ptx_async_copy.py | 537 ++++-
.../test_tir_transform_split_host_device.py | 25 +
.../test_tir_transform_unify_thread_binding.py | 25 +
.../test_transform_default_gpu_schedule.py | 52 +-
tests/python/unittest/test_tvmscript_roundtrip.py | 81 +-
.../python/unittest/test_tvmscript_syntax_sugar.py | 25 +
tests/scripts/request_hook/request_hook.py | 1 -
tests/scripts/task_config_build_cpu.sh | 2 +-
version.py | 2 +-
vta/scripts/tune_resnet.py | 24 +-
vta/tutorials/autotvm/tune_alu_vta.py | 24 +-
vta/tutorials/autotvm/tune_relay_vta.py | 24 +-
web/emcc/wasm_runtime.cc | 1 +
web/package.json | 2 +-
web/src/runtime.ts | 3 +-
web/src/webgpu.ts | 130 +-
web/tests/python/webgpu_rpc_test.py | 28 +-
483 files changed, 23948 insertions(+), 5452 deletions(-)
copy python/tvm/relax/testing/__init__.py =>
docker/install/ubuntu_install_jax.sh (67%)
copy docs/reference/api/python/{index.rst => meta_schedule.rst} (67%)
create mode 100644 python/tvm/relax/backend/contrib/cublas.py
create mode 100644 python/tvm/relax/op/_op_gradient.py
copy python/tvm/relax/{testing => op/grad}/__init__.py (85%)
copy python/tvm/relax/{testing/__init__.py => op/grad/_ffi_api.py} (79%)
create mode 100644 python/tvm/relax/op/grad/grad.py
create mode 100644 python/tvm/relax/testing/matmul.py
create mode 100644 python/tvm/relax/transform/legalize_ops/grad.py
create mode 100644 src/arith/product_normal_form.h
create mode 100644 src/relax/backend/contrib/cublas/codegen.cc
create mode 100644 src/relax/backend/contrib/utils.cc
create mode 100644 src/relax/op/tensor/grad.cc
create mode 100644 src/relax/op/tensor/grad.h
create mode 100644 src/relax/transform/combine_parallel_matmul.cc
delete mode 100644 src/relax/transform/decompose_composite_ops.cc
create mode 100644 src/relax/transform/decompose_ops.cc
create mode 100644 src/relax/transform/gradient.cc
create mode 100644 src/runtime/contrib/cublas/cublas_json_runtime.cc
create mode 100644 src/runtime/relax_vm/attention_kv_cache.cc
create mode 100644 src/tir/schedule/primitive/reorder_block_iter_var.cc
rename tests/python/relax/{test_blockbuilder.py => test_blockbuilder_core.py}
(99%)
create mode 100644 tests/python/relax/test_blockbuilder_emit_te.py
create mode 100644 tests/python/relax/test_codegen_cublas.py
create mode 100644 tests/python/relax/test_e2e_op_dynamic.py
create mode 100644 tests/python/relax/test_op_grad.py
create mode 100644 tests/python/relax/test_op_gradient_numeric.py
create mode 100644 tests/python/relax/test_testing_nn.py
create mode 100644 tests/python/relax/test_transform_combine_parallel_matmul.py
delete mode 100644 tests/python/relax/test_transform_decompose_composite_ops.py
create mode 100644 tests/python/relax/test_transform_decompose_ops.py
create mode 100644 tests/python/relax/test_transform_gradient.py
create mode 100644 tests/python/relax/test_transform_gradient_numeric.py
create mode 100644 tests/python/relax/test_transform_legalize_ops_grad.py
create mode 100644 tests/python/relax/test_tvmscript_parser_op_grad.py
create mode 100644 tests/python/unittest/test_tir_reorder_block_iter_var.py
create mode 100644 tests/python/unittest/test_tir_schedule_merge.py