This is an automated email from the ASF dual-hosted git repository.
github-actions[bot] pushed a change to branch nightly
in repository https://gitbox.apache.org/repos/asf/tvm.git
from bc1a904ec1 [Relax][ONNX] Prevent `Div` divide-by-zero crashes (#19566)
add 859498dc01 [TIRx] Bringup TIRx Infrastructure (#19581)
add 740eebe696 [BugFix][Target][LLVM] Use libm for asin/acos instead of
buggy inline Taylor (#19567)
add 48f346bb07 [RFC][CodeGen][CUDA]: Gate fast math intrinsic lowering
behind target option (#19565)
add 7a81a4592c [TVMScript] Handle undefined functions when dumping
IRModule (#19583)
add 13ce30b988 [BugFix][Target][LLVM] Route sinh/cosh/atan/asinh/erf
through libm extern (#19568)
No new revisions were added by this update.
Summary of changes:
.claude/commands/tir-bench.md | 195 +
.claude/commands/tir-build.md | 15 +
.claude/commands/tir-test.md | 44 +
.claude/scripts/monitor_gpu.sh | 124 +
.gitignore | 3 +
.pre-commit-config.yaml | 2 +
docs/arch/introduction_to_module_serialization.rst | 2 +-
docs/deep_dive/relax/tutorials/relax_creation.py | 11 +-
docs/deep_dive/tensor_ir/tutorials/tir_creation.py | 12 +-
.../tensor_ir/tutorials/tir_transformation.py | 2 +-
docs/errors.rst | 2 +-
.../how_to/tutorials/export_and_load_executable.py | 33 +-
.../tutorials/mix_python_and_tvm_with_pymodule.py | 10 +-
docs/install/from_source.rst | 2 +-
include/tvm/ir/function.h | 17 +
include/tvm/runtime/device_api.h | 3 +
include/tvm/s_tir/data_layout.h | 147 +-
include/tvm/script/printer/config.h | 14 +
include/tvm/script/printer/doc.h | 118 +-
include/tvm/tirx/analysis.h | 31 +-
include/tvm/tirx/async_structs.h | 103 +
include/tvm/tirx/buffer.h | 57 +-
include/tvm/tirx/builtin.h | 482 +-
include/tvm/tirx/exec_context.h | 155 +
include/tvm/tirx/exec_scope.h | 248 +
include/tvm/tirx/layout.h | 565 ++
include/tvm/tirx/op.h | 8 +-
include/tvm/tirx/predicate.h | 66 +
include/tvm/tirx/script/builder/frame.h | 187 +-
include/tvm/tirx/script/builder/ir.h | 227 +-
include/tvm/tirx/stmt.h | 343 +-
include/tvm/tirx/stmt_functor.h | 23 +-
include/tvm/tirx/target_builtin/cuda.h | 745 ++
include/tvm/tirx/target_builtin/trn.h | 156 +
include/tvm/tirx/tirx_op.h | 314 +
include/tvm/tirx/tirx_stmt.h | 85 +
include/tvm/tirx/transform.h | 34 +-
include/tvm/topi/transform.h | 6 +-
pyproject.toml | 8 +
python/tvm/__init__.py | 8 +-
python/tvm/contrib/cutlass/attention_operation.py | 14 +-
python/tvm/contrib/nvcc.py | 43 +-
python/tvm/ir/__init__.py | 9 +-
python/tvm/relax/backend/gpu_generic/cumsum.py | 24 +-
python/tvm/relax/backend/gpu_generic/sampling.py | 20 +-
python/tvm/relax/block_builder.py | 4 +-
.../tvm/relax/frontend/nn/llm/_decode_kernels.py | 42 +-
python/tvm/relax/frontend/nn/llm/_kernel_common.py | 54 +-
python/tvm/relax/frontend/nn/llm/_page_kernels.py | 18 +-
.../tvm/relax/frontend/nn/llm/_prefill_kernels.py | 172 +-
.../relax/frontend/nn/llm/position_embedding.py | 10 +-
python/tvm/relax/frontend/nn/llm/tree_attn.py | 120 +-
python/tvm/relax/frontend/nn/op.py | 6 +-
python/tvm/relax/frontend/onnx/onnx_frontend.py | 3 +-
python/tvm/relax/training/optimizer.py | 6 +
python/tvm/relax/training/setup_trainer.py | 2 +
python/tvm/relax/training/trainer.py | 1 +
python/tvm/relax/training/utils.py | 4 +
python/tvm/relax/transform/legalize_ops/grad.py | 2 +-
.../tvm/relax/transform/legalize_ops/inspect_op.py | 36 +-
python/tvm/relax/transform/legalize_ops/nn.py | 18 +-
python/tvm/relax/transform/transform.py | 12 +-
python/tvm/runtime/__init__.py | 1 +
python/tvm/runtime/_tensor.py | 2 +-
python/tvm/runtime/disco/__init__.py | 2 +-
python/tvm/runtime/script_printer.py | 55 +-
python/tvm/s_tir/__init__.py | 2 +-
python/tvm/s_tir/backend/adreno/pipeline.py | 2 +-
python/tvm/s_tir/data_layout.py | 60 +-
.../s_tir/meta_schedule/database/json_database.py | 1 +
.../meta_schedule/database/memory_database.py | 1 +
.../meta_schedule/database/schedule_fn_database.py | 1 +
.../tvm/s_tir/meta_schedule/relax_integration.py | 3 +
python/tvm/s_tir/meta_schedule/runner/runner.py | 3 +-
python/tvm/s_tir/pipeline.py | 17 +-
python/tvm/s_tir/schedule/schedule.py | 219 +-
python/tvm/s_tir/tensor_intrin/arm_cpu.py | 54 +-
python/tvm/s_tir/tensor_intrin/cuda.py | 72 +-
.../tvm/s_tir/tensor_intrin/dot_product_common.py | 4 +-
python/tvm/s_tir/tensor_intrin/hexagon.py | 16 +-
python/tvm/s_tir/tensor_intrin/metal.py | 16 +-
python/tvm/s_tir/tensor_intrin/riscv_cpu.py | 4 +-
python/tvm/s_tir/tensor_intrin/rocm.py | 28 +-
python/tvm/s_tir/tensor_intrin/x86.py | 6 +-
python/tvm/script/ir_builder/ir/__init__.py | 1 +
python/tvm/script/ir_builder/ir/ir.py | 35 +-
python/tvm/script/parser/__init__.py | 2 +-
python/tvm/script/parser/core/entry.py | 26 +-
python/tvm/script/parser/core/evaluator.py | 4 +
python/tvm/script/parser/core/parser.py | 37 +-
python/tvm/script/parser/ir/entry.py | 10 +-
python/tvm/script/printer/doc.py | 9 +-
python/tvm/support.py | 67 +-
python/tvm/target/detect_target.py | 1 +
python/tvm/target/tag_registry/cuda.py | 5 +-
python/tvm/target/target.py | 12 +
python/tvm/te/operation.py | 16 +-
python/tvm/testing/utils.py | 503 +-
python/tvm/tirx/__init__.py | 80 +-
python/tvm/tirx/analysis/analysis.py | 24 +
python/tvm/tirx/bench.py | 657 ++
python/tvm/tirx/buffer.py | 348 +-
python/tvm/tirx/build.py | 20 +-
python/tvm/tirx/compilation_pipeline.py | 197 +
python/tvm/tirx/exec_context.py | 408 +
python/tvm/tirx/exec_scope.py | 84 +
python/tvm/tirx/expr.py | 6 +
python/tvm/tirx/expr_functor.py | 684 ++
python/tvm/tirx/function.py | 19 +-
.../tvm/tirx/lang/__init__.py | 3 -
python/tvm/tirx/lang/alloc_pool.py | 510 ++
python/tvm/tirx/lang/pipeline.py | 315 +
python/tvm/tirx/lang/smem_desc.py | 55 +
python/tvm/tirx/lang/tile_scheduler.py | 818 ++
python/tvm/tirx/lang/warp_role.py | 145 +
python/tvm/tirx/layout.py | 956 +++
python/tvm/tirx/op.py | 8317 +++++++++++++++-----
python/tvm/tirx/operator/__init__.py | 41 +
python/tvm/tirx/operator/intrinsics/_common.py | 62 +
python/tvm/tirx/operator/intrinsics/_schema.py | 180 +
.../tvm/tirx/operator/intrinsics/cuda/__init__.py | 49 +
.../tvm/tirx/operator/intrinsics/cuda/cp_async.py | 910 +++
python/tvm/tirx/operator/intrinsics/cuda/header.py | 809 ++
python/tvm/tirx/operator/intrinsics/cuda/math.py | 501 ++
python/tvm/tirx/operator/intrinsics/cuda/memory.py | 739 ++
python/tvm/tirx/operator/intrinsics/cuda/misc.py | 253 +
python/tvm/tirx/operator/intrinsics/cuda/mma.py | 454 ++
.../tvm/tirx/operator/intrinsics/cuda/nvshmem.py | 161 +
.../tvm/tirx/operator/intrinsics/cuda/registry.py | 77 +
python/tvm/tirx/operator/intrinsics/cuda/sync.py | 472 ++
.../tvm/tirx/operator/intrinsics/cuda/tcgen05.py | 1354 ++++
python/tvm/tirx/operator/intrinsics/cuda/types.py | 71 +
python/tvm/tirx/operator/intrinsics/cuda/utils.py | 82 +
python/tvm/tirx/operator/intrinsics/cuda/wgmma.py | 403 +
.../operator/tile_primitive}/__init__.py | 24 +-
.../tvm/tirx/operator/tile_primitive/common.py | 38 +-
.../operator/tile_primitive/cuda}/__init__.py | 5 +-
.../tirx/operator/tile_primitive/cuda/common.py | 283 +
.../operator/tile_primitive/cuda/copy}/__init__.py | 14 +-
.../tile_primitive/cuda/copy/collective.py | 162 +
.../operator/tile_primitive/cuda/copy/scalar.py | 53 +
.../operator/tile_primitive/cuda/copy/utils.py | 189 +
.../tile_primitive/cuda/copy/vectorized.py | 63 +
.../tile_primitive/cuda/copy_async}/__init__.py | 16 +-
.../tile_primitive/cuda/copy_async/cp_async.py | 56 +
.../tile_primitive/cuda/copy_async/dsmem.py | 226 +
.../tile_primitive/cuda/copy_async/tcgen05_cp.py | 466 ++
.../tile_primitive/cuda/copy_async/tcgen05_ldst.py | 148 +
.../operator/tile_primitive/cuda/copy_async/tma.py | 1287 +++
.../tile_primitive/cuda/copy_async/utils.py | 78 +
.../tile_primitive/cuda/elementwise/__init__.py} | 24 +-
.../tile_primitive/cuda/elementwise/_common.py | 253 +
.../tile_primitive/cuda/elementwise/register.py | 84 +
.../cuda/elementwise/schedule_collective_reg.py | 410 +
.../cuda/elementwise/schedule_collective_smem.py | 132 +
.../cuda/elementwise/schedule_thread.py | 121 +
.../tile_primitive/cuda/elementwise/schema.py | 1165 +++
.../tile_primitive/cuda/exec_scope_utils.py | 108 +
.../tile_primitive/cuda/gemm_async/__init__.py | 3 +-
.../tile_primitive/cuda/gemm_async/tcgen05.py | 935 +++
.../operator/tile_primitive/cuda/gemm_utils.py | 62 +
.../operator/tile_primitive/cuda/layout_utils.py | 326 +
.../tile_primitive/cuda/permute_dims/__init__.py | 3 +-
.../cuda/permute_dims/vectorized_last_2d.py | 151 +
.../tile_primitive/cuda/reduction}/__init__.py | 5 +-
.../tile_primitive/cuda/reduction/local.py | 490 ++
.../tile_primitive/cuda/reduction/shared.py | 300 +
.../tile_primitive/cuda/reduction/sm100_packed.py | 256 +
.../tile_primitive/cuda/reduction/utils.py | 257 +
.../tirx/operator/tile_primitive/cuda/tma_utils.py | 117 +
.../operator/tile_primitive/dispatch_context.py | 205 +
.../tvm/tirx/operator/tile_primitive/dispatcher.py | 329 +
python/tvm/tirx/operator/tile_primitive/ops.py | 596 ++
.../tvm/tirx/operator/tile_primitive/registry.py | 66 +
.../tirx/operator/tile_primitive/trn}/__init__.py | 11 +-
.../tile_primitive/trn/binary}/__init__.py | 4 +-
.../operator/tile_primitive/trn/binary/default.py | 124 +
.../operator/tile_primitive/trn/binary/utils.py | 226 +
.../operator/tile_primitive/trn/common.py} | 34 +-
.../tile_primitive/trn/compose_op}/__init__.py | 8 +-
.../tile_primitive/trn/compose_op/binary_chain.py | 125 +
.../tile_primitive/trn/compose_op/binary_reduce.py | 168 +
.../tile_primitive/trn/compose_op/compose_op.py} | 39 +-
.../tile_primitive/trn/compose_op/reduce_negate.py | 51 +
.../tile_primitive/trn/compose_op/unary_reduce.py | 170 +
.../tile_primitive/trn/compose_op/utils.py} | 36 +-
.../operator/tile_primitive/trn/copy/__init__.py | 3 +-
.../operator/tile_primitive/trn/copy/default.py | 303 +
.../tirx/operator/tile_primitive/trn/dim_utils.py | 262 +
.../operator/tile_primitive/trn/gemm/__init__.py | 3 +-
.../operator/tile_primitive/trn/gemm/default.py | 304 +
.../tile_primitive/trn/instruction_generator.py | 729 ++
.../operator/tile_primitive/trn/private_alloc.py | 195 +
.../tile_primitive/trn/reduction/__init__.py | 3 +-
.../tile_primitive/trn/reduction/default.py | 30 +-
.../operator/tile_primitive/trn/reduction/utils.py | 166 +
.../operator/tile_primitive/trn/select/__init__.py | 3 +-
.../operator/tile_primitive/trn/select/default.py | 144 +
.../operator/tile_primitive/trn/unary}/__init__.py | 5 +-
.../operator/tile_primitive/trn/unary/default.py | 89 +
.../operator/tile_primitive/trn/unary/utils.py | 189 +
.../tile_primitive/trn/unary/with_bias_scale.py | 87 +
.../operator/tile_primitive/trn/workspace_utils.py | 54 +
python/tvm/tirx/pipeline.py | 75 -
.../replay_trace.py => tirx/predicate.py} | 41 +-
python/tvm/tirx/script/__init__.py | 55 +-
python/tvm/tirx/script/builder/__init__.py | 1 +
python/tvm/tirx/script/builder/frame.py | 46 +-
python/tvm/tirx/script/builder/ir.py | 1944 ++++-
python/tvm/tirx/script/builder/tirx.py | 1393 ++++
.../tvm/tirx/script/builder/tmem_pool.py | 8 +-
python/tvm/tirx/script/builder/utils.py | 2 +-
python/tvm/tirx/script/parser/__init__.py | 4 +-
python/tvm/tirx/script/parser/entry.py | 193 +-
python/tvm/tirx/script/parser/parser.py | 383 +-
python/tvm/tirx/stmt.py | 415 +-
python/tvm/tirx/stmt_functor.py | 923 +++
python/tvm/tirx/transform/__init__.py | 1 +
python/tvm/tirx/transform/common.py | 187 +
python/tvm/tirx/transform/transform.py | 27 +
python/tvm/tirx/transform/trn/__init__.py | 38 +
python/tvm/tirx/transform/trn/naive_allocator.py | 101 +
.../tvm/tirx/transform/trn/private_buffer_alloc.py | 140 +
python/tvm/topi/gpu/scan.py | 30 +-
python/tvm/topi/gpu/scatter_elements.py | 2 +-
python/tvm/topi/gpu/scatter_nd.py | 2 +-
python/tvm/topi/gpu/sort.py | 48 +-
python/tvm/topi/index_put.py | 2 +-
python/tvm/topi/nn/conv2d.py | 5 +-
python/tvm/topi/scatter.py | 2 +-
python/tvm/topi/scatter_elements.py | 2 +-
python/tvm/topi/signal.py | 2 +-
python/tvm/topi/sort.py | 30 +-
python/tvm/topi/utils.py | 8 +-
python/tvm/topi/vision/nms.py | 46 +-
python/tvm/topi/vision/nms_util.py | 4 +-
src/arith/canonical_simplify.cc | 32 +
src/arith/ir_mutator_with_analyzer.cc | 120 +-
src/arith/modular_set.cc | 11 +
src/arith/rewrite_simplify.cc | 6 +
src/ir/script_printer.cc | 9 +
src/relax/backend/vm/codegen_vm_tir.cc | 1 +
src/relax/backend/vm/vm_shape_lower.cc | 1 +
src/relax/op/image/resize.cc | 4 +-
src/relax/op/nn/convolution.cc | 54 +-
src/relax/op/nn/pooling.cc | 4 +-
src/relax/op/op_common.cc | 4 +-
src/relax/op/op_common.h | 20 +-
src/relax/op/tensor/inspect.cc | 4 +-
src/relax/op/tensor/manipulate.cc | 14 +-
src/relax/op/tensor/statistical.cc | 2 +-
src/relax/transform/compute_prim_value.cc | 5 +-
src/relax/transform/convert_layout.cc | 16 +-
src/relax/transform/fuse_tir.cc | 1 +
src/relax/transform/infer_layout_utils.cc | 26 +-
src/relax/transform/infer_layout_utils.h | 20 +-
src/runtime/contrib/cutlass/fp16_group_gemm.cuh | 18 +-
.../cutlass/fp16_group_gemm_runner_sm100.cuh | 12 +-
.../cutlass/fp16_group_gemm_runner_sm90.cuh | 12 +-
src/runtime/contrib/cutlass/fp8_gemm.cu | 18 +-
src/runtime/contrib/cutlass/fp8_group_gemm_sm90.cu | 18 +-
.../contrib/cutlass/fp8_groupwise_scaled_gemm.cuh | 73 +-
.../fp8_groupwise_scaled_gemm_runner_sm100.cuh | 12 +-
.../fp8_groupwise_scaled_gemm_runner_sm90.cuh | 12 +-
...p8_groupwise_scaled_group_gemm_runner_sm100.cuh | 12 +-
.../fp8_groupwise_scaled_group_gemm_sm100.cu | 36 +-
src/runtime/contrib/cutlass/gemm_runner.cuh | 12 +-
src/runtime/contrib/nvshmem/dist_gemm.cu | 151 +
src/runtime/contrib/nvshmem/init.cc | 15 +-
src/runtime/contrib/nvshmem/kv_transfer.cu | 72 +-
src/runtime/contrib/nvshmem/memory_allocator.cc | 12 +-
src/runtime/crt/common/crt_runtime_api.c | 659 ++
src/runtime/cuda/cuda_device_api.cc | 193 +-
src/runtime/cuda/cuda_module.cc | 74 +-
src/runtime/disco/builtin.cc | 2 +
src/runtime/meta_data.h | 79 +
src/runtime/thread_storage_scope.h | 44 +-
src/runtime/vm/attn_backend.cc | 11 +-
src/runtime/vm/attn_backend.h | 217 +-
src/runtime/vm/attn_utils.h | 75 +-
src/runtime/vm/paged_kv_cache.cc | 16 +-
src/s_tir/data_layout.cc | 190 +-
src/s_tir/schedule/analysis/reducer.cc | 7 +
src/s_tir/transform/inject_permuted_layout.cc | 4 +-
src/s_tir/transform/lower_async_dma.cc | 3 +-
src/s_tir/transform/lower_opaque_block.cc | 4 +-
.../transform/merge_shared_memory_allocations.cc | 20 +-
src/s_tir/transform/storage_access.cc | 8 +
src/s_tir/transform/unify_thread_binding.cc | 3 +-
src/script/ir_builder/base.cc | 10 +-
src/script/ir_builder/ir/ir.cc | 11 +-
src/script/printer/doc.cc | 42 +
src/script/printer/doc_printer/base_doc_printer.cc | 6 +
src/script/printer/doc_printer/base_doc_printer.h | 15 +
.../printer/doc_printer/python_doc_printer.cc | 74 +
src/script/printer/ir/ir.cc | 25 +-
src/script/printer/utils.h | 7 +
src/target/cuda/codegen_cuda.cc | 828 +-
src/target/cuda/codegen_cuda.h | 64 +-
src/target/cuda/intrin_rule_cuda.cc | 49 +-
src/target/cuda/ptx.cc | 354 +-
src/target/cuda/ptx.h | 87 +-
src/target/llvm/codegen_llvm.cc | 37 +-
src/target/llvm/codegen_llvm.h | 1 +
src/target/llvm/intrin_rule_llvm.cc | 129 +-
src/target/source/codegen_c.cc | 62 +-
src/target/source/codegen_c.h | 3 +
src/target/source/codegen_source_base.h | 4 +-
src/target/source/codegen_trn.cc | 672 ++
src/target/source/codegen_trn.h | 90 +
src/target/tag.cc | 13 +
src/target/target_kind.cc | 22 +-
src/target/webgpu/codegen_webgpu.cc | 10 +
src/target/webgpu/codegen_webgpu.h | 2 +
src/te/operation/create_primfunc.cc | 24 +-
src/tirx/analysis/exec_context.cc | 696 ++
src/tirx/analysis/var_use_def_analysis.cc | 21 +-
src/tirx/analysis/verify_tirx_well_formed.cc | 284 +
src/tirx/analysis/verify_well_formed.cc | 131 +-
src/tirx/ir/async_structs.cc | 87 +
src/tirx/ir/buffer.cc | 76 +-
src/tirx/ir/exec_scope.cc | 442 ++
src/tirx/ir/expr.cc | 1 -
src/tirx/ir/layout/axis_registry.cc | 357 +
src/tirx/ir/layout/compose_layout.cc | 118 +
src/tirx/ir/layout/layout.cc | 89 +
src/tirx/ir/layout/swizzle_layout.cc | 128 +
src/tirx/ir/layout/tile_canonicalize.cc | 146 +
src/tirx/ir/layout/tile_core.cc | 279 +
src/tirx/ir/layout/tile_direct_sum_ops.cc | 264 +
src/tirx/ir/layout/tile_internal.h | 53 +
src/tirx/ir/layout/tile_slice.cc | 182 +
src/tirx/ir/layout/tile_tile_ops.cc | 411 +
src/tirx/ir/layout/utils.cc | 91 +
src/tirx/ir/layout/utils.h | 93 +
src/{ir/global_info.cc => tirx/ir/predicate.cc} | 51 +-
src/tirx/ir/script/script_complete.cc | 13 +-
src/tirx/ir/script/script_complete.h | 3 +-
src/tirx/ir/specialize.cc | 30 +-
src/tirx/ir/stmt.cc | 76 +-
src/tirx/ir/stmt_functor.cc | 145 +-
src/tirx/ir/tir_visitor_with_path.cc | 117 +-
src/tirx/ir/tir_visitor_with_path.h | 96 +-
src/tirx/ir/tirx_stmt.cc | 70 +
src/tirx/op/builtin.cc | 236 +-
src/tirx/op/op.cc | 93 +-
src/tirx/op/target_builtin/cuda.cc | 340 +
src/tirx/op/target_builtin/trn.cc | 91 +
src/tirx/op/tirx.cc | 235 +
src/tirx/script/builder/frame.cc | 169 +-
src/tirx/script/builder/ir.cc | 483 +-
src/tirx/script/builder/utils.h | 18 +-
src/tirx/script/printer/block.cc | 35 +-
src/tirx/script/printer/buffer.cc | 311 +-
src/tirx/script/printer/expr.cc | 129 +-
src/tirx/script/printer/for_loop.cc | 19 +-
src/tirx/script/printer/function.cc | 70 +-
src/tirx/script/printer/ir.cc | 4 +
src/tirx/script/printer/stmt.cc | 728 +-
src/tirx/script/printer/utils.h | 139 +-
src/tirx/transform/flatten_buffer.cc | 2 +
src/tirx/transform/ir_utils.cc | 32 +-
src/tirx/transform/ir_utils.h | 10 +-
src/tirx/transform/lower_intrin.cc | 20 +-
src/tirx/transform/lower_tirx.cc | 83 +
src/tirx/transform/lower_tirx_cleanup.cc | 402 +
src/tirx/transform/lower_tirx_dedup_tensormap.cc | 315 +
.../transform/lower_tirx_opaque.cc} | 150 +-
src/tirx/transform/lower_tvm_builtin.cc | 5 +-
src/tirx/transform/lower_warp_memory.cc | 39 +-
src/tirx/transform/remove_no_op.cc | 37 +-
src/tirx/transform/remove_no_op.h | 2 +-
src/tirx/transform/split_host_device.cc | 57 +-
src/tirx/transform/storage_rewrite.cc | 48 +-
src/tirx/transform/tile_primitive_dispatch.cc | 1282 +++
src/tirx/transform/unsupported_dtype_legalize.cc | 10 +-
src/tirx/transform/vectorize_loop.cc | 2 +-
tests/cpp/nested_msg_test.cc | 1 -
tests/lint/check_asf_header.py | 2 +
tests/lint/check_file_type.py | 5 +
.../python/arith/test_arith_canonical_simplify.py | 10 +
tests/python/arith/test_arith_domain_touched.py | 6 +-
tests/python/arith/test_arith_modular_set.py | 9 +
tests/python/codegen/test_codegen_assert.py | 16 +-
.../python/codegen/test_codegen_error_handling.py | 28 +-
tests/python/codegen/test_gpu_codegen_allreduce.py | 8 +-
tests/python/codegen/test_inject_ptx_ldg32.py | 2 +-
tests/python/codegen/test_target_codegen.py | 10 +-
.../python/codegen/test_target_codegen_aarch64.py | 60 +-
tests/python/codegen/test_target_codegen_arm.py | 12 +-
tests/python/codegen/test_target_codegen_blob.py | 4 +-
tests/python/codegen/test_target_codegen_bool.py | 9 +-
tests/python/codegen/test_target_codegen_c_host.py | 26 +-
.../codegen/test_target_codegen_cross_llvm.py | 4 +-
tests/python/codegen/test_target_codegen_cuda.py | 112 +-
.../codegen/test_target_codegen_cuda_fastmath.py | 298 +
.../python/codegen/test_target_codegen_cuda_fp4.py | 76 +-
.../python/codegen/test_target_codegen_cuda_fp8.py | 38 +-
tests/python/codegen/test_target_codegen_device.py | 10 +-
tests/python/codegen/test_target_codegen_extern.py | 6 +-
.../codegen/test_target_codegen_gpu_common.py | 4 +-
.../python/codegen/test_target_codegen_hexagon.py | 12 +-
tests/python/codegen/test_target_codegen_llvm.py | 164 +-
.../python/codegen/test_target_codegen_llvm_vla.py | 10 +-
tests/python/codegen/test_target_codegen_metal.py | 22 +-
tests/python/codegen/test_target_codegen_opencl.py | 32 +-
tests/python/codegen/test_target_codegen_riscv.py | 7 +-
tests/python/codegen/test_target_codegen_rocm.py | 16 +-
.../codegen/test_target_codegen_static_init.py | 2 +-
tests/python/codegen/test_target_codegen_vulkan.py | 50 +-
tests/python/codegen/test_target_codegen_x86.py | 4 +-
.../contrib/test_android/test_meta_schedule.py | 2 +-
.../test_hexagon/test_async_dma_pipeline.py | 8 +-
.../test_hexagon/test_benchmark_elemwise_add.py | 2 +-
.../contrib/test_hexagon/test_dma_builtin.py | 4 +-
.../contrib/test_hexagon/test_memory_alloc.py | 2 +-
.../contrib/test_hexagon/test_meta_schedule.py | 4 +-
.../contrib/test_hexagon/test_parallel_hvx.py | 6 +-
.../test_hexagon/test_parallel_hvx_load_vtcm.py | 8 +-
.../contrib/test_hexagon/test_parallel_scalar.py | 6 +-
.../test_relax_2d_buffer_allocation.py | 4 +-
.../test_hexagon/test_software_pipeline_async.py | 4 +-
tests/python/contrib/test_hexagon/test_take.py | 18 +-
.../contrib/test_hexagon/test_thread_pool.py | 4 +-
tests/python/contrib/test_hexagon/test_vtcm.py | 2 +-
.../contrib/test_hexagon/test_vtcm_bandwidth.py | 4 +-
.../python/contrib/test_tir_triton_integration.py | 8 +-
tests/python/disco/test_nvshmem.py | 6 +-
tests/python/disco/test_session.py | 10 +-
tests/python/driver/test_compile.py | 2 +-
tests/python/ir/analysis/test_collect_call_map.py | 6 +-
tests/python/ir/test_datatype_nv_fp8.py | 2 +-
tests/python/ir/test_pass_instrument.py | 4 +-
.../python/ir/test_transform_replace_global_var.py | 24 +-
tests/python/relax/backend/adreno/mod_utils.py | 8 +-
.../test_transform_fold_vdevice_scope_change.py | 16 +-
tests/python/relax/backend/adreno/utils.py | 7 +-
.../test_distributed_transform_lower_distir.py | 22 +-
...ributed_transform_lower_global_to_local_view.py | 96 +-
...est_distributed_transform_propagate_sharding.py | 88 +-
.../test_distributed_tvmscript_parser.py | 12 +-
.../test_distributed_tvmscript_printer.py | 7 +-
tests/python/relax/test_analysis.py | 30 +-
.../python/relax/test_analysis_detect_recursion.py | 2 +-
.../relax/test_analysis_estimate_memory_usage.py | 12 +-
.../test_analysis_suggest_layout_transforms.py | 70 +-
tests/python/relax/test_analysis_well_formed.py | 84 +-
tests/python/relax/test_ast_printer.py | 2 +-
.../python/relax/test_backend_dispatch_sampling.py | 29 +-
.../relax/test_backend_transform_shape_lower.py | 8 +-
tests/python/relax/test_base_py_module.py | 12 +-
tests/python/relax/test_base_py_module_printer.py | 20 +-
.../relax/test_base_py_module_symbolic_shape.py | 4 +-
tests/python/relax/test_blockbuilder_emit_te.py | 11 +-
tests/python/relax/test_codegen_cutlass.py | 40 +-
tests/python/relax/test_dataflow_inplace.py | 46 +-
tests/python/relax/test_dataflow_pattern.py | 6 +-
tests/python/relax/test_dataflow_rewriter.py | 10 +-
tests/python/relax/test_dlpack_integration.py | 2 +-
...eliminate_pad_branch_using_buffer_assumption.py | 12 +-
tests/python/relax/test_frontend_common.py | 12 +-
tests/python/relax/test_frontend_dynamo.py | 18 +-
.../relax/test_frontend_from_exported_program.py | 2 +
tests/python/relax/test_frontend_nn_op.py | 40 +-
tests/python/relax/test_frontend_onnx.py | 8 +-
tests/python/relax/test_frontend_onnx_backend.py | 4 +-
tests/python/relax/test_frontend_stablehlo.py | 5 +
tests/python/relax/test_frontend_tflite.py | 21 +-
tests/python/relax/test_group_gemm_flashinfer.py | 6 +-
tests/python/relax/test_op_gradient_numeric.py | 2 +
tests/python/relax/test_op_index.py | 12 +-
tests/python/relax/test_op_misc.py | 2 +-
.../python/relax/test_optimize_layout_transform.py | 28 +-
tests/python/relax/test_pytorch_integration.py | 4 +-
.../python/relax/test_relax_to_pyfunc_converter.py | 10 +-
..._builtin_paged_attention_kv_cache_flashinfer.py | 4 +-
.../python/relax/test_runtime_builtin_rnn_state.py | 12 +-
tests/python/relax/test_tir_call_source_kernel.py | 8 +-
tests/python/relax/test_transform.py | 34 +-
tests/python/relax/test_transform_alter_op_impl.py | 85 +-
.../test_transform_annotate_tir_op_pattern.py | 34 +-
...st_transform_attach_attr_layout_free_buffers.py | 34 +-
.../relax/test_transform_attach_global_symbol.py | 8 +-
tests/python/relax/test_transform_bind_params.py | 2 +-
tests/python/relax/test_transform_codegen_pass.py | 2 +-
.../relax/test_transform_compute_prim_value.py | 6 +-
tests/python/relax/test_transform_cse.py | 66 +-
.../relax/test_transform_dead_code_elimination.py | 32 +-
tests/python/relax/test_transform_fold_constant.py | 20 +-
tests/python/relax/test_transform_fuse_ops.py | 107 +-
.../relax/test_transform_fuse_ops_by_pattern.py | 42 +-
tests/python/relax/test_transform_fuse_tir.py | 178 +-
.../relax/test_transform_fuse_transpose_matmul.py | 12 +-
tests/python/relax/test_transform_gradient.py | 102 +-
.../relax/test_transform_gradient_te_register.py | 26 +-
tests/python/relax/test_transform_lambda_lift.py | 38 +-
.../relax/test_transform_lazy_transform_params.py | 114 +-
tests/python/relax/test_transform_legalize_ops.py | 26 +-
.../relax/test_transform_legalize_ops_binary.py | 182 +-
.../relax/test_transform_legalize_ops_ccl.py | 12 +-
.../test_transform_legalize_ops_create_datatype.py | 46 +-
.../test_transform_legalize_ops_distributed.py | 4 +-
.../relax/test_transform_legalize_ops_grad.py | 35 +-
.../relax/test_transform_legalize_ops_image.py | 6 +-
..._transform_legalize_ops_index_linear_algebra.py | 60 +-
.../test_transform_legalize_ops_manipulate.py | 154 +-
.../python/relax/test_transform_legalize_ops_nn.py | 179 +-
.../relax/test_transform_legalize_ops_qdq.py | 22 +-
...st_transform_legalize_ops_search_statistical.py | 69 +-
.../relax/test_transform_lift_transform_params.py | 66 +-
.../test_transform_merge_composite_functions.py | 12 +-
.../test_transform_meta_schedule_apply_database.py | 12 +-
.../relax/test_transform_meta_schedule_tuning.py | 8 +-
.../relax/test_transform_normalize_global_var.py | 4 +-
...st_transform_operator_specific_normalization.py | 20 +-
.../relax/test_transform_rewrite_cuda_graph.py | 64 +-
.../test_transform_rewrite_dataflow_reshape.py | 46 +-
...nsform_specialize_primfunc_based_on_callsite.py | 20 +-
.../test_transform_split_layout_rewrite_preproc.py | 30 +-
.../test_transform_static_plan_block_memory.py | 194 +-
.../relax/test_transform_to_mixed_precision.py | 60 +-
tests/python/relax/test_tvmscript_parser.py | 82 +-
tests/python/relax/test_tvmscript_printer_relax.py | 19 +-
tests/python/relax/test_tvmscript_pyfunc.py | 4 +-
.../relax/test_vm_alloc_storage_with_scope.py | 4 +-
tests/python/relax/test_vm_build.py | 26 +-
tests/python/relax/test_vm_codegen_only.py | 8 +-
tests/python/relax/test_vm_codegen_tir.py | 14 +-
tests/python/relax/test_vm_cuda_graph.py | 6 +-
tests/python/relax/texture/test_texture_nd.py | 4 +-
.../python/runtime/test_evaluator_with_preproc.py | 2 +-
tests/python/runtime/test_executable.py | 2 +-
tests/python/runtime/test_runtime_extension.py | 2 +-
tests/python/runtime/test_runtime_rpc.py | 4 +-
...st_s_tir_analysis_calculate_allocated_memory.py | 6 +-
.../test_s_tir_analysis_estimate_tir_flops.py | 14 +-
.../test_s_tir_analysis_identify_memcpy.py | 34 +-
.../test_s_tir_analysis_is_pure_function.py | 18 +-
.../s_tir/analysis/test_s_tir_analysis_oob.py | 10 +-
.../s_tir/analysis/test_sblock_access_region.py | 38 +-
.../analysis/test_sblock_buffer_access_lca.py | 10 +-
.../s_tir/base/test_sblock_dependence_info.py | 6 +-
tests/python/s_tir/base/test_tir_data_layout.py | 56 +-
.../s_tir/base/test_tir_te_extern_primfunc.py | 8 +-
tests/python/s_tir/dlight/test_benchmark.py | 10 +-
tests/python/s_tir/dlight/test_cpu_gemv.py | 36 +-
tests/python/s_tir/dlight/test_cpu_reduction.py | 8 +-
tests/python/s_tir/dlight/test_gpu_conv.py | 4 +-
tests/python/s_tir/dlight/test_gpu_fallback.py | 32 +-
tests/python/s_tir/dlight/test_gpu_gemv.py | 57 +-
.../s_tir/dlight/test_gpu_general_reduction.py | 65 +-
.../python/s_tir/dlight/test_gpu_low_batch_gemv.py | 45 +-
tests/python/s_tir/dlight/test_gpu_matmul.py | 30 +-
.../s_tir/dlight/test_gpu_matmul_tensorize.py | 29 +-
tests/python/s_tir/dlight/test_gpu_reduction.py | 124 +-
tests/python/s_tir/dlight/test_gpu_rmsnorm.py | 16 +-
tests/python/s_tir/dlight/test_gpu_transpose.py | 24 +-
tests/python/s_tir/dlight/test_primitives.py | 2 +-
.../meta_schedule/test_meta_schedule_arg_info.py | 2 +-
.../meta_schedule/test_meta_schedule_builder.py | 6 +-
.../meta_schedule/test_meta_schedule_cost_model.py | 4 +-
.../meta_schedule/test_meta_schedule_database.py | 4 +-
...schedule_feature_extractor_per_store_feature.py | 10 +-
.../test_meta_schedule_measure_callback.py | 2 +-
.../test_meta_schedule_mma_tensorize.py | 4 +-
...eta_schedule_mutator_mutate_compute_location.py | 2 +-
.../test_meta_schedule_mutator_mutate_parallel.py | 2 +-
..._meta_schedule_mutator_mutate_thread_binding.py | 2 +-
.../test_meta_schedule_mutator_mutate_tile_size.py | 2 +-
.../test_meta_schedule_mutator_mutate_unroll.py | 2 +-
.../test_meta_schedule_post_order_apply.py | 8 +-
...ule_postproc_disallow_async_strided_mem_copy.py | 2 +-
...meta_schedule_postproc_disallow_dynamic_loop.py | 4 +-
..._schedule_postproc_rewrite_cooperative_fetch.py | 4 +-
.../test_meta_schedule_postproc_rewrite_layout.py | 24 +-
...e_postproc_rewrite_parallel_vectorize_unroll.py | 20 +-
...ta_schedule_postproc_rewrite_reduction_block.py | 6 +-
...est_meta_schedule_postproc_rewrite_tensorize.py | 8 +-
...meta_schedule_postproc_rewrite_unbound_block.py | 20 +-
.../test_meta_schedule_postproc_verify_gpu_code.py | 16 +-
...est_meta_schedule_postproc_verify_vtcm_limit.py | 2 +-
.../meta_schedule/test_meta_schedule_runner.py | 10 +-
...test_meta_schedule_schedule_rule_add_rfactor.py | 14 +-
...eta_schedule_schedule_rule_apply_custom_rule.py | 2 +-
.../test_meta_schedule_schedule_rule_auto_bind.py | 12 +-
...test_meta_schedule_schedule_rule_auto_inline.py | 24 +-
...chedule_schedule_rule_cross_thread_reduction.py | 34 +-
.../test_meta_schedule_schedule_rule_mlt.py | 24 +-
.../test_meta_schedule_schedule_rule_mlt_intrin.py | 10 +-
.../test_meta_schedule_schedule_rule_mlt_tc.py | 18 +-
...dule_schedule_rule_parallel_vectorize_unroll.py | 8 +-
...hedule_schedule_rule_random_compute_location.py | 4 +-
.../test_meta_schedule_search_strategy.py | 4 +-
.../meta_schedule/test_meta_schedule_space_cpu.py | 90 +-
.../meta_schedule/test_meta_schedule_space_cuda.py | 34 +-
.../test_meta_schedule_space_cuda_async.py | 8 +-
.../test_meta_schedule_space_generator.py | 2 +-
.../test_meta_schedule_space_post_opt.py | 2 +-
.../test_meta_schedule_task_scheduler.py | 6 +-
.../test_meta_schedule_trace_apply.py | 34 +-
.../test_meta_schedule_tune_context.py | 2 +-
.../meta_schedule/test_meta_schedule_tune_tir.py | 4 +-
.../s_tir/schedule/test_tir_schedule_analysis.py | 10 +-
.../test_tir_schedule_annotate_buffer_access.py | 26 +-
.../schedule/test_tir_schedule_block_scope.py | 6 +-
.../s_tir/schedule/test_tir_schedule_blockize.py | 28 +-
.../schedule/test_tir_schedule_cache_index.py | 8 +-
.../schedule/test_tir_schedule_cache_read_write.py | 96 +-
.../s_tir/schedule/test_tir_schedule_compute_at.py | 122 +-
.../schedule/test_tir_schedule_compute_inline.py | 110 +-
.../test_tir_schedule_decompose_padding.py | 31 +-
.../s_tir/schedule/test_tir_schedule_error.py | 4 +-
.../s_tir/schedule/test_tir_schedule_for_kind.py | 63 +-
.../test_tir_schedule_fuse_reduction_epilogue.py | 16 +-
...ir_schedule_fuse_reduction_epilogue_clipping.py | 12 +-
...st_tir_schedule_fuse_reduction_epilogue_relu.py | 10 +-
.../s_tir/schedule/test_tir_schedule_merge.py | 16 +-
.../s_tir/schedule/test_tir_schedule_pad_einsum.py | 16 +-
.../s_tir/schedule/test_tir_schedule_partition.py | 20 +-
.../schedule/test_tir_schedule_read_write_at.py | 8 +-
.../s_tir/schedule/test_tir_schedule_reduction.py | 34 +-
.../s_tir/schedule/test_tir_schedule_reindex.py | 24 +-
.../s_tir/schedule/test_tir_schedule_reorder.py | 36 +-
.../test_tir_schedule_reorder_block_iter_var.py | 4 +-
.../s_tir/schedule/test_tir_schedule_rfactor.py | 272 +-
.../schedule/test_tir_schedule_rolling_buffer.py | 26 +-
.../s_tir/schedule/test_tir_schedule_sampling.py | 6 +-
.../test_tir_schedule_set_axis_separator.py | 18 +-
.../s_tir/schedule/test_tir_schedule_set_dtype.py | 8 +-
.../s_tir/schedule/test_tir_schedule_set_scope.py | 8 +-
.../s_tir/schedule/test_tir_schedule_split_fuse.py | 72 +-
.../s_tir/schedule/test_tir_schedule_state.py | 6 +-
.../test_tir_schedule_state_cached_flags.py | 40 +-
.../schedule/test_tir_schedule_storage_align.py | 6 +-
.../s_tir/schedule/test_tir_schedule_tensorize.py | 44 +-
..._tir_schedule_tensorize_ldmatrix_mma_numeric.py | 3 +-
.../s_tir/schedule/test_tir_schedule_trace.py | 6 +-
.../s_tir/schedule/test_tir_schedule_transform.py | 8 +-
.../schedule/test_tir_schedule_transform_layout.py | 186 +-
.../s_tir/schedule/test_tir_schedule_utilities.py | 16 +-
tests/python/s_tir/test_s_tir_renew_defs.py | 12 +-
...test_s_tir_transform_annotate_irregular_loop.py | 32 +-
.../test_s_tir_transform_canonicalize_loop.py | 12 +-
.../test_s_tir_transform_compact_buffer_region.py | 106 +-
...est_s_tir_transform_convert_blocks_to_opaque.py | 8 +-
.../test_s_tir_transform_default_gpu_schedule.py | 46 +-
.../test_s_tir_transform_hoist_expression.py | 78 +-
.../transform/test_s_tir_transform_hoist_if.py | 34 +-
.../test_s_tir_transform_inject_double_buffer.py | 8 +-
.../test_s_tir_transform_inject_permuted_layout.py | 40 +-
.../test_s_tir_transform_inject_ptx_async_copy.py | 100 +-
.../test_s_tir_transform_inject_ptx_ldg32.py | 4 +-
...est_s_tir_transform_inject_software_pipeline.py | 64 +-
.../test_s_tir_transform_inject_virtual_thread.py | 12 +-
.../test_s_tir_transform_lift_thread_binding.py | 4 +-
.../test_s_tir_transform_loop_partition.py | 72 +-
...s_tir_transform_lower_cross_thread_reduction.py | 82 +-
.../test_s_tir_transform_lower_init_block.py | 8 +-
.../test_s_tir_transform_lower_match_buffer.py | 40 +-
.../test_s_tir_transform_lower_opaque_block.py | 44 +-
...test_s_tir_transform_lower_thread_all_reduce.py | 24 +-
...transform_manifest_shared_memory_local_stage.py | 4 +-
...st_s_tir_transform_memhammer_lower_auto_copy.py | 34 +-
...form_merge_dynamic_shared_memory_allocations.py | 16 +-
...sform_plan_update_buffer_allocation_location.py | 36 +-
.../test_s_tir_transform_profiling_instr.py | 18 +-
.../transform/test_s_tir_transform_remove_undef.py | 18 +-
...transform_remove_weight_layout_rewrite_block.py | 4 +-
...st_s_tir_transform_renormalize_split_pattern.py | 10 +-
.../test_s_tir_transform_rewrite_unsafe_select.py | 6 +-
.../transform/test_s_tir_transform_thread_sync.py | 10 +-
.../test_s_tir_transform_unify_thread_binding.py | 30 +-
tests/python/target/test_arm_target.py | 8 +-
tests/python/target/test_target_target.py | 16 +-
tests/python/target/test_x86_features.py | 21 +
tests/python/te/test_te_create_primfunc.py | 58 +-
.../testing/test_tvm_testing_before_after.py | 18 +-
.../test_tir_analysis_verify_well_formed.py | 82 +-
tests/python/tirx-base/test_tir_base.py | 14 +-
tests/python/tirx-base/test_tir_expr_functor.py | 844 ++
tests/python/tirx-base/test_tir_host_func.py | 4 +-
tests/python/tirx-base/test_tir_imm_values.py | 54 +-
tests/python/tirx-base/test_tir_intrin.py | 2 +-
tests/python/tirx-base/test_tir_op_types.py | 60 +-
tests/python/tirx-base/test_tir_ptx_cp_async.py | 104 +-
.../test_tir_ptx_griddepcontrol.py} | 51 +-
tests/python/tirx-base/test_tir_ptx_ldmatrix.py | 4 +-
tests/python/tirx-base/test_tir_ptx_mma.py | 104 +-
tests/python/tirx-base/test_tir_ptx_mma_sp.py | 16 +-
.../tirx-base/test_tir_ptx_scalar_f32_math.py | 67 +
.../python/tirx-base/test_tir_scalable_datatype.py | 17 +-
tests/python/tirx-base/test_tir_specialize.py | 42 +-
tests/python/tirx-base/test_tir_stmt_functor.py | 1065 +++
.../test_tir_stmt_functor_ir_transform.py | 2 +-
.../tirx-base/test_tir_stmt_functor_substitute.py | 22 +-
.../tirx-base/test_tir_structural_equal_hash.py | 8 +-
tests/python/tirx-base/test_tir_texture_scope.py | 2 +-
.../test_tir_unsafe_hide_buffer_access.py | 6 +-
.../test_tir_inline_private_functions.py | 60 +-
.../test_tir_transform_annotate_device_regions.py | 8 +-
.../test_tir_transform_bf16_legalize.py | 30 +-
.../test_tir_transform_common_subexpr_elim.py | 68 +-
.../test_tir_transform_convert_ssa.py | 60 +-
.../test_tir_transform_device_kernel_launch.py | 44 +-
.../test_tir_transform_flatten_buffer.py | 100 +-
...test_tir_transform_force_narrow_index_to_i32.py | 39 +-
.../test_tir_transform_fp8_legalize.py | 6 +-
.../tirx-transform/test_tir_transform_helpers.py | 58 +-
.../test_tir_transform_lower_tvm_builtin.py | 26 +-
.../test_tir_transform_make_packed_api.py | 42 +-
.../test_tir_transform_narrow_datatype.py | 28 +-
...est_tir_transform_pointer_value_type_rewrite.py | 16 +-
.../test_tir_transform_remove_assume.py | 8 +-
.../test_tir_transform_remove_no_op.py | 120 +-
.../tirx-transform/test_tir_transform_simplify.py | 297 +-
.../test_tir_transform_split_host_device.py | 47 +-
.../test_tir_transform_storage_rewrite.py | 50 +-
.../test_tir_transform_unroll_loop.py | 14 +-
.../tirx-transform/test_tir_transform_vectorize.py | 121 +-
.../python/tirx/__init__.py | 3 -
.../python/tirx/codegen/test_codegen_blackwell.py | 422 +
tests/python/tirx/codegen/test_codegen_cuda.py | 826 ++
tests/python/tirx/codegen/test_codegen_dsmem.py | 94 +
tests/python/tirx/codegen/test_codegen_hopper.py | 1115 +++
tests/python/tirx/codegen/test_codegen_nki.py | 335 +
tests/python/tirx/codegen/test_codegen_nvshmem.py | 309 +
tests/python/tirx/codegen/test_cuda_copy.py | 230 +
tests/python/tirx/codegen/test_cuda_cta_reduce.py | 196 +
tests/python/tirx/codegen/test_cuda_warp_reduce.py | 187 +
.../operator/tile_primitive/cuda/test_binary.py | 772 ++
.../tile_primitive/cuda/test_copy_async_cta.py | 128 +
.../tile_primitive/cuda/test_copy_async_tma.py | 1596 ++++
.../tile_primitive/cuda/test_copy_async_tmem.py | 137 +
.../tile_primitive/cuda/test_copy_dsmem.py | 248 +
.../operator/tile_primitive/cuda/test_copy_sync.py | 440 ++
.../tirx/operator/tile_primitive/cuda/test_fma.py | 332 +
.../tile_primitive/cuda/test_gemm_async.py | 1924 +++++
.../tile_primitive/cuda/test_permute_dims.py | 152 +
.../operator/tile_primitive/cuda/test_reduction.py | 1065 +++
.../tile_primitive/cuda/test_smem_tmem_dispatch.py | 471 ++
.../operator/tile_primitive/cuda/test_unary.py | 1265 +++
.../operator/tile_primitive/test_dispatcher.py | 158 +
.../operator/tile_primitive/trn/test_binary_trn.py | 360 +
.../tile_primitive/trn/test_compose_op_trn.py | 800 ++
.../operator/tile_primitive/trn/test_copy_trn.py | 869 ++
.../operator/tile_primitive/trn/test_gemm_trn.py | 601 ++
.../tile_primitive/trn/test_private_alloc_trn.py | 401 +
.../tile_primitive/trn/test_reduction_trn.py | 289 +
.../operator/tile_primitive/trn/test_select_trn.py | 188 +
.../operator/tile_primitive/trn/test_unary_trn.py | 294 +
tests/python/tirx/test_alloc_pool.py | 117 +
tests/python/tirx/test_bench_utils.py | 213 +
tests/python/tirx/test_buffer_print.py | 392 +
tests/python/tirx/test_control_flow.py | 113 +
tests/python/tirx/test_exec_context.py | 428 +
.../python/tirx/test_exec_scope.py | 51 +-
tests/python/tirx/test_hint.py | 301 +
tests/python/tirx/test_inline.py | 261 +
tests/python/tirx/test_layout.py | 1749 ++++
tests/python/tirx/test_op.py | 223 +
tests/python/tirx/test_parser_printer.py | 1970 +++++
tests/python/tirx/test_printer_tir_namespaces.py | 448 ++
.../python/tirx/test_roundtrip_namespaces.py | 37 +-
tests/python/tirx/test_verifier.py | 431 +
tests/python/tirx/transform/test_expr_functor.py | 844 ++
tests/python/tirx/transform/test_stmt_functor.py | 1158 +++
.../tirx/transform/test_transform_lower_tirx.py | 1572 ++++
.../transform/test_transform_naive_allocator.py | 176 +
.../test_transform_static_horizontal_fusion.py | 5 +-
.../python/tirx/utils.py | 3 -
tests/python/tvmscript/test_tvmscript_complete.py | 25 +-
.../tvmscript/test_tvmscript_error_report.py | 6 +-
.../tvmscript/test_tvmscript_ir_builder_tir.py | 28 +-
.../tvmscript/test_tvmscript_meta_programming.py | 16 +-
tests/python/tvmscript/test_tvmscript_ops.py | 139 +-
.../tvmscript/test_tvmscript_parser_source.py | 2 +-
.../python/tvmscript/test_tvmscript_parser_tir.py | 138 +-
.../tvmscript/test_tvmscript_pep563_closure.py | 30 +-
.../tvmscript/test_tvmscript_printer_annotation.py | 20 +-
.../tvmscript/test_tvmscript_printer_highlight.py | 2 +-
.../python/tvmscript/test_tvmscript_printer_ir.py | 5 +-
.../tvmscript/test_tvmscript_printer_metadata.py | 4 +-
.../test_tvmscript_printer_python_doc_printer.py | 11 +-
.../test_tvmscript_printer_structural_equal.py | 20 +-
.../python/tvmscript/test_tvmscript_printer_tir.py | 150 +-
.../test_tvmscript_printer_underlining.py | 31 +-
.../python/tvmscript/test_tvmscript_regression.py | 16 +-
tests/python/tvmscript/test_tvmscript_roundtrip.py | 275 +-
.../tvmscript/test_tvmscript_syntax_sugar.py | 100 +-
tests/python/tvmscript/test_tvmscript_type.py | 10 +-
tests/scripts/setup-pytest-env.sh | 14 +
791 files changed, 89746 insertions(+), 11326 deletions(-)
create mode 100644 .claude/commands/tir-bench.md
create mode 100644 .claude/commands/tir-build.md
create mode 100644 .claude/commands/tir-test.md
create mode 100755 .claude/scripts/monitor_gpu.sh
create mode 100644 include/tvm/tirx/async_structs.h
create mode 100644 include/tvm/tirx/exec_context.h
create mode 100644 include/tvm/tirx/exec_scope.h
create mode 100644 include/tvm/tirx/layout.h
create mode 100644 include/tvm/tirx/predicate.h
create mode 100644 include/tvm/tirx/target_builtin/cuda.h
create mode 100644 include/tvm/tirx/target_builtin/trn.h
create mode 100644 include/tvm/tirx/tirx_op.h
create mode 100644 include/tvm/tirx/tirx_stmt.h
create mode 100644 python/tvm/tirx/bench.py
create mode 100644 python/tvm/tirx/compilation_pipeline.py
create mode 100644 python/tvm/tirx/exec_context.py
create mode 100644 python/tvm/tirx/exec_scope.py
create mode 100644 python/tvm/tirx/expr_functor.py
copy .markdownlint-cli2.yaml => python/tvm/tirx/lang/__init__.py (97%)
create mode 100644 python/tvm/tirx/lang/alloc_pool.py
create mode 100644 python/tvm/tirx/lang/pipeline.py
create mode 100644 python/tvm/tirx/lang/smem_desc.py
create mode 100644 python/tvm/tirx/lang/tile_scheduler.py
create mode 100644 python/tvm/tirx/lang/warp_role.py
create mode 100644 python/tvm/tirx/layout.py
create mode 100644 python/tvm/tirx/operator/__init__.py
create mode 100644 python/tvm/tirx/operator/intrinsics/_common.py
create mode 100644 python/tvm/tirx/operator/intrinsics/_schema.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/__init__.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/cp_async.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/header.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/math.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/memory.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/misc.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/mma.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/nvshmem.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/registry.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/sync.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/tcgen05.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/types.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/utils.py
create mode 100644 python/tvm/tirx/operator/intrinsics/cuda/wgmma.py
copy python/tvm/{topi/image => tirx/operator/tile_primitive}/__init__.py (54%)
copy tests/python/tvmscript/test_tvmscript_parser_ir.py =>
python/tvm/tirx/operator/tile_primitive/common.py (64%)
copy python/tvm/{contrib/hexagon =>
tirx/operator/tile_primitive/cuda}/__init__.py (91%)
create mode 100644 python/tvm/tirx/operator/tile_primitive/cuda/common.py
copy python/tvm/{relax/backend/adreno/transform =>
tirx/operator/tile_primitive/cuda/copy}/__init__.py (78%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/copy/collective.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/cuda/copy/scalar.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/cuda/copy/utils.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/copy/vectorized.py
copy python/tvm/{relax/script/builder =>
tirx/operator/tile_primitive/cuda/copy_async}/__init__.py (70%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/copy_async/cp_async.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/copy_async/dsmem.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/copy_async/tcgen05_cp.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/copy_async/tcgen05_ldst.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/copy_async/tma.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/copy_async/utils.py
copy python/tvm/{s_tir/meta_schedule/mutator/mutate_parallel.py =>
tirx/operator/tile_primitive/cuda/elementwise/__init__.py} (56%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/elementwise/_common.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/elementwise/register.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/elementwise/schedule_collective_reg.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/elementwise/schedule_collective_smem.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/elementwise/schedule_thread.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/elementwise/schema.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/exec_scope_utils.py
copy .markdownlint-cli2.yaml =>
python/tvm/tirx/operator/tile_primitive/cuda/gemm_async/__init__.py (97%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/gemm_async/tcgen05.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/cuda/gemm_utils.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/cuda/layout_utils.py
copy .markdownlint-cli2.yaml =>
python/tvm/tirx/operator/tile_primitive/cuda/permute_dims/__init__.py (95%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/permute_dims/vectorized_last_2d.py
copy python/tvm/{contrib/hexagon =>
tirx/operator/tile_primitive/cuda/reduction}/__init__.py (91%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/reduction/local.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/reduction/shared.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/reduction/sm100_packed.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/cuda/reduction/utils.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/cuda/tma_utils.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/dispatch_context.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/dispatcher.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/ops.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/registry.py
copy {ci/scripts/github =>
python/tvm/tirx/operator/tile_primitive/trn}/__init__.py (80%)
copy python/tvm/{contrib/hexagon =>
tirx/operator/tile_primitive/trn/binary}/__init__.py (94%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/binary/default.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/trn/binary/utils.py
copy python/tvm/{relax/op/mask.py =>
tirx/operator/tile_primitive/trn/common.py} (61%)
copy python/tvm/{contrib/cutlass =>
tirx/operator/tile_primitive/trn/compose_op}/__init__.py (84%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/compose_op/binary_chain.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/compose_op/binary_reduce.py
copy python/tvm/{s_tir/meta_schedule/feature_extractor/__init__.py =>
tirx/operator/tile_primitive/trn/compose_op/compose_op.py} (50%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/compose_op/reduce_negate.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/compose_op/unary_reduce.py
copy python/tvm/{runtime/disco/__init__.py =>
tirx/operator/tile_primitive/trn/compose_op/utils.py} (57%)
copy .markdownlint-cli2.yaml =>
python/tvm/tirx/operator/tile_primitive/trn/copy/__init__.py (97%)
create mode 100644 python/tvm/tirx/operator/tile_primitive/trn/copy/default.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/trn/dim_utils.py
copy .markdownlint-cli2.yaml =>
python/tvm/tirx/operator/tile_primitive/trn/gemm/__init__.py (97%)
create mode 100644 python/tvm/tirx/operator/tile_primitive/trn/gemm/default.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/instruction_generator.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/trn/private_alloc.py
copy .markdownlint-cli2.yaml =>
python/tvm/tirx/operator/tile_primitive/trn/reduction/__init__.py (97%)
copy ci/scripts/jenkins/git_change_docs.sh =>
python/tvm/tirx/operator/tile_primitive/trn/reduction/default.py (62%)
mode change 100755 => 100644
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/reduction/utils.py
copy .markdownlint-cli2.yaml =>
python/tvm/tirx/operator/tile_primitive/trn/select/__init__.py (97%)
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/select/default.py
copy python/tvm/{contrib/hexagon =>
tirx/operator/tile_primitive/trn/unary}/__init__.py (91%)
create mode 100644 python/tvm/tirx/operator/tile_primitive/trn/unary/default.py
create mode 100644 python/tvm/tirx/operator/tile_primitive/trn/unary/utils.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/unary/with_bias_scale.py
create mode 100644
python/tvm/tirx/operator/tile_primitive/trn/workspace_utils.py
delete mode 100644 python/tvm/tirx/pipeline.py
copy python/tvm/{s_tir/meta_schedule/search_strategy/replay_trace.py =>
tirx/predicate.py} (53%)
create mode 100644 python/tvm/tirx/script/builder/tirx.py
copy docker/install/ubuntu_install_coreml.sh =>
python/tvm/tirx/script/builder/tmem_pool.py (87%)
mode change 100755 => 100644
create mode 100644 python/tvm/tirx/transform/common.py
create mode 100644 python/tvm/tirx/transform/trn/__init__.py
create mode 100644 python/tvm/tirx/transform/trn/naive_allocator.py
create mode 100644 python/tvm/tirx/transform/trn/private_buffer_alloc.py
create mode 100644 src/runtime/contrib/nvshmem/dist_gemm.cu
create mode 100644 src/runtime/crt/common/crt_runtime_api.c
create mode 100644 src/runtime/meta_data.h
create mode 100644 src/target/source/codegen_trn.cc
create mode 100644 src/target/source/codegen_trn.h
create mode 100644 src/tirx/analysis/exec_context.cc
create mode 100644 src/tirx/analysis/verify_tirx_well_formed.cc
create mode 100644 src/tirx/ir/async_structs.cc
create mode 100644 src/tirx/ir/exec_scope.cc
create mode 100644 src/tirx/ir/layout/axis_registry.cc
create mode 100644 src/tirx/ir/layout/compose_layout.cc
create mode 100644 src/tirx/ir/layout/layout.cc
create mode 100644 src/tirx/ir/layout/swizzle_layout.cc
create mode 100644 src/tirx/ir/layout/tile_canonicalize.cc
create mode 100644 src/tirx/ir/layout/tile_core.cc
create mode 100644 src/tirx/ir/layout/tile_direct_sum_ops.cc
create mode 100644 src/tirx/ir/layout/tile_internal.h
create mode 100644 src/tirx/ir/layout/tile_slice.cc
create mode 100644 src/tirx/ir/layout/tile_tile_ops.cc
create mode 100644 src/tirx/ir/layout/utils.cc
create mode 100644 src/tirx/ir/layout/utils.h
copy src/{ir/global_info.cc => tirx/ir/predicate.cc} (50%)
create mode 100644 src/tirx/ir/tirx_stmt.cc
create mode 100644 src/tirx/op/target_builtin/cuda.cc
create mode 100644 src/tirx/op/target_builtin/trn.cc
create mode 100644 src/tirx/op/tirx.cc
create mode 100644 src/tirx/transform/lower_tirx.cc
create mode 100644 src/tirx/transform/lower_tirx_cleanup.cc
create mode 100644 src/tirx/transform/lower_tirx_dedup_tensormap.cc
copy src/{s_tir/transform/lower_opaque_block.cc =>
tirx/transform/lower_tirx_opaque.cc} (60%)
create mode 100644 src/tirx/transform/tile_primitive_dispatch.cc
create mode 100644 tests/python/codegen/test_target_codegen_cuda_fastmath.py
create mode 100644 tests/python/tirx-base/test_tir_expr_functor.py
copy tests/python/{codegen/test_inject_ptx_ldg32.py =>
tirx-base/test_tir_ptx_griddepcontrol.py} (53%)
create mode 100644 tests/python/tirx-base/test_tir_ptx_scalar_f32_math.py
create mode 100644 tests/python/tirx-base/test_tir_stmt_functor.py
copy .markdownlint-cli2.yaml => tests/python/tirx/__init__.py (97%)
create mode 100644 tests/python/tirx/codegen/test_codegen_blackwell.py
create mode 100644 tests/python/tirx/codegen/test_codegen_cuda.py
create mode 100644 tests/python/tirx/codegen/test_codegen_dsmem.py
create mode 100644 tests/python/tirx/codegen/test_codegen_hopper.py
create mode 100644 tests/python/tirx/codegen/test_codegen_nki.py
create mode 100644 tests/python/tirx/codegen/test_codegen_nvshmem.py
create mode 100644 tests/python/tirx/codegen/test_cuda_copy.py
create mode 100644 tests/python/tirx/codegen/test_cuda_cta_reduce.py
create mode 100644 tests/python/tirx/codegen/test_cuda_warp_reduce.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_binary.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_cta.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tma.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_copy_async_tmem.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_copy_dsmem.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_copy_sync.py
create mode 100644 tests/python/tirx/operator/tile_primitive/cuda/test_fma.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_gemm_async.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_permute_dims.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_reduction.py
create mode 100644
tests/python/tirx/operator/tile_primitive/cuda/test_smem_tmem_dispatch.py
create mode 100644 tests/python/tirx/operator/tile_primitive/cuda/test_unary.py
create mode 100644 tests/python/tirx/operator/tile_primitive/test_dispatcher.py
create mode 100644
tests/python/tirx/operator/tile_primitive/trn/test_binary_trn.py
create mode 100644
tests/python/tirx/operator/tile_primitive/trn/test_compose_op_trn.py
create mode 100644
tests/python/tirx/operator/tile_primitive/trn/test_copy_trn.py
create mode 100644
tests/python/tirx/operator/tile_primitive/trn/test_gemm_trn.py
create mode 100644
tests/python/tirx/operator/tile_primitive/trn/test_private_alloc_trn.py
create mode 100644
tests/python/tirx/operator/tile_primitive/trn/test_reduction_trn.py
create mode 100644
tests/python/tirx/operator/tile_primitive/trn/test_select_trn.py
create mode 100644
tests/python/tirx/operator/tile_primitive/trn/test_unary_trn.py
create mode 100644 tests/python/tirx/test_alloc_pool.py
create mode 100644 tests/python/tirx/test_bench_utils.py
create mode 100644 tests/python/tirx/test_buffer_print.py
create mode 100644 tests/python/tirx/test_control_flow.py
create mode 100644 tests/python/tirx/test_exec_context.py
copy python/tvm/relax/transform/legalize_ops/adreno/convolution.py =>
tests/python/tirx/test_exec_scope.py (50%)
create mode 100644 tests/python/tirx/test_hint.py
create mode 100644 tests/python/tirx/test_inline.py
create mode 100644 tests/python/tirx/test_layout.py
create mode 100644 tests/python/tirx/test_op.py
create mode 100644 tests/python/tirx/test_parser_printer.py
create mode 100644 tests/python/tirx/test_printer_tir_namespaces.py
copy python/tvm/exec/disco_remote_socket_session.py =>
tests/python/tirx/test_roundtrip_namespaces.py (50%)
create mode 100644 tests/python/tirx/test_verifier.py
create mode 100644 tests/python/tirx/transform/test_expr_functor.py
create mode 100644 tests/python/tirx/transform/test_stmt_functor.py
create mode 100644 tests/python/tirx/transform/test_transform_lower_tirx.py
create mode 100644
tests/python/tirx/transform/test_transform_naive_allocator.py
copy .markdownlint-cli2.yaml =>
tests/python/tirx/transform/test_transform_static_horizontal_fusion.py (96%)
copy .markdownlint-cli2.yaml => tests/python/tirx/utils.py (97%)