This is an automated email from the ASF dual-hosted git repository. tqchen pushed a commit to branch unity-staging in repository https://gitbox.apache.org/repos/asf/tvm.git
commit 90c64c6dcebb11a0526107b62ea408e06d928960 Merge: 37ad0497fe 1e6e2b35ae Author: tqchen <[email protected]> AuthorDate: Wed Sep 6 10:45:29 2023 -0400 [MERGE] Merge main into unity 2023-09-06 NOTE: use the original webgpu impl to make sure webgpu is stable. CMakeLists.txt | 38 ++- apps/android_camera/models/prepare_model.py | 2 +- apps/android_rpc/tests/android_rpc_test.py | 6 +- apps/benchmark/adreno/adreno_gpu_bench_clml.py | 2 +- apps/benchmark/adreno/adreno_gpu_bench_texture.py | 2 +- apps/benchmark/arm_cpu_imagenet_bench.py | 2 +- apps/benchmark/mobile_gpu_imagenet_bench.py | 2 +- apps/cpp_rpc/rpc_env.cc | 16 + apps/hexagon_launcher/README.md | 2 +- apps/ios_rpc/tests/ios_rpc_mobilenet.py | 2 +- apps/ios_rpc/tests/ios_rpc_test.py | 4 +- apps/microtvm/cmsisnn/requirements.txt | 54 +-- apps/microtvm/ethosu/corstone300.ld | 1 + apps/microtvm/ethosu/requirements.txt | 54 +-- apps/microtvm/poetry.lock | 175 +--------- apps/microtvm/pyproject.toml | 2 +- apps/topi_recipe/gemm/android_gemm_square.py | 2 +- cmake/config.cmake | 10 + cmake/modules/LLVM.cmake | 3 + cmake/modules/LibInfo.cmake | 8 + cmake/utils/FindLLVM.cmake | 12 + cmake/utils/FindRCCL.cmake | 52 +++ docker/bash.sh | 17 +- .../ubuntu_download_arm_compute_lib_binaries.sh | 2 +- docker/install/ubuntu_install_python_package.sh | 2 +- docs/conf.py | 10 +- docs/contribute/community.rst | 11 + docs/how_to/deploy/android.rst | 2 +- .../how_to/deploy_models/deploy_model_on_adreno.py | 2 +- .../deploy_models/deploy_model_on_android.py | 2 +- .../ci_logs/resnet-18-NHWC-B1-cuda.json | 48 +-- .../tune_with_autoscheduler/tune_network_arm.py | 2 +- .../tune_with_autoscheduler/tune_network_mali.py | 2 +- gallery/how_to/tune_with_autotvm/tune_relay_arm.py | 2 +- .../tune_with_autotvm/tune_relay_mobile_gpu.py | 2 +- .../how_to/work_with_microtvm/micro_mlperftiny.py | 2 +- include/tvm/arith/analyzer.h | 4 +- include/tvm/node/script_printer.h | 3 + include/tvm/relay/dataflow_pattern.h | 6 + include/tvm/relay/transform.h | 14 +- include/tvm/runtime/module.h | 5 + include/tvm/runtime/ndarray.h | 8 +- include/tvm/runtime/vm/bytecode.h | 19 +- include/tvm/runtime/vm/executable.h | 5 +- include/tvm/target/codegen.h | 16 + include/tvm/target/target_kind.h | 2 +- include/tvm/tir/analysis.h | 4 +- include/tvm/tir/builtin.h | 63 +++- include/tvm/tir/schedule/schedule.h | 2 +- include/tvm/topi/x86/injective.h | 2 +- python/tvm/arith/__init__.py | 1 + python/tvm/arith/analyzer.py | 10 +- python/tvm/arith/int_set.py | 8 + python/tvm/auto_scheduler/measure.py | 2 +- python/tvm/auto_scheduler/search_task.py | 2 +- python/tvm/autotvm/measure/measure_methods.py | 2 +- python/tvm/contrib/cc.py | 6 +- python/tvm/contrib/pipeline_executor_build.py | 2 +- python/tvm/contrib/torch/optimize_torch.py | 24 +- python/tvm/driver/tvmc/model.py | 5 +- python/tvm/ir/expr.py | 6 + python/tvm/meta_schedule/builder/local_builder.py | 2 +- python/tvm/meta_schedule/runner/local_runner.py | 7 +- .../meta_schedule/testing/custom_builder_runner.py | 2 +- python/tvm/relay/backend/contrib/ethosu/codegen.py | 2 +- python/tvm/relay/backend/executor_factory.py | 4 +- python/tvm/relay/dataflow_pattern/__init__.py | 13 + python/tvm/relay/frontend/onnx.py | 9 +- python/tvm/relay/frontend/pytorch.py | 12 +- python/tvm/relay/op/contrib/cmsisnn.py | 14 +- python/tvm/relay/op/contrib/dnnl.py | 2 +- python/tvm/relay/op/strategy/arm_cpu.py | 98 +++--- python/tvm/relay/quantize/_annotate.py | 4 +- python/tvm/rpc/client.py | 17 + python/tvm/runtime/module.py | 5 +- python/tvm/runtime/script_printer.py | 11 + python/tvm/script/ir_builder/tir/ir.py | 12 + python/tvm/script/parser/core/doc.py | 21 +- python/tvm/script/parser/core/evaluator.py | 21 +- python/tvm/testing/runner.py | 2 +- python/tvm/testing/utils.py | 27 +- python/tvm/tir/__init__.py | 13 +- python/tvm/tir/op.py | 164 +++++++++- python/tvm/tir/schedule/schedule.py | 2 +- python/tvm/tir/transform/transform.py | 14 + python/tvm/topi/arm_cpu/conv2d_alter_op.py | 23 +- python/tvm/topi/arm_cpu/conv2d_gemm.py | 45 ++- .../arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py | 8 +- .../topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py | 87 ++++- .../arm_cpu/mprofile/dsp/micro_kernel/max_pool.py | 13 +- .../arm_cpu/mprofile/dsp/micro_kernel/tensordot.py | 7 +- python/tvm/topi/hexagon/compute_poolarea.py | 2 +- python/tvm/topi/hexagon/slice_ops/max_pool2d.py | 2 +- src/arith/canonical_simplify.cc | 58 +++- src/arith/detect_linear_equation.cc | 3 +- src/arith/iter_affine_map.cc | 35 +- src/arith/presburger_set.cc | 276 ++++++++++++++++ src/arith/presburger_set.h | 194 +++++++++++ src/arith/rewrite_simplify.cc | 2 + src/contrib/torch/base64.h | 75 ----- .../tvm_module_wrapper/RuntimeModuleWrapperTVM.cc | 143 +++++--- src/node/script_printer.cc | 3 + src/node/structural_hash.cc | 17 + src/relay/backend/contrib/cmsisnn/compute_luts.cc | 76 +++++ src/relay/backend/contrib/cmsisnn/compute_luts.h | 55 ++++ src/relay/backend/contrib/cmsisnn/relay_to_tir.cc | 151 +++++++-- src/relay/backend/contrib/cmsisnn/target.cc | 3 +- .../backend/contrib/cmsisnn/tir_to_runtime.cc | 84 ++++- src/relay/backend/contrib/ethosu/codegen.cc | 4 +- .../backend/contrib/example_target_hooks/target.cc | 5 +- .../contrib/example_target_hooks/tir_to_runtime.cc | 26 +- src/relay/backend/contrib/uma/targets.cc | 30 +- src/relay/backend/contrib/uma/tir_to_runtime.cc | 34 +- src/relay/backend/vm/compiler.cc | 41 ++- src/relay/backend/vm/manifest_lifetimes.cc | 4 +- src/relay/ir/dataflow_matcher.cc | 21 +- src/relay/ir/dataflow_pattern.cc | 10 + src/relay/ir/dataflow_pattern_functor.cc | 6 +- src/relay/ir/indexed_graph.cc | 7 +- src/relay/op/memory/memory.cc | 20 +- src/relay/op/memory/memory.h | 5 +- src/relay/op/nn/convolution.cc | 4 +- src/relay/qnn/op/convolution.cc | 17 +- src/relay/transforms/annotate_texture_storage.cc | 9 + src/relay/transforms/device_domains.cc | 7 +- src/relay/transforms/memory_alloc.cc | 4 +- src/relay/transforms/to_mixed_precision.cc | 9 +- src/runtime/c_runtime_api.cc | 2 +- src/runtime/contrib/papi/papi.cc | 7 +- src/runtime/hexagon/hexagon_device_api.cc | 2 +- src/runtime/hexagon/ops/conv2d_fp16_hvx.cc | 2 +- src/runtime/library_module.cc | 9 - src/runtime/library_module.h | 10 + src/runtime/metal/metal_device_api.mm | 2 +- src/runtime/opencl/opencl_device_api.cc | 2 +- src/runtime/vm/bytecode.cc | 56 +++- src/runtime/vm/executable.cc | 34 +- src/runtime/vm/profiler/vm.cc | 16 +- src/runtime/vm/vm.cc | 47 ++- src/script/ir_builder/tir/ir.cc | 4 +- src/script/printer/ir_docsifier.cc | 10 +- src/script/printer/utils.h | 7 +- src/support/libinfo.cc | 11 + src/support/str_escape.h | 56 +++- src/target/codegen.cc | 102 ++++-- src/target/opt/build_cuda_on.cc | 18 +- src/target/source/codegen_aocl.cc | 19 +- src/target/source/codegen_c.cc | 153 ++++++--- src/target/source/codegen_c.h | 59 +++- src/target/source/codegen_c_host.cc | 93 +++--- src/target/source/codegen_c_host.h | 3 +- src/target/source/codegen_cuda.cc | 56 +++- src/target/source/codegen_cuda.h | 4 +- src/target/source/codegen_metal.cc | 77 +++-- src/target/source/codegen_metal.h | 3 +- src/target/source/codegen_opencl.cc | 24 +- src/target/source/codegen_vhls.cc | 34 +- src/target/source/ptx.cc | 145 ++++++++- src/target/source/ptx.h | 62 ++++ src/target/source/source_module.cc | 6 +- src/target/spirv/codegen_spirv.cc | 11 + src/target/spirv/codegen_spirv.h | 1 + src/tir/analysis/estimate_flops.cc | 20 +- src/tir/op/builtin.cc | 16 + src/tir/op/op.cc | 26 ++ src/tir/schedule/analysis.h | 2 +- src/tir/schedule/primitive.h | 2 +- .../schedule/primitive/layout_transformation.cc | 2 +- src/tir/transforms/inject_virtual_thread.cc | 2 +- src/tir/transforms/split_host_device.cc | 9 +- src/tir/transforms/storage_rewrite.cc | 112 +++++-- .../cpp/arith_integer_set_test.cc | 50 ++- tests/cpp/target_test.cc | 3 +- tests/python/contrib/test_clml/infrastructure.py | 2 +- tests/python/contrib/test_cmsisnn/test_softmax.py | 43 +++ tests/python/contrib/test_ethosn/test_codegen.py | 4 +- tests/python/frontend/onnx/test_forward.py | 95 ++++++ tests/python/frontend/pytorch/test_forward.py | 10 + tests/python/frontend/tflite/test_forward.py | 35 +- tests/python/relay/aot/test_c_device_api.py | 42 +-- .../relay/aot/test_crt_forward_declarations.py | 10 +- .../opencl_texture/test_conv2d_nchw_texture.py | 361 ++++++++++++++++----- .../opencl_texture/test_conv2d_nhwc_texture.py | 245 ++++++++++---- .../test_depthwise_conv2d_nchw_texture.py | 52 ++- .../test_depthwise_conv2d_nhwc_texture.py | 50 ++- .../relay/opencl_texture/test_injection_texture.py | 33 +- tests/python/relay/opencl_texture/test_network.py | 24 +- .../relay/opencl_texture/test_pool_texture.py | 63 +++- .../relay/opencl_texture/test_reduction_texture.py | 87 +++-- .../relay/opencl_texture/utils/adreno_utils.py | 86 ++++- .../relay/strategy/test_select_implementation.py | 89 ++++- tests/python/relay/test_dataflow_pattern.py | 87 +++++ tests/python/relay/test_pass_alter_op_layout.py | 2 +- tests/python/relay/test_pass_auto_quantize.py | 75 +++++ .../relay/test_pass_dead_code_elimination.py | 18 +- tests/python/relay/test_pass_plan_devices.py | 11 +- tests/python/relay/test_to_mixed_precision.py | 35 +- tests/python/topi/python/test_topi_conv2d_int8.py | 11 +- .../topi/python/test_topi_conv2d_tensordot_opts.py | 28 +- .../unittest/test_arith_canonical_simplify.py | 40 +++ .../unittest/test_arith_detect_linear_equation.py | 4 + .../python/unittest/test_arith_iter_affine_map.py | 27 ++ .../python/unittest/test_arith_rewrite_simplify.py | 2 + ...e_postproc_rewrite_parallel_vectorize_unroll.py | 6 +- .../test_meta_schedule_space_cuda_winograd.py | 4 +- .../unittest/test_roundtrip_runtime_module.py | 121 +++++++ tests/python/unittest/test_runtime_module_load.py | 2 +- tests/python/unittest/test_runtime_rpc.py | 2 +- tests/python/unittest/test_target_codegen_blob.py | 4 +- .../python/unittest/test_target_codegen_c_host.py | 51 ++- .../test_tir_analysis_estimate_tir_flops.py | 28 +- tests/python/unittest/test_tir_op_types.py | 35 ++ tests/python/unittest/test_tir_ptx_cp_async.py | 112 +++++++ .../test_tir_transform_inject_ptx_async_copy.py | 109 ++++--- .../test_tir_transform_lower_warp_memory.py | 10 +- ...form_merge_dynamic_shared_memory_allocations.py | 2 +- ...est_tir_transform_pointer_value_type_rewrite.py | 73 +++++ .../unittest/test_tir_transform_thread_sync.py | 2 +- .../python/unittest/test_tvmscript_error_report.py | 8 - .../unittest/test_tvmscript_ir_builder_tir.py | 20 ++ tests/python/unittest/test_tvmscript_parser_tir.py | 33 ++ .../python/unittest/test_tvmscript_printer_tir.py | 41 +++ tests/scripts/task_python_integration.sh | 2 +- tests/scripts/task_python_microtvm.sh | 1 + vta/python/vta/transform.py | 2 +- web/src/runtime.ts | 2 +- web/src/support.ts | 4 +- web/tests/python/prepare_test_libs.py | 2 +- web/tests/python/webgpu_rpc_test.py | 2 +- web/tests/python/websock_rpc_test.py | 2 +- 230 files changed, 5165 insertions(+), 1458 deletions(-) diff --cc CMakeLists.txt index 7c40a08b9b,4f989a3d90..d869f90b8c --- a/CMakeLists.txt +++ b/CMakeLists.txt @@@ -855,13 -867,14 +881,23 @@@ if(USE_CUDA AND USE_CUTLASS install(TARGETS fpA_intB_gemm EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) target_link_libraries(tvm PRIVATE fpA_intB_gemm) target_link_libraries(tvm_runtime PRIVATE fpA_intB_gemm) + + install(TARGETS flash_attn EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX}) + target_link_libraries(tvm PRIVATE -Wl,--no-as-needed flash_attn) + target_link_libraries(tvm_runtime PRIVATE -Wl,--no-as-needed flash_attn) +endif() + +if(USE_CUDA AND USE_NCCL) + target_link_libraries(tvm_runtime PRIVATE nccl) + target_link_libraries(tvm PRIVATE nccl) endif() + + if(USE_CUDA AND USE_NCCL) + target_link_libraries(tvm PRIVATE nccl) + target_link_libraries(tvm_runtime PRIVATE nccl) + endif() + + if(USE_ROCM AND USE_RCCL) + target_link_libraries(tvm PRIVATE rccl) + target_link_libraries(tvm_runtime PRIVATE rccl) + endif() diff --cc include/tvm/relay/transform.h index f4286512e5,35f0852f36..a767c36d71 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@@ -47,19 -47,19 +47,31 @@@ using PassInfoNode = tvm::transform::Pa using PassContext = tvm::transform::PassContext; using PassContextNode = tvm::transform::PassContextNode; using Sequential = tvm::transform::Sequential; +using FTVMRelayToTIR = tvm::transform::Pass; +/*! + * \brief TIRToRuntime conversion specific to a TargetKind + * + * This function is responsible for scanning an IRModule for appropriate Target-specific functions + and generating a Runtime module representing the compiled output + * + * \param ir_module Unified IRModule + * \param target Target to filter on or retrieve arguments from + * \return Runtime Module containing compiled functions + */ +using FTVMTIRToRuntime = tvm::runtime::TypedPackedFunc<runtime::Module(IRModule, Target)>; + /*! + * \brief RelayToTIR tvm::transform::Pass specific to a TargetKind + * + * Called before the default lowering passes. + * + * \param mod The module that an optimization pass runs on. + * \param pass_ctx The pass context that can provide information for the optimization. + * + * \return The transformed module. + */ + using FTVMRelayToTIR = tvm::transform::Pass; + /* * \brief Create a function pass. * diff --cc python/tvm/ir/expr.py index bb38cf8ef3,f0f2245e7f..b8b71666ec --- a/python/tvm/ir/expr.py +++ b/python/tvm/ir/expr.py @@@ -197,41 -178,8 +197,47 @@@ class Range(Node, Scriptable) """ return _ffi_api.Range_from_min_extent(min_value, extent, span) + def __eq__(self, other): + return tvm.ir.structural_equal(self, other) + + def __ne__(self, other): + return not self.__eq__(other) ++ + +# TODO(@relax-team): remove when we have a RelaxExpr base class +def is_relax_expr(expr: RelayExpr) -> bool: + """check if a RelayExpr is a Relax expresssion. + + Parameters + ---------- + expr : RelayExpr + The expression to check. + + Returns + ------- + res : bool + If the expression is Relax expression, return True; otherwise return False. + """ + from tvm import relax # pylint: disable=import-outside-toplevel + + if isinstance( + expr, + ( + relax.Call, + relax.Constant, + relax.Tuple, + relax.TupleGetItem, + relax.If, + relax.Var, + relax.DataflowVar, + relax.ShapeExpr, + relax.SeqExpr, + relax.Function, + relax.ExternFunc, + relax.PrimValue, + relax.StringImm, + relax.DataTypeImm, + ), + ): + return True + return False diff --cc python/tvm/meta_schedule/builder/local_builder.py index 23e23dcb3a,e95459e816..ae9ad6574e --- a/python/tvm/meta_schedule/builder/local_builder.py +++ b/python/tvm/meta_schedule/builder/local_builder.py @@@ -278,17 -278,5 +278,17 @@@ def default_export(mod: Module) -> str from tvm.contrib.tar import tar # pylint: disable=import-outside-toplevel artifact_path = os.path.join(tempfile.mkdtemp(), "tvm_tmp_mod." + tar.output_format) - mod.export_library(artifact_path, tar) + mod.export_library(artifact_path, fcompile=tar) return artifact_path + + +@register_func("meta_schedule.builder.get_local_builder") +def get_local_builder() -> LocalBuilder: + """Get the local builder. + + Returns + ------- + builder : LocalBuilder + The local builder. + """ + return LocalBuilder() diff --cc src/node/script_printer.cc index e09ce266b3,28e72be789..f2d985279f --- a/src/node/script_printer.cc +++ b/src/node/script_printer.cc @@@ -99,15 -87,11 +99,18 @@@ PrinterConfig::PrinterConfig(Map<String } if (auto v = config_dict.Get("syntax_sugar")) { n->syntax_sugar = Downcast<IntImm>(v)->value; + } + if (auto v = config_dict.Get("show_object_address")) { + n->show_object_address = Downcast<IntImm>(v)->value; } + // Checking prefixes if they are valid Python identifiers. + CHECK(IsIdentifier(n->ir_prefix)) << "Invalid `ir_prefix`: " << n->ir_prefix; + CHECK(IsIdentifier(n->tir_prefix)) << "Invalid `tir_prefix`: " << n->tir_prefix; + CHECK(IsIdentifier(n->relax_prefix)) << "Invalid `relax_prefix`: " << n->relax_prefix; + CHECK(n->module_alias.empty() || IsIdentifier(n->module_alias)) + << "Invalid `module_alias`: " << n->module_alias; + this->data_ = std::move(n); } diff --cc src/target/codegen.cc index 55af8889e1,d1f2d4a479..9a847ee398 --- a/src/target/codegen.cc +++ b/src/target/codegen.cc @@@ -37,12 -37,25 +37,24 @@@ #include <unordered_set> #include <vector> - namespace tvm { - - using FTVMTIRToRuntime = runtime::TypedPackedFunc<runtime::Module(IRModule, Target)>; + #include "../runtime/library_module.h" + #include "../support/base64.h" + namespace tvm { - namespace codegen { + /*! + * \brief TIRToRuntime conversion specific to a TargetKind + * + * This function is responsible for scanning an IRModule for appropriate Target-specific functions + and generating a Runtime module representing the compiled output + * + * \param ir_module Unified IRModule + * \param target Target to filter on or retrieve arguments from + * \return Runtime Module containing compiled functions + */ + using FTVMTIRToRuntime = tvm::runtime::TypedPackedFunc<runtime::Module(IRModule, Target)>; + runtime::Module Build(IRModule mod, Target target) { if (transform::PassContext::Current() ->GetConfig<Bool>("tir.disable_assert", Bool(false)) diff --cc web/tests/python/prepare_test_libs.py index a63e0655b4,0c34a28828..dca7d89239 --- a/web/tests/python/prepare_test_libs.py +++ b/web/tests/python/prepare_test_libs.py @@@ -55,7 -35,7 +55,7 @@@ def prepare_tir_lib(base_path) fadd = tvm.build(s, [A, B], target, runtime=runtime, name="add_one") wasm_path = os.path.join(base_path, "test_addone.wasm") - fadd.export_library(wasm_path, tvmjs.create_tvmjs_wasm) - fadd.export_library(wasm_path, fcompile=emcc.create_tvmjs_wasm) ++ fadd.export_library(wasm_path, fcompile=tvmjs.create_tvmjs_wasm) if __name__ == "__main__":
