kirliavc opened a new issue, #11572: URL: https://github.com/apache/tvm/issues/11572
I'm running the VTA GEMM tutorial code from https://github.com/apache/tvm/blob/main/vta/tutorials/matrix_multiply.py. After I change the parameters and the schedule of the original code, it fails to build. The original code in matrix_multiply.py uses `m=16, n=16, o=1`; I changed this to `m=4, n=4, o=4`. I also changed

```
s[A_buf].compute_at(s[C_buf], ko)
s[B_buf].compute_at(s[C_buf], ko)
```

into

```
s[A_buf].compute_at(s[C_buf], s[C_buf].op.axis[1])
s[B_buf].compute_at(s[C_buf], s[C_buf].op.axis[1])
```

Since the reproduction script unpacks `bo, co, bi, ci = s[C_buf].op.axis`, `s[C_buf].op.axis[1]` is the `co` axis, so after this schedule change the loads of the A and B buffers should be issued inside the lower-level loop nest. When I attempt to build, the lowering step finishes successfully and produces the following IR:

```
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(int8), int8, [256], []),
             B: Buffer(B_2: Pointer(int8), int8, [4096], []),
             C: Buffer(C_2: Pointer(int8), int8, [256], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  @tir.call_extern("VTASetDebugMode", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), 1, dtype=int32)
  attr [IterVar(vta: int32, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
    attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushGEMMOp" {
      @tir.call_extern("VTAUopLoopBegin", 4, 4, 0, 0, dtype=int32)
      @tir.call_extern("VTAUopLoopBegin", 4, 1, 0, 0, dtype=int32)
      @tir.vta.uop_push(0, 1, 0, 0, 0, 0, 0, 0, dtype=int32)
      @tir.call_extern("VTAUopLoopEnd", dtype=int32)
      @tir.call_extern("VTAUopLoopEnd", dtype=int32)
    }
    @tir.vta.coproc_dep_push(2, 1, dtype=int32)
  }
  for (ko: int32, 0, 4) {
    for (bo: int32, 0, 4) {
      for (co: int32, 0, 4) {
        let cse_var_1: int32 = (bo*4) {
          attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 1 {
            @tir.vta.coproc_dep_pop(2, 1, dtype=int32)
            @tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), A_2, (cse_var_1 + ko), 1, 1, 1, 0, 0, 0, 0, 0, 2, dtype=int32)
            @tir.call_extern("VTALoadBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), B_2, ((co*4) + ko), 1, 1, 1, 0, 0, 0, 0, 0, 1, dtype=int32)
            @tir.vta.coproc_dep_push(1, 2, dtype=int32)
          }
          attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 2 {
            @tir.vta.coproc_dep_pop(1, 2, dtype=int32)
            attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_uop_scope" = "VTAPushGEMMOp";
            @tir.vta.uop_push(0, 0, (cse_var_1 + co), 0, 0, 0, 0, 0, dtype=int32)
            @tir.vta.coproc_dep_push(2, 1, dtype=int32)
          }
        }
      }
    }
  }
  @tir.vta.coproc_dep_push(2, 3, dtype=int32)
  @tir.vta.coproc_dep_pop(2, 1, dtype=int32)
  attr [IterVar(vta, (nullptr), "ThreadIndex", "vta")] "coproc_scope" = 3 {
    @tir.vta.coproc_dep_pop(2, 3, dtype=int32)
    @tir.call_extern("VTAStoreBuffer2D", @tir.tvm_thread_context(@tir.vta.command_handle(, dtype=handle), dtype=handle), 0, 4, C_2, 0, 16, 1, 16, dtype=int32)
  }
  @tir.vta.coproc_sync(, dtype=int32)
}
```

### Expected behavior

It should build and run successfully, so that I can get the schedule and simulate it with the Chisel simulator.

### Actual behavior

It fails, but at run time rather than at build time: `vta.build` completes, and the error is raised when the compiled function is called.
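To show exactly where it fails, here is the tail of the reproduction script from the "Steps to reproduce" section, with my annotations added as comments (nothing else here is new):

```
# vta.build completes without error:
my_gemm = vta.build(
    s, [A, B, C], tvm.target.Target("ext_dev", host=env.target_host), name="my_gemm"
)
temp = utils.tempdir()
my_gemm.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")

# The check only fires here: per the traceback, executing the module reaches
# VTAPushGEMMOp -> vta::CommandQueue::PushGEMMOp -> vta::UopKernelMap::Get.
f(A_nd, B_nd, C_nd)  # TVMError: Check failed: (nbytes == 0 || nbytes == sizeof(int))
```

The full output message is: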
```
Traceback (most recent call last):
  File "/home/GROUPS/jlc/tvm/vta/tutorials/matmul_v2.py", line 459, in <module>
    f(A_nd, B_nd, C_nd)
  File "/home/GROUPS/jlc/tvm/python/tvm/runtime/module.py", line 178, in __call__
    return self.entry_func(*args)
  File "/home/GROUPS/jlc/tvm/python/tvm/_ffi/_ctypes/packed_func.py", line 237, in __call__
    raise get_last_ffi_error()
tvm._ffi.base.TVMError: Traceback (most recent call last):
  8: TVMFuncCall
  7: tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
  6: tvm::runtime::LocalSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)
  5: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::WrapPackedFunc(int (*)(TVMValue*, int*, int, TVMValue*, int*, void*), tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)::{lambda(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  4: my_gemm
  3: my_gemm_compute_
  2: VTAPushGEMMOp
  1: vta::CommandQueue::PushGEMMOp(void**, int (*)(void*), void*, int)
  0: vta::UopKernelMap::Get(void*, int) [clone .part.0]
  File "/home/GROUPS/jlc/tvm/vta/runtime/runtime.cc", line 565
TVMError: Check failed: (nbytes == 0 || nbytes == sizeof(int)) is false:
```

### Steps to reproduce

Run this Python code to reproduce:

```
from __future__ import absolute_import, print_function

import os
import tvm
from tvm import te
import vta
import numpy as np
from tvm import rpc
from tvm.contrib import utils
from vta.testing import simulator

env = vta.get_env()

host = os.environ.get("VTA_RPC_HOST", "192.168.2.99")
port = int(os.environ.get("VTA_RPC_PORT", "9091"))

if env.TARGET == "pynq" or env.TARGET == "de10nano":
    assert tvm.runtime.enabled("rpc")
    remote = rpc.connect(host, port)
    vta.reconfig_runtime(remote)
    vta.program_fpga(remote, bitstream=None)
elif env.TARGET in ["sim", "tsim"]:
    remote = rpc.LocalSession()

m = 4
n = 4
o = 4

A = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="A", dtype=env.inp_dtype)
print(A)
B = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="B", dtype=env.wgt_dtype)
print(B)

A_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: A(*i), "A_buf")
B_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: B(*i), "B_buf")

ko = te.reduce_axis((0, n), name="ko")
ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")

C_buf = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda bo, co, bi, ci: te.sum(
        A_buf[bo, ko, bi, ki].astype(env.acc_dtype)
        * B_buf[co, ko, ci, ki].astype(env.acc_dtype),
        axis=[ko, ki],
    ),
    name="C_buf",
)

C = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: C_buf(*i).astype(env.inp_dtype), name="C"
)

s = te.create_schedule(C.op)

s[A_buf].set_scope(env.inp_scope)
s[B_buf].set_scope(env.wgt_scope)
s[C_buf].set_scope(env.acc_scope)

bo, co, bi, ci = s[C_buf].op.axis
s[A_buf].compute_at(s[C_buf], co)
s[B_buf].compute_at(s[C_buf], co)

s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy)
s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy)
s[C].pragma(s[C].op.axis[0], env.dma_copy)

s[C_buf].reorder(ko, bo, co, bi, ci, ki)
print(s[C_buf].op.axis)
s[C_buf].tensorize(bi, env.gemm)

with vta.build_config(debug_flag=(1 << 1)):
    print(vta.lower(s, [A, B, C], simple_mode=True))
    my_gemm = vta.build(
        s, [A, B, C], tvm.target.Target("ext_dev", host=env.target_host), name="my_gemm"
    )

temp = utils.tempdir()
my_gemm.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")
remote.load_module("gemm.o") ctx = remote.ext_dev(0) A_orig = np.random.randint(-128, 128, size=(o * env.BATCH, n * env.BLOCK_IN)).astype(A.dtype) B_orig = np.random.randint(-128, 128, size=(m * env.BLOCK_OUT, n * env.BLOCK_IN)).astype(B.dtype) A_packed = A_orig.reshape(o, env.BATCH, n, env.BLOCK_IN).transpose((0, 2, 1, 3)) B_packed = B_orig.reshape(m, env.BLOCK_OUT, n, env.BLOCK_IN).transpose((0, 2, 1, 3)) A_nd = tvm.nd.array(A_packed, ctx) B_nd = tvm.nd.array(B_packed, ctx) C_nd = tvm.nd.array(np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(C.dtype), ctx) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() f(A_nd, B_nd, C_nd) C_ref = np.dot(A_orig.astype(env.acc_dtype), B_orig.T.astype(env.acc_dtype)).astype(C.dtype) C_ref = C_ref.reshape(o, env.BATCH, m, env.BLOCK_OUT).transpose((0, 2, 1, 3)) np.testing.assert_equal(C_ref, C_nd.numpy()) if env.TARGET in ["sim", "tsim"]: sim_stats = simulator.stats() print("Execution statistics:") for k, v in sim_stats.items(): print("\t{:<16}: {:>16}".format(k, v)) print("Successful matrix multiply test!") ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
