This is an automated email from the ASF dual-hosted git repository. lmzheng pushed a commit to branch custom_tile_size in repository https://gitbox.apache.org/repos/asf/tvm.git
commit b1fcd45d2f9c04d2ff544c390a4f0bb7dd43c692 Author: Lianmin Zheng <[email protected]> AuthorDate: Fri Feb 5 22:01:11 2021 +0000 update --- tutorials/autotvm/tune_conv2d_cuda.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index a00fe5f..aa7449e 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -100,9 +100,9 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): rc, ry, rx = s[conv].op.reduce_axis cfg = autotvm.get_config() - cfg.define_split("tile_f", f, num_outputs=4) # filter / output channel - cfg.define_split("tile_y", y, num_outputs=4) # height - cfg.define_split("tile_x", x, num_outputs=4) # width + cfg.define_split("tile_f", f, num_outputs=4) # filter / output channel -> blockIdx.z, vthread, threadIdx.z, thread_inner + cfg.define_split("tile_y", y, num_outputs=4) # height -> blockIdx.y, vthread, threadIdx.y, thread_inner + cfg.define_split("tile_x", x, num_outputs=4) # width -> blockIdx.x, vthread, threadIdx.x, thread_inner cfg.define_split("tile_rc", rc, num_outputs=3) # input channel cfg.define_split("tile_ry", ry, num_outputs=3) # kernel width cfg.define_split("tile_rx", rx, num_outputs=3) # kernel height @@ -110,6 +110,13 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): cfg.define_knob("unroll_explicit", [0]) # disable auto unroll ##### space definition end ##### + # Constraints (read these from deviceQuery) + # blockIdx.z < 2^16, blockIdx.y < 2^16, blockIdx.x < 2^31 (Max dimension size of a grid size) + # threadIdx.z <= 1024, threadIdx.y <= 1024, threadIdx.x <= 1024 (Max dimension size of a thread block) + # threadIdx.z * threadIdx.y * threadIdx.x <= 1024 (Maximum number of threads per block) + # + # input buffer + weight buffer in each block < 49152 bytes (Total amount of shared memory per block) + # inline padding pad_data = s[conv].op.input_tensors[0] 
s[pad_data].compute_inline() @@ -221,6 +228,7 @@ best_config = dispatch_context.query(task.target, task.workload) # Plug your own tile sizes #best_config._entity_map['tile_f'] = SplitEntity([-1, 2, 8, 8]) +# [-1, 2, 8, 8] will be mapped to [blockId, vthread, threadId, local_id] print("\nBest config:") print(best_config)
