adstraw commented on a change in pull request #9390: URL: https://github.com/apache/tvm/pull/9390#discussion_r743819086
########## File path: tests/python/contrib/test_hexagon/test_conv2d_conv2d.py ########## @@ -0,0 +1,340 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys + +import tvm +from tvm import te +from tvm import topi +from tvm.topi import testing + +from .infrastructure import ( + ceildiv, + build_and_run, + get_block_shape, + get_conv2d_nhwc_shape, + get_filter_block_shape, + get_packed_filter_layout, + get_packed_activation_layout, + verify_conv2d, +) + +import numpy as np +import pytest + + +def conv2dconv2d( + shape_input, + pad1, + stride1, + dilation1, + shape_filter1, + pad2, + stride2, + dilation2, + shape_filter2, + k_split_factor, + h_split_factor, + dtype, + storage_scope="global", +): + """ + Conv2d -> Conv2d wherein the input activation is defined by its + logical NHWC layout. The filter is provided in its physical + packed layout (oihw8i32o4i). The input is padded and then packed + into its physical packed layout (nhwc8h8w32c). The resulting + computation is in the same physical packed layout (nhwc8h8w32c). 
+ """ + + # nhwc layout + X = te.placeholder(shape_input, dtype=dtype) + + # oihw8i32o4i layout + filt_packed1 = te.placeholder(shape_filter1, dtype=dtype) + filt_packed2 = te.placeholder(shape_filter2, dtype=dtype) + + # calculate kernel size and output channels + # given oihw8i32o4i filter layout + kernel_size1 = tuple(shape_filter1[2:4]) + out_channels1 = shape_filter1[0] * shape_filter1[5] + + # get the the logical output shape of conv2d #1 + logical_output_shape1 = get_conv2d_nhwc_shape( + shape_input, + kernel_size1, + stride1, + pad1, + dilation1, + out_channels1, + ) + + block_shape = get_block_shape() + block_H, block_W, block_C = block_shape + + # Calculate padded input + N, H, W, C = shape_input + pad_h = (block_H - ((H + pad1[1]) % block_H)) % block_H + pad_w = (block_W - ((W + pad1[3]) % block_W)) % block_W + X_pad = topi.nn.pad( + X, [0, pad1[0], pad1[2], 0], [0, pad_h, pad_w, 0], pad_value=0, name="padded_input" + ) + + # Calculate packed input + packed_shape = get_packed_activation_layout(X_pad.shape, block_shape) + X_packed = te.compute( + packed_shape, + lambda n, ho, wo, co, hi, wi, ci: X_pad[ + n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci + ], + name="packed_input", + ) + + filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape() + filter_Ci = filter_Cio * filter_Cii + + rh = te.reduce_axis((0, kernel_size1[0]), name="rh") + rw = te.reduce_axis((0, kernel_size1[1]), name="rw") + rc = te.reduce_axis((0, C), name="rc") + + def compute(n, ho, wo, ko, hi, wi, ki): + h = ho * block_H + hi + h_contig = h * stride1[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + w = wo * block_W + wi + w_contig = w * stride1[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + c_block_id = rc // block_C + c_block_offset = rc % block_C + + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii + + return te.sum( + X_packed[ + n, + h_block_id, + w_block_id, + c_block_id, + h_block_offset, + w_block_offset, + c_block_offset, + ] + * filt_packed1[ko, rco, rh, rw, rcio, ki, rcii], + axis=[rh, rw, rc], + ) + + output_shape1 = get_packed_activation_layout(logical_output_shape1, block_shape) + temp_Y = te.compute(output_shape1, compute, name="temp_output") + + # calculate kernel size and output channels + # given oihw8i32o4i filter layout + kernel_size2 = tuple(shape_filter2[2:4]) + out_channels2 = shape_filter2[0] * shape_filter2[5] + + # get the the logical output shape of conv2d #2 + logical_input_shape2 = logical_output_shape1 + logical_output_shape2 = get_conv2d_nhwc_shape( + logical_input_shape2, + kernel_size2, + stride2, + pad2, + dilation2, + out_channels2, + ) + + rh = te.reduce_axis((0, kernel_size2[0]), name="rh") + rw = te.reduce_axis((0, kernel_size2[1]), name="rw") + rc = te.reduce_axis((0, logical_input_shape2[3]), name="rc") + + def compute2(n, ho, wo, ko, hi, wi, ki): + h = ho * block_H + hi + h_contig = h * stride2[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + w = wo * block_W + wi + w_contig = w * stride2[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + c_block_id = rc // block_C + c_block_offset = rc % block_C + + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii Review comment: Yes. I would like to do this in a refactor which is already work in progress. 
########## File path: tests/python/contrib/test_hexagon/test_conv2d_conv2d.md ########## @@ -0,0 +1,860 @@ +<!--- Licensed to the Apache Software Foundation (ASF) under one --> +<!--- or more contributor license agreements. See the NOTICE file --> +<!--- distributed with this work for additional information --> +<!--- regarding copyright ownership. The ASF licenses this file --> +<!--- to you under the Apache License, Version 2.0 (the --> +<!--- "License"); you may not use this file except in compliance --> +<!--- with the License. You may obtain a copy of the License at --> + +<!--- http://www.apache.org/licenses/LICENSE-2.0 --> + +<!--- Unless required by applicable law or agreed to in writing, --> +<!--- software distributed under the License is distributed on an --> +<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> +<!--- KIND, either express or implied. See the License for the --> +<!--- specific language governing permissions and limitations --> +<!--- under the License. --> + +Hexagon conv2d -> conv2d schedules + +# Baseline conv2d -> conv2d + +This is a baseline 1x1 conv2d -> 1x1 conv2d schedule for Hexagon. + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-1-1-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 1 | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 1 | +| Conv2d #2 Output Channel | 128 | +| k_split | 1 | +| h_split | 1 | + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for physical tensors. + +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 4) { + for (ho.outer: int32, 0, 8) { + // caches computed here + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { +``` + +Note that conv2d #1 has an independent loop, `ko.outer_1` (shown below), over its output-channel blocks; `ko.outer` above iterates over the conv2d #2 output-channel blocks. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2.
+ +``` + for (ko.outer_1: int32, 0, 2) { +``` + +## Cache Usage + +*Input Cache* + +We compute over the WC8h8w32c portion of the input so we need 8 * 4 * 8 * 8 * 32 = 64kb for the input cache. + +``` + allocate(packed_input.global: Pointer(global float32), float32, [65536]), storage_scope = global; +``` + +*Filter Cache* + +We compute over the IHW8i32o4i portion of each filter so we need 4 * 1 * 1 * 8 * 32 * 4 = 4kb filter cache. + +``` + allocate(packed_filter.global: Pointer(global float32), float32, [4096]), storage_scope = global; +``` + +Note that there is just one cache which is reused for conv2d / filter #1 and conv2d / filter #2. + +*Output Cache* + +We compute over the WK8h832k portion of the output where `k` denotes the output channel. The output cache is computed for each `ko.outer` which means it should be W * 8h * 8w * 32k = 8 * 8 * 8 * 32 = 16kb. And, in fact, this is the case for a single conv2d case. But, as already noted, for this conv2d -> conv2d case "the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2". This means that the output cache must grow accordingly to K * W * 8h * 8w * 32k = 4 * 8 * 8 * 8 * 32 = 64kb. There is a temporary allocation to store the results of conv2d #1: + +``` + allocate(temp_output: Pointer(global float32), float32, [65536]), storage_scope = global; +``` + +Note that the input cache is reused to store the results of conv2d #2. + +## Assumptions + +* n/a + +## To Do + +* n/a + +## Annotated TIR + +``` +primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, output_1: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True} + buffers = {output: Buffer(output_2: Pointer(float32), float32, [1, 8, 8, 4, 8, 8, 32], []), // nhw8h8w32c + placeholder_2: Buffer(placeholder_6: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder_1: Buffer(placeholder_7: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder: Buffer(placeholder_8: Pointer(float32), float32, [1, 64, 64, 128], [])} // nhwc + buffer_map = {placeholder_3: placeholder, placeholder_4: placeholder_1, placeholder_5: placeholder_2, output_1: output} { + allocate(packed_input.global: Pointer(global float32), float32, [65536]), storage_scope = global; + allocate(temp_output: Pointer(global float32), float32, [65536]), storage_scope = global; + allocate(packed_filter.global: Pointer(global float32), float32, [4096]), storage_scope = global; + for (ko.outer: int32, 0, 4) { + for (ho.outer: int32, 0, 8) { + + // input cache read + for (wo: int32, 0, 8) { + for (co: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + packed_input.global[(((((wo*8192) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)placeholder_8[((((((ho.outer*65536) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)] + } + } + } + } + } + + // NOTE: compute over all output channels of conv2d #1 before computing conv2d #2 + for (ko.outer_1: int32, 0, 4) { + + // filter #1 cache read + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_7[(((((ko.outer_1*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + + // conv2d #1 + for (wo: int32, 0, 8) { + 
+ // init temp output to zero + for (hi.init: int32, 0, 8) { + for (wi.init: int32, 0, 8) { + for (ki.init: int32, 0, 32) { + temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi.init*256)) + (wi.init*32)) + ki.init)] = 0f32 + } + } + } + + // compute + for (rc.outer: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] = + ( + (float32*)temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] + + ( + (float32*)packed_input.global[(((((wo*8192) + (rc.outer*2048)) + (hi*256)) + (wi*32)) + rc.inner)] * + (float32*)packed_filter.global[((((rc.outer*1024) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } + } + } + + // filter #2 cache read + // NOTE: reusing same filter cache + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_6[(((((ko.outer*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + + // conv2d #2 + for (wo.c: int32, 0, 8) { + + // init output cache to zero + // NOTE: reusing the input cache as the output cache + for (hi.c.init: int32, 0, 8) { + for (wi.c.init: int32, 0, 8) { + for (ki.c.init: int32, 0, 32) { + packed_input.global[((((wo.c*2048) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32 + } + } + } + + // compute + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { + packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] = + ( + (float32*)packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] + + ( + (float32*)temp_output[(((((wo.c*8192) + (rc.outer_1*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner_1)] * + (float32*)packed_filter.global[((((rc.outer_1*1024) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))] + ) + ) + } + } + } + } + } + } + + // write back output cache + for (wo_1: int32, 0, 8) { + for (hi_1: int32, 0, 8) { + for (wi_1: int32, 0, 8) { + for (ki_1: int32, 0, 32) { + output_2[((((((ho.outer*65536) + (wo_1*8192)) + (ko.outer*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] = + (float32*)packed_input.global[((((wo_1*2048) + (hi_1*256)) + (wi_1*32)) + ki_1)] + } + } + } + } + } + } +} +``` + +# Split on Channel Out and Height + +Uses parameters `k_split` and `h_split` which creates a loop split on the outer channel out `ko` and height `ho` loops creating `outer` and `inner` loops for each split. The cache reads and writes are computed at `ho.outer` which means that cache allocation grow in proportion to `k_split` and `h_split` factors. 
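The schedule implementation itself is not quoted in this hunk, but the split described here is the standard TE split/reorder pattern. A rough, hypothetical sketch (stage and axis names are assumptions, not taken from the PR):

```python
import tvm
from tvm import te

k_split, h_split = 2, 2

# Stand-in for the packed nhwc8h8w32c output stage with axes (n, ho, wo, ko, hi, wi, ki)
X = te.placeholder((1, 8, 8, 4, 8, 8, 32), name="X")
Y = te.compute(X.shape, lambda *i: X(*i) + 1.0, name="Y")
s = te.create_schedule(Y.op)

n, ho, wo, ko, hi, wi, ki = s[Y].op.axis
koo, koi = s[Y].split(ko, factor=k_split)  # ko.outer / ko.inner
hoo, hoi = s[Y].split(ho, factor=h_split)  # ho.outer / ho.inner
s[Y].reorder(n, koo, hoo, koi, hoi, wo, hi, wi, ki)
# In the real schedule, the cache_read / cache_write stages are attached
# (compute_at) at ho.outer, which is why their footprint scales with
# k_split and h_split.
print(tvm.lower(s, [X, Y], simple_mode=True))
```

With `factor=k_split` / `factor=h_split`, the inner extents are 2, which would match the `ko.inner` / `ho.inner` extents of 2 in the annotated TIR below.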
+ +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-2-2-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 1 | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 1 | +| Conv2d #2 Output Channel | 128 | +| k_split | 2 ^ | +| h_split | 2 ^ | + +^ Changes from above + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for phsyical tensors. + +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 2) { + for (ho.outer: int32, 0, 4) { + // caches computed here + for (ko.c.inner: int32, 0, 2) { + for (ho.c.inner: int32, 0, 2) { + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { +``` + +The major change here versus above is the presence of `inner` loops for both channel out `ko` and height `ho` dimensions created from the `k_split` and `h_split` schedule parameters respectively: + + +``` + for (ko.c.inner: int32, 0, 2) { + for (ho.c.inner: int32, 0, 2) { +``` + +The major effect of this change is increased cache usage given that caches are computed at the `ho.outer` level of the loop schedule. This is documented in the next section. + +(Same as above) Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2. + +``` + for (ko.outer_1: int32, 0, 2) { +``` + +## Cache Usage + +*Input Cache* + +The input cache grows by a factor of `h_split = 2` compared with above: + +``` + allocate(packed_input.global: Pointer(global float32), float32, [131072]), storage_scope = global; +``` + +*Filter Cache* + +The filter cache grows by a factor of `k_split = 2` compared with above: + +``` + allocate(packed_filter.global: Pointer(global float32), float32, [8192]), storage_scope = global; +``` + +(Same as above) Note that there is just one cache which is reused for conv2d / filter #1 and conv2d / filter #2. 
+ +*Output Cache* + +The output cache grows by a factor of `k_split = 2` compared with above: + +``` + allocate(temp_output: Pointer(global float32), float32, [131072]), storage_scope = global; +``` + +(Same as above) Note that the input cache is reused to store the results of conv2d #2. + +## Assumptions + +* n/a + +## To Do + +* n/a + +## Annotated TIR + +``` +primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, output_1: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True} + buffers = {output: Buffer(output_2: Pointer(float32), float32, [1, 8, 8, 4, 8, 8, 32], []), // nhw8h8w32c + placeholder_2: Buffer(placeholder_6: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder_1: Buffer(placeholder_7: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder: Buffer(placeholder_8: Pointer(float32), float32, [1, 64, 64, 128], [])} // nhwc + buffer_map = {placeholder_3: placeholder, placeholder_4: placeholder_1, placeholder_5: placeholder_2, output_1: output} { + allocate(packed_input.global: Pointer(global float32), float32, [131072]), storage_scope = global; + allocate(temp_output: Pointer(global float32), float32, [131072]), storage_scope = global; + allocate(packed_filter.global: Pointer(global float32), float32, [8192]), storage_scope = global; + for (ko.outer: int32, 0, 2) { + for (ho.outer: int32, 0, 4) { + + // input cache read + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + for (co: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)placeholder_8[(((((((ho.outer*131072) + (ho.inner*65536)) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)] + } + } + } + } + } + } + + // NOTE: compute over all output channels of conv2d #1 before computing conv2d #2 + for (ko.outer_1: int32, 0, 2) { + for (ko.inner: int32, 0, 2) { + // filter #1 cache read + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_7[((((((ko.outer_1*8192) + (ko.inner*4096)) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + } + + // conv2d #1 + for (ko.inner: int32, 0, 2) { + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + + // init temp output to zero + for (hi.init: int32, 0, 8) { + for (wi.init: int32, 0, 8) { + for (ki.init: int32, 0, 32) { + temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi.init*256)) + (wi.init*32)) + ki.init)] = 0f32 + } + } + } + + // compute + for (rc.outer: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] = + ( + (float32*)temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] + + ( + (float32*)packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (rc.outer*2048)) + (hi*256)) + (wi*32)) + rc.inner)] * + (float32*)packed_filter.global[(((((ko.inner*4096) + (rc.outer*1024)) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } + } + } + } + } + + // filter #2 cache read + // 
NOTE: reusing same filter cache + for (ko.inner: int32, 0, 2) { + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_6[((((((ko.outer*8192) + (ko.inner*4096)) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + } + + // conv2d #2 + for (ko.c.inner: int32, 0, 2) { + for (ho.c.inner: int32, 0, 2) { + for (wo.c: int32, 0, 8) { + + // init output cache to zero + // NOTE: reusing the input cache as the output cache + for (hi.c.init: int32, 0, 8) { + for (wi.c.init: int32, 0, 8) { + for (ki.c.init: int32, 0, 32) { + packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32 + } + } + } + + // compute + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { + packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = + ( + (float32*)packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + + ( + (float32*)temp_output[((((((ho.c.inner*65536) + (wo.c*8192)) + (rc.outer_1*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner_1)] * + (float32*)packed_filter.global[(((((ko.c.inner*4096) + (rc.outer_1*1024)) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))] + ) + ) + } + } + } + } + } + } + } + } + + // write back output cache + for (ko.inner_1: int32, 0, 2) { + for (ho.inner_1: int32, 0, 2) { + for (wo_1: int32, 0, 8) { + for (hi_1: int32, 0, 8) { + for (wi_1: int32, 0, 8) { + for (ki_1: int32, 0, 32) { + output_2[((((((((ho.outer*131072) + (ho.inner_1*65536)) + (wo_1*8192)) + (ko.outer*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] = + (float32*)packed_input.global[((((((ho.inner_1*32768) + (wo_1*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] + } + } + } + } + } + } + } + } +} +``` + +# 3x3 conv2d -> conv2d (no padding) + +Change from a 1x1 filter to a 3x3 filter. + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-3-128-1-3-128-2-2-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 3 ^ | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 3 ^ | +| Conv2d #2 Output Channel | 128 | +| k_split | 2 | +| h_split | 2 | + +^ Changes from above + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for phsyical tensors. 
+ +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 3, 3, 8, 32, 4] | OIHW | [128, 128, 3, 3] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 62, 62, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 3, 3, 8, 32, 4] | OIHW | [128, 128, 3, 3] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 60, 60, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 2) { + for (ho.outer: int32, 0, 4) { + // caches computed here + for (ko.c.inner: int32, 0, 2) { + for (ho.c.inner: int32, 0, 2) { + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (rh_1: int32, 0, 3) { + for (rw_1: int32, 0, 3) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { Review comment: This was meant to give an idea of the general compute schedule --- the order of the loops and where the caches are computed. But, I agree it's confusing given the level of detail elsewhere in the README. E.g. the difference between `ho.outer` and `ho.outer_1` as you mentioned in your comment. I have added a more detailed schedule overview for each test case which I believe will address your concern. ########## File path: tests/python/contrib/test_hexagon/test_conv2d_conv2d.md ########## @@ -0,0 +1,860 @@ +<!--- Licensed to the Apache Software Foundation (ASF) under one --> +<!--- or more contributor license agreements. See the NOTICE file --> +<!--- distributed with this work for additional information --> +<!--- regarding copyright ownership. The ASF licenses this file --> +<!--- to you under the Apache License, Version 2.0 (the --> +<!--- "License"); you may not use this file except in compliance --> +<!--- with the License. You may obtain a copy of the License at --> + +<!--- http://www.apache.org/licenses/LICENSE-2.0 --> + +<!--- Unless required by applicable law or agreed to in writing, --> +<!--- software distributed under the License is distributed on an --> +<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> +<!--- KIND, either express or implied. See the License for the --> +<!--- specific language governing permissions and limitations --> +<!--- under the License. --> + +Hexagon conv2d -> conv2d schedules + +# Baseline conv2d -> conv2d + +This is a baseline 1x1 conv2d -> 1x1 conv2d schedule for Hexagon. 
+ +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-1-1-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 1 | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 1 | +| Conv2d #2 Output Channel | 128 | +| k_split | 1 | +| h_split | 1 | + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for phsyical tensors. + +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 4) { + for (ho.outer: int32, 0, 8) { + // caches computed here + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { +``` + +Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2. + +``` + for (ko.outer_1: int32, 0, 2) { +``` + +## Cache Usage + +*Input Cache* + +We compute over the WC8h8w32c portion of the input so we need 8 * 4 * 8 * 8 * 32 = 64kb for the input cache. + +``` + allocate(packed_input.global: Pointer(global float32), float32, [65536]), storage_scope = global; +``` + +*Filter Cache* + +We compute over the IHW8i32o4i portion of each filter so we need 4 * 1 * 1 * 8 * 32 * 4 = 4kb filter cache. + +``` + allocate(packed_filter.global: Pointer(global float32), float32, [4096]), storage_scope = global; +``` + +Note that there is just one cache which is reused for conv2d / filter #1 and conv2d / filter #2. + +*Output Cache* + +We compute over the WK8h832k portion of the output where `k` denotes the output channel. The output cache is computed for each `ko.outer` which means it should be W * 8h * 8w * 32k = 8 * 8 * 8 * 32 = 16kb. And, in fact, this is the case for a single conv2d case. But, as already noted, for this conv2d -> conv2d case "the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2". 
This means that the output cache must grow accordingly to K * W * 8h * 8w * 32k = 4 * 8 * 8 * 8 * 32 = 64kb. There is a temporary allocation to store the results of conv2d #1: + +``` + allocate(temp_output: Pointer(global float32), float32, [65536]), storage_scope = global; +``` + +Note that the input cache is reused to store the results of conv2d #2. + +## Assumptions + +* n/a + +## To Do + +* n/a + +## Annotated TIR + +``` +primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, output_1: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True} + buffers = {output: Buffer(output_2: Pointer(float32), float32, [1, 8, 8, 4, 8, 8, 32], []), // nhw8h8w32c + placeholder_2: Buffer(placeholder_6: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder_1: Buffer(placeholder_7: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder: Buffer(placeholder_8: Pointer(float32), float32, [1, 64, 64, 128], [])} // nhwc + buffer_map = {placeholder_3: placeholder, placeholder_4: placeholder_1, placeholder_5: placeholder_2, output_1: output} { + allocate(packed_input.global: Pointer(global float32), float32, [65536]), storage_scope = global; + allocate(temp_output: Pointer(global float32), float32, [65536]), storage_scope = global; + allocate(packed_filter.global: Pointer(global float32), float32, [4096]), storage_scope = global; + for (ko.outer: int32, 0, 4) { + for (ho.outer: int32, 0, 8) { + + // input cache read + for (wo: int32, 0, 8) { + for (co: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + packed_input.global[(((((wo*8192) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)placeholder_8[((((((ho.outer*65536) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)] + } + } + } + } + } + + // NOTE: compute over all output channels of conv2d #1 before computing conv2d #2 + for (ko.outer_1: int32, 0, 4) { + + // filter #1 cache read + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_7[(((((ko.outer_1*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + + // conv2d #1 + for (wo: int32, 0, 8) { + + // init temp output to zero + for (hi.init: int32, 0, 8) { + for (wi.init: int32, 0, 8) { + for (ki.init: int32, 0, 32) { + temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi.init*256)) + (wi.init*32)) + ki.init)] = 0f32 + } + } + } + + // compute + for (rc.outer: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] = + ( + (float32*)temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] + + ( + (float32*)packed_input.global[(((((wo*8192) + (rc.outer*2048)) + (hi*256)) + (wi*32)) + rc.inner)] * + (float32*)packed_filter.global[((((rc.outer*1024) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } + } + } + + // filter #2 cache read + // NOTE: reusing same filter cache + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_6[(((((ko.outer*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + + // conv2d 
#2 + for (wo.c: int32, 0, 8) { + + // init output cache to zero + // NOTE: reusing the input cache as the output cache + for (hi.c.init: int32, 0, 8) { + for (wi.c.init: int32, 0, 8) { + for (ki.c.init: int32, 0, 32) { + packed_input.global[((((wo.c*2048) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32 + } + } + } + + // compute + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { + packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] = + ( + (float32*)packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] + + ( + (float32*)temp_output[(((((wo.c*8192) + (rc.outer_1*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner_1)] * + (float32*)packed_filter.global[((((rc.outer_1*1024) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))] + ) + ) + } + } + } + } + } + } + + // write back output cache + for (wo_1: int32, 0, 8) { + for (hi_1: int32, 0, 8) { + for (wi_1: int32, 0, 8) { + for (ki_1: int32, 0, 32) { + output_2[((((((ho.outer*65536) + (wo_1*8192)) + (ko.outer*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] = + (float32*)packed_input.global[((((wo_1*2048) + (hi_1*256)) + (wi_1*32)) + ki_1)] + } + } + } + } + } + } +} +``` + +# Split on Channel Out and Height + +Uses parameters `k_split` and `h_split` which creates a loop split on the outer channel out `ko` and height `ho` loops creating `outer` and `inner` loops for each split. The cache reads and writes are computed at `ho.outer` which means that cache allocation grow in proportion to `k_split` and `h_split` factors. + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-2-2-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 1 | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 1 | +| Conv2d #2 Output Channel | 128 | +| k_split | 2 ^ | +| h_split | 2 ^ | + +^ Changes from above + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for phsyical tensors. 
+ +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 2) { + for (ho.outer: int32, 0, 4) { + // caches computed here + for (ko.c.inner: int32, 0, 2) { + for (ho.c.inner: int32, 0, 2) { + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { +``` + +The major change here versus above is the presence of `inner` loops for both channel out `ko` and height `ho` dimensions created from the `k_split` and `h_split` schedule parameters respectively: + + +``` + for (ko.c.inner: int32, 0, 2) { + for (ho.c.inner: int32, 0, 2) { +``` + +The major effect of this change is increased cache usage given that caches are computed at the `ho.outer` level of the loop schedule. This is documented in the next section. + +(Same as above) Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2. + +``` + for (ko.outer_1: int32, 0, 2) { +``` + +## Cache Usage + +*Input Cache* + +The input cache grows by a factor of `h_split = 2` compared with above: + +``` + allocate(packed_input.global: Pointer(global float32), float32, [131072]), storage_scope = global; +``` + +*Filter Cache* + +The filter cache grows by a factor of `k_split = 2` compared with above: + +``` + allocate(packed_filter.global: Pointer(global float32), float32, [8192]), storage_scope = global; +``` + +(Same as above) Note that there is just one cache which is reused for conv2d / filter #1 and conv2d / filter #2. + +*Output Cache* + +The output cache grows by a factor of `k_split = 2` compared with above: + +``` + allocate(temp_output: Pointer(global float32), float32, [131072]), storage_scope = global; +``` + +(Same as above) Note that the input cache is reused to store the results of conv2d #2. 
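As a quick sanity check of the allocation sizes quoted in this section (the sizes in square brackets are element counts; the "kb" figures in the prose appear to count elements as well, so the float32 byte sizes are 4x larger), the arithmetic is:

```python
# Element counts behind the three allocations above (k_split = h_split = 2).
# wo blocks = 8, input-channel (co) blocks = 4, output-channel (ko) blocks = 4.
h_split, k_split = 2, 2
WO, CO, KO = 8, 4, 4

input_cache = h_split * WO * CO * 8 * 8 * 32      # packed_input.global
filter_cache = k_split * CO * 1 * 1 * 8 * 32 * 4  # packed_filter.global (1x1 kernel)
temp_output = h_split * WO * KO * 8 * 8 * 32      # temp_output holds all 4 ko blocks

assert (input_cache, filter_cache, temp_output) == (131072, 8192, 131072)
print([x * 4 for x in (input_cache, filter_cache, temp_output)])  # float32 bytes
```

In the `temp_output` indexing of the annotated TIR below, the 2x growth over the baseline 65536 shows up as the `ho.inner` stride, since the buffer always holds all four `ko` blocks produced by conv2d #1.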
+ +## Assumptions + +* n/a + +## To Do + +* n/a + +## Annotated TIR + +``` +primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, output_1: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True} + buffers = {output: Buffer(output_2: Pointer(float32), float32, [1, 8, 8, 4, 8, 8, 32], []), // nhw8h8w32c + placeholder_2: Buffer(placeholder_6: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder_1: Buffer(placeholder_7: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder: Buffer(placeholder_8: Pointer(float32), float32, [1, 64, 64, 128], [])} // nhwc + buffer_map = {placeholder_3: placeholder, placeholder_4: placeholder_1, placeholder_5: placeholder_2, output_1: output} { + allocate(packed_input.global: Pointer(global float32), float32, [131072]), storage_scope = global; + allocate(temp_output: Pointer(global float32), float32, [131072]), storage_scope = global; + allocate(packed_filter.global: Pointer(global float32), float32, [8192]), storage_scope = global; + for (ko.outer: int32, 0, 2) { + for (ho.outer: int32, 0, 4) { + + // input cache read + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + for (co: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)placeholder_8[(((((((ho.outer*131072) + (ho.inner*65536)) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)] + } + } + } + } + } + } + + // NOTE: compute over all output channels of conv2d #1 before computing conv2d #2 + for (ko.outer_1: int32, 0, 2) { + for (ko.inner: int32, 0, 2) { + // filter #1 cache read + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_7[((((((ko.outer_1*8192) + (ko.inner*4096)) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + } + + // conv2d #1 + for (ko.inner: int32, 0, 2) { + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + + // init temp output to zero + for (hi.init: int32, 0, 8) { + for (wi.init: int32, 0, 8) { + for (ki.init: int32, 0, 32) { + temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi.init*256)) + (wi.init*32)) + ki.init)] = 0f32 + } + } + } + + // compute + for (rc.outer: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] = + ( + (float32*)temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] + + ( + (float32*)packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (rc.outer*2048)) + (hi*256)) + (wi*32)) + rc.inner)] * + (float32*)packed_filter.global[(((((ko.inner*4096) + (rc.outer*1024)) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } + } + } + } + } + + // filter #2 cache read + // NOTE: reusing same filter cache + for (ko.inner: int32, 0, 2) { + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] = + 
(float32*)placeholder_6[((((((ko.outer*8192) + (ko.inner*4096)) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + } + + // conv2d #2 + for (ko.c.inner: int32, 0, 2) { + for (ho.c.inner: int32, 0, 2) { + for (wo.c: int32, 0, 8) { + + // init output cache to zero + // NOTE: reusing the input cache as the output cache + for (hi.c.init: int32, 0, 8) { + for (wi.c.init: int32, 0, 8) { + for (ki.c.init: int32, 0, 32) { + packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32 + } + } + } + + // compute + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { + packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] = + ( + (float32*)packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] + + ( + (float32*)temp_output[((((((ho.c.inner*65536) + (wo.c*8192)) + (rc.outer_1*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner_1)] * + (float32*)packed_filter.global[(((((ko.c.inner*4096) + (rc.outer_1*1024)) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))] + ) + ) + } + } + } + } + } + } + } + } + + // write back output cache + for (ko.inner_1: int32, 0, 2) { + for (ho.inner_1: int32, 0, 2) { + for (wo_1: int32, 0, 8) { + for (hi_1: int32, 0, 8) { + for (wi_1: int32, 0, 8) { + for (ki_1: int32, 0, 32) { + output_2[((((((((ho.outer*131072) + (ho.inner_1*65536)) + (wo_1*8192)) + (ko.outer*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] = + (float32*)packed_input.global[((((((ho.inner_1*32768) + (wo_1*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] + } + } + } + } + } + } + } + } +} +``` + +# 3x3 conv2d -> conv2d (no padding) + +Change from a 1x1 filter to a 3x3 filter. + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-3-128-1-3-128-2-2-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 3 ^ | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 3 ^ | +| Conv2d #2 Output Channel | 128 | +| k_split | 2 | +| h_split | 2 | + +^ Changes from above + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for phsyical tensors. 
+ +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 3, 3, 8, 32, 4] | OIHW | [128, 128, 3, 3] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 62, 62, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 3, 3, 8, 32, 4] | OIHW | [128, 128, 3, 3] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 60, 60, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 2) { + for (ho.outer: int32, 0, 4) { + // caches computed here + for (ko.c.inner: int32, 0, 2) { + for (ho.c.inner: int32, 0, 2) { + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (rh_1: int32, 0, 3) { + for (rw_1: int32, 0, 3) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { Review comment: This was meant to give an idea of the general compute schedule --- the order of the loops and where the caches are computed. But, I agree it's confusing given the level of detail elsewhere in the README. E.g. the difference between `ho.outer` and `ho.outer_1` as you mentioned in your comment. I was glossing over the difference between these two iterators here and that lead to confusion. I have added a more detailed schedule overview for each test case which I believe will address your concern. ########## File path: tests/python/contrib/test_hexagon/test_conv2d_conv2d.md ########## @@ -0,0 +1,860 @@ +<!--- Licensed to the Apache Software Foundation (ASF) under one --> +<!--- or more contributor license agreements. See the NOTICE file --> +<!--- distributed with this work for additional information --> +<!--- regarding copyright ownership. The ASF licenses this file --> +<!--- to you under the Apache License, Version 2.0 (the --> +<!--- "License"); you may not use this file except in compliance --> +<!--- with the License. You may obtain a copy of the License at --> + +<!--- http://www.apache.org/licenses/LICENSE-2.0 --> + +<!--- Unless required by applicable law or agreed to in writing, --> +<!--- software distributed under the License is distributed on an --> +<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> +<!--- KIND, either express or implied. See the License for the --> +<!--- specific language governing permissions and limitations --> +<!--- under the License. --> + +Hexagon conv2d -> conv2d schedules + +# Baseline conv2d -> conv2d + +This is a baseline 1x1 conv2d -> 1x1 conv2d schedule for Hexagon. 
+ +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-1-1-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 1 | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 1 | +| Conv2d #2 Output Channel | 128 | +| k_split | 1 | +| h_split | 1 | + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for phsyical tensors. + +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 4) { + for (ho.outer: int32, 0, 8) { + // caches computed here + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { +``` + +Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2. Review comment: Agree. This is confusing. Yes, you are correct that `ko.outer` is the iterator over conv2d #2 output channels and `ko.outer_1` is the iterator over conv2d #1 output channels. Just like with the schedule overview above, I was glossing over this level of detail both here in the notes. I have clarified the notes for all test cases which I believe will address your concern. ########## File path: tests/python/contrib/test_hexagon/test_conv2d_conv2d.md ########## @@ -0,0 +1,860 @@ +<!--- Licensed to the Apache Software Foundation (ASF) under one --> +<!--- or more contributor license agreements. See the NOTICE file --> +<!--- distributed with this work for additional information --> +<!--- regarding copyright ownership. The ASF licenses this file --> +<!--- to you under the Apache License, Version 2.0 (the --> +<!--- "License"); you may not use this file except in compliance --> +<!--- with the License. 
You may obtain a copy of the License at --> + +<!--- http://www.apache.org/licenses/LICENSE-2.0 --> + +<!--- Unless required by applicable law or agreed to in writing, --> +<!--- software distributed under the License is distributed on an --> +<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> +<!--- KIND, either express or implied. See the License for the --> +<!--- specific language governing permissions and limitations --> +<!--- under the License. --> + +Hexagon conv2d -> conv2d schedules + +# Baseline conv2d -> conv2d + +This is a baseline 1x1 conv2d -> 1x1 conv2d schedule for Hexagon. + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-1-1-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 1 | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 1 | +| Conv2d #2 Output Channel | 128 | +| k_split | 1 | +| h_split | 1 | + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for phsyical tensors. + +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 4) { + for (ho.outer: int32, 0, 8) { + // caches computed here + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { +``` + +Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2. + +``` + for (ko.outer_1: int32, 0, 2) { +``` + +## Cache Usage + +*Input Cache* + +We compute over the WC8h8w32c portion of the input so we need 8 * 4 * 8 * 8 * 32 = 64kb for the input cache. + +``` + allocate(packed_input.global: Pointer(global float32), float32, [65536]), storage_scope = global; +``` + +*Filter Cache* + +We compute over the IHW8i32o4i portion of each filter so we need 4 * 1 * 1 * 8 * 32 * 4 = 4kb filter cache. 
+ +``` + allocate(packed_filter.global: Pointer(global float32), float32, [4096]), storage_scope = global; +``` + +Note that there is just one cache which is reused for conv2d / filter #1 and conv2d / filter #2. + +*Output Cache* + +We compute over the WK8h832k portion of the output where `k` denotes the output channel. The output cache is computed for each `ko.outer` which means it should be W * 8h * 8w * 32k = 8 * 8 * 8 * 32 = 16kb. And, in fact, this is the case for a single conv2d case. But, as already noted, for this conv2d -> conv2d case "the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2". This means that the output cache must grow accordingly to K * W * 8h * 8w * 32k = 4 * 8 * 8 * 8 * 32 = 64kb. There is a temporary allocation to store the results of conv2d #1: + +``` + allocate(temp_output: Pointer(global float32), float32, [65536]), storage_scope = global; +``` + +Note that the input cache is reused to store the results of conv2d #2. Review comment: Added the TODO in the README. Will add to the backlog as well. ########## File path: tests/python/contrib/test_hexagon/test_conv2d_conv2d.md ########## @@ -0,0 +1,860 @@ +<!--- Licensed to the Apache Software Foundation (ASF) under one --> +<!--- or more contributor license agreements. See the NOTICE file --> +<!--- distributed with this work for additional information --> +<!--- regarding copyright ownership. The ASF licenses this file --> +<!--- to you under the Apache License, Version 2.0 (the --> +<!--- "License"); you may not use this file except in compliance --> +<!--- with the License. You may obtain a copy of the License at --> + +<!--- http://www.apache.org/licenses/LICENSE-2.0 --> + +<!--- Unless required by applicable law or agreed to in writing, --> +<!--- software distributed under the License is distributed on an --> +<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> +<!--- KIND, either express or implied. See the License for the --> +<!--- specific language governing permissions and limitations --> +<!--- under the License. --> + +Hexagon conv2d -> conv2d schedules + +# Baseline conv2d -> conv2d + +This is a baseline 1x1 conv2d -> 1x1 conv2d schedule for Hexagon. + +## Command + +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-1-1-float32-llvm]" + +## Parameters + +| Parameter | Value | +| ------------------------ | ----- | +| Batch | 1 | +| Input Size | 64x64 | +| Input Channel | 128 | +| Conv2d #1 Pad | 0 | +| Conv2d #1 Stride | 1 | +| Conv2d #1 Kernel Size | 1 | +| Conv2d #1 Output Channel | 128 | +| Conv2d #2 Stride | 1 | +| Conv2d #2 Kernel Size | 1 | +| Conv2d #2 Output Channel | 128 | +| k_split | 1 | +| h_split | 1 | + +## Constants + +| Constant | Value | +| ------------------ | ----- | +| Conv2d #2 Pad | 0 | +| Conv2d #1 Dilation | 1 | +| Conv2d #2 Dilation | 1 | + +## Shapes and Layouts + +The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for phsyical tensors. 
+ +| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape | +| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- | +| Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | | +| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | +| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] | +| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] | + +## Schedule + +This is the conv2d compute schedule: + +``` + for (ko.outer: int32, 0, 4) { + for (ho.outer: int32, 0, 8) { + // caches computed here + for (wo.c: int32, 0, 8) { + for (rc.outer_1: int32, 0, 4) { + for (hi.c: int32, 0, 8) { + for (wi.c: int32, 0, 8) { + for (ki.c: int32, 0, 32) { + for (rc.inner_1: int32, 0, 32) { +``` + +Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2. Review comment: Agree. This is confusing. Yes, you are correct that `ko.outer` is the iterator over the second conv2d output channels and `ko.outer_1` is the iterator over the first conv2d output channels. Just like with the schedule overview above, I was glossing over this level of detail both here in the notes. I have clarified the notes for all test cases which I believe will address your concern. ########## File path: tests/python/contrib/test_hexagon/test_conv2d_conv2d.md ########## @@ -0,0 +1,860 @@ +<!--- Licensed to the Apache Software Foundation (ASF) under one --> +<!--- or more contributor license agreements. See the NOTICE file --> +<!--- distributed with this work for additional information --> +<!--- regarding copyright ownership. The ASF licenses this file --> +<!--- to you under the Apache License, Version 2.0 (the --> +<!--- "License"); you may not use this file except in compliance --> +<!--- with the License. You may obtain a copy of the License at --> + +<!--- http://www.apache.org/licenses/LICENSE-2.0 --> + +<!--- Unless required by applicable law or agreed to in writing, --> +<!--- software distributed under the License is distributed on an --> +<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY --> +<!--- KIND, either express or implied. See the License for the --> +<!--- specific language governing permissions and limitations --> +<!--- under the License. --> + +Hexagon conv2d -> conv2d schedules + +# Baseline conv2d -> conv2d + +This is a baseline 1x1 conv2d -> 1x1 conv2d schedule for Hexagon. 
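As a reference for the Shapes and Layouts tables in this document, here is a minimal numpy sketch of how a logical NHWC activation is packed into the physical NHWC8h8w32c layout. This is an illustration only (the test builds the packed tensor with a TE compute); the array contents below are arbitrary.

```python
import numpy as np

# Logical NHWC input and the 8h8w32c block shape used throughout this document.
x = np.random.uniform(size=(1, 64, 64, 128)).astype("float32")
n, h, w, c = x.shape
bh, bw, bc = 8, 8, 32

# NHWC8h8w32c = [n, h // 8, w // 8, c // 32, h % 8, w % 8, c % 32]
packed = x.reshape(n, h // bh, bh, w // bw, bw, c // bc, bc).transpose(0, 1, 3, 5, 2, 4, 6)
assert packed.shape == (1, 8, 8, 4, 8, 8, 32)
```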
+
+## Command
+
+pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-1-1-float32-llvm]"
+
+## Parameters
+
+| Parameter | Value |
+| ------------------------ | ----- |
+| Batch | 1 |
+| Input Size | 64x64 |
+| Input Channel | 128 |
+| Conv2d #1 Pad | 0 |
+| Conv2d #1 Stride | 1 |
+| Conv2d #1 Kernel Size | 1 |
+| Conv2d #1 Output Channel | 128 |
+| Conv2d #2 Stride | 1 |
+| Conv2d #2 Kernel Size | 1 |
+| Conv2d #2 Output Channel | 128 |
+| k_split | 1 |
+| h_split | 1 |
+
+## Constants
+
+| Constant | Value |
+| ------------------ | ----- |
+| Conv2d #2 Pad | 0 |
+| Conv2d #1 Dilation | 1 |
+| Conv2d #2 Dilation | 1 |
+
+## Shapes and Layouts
+
+The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for physical tensors.
+
+| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape |
+| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- |
+| Input | Logical | NHWC | [1, 64, 64, 128] | | |
+| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | |
+| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] |
+| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] |
+| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] |
+| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] |
+| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] |
+
+## Schedule
+
+This is the conv2d compute schedule:
+
+```
+ for (ko.outer: int32, 0, 4) {
+ for (ho.outer: int32, 0, 8) {
+ // caches computed here
+ for (wo.c: int32, 0, 8) {
+ for (rc.outer_1: int32, 0, 4) {
+ for (hi.c: int32, 0, 8) {
+ for (wi.c: int32, 0, 8) {
+ for (ki.c: int32, 0, 32) {
+ for (rc.inner_1: int32, 0, 32) {
+```
+
+Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2.
+
+```
+ for (ko.outer_1: int32, 0, 2) {
+```
+
+## Cache Usage
+
+*Input Cache*
+
+We compute over the WC8h8w32c portion of the input so we need 8 * 4 * 8 * 8 * 32 = 64kb for the input cache.
+
+```
+ allocate(packed_input.global: Pointer(global float32), float32, [65536]), storage_scope = global;
+```
+
+*Filter Cache*
+
+We compute over the IHW8i32o4i portion of each filter so we need 4 * 1 * 1 * 8 * 32 * 4 = 4kb filter cache.
+
+```
+ allocate(packed_filter.global: Pointer(global float32), float32, [4096]), storage_scope = global;
+```
+
+Note that there is just one cache which is reused for conv2d / filter #1 and conv2d / filter #2.
+
+*Output Cache*
+
+We compute over the WK8h8w32k portion of the output where `k` denotes the output channel. The output cache is computed for each `ko.outer` which means it should be W * 8h * 8w * 32k = 8 * 8 * 8 * 32 = 16kb. And, in fact, this is the case for a single conv2d. But, as already noted, for this conv2d -> conv2d case "the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2".
This means that the output cache must grow accordingly to K * W * 8h * 8w * 32k = 4 * 8 * 8 * 8 * 32 = 64kb. There is a temporary allocation to store the results of conv2d #1: + +``` + allocate(temp_output: Pointer(global float32), float32, [65536]), storage_scope = global; +``` + +Note that the input cache is reused to store the results of conv2d #2. + +## Assumptions + +* n/a + +## To Do + +* n/a + +## Annotated TIR + +``` +primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, output_1: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True} + buffers = {output: Buffer(output_2: Pointer(float32), float32, [1, 8, 8, 4, 8, 8, 32], []), // nhw8h8w32c + placeholder_2: Buffer(placeholder_6: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder_1: Buffer(placeholder_7: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder: Buffer(placeholder_8: Pointer(float32), float32, [1, 64, 64, 128], [])} // nhwc + buffer_map = {placeholder_3: placeholder, placeholder_4: placeholder_1, placeholder_5: placeholder_2, output_1: output} { + allocate(packed_input.global: Pointer(global float32), float32, [65536]), storage_scope = global; + allocate(temp_output: Pointer(global float32), float32, [65536]), storage_scope = global; + allocate(packed_filter.global: Pointer(global float32), float32, [4096]), storage_scope = global; + for (ko.outer: int32, 0, 4) { + for (ho.outer: int32, 0, 8) { + + // input cache read + for (wo: int32, 0, 8) { + for (co: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + packed_input.global[(((((wo*8192) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)placeholder_8[((((((ho.outer*65536) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)] + } + } + } + } + } + + // NOTE: compute over all output channels of conv2d #1 before computing conv2d #2 + for (ko.outer_1: int32, 0, 4) { + + // filter #1 cache read + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_7[(((((ko.outer_1*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + + // conv2d #1 + for (wo: int32, 0, 8) { + + // init temp output to zero + for (hi.init: int32, 0, 8) { + for (wi.init: int32, 0, 8) { + for (ki.init: int32, 0, 32) { + temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi.init*256)) + (wi.init*32)) + ki.init)] = 0f32 + } + } + } + + // compute + for (rc.outer: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] = + ( + (float32*)temp_output[(((((wo*8192) + (ko.outer_1*2048)) + (hi*256)) + (wi*32)) + ki)] + + ( + (float32*)packed_input.global[(((((wo*8192) + (rc.outer*2048)) + (hi*256)) + (wi*32)) + rc.inner)] * + (float32*)packed_filter.global[((((rc.outer*1024) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } + } + } + + // filter #2 cache read + // NOTE: reusing same filter cache + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[((((co*1024) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_6[(((((ko.outer*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + + // conv2d 
#2
+ for (wo.c: int32, 0, 8) {
+
+ // init output cache to zero
+ // NOTE: reusing the input cache as the output cache
+ for (hi.c.init: int32, 0, 8) {
+ for (wi.c.init: int32, 0, 8) {
+ for (ki.c.init: int32, 0, 32) {
+ packed_input.global[((((wo.c*2048) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32
+ }
+ }
+ }
+
+ // compute
+ for (rc.outer_1: int32, 0, 4) {
+ for (hi.c: int32, 0, 8) {
+ for (wi.c: int32, 0, 8) {
+ for (ki.c: int32, 0, 32) {
+ for (rc.inner_1: int32, 0, 32) {
+ packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] =
+ (
+ (float32*)packed_input.global[((((wo.c*2048) + (hi.c*256)) + (wi.c*32)) + ki.c)] +
+ (
+ (float32*)temp_output[(((((wo.c*8192) + (rc.outer_1*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner_1)] *
+ (float32*)packed_filter.global[((((rc.outer_1*1024) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))]
+ )
+ )
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // write back output cache
+ for (wo_1: int32, 0, 8) {
+ for (hi_1: int32, 0, 8) {
+ for (wi_1: int32, 0, 8) {
+ for (ki_1: int32, 0, 32) {
+ output_2[((((((ho.outer*65536) + (wo_1*8192)) + (ko.outer*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] =
+ (float32*)packed_input.global[((((wo_1*2048) + (hi_1*256)) + (wi_1*32)) + ki_1)]
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+
+# Split on Channel Out and Height
+
+Uses the `k_split` and `h_split` parameters, which create a loop split on the outer channel out `ko` and height `ho` loops, creating `outer` and `inner` loops for each split. The cache reads and writes are computed at `ho.outer`, which means that the cache allocations grow in proportion to the `k_split` and `h_split` factors.
+
+## Command
+
+pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-1-128-1-1-128-2-2-float32-llvm]"
+
+## Parameters
+
+| Parameter | Value |
+| ------------------------ | ----- |
+| Batch | 1 |
+| Input Size | 64x64 |
+| Input Channel | 128 |
+| Conv2d #1 Pad | 0 |
+| Conv2d #1 Stride | 1 |
+| Conv2d #1 Kernel Size | 1 |
+| Conv2d #1 Output Channel | 128 |
+| Conv2d #2 Stride | 1 |
+| Conv2d #2 Kernel Size | 1 |
+| Conv2d #2 Output Channel | 128 |
+| k_split | 2 ^ |
+| h_split | 2 ^ |
+
+^ Changes from above
+
+## Constants
+
+| Constant | Value |
+| ------------------ | ----- |
+| Conv2d #2 Pad | 0 |
+| Conv2d #1 Dilation | 1 |
+| Conv2d #2 Dilation | 1 |
+
+## Shapes and Layouts
+
+The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for physical tensors.
+
+| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape |
+| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- |
+| Input | Logical | NHWC | [1, 64, 64, 128] | | |
+| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | |
+| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] |
+| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] |
+| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] |
+| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 1, 1, 8, 32, 4] | OIHW | [128, 128, 1, 1] |
+| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] |
+
+## Schedule
+
+This is the conv2d compute schedule:
+
+```
+ for (ko.outer: int32, 0, 2) {
+ for (ho.outer: int32, 0, 4) {
+ // caches computed here
+ for (ko.c.inner: int32, 0, 2) {
+ for (ho.c.inner: int32, 0, 2) {
+ for (wo.c: int32, 0, 8) {
+ for (rc.outer_1: int32, 0, 4) {
+ for (hi.c: int32, 0, 8) {
+ for (wi.c: int32, 0, 8) {
+ for (ki.c: int32, 0, 32) {
+ for (rc.inner_1: int32, 0, 32) {
+```
+
+The major change here versus above is the presence of `inner` loops for both channel out `ko` and height `ho` dimensions, created from the `k_split` and `h_split` schedule parameters respectively:
+
+
+```
+ for (ko.c.inner: int32, 0, 2) {
+ for (ho.c.inner: int32, 0, 2) {
+```
+
+The major effect of this change is increased cache usage given that caches are computed at the `ho.outer` level of the loop schedule. This is documented in the next section.
+
+(Same as above) Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2.
+
+```
+ for (ko.outer_1: int32, 0, 2) {
+```
+
+## Cache Usage
+
+*Input Cache*
+
+The input cache grows by a factor of `h_split = 2` compared with above:
+
+```
+ allocate(packed_input.global: Pointer(global float32), float32, [131072]), storage_scope = global;
+```
+
+*Filter Cache*
+
+The filter cache grows by a factor of `k_split = 2` compared with above:
+
+```
+ allocate(packed_filter.global: Pointer(global float32), float32, [8192]), storage_scope = global;
+```
+
+(Same as above) Note that there is just one cache which is reused for conv2d / filter #1 and conv2d / filter #2.
+
+*Output Cache*
+
+The output cache grows by a factor of `h_split = 2` compared with above (the temporary output of conv2d #1 must still hold all of its output channels, so `k_split` does not change its size):
+
+```
+ allocate(temp_output: Pointer(global float32), float32, [131072]), storage_scope = global;
+```
+
+(Same as above) Note that the input cache is reused to store the results of conv2d #2.
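To make these growth factors concrete, here is a small sketch of the size arithmetic (element counts only, with illustrative variable names); it simply reproduces the allocation sizes quoted above and in the annotated TIR below:

```python
# Element counts behind the allocations for k_split = 2, h_split = 2.
k_split, h_split = 2, 2

# Activation blocks are 8h * 8w * 32c = 2048 elements; the 64x64x128 layer has
# 8 W blocks and 4 C blocks per row of H blocks.
act_block, w_blocks, c_blocks = 8 * 8 * 32, 8, 4
input_cache = h_split * w_blocks * c_blocks * act_block  # 2 * 8 * 4 * 2048 = 131072
temp_output = h_split * w_blocks * c_blocks * act_block  # full K is kept   = 131072

# Filter blocks are 8i * 32o * 4i = 1024 elements; a 1x1 kernel needs all 4
# input-channel blocks per output-channel block, and k_split blocks are cached.
filt_block, i_blocks = 8 * 32 * 4, 4
filter_cache = k_split * i_blocks * filt_block            # 2 * 4 * 1024    = 8192

print(input_cache, filter_cache, temp_output)             # 131072 8192 131072
```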
+ +## Assumptions + +* n/a + +## To Do + +* n/a + +## Annotated TIR + +``` +primfn(placeholder_3: handle, placeholder_4: handle, placeholder_5: handle, output_1: handle) -> () + attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True} + buffers = {output: Buffer(output_2: Pointer(float32), float32, [1, 8, 8, 4, 8, 8, 32], []), // nhw8h8w32c + placeholder_2: Buffer(placeholder_6: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder_1: Buffer(placeholder_7: Pointer(float32), float32, [4, 4, 1, 1, 8, 32, 4], []), // oihw8i32o4i + placeholder: Buffer(placeholder_8: Pointer(float32), float32, [1, 64, 64, 128], [])} // nhwc + buffer_map = {placeholder_3: placeholder, placeholder_4: placeholder_1, placeholder_5: placeholder_2, output_1: output} { + allocate(packed_input.global: Pointer(global float32), float32, [131072]), storage_scope = global; + allocate(temp_output: Pointer(global float32), float32, [131072]), storage_scope = global; + allocate(packed_filter.global: Pointer(global float32), float32, [8192]), storage_scope = global; + for (ko.outer: int32, 0, 2) { + for (ho.outer: int32, 0, 4) { + + // input cache read + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + for (co: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ci: int32, 0, 32) { + packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (co*2048)) + (hi*256)) + (wi*32)) + ci)] = + (float32*)placeholder_8[(((((((ho.outer*131072) + (ho.inner*65536)) + (hi*8192)) + (wo*1024)) + (wi*128)) + (co*32)) + ci)] + } + } + } + } + } + } + + // NOTE: compute over all output channels of conv2d #1 before computing conv2d #2 + for (ko.outer_1: int32, 0, 2) { + for (ko.inner: int32, 0, 2) { + // filter #1 cache read + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] = + (float32*)placeholder_7[((((((ko.outer_1*8192) + (ko.inner*4096)) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] + } + } + } + } + } + + // conv2d #1 + for (ko.inner: int32, 0, 2) { + for (ho.inner: int32, 0, 2) { + for (wo: int32, 0, 8) { + + // init temp output to zero + for (hi.init: int32, 0, 8) { + for (wi.init: int32, 0, 8) { + for (ki.init: int32, 0, 32) { + temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi.init*256)) + (wi.init*32)) + ki.init)] = 0f32 + } + } + } + + // compute + for (rc.outer: int32, 0, 4) { + for (hi: int32, 0, 8) { + for (wi: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (rc.inner: int32, 0, 32) { + temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] = + ( + (float32*)temp_output[(((((((ho.inner*65536) + (wo*8192)) + (ko.outer_1*4096)) + (ko.inner*2048)) + (hi*256)) + (wi*32)) + ki)] + + ( + (float32*)packed_input.global[((((((ho.inner*65536) + (wo*8192)) + (rc.outer*2048)) + (hi*256)) + (wi*32)) + rc.inner)] * + (float32*)packed_filter.global[(((((ko.inner*4096) + (rc.outer*1024)) + (floordiv(rc.inner, 4)*128)) + (ki*4)) + floormod(rc.inner, 4))] + ) + ) + } + } + } + } + } + } + } + } + } + + // filter #2 cache read + // NOTE: reusing same filter cache + for (ko.inner: int32, 0, 2) { + for (co: int32, 0, 4) { + for (cio: int32, 0, 8) { + for (ki: int32, 0, 32) { + for (cii: int32, 0, 4) { + packed_filter.global[(((((ko.inner*4096) + (co*1024)) + (cio*128)) + (ki*4)) + cii)] = + 
(float32*)placeholder_6[((((((ko.outer*8192) + (ko.inner*4096)) + (co*1024)) + (cio*128)) + (ki*4)) + cii)]
+ }
+ }
+ }
+ }
+ }
+
+ // conv2d #2
+ for (ko.c.inner: int32, 0, 2) {
+ for (ho.c.inner: int32, 0, 2) {
+ for (wo.c: int32, 0, 8) {
+
+ // init output cache to zero
+ // NOTE: reusing the input cache as the output cache
+ for (hi.c.init: int32, 0, 8) {
+ for (wi.c.init: int32, 0, 8) {
+ for (ki.c.init: int32, 0, 32) {
+ packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c.init*256)) + (wi.c.init*32)) + ki.c.init)] = 0f32
+ }
+ }
+ }
+
+ // compute
+ for (rc.outer_1: int32, 0, 4) {
+ for (hi.c: int32, 0, 8) {
+ for (wi.c: int32, 0, 8) {
+ for (ki.c: int32, 0, 32) {
+ for (rc.inner_1: int32, 0, 32) {
+ packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] =
+ (
+ (float32*)packed_input.global[((((((ho.c.inner*32768) + (wo.c*4096)) + (ko.c.inner*2048)) + (hi.c*256)) + (wi.c*32)) + ki.c)] +
+ (
+ (float32*)temp_output[((((((ho.c.inner*65536) + (wo.c*8192)) + (rc.outer_1*2048)) + (hi.c*256)) + (wi.c*32)) + rc.inner_1)] *
+ (float32*)packed_filter.global[(((((ko.c.inner*4096) + (rc.outer_1*1024)) + (floordiv(rc.inner_1, 4)*128)) + (ki.c*4)) + floormod(rc.inner_1, 4))]
+ )
+ )
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // write back output cache
+ for (ko.inner_1: int32, 0, 2) {
+ for (ho.inner_1: int32, 0, 2) {
+ for (wo_1: int32, 0, 8) {
+ for (hi_1: int32, 0, 8) {
+ for (wi_1: int32, 0, 8) {
+ for (ki_1: int32, 0, 32) {
+ output_2[((((((((ho.outer*131072) + (ho.inner_1*65536)) + (wo_1*8192)) + (ko.outer*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)] =
+ (float32*)packed_input.global[((((((ho.inner_1*32768) + (wo_1*4096)) + (ko.inner_1*2048)) + (hi_1*256)) + (wi_1*32)) + ki_1)]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+
+# 3x3 conv2d -> conv2d (no padding)
+
+Change from a 1x1 filter to a 3x3 filter.
+
+## Command
+
+pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_conv2d.py::TestConv2dConv2dPackedFilter::test_conv2d[1-64-128-0-1-3-128-1-3-128-2-2-float32-llvm]"
+
+## Parameters
+
+| Parameter | Value |
+| ------------------------ | ----- |
+| Batch | 1 |
+| Input Size | 64x64 |
+| Input Channel | 128 |
+| Conv2d #1 Pad | 0 |
+| Conv2d #1 Stride | 1 |
+| Conv2d #1 Kernel Size | 3 ^ |
+| Conv2d #1 Output Channel | 128 |
+| Conv2d #2 Stride | 1 |
+| Conv2d #2 Kernel Size | 3 ^ |
+| Conv2d #2 Output Channel | 128 |
+| k_split | 2 |
+| h_split | 2 |
+
+^ Changes from above
+
+## Constants
+
+| Constant | Value |
+| ------------------ | ----- |
+| Conv2d #2 Pad | 0 |
+| Conv2d #1 Dilation | 1 |
+| Conv2d #2 Dilation | 1 |
+
+## Shapes and Layouts
+
+The input is provided and padded in logical layout and then packed into its physical layout prior to compute. Logical layout / shape information is provided as a reference for physical tensors.
+
+| Tensor | Type | Layout | Shape | Logical Layout | Logical Shape |
+| ------------ | -------- | ----------- | ---------------------- | -------------- | ---------------- |
+| Input | Logical | NHWC | [1, 64, 64, 128] | | |
+| Padded Input | Logical | NHWC | [1, 64, 64, 128] | | |
+| Packed Input | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 64, 64, 128] |
+| Filter 1 | Physical | OIHW8i32o4i | [4, 4, 3, 3, 8, 32, 4] | OIHW | [128, 128, 3, 3] |
+| Temp Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 62, 62, 128] |
+| Filter 2 | Physical | OIHW8i32o4i | [4, 4, 3, 3, 8, 32, 4] | OIHW | [128, 128, 3, 3] |
+| Output | Physical | NHWC8h8w32c | [1, 8, 8, 4, 8, 8, 32] | NHWC | [1, 60, 60, 128] |
+
+## Schedule
+
+This is the conv2d compute schedule:
+
+```
+ for (ko.outer: int32, 0, 2) {
+ for (ho.outer: int32, 0, 4) {
+ // caches computed here
+ for (ko.c.inner: int32, 0, 2) {
+ for (ho.c.inner: int32, 0, 2) {
+ for (wo.c: int32, 0, 8) {
+ for (rc.outer_1: int32, 0, 4) {
+ for (hi.c: int32, 0, 8) {
+ for (wi.c: int32, 0, 8) {
+ for (rh_1: int32, 0, 3) {
+ for (rw_1: int32, 0, 3) {
+ for (ki.c: int32, 0, 32) {
+ for (rc.inner_1: int32, 0, 32) {
+
+```
+
+The major change here is the presence of the kernel height `rh` and width `rw` dimensions.
+
+```
+ for (rh_1: int32, 0, 3) {
+ for (rw_1: int32, 0, 3) {
+```
+
+(Same as above) Note that conv2d #1 has an independent loop over the channel out `ko.outer` dimension. This is because the output channels of conv2d #1 are the input channels to conv2d #2 and we compute over all input channels for each conv2d so we must compute over all output channels of conv2d #1 before we compute conv2d #2.
+
+```
+ for (ko.outer_1: int32, 0, 2) {
+```
+
+(Different from above) Note that conv2d #1 also has an independent loop over some portion of the `ho.outer` dimension. This is due to the fact that the 3x3 filter will "fall off the bottom" of the input and thus the vertically adjacent "full width" and "full depth" slice of the input must be a) prefetched into the input cache for conv2d #1 and b) produced into the temporary output cache that feeds conv2d #2.
+
+```
+ for (ho.outer_1: int32, 0, 2) {

Review comment: Added some more info in the README to help explain. Short story: the first conv2d must produce sufficient output in the height dimension prior to the second conv2d starting.
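For reference, a small sketch of the height arithmetic behind this note (assuming stride 1 and no padding, as in this test case; the helper below is illustrative and not part of the test):

```python
def conv_out(size, kernel, stride=1, pad=0):
    # Standard conv2d output-size formula.
    return (size + 2 * pad - kernel) // stride + 1

h1 = conv_out(64, 3)  # 62: logical output height of conv2d #1
h2 = conv_out(h1, 3)  # 60: logical output height of conv2d #2

# One 8-row block of conv2d #2 output needs 8 + (3 - 1) = 10 rows of conv2d #1
# output, i.e. two vertically adjacent 8-row blocks -- hence the extra loop of
# extent 2 over the height blocks of conv2d #1.
rows_needed = 8 + (3 - 1)
blocks_needed = -(-rows_needed // 8)  # ceildiv(10, 8) = 2
print(h1, h2, blocks_needed)          # 62 60 2
```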
