Alex-grovety commented on issue #8717:
URL: https://github.com/apache/tvm/issues/8717#issuecomment-915247652
1.) TIR PrimFunc for first example:
```
PrimFunc([placeholder, placeholder, placeholder, T_cast])
attrs={"from_legacy_te_schedule": (bool)1, "global_symbol":
"tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_21",
"tir.noalias": (bool)1} {
allocate DepthwiseConv2d[int32 * 401408], storage_scope = global
for (i, 0, 56) {
for (j, 0, 56) {
for (c, 0, 128) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] = 0
for (di, 0, 3) {
for (dj, 0, 3) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] =
(DepthwiseConv2d[(((i*7168) + (j*128)) + c)] + (int32(tir.if_then_else(((((1 <=
(i + di)) && ((i + di) < 57)) && (1 <= (j + dj))) && ((j + dj) < 57)),
placeholder[((((((i*7168) + (di*7168)) + (j*128)) + (dj*128)) + c) - 7296)],
(int16)0))*int32(placeholder[(((di*384) + (dj*128)) + c)])))
}
}
}
}
}
for (ax0.ax1.fused.ax2.fused.ax3.fused, 0, 401408) {
T_cast[ax0.ax1.fused.ax2.fused.ax3.fused] =
int16(uint8(max(min(tir.q_multiply_shift((DepthwiseConv2d[ax0.ax1.fused.ax2.fused.ax3.fused]
+ placeholder[floormod(ax0.ax1.fused.ax2.fused.ax3.fused, 128)]), 2080045879,
31, -4), 255), 0)))
}
}
```
2.) TIR PrimFunc for second example:
```
PrimFunc([placeholder, placeholder, placeholder, placeholder, T_cast])
attrs={"from_legacy_te_schedule": (bool)1, "global_symbol":
"tvmgen_default_fused_nn_conv2d_subtract_add_fixed_point_multiply_clip_cast_10",
"tir.noalias": (bool)1} {
allocate PaddedInput[uint8 * 430592], storage_scope = global
allocate DepthwiseConv2d[int32 * 401408], storage_scope = global
for (i1, 0, 58) {
for (i2, 0, 58) {
for (i3, 0, 128) {
PaddedInput[(((i1*7424) + (i2*128)) + i3)] = placeholder[(((i1*7424)
+ (i2*128)) + i3)]
}
}
}
for (i, 0, 56) {
for (j, 0, 56) {
for (c, 0, 128) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] = 0
for (di, 0, 3) {
for (dj, 0, 3) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] =
(DepthwiseConv2d[(((i*7168) + (j*128)) + c)] + (int32(PaddedInput[(((((i*7424)
+ (di*7424)) + (j*128)) + (dj*128)) + c)])*int32(placeholder[(((di*384) +
(dj*128)) + c)])))
}
}
}
}
}
for (ax1, 0, 56) {
for (ax2, 0, 56) {
for (ax3, 0, 128) {
DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)] =
(DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)] - placeholder[(((ax1*7168) +
(ax2*128)) + ax3)])
}
}
}
for (ax1, 0, 56) {
for (ax2, 0, 56) {
for (ax3, 0, 128) {
DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)] =
(DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)] + placeholder[ax3])
}
}
}
for (i1, 0, 56) {
for (i2, 0, 56) {
for (i3, 0, 128) {
DepthwiseConv2d[(((i1*7168) + (i2*128)) + i3)] =
tir.q_multiply_shift(DepthwiseConv2d[(((i1*7168) + (i2*128)) + i3)],
2080045879, 31, -4)
}
}
}
for (i1, 0, 56) {
for (i2, 0, 56) {
for (i3, 0, 128) {
DepthwiseConv2d[(((i1*7168) + (i2*128)) + i3)] =
max(min(DepthwiseConv2d[(((i1*7168) + (i2*128)) + i3)], 255), 0)
}
}
}
for (ax1, 0, 56) {
for (ax2, 0, 56) {
for (ax3, 0, 128) {
T_cast[(((ax1*7168) + (ax2*128)) + ax3)] =
uint8(DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)])
}
}
}
}
```
3.) For second example with fusion we get (from default_lib1.c for
test_quant_mobilenet_tfl):
```
TVM_DLL int32_t
tvmgen_default_fused_nn_conv2d_subtract_add_fixed_point_multiply_clip_cast_10(void*
args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void*
out_ret_tcode, void* resource_handle) {
void* arg0 = (((TVMValue*)args)[0].v_handle);
int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
void* arg1 = (((TVMValue*)args)[1].v_handle);
int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
void* arg2 = (((TVMValue*)args)[2].v_handle);
int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
void* arg3 = (((TVMValue*)args)[3].v_handle);
int32_t arg3_code = ((int32_t*)arg_type_ids)[(3)];
void* arg4 = (((TVMValue*)args)[4].v_handle);
int32_t arg4_code = ((int32_t*)arg_type_ids)[(4)];
void* placeholder = (((DLTensor*)arg0)[0].data);
void* arg0_shape = (((DLTensor*)arg0)[0].shape);
void* arg0_strides = (((DLTensor*)arg0)[0].strides);
int32_t dev_id = (((DLTensor*)arg0)[0].device.device_id);
void* placeholder1 = (((DLTensor*)arg1)[0].data);
void* arg1_shape = (((DLTensor*)arg1)[0].shape);
void* arg1_strides = (((DLTensor*)arg1)[0].strides);
void* placeholder2 = (((DLTensor*)arg2)[0].data);
void* arg2_shape = (((DLTensor*)arg2)[0].shape);
void* arg2_strides = (((DLTensor*)arg2)[0].strides);
void* placeholder3 = (((DLTensor*)arg3)[0].data);
void* arg3_shape = (((DLTensor*)arg3)[0].shape);
void* arg3_strides = (((DLTensor*)arg3)[0].strides);
void* T_cast = (((DLTensor*)arg4)[0].data);
void* arg4_shape = (((DLTensor*)arg4)[0].shape);
void* arg4_strides = (((DLTensor*)arg4)[0].strides);
if (!(arg0_strides == NULL)) {
}
if (!(arg1_strides == NULL)) {
}
if (!(arg2_strides == NULL)) {
}
if (!(arg3_strides == NULL)) {
}
if (!(arg4_strides == NULL)) {
}
void* DepthwiseConv2d = TVMBackendAllocWorkspace(1, dev_id,
(uint64_t)1605632, 0, 32);
if (DepthwiseConv2d == NULL) {
return -1;
}
for (int32_t i = 0; i < 56; ++i) {
for (int32_t j = 0; j < 56; ++j) {
for (int32_t c = 0; c < 128; ++c) {
((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] = 0;
for (int32_t di = 0; di < 3; ++di) {
for (int32_t dj = 0; dj < 3; ++dj) {
((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] =
(((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] +
(((int32_t)((uint8_t*)placeholder)[((((((i * 7424) + (di * 7424)) + (j * 128))
+ (dj * 128)) + c))]) * ((int32_t)((int8_t*)placeholder1)[((((di * 384) + (dj *
128)) + c))])));
}
}
}
}
}
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused = 0;
ax0_ax1_fused_ax2_fused_ax3_fused < 401408;
++ax0_ax1_fused_ax2_fused_ax3_fused) {
int32_t _1 = (int32_t)(((((0 != 0) ?
(((int64_t)((((int32_t*)DepthwiseConv2d)[(ax0_ax1_fused_ax2_fused_ax3_fused)] +
((int32_t*)placeholder3)[((ax0_ax1_fused_ax2_fused_ax3_fused & 127))]) -
((int32_t*)placeholder2)[(ax0_ax1_fused_ax2_fused_ax3_fused)])) <<
((int64_t)0)) :
((int64_t)((((int32_t*)DepthwiseConv2d)[(ax0_ax1_fused_ax2_fused_ax3_fused)] +
((int32_t*)placeholder3)[((ax0_ax1_fused_ax2_fused_ax3_fused & 127))]) -
((int32_t*)placeholder2)[(ax0_ax1_fused_ax2_fused_ax3_fused)]))) *
(int64_t)2080045879) + ((int64_t)1 << ((int64_t)((4 + 31) - 1)))) >>
((int64_t)(4 + 31)));
int32_t _2 = (_1) < (255) ? (_1) : (255);
((uint8_t*)T_cast)[(ax0_ax1_fused_ax2_fused_ax3_fused)] =
((uint8_t)((_2) > (0) ? (_2) : (0)));
}
if (TVMBackendFreeWorkspace(1, dev_id, DepthwiseConv2d) != 0) {
return -1;
}
return 0;
}
```
TIR PrimFunc:
```
PrimFunc([placeholder, placeholder, placeholder, placeholder, T_cast])
attrs={"from_legacy_te_schedule": (bool)1, "global_symbol":
"tvmgen_default_fused_nn_conv2d_subtract_add_fixed_point_multiply_clip_cast_10",
"tir.noalias": (bool)1} {
allocate DepthwiseConv2d[int32 * 401408], storage_scope = global
for (i, 0, 56) {
for (j, 0, 56) {
for (c, 0, 128) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] = 0
for (di, 0, 3) {
for (dj, 0, 3) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] =
(DepthwiseConv2d[(((i*7168) + (j*128)) + c)] + (int32(placeholder[(((((i*7424)
+ (di*7424)) + (j*128)) + (dj*128)) + c)])*int32(placeholder[(((di*384) +
(dj*128)) + c)])))
}
}
}
}
}
for (ax0.ax1.fused.ax2.fused.ax3.fused, 0, 401408) {
T_cast[ax0.ax1.fused.ax2.fused.ax3.fused] =
uint8(max(min(tir.q_multiply_shift(((DepthwiseConv2d[ax0.ax1.fused.ax2.fused.ax3.fused]
+ placeholder[floormod(ax0.ax1.fused.ax2.fused.ax3.fused, 128)]) -
placeholder[ax0.ax1.fused.ax2.fused.ax3.fused]), 2080045879, 31, -4), 255), 0))
}
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]