Alex-grovety commented on issue #8717:
URL: https://github.com/apache/tvm/issues/8717#issuecomment-915247652
1.) TIR PrimFunc for first example:
```
PrimFunc([placeholder, placeholder, placeholder, T_cast])
attrs={"from_legacy_te_schedule": (bool)1, "global_symbol":
"tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_21",
"tir.noalias": (bool)1} {
allocate DepthwiseConv2d[int32 * 401408], storage_scope = global
for (i, 0, 56) {
for (j, 0, 56) {
for (c, 0, 128) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] = 0
for (di, 0, 3) {
for (dj, 0, 3) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] =
(DepthwiseConv2d[(((i*7168) + (j*128)) + c)] + (int32(tir.if_then_else(((((1 <=
(i + di)) && ((i + di) < 57)) && (1 <= (j + dj))) && ((j + dj) < 57)),
placeholder[((((((i*7168) + (di*7168)) + (j*128)) + (dj*128)) + c) - 7296)],
(int16)0))*int32(placeholder[(((di*384) + (dj*128)) + c)])))
}
}
}
}
}
for (ax0.ax1.fused.ax2.fused.ax3.fused, 0, 401408) {
T_cast[ax0.ax1.fused.ax2.fused.ax3.fused] =
int16(uint8(max(min(tir.q_multiply_shift((DepthwiseConv2d[ax0.ax1.fused.ax2.fused.ax3.fused]
+ placeholder[floormod(ax0.ax1.fused.ax2.fused.ax3.fused, 128)]), 2080045879,
31, -4), 255), 0)))
}
}
```
2.) TIR PrimFunc for second example:
```
PrimFunc([placeholder, placeholder, placeholder, placeholder, T_cast])
attrs={"from_legacy_te_schedule": (bool)1, "global_symbol":
"tvmgen_default_fused_nn_conv2d_subtract_add_fixed_point_multiply_clip_cast_10",
"tir.noalias": (bool)1} {
allocate PaddedInput[uint8 * 430592], storage_scope = global
allocate DepthwiseConv2d[int32 * 401408], storage_scope = global
for (i1, 0, 58) {
for (i2, 0, 58) {
for (i3, 0, 128) {
PaddedInput[(((i1*7424) + (i2*128)) + i3)] = placeholder[(((i1*7424)
+ (i2*128)) + i3)]
}
}
}
for (i, 0, 56) {
for (j, 0, 56) {
for (c, 0, 128) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] = 0
for (di, 0, 3) {
for (dj, 0, 3) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] =
(DepthwiseConv2d[(((i*7168) + (j*128)) + c)] + (int32(PaddedInput[(((((i*7424)
+ (di*7424)) + (j*128)) + (dj*128)) + c)])*int32(placeholder[(((di*384) +
(dj*128)) + c)])))
}
}
}
}
}
for (ax1, 0, 56) {
for (ax2, 0, 56) {
for (ax3, 0, 128) {
DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)] =
(DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)] - placeholder[(((ax1*7168) +
(ax2*128)) + ax3)])
}
}
}
for (ax1, 0, 56) {
for (ax2, 0, 56) {
for (ax3, 0, 128) {
DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)] =
(DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)] + placeholder[ax3])
}
}
}
for (i1, 0, 56) {
for (i2, 0, 56) {
for (i3, 0, 128) {
DepthwiseConv2d[(((i1*7168) + (i2*128)) + i3)] =
tir.q_multiply_shift(DepthwiseConv2d[(((i1*7168) + (i2*128)) + i3)],
2080045879, 31, -4)
}
}
}
for (i1, 0, 56) {
for (i2, 0, 56) {
for (i3, 0, 128) {
DepthwiseConv2d[(((i1*7168) + (i2*128)) + i3)] =
max(min(DepthwiseConv2d[(((i1*7168) + (i2*128)) + i3)], 255), 0)
}
}
}
for (ax1, 0, 56) {
for (ax2, 0, 56) {
for (ax3, 0, 128) {
T_cast[(((ax1*7168) + (ax2*128)) + ax3)] =
uint8(DepthwiseConv2d[(((ax1*7168) + (ax2*128)) + ax3)])
}
}
}
}
```
3.) For second example with fusion we get (from default_lib1.c for
test_quant_mobilenet_tfl):
```
TVM_DLL int32_t
tvmgen_default_fused_nn_conv2d_subtract_add_fixed_point_multiply_clip_cast_10(void*
args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void*
out_ret_tcode, void* resource_handle) {
void* arg0 = (((TVMValue*)args)[0].v_handle);
int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
void* arg1 = (((TVMValue*)args)[1].v_handle);
int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
void* arg2 = (((TVMValue*)args)[2].v_handle);
int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
void* arg3 = (((TVMValue*)args)[3].v_handle);
int32_t arg3_code = ((int32_t*)arg_type_ids)[(3)];
void* arg4 = (((TVMValue*)args)[4].v_handle);
int32_t arg4_code = ((int32_t*)arg_type_ids)[(4)];
void* placeholder = (((DLTensor*)arg0)[0].data);
void* arg0_shape = (((DLTensor*)arg0)[0].shape);
void* arg0_strides = (((DLTensor*)arg0)[0].strides);
int32_t dev_id = (((DLTensor*)arg0)[0].device.device_id);
void* placeholder1 = (((DLTensor*)arg1)[0].data);
void* arg1_shape = (((DLTensor*)arg1)[0].shape);
void* arg1_strides = (((DLTensor*)arg1)[0].strides);
void* placeholder2 = (((DLTensor*)arg2)[0].data);
void* arg2_shape = (((DLTensor*)arg2)[0].shape);
void* arg2_strides = (((DLTensor*)arg2)[0].strides);
void* placeholder3 = (((DLTensor*)arg3)[0].data);
void* arg3_shape = (((DLTensor*)arg3)[0].shape);
void* arg3_strides = (((DLTensor*)arg3)[0].strides);
void* T_cast = (((DLTensor*)arg4)[0].data);
void* arg4_shape = (((DLTensor*)arg4)[0].shape);
void* arg4_strides = (((DLTensor*)arg4)[0].strides);
if (!(arg0_strides == NULL)) {
}
if (!(arg1_strides == NULL)) {
}
if (!(arg2_strides == NULL)) {
}
if (!(arg3_strides == NULL)) {
}
if (!(arg4_strides == NULL)) {
}
void* DepthwiseConv2d = TVMBackendAllocWorkspace(1, dev_id,
(uint64_t)1605632, 0, 32);
if (DepthwiseConv2d == NULL) {
return -1;
}
for (int32_t i = 0; i < 56; ++i) {
for (int32_t j = 0; j < 56; ++j) {
for (int32_t c = 0; c < 128; ++c) {
((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] = 0;
for (int32_t di = 0; di < 3; ++di) {
for (int32_t dj = 0; dj < 3; ++dj) {
((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] =
(((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] +
(((int32_t)((uint8_t*)placeholder)[((((((i * 7424) + (di * 7424)) + (j * 128))
+ (dj * 128)) + c))]) * ((int32_t)((int8_t*)placeholder1)[((((di * 384) + (dj *
128)) + c))])));
}
}
}
}
}
for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused = 0;
ax0_ax1_fused_ax2_fused_ax3_fused < 401408;
++ax0_ax1_fused_ax2_fused_ax3_fused) {
int32_t _1 = (int32_t)(((((0 != 0) ?
(((int64_t)((((int32_t*)DepthwiseConv2d)[(ax0_ax1_fused_ax2_fused_ax3_fused)] +
((int32_t*)placeholder3)[((ax0_ax1_fused_ax2_fused_ax3_fused & 127))]) -
((int32_t*)placeholder2)[(ax0_ax1_fused_ax2_fused_ax3_fused)])) <<
((int64_t)0)) :
((int64_t)((((int32_t*)DepthwiseConv2d)[(ax0_ax1_fused_ax2_fused_ax3_fused)] +
((int32_t*)placeholder3)[((ax0_ax1_fused_ax2_fused_ax3_fused & 127))]) -
((int32_t*)placeholder2)[(ax0_ax1_fused_ax2_fused_ax3_fused)]))) *
(int64_t)2080045879) + ((int64_t)1 << ((int64_t)((4 + 31) - 1)))) >>
((int64_t)(4 + 31)));
int32_t _2 = (_1) < (255) ? (_1) : (255);
((uint8_t*)T_cast)[(ax0_ax1_fused_ax2_fused_ax3_fused)] =
((uint8_t)((_2) > (0) ? (_2) : (0)));
}
if (TVMBackendFreeWorkspace(1, dev_id, DepthwiseConv2d) != 0) {
return -1;
}
return 0;
}
```
TIR PrimFunc:
```
PrimFunc([placeholder, placeholder, placeholder, placeholder, T_cast])
attrs={"from_legacy_te_schedule": (bool)1, "global_symbol":
"tvmgen_default_fused_nn_conv2d_subtract_add_fixed_point_multiply_clip_cast_10",
"tir.noalias": (bool)1} {
allocate DepthwiseConv2d[int32 * 401408], storage_scope = global
for (i, 0, 56) {
for (j, 0, 56) {
for (c, 0, 128) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] = 0
for (di, 0, 3) {
for (dj, 0, 3) {
DepthwiseConv2d[(((i*7168) + (j*128)) + c)] =
(DepthwiseConv2d[(((i*7168) + (j*128)) + c)] + (int32(placeholder[(((((i*7424)
+ (di*7424)) + (j*128)) + (dj*128)) + c)])*int32(placeholder[(((di*384) +
(dj*128)) + c)])))
}
}
}
}
}
for (ax0.ax1.fused.ax2.fused.ax3.fused, 0, 401408) {
T_cast[ax0.ax1.fused.ax2.fused.ax3.fused] =
uint8(max(min(tir.q_multiply_shift(((DepthwiseConv2d[ax0.ax1.fused.ax2.fused.ax3.fused]
+ placeholder[floormod(ax0.ax1.fused.ax2.fused.ax3.fused, 128)]) -
placeholder[ax0.ax1.fused.ax2.fused.ax3.fused]), 2080045879, 31, -4), 255), 0))
}
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]