Alex-grovety commented on issue #8717:
URL: https://github.com/apache/tvm/issues/8717#issuecomment-914252736


   Hello @Mousius,
   I found that DepthwiseConv2d doesn't get fused because it uses `default_schedule` 
with `auto_inline = False`; if we set `auto_inline = True`, we get the following (from 
`default_lib1.c` for test_quant_mobilenet_tfl):
   ```
   TVM_DLL int32_t 
tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_21(void* 
args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* 
out_ret_tcode, void* resource_handle) {
     void* arg0 = (((TVMValue*)args)[0].v_handle);
     int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
     void* arg1 = (((TVMValue*)args)[1].v_handle);
     int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
     void* arg2 = (((TVMValue*)args)[2].v_handle);
     int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
     void* arg3 = (((TVMValue*)args)[3].v_handle);
     int32_t arg3_code = ((int32_t*)arg_type_ids)[(3)];
     void* placeholder = (((DLTensor*)arg0)[0].data);
     void* arg0_shape = (((DLTensor*)arg0)[0].shape);
     void* arg0_strides = (((DLTensor*)arg0)[0].strides);
     int32_t dev_id = (((DLTensor*)arg0)[0].device.device_id);
     void* placeholder1 = (((DLTensor*)arg1)[0].data);
     void* arg1_shape = (((DLTensor*)arg1)[0].shape);
     void* arg1_strides = (((DLTensor*)arg1)[0].strides);
     void* placeholder2 = (((DLTensor*)arg2)[0].data);
     void* arg2_shape = (((DLTensor*)arg2)[0].shape);
     void* arg2_strides = (((DLTensor*)arg2)[0].strides);
     void* T_cast = (((DLTensor*)arg3)[0].data);
     void* arg3_shape = (((DLTensor*)arg3)[0].shape);
     void* arg3_strides = (((DLTensor*)arg3)[0].strides);
     if (!(arg0_strides == NULL)) {
     }
     if (!(arg1_strides == NULL)) {
     }
     if (!(arg2_strides == NULL)) {
     }
     if (!(arg3_strides == NULL)) {
     }
     void* DepthwiseConv2d = TVMBackendAllocWorkspace(1, dev_id, 
(uint64_t)1605632, 0, 32);
     if (DepthwiseConv2d == NULL) {
       return -1;
     }
     for (int32_t i = 0; i < 56; ++i) {
       for (int32_t j = 0; j < 56; ++j) {
         for (int32_t c = 0; c < 128; ++c) {
           ((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] = 0;
           for (int32_t di = 0; di < 3; ++di) {
             for (int32_t dj = 0; dj < 3; ++dj) {
               ((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] = 
(((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] + 
(((int32_t)(((((1 <= (i + di)) && ((i + di) < 57)) && (1 <= (j + dj))) && ((j + 
dj) < 57)) ? ((int16_t*)placeholder)[(((((((i * 7168) + (di * 7168)) + (j * 
128)) + (dj * 128)) + c) - 7296))] : (int16_t)0)) * 
((int32_t)((int16_t*)placeholder1)[((((di * 384) + (dj * 128)) + c))])));
             }
           }
         }
       }
     }
     for (int32_t ax0_ax1_fused_ax2_fused_ax3_fused = 0; 
ax0_ax1_fused_ax2_fused_ax3_fused < 401408; 
++ax0_ax1_fused_ax2_fused_ax3_fused) {
       int32_t _1 = (int32_t)(((((0 != 0) ? 
(((int64_t)(((int32_t*)DepthwiseConv2d)[(ax0_ax1_fused_ax2_fused_ax3_fused)] + 
((int32_t*)placeholder2)[((ax0_ax1_fused_ax2_fused_ax3_fused & 127))])) << 
((int64_t)0)) : 
((int64_t)(((int32_t*)DepthwiseConv2d)[(ax0_ax1_fused_ax2_fused_ax3_fused)] + 
((int32_t*)placeholder2)[((ax0_ax1_fused_ax2_fused_ax3_fused & 127))]))) * 
(int64_t)2080045879) + ((int64_t)1 << ((int64_t)((4 + 31) - 1)))) >> 
((int64_t)(4 + 31)));
       int32_t _2 = (_1) < (255) ? (_1) : (255);
       ((int16_t*)T_cast)[(ax0_ax1_fused_ax2_fused_ax3_fused)] = 
((int16_t)((uint8_t)((_2) > (0) ? (_2) : (0))));
     }
     if (TVMBackendFreeWorkspace(1, dev_id, DepthwiseConv2d) != 0) {
       return -1;
     }
     return 0;
   }
   ```
   
   we get a cast to int16 if the hardware doesn't support fast int8 arithmetic 
operations. For hardware that does support fast int8 arithmetic operations, we get 
the following (from `default_lib1.c` for test_quant_mobilenet_tfl):
   ```
   TVM_DLL int32_t 
tvmgen_default_fused_nn_conv2d_subtract_add_fixed_point_multiply_clip_cast_10(void*
 args, void* arg_type_ids, int32_t num_args, void* out_ret_value, void* 
out_ret_tcode, void* resource_handle) {
     void* arg0 = (((TVMValue*)args)[0].v_handle);
     int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)];
     void* arg1 = (((TVMValue*)args)[1].v_handle);
     int32_t arg1_code = ((int32_t*)arg_type_ids)[(1)];
     void* arg2 = (((TVMValue*)args)[2].v_handle);
     int32_t arg2_code = ((int32_t*)arg_type_ids)[(2)];
     void* arg3 = (((TVMValue*)args)[3].v_handle);
     int32_t arg3_code = ((int32_t*)arg_type_ids)[(3)];
     void* arg4 = (((TVMValue*)args)[4].v_handle);
     int32_t arg4_code = ((int32_t*)arg_type_ids)[(4)];
     void* placeholder = (((DLTensor*)arg0)[0].data);
     void* arg0_shape = (((DLTensor*)arg0)[0].shape);
     void* arg0_strides = (((DLTensor*)arg0)[0].strides);
     int32_t dev_id = (((DLTensor*)arg0)[0].device.device_id);
     void* placeholder1 = (((DLTensor*)arg1)[0].data);
     void* arg1_shape = (((DLTensor*)arg1)[0].shape);
     void* arg1_strides = (((DLTensor*)arg1)[0].strides);
     void* placeholder2 = (((DLTensor*)arg2)[0].data);
     void* arg2_shape = (((DLTensor*)arg2)[0].shape);
     void* arg2_strides = (((DLTensor*)arg2)[0].strides);
     void* placeholder3 = (((DLTensor*)arg3)[0].data);
     void* arg3_shape = (((DLTensor*)arg3)[0].shape);
     void* arg3_strides = (((DLTensor*)arg3)[0].strides);
     void* T_cast = (((DLTensor*)arg4)[0].data);
     void* arg4_shape = (((DLTensor*)arg4)[0].shape);
     void* arg4_strides = (((DLTensor*)arg4)[0].strides);
     if (!(arg0_strides == NULL)) {
     }
     if (!(arg1_strides == NULL)) {
     }
     if (!(arg2_strides == NULL)) {
     }
     if (!(arg3_strides == NULL)) {
     }
     if (!(arg4_strides == NULL)) {
     }
     void* PaddedInput = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)430592, 
1, 8);
     if (PaddedInput == NULL) {
       return -1;
     }
     void* DepthwiseConv2d = TVMBackendAllocWorkspace(1, dev_id, 
(uint64_t)1605632, 0, 32);
     if (DepthwiseConv2d == NULL) {
       return -1;
     }
     for (int32_t i1 = 0; i1 < 58; ++i1) {
       for (int32_t i2 = 0; i2 < 58; ++i2) {
         for (int32_t i3 = 0; i3 < 128; ++i3) {
           ((uint8_t*)PaddedInput)[((((i1 * 7424) + (i2 * 128)) + i3))] = 
((uint8_t*)placeholder)[((((i1 * 7424) + (i2 * 128)) + i3))];
         }
       }
     }
     for (int32_t i = 0; i < 56; ++i) {
       for (int32_t j = 0; j < 56; ++j) {
         for (int32_t c = 0; c < 128; ++c) {
           ((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] = 0;
           for (int32_t di = 0; di < 3; ++di) {
             for (int32_t dj = 0; dj < 3; ++dj) {
               ((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] = 
(((int32_t*)DepthwiseConv2d)[((((i * 7168) + (j * 128)) + c))] + 
(((int32_t)((uint8_t*)PaddedInput)[((((((i * 7424) + (di * 7424)) + (j * 128)) 
+ (dj * 128)) + c))]) * ((int32_t)((int8_t*)placeholder1)[((((di * 384) + (dj * 
128)) + c))])));
             }
           }
         }
       }
     }
     for (int32_t ax1 = 0; ax1 < 56; ++ax1) {
       for (int32_t ax2 = 0; ax2 < 56; ++ax2) {
         for (int32_t ax3 = 0; ax3 < 128; ++ax3) {
           ((int32_t*)DepthwiseConv2d)[((((ax1 * 7168) + (ax2 * 128)) + ax3))] 
= (((int32_t*)DepthwiseConv2d)[((((ax1 * 7168) + (ax2 * 128)) + ax3))] - 
((int32_t*)placeholder2)[((((ax1 * 7168) + (ax2 * 128)) + ax3))]);
         }
       }
     }
     for (int32_t ax11 = 0; ax11 < 56; ++ax11) {
       for (int32_t ax21 = 0; ax21 < 56; ++ax21) {
         for (int32_t ax31 = 0; ax31 < 128; ++ax31) {
           ((int32_t*)DepthwiseConv2d)[((((ax11 * 7168) + (ax21 * 128)) + 
ax31))] = (((int32_t*)DepthwiseConv2d)[((((ax11 * 7168) + (ax21 * 128)) + 
ax31))] + ((int32_t*)placeholder3)[(ax31)]);
         }
       }
     }
     for (int32_t i11 = 0; i11 < 56; ++i11) {
       for (int32_t i21 = 0; i21 < 56; ++i21) {
         for (int32_t i31 = 0; i31 < 128; ++i31) {
           ((int32_t*)DepthwiseConv2d)[((((i11 * 7168) + (i21 * 128)) + i31))] 
= ((int32_t)(((((0 != 0) ? (((int64_t)((int32_t*)DepthwiseConv2d)[((((i11 * 
7168) + (i21 * 128)) + i31))]) << ((int64_t)0)) : 
((int64_t)((int32_t*)DepthwiseConv2d)[((((i11 * 7168) + (i21 * 128)) + i31))])) 
* (int64_t)2080045879) + ((int64_t)1 << ((int64_t)((4 + 31) - 1)))) >> 
((int64_t)(4 + 31))));
         }
       }
     }
     for (int32_t i12 = 0; i12 < 56; ++i12) {
       for (int32_t i22 = 0; i22 < 56; ++i22) {
         for (int32_t i32 = 0; i32 < 128; ++i32) {
           int32_t _1 = ((int32_t*)DepthwiseConv2d)[((((i12 * 7168) + (i22 * 
128)) + i32))];
           int32_t _2 = (_1) < (255) ? (_1) : (255);
           ((int32_t*)DepthwiseConv2d)[((((i12 * 7168) + (i22 * 128)) + i32))] 
= ((_2) > (0) ? (_2) : (0));
         }
       }
     }
     for (int32_t ax12 = 0; ax12 < 56; ++ax12) {
       for (int32_t ax22 = 0; ax22 < 56; ++ax22) {
         for (int32_t ax32 = 0; ax32 < 128; ++ax32) {
           ((uint8_t*)T_cast)[((((ax12 * 7168) + (ax22 * 128)) + ax32))] = 
((uint8_t)((int32_t*)DepthwiseConv2d)[((((ax12 * 7168) + (ax22 * 128)) + 
ax32))]);
         }
       }
     }
     if (TVMBackendFreeWorkspace(1, dev_id, DepthwiseConv2d) != 0) {
       return -1;
     }
     if (TVMBackendFreeWorkspace(1, dev_id, PaddedInput) != 0) {
       return -1;
     }
     return 0;
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to