echuraev commented on a change in pull request #8636:
URL: https://github.com/apache/tvm/pull/8636#discussion_r693947325



##########
File path: python/tvm/topi/gpu/conv2d_nhwc.py
##########
@@ -85,15 +87,17 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv):
     thread_yz = te.thread_axis((0, vthread_n), "vthread", name="vy")
 
     # Schedule for output
-    ni, hi, wi, fi = s[output].op.axis
-    bz = s[output].fuse(hi, wi)
+    ni, _, wi, fi = s[output].op.axis
+    bz = wi
+    fi, vec = s[output].split(fi, factor=vec_factor)
+    s[output].vectorize(vec)

Review comment:
       Sorry for the late reply, I was on vacation. Thank you for your question 
@masahi!
   Today, in the latest commit, I added vectorization to the conv2d inner loop. 
You can see the diff of the generated OpenCL kernel below:
   ```diff
   __kernel void my_conv_kernel0(__global float* restrict inp, __global float* 
restrict w, __global float* restrict Conv2dOutput) {
     float Conv2dOutput_local[4];
     __local float PaddedInput_shared[24];
     __local float w_shared[256];
     float PaddedInput_shared_local[1];
     float w_shared_local[4];
     for (int yy = 0; yy < 298; ++yy) {
   -    for (int ff_c_init = 0; ff_c_init < 4; ++ff_c_init) {
   -      Conv2dOutput_local[(ff_c_init)] = 0.000000e+00f;
   -    }
   +    vstore4(((float4)(0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 
0.000000e+00f)), 0, Conv2dOutput_local + 0);
       for (int rc_outer = 0; rc_outer < 2; ++rc_outer) {
         for (int ry = 0; ry < 3; ++ry) {
           for (int rx = 0; rx < 3; ++rx) {
             barrier(CLK_LOCAL_MEM_FENCE);
             PaddedInput_shared[(((((int)get_local_id(1)) * 4) + 
((int)get_local_id(0))))] = inp[((((((((yy * 9600) + (ry * 9600)) + 
(((int)get_group_id(2)) * 32)) + (rx * 32)) + (rc_outer * 16)) + 
(((int)get_local_id(1)) * 4)) + ((int)get_local_id(0))))];
             for (int ax2_ax3_fused_outer_outer_outer = 0; 
ax2_ax3_fused_outer_outer_outer < 8; ++ax2_ax3_fused_outer_outer_outer) {
               vstore2(vload2(0, w + (((((((ry * 3072) + (rx * 1024)) + 
(rc_outer * 512)) + (ax2_ax3_fused_outer_outer_outer * 64)) + 
((((((int)get_local_id(1)) * 8) + (((int)get_local_id(0)) * 2)) >> 4) * 32)) + 
(((int)get_group_id(0)) * 16)) + (((((int)get_local_id(1)) * 8) + 
(((int)get_local_id(0)) * 2)) & 15))), 0, w_shared + 
(((ax2_ax3_fused_outer_outer_outer * 32) + (((int)get_local_id(1)) * 8)) + 
(((int)get_local_id(0)) * 2)));
             }
             barrier(CLK_LOCAL_MEM_FENCE);
             for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
               if (((int)get_local_id(1)) < 1) {
                 PaddedInput_shared_local[(0)] = 
PaddedInput_shared[(((((int)get_local_id(1)) * 24) + rc_inner))];
               }
               for (int ax3 = 0; ax3 < 4; ++ax3) {
                 w_shared_local[(ax3)] = w_shared[((((rc_inner * 16) + 
(((int)get_local_id(0)) * 4)) + ax3))];
               }
   -            for (int ff_c = 0; ff_c < 4; ++ff_c) {
   -              if (((int)get_local_id(1)) < 1) {
   -                Conv2dOutput_local[(ff_c)] = (Conv2dOutput_local[(ff_c)] + 
(PaddedInput_shared_local[(0)] * w_shared_local[(ff_c)]));
   -              }
   +            if (((int)get_local_id(1)) < 1) {
   +              vstore4((vload4(0, Conv2dOutput_local + 0) + 
(((float4)(PaddedInput_shared_local[(0)], PaddedInput_shared_local[(0)], 
PaddedInput_shared_local[(0)], PaddedInput_shared_local[(0)])) * vload4(0, 
w_shared_local + 0))), 0, Conv2dOutput_local + 0);
               }
             }
           }
         }
       }
       for (int ff_outer_inner = 0; ff_outer_inner < 2; ++ff_outer_inner) {
         if (((int)get_local_id(1)) < 1) {
           vstore2(vload2(0, Conv2dOutput_local + (ff_outer_inner * 2)), 0, 
Conv2dOutput + ((((((((int)get_local_id(1)) * 2841728) + (yy * 9536)) + 
(((int)get_group_id(2)) * 32)) + (((int)get_group_id(0)) * 16)) + 
(((int)get_local_id(0)) * 4)) + (ff_outer_inner * 2)));
         }
       }
     }
   }
   ```
   
   With this vectorization, the execution time didn't change in comparison with 
the previous code generation.
   
   
   Now, regarding the performance boost. First, let me share my performance 
numbers (each number is the average execution time over 10 runs), which I got 
today on a Samsung Galaxy A71:
   - Original code (w/o any changes): 1225.95 ms
   - Code with bug fix (read about bug fix below): 1383.18 ms
   - With vectorization: 189.36 ms
   
   *Bug fix.* In the first commit, I also fixed an accuracy problem in OpenCL. 
[Here](https://github.com/apache/tvm/pull/8636/commits/0a18676916dd8656110f8a27ab0fe6be154136db#diff-194f66fb68e225fb63696eabb5aac5f3e30357c8e8bb25d33e2713eb5de322aaL88-L89)
 after fusing `hi` and `wi`, it was possible that the value of the OpenCL 
`global_work_size` for `clEnqueueNDRangeKernel` was too high, so some values in 
the output tensor were not calculated.
   
   > I wonder where 6-7x perf improvement comes from?
   
   To answer this question, let's compare the generated OpenCL code for the 
version with the bug fix against the latest code:
   ```diff
   __kernel void my_conv_kernel0(__global float* restrict inp, __global float* 
restrict w, __global float* restrict Conv2dOutput) {
   -  float Conv2dOutput_local[2];
   +  float Conv2dOutput_local[4];
     __local float PaddedInput_shared[24];
     __local float w_shared[256];
     float PaddedInput_shared_local[1];
   -  float w_shared_local[2];
   +  float w_shared_local[4];
     for (int yy = 0; yy < 298; ++yy) {
   -    for (int ff_c_init = 0; ff_c_init < 2; ++ff_c_init) {
   -      Conv2dOutput_local[(ff_c_init)] = 0.000000e+00f;
   -    }
   +    vstore4(((float4)(0.000000e+00f, 0.000000e+00f, 0.000000e+00f, 
0.000000e+00f)), 0, Conv2dOutput_local + 0);
       for (int rc_outer = 0; rc_outer < 2; ++rc_outer) {
         for (int ry = 0; ry < 3; ++ry) {
           for (int rx = 0; rx < 3; ++rx) {
             barrier(CLK_LOCAL_MEM_FENCE);
             PaddedInput_shared[(((((int)get_local_id(1)) * 4) + 
((int)get_local_id(0))))] = inp[((((((((yy * 9600) + (ry * 9600)) + 
(((int)get_group_id(2)) * 32)) + (rx * 32)) + (rc_outer * 16)) + 
(((int)get_local_id(1)) * 4)) + ((int)get_local_id(0))))];
   -          for (int ax2_ax3_fused_outer_outer = 0; ax2_ax3_fused_outer_outer 
< 8; ++ax2_ax3_fused_outer_outer) {
   -            w_shared[((((ax2_ax3_fused_outer_outer * 32) + 
((((((int)get_local_id(1)) * 4) + ((int)get_local_id(0))) >> 3) * 16)) + 
(((((int)get_local_id(1)) * 4) + ((int)get_local_id(0))) & 7)))] = w[((((((((ry 
* 3072) + (rx * 1024)) + (rc_outer * 512)) + (ax2_ax3_fused_outer_outer * 64)) 
+ ((((((int)get_local_id(1)) * 4) + ((int)get_local_id(0))) >> 3) * 32)) + 
(((int)get_group_id(0)) * 8)) + (((((int)get_local_id(1)) * 4) + 
((int)get_local_id(0))) & 7)))];
   +          for (int ax2_ax3_fused_outer_outer_outer = 0; 
ax2_ax3_fused_outer_outer_outer < 8; ++ax2_ax3_fused_outer_outer_outer) {
   +            vstore2(vload2(0, w + (((((((ry * 3072) + (rx * 1024)) + 
(rc_outer * 512)) + (ax2_ax3_fused_outer_outer_outer * 64)) + 
((((((int)get_local_id(1)) * 8) + (((int)get_local_id(0)) * 2)) >> 4) * 32)) + 
(((int)get_group_id(0)) * 16)) + (((((int)get_local_id(1)) * 8) + 
(((int)get_local_id(0)) * 2)) & 15))), 0, w_shared + 
(((ax2_ax3_fused_outer_outer_outer * 32) + (((int)get_local_id(1)) * 8)) + 
(((int)get_local_id(0)) * 2)));
             }
             barrier(CLK_LOCAL_MEM_FENCE);
             for (int rc_inner = 0; rc_inner < 16; ++rc_inner) {
               if (((int)get_local_id(1)) < 1) {
                 PaddedInput_shared_local[(0)] = 
PaddedInput_shared[(((((int)get_local_id(1)) * 24) + rc_inner))];
               }
   -            for (int ax3 = 0; ax3 < 2; ++ax3) {
   -              w_shared_local[(ax3)] = w_shared[((((rc_inner * 16) + 
(((int)get_local_id(0)) * 2)) + ax3))];
   +            for (int ax3 = 0; ax3 < 4; ++ax3) {
   +              w_shared_local[(ax3)] = w_shared[((((rc_inner * 16) + 
(((int)get_local_id(0)) * 4)) + ax3))];
               }
   -            for (int ff_c = 0; ff_c < 2; ++ff_c) {
   -              if (((int)get_local_id(1)) < 1) {
   -                Conv2dOutput_local[(ff_c)] = (Conv2dOutput_local[(ff_c)] + 
(PaddedInput_shared_local[(0)] * w_shared_local[(ff_c)]));
   -              }
   +            if (((int)get_local_id(1)) < 1) {
   +              vstore4((vload4(0, Conv2dOutput_local + 0) + 
(((float4)(PaddedInput_shared_local[(0)], PaddedInput_shared_local[(0)], 
PaddedInput_shared_local[(0)], PaddedInput_shared_local[(0)])) * vload4(0, 
w_shared_local + 0))), 0, Conv2dOutput_local + 0);
               }
             }
           }
         }
       }
   -    for (int ff_inner = 0; ff_inner < 2; ++ff_inner) {
   +    for (int ff_outer_inner = 0; ff_outer_inner < 2; ++ff_outer_inner) {
         if (((int)get_local_id(1)) < 1) {
   -        Conv2dOutput[(((((((((int)get_local_id(1)) * 2841728) + (yy * 
9536)) + (((int)get_group_id(2)) * 32)) + (((int)get_group_id(0)) * 8)) + 
(((int)get_local_id(0)) * 2)) + ff_inner))] = Conv2dOutput_local[(ff_inner)];
   +        vstore2(vload2(0, Conv2dOutput_local + (ff_outer_inner * 2)), 0, 
Conv2dOutput + ((((((((int)get_local_id(1)) * 2841728) + (yy * 9536)) + 
(((int)get_group_id(2)) * 32)) + (((int)get_group_id(0)) * 16)) + 
(((int)get_local_id(0)) * 4)) + (ff_outer_inner * 2)));
         }
       }
     }
   }
   ```
   
   I suppose the performance boost comes from decreased memory latency: we read 
more data per execution unit and store it in vector data types.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to