LeiWang1999 commented on PR #15462:
URL: https://github.com/apache/tvm/pull/15462#issuecomment-1663807057

   Done — and we now see about a 5x speedup.
   
   ```bash
   Time cost is:  8.849430084228516 ms
   Name                                                  Duration (us)  Percent 
 Device  Count                                                                  
                     Argument Shapes  
   fused_relax_nn_conv2d_relax_add_relax_nn_relu_cudnn         1124.35    13.64 
  cuda0      1     float16[128, 224, 224, 3], float16[64, 7, 7, 3], float16[1, 
1, 1, 64], float16[128, 112, 112, 64]  
   fused_relax_nn_conv2d_relax_add_cudnn                        680.29     8.25 
  cuda0      2       float16[128, 56, 56, 64], float16[64, 3, 3, 64], 
float16[1, 1, 1, 64], float16[128, 56, 56, 64]  
   fused_relax_nn_conv2d_relax_add_relax_nn_relu1_cudnn         665.60     8.07 
  cuda0      2       float16[128, 56, 56, 64], float16[64, 3, 3, 64], 
float16[1, 1, 1, 64], float16[128, 56, 56, 64]  
   add                                                          528.38     6.41 
  cuda0      2                          float16[128, 56, 56, 64], float16[128, 
56, 56, 64], float16[128, 56, 56, 64]  
   fused_relax_nn_conv2d_relax_add5_cudnn                       519.17     6.30 
  cuda0      2      float16[128, 7, 7, 512], float16[512, 3, 3, 512], 
float16[1, 1, 1, 512], float16[128, 7, 7, 512]  
   fused_relax_nn_conv2d_relax_add1_cudnn                       508.93     6.17 
  cuda0      2  float16[128, 28, 28, 128], float16[128, 3, 3, 128], float16[1, 
1, 1, 128], float16[128, 28, 28, 128]  
   fused_relax_nn_conv2d_relax_add3_cudnn                       474.24     5.75 
  cuda0      2  float16[128, 14, 14, 256], float16[256, 3, 3, 256], float16[1, 
1, 1, 256], float16[128, 14, 14, 256]  
   relu                                                         444.41     5.39 
  cuda0      2                                                    float16[128, 
56, 56, 64], float16[128, 56, 56, 64]  
   max_pool2d                                                   389.12     4.72 
  cuda0      1                                                  float16[128, 
112, 112, 64], float16[128, 56, 56, 64]  
   add1                                                         268.29     3.25 
  cuda0      2                       float16[128, 28, 28, 128], float16[128, 
28, 28, 128], float16[128, 28, 28, 128]  
   fused_relax_nn_conv2d_relax_add_relax_nn_relu7_cudnn         257.02     3.12 
  cuda0      1      float16[128, 7, 7, 512], float16[512, 3, 3, 512], 
float16[1, 1, 1, 512], float16[128, 7, 7, 512]  
   fused_relax_nn_conv2d_relax_add_relax_nn_relu3_cudnn         252.26     3.06 
  cuda0      1  float16[128, 28, 28, 128], float16[128, 3, 3, 128], float16[1, 
1, 1, 128], float16[128, 28, 28, 128]  
   fused_relax_nn_conv2d_relax_add_relax_nn_relu5_cudnn         236.54     2.87 
  cuda0      1  float16[128, 14, 14, 256], float16[256, 3, 3, 256], float16[1, 
1, 1, 256], float16[128, 14, 14, 256]  
   relu1                                                        225.28     2.73 
  cuda0      2                                                  float16[128, 
28, 28, 128], float16[128, 28, 28, 128]  
   transpose                                                    179.20     2.17 
  cuda0      1                                                  float16[128, 3, 
224, 224], float16[128, 224, 224, 3]  
   fused_relax_nn_conv2d_relax_add_relax_nn_relu2_cudnn         176.13     2.14 
  cuda0      1    float16[128, 56, 56, 64], float16[128, 3, 3, 64], float16[1, 
1, 1, 128], float16[128, 28, 28, 128]  
   fused_relax_nn_conv2d_relax_add2_cudnn                       172.03     2.09 
  cuda0      1    float16[128, 56, 56, 64], float16[128, 1, 1, 64], float16[1, 
1, 1, 128], float16[128, 28, 28, 128]  
   fused_relax_nn_conv2d_relax_add4_cudnn                       157.69     1.91 
  cuda0      1  float16[128, 28, 28, 128], float16[256, 1, 1, 128], float16[1, 
1, 1, 256], float16[128, 14, 14, 256]  
   fused_relax_nn_conv2d_relax_add_relax_nn_relu4_cudnn         154.62     1.88 
  cuda0      1  float16[128, 28, 28, 128], float16[256, 3, 3, 128], float16[1, 
1, 1, 256], float16[128, 14, 14, 256]  
   fused_relax_nn_conv2d_relax_add6_cudnn                       147.46     1.79 
  cuda0      1    float16[128, 14, 14, 256], float16[512, 1, 1, 256], 
float16[1, 1, 1, 512], float16[128, 7, 7, 512]  
   add2                                                         140.29     1.70 
  cuda0      2                       float16[128, 14, 14, 256], float16[128, 
14, 14, 256], float16[128, 14, 14, 256]  
   fused_relax_nn_conv2d_relax_add_relax_nn_relu6_cudnn         135.17     1.64 
  cuda0      1    float16[128, 14, 14, 256], float16[512, 3, 3, 256], 
float16[1, 1, 1, 512], float16[128, 7, 7, 512]  
   relu2                                                        116.96     1.42 
  cuda0      2                                                  float16[128, 
14, 14, 256], float16[128, 14, 14, 256]  
   matmul                                                        76.80     0.93 
  cuda0      1                                             float16[128, 512], 
float16[512, 1000], float16[128, 1000]  
   add3                                                          67.58     0.82 
  cuda0      2                             float16[128, 7, 7, 512], 
float16[128, 7, 7, 512], float16[128, 7, 7, 512]  
   relu3                                                         52.41     0.64 
  cuda0      2                                                      
float16[128, 7, 7, 512], float16[128, 7, 7, 512]  
   adaptive_avg_pool2d                                           17.41     0.21 
  cuda0      1                                                      
float16[128, 7, 7, 512], float16[128, 1, 1, 512]  
   add4                                                           5.12     0.06 
  cuda0      1                                                 float16[128, 
1000], float16[1000], float16[128, 1000]  
   vm.builtin.check_tensor_info                                   2.05     0.02 
  cuda0      1                                                                  
           float16[128, 3, 224, 224]  
   vm.builtin.reshape                                             2.05     0.02 
  cuda0      1                                                                  
                   float16[128, 512]  
   vm.builtin.match_shape                                         1.02     0.01 
  cuda0      1                                                                  
           float16[128, 3, 224, 224]  
   vm.builtin.reshape                                             1.02     0.01 
  cuda0      1                                                                  
             float16[128, 1, 1, 512]  
   vm.builtin.reshape                                             1.02     0.01 
  cuda0      1                                                                  
             float16[128, 512, 1, 1]  
   ----------                                                                   
                                                                                
                                      
   Sum                                                         8179.91    99.23 
            46                                                                  
                                      
   Total                                                       6316.38          
   cpu0      1                                                                  
                                      
   Total                                                       8243.20          
  cuda0      1                                                                  
                                      
   
   Configuration
   -------------
   Number of threads: 32
   Executor: VM
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to