oleg-trott commented on issue #17665: No speedup from using FP16 (4 times 
slower than PyTorch)
URL: 
https://github.com/apache/incubator-mxnet/issues/17665#issuecomment-592804066
 
 
   @ptrendx 
   
   Profiler output for the `resnet50_v2` code (as posted above, FP16, but no 
multi-precision):
   
   ```
   
               Type  Time(%)      Time     Calls       Avg       Min       Max  
Name
    GPU activities:   36.53%  8.03789s      1400  5.7413ms  262.53us  18.796ms  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_14MultiSGDKernelIN7mshadow4half6half_tELb0ELb0EEEJNS0_19MultiSGDKernelParamIS6_S6_EENS_9OpReqTypeEEEEviDpT0_
                       9.06%  1.99393s     20197  98.723us  2.1440us  850.04us  
void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, 
int, __half const *, __half*, float, float)
                       7.33%  1.61244s      5022  321.08us  128.70us  1.1131ms  
turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
                       3.87%  850.75ms      5000  170.15us  17.888us  820.19us  
void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, 
cudnnGenericOp_t=13, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, 
int=2>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, 
cudnnTensorStruct, __half const *, float, float, float, float, dimArray, 
reducedDivisorArray, bool)
                       3.25%  714.28ms      2400  297.62us  93.055us  845.85us  
void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, 
int=14>(float, float, float, float, cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, 
__half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, 
int=2, int=14>, cudnnTensorStruct*, float const *, float*, float const *, float 
const , float const , float, cudnn::reduced_divisor, int, float*, 
cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, 
cudnnStatus_t*, bool)
                       3.18%  700.83ms      2008  349.02us  253.15us  661.25us  
volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
                       3.03%  666.57ms      3200  208.30us  58.976us  700.03us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op8identityELi1EEEJPN7mshadow4half6half_tEPKS9_EEEviDpT0_
                       2.86%  629.54ms      5000  125.91us  11.968us  642.81us  
void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, 
cudnnGenericOp_t=8, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, 
int=1>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, 
cudnnTensorStruct, __half const *, float, float, float, float, dimArray, 
reducedDivisorArray, bool)
                       2.01%  443.40ms      2500  177.36us  57.855us  485.28us  
void cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, 
int=20>(cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>, 
cudnnTensorStruct*, float const *, float const , float, float, float*, float 
const *, float const *, float const *, float, float, cudnn::reduced_divisor, 
int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, 
int, float, float, cudnnStatus_t*, bool)
                       2.01%  441.25ms      1507  292.80us  150.34us  406.05us  
turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1
                       1.86%  409.88ms      1600  256.18us  69.536us  761.85us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_3SumEJPN7mshadow4half7half2_tENS_9OpReqTypeES7_S7_EEEviDpT0_
                       1.81%  397.44ms      1600  248.40us  72.864us  632.19us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPN7mshadow4half6half_tESA_SA_EEEviDpT0_
                       1.77%  388.79ms      1216  319.73us  274.40us  1.0458ms  
volta_sgemm_128x64_nt
                       1.65%  362.15ms      1105  327.74us  278.27us  390.75us  
turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_small_nhwc_tn_v1
                       1.37%  300.68ms      4222  71.218us  10.208us  640.96us  
void nhwcToNchwKernel<__half, __half, float, bool=1, bool=0>(int, int, int, 
int, __half const *, __half*, float, float)
                       1.14%  250.31ms      1304  191.96us  163.36us  333.57us  
turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
                       0.92%  202.14ms       100  2.0214ms  1.9949ms  2.0394ms  
void cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, 
bool=0>(cudnnTensorStruct, __half const *, 
cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half 
const , cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half 
const , cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, 
cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, 
cudnn::reduced_divisor, float)
                       0.91%  200.70ms      5022  39.963us  3.6480us  626.27us  
void nhwcToNchwKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, 
float const *, __half*, float, float)
                       0.90%  199.08ms       802  248.23us  238.33us  957.47us  
volta_fp16_s884cudnn_fp16_128x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
                       0.86%  189.25ms       303  624.60us  558.97us  907.48us  
dgrad_1x1_stride_2x2
   
   ```
   
   FP16 + multi-precision looks very different:
   
   ```
               Type  Time(%)      Time     Calls       Avg       Min       Max  
Name
    GPU activities:   11.37%  1.55330s     19997  77.676us  2.1120us  440.13us  
void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, 
int, __half const *, __half*, float, float)
                      10.49%  1.43320s      4822  297.22us  128.13us  1.1295ms  
turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
                       6.14%  838.84ms      5000  167.77us  17.887us  817.88us  
void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, 
cudnnGenericOp_t=13, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, 
int=2>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, 
cudnnTensorStruct, __half const *, float, float, float, float, dimArray, 
reducedDivisorArray, bool)
                       4.90%  670.07ms      2400  279.19us  92.895us  719.13us  
void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, 
int=14>(float, float, float, float, cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, 
__half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, 
int=2, int=14>, cudnnTensorStruct*, float const *, float*, float const *, float 
const , float const , float, cudnn::reduced_divisor, int, float*, 
cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, 
cudnnStatus_t*, bool)
                       4.73%  646.86ms       430  1.5043ms  734.72us  4.1373ms  
void cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=8, int=3, 
int=3, int=5, bool=1, int=512>(int, int, int, __half const *, int, 
cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=1, int=512>*, __half const , kernel_grad_params, int, float, int, 
int, int, int)
                       4.61%  629.83ms      5000  125.97us  12.063us  626.20us  
void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, 
cudnnGenericOp_t=8, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, 
int=1>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, 
cudnnTensorStruct, __half const *, float, float, float, float, dimArray, 
reducedDivisorArray, bool)
                       4.39%  600.18ms      1608  373.25us  261.12us  666.72us  
volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
                       4.38%  597.94ms      3200  186.86us  58.720us  412.00us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op8identityELi1EEEJPN7mshadow4half6half_tEPKS9_EEEviDpT0_
                       3.25%  443.86ms      2500  177.55us  57.760us  495.61us  
void cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, 
int=20>(cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>, 
cudnnTensorStruct*, float const *, float const , float, float, float*, float 
const *, float const *, float const *, float, float, cudnn::reduced_divisor, 
int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, 
int, float, float, cudnnStatus_t*, bool)
                       3.22%  439.58ms      1507  291.69us  149.92us  411.23us  
turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1
                       2.91%  397.03ms      1600  248.15us  72.736us  637.12us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPN7mshadow4half6half_tESA_SA_EEEviDpT0_
                       2.89%  394.91ms      1600  246.82us  69.727us  565.18us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_3SumEJPN7mshadow4half7half2_tENS_9OpReqTypeES7_S7_EEEviDpT0_
                       2.63%  358.72ms      1105  324.64us  276.51us  388.96us  
turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_small_nhwc_tn_v1
                       2.20%  300.11ms      4222  71.081us  10.048us  432.00us  
void nhwcToNchwKernel<__half, __half, float, bool=1, bool=0>(int, int, int, 
int, __half const *, __half*, float, float)
                       1.82%  248.61ms      1304  190.65us  163.55us  329.79us  
turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
                       1.74%  238.11ms       716  332.55us  273.31us  1.0432ms  
volta_sgemm_128x64_nt
                       1.48%  202.58ms       100  2.0258ms  1.9909ms  2.0511ms  
void cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, 
bool=0>(cudnnTensorStruct, __half const *, 
cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half 
const , cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half 
const , cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, 
cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, 
cudnn::reduced_divisor, float)
                       1.45%  198.56ms       802  247.58us  238.78us  256.09us  
volta_fp16_s884cudnn_fp16_128x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
                       1.45%  198.10ms       412  480.82us  261.02us  1.2413ms  
volta_fp16_scudnn_fp16_128x128_stridedB_interior_nn_v1
                       1.39%  190.39ms       303  628.36us  556.54us  708.03us  
dgrad_1x1_stride_2x2
                       1.28%  175.54ms       301  583.18us  555.87us  597.08us  
volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
                       1.16%  158.07ms       302  523.41us  497.89us  634.91us  
volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
                       1.15%  157.52ms      4822  32.667us  3.6480us  241.02us  
void nhwcToNchwKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, 
float const *, __half*, float, float)
                       1.10%  150.19ms       502  299.18us  141.79us  431.45us  
volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
                       1.09%  148.41ms      1800  82.448us  41.151us  160.13us  
void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, 
int=7>(float, float, float, float, cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=7>, 
__half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, 
int=2, int=7>, cudnnTensorStruct*, float const *, float*, float const *, float 
const , float const , float, cudnn::reduced_divisor, int, float*, 
cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, 
cudnnStatus_t*, bool)
                       0.97%  132.01ms       332  397.62us  143.10us  7.1807ms  
volta_cgemm_32x32_tn
                       0.95%  129.19ms       704  183.51us  124.00us  390.24us  
turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_interior_nhwc_tn_v1
                       0.83%  113.92ms       101  1.1279ms  1.0902ms  1.5125ms  
volta_fp16_scudnn_fp16_128x64_relu_medium_nn_v1
                       0.80%  109.48ms       301  363.71us  354.72us  369.98us  
turing_fp16_s1688cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1
                       0.80%  109.22ms       216  505.65us  48.575us  9.9029ms  
volta_gcgemm_32x32_nt
   
   ```
   

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to the sender.