oleg-trott commented on issue #17665: No speedup from using FP16 (4 times slower than PyTorch) URL: https://github.com/apache/incubator-mxnet/issues/17665#issuecomment-592804066 @ptrendx Profiler output for the `resnet50_v2` code (as posted above, FP16, but no multi-precision): ``` Type Time(%) Time Calls Avg Min Max Name GPU activities: 36.53% 8.03789s 1400 5.7413ms 262.53us 18.796ms _ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_14MultiSGDKernelIN7mshadow4half6half_tELb0ELb0EEEJNS0_19MultiSGDKernelParamIS6_S6_EENS_9OpReqTypeEEEEviDpT0_ 9.06% 1.99393s 20197 98.723us 2.1440us 850.04us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float) 7.33% 1.61244s 5022 321.08us 128.70us 1.1131ms turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1 3.87% 850.75ms 5000 170.15us 17.888us 820.19us void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=13, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=2>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray, bool) 3.25% 714.28ms 2400 297.62us 93.055us 845.85us void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>(float, float, float, float, cudnnTensorStruct, __half2 const *, cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, __half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, cudnnTensorStruct*, float const *, float*, float const *, float const , float const , float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool) 3.18% 700.83ms 2008 349.02us 253.15us 661.25us volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1 3.03% 666.57ms 3200 208.30us 58.976us 700.03us 
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op8identityELi1EEEJPN7mshadow4half6half_tEPKS9_EEEviDpT0_ 2.86% 629.54ms 5000 125.91us 11.968us 642.81us void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=8, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=1>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray, bool) 2.01% 443.40ms 2500 177.36us 57.855us 485.28us void cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>(cudnnTensorStruct, __half2 const *, cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>, cudnnTensorStruct*, float const *, float const , float, float, float*, float const *, float const *, float const *, float, float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool) 2.01% 441.25ms 1507 292.80us 150.34us 406.05us turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1 1.86% 409.88ms 1600 256.18us 69.536us 761.85us _ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_3SumEJPN7mshadow4half7half2_tENS_9OpReqTypeES7_S7_EEEviDpT0_ 1.81% 397.44ms 1600 248.40us 72.864us 632.19us _ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPN7mshadow4half6half_tESA_SA_EEEviDpT0_ 1.77% 388.79ms 1216 319.73us 274.40us 1.0458ms volta_sgemm_128x64_nt 1.65% 362.15ms 1105 327.74us 278.27us 390.75us turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_small_nhwc_tn_v1 1.37% 300.68ms 4222 71.218us 10.208us 640.96us void nhwcToNchwKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float) 1.14% 250.31ms 1304 191.96us 163.36us 333.57us turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc_tn_v1 0.92% 202.14ms 100 2.0214ms 1.9949ms 2.0394ms void 
cudnn::detail::pooling_bw_kernel_max<__half, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>(cudnnTensorStruct, __half const *, cudnn::detail::pooling_bw_kernel_max<__half, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half const , cudnn::detail::pooling_bw_kernel_max<__half, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half const , cudnn::detail::pooling_bw_kernel_max<__half, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) 0.91% 200.70ms 5022 39.963us 3.6480us 626.27us void nhwcToNchwKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, float const *, __half*, float, float) 0.90% 199.08ms 802 248.23us 238.33us 957.47us volta_fp16_s884cudnn_fp16_128x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1 0.86% 189.25ms 303 624.60us 558.97us 907.48us dgrad_1x1_stride_2x2 ``` FP16 + multi-precision looks very different: ``` Type Time(%) Time Calls Avg Min Max Name GPU activities: 11.37% 1.55330s 19997 77.676us 2.1120us 440.13us void nchwToNhwcKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float) 10.49% 1.43320s 4822 297.22us 128.13us 1.1295ms turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1 6.14% 838.84ms 5000 167.77us 17.887us 817.88us void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=13, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=2>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray, bool) 4.90% 670.07ms 2400 279.19us 92.895us 719.13us void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>(float, float, float, float, cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, __half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, cudnnTensorStruct*, float const *, float*, float const *, float const , float const , float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool) 4.73% 646.86ms 430 1.5043ms 734.72us 4.1373ms void cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>(int, int, int, __half const *, int, cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=8, int=3, int=3, int=5, bool=1, int=512>*, __half const , kernel_grad_params, int, float, int, int, int, int) 4.61% 629.83ms 5000 125.97us 12.063us 626.20us void op_generic_tensor_kernel<int=2, __half, float, __half, int=256, cudnnGenericOp_t=8, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, int=1>(cudnnTensorStruct, __half*, cudnnTensorStruct, __half const *, cudnnTensorStruct, __half const *, float, float, float, float, dimArray, reducedDivisorArray, bool) 4.39% 600.18ms 1608 373.25us 261.12us 666.72us volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1 4.38% 597.94ms 3200 186.86us 58.720us 412.00us _ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op8identityELi1EEEJPN7mshadow4half6half_tEPKS9_EEEviDpT0_ 3.25% 443.86ms 2500 177.55us 57.760us 495.61us void cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>(cudnnTensorStruct, __half2 const *, cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>, cudnnTensorStruct*, float const *, float const , float, float, float*, float const *, float const *, float const *, float, float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, int, float, float, cudnnStatus_t*, bool) 3.22% 439.58ms 1507 291.69us 149.92us 411.23us 
turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_filter1x1_stg8_interior_nchw_nn_v1 2.91% 397.03ms 1600 248.15us 72.736us 637.12us _ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPN7mshadow4half6half_tESA_SA_EEEviDpT0_ 2.89% 394.91ms 1600 246.82us 69.727us 565.18us _ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_3SumEJPN7mshadow4half7half2_tENS_9OpReqTypeES7_S7_EEEviDpT0_ 2.63% 358.72ms 1105 324.64us 276.51us 388.96us turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_small_nhwc_tn_v1 2.20% 300.11ms 4222 71.081us 10.048us 432.00us void nhwcToNchwKernel<__half, __half, float, bool=1, bool=0>(int, int, int, int, __half const *, __half*, float, float) 1.82% 248.61ms 1304 190.65us 163.55us 329.79us turing_fp16_s1688cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc_tn_v1 1.74% 238.11ms 716 332.55us 273.31us 1.0432ms volta_sgemm_128x64_nt 1.48% 202.58ms 100 2.0258ms 1.9909ms 2.0511ms void cudnn::detail::pooling_bw_kernel_max<__half, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>(cudnnTensorStruct, __half const *, cudnn::detail::pooling_bw_kernel_max<__half, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half const , cudnn::detail::pooling_bw_kernel_max<__half, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half const , cudnn::detail::pooling_bw_kernel_max<__half, float, cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, float) 1.45% 198.56ms 802 247.58us 238.78us 256.09us volta_fp16_s884cudnn_fp16_128x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1 1.45% 198.10ms 412 480.82us 261.02us 1.2413ms volta_fp16_scudnn_fp16_128x128_stridedB_interior_nn_v1 1.39% 190.39ms 303 628.36us 556.54us 708.03us dgrad_1x1_stride_2x2 1.28% 175.54ms 301 583.18us 555.87us 597.08us 
volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1 1.16% 158.07ms 302 523.41us 497.89us 634.91us volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1 1.15% 157.52ms 4822 32.667us 3.6480us 241.02us void nhwcToNchwKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, float const *, __half*, float, float) 1.10% 150.19ms 502 299.18us 141.79us 431.45us volta_fp16_s884cudnn_fp16_256x64_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1 1.09% 148.41ms 1800 82.448us 41.151us 160.13us void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=7>(float, float, float, float, cudnnTensorStruct, __half2 const *, cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=7>, __half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=7>, cudnnTensorStruct*, float const *, float*, float const *, float const , float const , float, cudnn::reduced_divisor, int, float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, cudnnStatus_t*, bool) 0.97% 132.01ms 332 397.62us 143.10us 7.1807ms volta_cgemm_32x32_tn 0.95% 129.19ms 704 183.51us 124.00us 390.24us turing_fp16_s1688cudnn_fp16_128x128_ldg8_relu_f2f_exp_interior_nhwc_tn_v1 0.83% 113.92ms 101 1.1279ms 1.0902ms 1.5125ms volta_fp16_scudnn_fp16_128x64_relu_medium_nn_v1 0.80% 109.48ms 301 363.71us 354.72us 369.98us turing_fp16_s1688cudnn_fp16_256x64_ldg8_relu_f2f_exp_small_nhwc_tn_v1 0.80% 109.22ms 216 505.65us 48.575us 9.9029ms volta_gcgemm_32x32_nt ```
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]. With regards, Apache Git Services
