rahul003 commented on issue #9774: mx.io.ImageRecordIter does not respect dtype 
argument
URL: 
https://github.com/apache/incubator-mxnet/issues/9774#issuecomment-366823518
 
 
   Hey @KellenSunderland 
   
   I ran ResNet-50 with ImageNet and got about a 70% speedup. Some of the top kernels
don't appear to use s884 (Tensor Core) instructions, though some operations do. Can I improve the speed further?
   Here's a log of the profiler
   ```
    GPU activities:    7.58%  179.746s    965213  186.22us  1.3440us  30.955ms  
[CUDA memcpy HtoD]
                       6.60%  156.366s   1547648  101.03us  2.0480us  711.93us  
void nchwToNhwcKernel<__half, __half, float, bool=1>(int, int, int, int, __half 
const *, __half*, float, float)
                       5.84%  138.416s    108321  1.2778ms  314.97us  2.4579ms  
volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1
                       5.02%  119.036s    336336  353.92us  88.063us  1.2733ms  
void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, 
int=14>(float, float, float, float, cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, 
__half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, 
int=2, int=14>, cudnnTensorStruct*, float const *, float*, float const *, float 
const , float const , float, cudnn::reduced_divisor, int, float*, 
cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, 
cudnnStatus_t*, bool)
                       4.83%  114.400s    400400  285.71us  33.088us  1.2040ms  
void cudnn::detail::activation_bw_4d_kernel<__half, float, int=128, int=1, 
int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, 
bool=0>>(cudnnTensorStruct, __half const *, __half const , 
cudnn::detail::activation_bw_4d_kernel<__half, float, int=128, int=1, int=4, 
cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, bool=0>>, __half const 
, cudnnTensorStruct*, float, cudnnTensorStruct*, int, cudnnTensorStruct*)
                       4.72%  111.789s     40190  2.7815ms  1.4254ms  8.5978ms  
void cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=8, int=3, 
int=3, int=5, bool=1, int=512>(int, int, int, __half const *, int, 
cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=1, int=512>*, __half const , kernel_grad_params, int, float, int, 
int, int, int)
                       4.71%  111.533s     16040  6.9534ms  4.7594ms  10.580ms  
void cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=6, int=7, int=5, int=4, 
int=5, bool=1, bool=1>(int, int, int, __half const *, int, __half const , int, 
cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=6, int=7, int=5, int=4, int=5, 
bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
                       4.14%  98.2065s    304388  322.64us  198.69us  723.45us  
volta_s884cudnn_fp16_128x128_ldg8_wgrad_exp_interior_nhwc_nt_v1
                       3.60%  85.2418s     56116  1.5190ms  625.92us  2.4792ms  
volta_fp16_scudnn_fp16_128x128_stridedB_interior_nn_v1
                       3.54%  83.8923s    336336  249.43us  65.888us  908.99us  
void cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, 
int=20>(cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>, 
cudnnTensorStruct*, float const *, float const , float, float, float*, float 
const *, float const *, float const *, float, float, cudnn::reduced_divisor, 
int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, 
int, float, float, cudnnStatus_t*, bool)
                       3.36%  79.5726s     40168  1.9810ms  801.53us  10.105ms  
void cudnn::detail::dgrad_engine<__half, int=128, int=6, int=7, int=3, int=3, 
int=5, bool=1>(int, int, int, __half const *, int, __half const , int, 
cudnn::detail::dgrad_engine<__half, int=128, int=6, int=7, int=3, int=3, int=5, 
bool=1>*, kernel_grad_params, int, int, float, int, int, int)
                       2.96%  70.1196s    416400  168.39us  22.303us  684.35us  
void cudnn::detail::activation_fw_4d_kernel<__half, float, int=128, int=1, 
int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, 
bool=0>>(cudnnTensorStruct, __half const *, 
cudnn::detail::activation_fw_4d_kernel<__half, float, int=128, int=1, int=4, 
cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, bool=0>>, 
cudnnTensorStruct*, float, cudnnTensorStruct*, int, cudnnTensorStruct*)
                       2.89%  68.4710s     16082  4.2576ms  1.6871ms  6.6937ms  
void cudnn::detail::dgrad_engine<__half, int=512, int=6, int=5, int=3, int=3, 
int=3, bool=1>(int, int, int, __half const *, int, __half const , int, 
cudnn::detail::dgrad_engine<__half, int=512, int=6, int=5, int=3, int=3, int=3, 
bool=1>*, kernel_grad_params, int, int, float, int, int, int)
                       2.86%  67.7144s    174945  387.06us  290.78us  902.65us  
volta_fp16_s884cudnn_fp16_256x128_ldg8_relu_f2f_exp_interior_nhwc2nchw_tn_v1
                       2.81%  66.5956s      8008  8.3161ms  8.2356ms  8.5289ms  
void cudnn::detail::bn_bw_1C11_kernel_new<__half, float, float2, int=512, 
bool=1, int=1>(float, cudnn::detail::bn_bw_1C11_kernel_new<__half, float, 
float2, int=512, bool=1, int=1>, cudnn::detail::bn_bw_1C11_kernel_new<__half, 
float, float2, int=512, bool=1, int=1>, 
cudnn::detail::bn_bw_1C11_kernel_new<__half, float, float2, int=512, bool=1, 
int=1>, cudnnTensorStruct, __half const *, float, __half const , float, 
cudnnTensorStruct*, cudnn::detail::bn_bw_1C11_kernel_new<__half, float, float2, 
int=512, bool=1, int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<__half, 
float, float2, int=512, bool=1, int=1>*, 
cudnn::detail::bn_bw_1C11_kernel_new<__half, float, float2, int=512, bool=1, 
int=1> const *, cudnn::detail::bn_bw_1C11_kernel_new<__half, float, float2, 
int=512, bool=1, int=1> const , cudnn::detail::bn_bw_1C11_kernel_new<__half, 
float, float2, int=512, bool=1, int=1> const , 
cudnn::detail::bn_bw_1C11_kernel_new<__half, float, float2, int=512, bool=1, 
int=1>)
                       2.71%  64.1674s     16164  3.9698ms  1.2205ms  4.2252ms  
volta_fp16_scudnn_fp16_128x128_stridedB_splitK_interior_nn_v1
                       2.64%  62.5345s      8008  7.8090ms  7.7552ms  8.5208ms  
void cudnn::detail::bn_fw_tr_1C11_kernel_new<__half, float, int=512, bool=1, 
int=1>(cudnnTensorStruct, __half const *, 
cudnn::detail::bn_fw_tr_1C11_kernel_new<__half, float, int=512, bool=1, int=1>, 
cudnnTensorStruct*, float const *, float const , cudnnTensorStruct*, 
cudnnTensorStruct*, cudnnTensorStruct**, float const *, float const *, float 
const *, cudnnTensorStruct*, cudnnTensorStruct*)
                       2.28%  53.9698s    954403  56.548us  1.2160us  256.37ms  
[CUDA memcpy DtoH]
                       2.25%  53.3050s    133248  400.04us  115.33us  954.87us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPN7mshadow4half7half2_tESA_SA_EEEviDpT0_
                       2.21%  52.3835s    128128  408.84us  114.27us  953.12us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_3SumEJPN7mshadow4half7half2_tENS_9OpReqTypeES7_S7_EEEviDpT0_
                       2.09%  49.4748s     99967  494.91us  437.95us  612.13us  
volta_fp16_s884cudnn_fp16_128x128_ldg8_relu_f2f_exp_small_nhwc2nchw_tn_v1
                       1.91%  45.1522s      8016  5.6328ms  5.5531ms  5.9249ms  
void cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=6, int=6, int=5, int=4, 
int=4, bool=1, bool=1>(int, int, int, __half const *, int, __half const , int, 
cudnn::detail::dgrad2d_alg1_1<__half, int=0, int=6, int=6, int=5, int=4, int=4, 
bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
                       1.75%  41.4783s      8064  5.1436ms  856.09us  7.4089ms  
void cudnn::detail::wgrad_alg0_engine<__half, int=512, int=6, int=5, int=3, 
int=3, int=3, bool=1, int=512>(int, int, int, __half const *, int, 
cudnn::detail::wgrad_alg0_engine<__half, int=512, int=6, int=5, int=3, int=3, 
int=3, bool=1, int=512>*, __half const , kernel_grad_params, int, float, int, 
int, int, int)
                       1.69%  40.0064s    112152  356.72us  277.31us  864.38us  
volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_interior_nhwc2nchw_tt_v1
                       1.43%  33.8762s     48075  704.65us  353.79us  870.81us  
volta_s884cudnn_fp16_64x256_sliced1x4_ldg8_wgrad_exp_interior_nhwc_nt_v1
                       1.37%  32.5869s      8092  4.0271ms  1.8701ms  12.452ms  
void cudnn::detail::dgrad_engine<__half, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=1>(int, int, int, __half const *, int, __half const , int, 
cudnn::detail::dgrad_engine<__half, int=128, int=6, int=8, int=3, int=3, int=5, 
bool=1>*, kernel_grad_params, int, int, float, int, int, int)
                       1.31%  31.0537s     64080  484.61us  441.79us  619.93us  
volta_fp16_s884cudnn_fp16_256x128_ldg8_dgrad_f2f_exp_small_nhwc2nchw_tt_v1
                       1.23%  29.2180s      8052  3.6287ms  1.4834ms  3.8291ms  
void cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=7, int=3, 
int=3, int=5, bool=1, int=512>(int, int, int, __half const *, int, 
cudnn::detail::wgrad_alg0_engine<__half, int=128, int=6, int=7, int=3, int=3, 
int=5, bool=1, int=512>*, __half const , kernel_grad_params, int, float, int, 
int, int, int)
                       1.16%  27.5703s    112686  244.66us  1.3750us  749.50us  
void scalePackedTensor_kernel<__half, float>(cudnnTensor4dStruct, __half*, 
float)
                       1.02%  24.0620s      8008  3.0047ms  2.8931ms  3.2269ms  
void cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, 
bool=0>(cudnnTensorStruct, __half const *, 
cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half 
const , cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, __half 
const , cudnn::detail::pooling_bw_kernel_max<__half, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, bool=0>, 
cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, 
cudnn::reduced_divisor, float)
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to