rahul003 commented on issue #9774: mx.io.ImageRecordIter does not respect dtype argument
URL: https://github.com/apache/incubator-mxnet/issues/9774#issuecomment-366835890
 
 
   But for ResNet-110 on CIFAR-10, fp16 is much slower. Do you see anything fishy here?
There are barely any operations with s884 in their names, and none of the top kernels have
it, so the Tensor Cores are barely being used. Would fp16 then not help us with small
networks/models? (A rough way to tally the Tensor Core share of these profiles is sketched
after the dumps below.)
   
   ```
   fp16
    GPU activities:   64.80%  28.7596s     87602  328.30us  24.128us  444.10us  
void cudnn::detail::wgrad_alg0_engine<__half, int=512, int=6, int=5, int=3, 
int=3, int=3, bool=1, int=512>(int, int, int, __half const *, int, 
cudnn::detail::wgrad_alg0_engine<__half, int=512, int=6, int=5, int=3, int=3, 
int=3, bool=1, int=512>*, __half const , kernel_grad_params, int, float, int, 
int, int, int)
                      11.44%  5.07525s    120474  42.127us  27.776us  70.752us  
void cudnn::winograd::winograd3x3Kernel<__half, float, int=4, int=1, int=8, 
bool=0>(cudnn::maxwell::winograd::KernelParams)
                       9.65%  4.28153s     61959  69.102us  66.112us  88.128us  
void cudnn::winograd::winograd3x3Kernel<__half, float, int=2, int=2, int=8, 
bool=0>(cudnn::maxwell::winograd::KernelParams)
                       1.90%  845.06ms    182434  4.6320us  3.4240us  12.256us  
void cudnn::winograd::generateWinogradTilesKernel<int=0, __half, 
float>(cudnn::winograd::GenerateWinogradTilesParams<__half, float>)
                       1.40%  621.33ms     29716  20.909us  19.744us  26.400us  
void cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, 
int=14>(float, float, float, float, cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, int=2, int=14>, 
__half2 const , cudnn::detail::bn_bw_1C11_singleread_fp16<int=512, int=1, 
int=2, int=14>, cudnnTensorStruct*, float const *, float*, float const *, float 
const , float const , float, cudnn::reduced_divisor, int, float*, 
cudnn::detail::bnBwPersistentState*, int, float, float, float, int, float, 
cudnnStatus_t*, bool)
                       1.39%  615.34ms     85238  7.2190us  3.8080us  11.776us  
void cudnn::detail::activation_bw_4d_kernel<__half, float, int=128, int=1, 
int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, 
bool=0>>(cudnnTensorStruct, __half const *, __half const , 
cudnn::detail::activation_bw_4d_kernel<__half, float, int=128, int=1, int=4, 
cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, bool=0>>, __half const 
, cudnnTensorStruct*, float, cudnnTensorStruct*, int, cudnnTensorStruct*)
                       1.27%  564.01ms     29716  18.980us  11.648us  22.080us  
void cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, 
int=20>(cudnnTensorStruct, __half2 const *, 
cudnn::detail::bn_fw_tr_1C11_singleread_fp16<int=512, int=1, int=2, int=20>, 
cudnnTensorStruct*, float const *, float const , float, float, float*, float 
const *, float const *, float const *, float, float, cudnn::reduced_divisor, 
int, float*, cudnn::detail::bnFwPersistentState*, int, float, float, float, 
int, float, float, cudnnStatus_t*, bool)
                       1.01%  446.51ms    102351  4.3620us  3.1360us  11.296us  
void cudnn::detail::activation_fw_4d_kernel<__half, float, int=128, int=1, 
int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, 
bool=0>>(cudnnTensorStruct, __half const *, 
cudnn::detail::activation_fw_4d_kernel<__half, float, int=128, int=1, int=4, 
cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, bool=0>>, 
cudnnTensorStruct*, float, cudnnTensorStruct*, int, cudnnTensorStruct*)
   
   API calls:   37.66%  44.8500s   1042145  43.036us  5.0250us  2.6695ms  
cudaStreamSynchronize
                      25.68%  30.5900s   1391180  21.988us  6.6070us  12.689ms  
cudaLaunch
                      11.41%  13.5923s    266335  51.034us  7.7360us  9.7191ms  
cudaMemcpy2DAsync
         
   
   ```
   ```
   fp32
   GPU activities:   29.29%  6.95644s     87602  79.409us  19.072us  133.28us  
void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, 
int=3, int=3, bool=1, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, 
int=3, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                      16.85%  4.00136s     87609  45.672us  36.960us  71.200us  
void cudnn::winograd::winograd3x3Kernel<float, float, int=4, int=1, int=8, 
bool=0>(cudnn::maxwell::winograd::KernelParams)
                       9.11%  2.16456s     28155  76.879us  72.737us  92.384us  
void cudnn::winograd::winograd3x3Kernel<float, float, int=2, int=2, int=8, 
bool=0>(cudnn::maxwell::winograd::KernelParams)
                       8.96%  2.12819s     33807  62.951us  22.560us  65.729us  
volta_scudnn_128x32_relu_small_nn_v1
                       4.66%  1.10676s     86020  12.866us  7.4880us  16.224us  
void cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, 
int=2, int=0>(cudnnTensorStruct, float const *, 
cudnn::detail::bn_fw_tr_1C11_singleread<float, int=512, bool=1, int=1, int=2, 
int=0>, cudnnTensorStruct*, float const *, float const , float, float, float*, 
float const *, float const *, float const *, float, float, 
cudnn::reduced_divisor, int, float*, cudnn::detail::bnFwPersistentState*, int, 
float, float, float, int, float, float, cudnnStatus_t*, bool)
                       4.43%  1.05193s     86020  12.228us  7.5520us  24.448us  
void cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, 
int=0>(float, float, float, float, cudnnTensorStruct, float const *, 
cudnn::detail::bn_bw_1C11_singleread<float, int=512, bool=1, int=1, int=2, 
int=0>, float const , cudnn::detail::bn_bw_1C11_singleread<float, int=512, 
bool=1, int=1, int=2, int=0>, cudnnTensorStruct*, float const *, float*, float 
const *, float const , float const , float, cudnn::reduced_divisor, int, 
float*, cudnn::detail::bnBwPersistentState*, int, float, float, float, int, 
float, cudnnStatus_t*, bool)
                       3.24%  769.36ms     85238  9.0260us  4.0960us  16.224us  
void cudnn::detail::activation_bw_4d_kernel<float, float, int=128, int=1, 
int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, 
bool=0>>(cudnnTensorStruct, float const *, float const , 
cudnn::detail::activation_bw_4d_kernel<float, float, int=128, int=1, int=4, 
cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, bool=0>>, float const 
, cudnnTensorStruct*, float, cudnnTensorStruct*, int, cudnnTensorStruct*)
                       2.84%  675.48ms    102351  6.5990us  3.9360us  13.344us  
void cudnn::detail::activation_fw_4d_kernel<float, float, int=128, int=1, 
int=4, cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, 
bool=0>>(cudnnTensorStruct, float const *, 
cudnn::detail::activation_fw_4d_kernel<float, float, int=128, int=1, int=4, 
cudnn::detail::relu_func<float, cudnnNanPropagation_t=0, bool=0>>, 
cudnnTensorStruct*, float, cudnnTensorStruct*, int, cudnnTensorStruct*)
                       2.80%  665.55ms     32869  20.248us  19.680us  39.808us  
volta_sgemm_128x64_nn
                       2.66%  631.06ms    116704  5.4070us  4.6400us  22.592us  
void cudnn::winograd::generateWinogradTilesKernel<int=0, float, 
float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)
                       2.12%  503.18ms    261188  1.9260us  1.6640us  14.976us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_12SGDMomKernelEJPfS4_S4_S4_fffffNS_9OpReqTypeEEEEviDpT0_
                       1.56%  370.84ms    262127  1.4140us  1.2800us  12.384us  
[CUDA memcpy DtoD]
                       1.41%  335.58ms     32873  10.208us  8.2240us  18.688us  
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, 
float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
                       1.30%  308.23ms     32873  9.3760us  8.5760us  23.360us  
void cudnn::winograd_nonfused::winogradForwardData4x4<float, 
float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
                       1.22%  289.39ms    203358  1.4230us  1.3120us  12.224us  
[CUDA memset]
                       1.21%  286.42ms     42228  6.7820us  3.5200us  13.312us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_3SumEJPfNS_9OpReqTypeES4_S4_EEEviDpT0_
                       1.19%  283.47ms     50706  5.5900us  2.9440us  12.352us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPfS7_S7_EEEviDpT0_
                       1.03%  244.04ms    172374  1.4150us  1.1840us  23.425us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS1_10set_to_intILi0EEELi1EEEJPfEEEviDpT0_
   
   API calls:   30.01%  23.7375s   1042031  22.780us  5.0120us  2.8708ms  
cudaStreamSynchronize
                      26.13%  20.6708s   1452568  14.230us  5.9830us  17.019ms  
cudaLaunch
                      12.79%  10.1212s    267274  37.868us  8.3050us  10.654ms  
cudaMemcpy2DAsync
        
   ```
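
   For reference, here is a rough Python sketch of how one could tally what share of GPU
time goes to Tensor Core kernels in an `nvprof` summary like the ones above (saved,
unwrapped, to a text file). This is only an illustrative helper, not anything shipped with
MXNet or nvprof; the column layout it expects and the heuristic of matching "884" in kernel
names are assumptions.

   ```python
   # Rough helper (an assumption, not part of MXNet/nvprof): parse an unwrapped
   # `nvprof` GPU summary and report the share of time spent in Tensor Core
   # kernels, identified here simply by "884" in the kernel name (e.g. *_s884cudnn_*).
   import re
   import sys

   # percentage, total time + unit, call count, avg/min/max, kernel name
   ROW = re.compile(r"([\d.]+)%\s+([\d.]+)(s|ms|us|ns)\s+\d+\s+\S+\s+\S+\s+\S+\s+(.+)")
   UNIT = {"s": 1.0, "ms": 1e-3, "us": 1e-6, "ns": 1e-9}

   def tensor_core_share(path):
       in_gpu_section = False
       tc_time = total_time = 0.0
       with open(path) as f:
           for line in f:
               if "GPU activities:" in line:
                   in_gpu_section = True        # kernel rows start on this line
               elif "API calls:" in line:
                   in_gpu_section = False       # stop before the CUDA API rows
               if not in_gpu_section:
                   continue
               m = ROW.search(line)
               if not m:
                   continue
               seconds = float(m.group(2)) * UNIT[m.group(3)]
               total_time += seconds
               if "884" in m.group(4):          # s884/h884 => Tensor Core kernel
                   tc_time += seconds
       return tc_time, total_time

   if __name__ == "__main__":
       tc, total = tensor_core_share(sys.argv[1])
       if total:
           print("Tensor Core kernels: %.3fs of %.3fs (%.1f%%)" % (tc, total, 100 * tc / total))
       else:
           print("no kernel rows parsed")
   ```

   Running it over the two dumps above would make the "barely any s884" observation
quantitative rather than eyeballed.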
