KellenSunderland commented on issue #14684: When setting 
MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION no speedup observed
URL: 
https://github.com/apache/incubator-mxnet/issues/14684#issuecomment-482674372
 
 
   nvprof summary when autotuning is manually disabled via an environment variable:
   ```
   OK
   ==14890== Profiling application: python -m nose --nocapture 
tests/python/tensorrt/test_resnet18.py
   ==14890== Profiling result:
               Type  Time(%)      Time     Calls       Avg       Min       Max  
Name
    GPU activities:   14.18%  5.09395s      2800  1.8193ms  1.2814ms  3.9355ms  
volta_s884cudnn_fp16_128x128_ldg8_relu_exp_interior_nhwc_tn_v1
                      12.01%  4.31486s      4000  1.0787ms  99.232us  4.0081ms  
void nhwcToNchwKernel<float, float, float, bool=1, bool=0>(int, int, int, int, 
float const *, float*, float, float)
                      11.85%  4.25536s      5100  834.38us  107.01us  3.9957ms  
void cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, 
int=1>(float, cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, 
int=1>, cudnnTensorStruct, float const *, float, cudnnTensorStruct*, float, 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const *, 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1>)
                      11.17%  4.01310s      5000  802.62us  103.26us  3.6340ms  
void op_generic_tensor_kernel<int=2, float, float, float, int=256, 
cudnnGenericOp_t=8, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, 
int=1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, 
cudnnTensorStruct, float const *, float, float, float, float, dimArray, 
reducedDivisorArray)
                       9.34%  3.35664s      7800  430.34us  1.9840us  3.4739ms  
void nchwToNhwcKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, 
float const *, __half*, float, float)
                       9.07%  3.25925s      1600  2.0370ms  586.53us  5.1388ms  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPfS7_S7_EEEviDpT0_
                       8.84%  3.17497s       100  31.750ms  31.199ms  41.755ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_medium_nhwc_tn_v1
                       4.42%  1.58918s      3000  529.73us  334.53us  1.5099ms  
volta_gcgemm_32x32_nt
                       3.01%  1.07966s       700  1.5424ms  1.4746ms  1.8325ms  
volta_sgemm_128x64_nn
                       2.50%  898.49ms       300  2.9950ms  2.9387ms  3.9902ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_small_nhwc_tn_v1
                       1.92%  689.92ms       400  1.7248ms  1.3669ms  2.7854ms  
volta_s884cudnn_fp16_256x128_ldg8_relu_exp_interior_nhwc_tn_v1
                       1.66%  595.43ms       100  5.9543ms  5.8628ms  6.1521ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=512, int=6, 
int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       1.36%  487.73ms      1000  487.73us  203.01us  1.0285ms  
void cudnn::winograd_nonfused::winogradForwardData4x4<float, 
float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
                       1.33%  479.32ms       300  1.5977ms  1.5479ms  2.0199ms  
volta_sgemm_128x128_nn
                       1.27%  455.52ms      3000  151.84us  110.47us  316.39us  
void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       1.14%  409.97ms      1000  409.97us  175.14us  711.01us  
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, 
float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
                       1.10%  395.13ms       300  1.3171ms  649.35us  2.1332ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_interior_nhwc_tn_v1
                       0.98%  352.85ms      3000  117.62us  85.248us  376.96us  
void fft2d_c2r_32x32<float, bool=1, bool=0, unsigned int=0, bool=0, 
bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, 
float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
                       0.91%  326.59ms       100  3.2659ms  3.2229ms  3.5159ms  
volta_s884cudnn_fp16_256x128_ldg8_relu_exp_small_nhwc_tn_v1
                       0.69%  248.24ms       200  1.2412ms  1.8880us  3.1299ms  
void nchwToNhwcKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, 
float const *, __half*, float, float)
                       0.65%  235.05ms       100  2.3505ms  2.3323ms  2.5774ms  
void cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, int=0, 
bool=0>(cudnnTensorStruct, float const *, 
cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, int=0, bool=0>, 
cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, 
cudnn::reduced_divisor, float)
                       0.30%  109.00ms       200  545.01us  214.31us  1.2223ms  
void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.14%  48.678ms      1000  48.678us  7.9040us  182.91us  
void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, 
float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
                       0.08%  27.217ms       100  272.17us  267.97us  276.51us  
void cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::averpooling_func<float>, int=1, bool=0>(cudnnTensorStruct, float 
const *, cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::averpooling_func<float>, int=1, bool=0>, cudnnTensorStruct*, 
cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, 
float)
                       0.02%  8.8654ms       273  32.474us     832ns  893.22us  
[CUDA memcpy HtoD]
                       0.02%  7.4614ms         1  7.4614ms  7.4614ms  7.4614ms  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_19SampleUniformKernelIN7mshadow3gpuEEEJNS_6common6random13RandGeneratorIS5_fEEllllPfSB_SB_EEEviDpT0_
                       0.02%  5.8127ms      4000  1.4530us  1.1840us  3.0400us  
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                       0.01%  4.5088ms       100  45.087us  44.032us  46.016us  
volta_sgemm_64x32_sliced1x4_tn
                       0.00%  1.0191ms       100  10.190us  10.048us  10.944us  
[CUDA memcpy DtoH]
                       0.00%  390.31us       155  2.5180us     896ns  23.040us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS1_10set_to_intILi0EEELi1EEEJPfEEEviDpT0_
                       0.00%  234.15us       308     760ns     704ns  1.2160us  
[CUDA memset]
                       0.00%  179.52us       100  1.7950us  1.7280us  1.8560us  
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, int=8, 
mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=2, float>, float>, 
mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 
int=1, float>, float, int=2, int=1>, float>>(mshadow::gpu, long, 
mshadow::Shape<int=2>, int=2)
                       0.00%  122.34us       100  1.2230us  1.1840us  2.0480us  
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, int=8, 
mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=1, float>, float>, 
mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>>(mshadow::gpu, 
long, mshadow::Shape<int=2>, int=1)
                       0.00%  75.904us         4  18.976us  18.464us  19.776us  
mxnet::common::random::rand_generator_seed_kernel(curandStatePhilox4_32_10*, 
int, unsigned int)
         API calls:   83.18%  35.0825s      1719  20.409ms  2.4440us  81.612ms  
cudaStreamSynchronize
                       6.52%  2.75084s         4  687.71ms  53.991us  1.37538s  
cudaStreamCreate
                       5.78%  2.43728s        22  110.79ms  10.430us  1.21843s  
cudaStreamCreateWithFlags
                       3.36%  1.41612s        24  59.005ms     460ns  380.36ms  
cudaFree
                       0.78%  327.24ms     45660  7.1660us  4.5180us  321.35us  
cudaLaunchKernel
                       0.18%  74.622ms       428  174.35us  85.580us  337.30us  
cudaMemGetInfo
                       0.08%  32.204ms       476  67.656us  7.3130us  1.1096ms  
cudaMalloc
                       0.04%  18.140ms       361  50.249us  5.2090us  917.33us  
cudaMemcpy2DAsync
                       0.02%  6.8661ms     41696     164ns      91ns  184.63us  
cudaGetLastError
                       0.01%  5.0846ms      6200     820ns     531ns  182.36us  
cudaStreamWaitEvent
                       0.01%  4.1680ms      5598     744ns     300ns  3.5560us  
cudaDeviceGetAttribute
                       0.01%  3.2336ms       308  10.498us  5.6600us  178.83us  
cudaMemsetAsync
                       0.01%  3.1566ms         4  789.14us  439.39us  1.0725ms  
cudaGetDeviceProperties
                       0.01%  2.2504ms      2102  1.0700us     521ns  203.18us  
cudaEventRecord
                       0.00%  2.0044ms      1524  1.3150us     551ns  51.497us  
cudaSetDevice
                       0.00%  1.8772ms       375  5.0050us     130ns  298.67us  
cuDeviceGetAttribute
                       0.00%  1.3166ms       216  6.0950us     511ns  493.32us  
cudaEventCreateWithFlags
                       0.00%  1.2994ms       912  1.4240us     470ns  67.767us  
cudaFuncSetAttribute
                       0.00%  689.62us       448  1.5390us     350ns  7.4940us  
cudaGetDevice
                       0.00%  653.04us         2  326.52us  32.952us  620.09us  
cudaHostAlloc
                       0.00%  590.76us         4  147.69us  120.24us  223.85us  
cuDeviceTotalMem
                       0.00%  556.15us       100  5.5610us  4.0880us  8.4350us  
cudaBindTexture
                       0.00%  419.63us       206  2.0370us  1.2320us  4.5490us  
cudaEventCreate
                       0.00%  411.75us      1960     210ns     160ns     701ns  
cudaPeekAtLastError
                       0.00%  262.82us        12  21.901us  8.9360us  62.768us  
cudaMemcpy
                       0.00%  255.52us       100  2.5550us  2.1640us  5.8610us  
cudaEventQuery
                       0.00%  233.79us         4  58.447us  43.963us  75.261us  
cuDeviceGetName
                       0.00%  211.93us       200  1.0590us     882ns  2.2940us  
cudaEventDestroy
                       0.00%  90.850us         8  11.356us  9.8280us  12.844us  
cudaStreamCreateWithPriority
                       0.00%  69.841us       100     698ns     651ns  1.0520us  
cudaUnbindTexture
                       0.00%  18.905us         2  9.4520us  1.8630us  17.042us  
cudaHostGetDevicePointer
                       0.00%  7.3340us         4  1.8330us     311ns  6.0010us  
cudaGetDeviceCount
                       0.00%  3.8870us         6     647ns     290ns  1.5430us  
cuDeviceGetCount
                       0.00%  2.6950us         3     898ns     781ns  1.1120us  
cuInit
                       0.00%  2.4950us         5     499ns     300ns     811ns  
cuDeviceGet
                       0.00%  2.3650us         2  1.1820us     982ns  1.3830us  
cudaDeviceGetStreamPriorityRange
                       0.00%  2.1840us         1  2.1840us  2.1840us  2.1840us  
cuDeviceGetPCIBusId
                       0.00%  1.6620us         3     554ns     360ns     691ns  
cuDriverGetVersion
                       0.00%  1.2130us         4     303ns     200ns     421ns  
cuDeviceGetUuid
   ```
   nvprof summary with some local code changes to autotuning that allow selection 
of the cuDNN mixed-math mode:
   ```
   OK
   ==15515== Profiling application: python -m nose --nocapture 
tests/python/tensorrt/test_resnet18.py
   ==15515== Profiling result:
               Type  Time(%)      Time     Calls       Avg       Min       Max  
Name
    GPU activities:   13.40%  5.05724s      2810  1.7997ms  1.2716ms  3.3043ms  
volta_s884cudnn_fp16_128x128_ldg8_relu_exp_interior_nhwc_tn_v1
                      11.26%  4.24892s      5100  833.12us  107.26us  3.6032ms  
void cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, 
int=1>(float, cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, 
int=1>, cudnnTensorStruct, float const *, float, cudnnTensorStruct*, float, 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const *, 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1>)
                      11.09%  4.18593s      4045  1.0348ms  3.7120us  4.2788ms  
void nhwcToNchwKernel<float, float, float, bool=1, bool=0>(int, int, int, int, 
float const *, float*, float, float)
                      10.64%  4.01643s      5000  803.29us  103.20us  3.4755ms  
void op_generic_tensor_kernel<int=2, float, float, float, int=256, 
cudnnGenericOp_t=8, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, 
int=1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, 
cudnnTensorStruct, float const *, float, float, float, float, dimArray, 
reducedDivisorArray)
                       9.40%  3.54724s      7921  447.83us  1.9840us  3.8538ms  
void nchwToNhwcKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, 
float const *, __half*, float, float)
                       8.64%  3.26151s      1600  2.0384ms  589.22us  5.4096ms  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPfS7_S7_EEEviDpT0_
                       8.42%  3.17879s       101  31.473ms  30.896ms  41.752ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_medium_nhwc_tn_v1
                       3.82%  1.44283s       308  4.6845ms  4.1789ms  5.8893ms  
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
                       2.84%  1.07107s       704  1.5214ms  1.4529ms  2.1326ms  
volta_sgemm_128x64_nn
                       2.53%  952.72ms       503  1.8941ms  1.3666ms  2.9157ms  
volta_s884cudnn_fp16_256x128_ldg8_relu_exp_interior_nhwc_tn_v1
                       2.44%  919.53ms       306  3.0050ms  2.7753ms  3.3901ms  
volta_s884cudnn_fp16_256x128_ldg8_relu_exp_small_nhwc_tn_v1
                       1.34%  505.07ms      1016  497.12us  208.51us  2.0073ms  
void cudnn::winograd_nonfused::winogradForwardData4x4<float, 
float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
                       1.26%  476.33ms       304  1.5669ms  1.5136ms  3.2364ms  
volta_sgemm_128x128_nn
                       1.11%  419.77ms      1016  413.16us  173.38us  1.5986ms  
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, 
float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
                       1.06%  399.43ms       302  1.3226ms  651.11us  1.8372ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_interior_nhwc_tn_v1
                       0.99%  372.24ms       689  540.26us  34.848us  9.6280ms  
volta_gcgemm_32x32_nt
                       0.66%  248.94ms       203  1.2263ms  1.9840us  3.1277ms  
void nchwToNhwcKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, 
float const *, __half*, float, float)
                       0.64%  242.64ms        36  6.7400ms  1.6590ms  11.372ms  
void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, 
bool=1>*, kernel_grad_params, int, int, float, int, int, int)
                       0.62%  235.12ms       100  2.3512ms  2.3274ms  2.6129ms  
void cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, int=0, 
bool=0>(cudnnTensorStruct, float const *, 
cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, int=0, bool=0>, 
cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, 
cudnn::reduced_divisor, float)
                       0.47%  176.52ms        26  6.7892ms  3.7929ms  10.104ms  
volta_scudnn_128x128_stridedB_splitK_interior_nn_v1
                       0.43%  163.37ms       507  322.23us  147.42us  7.8177ms  
volta_gcgemm_64x64_nt
                       0.42%  158.08ms        28  5.6456ms  3.1871ms  7.4907ms  
void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, 
int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.37%  141.02ms        23  6.1315ms  661.22us  111.86ms  
turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
                       0.36%  134.98ms        26  5.1917ms  2.8959ms  6.6312ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=512, int=6, 
int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.26%  97.324ms         2  48.662ms  48.660ms  48.664ms  
void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, 
int=5, bool=0, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, 
bool=0, bool=1>*, kernel_grad_params, int, int, float, int, int)
                       0.26%  97.125ms         8  12.141ms  6.4080ms  21.061ms  
volta_scudnn_128x128_stridedB_splitK_xregs_large_nn_v1
                       0.25%  94.988ms        12  7.9156ms  4.5169ms  10.316ms  
void cudnn::detail::dgrad_engine<float, int=128, int=6, int=7, int=3, int=3, 
int=5, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, 
bool=1>*, kernel_grad_params, int, int, float, int, int, int)
                       0.25%  93.742ms         4  23.435ms  15.013ms  31.871ms  
void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, 
int=5, bool=1, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, 
bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
                       0.22%  82.904ms         6  13.817ms  12.607ms  14.446ms  
void pointwise_mult_and_sum_complex<float2, int=8, int=4>(float2*, float2*, 
float2*, int, int, int, int, int, float2)
                       0.19%  73.304ms         6  12.217ms  6.3788ms  19.340ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=5, 
int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.19%  72.452ms         8  9.0565ms  5.7770ms  11.804ms  
void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, 
int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, 
int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.19%  70.002ms         6  11.667ms  4.3561ms  16.232ms  
void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, 
int=3, int=3, bool=1, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, 
int=3, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.17%  65.375ms        13  5.0288ms  2.8111ms  6.3536ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, int=8, 
int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, 
int=8, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.16%  59.405ms        10  5.9405ms  3.4627ms  7.8461ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=6, 
int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=128, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.16%  58.491ms        18  3.2495ms  769.12us  6.1755ms  
void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, 
int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 
const *)
                       0.13%  50.357ms        86  585.55us  9.7930us  1.1498ms  
void fft1d_r2c_32<float, float, float2, bool=0, bool=0>(float2*, float const *, 
int, int3, int3, int2, int2)
                       0.13%  49.964ms        23  2.1723ms  178.11us  6.6909ms  
void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, 
cudnnTensor4dStruct, float const *, float*, int)
                       0.13%  48.234ms      1016  47.474us  3.0720us  187.94us  
void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, 
float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
                       0.12%  46.584ms        12  3.8820ms  2.8701ms  5.8556ms  
void DSE::vector_fft<int=0, int=1, int=256, int=16, int=16, int=1, float, 
float, float2>(float2*, float2, int, int3, float2*)
                       0.12%  43.493ms        12  3.6244ms  2.7776ms  5.3013ms  
void DSE::regular_fft_pad<int=0, int=1, int=256, int=16, int=16, int=1, float, 
float, float2>(float2*, float*, int, int3, float*, int, float*, float*, int, 
int, int, int, int, bool)
                       0.11%  42.344ms       533  79.445us  44.288us  584.39us  
void fft2d_c2r_32x32<float, bool=1, bool=0, unsigned int=0, bool=0, 
bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, 
float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
                       0.11%  39.906ms       533  74.869us  42.912us  666.27us  
void fft2d_r2c_32x32<float, bool=1, unsigned int=0, bool=0>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.10%  39.438ms        10  3.9438ms  2.0747ms  8.4053ms  
volta_gcgemm_64x32_nt
                       0.10%  38.789ms        74  524.17us  137.98us  1.0079ms  
void fft1d_c2r_32<float2, float, float, bool=0, bool=1, bool=0, bool=0>(float*, 
float2 const *, int, int3, int3, int2, int, float, float, float*, float*)
                       0.10%  36.962ms         6  6.1603ms  4.5783ms  8.8220ms  
void DSE::regular_fft_clip<int=1, int=2, int=256, int=16, int=16, int=1, float, 
float, float2>(float*, float2*, int, int3, float2*, int, float2*, float2*, int, 
int, int, int, int, float, float, bool, int, float, float)
                       0.10%  36.422ms       548  66.463us  19.904us  5.8921ms  
volta_cgemm_32x32_tn
                       0.09%  35.153ms       129  272.50us     896ns  1.9033ms  
void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
                       0.09%  34.957ms         2  17.478ms  17.473ms  17.484ms  
void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, 
int=3, int=3, bool=0, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, 
int=3, bool=0, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.09%  34.492ms         4  8.6230ms  8.5680ms  8.7119ms  
volta_cgemm_32x64_tn
                       0.09%  34.063ms         4  8.5158ms  3.3317ms  13.708ms  
void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=0>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, 
bool=0>*, kernel_grad_params, int, int, float, int, int, int)
                       0.09%  33.719ms         3  11.240ms  5.8078ms  14.690ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, 
int=3, int=3, int=3, int=0, bool=0>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, 
int=5, int=3, int=3, int=3, int=0, bool=0>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.09%  33.531ms       597  56.166us  8.8320us  640.29us  
void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, 
bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, 
float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
                       0.08%  31.495ms         9  3.4994ms  2.4863ms  5.4466ms  
volta_scudnn_128x128_stridedB_interior_nn_v1
                       0.08%  31.056ms       603  51.501us  11.232us  416.00us  
void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.08%  30.558ms         7  4.3654ms  2.4309ms  5.3042ms  
volta_scudnn_128x128_relu_interior_nn_v1
                       0.07%  27.287ms       100  272.87us  267.46us  331.74us  
void cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::averpooling_func<float>, int=1, bool=0>(cudnnTensorStruct, float 
const *, cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::averpooling_func<float>, int=1, bool=0>, cudnnTensorStruct*, 
cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, 
float)
                       0.07%  27.067ms         5  5.4133ms  5.3398ms  5.6404ms  
volta_scudnn_128x128_relu_small_nn_v1
                       0.07%  25.481ms         3  8.4938ms  8.0045ms  8.8875ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, int=7, 
int=3, int=3, int=5, int=0, bool=0>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, 
int=7, int=3, int=3, int=5, int=0, bool=0>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.07%  24.764ms        12  2.0636ms  9.6960us  6.8492ms  
void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=0>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.06%  23.241ms         4  5.8103ms  5.6200ms  5.9622ms  
volta_scudnn_128x128_stridedB_splitK_small_nn_v1
                       0.06%  23.227ms         6  3.8711ms  2.8561ms  5.8348ms  
void DSE::vector_fft<int=1, int=2, int=256, int=16, int=16, int=1, float, 
float, float2>(float2*, float2, int, int3, float2*)
                       0.06%  22.670ms         4  5.6675ms  5.5902ms  5.7560ms  
volta_scudnn_128x128_stridedB_splitK_medium_nn_v1
                       0.06%  22.175ms         8  2.7719ms  1.3040ms  5.1467ms  
volta_sgemm_128x128_nt
                       0.06%  22.056ms         4  5.5141ms  3.2305ms  7.8496ms  
volta_scudnn_128x64_stridedB_splitK_interior_nn_v1
                       0.06%  21.579ms         2  10.789ms  10.777ms  10.802ms  
void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=6, int=5, int=4, 
int=4, bool=1, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=6, int=5, int=4, int=4, 
bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
                       0.05%  19.623ms         7  2.8033ms  1.0100ms  5.0506ms  
volta_scudnn_128x64_relu_interior_nn_v1
                       0.05%  18.229ms         9  2.0255ms  1.3395ms  3.3920ms  
volta_s884cudnn_fp16_256x128_ldg8_dgrad_exp_interior_nhwc2nchw_tt_v1
                       0.05%  17.966ms        10  1.7966ms  11.488us  7.0949ms  
void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.05%  17.306ms         6  2.8843ms  1.4342ms  5.8354ms  
void DSE::vector_fft<int=0, int=1, int=128, int=8, int=8, int=1, float, float, 
float2>(float2*, float2, int, int3, float2*)
                       0.04%  15.174ms         6  2.5289ms  1.3160ms  4.9404ms  
void DSE::regular_fft_pad<int=0, int=1, int=128, int=16, int=32, int=1, float, 
float, float2>(float2*, float*, int, int3, float*, int, float*, float*, int, 
int, int, int, int, bool)
                       0.04%  14.933ms         2  7.4666ms  7.4635ms  7.4696ms  
void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, 
int=3, int=5, bool=0, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=0, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.03%  13.058ms         3  4.3525ms  2.2559ms  8.5455ms  
void DSE::regular_fft_clip<int=1, int=2, int=128, int=16, int=32, int=1, float, 
float, float2>(float*, float2*, int, int3, float2*, int, float2*, float2*, int, 
int, int, int, int, float, float, bool, int, float, float)
                       0.03%  11.448ms         8  1.4310ms  1.4170ms  1.4500ms  
volta_sgemm_128x64_nt
                       0.03%  11.420ms         2  5.7100ms  5.6846ms  5.7354ms  
volta_scudnn_128x64_stridedB_small_nn_v1
                       0.03%  11.125ms         2  5.5623ms  5.5477ms  5.5769ms  
volta_scudnn_128x64_relu_small_nn_v1
                       0.03%  10.688ms         2  5.3441ms  5.3318ms  5.3565ms  
volta_scudnn_128x128_stridedB_small_nn_v1
                       0.03%  9.4698ms         1  9.4698ms  9.4698ms  9.4698ms  
volta_scudnn_128x64_relu_medium_nn_v1
                       0.02%  8.6419ms         3  2.8806ms  1.4405ms  5.7557ms  
void DSE::vector_fft<int=1, int=2, int=128, int=8, int=8, int=1, float, float, 
float2>(float2*, float2, int, int3, float2*)
                       0.02%  8.6103ms       273  31.539us     864ns  871.46us  
[CUDA memcpy HtoD]
                       0.02%  8.5760ms         6  1.4293ms  535.55us  3.2656ms  
void fft2d_r2c_64x64<float>(float2*, float const *, int, int, int, int, int, 
int, int, int)
                       0.02%  8.2911ms         8  1.0364ms  246.40us  2.2406ms  
void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, 
float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)
                       0.02%  8.1236ms         2  4.0618ms  4.0554ms  4.0682ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=1024, int=6, 
int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=1024, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.02%  7.5464ms         1  7.5464ms  7.5464ms  7.5464ms  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_19SampleUniformKernelIN7mshadow3gpuEEEJNS_6common6random13RandGeneratorIS5_fEEllllPfSB_SB_EEEviDpT0_
                       0.02%  7.4052ms         2  3.7026ms  3.2546ms  4.1506ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=1024, int=6, int=7, 
int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=1024, 
int=6, int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, 
int, float, float, int, float const *, float const *)
                       0.02%  7.2140ms         8  901.75us  228.99us  1.9611ms  
void cudnn::winograd_nonfused::winogradWgradData4x4<float, 
float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
                       0.02%  6.8404ms         3  2.2801ms  1.0550ms  3.1240ms  
volta_scudnn_128x64_stridedB_interior_nn_v1
                       0.02%  5.8239ms      4078  1.4280us  1.0560us  3.2000us  
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                       0.01%  5.6494ms         2  2.8247ms  2.7725ms  2.8769ms  
volta_s884cudnn_fp16_256x128_ldg8_dgrad_exp_small_nhwc2nchw_tt_v1
                       0.01%  4.9043ms         5  980.85us  292.00us  3.7150ms  
void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, 
bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, 
float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
                       0.01%  4.3870ms       100  43.869us  43.200us  44.960us  
volta_sgemm_64x32_sliced1x4_tn
                       0.01%  4.0243ms         1  4.0243ms  4.0243ms  4.0243ms  
volta_scudnn_128x32_relu_interior_nn_v1
                       0.01%  3.9077ms         3  1.3026ms  460.42us  2.9866ms  
void fft2d_c2r_64x64<float, bool=0>(float*, float2*, int, int, int, int, int, 
int, int, int, int, int, float, float, int, float*, float*)
                       0.01%  3.6527ms         1  3.6527ms  3.6527ms  3.6527ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, int=7, 
int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, 
int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.01%  3.0749ms         2  1.5375ms  1.5373ms  1.5376ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=1024, int=5, 
int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.01%  3.0193ms         1  3.0193ms  3.0193ms  3.0193ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_small_nhwc_tn_v1
                       0.01%  3.0026ms         1  3.0026ms  3.0026ms  3.0026ms  
volta_s884cudnn_fp16_256x64_ldg8_dgrad_exp_small_nhwc2nchw_tt_v1
                       0.01%  2.8697ms         1  2.8697ms  2.8697ms  2.8697ms  
volta_s884cudnn_fp16_128x128_ldg8_dgrad_exp_small_nhwc2nchw_tt_v1
                       0.01%  2.5288ms        12  210.73us  14.496us  772.55us  
void fft1d_r2c_32<float, float, float2, bool=1, bool=0>(float2*, float const *, 
int, int3, int3, int2, int2)
                       0.01%  2.4241ms         2  1.2120ms  774.08us  1.6500ms  
volta_s884cudnn_fp16_256x64_ldg8_dgrad_exp_interior_nhwc2nchw_tt_v1
                       0.00%  1.6242ms         1  1.6242ms  1.6242ms  1.6242ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, 
int=3, int=3, int=3, int=0, bool=1>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, 
int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.00%  1.3018ms         1  1.3018ms  1.3018ms  1.3018ms  
volta_s884cudnn_fp16_128x128_ldg8_dgrad_exp_interior_nhwc2nchw_tt_v1
                       0.00%  1.2119ms       308  3.9340us  3.0400us  91.873us  
void cudnn::winograd::generateWinogradTilesKernel<int=0, float, 
float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)
                       0.00%  1.0802ms         4  270.06us  78.817us  571.11us  
void flip_filter<float, float>(float*, float const *, int, int, int, int)
                       0.00%  1.0155ms       100  10.154us  10.048us  11.232us  
[CUDA memcpy DtoH]
                       0.00%  586.69us         8  73.336us  8.8640us  205.31us  
void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, 
float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)
                       0.00%  397.51us       155  2.5640us     896ns  23.073us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS1_10set_to_intILi0EEELi1EEEJPfEEEviDpT0_
                       0.00%  352.96us       426     828ns     704ns  1.6640us  
[CUDA memset]
                       0.00%  178.66us       100  1.7860us  1.7600us  1.9200us  
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, int=8, 
mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=2, float>, float>, 
mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 
int=1, float>, float, int=2, int=1>, float>>(mshadow::gpu, long, 
mshadow::Shape<int=2>, int=2)
                       0.00%  121.50us       100  1.2150us  1.1520us  2.0480us  
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, int=8, 
mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=1, float>, float>, 
mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>>(mshadow::gpu, 
long, mshadow::Shape<int=2>, int=1)
                       0.00%  95.105us         8  11.888us  1.6320us  31.232us  
compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, 
float2 const *, int, int)
                       0.00%  76.000us         4  19.000us  18.432us  19.648us  
mxnet::common::random::rand_generator_seed_kernel(curandStatePhilox4_32_10*, 
int, unsigned int)
                       0.00%  68.416us        46  1.4870us  1.3120us  2.0800us  
cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                       0.00%  60.224us        46  1.3090us  1.1840us  1.6640us  
cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
                       0.00%  39.168us        23  1.7020us  1.6000us  1.9520us  
cudnn::gemm::computeWgradOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                       0.00%  31.904us        32     997ns     800ns  1.0560us  
cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
                       0.00%  2.9120us         1  2.9120us  2.9120us  2.9120us  
void nhwcToNchwKernel<float, float, float, bool=1, bool=1>(int, int, int, int, 
float const *, float*, float, float)
         API calls:   73.35%  33.2313s      1719  19.332ms  1.6640us  71.954ms  
cudaStreamSynchronize
                       8.06%  3.65055s       413  8.8391ms  1.0750ms  116.99ms  
cudaEventSynchronize
                       5.64%  2.55627s         4  639.07ms  62.838us  1.27814s  
cudaStreamCreate
                       5.02%  2.27368s        22  103.35ms  10.129us  1.14912s  
cudaStreamCreateWithFlags
                       3.61%  1.63397s       423  3.8628ms     431ns  339.18ms  
cudaFree
                       3.20%  1.45069s       910  1.5942ms  5.9110us  65.384ms  
cudaMalloc
                       0.75%  339.63ms     42263  8.0360us  4.5490us  338.43us  
cudaLaunchKernel
                       0.18%  82.715ms       497  166.43us  75.802us  283.23us  
cudaMemGetInfo
                       0.06%  26.611ms       912  29.178us     410ns  12.483ms  
cudaFuncSetAttribute
                       0.04%  17.356ms       361  48.076us  5.2600us  915.07us  
cudaMemcpy2DAsync
                       0.02%  6.9943ms     38409     182ns      90ns  180.90us  
cudaGetLastError
                       0.01%  5.7843ms       426  13.578us  4.9390us  656.69us  
cudaMemsetAsync
                       0.01%  5.1182ms      1422  3.5990us     531ns  166.17us  
cudaEventRecord
                       0.01%  4.2874ms      5640     760ns     280ns  202.70us  
cudaDeviceGetAttribute
                       0.01%  3.0592ms         4  764.79us  438.02us  1.0360ms  
cudaGetDeviceProperties
                       0.01%  2.4158ms      1971  1.2250us     541ns  42.449us  
cudaStreamWaitEvent
                       0.00%  1.8599ms      1524  1.2200us     551ns  25.287us  
cudaSetDevice
                       0.00%  1.8104ms       375  4.8270us     110ns  302.84us  
cuDeviceGetAttribute
                       0.00%  1.4840ms       354  4.1920us     481ns  387.61us  
cudaEventCreateWithFlags
                       0.00%  1.1766ms       413  2.8480us  1.2320us  6.3020us  
cudaEventElapsedTime
                       0.00%  993.37us       230  4.3190us  1.7030us  41.909us  
cudaBindTexture
                       0.00%  937.61us       655  1.4310us     331ns  5.0890us  
cudaGetDevice
                       0.00%  698.00us         2  349.00us  31.279us  666.72us  
cudaHostAlloc
                       0.00%  557.15us         4  139.29us  108.98us  227.35us  
cuDeviceTotalMem
                       0.00%  439.33us      1960     224ns     160ns     761ns  
cudaPeekAtLastError
                       0.00%  303.15us        12  25.262us  8.0850us  55.845us  
cudaMemcpy
                       0.00%  291.00us       182  1.5980us     510ns  2.8850us  
cudaEventDestroy
                       0.00%  286.73us       230  1.2460us     511ns  2.6150us  
cudaUnbindTexture
                       0.00%  282.26us       100  2.8220us  2.2040us  5.0600us  
cudaEventQuery
                       0.00%  208.14us         4  52.035us  41.428us  77.075us  
cuDeviceGetName
                       0.00%  142.87us        50  2.8570us  1.0020us  13.055us  
cudaEventCreate
                       0.00%  90.188us         8  11.273us  9.5380us  12.804us  
cudaStreamCreateWithPriority
                       0.00%  7.2850us         4  1.8210us     351ns  5.1400us  
cudaGetDeviceCount
                       0.00%  3.7470us         2  1.8730us  1.8640us  1.8830us  
cudaHostGetDevicePointer
                       0.00%  3.3650us         6     560ns     280ns  1.5930us  
cuDeviceGetCount
                       0.00%  2.7850us         2  1.3920us  1.1520us  1.6330us  
cudaDeviceGetStreamPriorityRange
                       0.00%  2.3640us         3     788ns     571ns  1.0120us  
cuInit
                       0.00%  2.1240us         1  2.1240us  2.1240us  2.1240us  
cuDeviceGetPCIBusId
                       0.00%  1.9740us         5     394ns     250ns     791ns  
cuDeviceGet
                       0.00%  1.4920us         3     497ns     380ns     581ns  
cuDriverGetVersion
                       0.00%     934ns         4     233ns     171ns     391ns  
cuDeviceGetUuid
   ```
   
   nvprof without local changes:
   ```
   OK
   ==16580== Profiling application: python -m nose --nocapture 
tests/python/tensorrt/test_resnet18.py
   ==16580== Profiling result:
               Type  Time(%)      Time     Calls       Avg       Min       Max  
Name
    GPU activities:   17.26%  5.82397s      2007  2.9018ms  840.77us  5.5379ms  
volta_scudnn_128x64_relu_interior_nn_v1
                      14.37%  4.85036s      1307  3.7111ms  2.4247ms  6.2049ms  
volta_scudnn_128x128_relu_interior_nn_v1
                      12.64%  4.26646s      5100  836.56us  107.49us  4.2456ms  
void cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, 
int=1>(float, cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, 
int=1>, cudnnTensorStruct, float const *, float, cudnnTensorStruct*, float, 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const *, 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1> const , 
cudnn::detail::bn_fw_inf_1C11_kernel_new<float, float, bool=1, int=1>)
                      11.88%  4.00900s      5000  801.80us  102.98us  3.6902ms  
void op_generic_tensor_kernel<int=2, float, float, float, int=256, 
cudnnGenericOp_t=8, cudnnNanPropagation_t=0, cudnnDimOrder_t=0, 
int=1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const *, 
cudnnTensorStruct, float const *, float, float, float, float, dimArray, 
reducedDivisorArray)
                       9.63%  3.25088s      1600  2.0318ms  584.96us  5.5360ms  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS0_10mshadow_op4plusELi1EEEJPfS7_S7_EEEviDpT0_
                       5.32%  1.79633s       305  5.8896ms  5.3480ms  6.2147ms  
volta_scudnn_128x128_relu_small_nn_v1
                       4.46%  1.50527s       308  4.8872ms  4.1811ms  5.8611ms  
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1
                       3.37%  1.13806s       704  1.6166ms  1.4525ms  1.9001ms  
volta_sgemm_128x64_nn
                       3.11%  1.04966s       301  3.4872ms  3.4512ms  4.0844ms  
volta_scudnn_128x32_relu_interior_nn_v1
                       2.05%  690.43ms       101  6.8359ms  6.2262ms  8.7965ms  
volta_scudnn_128x64_relu_medium_nn_v1
                       1.50%  506.92ms       304  1.6675ms  1.5213ms  3.2035ms  
volta_sgemm_128x128_nn
                       1.49%  501.23ms      1016  493.34us  203.62us  1.9543ms  
void cudnn::winograd_nonfused::winogradForwardData4x4<float, 
float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
                       1.24%  417.70ms      1016  411.13us  174.98us  1.5269ms  
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, 
float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>)
                       1.11%  373.00ms       689  541.36us  40.192us  9.5589ms  
volta_gcgemm_32x32_nt
                       0.72%  244.26ms        36  6.7850ms  1.6632ms  11.361ms  
void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, 
bool=1>*, kernel_grad_params, int, int, float, int, int, int)
                       0.72%  242.92ms       100  2.4292ms  2.3373ms  2.5336ms  
void cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, int=0, 
bool=0>(cudnnTensorStruct, float const *, 
cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::maxpooling_func<float, cudnnNanPropagation_t=0>, int=0, bool=0>, 
cudnnTensorStruct*, cudnnPoolingStruct, float, cudnnPoolingStruct, int, 
cudnn::reduced_divisor, float)
                       0.53%  178.42ms        26  6.8624ms  3.7675ms  10.406ms  
volta_scudnn_128x128_stridedB_splitK_interior_nn_v1
                       0.48%  162.44ms       507  320.40us  145.25us  7.7573ms  
volta_gcgemm_64x64_nt
                       0.47%  158.56ms        28  5.6629ms  3.1609ms  7.5050ms  
void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, 
int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.42%  143.11ms        23  6.2220ms  657.83us  113.92ms  
turing_s1688cudnn_fp16_128x128_ldg8_wgrad_idx_exp_interior_nhwc_nt_v1
                       0.41%  137.64ms        26  5.2940ms  2.8950ms  6.6520ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=512, int=6, 
int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=512, int=6, int=8, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.31%  104.35ms         8  13.043ms  6.4004ms  28.410ms  
volta_scudnn_128x128_stridedB_splitK_xregs_large_nn_v1
                       0.29%  97.450ms         2  48.725ms  48.693ms  48.757ms  
void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, 
int=5, bool=0, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, 
bool=0, bool=1>*, kernel_grad_params, int, int, float, int, int)
                       0.28%  95.232ms        12  7.9360ms  4.5037ms  10.366ms  
void cudnn::detail::dgrad_engine<float, int=128, int=6, int=7, int=3, int=3, 
int=5, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad_engine<float, int=128, int=6, int=7, int=3, int=3, int=5, 
bool=1>*, kernel_grad_params, int, int, float, int, int, int)
                       0.28%  93.752ms         4  23.438ms  15.116ms  31.757ms  
void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, 
int=5, bool=1, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=7, int=5, int=4, int=5, 
bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
                       0.25%  83.395ms         6  13.899ms  12.643ms  14.597ms  
void pointwise_mult_and_sum_complex<float2, int=8, int=4>(float2*, float2*, 
float2*, int, int, int, int, int, float2)
                       0.22%  75.303ms         6  12.551ms  6.4412ms  21.700ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=5, 
int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=128, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.21%  72.541ms         8  9.0676ms  5.8252ms  11.804ms  
void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, 
int=3, int=5, bool=1, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=7, int=3, int=3, 
int=5, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.21%  72.472ms       121  598.94us  2.0160us  3.5500ms  
void nchwToNhwcKernel<float, __half, float, bool=1, bool=0>(int, int, int, int, 
float const *, __half*, float, float)
                       0.21%  69.811ms         6  11.635ms  4.3795ms  16.118ms  
void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, 
int=3, int=3, bool=1, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, 
int=3, bool=1, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.20%  65.869ms        13  5.0669ms  2.8130ms  6.3887ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, int=8, 
int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=512, int=6, 
int=8, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.18%  59.831ms        10  5.9831ms  3.5280ms  7.8479ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=128, int=6, 
int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=128, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.18%  59.777ms        18  3.3210ms  775.84us  6.4399ms  
void transpose_readWrite_alignment_kernel<float2, float2, int=1, bool=0, int=6, 
int=4, int=4>(cublasTransposeParams<float2>, float2 const *, float2*, float2 
const *)
                       0.15%  50.489ms        86  587.08us  10.336us  1.1409ms  
void fft1d_r2c_32<float, float, float2, bool=0, bool=0>(float2*, float const *, 
int, int3, int3, int2, int2)
                       0.15%  49.342ms      1016  48.564us  3.2960us  175.39us  
void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, 
float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>)
                       0.14%  48.608ms        23  2.1134ms  180.32us  6.7064ms  
void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, 
cudnnTensor4dStruct, float const *, float*, int)
                       0.14%  47.200ms         2  23.600ms  23.597ms  23.603ms  
void cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, 
int=3, int=3, bool=0, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=512, int=6, int=5, int=3, int=3, 
int=3, bool=0, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.14%  46.637ms        12  3.8864ms  2.8884ms  5.8737ms  
void DSE::vector_fft<int=0, int=1, int=256, int=16, int=16, int=1, float, 
float, float2>(float2*, float2, int, int3, float2*)
                       0.13%  44.696ms       533  83.856us  50.177us  569.83us  
void fft2d_c2r_32x32<float, bool=1, bool=0, unsigned int=0, bool=0, 
bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, 
float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
                       0.13%  43.354ms        12  3.6129ms  2.7553ms  5.3166ms  
void DSE::regular_fft_pad<int=0, int=1, int=256, int=16, int=16, int=1, float, 
float, float2>(float2*, float*, int, int3, float*, int, float*, float*, int, 
int, int, int, int, bool)
                       0.12%  41.697ms         1  41.697ms  41.697ms  41.697ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_medium_nhwc_tn_v1
                       0.12%  40.014ms        10  4.0014ms  2.0726ms  8.5663ms  
volta_gcgemm_64x32_nt
                       0.12%  39.472ms       533  74.057us  42.528us  682.98us  
void fft2d_r2c_32x32<float, bool=1, unsigned int=0, bool=0>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.12%  38.894ms        74  525.60us  136.42us  1.0232ms  
void fft1d_c2r_32<float2, float, float, bool=0, bool=1, bool=0, bool=0>(float*, 
float2 const *, int, int3, int3, int2, int, float, float, float*, float*)
                       0.11%  37.067ms         3  12.356ms  5.8981ms  18.248ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, 
int=3, int=3, int=3, int=0, bool=0>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, 
int=5, int=3, int=3, int=3, int=0, bool=0>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.11%  36.363ms       548  66.356us  19.968us  5.8917ms  
volta_cgemm_32x32_tn
                       0.11%  36.285ms         6  6.0475ms  4.5707ms  8.7570ms  
void DSE::regular_fft_clip<int=1, int=2, int=256, int=16, int=16, int=1, float, 
float, float2>(float*, float2*, int, int3, float2*, int, float2*, float2*, int, 
int, int, int, int, float, float, bool, int, float, float)
                       0.11%  35.877ms       129  278.11us     800ns  1.8610ms  
void scalePackedTensor_kernel<float, float>(cudnnTensor4dStruct, float*, float)
                       0.10%  34.537ms         4  8.6343ms  8.5678ms  8.7111ms  
volta_cgemm_32x64_tn
                       0.10%  34.082ms         4  8.5205ms  3.3328ms  13.704ms  
void cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=0>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad_engine<float, int=128, int=6, int=8, int=3, int=3, int=5, 
bool=0>*, kernel_grad_params, int, int, float, int, int, int)
                       0.10%  33.277ms       597  55.740us  7.8400us  655.81us  
void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=0, bool=0, 
bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, 
float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
                       0.09%  31.716ms       603  52.596us  13.408us  430.18us  
void fft2d_r2c_32x32<float, bool=0, unsigned int=0, bool=0>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.09%  31.566ms         9  3.5074ms  2.5011ms  5.4303ms  
volta_scudnn_128x128_stridedB_interior_nn_v1
                       0.08%  26.656ms       100  266.56us  262.02us  270.91us  
void cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::averpooling_func<float>, int=1, bool=0>(cudnnTensorStruct, float 
const *, cudnn::detail::pooling_fw_4d_kernel<float, float, 
cudnn::detail::averpooling_func<float>, int=1, bool=0>, cudnnTensorStruct*, 
cudnnPoolingStruct, float, cudnnPoolingStruct, int, cudnn::reduced_divisor, 
float)
                       0.07%  25.290ms         3  8.4298ms  8.0690ms  8.6313ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, int=7, 
int=3, int=3, int=5, int=0, bool=0>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, 
int=7, int=3, int=3, int=5, int=0, bool=0>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.07%  24.664ms        12  2.0554ms  9.7920us  6.8901ms  
void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=0>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.07%  23.301ms         4  5.8252ms  5.6255ms  5.9667ms  
volta_scudnn_128x128_stridedB_splitK_small_nn_v1
                       0.07%  22.953ms         6  3.8254ms  2.8567ms  5.7274ms  
void DSE::vector_fft<int=1, int=2, int=256, int=16, int=16, int=1, float, 
float, float2>(float2*, float2, int, int3, float2*)
                       0.07%  22.723ms         4  5.6807ms  5.5928ms  5.7781ms  
volta_scudnn_128x128_stridedB_splitK_medium_nn_v1
                       0.07%  22.179ms         4  5.5448ms  3.1963ms  7.9010ms  
volta_scudnn_128x64_stridedB_splitK_interior_nn_v1
                       0.07%  22.177ms        45  492.83us  3.7440us  3.4996ms  
void nhwcToNchwKernel<float, float, float, bool=1, bool=0>(int, int, int, int, 
float const *, float*, float, float)
                       0.07%  22.093ms         8  2.7616ms  1.3035ms  5.1459ms  
volta_sgemm_128x128_nt
                       0.06%  21.667ms         2  10.833ms  10.824ms  10.843ms  
void cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=6, int=5, int=4, 
int=4, bool=1, bool=1>(int, int, int, float const *, int, float const , int, 
cudnn::detail::dgrad2d_alg1_1<float, int=0, int=6, int=6, int=5, int=4, int=4, 
bool=1, bool=1>*, kernel_grad_params, int, int, float, int, int)
                       0.06%  21.037ms        10  2.1037ms  1.2885ms  3.0508ms  
volta_s884cudnn_fp16_128x128_ldg8_relu_exp_interior_nhwc_tn_v1
                       0.05%  18.352ms        10  1.8352ms  14.048us  7.0852ms  
void fft2d_r2c_32x32<float, bool=0, unsigned int=1, bool=1>(float2*, float 
const *, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, 
bool, int2, int, int)
                       0.05%  18.055ms         6  3.0092ms  1.4470ms  5.8244ms  
void DSE::vector_fft<int=0, int=1, int=128, int=8, int=8, int=1, float, float, 
float2>(float2*, float2, int, int3, float2*)
                       0.05%  18.055ms         9  2.0061ms  1.3350ms  3.2205ms  
volta_s884cudnn_fp16_256x128_ldg8_dgrad_exp_interior_nhwc2nchw_tt_v1
                       0.05%  17.620ms         6  2.9367ms  2.7701ms  3.1701ms  
volta_s884cudnn_fp16_256x128_ldg8_relu_exp_small_nhwc_tn_v1
                       0.05%  15.332ms         6  2.5554ms  1.3206ms  4.9532ms  
void DSE::regular_fft_pad<int=0, int=1, int=128, int=16, int=32, int=1, float, 
float, float2>(float2*, float*, int, int3, float*, int, float*, float*, int, 
int, int, int, int, bool)
                       0.04%  14.872ms         2  7.4359ms  7.4295ms  7.4423ms  
void cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, 
int=3, int=5, bool=0, int=512>(int, int, int, float const *, int, 
cudnn::detail::wgrad_alg0_engine<float, int=128, int=6, int=8, int=3, int=3, 
int=5, bool=0, int=512>*, float const , kernel_grad_params, int, float, int, 
int, int, int)
                       0.04%  13.061ms         3  4.3535ms  2.2559ms  8.5455ms  
void DSE::regular_fft_clip<int=1, int=2, int=128, int=16, int=32, int=1, float, 
float, float2>(float*, float2*, int, int3, float2*, int, float2*, float2*, int, 
int, int, int, int, float, float, bool, int, float, float)
                       0.03%  11.448ms         8  1.4310ms  1.4185ms  1.4505ms  
volta_sgemm_128x64_nt
                       0.03%  11.397ms         2  5.6985ms  5.6877ms  5.7093ms  
volta_scudnn_128x64_stridedB_small_nn_v1
                       0.03%  11.194ms         2  5.5969ms  5.5443ms  5.6495ms  
volta_scudnn_128x64_relu_small_nn_v1
                       0.03%  10.689ms         2  5.3445ms  5.3323ms  5.3567ms  
volta_scudnn_128x128_stridedB_small_nn_v1
                       0.03%  8.7617ms       273  32.094us     864ns  882.59us  
[CUDA memcpy HtoD]
                       0.03%  8.6110ms         3  2.8703ms  1.4365ms  5.7361ms  
void DSE::vector_fft<int=1, int=2, int=128, int=8, int=8, int=1, float, float, 
float2>(float2*, float2, int, int3, float2*)
                       0.03%  8.4792ms         2  4.2396ms  4.0871ms  4.3921ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=1024, int=6, 
int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=1024, int=6, int=7, int=3, int=3, int=5, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.02%  8.4246ms         6  1.4041ms  543.14us  3.1097ms  
void fft2d_r2c_64x64<float>(float2*, float const *, int, int, int, int, int, 
int, int, int)
                       0.02%  8.3013ms         8  1.0377ms  264.23us  2.2399ms  
void cudnn::winograd_nonfused::winogradWgradDelta4x4<float, 
float>(cudnn::winograd_nonfused::WinogradDeltaParams<float, float>)
                       0.02%  7.6022ms         1  7.6022ms  7.6022ms  7.6022ms  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS0_19SampleUniformKernelIN7mshadow3gpuEEEJNS_6common6random13RandGeneratorIS5_fEEllllPfSB_SB_EEEviDpT0_
                       0.02%  7.4782ms         2  3.7391ms  3.3239ms  4.1542ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=1024, int=6, int=7, 
int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=1024, 
int=6, int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, 
int, float, float, int, float const *, float const *)
                       0.02%  7.3935ms         8  924.19us  237.73us  1.9679ms  
void cudnn::winograd_nonfused::winogradWgradData4x4<float, 
float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>)
                       0.02%  6.5836ms         3  2.1945ms  1.0678ms  2.8729ms  
volta_scudnn_128x64_stridedB_interior_nn_v1
                       0.02%  6.5640ms         3  2.1880ms  1.3532ms  2.6835ms  
volta_s884cudnn_fp16_256x128_ldg8_relu_exp_interior_nhwc_tn_v1
                       0.02%  5.8489ms      4078  1.4340us     992ns  3.1680us  
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                       0.02%  5.6401ms         2  2.8200ms  2.7781ms  2.8620ms  
volta_s884cudnn_fp16_256x128_ldg8_dgrad_exp_small_nhwc2nchw_tt_v1
                       0.02%  5.5618ms         3  1.8539ms  2.5600us  3.1295ms  
void nchwToNhwcKernel<float, __half, float, bool=1, bool=1>(int, int, int, int, 
float const *, __half*, float, float)
                       0.01%  4.7113ms         5  942.26us  282.98us  3.5477ms  
void fft2d_c2r_32x32<float, bool=0, bool=0, unsigned int=1, bool=0, 
bool=0>(float*, float2 const *, int, int, int, int, int, int, int, int, int, 
float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int)
                       0.01%  4.7031ms       100  47.031us  44.352us  48.032us  
volta_sgemm_64x32_sliced1x4_tn
                       0.01%  3.9111ms         3  1.3037ms  461.15us  2.9863ms  
void fft2d_c2r_64x64<float, bool=0>(float*, float2*, int, int, int, int, int, 
int, int, int, int, int, float, float, int, float*, float*)
                       0.01%  3.6273ms         1  3.6273ms  3.6273ms  3.6273ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, int=7, 
int=3, int=3, int=5, int=0, bool=1>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=6, 
int=7, int=3, int=3, int=5, int=0, bool=1>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.01%  3.1171ms         2  1.5585ms  1.5565ms  1.5605ms  
void cudnn::detail::implicit_convolve_sgemm<float, float, int=1024, int=5, 
int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>(int, int, int, float 
const *, int, float*, cudnn::detail::implicit_convolve_sgemm<float, float, 
int=1024, int=5, int=5, int=3, int=3, int=3, int=1, bool=1, bool=0, bool=1>*, 
kernel_conv_params, int, float, float, int, float, float, int, int)
                       0.01%  3.0100ms         1  3.0100ms  3.0100ms  3.0100ms  
volta_s884cudnn_fp16_256x64_ldg8_dgrad_exp_small_nhwc2nchw_tt_v1
                       0.01%  2.9735ms         1  2.9735ms  2.9735ms  2.9735ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_small_nhwc_tn_v1
                       0.01%  2.8700ms         1  2.8700ms  2.8700ms  2.8700ms  
volta_s884cudnn_fp16_128x128_ldg8_dgrad_exp_small_nhwc2nchw_tt_v1
                       0.01%  2.5329ms        12  211.08us  14.240us  775.04us  
void fft1d_r2c_32<float, float, float2, bool=1, bool=0>(float2*, float const *, 
int, int3, int3, int2, int2)
                       0.01%  2.4597ms         2  1.2299ms  796.35us  1.6634ms  
volta_s884cudnn_fp16_256x64_ldg8_dgrad_exp_interior_nhwc2nchw_tt_v1
                       0.01%  2.3151ms         2  1.1575ms  664.29us  1.6508ms  
volta_s884cudnn_fp16_256x64_ldg8_relu_exp_interior_nhwc_tn_v1
                       0.00%  1.6250ms         1  1.6250ms  1.6250ms  1.6250ms  
void cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, int=5, 
int=3, int=3, int=3, int=0, bool=1>(int, int, int, float const *, int, float 
const , int, cudnn::detail::explicit_convolve_sgemm<float, int, int=128, int=5, 
int=5, int=3, int=3, int=3, int=0, bool=1>*, kernel_conv_params, int, int, 
float, float, int, float const *, float const *)
                       0.00%  1.2918ms         1  1.2918ms  1.2918ms  1.2918ms  
volta_s884cudnn_fp16_128x128_ldg8_dgrad_exp_interior_nhwc2nchw_tt_v1
                       0.00%  1.2076ms       308  3.9200us  2.9440us  92.448us  
void cudnn::winograd::generateWinogradTilesKernel<int=0, float, 
float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>)
                       0.00%  1.0820ms         4  270.51us  78.273us  570.31us  
void flip_filter<float, float>(float*, float const *, int, int, int, int)
                       0.00%  1.0228ms       100  10.227us  10.080us  11.296us  
[CUDA memcpy DtoH]
                       0.00%  585.86us         8  73.232us  8.4160us  206.05us  
void cudnn::winograd_nonfused::winogradWgradOutput4x4<float, 
float>(cudnn::winograd_nonfused::WinogradWgradOutputParams<float, float>)
                       0.00%  394.59us       155  2.5450us     896ns  24.224us  
_ZN5mxnet2op8mxnet_op20mxnet_generic_kernelINS1_11op_with_reqINS1_10set_to_intILi0EEELi1EEEJPfEEEviDpT0_
                       0.00%  207.49us       226     918ns     736ns  1.5680us  
[CUDA memset]
                       0.00%  187.23us       100  1.8720us  1.7920us  1.9520us  
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, int=8, 
mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=2, float>, float>, 
mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 
int=1, float>, float, int=2, int=1>, float>>(mshadow::gpu, long, 
mshadow::Shape<int=2>, int=2)
                       0.00%  128.90us       100  1.2880us  1.2160us  2.0480us  
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, int=8, 
mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=1, float>, float>, 
mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>>(mshadow::gpu, 
long, mshadow::Shape<int=2>, int=1)
                       0.00%  94.592us         8  11.824us  1.6000us  30.944us  
compute_gemm_pointers(float2**, float2 const *, int, float2 const *, int, 
float2 const *, int, int)
                       0.00%  75.360us         4  18.840us  18.496us  19.424us  
mxnet::common::random::rand_generator_seed_kernel(curandStatePhilox4_32_10*, 
int, unsigned int)
                       0.00%  69.664us        46  1.5140us  1.3440us  2.4960us  
cudnn::gemm::computeWgradSplitKOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                       0.00%  61.185us        46  1.3300us  1.2160us  1.7600us  
cudnn::gemm::computeWgradBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
                       0.00%  40.672us        23  1.7680us  1.6320us  2.0160us  
cudnn::gemm::computeWgradOffsetsKernel(cudnn::gemm::ComputeOffsetsParams)
                       0.00%  32.288us        32  1.0090us     768ns  1.5040us  
cudnn::gemm::computeBOffsetsKernel(cudnn::gemm::ComputeBOffsetsParams)
                       0.00%  2.9760us         1  2.9760us  2.9760us  2.9760us  
void nhwcToNchwKernel<float, float, float, bool=1, bool=1>(int, int, int, int, 
float const *, float*, float, float)
         API calls:   69.11%  29.2974s      1719  17.043ms  3.2370us  40.209ms  
cudaStreamSynchronize
                       8.70%  3.68928s       413  8.9329ms  1.0706ms  119.18ms  
cudaEventSynchronize
                       6.77%  2.86866s        22  130.39ms  9.9590us  1.43409s  
cudaStreamCreateWithFlags
                       6.20%  2.62799s         4  657.00ms  41.117us  1.31395s  
cudaStreamCreate
                       5.25%  2.22686s       424  5.2520ms     461ns  533.05ms  
cudaFree
                       2.99%  1.26905s       911  1.3930ms  5.8310us  74.805ms  
cudaMalloc
                       0.65%  276.87ms     30263  9.1480us  4.7190us  312.89us  
cudaLaunchKernel
                       0.20%  83.899ms       498  168.47us  81.743us  243.49us  
cudaMemGetInfo
                       0.04%  18.297ms       361  50.683us  4.5490us  913.10us  
cudaMemcpy2DAsync
                       0.01%  5.4620ms      1422  3.8410us     581ns  193.57us  
cudaEventRecord
                       0.01%  5.0385ms     26409     190ns      90ns  189.37us  
cudaGetLastError
                       0.01%  4.3456ms      5628     772ns     280ns  3.4770us  
cudaDeviceGetAttribute
                       0.01%  3.0948ms         4  773.69us  435.66us  1.0535ms  
cudaGetDeviceProperties
                       0.01%  2.8315ms       226  12.528us  3.4660us  167.68us  
cudaMemsetAsync
                       0.01%  2.5062ms      1971  1.2710us     541ns  3.3970us  
cudaStreamWaitEvent
                       0.01%  2.1681ms       375  5.7810us     120ns  622.49us  
cuDeviceGetAttribute
                       0.00%  2.0113ms      1524  1.3190us     551ns  65.272us  
cudaSetDevice
                       0.00%  1.6765ms       354  4.7350us     480ns  325.69us  
cudaEventCreateWithFlags
                       0.00%  1.2218ms       413  2.9580us  1.2420us  12.504us  
cudaEventElapsedTime
                       0.00%  1.1588ms       912  1.2700us     431ns  67.727us  
cudaFuncSetAttribute
                       0.00%  938.06us       657  1.4270us     331ns  8.1860us  
cudaGetDevice
                       0.00%  924.72us       230  4.0200us  1.7230us  11.101us  
cudaBindTexture
                       0.00%  716.43us         2  358.22us  31.239us  685.19us  
cudaHostAlloc
                       0.00%  501.09us         4  125.27us  109.51us  165.95us  
cuDeviceTotalMem
                       0.00%  475.13us      1960     242ns     160ns     611ns  
cudaPeekAtLastError
                       0.00%  323.81us       230  1.4070us     511ns  2.2840us  
cudaUnbindTexture
                       0.00%  308.53us        12  25.710us  8.6360us  58.520us  
cudaMemcpy
                       0.00%  301.00us       182  1.6530us     481ns  2.7250us  
cudaEventDestroy
                       0.00%  280.89us       100  2.8080us  2.3850us  6.5220us  
cudaEventQuery
                       0.00%  201.97us         4  50.492us  40.897us  67.817us  
cuDeviceGetName
                       0.00%  121.70us        50  2.4340us  1.1120us  8.9470us  
cudaEventCreate
                       0.00%  91.960us         8  11.495us  9.8680us  13.335us  
cudaStreamCreateWithPriority
                       0.00%  3.9370us         2  1.9680us  1.7930us  2.1440us  
cudaHostGetDevicePointer
                       0.00%  3.7470us         1  3.7470us  3.7470us  3.7470us  
cuDeviceGetPCIBusId
                       0.00%  3.0950us         6     515ns     251ns  1.4520us  
cuDeviceGetCount
                       0.00%  3.0060us         2  1.5030us  1.4830us  1.5230us  
cudaDeviceGetStreamPriorityRange
                       0.00%  2.2040us         3     734ns     631ns     822ns  
cuInit
                       0.00%  2.0320us         4     508ns     210ns     841ns  
cudaGetDeviceCount
                       0.00%  1.5440us         5     308ns     190ns     511ns  
cuDeviceGet
                       0.00%  1.2120us         3     404ns     290ns     491ns  
cuDriverGetVersion
                       0.00%  1.0210us         4     255ns     190ns     341ns  
cuDeviceGetUuid
   ```
   
   In summary, we need a few code changes to actually enable this 
feature.  I'll have to think a little about what the easiest change is, and then 
I'll open a PR.  I'm also a little concerned that the non-Tensor-Core (non-TC) 
cuDNN convolution implementation seems to be faster.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to