ptrendx commented on a change in pull request #16039: FullyConnected Bias 
performance improvement on GPU
URL: https://github.com/apache/incubator-mxnet/pull/16039#discussion_r326332767
 
 

 ##########
 File path: src/operator/nn/fully_connected-inl.h
 ##########
 @@ -169,19 +355,7 @@ void FCBackward(const OpContext &ctx, const FullyConnectedParam &param,
   linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]);
   // gradient of bias
   if (!param.no_bias) {
-    Tensor<xpu, 1, DType> gbias = in_grad[fullc::kBias].get<xpu, 1, DType>(s);
-    TBlob grad_blob = TBlob(grad);
-    TBlob gbias_blob = TBlob(gbias);
-    mxnet::TShape x(1, 0);
-    mxnet::TShape small;
-    if (shape_assign(&gbias_blob.shape_, Shape2(param.num_hidden, 1))) {
-      small = gbias_blob.shape_;
-    } else {
-      small = ReduceAxesShapeImpl(grad_blob.shape_, dmlc::optional<mxnet::TShape>(x), true, false);
-    }
-    ReduceAxesComputeImpl<xpu, mshadow::red::sum, false, false,
-                          mshadow_op::identity>(ctx, {grad_blob}, {req[fullc::kBias]},
-                                                {in_grad[fullc::kBias]}, small);
+      AddBiasGrad(in_grad[fullc::kBias], grad, req[fullc::kBias], param.num_hidden, ctx);
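
For reference, the replacement swaps the generic `ReduceAxesComputeImpl` reduction for a dedicated two-phase column sum over the output gradient (one kernel producing per-chunk partial sums, a second kernel folding them into the bias gradient), judging by the `AddBiasGradKernelPhase1`/`AddBiasGradKernelPhase2` kernels in the profile below. A minimal standalone CUDA sketch of that idea follows; the kernel names, launch shape, and `main` driver are illustrative assumptions, not the actual `AddBiasGrad` implementation in this PR.

```
// Hypothetical two-phase bias-gradient reduction: gbias[j] = sum_i grad[i][j].
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

// Phase 1: each blockIdx.y owns a chunk of rows; every thread sums one output
// feature (column) over that chunk and writes a float partial sum.
template <typename DType>
__global__ void BiasGradPhase1Sketch(float* partial, const DType* grad,
                                     size_t rows, size_t cols, size_t rows_per_chunk) {
  const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
  if (col >= cols) return;
  const size_t row_begin = static_cast<size_t>(blockIdx.y) * rows_per_chunk;
  size_t row_end = row_begin + rows_per_chunk;
  if (row_end > rows) row_end = rows;
  float acc = 0.f;  // accumulate in a wider type than the (possibly half) input
  for (size_t i = row_begin; i < row_end; ++i) {
    acc += static_cast<float>(grad[i * cols + col]);
  }
  partial[static_cast<size_t>(blockIdx.y) * cols + col] = acc;
}

// Phase 2: fold the per-chunk partial sums into the final bias gradient.
template <typename DType>
__global__ void BiasGradPhase2Sketch(DType* gbias, const float* partial,
                                     size_t cols, size_t num_chunks) {
  const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
  if (col >= cols) return;
  float acc = 0.f;
  for (size_t c = 0; c < num_chunks; ++c) {
    acc += partial[c * cols + col];
  }
  gbias[col] = static_cast<DType>(acc);  // a real kernel would also honor the OpReqType
}

int main() {
  // Shape matching the benchmark below: grad is (3584, 768), bias gradient is (768,).
  const size_t rows = 3584, cols = 768, rows_per_chunk = 128;
  const size_t num_chunks = (rows + rows_per_chunk - 1) / rows_per_chunk;
  std::vector<float> h_grad(rows * cols, 1.f);
  float *d_grad = nullptr, *d_partial = nullptr, *d_gbias = nullptr;
  cudaMalloc(&d_grad, rows * cols * sizeof(float));
  cudaMalloc(&d_partial, num_chunks * cols * sizeof(float));
  cudaMalloc(&d_gbias, cols * sizeof(float));
  cudaMemcpy(d_grad, h_grad.data(), rows * cols * sizeof(float), cudaMemcpyHostToDevice);

  const dim3 block(256);
  const dim3 grid1((cols + block.x - 1) / block.x, num_chunks);
  BiasGradPhase1Sketch<<<grid1, block>>>(d_partial, d_grad, rows, cols, rows_per_chunk);
  const dim3 grid2((cols + block.x - 1) / block.x);
  BiasGradPhase2Sketch<<<grid2, block>>>(d_gbias, d_partial, cols, num_chunks);

  std::vector<float> h_gbias(cols);
  cudaMemcpy(h_gbias.data(), d_gbias, cols * sizeof(float), cudaMemcpyDeviceToHost);
  printf("gbias[0] = %.1f (expected %zu)\n", h_gbias[0], rows);  // every column sums to 3584
  cudaFree(d_grad); cudaFree(d_partial); cudaFree(d_gbias);
  return 0;
}
```

Accumulating the partials in a wider type than the input (the real kernels' template arguments list float/double alongside half_t) keeps precision when the gradient is float16.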
 
 Review comment:
   Tested on TitanV, shape (3584,768) (one of the shapes in BERT), 10 
FullyConnected layers back to back.
   
   Old:
   float16
   ```
   247.14us        void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, int=8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=2, mshadow::half::half_t>, mshadow::half::half_t>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, int=1, mshadow::half::half_t>, mshadow::half::half_t, int=2, int=1>, mshadow::half::half_t>>(mshadow::gpu, int, mshadow::Shape<int=2>, int=2)
   517.79us        void mxnet::op::broadcast::reduce_kernel<mshadow::red::sum, int=2, mshadow::half::half_t, mshadow::half::half_t, mshadow::half::half_t, mxnet::op::mshadow_op::identity, int=2>(int, int, bool, mshadow::half::half_t const *, mshadow::half::half_t*, mshadow::Shape<int=2>, mshadow::Shape, mshadow::Shape, mshadow::Shape, int, bool)
   25.984us        void mxnet::op::broadcast::reduce_lines_kernel<mshadow::red::sum, mshadow::half::half_t>(int, int, bool, int, mshadow::half::half_t const *, mxnet::op::broadcast::reduce_lines_kernel<mshadow::red::sum, mshadow::half::half_t>*)
   ```
   float32
   ```
   396.90us        void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, int=8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, int=2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, int=1, float>, float, int=2, int=1>, float>>(mshadow::gpu, int, mshadow::Shape<int=2>, int=2)
   476.22us        void mxnet::op::broadcast::reduce_kernel<mshadow::red::sum, int=2, float, float, float, mxnet::op::mshadow_op::identity, int=2>(int, int, bool, float const *, float*, mshadow::Shape<int=2>, mshadow::Shape, mshadow::Shape, mshadow::Shape, int, bool)
   25.729us        void mxnet::op::broadcast::reduce_lines_kernel<mshadow::red::sum, float>(int, int, bool, int, float const *, mxnet::op::broadcast::reduce_lines_kernel<mshadow::red::sum, float>*)
   ```
   
   New:
   float16
   ```
   177.76us        void mxnet::op::add_bias_kernel<mshadow::half::half_t, double>(mshadow::half::half_t*, mshadow::half::half_t, unsigned long, unsigned long)
   135.23us        void mxnet::op::AddBiasGradKernelPhase1<double, mshadow::half::half_t, float>(float*, mshadow::half::half_t const *, unsigned long, unsigned long)
   68.481us        void mxnet::op::AddBiasGradKernelPhase2<mshadow::half::half_t, float>(float const *, mshadow::half::half_t*, int, int, mxnet::OpReqType)
   ```
   float32
   ```
   390.72us        void mxnet::op::add_bias_kernel<float, double>(float*, float, unsigned long, unsigned long)
   242.72us        void mxnet::op::AddBiasGradKernelPhase1<double, float, float>(float*, float const *, unsigned long, unsigned long)
   42.753us        void mxnet::op::AddBiasGradKernelPhase2<float, float>(float const *, float*, int, int, mxnet::OpReqType)
   ```
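   
   To summarize the numbers above: the first kernel in each listing is the forward bias add (MapPlanKernel with Broadcast1DExp before, add_bias_kernel after) and the last two are the bias-gradient reduction. Summing the times, the bias-gradient path goes from roughly 544us to 204us in float16 and from roughly 502us to 285us in float32, while the forward bias add drops from about 247us to 178us in float16 and stays roughly flat in float32.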
