TaoLv commented on a change in pull request #16735: Use single-bit for mask in dropout operator
URL: https://github.com/apache/incubator-mxnet/pull/16735#discussion_r379233744
 
 

 ##########
 File path: src/operator/nn/dropout-inl.h
 ##########
 @@ -152,15 +181,20 @@ class DropoutOp {
                           const std::vector<TBlob> &out_grad) {
     Stream<xpu> *s = ctx.get_stream<xpu>();
    Tensor<xpu, 2, DType> grad = out_grad[dropout::kOut].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 1, uint8_t> mask = out_data[dropout::kMask].FlatTo1D<xpu, uint8_t>(s);
     Tensor<xpu, 2, DType> gdata = in_grad[dropout::kData].FlatTo2D<xpu, DType>(s);
     DType *ingradptr = gdata.dptr_;
     const DType *outgradptr = grad.dptr_;
-    const DType *maskptr = mask.dptr_;
-    const int count = mask.shape_[0] * mask.shape_[1];
-#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-    for (int i = 0; i < count; ++i) {
-      ingradptr[i] = outgradptr[i] * maskptr[i];
+    const uint8_t *maskptr = mask.dptr_;
+    const index_t count = grad.shape_[0] * grad.shape_[1];
+    const float pk_1 = 1.0f / this->pkeep_;
+    const int nthr = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+#pragma omp parallel for num_threads(nthr) schedule(static, 8)
+    for (index_t i = 0; i < count; ++i) {
+      auto mask_idx = i >> 3;  // div 8;
+      uint8_t mask_offset = i & 7;  // mod 8
+      bool mask_val = maskptr[mask_idx] & (1U << mask_offset);
+      ingradptr[i] = outgradptr[i] * mask_val * pk_1;
 
 Review comment:
   Let's also use blocking in the backward path:
   
   ```cpp
       const int blk_size = 64;
       const int nblk = count / blk_size;
   
   #pragma omp parallel for num_threads(nthr) schedule(static, 8)
       for (index_t b = 0; b < nblk; ++b) {
         for (index_t k = 0; k < blk_size; ++k) {
           index_t i = b * blk_size + k;
           auto mask_idx = i >> 3;  // div 8;
           uint8_t mask_offset = i & 7;  // mod 8
           bool mask_val = maskptr[mask_idx] & (1U << mask_offset);
           ingradptr[i] = outgradptr[i] * mask_val * pk_1;
         }
       }
   
       // tail
       if (nblk * blk_size < count) {
         for (index_t i = nblk * blk_size; i < count; ++i) {
           auto mask_idx = i >> 3;  // div 8;
           uint8_t mask_offset = i & 7;  // mod 8
           bool mask_val = maskptr[mask_idx] & (1U << mask_offset);
           ingradptr[i] = outgradptr[i] * mask_val * pk_1;
         }
       }
     }
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to