TaoLv commented on a change in pull request #16735: Use single-bit for mask in dropout operator
URL: https://github.com/apache/incubator-mxnet/pull/16735#discussion_r377497299
 
 

 ##########
 File path: src/operator/nn/dropout-inl.h
 ##########
 @@ -123,25 +124,33 @@ class DropoutOp {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     RandGenerator<xpu, DType> *pgen = ctx.requested[0].get_parallel_random<xpu, DType>();
     CHECK_NOTNULL(pgen);
-    Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 1, uint8_t> mask = out_data[dropout::kMask].FlatTo1D<xpu, uint8_t>(s);
     Tensor<xpu, 2, DType> data = in_data[dropout::kData].FlatTo2D<xpu, DType>(s);
     Tensor<xpu, 2, DType> out = out_data[dropout::kOut].FlatTo2D<xpu, DType>(s);
     DType *outptr = out.dptr_;
     DType *dataptr = data.dptr_;
-    auto maskptr = reinterpret_cast<int *>(mask.dptr_);
-    int count = mask.shape_[0] * mask.shape_[1];
-    if (sizeof(DType) > sizeof(int)) {
-      // allocating new buffer to avoiding memory overlapping between `mask.dptr_` and `maskptr`
-      Tensor<xpu, 1, int> temp = ctx.requested[1].get_space_typed<xpu, 1, int>(Shape1(count), s);
-      maskptr = temp.dptr_;
-    }
-    BernoulliGenerate(*pgen, count, this->pkeep_, maskptr);
+
+    index_t count = data.shape_[0] * data.shape_[1];
+    // allocating buffer for MKL routine to calculate int32 based maskptr
+    Tensor<xpu, 1, int> temp_space =
+      ctx.requested[1].get_space_typed<xpu, 1, int>(Shape1(count), s);
+    auto mkl_mask = temp_space.dptr_;
+
+    BernoulliGenerate(*pgen, count, this->pkeep_, mkl_mask);
     const float pk_1 = 1.0f / this->pkeep_;
-#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
-    for (int i = 0; i < count; ++i) {
-      const DType maskVal = static_cast<DType>(maskptr[i]) * pk_1;
-      outptr[i] = dataptr[i] * maskVal;
-      mask.dptr_[i] = maskVal;
+    const int nthr = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+#pragma omp parallel for num_threads(nthr) schedule(static, 8)
+    for (index_t i = 0; i < count; ++i) {
+      outptr[i] = dataptr[i] * mkl_mask[i] * pk_1;
+      auto mask_idx = i >> 3;  // div 8
+      uint8_t mask_offset = i & 7;  // mod 8
+      if (mkl_mask[i]) {
+        // set bit
+        mask.dptr_[mask_idx] |= 1U << mask_offset;
+      } else {
+        // clear bit
+        mask.dptr_[mask_idx] &= ~(1U << mask_offset);
+      }
 
 Review comment:
   Can you try the code snippet below? Blocking might help with cache contention.
   ```cpp
    const index_t nblk = count / 64;  // use index_t to match `count` and the loop variable
   
   #pragma omp parallel for num_threads(nthr) schedule(static, 8)
       for (index_t nb = 0; nb < nblk; ++nb) {
         for (index_t k = 0; k < 64; ++k) {
           const index_t i = nb * 64 + k;
           outptr[i] = dataptr[i] * mkl_mask[i] * pk_1;
           auto mask_idx = i >> 3;  // div 8
           uint8_t mask_offset = i & 7;  // mod 8
           if (mkl_mask[i]) {
             // set bit
             mask.dptr_[mask_idx] |= 1U << mask_offset;
           } else {
             // clear bit
             mask.dptr_[mask_idx] &= ~(1U << mask_offset);
           }
         }
       }
   
       // tail
       if (nblk * 64 < count) {
         for (index_t i = nblk * 64; i < count; ++i) {
           outptr[i] = dataptr[i] * mkl_mask[i] * pk_1;
           auto mask_idx = i >> 3;  // div 8
           uint8_t mask_offset = i & 7;  // mod 8
           if (mkl_mask[i]) {
             // set bit
             mask.dptr_[mask_idx] |= 1U << mask_offset;
           } else {
             // clear bit
             mask.dptr_[mask_idx] &= ~(1U << mask_offset);
           }
         }
       }
   ```
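   For reference, the blocking is safe as well as cache-friendly: one 64-element block covers exactly 8 mask bytes, and a `schedule(static, 8)` chunk covers 64 mask bytes (roughly one cache line on typical x86 CPUs), so no two threads ever read-modify-write the same `mask.dptr_` byte and false sharing is kept down; the serial tail starts at `nblk * 64`, which is also a byte boundary. Below is a minimal standalone sketch of the same blocking idea, using plain `std::vector` buffers and illustrative names (`keep`, `pkeep`, a local `index_t`) instead of the actual MXNet/mshadow types:
   ```cpp
   #include <cstdint>
   #include <cstdlib>
   #include <vector>

   using index_t = int64_t;  // stand-in for mshadow's index_t

   int main() {
     const index_t count = 1000;   // illustrative element count
     const float pkeep = 0.5f;
     const float pk_1 = 1.0f / pkeep;

     std::vector<float> data(count, 1.0f), out(count, 0.0f);
     std::vector<int> keep(count);                   // stand-in for the int32 MKL mask buffer
     std::vector<uint8_t> mask((count + 7) / 8, 0);  // bit-packed mask, one bit per element

     for (index_t i = 0; i < count; ++i)
       keep[i] = std::rand() & 1;                    // stand-in for BernoulliGenerate

     const index_t nblk = count / 64;  // one block = 64 elements = 8 mask bytes
   #pragma omp parallel for schedule(static, 8)      // 8 blocks per chunk = 64 mask bytes
     for (index_t nb = 0; nb < nblk; ++nb) {
       for (index_t k = 0; k < 64; ++k) {
         const index_t i = nb * 64 + k;
         out[i] = data[i] * keep[i] * pk_1;
         if (keep[i])
           mask[i >> 3] |= static_cast<uint8_t>(1u << (i & 7));     // set bit
         else
           mask[i >> 3] &= static_cast<uint8_t>(~(1u << (i & 7)));  // clear bit
       }
     }

     // serial tail; starts at a multiple of 64, i.e. on a mask-byte boundary
     for (index_t i = nblk * 64; i < count; ++i) {
       out[i] = data[i] * keep[i] * pk_1;
       if (keep[i])
         mask[i >> 3] |= static_cast<uint8_t>(1u << (i & 7));
       else
         mask[i >> 3] &= static_cast<uint8_t>(~(1u << (i & 7)));
     }
     return 0;
   }
   ```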
