TaoLv commented on a change in pull request #16735: Use single-bit for mask in dropout operator
URL: https://github.com/apache/incubator-mxnet/pull/16735#discussion_r379233744
##########
File path: src/operator/nn/dropout-inl.h
##########
@@ -152,15 +181,20 @@ class DropoutOp {
const std::vector<TBlob> &out_grad) {
Stream<xpu> *s = ctx.get_stream<xpu>();
Tensor<xpu, 2, DType> grad = out_grad[dropout::kOut].FlatTo2D<xpu,
DType>(s);
- Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu,
DType>(s);
+ Tensor<xpu, 1, uint8_t> mask = out_data[dropout::kMask].FlatTo1D<xpu,
uint8_t>(s);
Tensor<xpu, 2, DType> gdata = in_grad[dropout::kData].FlatTo2D<xpu,
DType>(s);
DType *ingradptr = gdata.dptr_;
const DType *outgradptr = grad.dptr_;
- const DType *maskptr = mask.dptr_;
- const int count = mask.shape_[0] * mask.shape_[1];
-#pragma omp parallel for
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
- for (int i = 0; i < count; ++i) {
- ingradptr[i] = outgradptr[i] * maskptr[i];
+ const uint8_t *maskptr = mask.dptr_;
+ const index_t count = grad.shape_[0] * grad.shape_[1];
+ const float pk_1 = 1.0f / this->pkeep_;
+ const int nthr = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+#pragma omp parallel for num_threads(nthr) schedule(static, 8)
+ for (index_t i = 0; i < count; ++i) {
+ auto mask_idx = i >> 3; // div 8;
+ uint8_t mask_offset = i & 7; // mod 8
+ bool mask_val = maskptr[mask_idx] & (1U << mask_offset);
+ ingradptr[i] = outgradptr[i] * mask_val * pk_1;
Review comment:
Let's also use blocking in the backward path:
```cpp
const int blk_size = 64;
const int nblk = count / blk_size;
#pragma omp parallel for num_threads(nthr) schedule(static, 8)
for (index_t b = 0; b < nblk; ++b) {
for (index_t k = 0; k < blk_size; ++k) {
index_t i = b * blk_size + k;
auto mask_idx = i >> 3; // div 8;
uint8_t mask_offset = i & 7; // mod 8
bool mask_val = maskptr[mask_idx] & (1U << mask_offset);
ingradptr[i] = outgradptr[i] * mask_val * pk_1;
}
}
// tail
if (nblk * blk_size < count) {
for (index_t i = nblk * blk_size; i < count; ++i) {
auto mask_idx = i >> 3; // div 8;
uint8_t mask_offset = i & 7; // mod 8
bool mask_val = maskptr[mask_idx] & (1U << mask_offset);
ingradptr[i] = outgradptr[i] * mask_val * pk_1;
}
}
}
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services