TaoLv commented on a change in pull request #16735: Use single-bit for mask in
dropout operator
URL: https://github.com/apache/incubator-mxnet/pull/16735#discussion_r377497299
##########
File path: src/operator/nn/dropout-inl.h
##########
@@ -123,25 +124,33 @@ class DropoutOp {
Stream<xpu> *s = ctx.get_stream<xpu>();
RandGenerator<xpu, DType> *pgen =
ctx.requested[0].get_parallel_random<xpu, DType>();
CHECK_NOTNULL(pgen);
- Tensor<xpu, 2, DType> mask = out_data[dropout::kMask].FlatTo2D<xpu,
DType>(s);
+ Tensor<xpu, 1, uint8_t> mask = out_data[dropout::kMask].FlatTo1D<xpu,
uint8_t>(s);
Tensor<xpu, 2, DType> data = in_data[dropout::kData].FlatTo2D<xpu,
DType>(s);
Tensor<xpu, 2, DType> out = out_data[dropout::kOut].FlatTo2D<xpu,
DType>(s);
DType *outptr = out.dptr_;
DType *dataptr = data.dptr_;
- auto maskptr = reinterpret_cast<int *>(mask.dptr_);
- int count = mask.shape_[0] * mask.shape_[1];
- if (sizeof(DType) > sizeof(int)) {
- // allocating new buffer to avoiding memory overlapping between
`mask.dptr_` and `maskptr`
- Tensor<xpu, 1, int> temp = ctx.requested[1].get_space_typed<xpu, 1,
int>(Shape1(count), s);
- maskptr = temp.dptr_;
- }
- BernoulliGenerate(*pgen, count, this->pkeep_, maskptr);
+
+ index_t count = data.shape_[0] * data.shape_[1];
+ // allocating buffer for MKL routine to calculate int32 based maskptr
+ Tensor<xpu, 1, int> temp_space =
+ ctx.requested[1].get_space_typed<xpu, 1, int>(Shape1(count), s);
+ auto mkl_mask = temp_space.dptr_;
+
+ BernoulliGenerate(*pgen, count, this->pkeep_, mkl_mask);
const float pk_1 = 1.0f / this->pkeep_;
-#pragma omp parallel for
num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
- for (int i = 0; i < count; ++i) {
- const DType maskVal = static_cast<DType>(maskptr[i]) * pk_1;
- outptr[i] = dataptr[i] * maskVal;
- mask.dptr_[i] = maskVal;
+ const int nthr = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+#pragma omp parallel for num_threads(nthr) schedule(static, 8)
+ for (index_t i = 0; i < count; ++i) {
+ outptr[i] = dataptr[i] * mkl_mask[i] * pk_1;
+ auto mask_idx = i >> 3; // div 8
+ uint8_t mask_offset = i & 7; // mod 8
+ if (mkl_mask[i]) {
+ // set bit
+ mask.dptr_[mask_idx] |= 1U << mask_offset;
+ } else {
+ // clear bit
+ mask.dptr_[mask_idx] &= ~(1U << mask_offset);
+ }
Review comment:
Could you try the code snippet below? Blocking might help with cache contention.
```cpp
const index_t nblk = count / 64;  // use index_t: `count` may exceed INT_MAX, and the loop compares against index_t
#pragma omp parallel for num_threads(nthr) schedule(static, 8)
for (index_t nb = 0; nb < nblk; ++nb) {
for (index_t k = 0; k < 64; ++k) {
const index_t i = nb * 64 + k;
outptr[i] = dataptr[i] * mkl_mask[i] * pk_1;
auto mask_idx = i >> 3; // div 8
uint8_t mask_offset = i & 7; // mod 8
if (mkl_mask[i]) {
// set bit
mask.dptr_[mask_idx] |= 1U << mask_offset;
} else {
// clear bit
mask.dptr_[mask_idx] &= ~(1U << mask_offset);
}
}
}
// tail
if (nblk * 64 < count) {
for (index_t i = nblk * 64; i < count; ++i) {
outptr[i] = dataptr[i] * mkl_mask[i] * pk_1;
auto mask_idx = i >> 3; // div 8
uint8_t mask_offset = i & 7; // mod 8
if (mkl_mask[i]) {
// set bit
mask.dptr_[mask_idx] |= 1U << mask_offset;
} else {
// clear bit
mask.dptr_[mask_idx] &= ~(1U << mask_offset);
}
}
}
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services