apeforest commented on a change in pull request #15545: Softmax optimization for GPU
URL: https://github.com/apache/incubator-mxnet/pull/15545#discussion_r315994975
##########
File path: src/operator/nn/softmax-inl.h
##########
@@ -188,89 +180,77 @@ struct log_softmax_bwd {
}
};
-
template<typename OP1, typename OP2, int Req, bool negate,
- typename AType, typename DType, typename OType, int ndim>
+ typename AType, typename DType, typename OType, typename IType, int ndim>
inline void SoftmaxGrad(Stream<cpu> *s, OType *out, OType *ograd,
- DType *igrad, Shape<ndim> shape, int axis,
- const DType temperature) {
+ DType *igrad, IType *length, Shape<ndim> shape,
+ int axis, const DType temperature) {
index_t M = shape[axis];
index_t N = shape.Size()/M;
Shape<ndim> stride = calc_stride(shape);
Shape<ndim> sshape = shape;
sshape[axis] = 1;
index_t sa = stride[axis];
- #pragma omp parallel for
- for (index_t i = 0; i < N; ++i) {
- index_t base = unravel_dot(i, sshape, stride);
+ if (length != nullptr) {
+ #pragma omp parallel for
+ for (index_t i = 0; i < N; ++i) {
+ index_t base = unravel_dot(i, sshape, stride);
+ index_t len = static_cast<index_t>(length[i]);
- AType sum = AType(0);
- for (index_t j = 0; j < M; ++j) {
- sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]);
- }
-
- // By default temperature is 1.0.
- // Adding a branch here to save the CPU 'divide-by-1' computation at runtime
- DType final_result;
- if (temperature == 1.0) {
- for (index_t j = 0; j < M; ++j) {
- final_result = negate ?
- -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) :
- OP2::Map(ograd[base + j*sa], out[base + j*sa], sum);
- KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result);
- }
- } else {
- for (index_t j = 0; j < M; ++j) {
- final_result = negate ?
- -OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature :
- OP2::Map(ograd[base + j*sa], out[base + j*sa], sum) / temperature;
- KERNEL_ASSIGN(igrad[base + j*sa], Req, final_result);
+ AType sum = AType(0);
+ for (index_t j = 0; j < len; ++j) {
+ sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]);
}
- }
- }
-}
-template<typename OP1, typename OP2, int Req, bool negate,
- typename AType, typename DType, typename OType, typename IType, int ndim>
-inline void SoftmaxWithLengthGrad(Stream<cpu> *s, OType *out, OType *ograd,
- DType *igrad, IType *length, Shape<ndim> shape,
- int axis, const DType temperature) {
- index_t M = shape[axis];
- index_t N = shape.Size()/M;
- Shape<ndim> stride = calc_stride(shape);
- Shape<ndim> sshape = shape;
- sshape[axis] = 1;
- index_t sa = stride[axis];
-
- #pragma omp parallel for
- for (index_t i = 0; i < N; ++i) {
- index_t base = unravel_dot(i, sshape, stride);
- index_t len = static_cast<index_t>(length[i]);
-
- AType sum = AType(0);
- for (index_t j = 0; j < len; ++j) {
- sum += OP1::Map(ograd[base + j*sa], out[base + j*sa]);
+ // By default temperature is 1.0.
+ // Adding a branch here to save the CPU 'divide-by-1' computation at runtime
+ DType final_result;
+ if (temperature == 1.0) {
Review comment:
Yes, I did a performance comparison earlier. This check speeds up the
operator by about 30%.
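
For readers following along, here is a minimal standalone sketch of the branch
being discussed, reduced to plain softmax_bwd math
(igrad = out * (ograd - sum) / temperature). The helper name softmax_grad_row
and the omission of the OP1/OP2 functors, Req, negate and the strided indexing
are simplifications for illustration, not the PR's actual code.

// Sketch only: softmax backward over one row, with the 'divide-by-1' fast path.
#include <cstddef>
#include <cstdio>
#include <vector>

template <typename DType>
void softmax_grad_row(const DType* out, const DType* ograd, DType* igrad,
                      std::size_t len, DType temperature) {
  // sum_j ograd[j] * out[j], the reduction OP1 performs in the operator above
  DType sum = DType(0);
  for (std::size_t j = 0; j < len; ++j) sum += ograd[j] * out[j];

  if (temperature == DType(1)) {
    // Fast path: temperature defaults to 1.0, so skip the per-element divide.
    for (std::size_t j = 0; j < len; ++j)
      igrad[j] = out[j] * (ograd[j] - sum);
  } else {
    for (std::size_t j = 0; j < len; ++j)
      igrad[j] = out[j] * (ograd[j] - sum) / temperature;
  }
}

int main() {
  std::vector<float> out   = {0.1f, 0.2f, 0.7f};   // softmax output of one row
  std::vector<float> ograd = {1.0f, 0.0f, 0.0f};   // incoming gradient
  std::vector<float> igrad(out.size());
  softmax_grad_row(out.data(), ograd.data(), igrad.data(), out.size(), 1.0f);
  for (float g : igrad) std::printf("%f\n", g);
  return 0;
}

The fast path simply avoids dividing every element by 1.0 when temperature is
at its default, which is where the reported ~30% speedup on this operator
comes from.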
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services