larroy commented on a change in pull request #15545: Softmax optimization for 
GPU
URL: https://github.com/apache/incubator-mxnet/pull/15545#discussion_r315953722
 
 

 ##########
 File path: src/operator/nn/softmax-inl.h
 ##########
 @@ -313,71 +294,134 @@ __global__ void softmax_compute_kernel(DType *in, OType 
*out, index_t M, int axi
 
   for (index_t i = x; i < M; i += x_size) {
     val = negate ? -in[base + i*sa] : in[base + i*sa];
-    out[base + i*sa] = OP::Map((val - smax)/static_cast<DType>(temperature), 
ssum);
+    out[base + i*sa] =
+      (i < len) ? OType(OP::Map((val - smax)/static_cast<DType>(temperature), 
ssum)) : OType(0.0f);
   }
 }
 
-template<typename OP, bool negate, typename AType, typename DType, typename 
OType, int ndim>
-inline void Softmax(Stream<gpu> *s, DType *in, OType *out,
-                    Shape<ndim> shape, int axis, const double temperature) {
-  const int x_bits = 7;
-  const int x_size = 1 << x_bits;
-  index_t M = shape[axis];
-  index_t N = shape.Size()/M;
-  Shape<ndim> stride = calc_stride(shape);
-  Shape<ndim> sshape = shape;
-  sshape[axis] = 1;
+const int softmax_threads_per_block = 512;
+
+template<typename OP, bool negate, typename AType, typename LType,
+  typename DType, typename OType, typename IType>
+__global__ void softmax_stride1_compute_kernel(const DType *in, OType *out, 
IType *length,
+                                               const index_t M, const double 
temperature,
+                                               const int rows_per_block, const 
index_t total_rows) {
+  __shared__ AType scratch[softmax_threads_per_block];
+  __shared__ LType persistent_storage[20 * 1024 / sizeof(LType)];
+  const int warp_size = 32;
+  const int threads_per_row = softmax_threads_per_block / rows_per_block;
+  const int my_local_row = threadIdx.x / threads_per_row;
+  const int my_row = blockIdx.x * rows_per_block + my_local_row;
+  if (my_row >= total_rows) return;
+  const int my_id = threadIdx.x % threads_per_row;
+  const int entries_per_load = sizeof(LType)/sizeof(DType);
+  const index_t len = length == nullptr ? M : 
static_cast<index_t>(length[my_row]);
+  // Due to usage of MSHADOW_TYPE_SWITCH macro we are generating
+  // kernels where sizeof(LType) may be less than sizeof(DType),
+  // resulting in entries_per_load being 0.
+  // This is not a valid combination and is being checked against
+  // in the launcher code. This switch here is just to silence
+  // the division by zero warning generated for such invalid cases.
+  const int row_length = entries_per_load > 0 ? M / entries_per_load : 0;
+
+  const LType* in_aligned = reinterpret_cast<const LType*>(in);
+  size_t base = my_row * row_length;
+
+  for (index_t i = my_id; i < row_length; i += threads_per_row) {
+    persistent_storage[my_local_row * row_length + i] = in_aligned[base + i];
+  }
+  DType * row = reinterpret_cast<DType *>(persistent_storage + my_local_row * 
row_length);
+  __syncthreads();
 
-  softmax_compute_kernel<x_bits, OP, negate, AType, ndim>
-    <<<N, x_size, 0, mshadow::Stream<gpu>::GetStream(s)>>>(
-      in, out, M, axis, sshape, stride, temperature);
-  MSHADOW_CUDA_POST_KERNEL_CHECK(softmax_compute_kernel);
-}
+  DType my_max_value;
 
 Review comment:
   Can we add a comment or maybe a more descriptive name? Is this the maximum
over the stride?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to