[GitHub] [singa] chrishkchris commented on issue #555: SINGA-490 Optimization on GPU Malloc and Cudastream

GitBox Tue, 12 Nov 2019 23:49:18 -0800

chrishkchris commented on issue #555: SINGA-490 Optimization on GPU Malloc and 
Cudastream
URL: https://github.com/apache/singa/pull/555#issuecomment-553281211
 
 
   Double checking of cuda codes (I removed the cuda memset, so I need to check 
everything related to cuda)
   
   I have to check three files: 1. tensor.cc, 2. tenor_math_cuda.h, 3. 
math_kernal.cu
   
   1. tensor.cc: check if (i) after a new tensor is created (ii) output tensor, 
their values are read before any assignments
   
   For examples:
   
   (a) SumColumns and SumRows are based on Mult on the input M (or its 
transpose T) and "one", while "one" is initialized with setValue.
   
   ```cpp
   void SumColumns(const Tensor &M, Tensor *v) {
     if (M.transpose()) {
       Tensor X = Transpose(M);
       SumRows(X, v);
     } else {
       CHECK_EQ(M.nDim(), 2u);
       // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
       size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
       CHECK_EQ(nb_row, v->Size());
   
       Tensor one(Shape{nb_col}, M.device(), M.data_type());
       one.SetValue(1.0f);  // TODO(wangwei) cast type
       Mult(M, one, v);
     }
   }
   void SumRows(const Tensor &M, Tensor *v) {
     if (M.transpose()) {
       Tensor X = Transpose(M);
       SumColumns(X, v);
     } else {
       CHECK_EQ(M.nDim(), 2u);
       // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple
       size_t nb_row = M.shape(0), nb_col = M.shape(1);
       CHECK_EQ(nb_col, v->Size());
   
       Tensor one(Shape{nb_row}, M.device(), M.data_type());
       one.SetValue(1.0f);  // TODO(wangwei) cast type
       Tensor X = Transpose(M);
       Mult(X, one, v);
     }
   }
   ```
   
   (b) Sum is based on SumRows and SumColumns, where the created tensor out is 
used as output only (no reading)
   
   ```cpp
   Tensor Sum(const Tensor &M, int axis) {
     if (axis == 0) {
       Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
       SumRows(M, &out);
       return out;
     } else {
       CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
       Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
       SumColumns(M, &out);
       return out;
     }
   }
   ```
   
   
   (c) Mult of tensors A and tensor B: It creates a output tensor out and pass 
it to the GEMV or GEMM defined in tenor_math_cuda.h
   
   ```cpp
   Tensor Mult(const Tensor &A, const Tensor &B) {
     Shape s;
     s.push_back(A.shape(0));
     if (B.nDim() == 2) s.push_back(B.shape(1));
     Tensor out(s, A.device(), A.data_type());
     Mult(A, B, &out);
     return out;
   }
   
   void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
     Mult(1.0f, A, B, 0.0f, out);
   }
   
   template <typename SType>
   void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType 
beta,
             Tensor *C) {
     CHECK_EQ(A.shape().size(), 2u);
     if (B.nDim() == 1u) {
       TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
         auto a = TypeCast<SType, DType>(alpha);
         auto b = TypeCast<SType, DType>(beta);
         C->device()->Exec([a, A, b, B, C](Context * ctx) {
           GEMV<DType, Lang>(a, A, B, b, C, ctx);
         }, {A.block(), B.block()}, {C->block()});
       });
     } else {
       CHECK(!C->transpose());
       TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
         auto a = TypeCast<SType, DType>(alpha);
         auto b = TypeCast<SType, DType>(beta);
         C->device()->Exec([a, A, b, B, C](Context * ctx) {
           GEMM<DType, Lang>(a, A, B, b, C,
           ctx);
         }, {A.block(), B.block()}, {C->block()});
       });
     }
   }
   ```
   
   2. tenor_math_cuda.h: check if (i) after a new tensor is created (ii) output 
tensor, their values are read before any assignments
   
   For examples: 
   
   (a) Two tensors EltwiseMult: New tensor t is created but passed into cudnn 
transform as output (to store the transform of input), as well as the original 
output pointer.
   
   ```cpp
   void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
                                       const Tensor& in2, Tensor* out,
                                       Context* ctx) {
     const float* inPtr1 = static_cast<const float*>(in1.block()->data());
     const float* inPtr2 = static_cast<const float*>(in2.block()->data());
     float* outPtr = static_cast<float*>(out->block()->mutable_data());
     const size_t num = in1.Size();
   
     //if both in1 and in2 are not transposed, and have the same strides,
     //we proceed to normal cuda::mult
     if (!in1.transpose() && !in2.transpose() && (in1.stride() == 
in2.stride())) {
       cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
     } else { //else we check whether in1 or in2 or both are transposed
       if (in1.transpose() && in2.transpose()) {
         Tensor t(in1.shape(), in1.device(), in1.data_type());
         Transform<float, lang::Cuda>(in1, &t, ctx);
         Transform<float, lang::Cuda>(in2, out, ctx);
         float* tPtr = static_cast<float*>(t.block()->mutable_data());
         cuda::mult(num, tPtr, outPtr, outPtr, ctx->stream);
       } else if (in1.transpose()) {
         Transform<float, lang::Cuda>(in1, out, ctx);
         cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
       } else if (in2.transpose()) {
         Transform<float, lang::Cuda>(in2, out, ctx);
         cuda::mult(num, inPtr1, outPtr, outPtr, ctx->stream);
       }
     }
   }
   ```
   
   (b) One tensor one constant EltwiseMult: Pass it to cuda kernal 
(math_kernal.cu) directly
   
   ```cpp
   /// out = in * x
   template <>
   void EltwiseMult<float, lang::Cuda>(const Tensor& in,
                                       const float x, Tensor* out, Context* 
ctx) {
     const float* inPtr = static_cast<const float*>(in.block()->data());
     float* outPtr = static_cast<float*>(out->block()->mutable_data());
     const size_t num = in.Size();
     cuda::mult(num, inPtr, x, outPtr, ctx->stream);
   }
   
   ```
   3. math_kernal.cu: check if any cuda kernel read from the output value 
before assignment
   
   For examples: 
   
   (a) cuda sum column (I think this sum column is not in use anymore, while 
using the general one in tensor.cc line 1162 based on Mult instead): the output 
is set 0 before adding all the columns up
    
   ```cpp
   __global__ void kernel_sum_col(const float *src_mat_data, float 
*dst_vec_data,
                                  int rows, int cols, int stride) {
     int index = blockIdx.x * blockDim.x + threadIdx.x;
     int num_threads = blockDim.x * gridDim.x;
     for (; index < rows; index += num_threads) {
       dst_vec_data[index] = 0.0f;
       for (int k = 0; k < cols; k++) {
         dst_vec_data[index] += src_mat_data[index * stride + k];
       }
     }
   }
   ```
   
   (b) cuda sum row (I think this sum column is not in use anymore, while using 
the general one in tensor.cc line 1147 based on Mult instead): the output is 
set to the aux local variable after the calculation is completed
   
   ```cpp
   __global__ void kernel_sum_row(const float *src_mat_data, float 
*dst_vec_data,
                                  int rows, int cols, int stride) {
     int j = blockIdx.x;
     int THREADS = blockDim.x;
     if (j >= cols) {
       return;
     }
   
     __shared__ float aux[CU1DBLOCK];
     int steps = (rows - 1) / THREADS + 1;
     aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
     for (int i = 1; i < steps; ++i) {
       if (threadIdx.x + i * THREADS < rows) {
         aux[threadIdx.x] +=
             src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
       }
     }
   
     int total_threads = THREADS;
     __syncthreads();
     while (total_threads > 1) {
       int half_point = ((1 + total_threads) >> 1);
       if (threadIdx.x < half_point) {
         if (threadIdx.x + half_point < total_threads) {
           aux[threadIdx.x] += aux[threadIdx.x + half_point];
         }
       }
       __syncthreads();
       total_threads = ((total_threads + 1) >> 1);
     }
   
     __syncthreads();
     dst_vec_data[j] = aux[0];
   }
   ```
   
   (c) KernelComputeCrossEntropy: loss is set 0 before using
   
   ```cpp
   __global__ void KernelComputeCrossEntropy(const bool int_target, const 
size_t batchsize,
                                             const size_t dim, const float *p,
                                             const int *t, float *loss) {
     size_t sample = blockIdx.x * blockDim.x + threadIdx.x;
     size_t num_threads = blockDim.x * gridDim.x;
     if (int_target) {
       for (; sample < batchsize; sample += num_threads) {
         float prob_of_truth = p[sample * dim + t[sample]];
         loss[sample] = -std::log(max(prob_of_truth, FLT_MIN));
       }
     } else {
       for (; sample < batchsize; sample += num_threads) {
         float sum = 0.f;
         for (size_t j = 0; j < dim; j++) {
           sum += t[sample * dim + j];
         }
         loss[sample] = 0;
         for (size_t j = 0, offset = sample * dim; j < dim; j++, offset++) {
           loss[sample] -= t[offset] / sum * std::log(max(p[offset], FLT_MIN));
         }
       }
     }
   }
   ```
   
   Results: new values are assigned to the output, while there is no read from 
output before assignment


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

[GitHub] [singa] chrishkchris commented on issue #555: SINGA-490 Optimization on GPU Malloc and Cudastream

Reply via email to