chrishkchris commented on issue #555: SINGA-490 Optimization on GPU Malloc and Cudastream URL: https://github.com/apache/singa/pull/555#issuecomment-553281211 Double checking of cuda codes (I removed the cuda memset, so I need to check everything related to cuda) I have to check three files: 1. tensor.cc, 2. tenor_math_cuda.h, 3. math_kernal.cu 1. tensor.cc: check if (i) after a new tensor is created (ii) output tensor, their values are read before any assignments For examples: (a) SumColumns and SumRows are based on Mult on the input M (or its transpose T) and "one", while "one" is initialized with setValue. ```cpp void SumColumns(const Tensor &M, Tensor *v) { if (M.transpose()) { Tensor X = Transpose(M); SumRows(X, v); } else { CHECK_EQ(M.nDim(), 2u); // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1); CHECK_EQ(nb_row, v->Size()); Tensor one(Shape{nb_col}, M.device(), M.data_type()); one.SetValue(1.0f); // TODO(wangwei) cast type Mult(M, one, v); } } void SumRows(const Tensor &M, Tensor *v) { if (M.transpose()) { Tensor X = Transpose(M); SumColumns(X, v); } else { CHECK_EQ(M.nDim(), 2u); // CHECK_EQ(v->nDim(), 1u); (chonho) shape of v is 2-element tuple size_t nb_row = M.shape(0), nb_col = M.shape(1); CHECK_EQ(nb_col, v->Size()); Tensor one(Shape{nb_row}, M.device(), M.data_type()); one.SetValue(1.0f); // TODO(wangwei) cast type Tensor X = Transpose(M); Mult(X, one, v); } } ``` (b) Sum is based on SumRows and SumColumns, where the created tensor out is used as output only (no reading) ```cpp Tensor Sum(const Tensor &M, int axis) { if (axis == 0) { Tensor out(Shape{M.shape(1)}, M.device(), M.data_type()); SumRows(M, &out); return out; } else { CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis; Tensor out(Shape{M.shape(0)}, M.device(), M.data_type()); SumColumns(M, &out); return out; } } ``` (c) Mult of tensors A and tensor B: It creates a output tensor out and pass it to the GEMV or GEMM defined in tenor_math_cuda.h ```cpp Tensor Mult(const Tensor &A, const Tensor &B) { Shape s; s.push_back(A.shape(0)); if (B.nDim() == 2) s.push_back(B.shape(1)); Tensor out(s, A.device(), A.data_type()); Mult(A, B, &out); return out; } void Mult(const Tensor &A, const Tensor &B, Tensor *out) { Mult(1.0f, A, B, 0.0f, out); } template <typename SType> void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta, Tensor *C) { CHECK_EQ(A.shape().size(), 2u); if (B.nDim() == 1u) { TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { auto a = TypeCast<SType, DType>(alpha); auto b = TypeCast<SType, DType>(beta); C->device()->Exec([a, A, b, B, C](Context * ctx) { GEMV<DType, Lang>(a, A, B, b, C, ctx); }, {A.block(), B.block()}, {C->block()}); }); } else { CHECK(!C->transpose()); TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, { auto a = TypeCast<SType, DType>(alpha); auto b = TypeCast<SType, DType>(beta); C->device()->Exec([a, A, b, B, C](Context * ctx) { GEMM<DType, Lang>(a, A, B, b, C, ctx); }, {A.block(), B.block()}, {C->block()}); }); } } ``` 2. tenor_math_cuda.h: check if (i) after a new tensor is created (ii) output tensor, their values are read before any assignments For examples: (a) Two tensors EltwiseMult: New tensor t is created but passed into cudnn transform as output (to store the transform of input), as well as the original output pointer. ```cpp void EltwiseMult<float, lang::Cuda>(const Tensor& in1, const Tensor& in2, Tensor* out, Context* ctx) { const float* inPtr1 = static_cast<const float*>(in1.block()->data()); const float* inPtr2 = static_cast<const float*>(in2.block()->data()); float* outPtr = static_cast<float*>(out->block()->mutable_data()); const size_t num = in1.Size(); //if both in1 and in2 are not transposed, and have the same strides, //we proceed to normal cuda::mult if (!in1.transpose() && !in2.transpose() && (in1.stride() == in2.stride())) { cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream); } else { //else we check whether in1 or in2 or both are transposed if (in1.transpose() && in2.transpose()) { Tensor t(in1.shape(), in1.device(), in1.data_type()); Transform<float, lang::Cuda>(in1, &t, ctx); Transform<float, lang::Cuda>(in2, out, ctx); float* tPtr = static_cast<float*>(t.block()->mutable_data()); cuda::mult(num, tPtr, outPtr, outPtr, ctx->stream); } else if (in1.transpose()) { Transform<float, lang::Cuda>(in1, out, ctx); cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream); } else if (in2.transpose()) { Transform<float, lang::Cuda>(in2, out, ctx); cuda::mult(num, inPtr1, outPtr, outPtr, ctx->stream); } } } ``` (b) One tensor one constant EltwiseMult: Pass it to cuda kernal (math_kernal.cu) directly ```cpp /// out = in * x template <> void EltwiseMult<float, lang::Cuda>(const Tensor& in, const float x, Tensor* out, Context* ctx) { const float* inPtr = static_cast<const float*>(in.block()->data()); float* outPtr = static_cast<float*>(out->block()->mutable_data()); const size_t num = in.Size(); cuda::mult(num, inPtr, x, outPtr, ctx->stream); } ``` 3. math_kernal.cu: check if any cuda kernel read from the output value before assignment For examples: (a) cuda sum column (I think this sum column is not in use anymore, while using the general one in tensor.cc line 1162 based on Mult instead): the output is set 0 before adding all the columns up ```cpp __global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, int rows, int cols, int stride) { int index = blockIdx.x * blockDim.x + threadIdx.x; int num_threads = blockDim.x * gridDim.x; for (; index < rows; index += num_threads) { dst_vec_data[index] = 0.0f; for (int k = 0; k < cols; k++) { dst_vec_data[index] += src_mat_data[index * stride + k]; } } } ``` (b) cuda sum row (I think this sum column is not in use anymore, while using the general one in tensor.cc line 1147 based on Mult instead): the output is set to the aux local variable after the calculation is completed ```cpp __global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data, int rows, int cols, int stride) { int j = blockIdx.x; int THREADS = blockDim.x; if (j >= cols) { return; } __shared__ float aux[CU1DBLOCK]; int steps = (rows - 1) / THREADS + 1; aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride]; for (int i = 1; i < steps; ++i) { if (threadIdx.x + i * THREADS < rows) { aux[threadIdx.x] += src_mat_data[j + (threadIdx.x + i * THREADS) * stride]; } } int total_threads = THREADS; __syncthreads(); while (total_threads > 1) { int half_point = ((1 + total_threads) >> 1); if (threadIdx.x < half_point) { if (threadIdx.x + half_point < total_threads) { aux[threadIdx.x] += aux[threadIdx.x + half_point]; } } __syncthreads(); total_threads = ((total_threads + 1) >> 1); } __syncthreads(); dst_vec_data[j] = aux[0]; } ``` (c) KernelComputeCrossEntropy: loss is set 0 before using ```cpp __global__ void KernelComputeCrossEntropy(const bool int_target, const size_t batchsize, const size_t dim, const float *p, const int *t, float *loss) { size_t sample = blockIdx.x * blockDim.x + threadIdx.x; size_t num_threads = blockDim.x * gridDim.x; if (int_target) { for (; sample < batchsize; sample += num_threads) { float prob_of_truth = p[sample * dim + t[sample]]; loss[sample] = -std::log(max(prob_of_truth, FLT_MIN)); } } else { for (; sample < batchsize; sample += num_threads) { float sum = 0.f; for (size_t j = 0; j < dim; j++) { sum += t[sample * dim + j]; } loss[sample] = 0; for (size_t j = 0, offset = sample * dim; j < dim; j++, offset++) { loss[sample] -= t[offset] / sum * std::log(max(p[offset], FLT_MIN)); } } } } ``` Results: new values are assigned to the output, while there is no read from output before assignment
---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services