SINGA-100 Implement layers using CUDNN for GPU training

Fix a bug: a test failed when running "make test", caused by src/test/test_math.cc:349.
Modified file: math_kernel.cu. singa_gpu_sum_row() and singa_gpu_sum_col() set
threads_per_block and num_blocks to the wrong values.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/05680dd1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/05680dd1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/05680dd1

Branch: refs/heads/master
Commit: 05680dd174297cd7be8447aacf334299724c6575
Parents: 15b23a6
Author: seaok <[email protected]>
Authored: Fri Dec 11 17:21:07 2015 +0800
Committer: seaok <[email protected]>
Committed: Fri Dec 11 17:21:07 2015 +0800

----------------------------------------------------------------------
 src/test/test_math.cc    | 7 ++++---
 src/utils/math_kernel.cu | 8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05680dd1/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index 0b9f0ff..6583b3d 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -313,14 +313,15 @@ TEST(MathTest, TestDotGPU) {
   cudaFree(B_gpu);
 }
 
-TEST(MathTest, TestSingaSumColGPU) {
+TEST(MathTest, TestSingaSumRowGPU) {
   float A[3][4];
   float B[4];
   float C[4];
 
   for (int i = 0; i < 3; i++) {
     for (int j = 0; j < 4; j++) {
-      A[i][j] = i + j;
+      //A[i][j] = i + j;
+      A[i][j] = 1.0f;
     }
   }
 
@@ -335,7 +336,7 @@ TEST(MathTest, TestSingaSumColGPU) {
   cudaMalloc(reinterpret_cast<void**>(&A_gpu), 12*sizeof(float));
   cudaMalloc(reinterpret_cast<void**>(&B_gpu), 4*sizeof(float));
   cudaMemcpy(A_gpu, A, 12*sizeof(float), cudaMemcpyHostToDevice);
-  //singa_gpu_sum_row(A_gpu, B_gpu, 3, 4, 4);
+  singa_gpu_sum_row(A_gpu, B_gpu, 3, 4, 4);
 
   cudaMemcpy(B, B_gpu, 4*sizeof(float), cudaMemcpyDeviceToHost);
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/05680dd1/src/utils/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/utils/math_kernel.cu b/src/utils/math_kernel.cu
index 3650c09..fe543d7 100644
--- a/src/utils/math_kernel.cu
+++ b/src/utils/math_kernel.cu
@@ -339,21 +339,21 @@ void singa_gpu_sum_vec(float *data, float *sum , int n) {
   kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n);
 }
 
-void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data,
+void singa_gpu_sum_row(const float *src_mat_data, float *dst_vec_data,
     int rows, int cols, int stride) {
   int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
   int num_blocks = cols;
 
-  kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data,
+  kernel_sum_row<<<num_blocks, threads_per_block>>>(src_mat_data,
       dst_vec_data, rows, cols, stride);
 }
 
-void singa_gpu_sum_row(const float *src_mat_data, float *dst_vec_data,
+void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data,
     int rows, int cols, int stride) {
   int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
   int num_blocks = rows;
 
-  kernel_sum_row<<<num_blocks, threads_per_block>>>(src_mat_data,
+  kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data,
       dst_vec_data, rows, cols, stride);
 }
 
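----------------------------------------------------------------------
Illustrative note: the corrected singa_gpu_sum_row() maps one block to each
output column and lets the block's threads reduce over that column's rows,
which is why threads_per_block is derived from rows and num_blocks from cols.
The standalone sketch below demonstrates that launch convention only; the
kernel body, the name sum_over_rows, and the 256-thread cap standing in for
CU1DBLOCK are assumptions of this example, not the actual kernel_sum_row in
math_kernel.cu.

#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK 256  // stands in for CU1DBLOCK in this sketch

// Sum each column of a row-major rows x cols matrix into dst[cols]:
// one block per column, the block's threads stride over the rows.
__global__ void sum_over_rows(const float *src, float *dst,
                              int rows, int cols, int stride) {
  __shared__ float partial[BLOCK];
  int col = blockIdx.x;                  // this block's output column
  float acc = 0.0f;
  for (int r = threadIdx.x; r < rows; r += blockDim.x)
    acc += src[r * stride + col];        // accumulate this thread's rows
  partial[threadIdx.x] = acc;
  __syncthreads();
  if (threadIdx.x == 0) {                // thread 0 folds the partial sums
    float total = 0.0f;
    for (int t = 0; t < blockDim.x; ++t)
      total += partial[t];
    dst[col] = total;
  }
}

int main() {
  const int rows = 3, cols = 4, stride = 4;   // same shape as the test
  float A[rows * cols], B[cols];
  for (int i = 0; i < rows * cols; ++i)
    A[i] = 1.0f;                              // same data as the fixed test

  float *A_gpu = nullptr, *B_gpu = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&A_gpu), sizeof(A));
  cudaMalloc(reinterpret_cast<void**>(&B_gpu), sizeof(B));
  cudaMemcpy(A_gpu, A, sizeof(A), cudaMemcpyHostToDevice);

  // The corrected configuration: threads cover the reduced dimension (rows),
  // blocks cover the output dimension (cols).
  int threads_per_block = rows > BLOCK ? BLOCK : rows;
  int num_blocks = cols;
  sum_over_rows<<<num_blocks, threads_per_block>>>(A_gpu, B_gpu,
      rows, cols, stride);

  cudaMemcpy(B, B_gpu, sizeof(B), cudaMemcpyDeviceToHost);
  for (int j = 0; j < cols; ++j)
    printf("B[%d] = %f\n", j, B[j]);          // each column sums to 3.0
  cudaFree(A_gpu);
  cudaFree(B_gpu);
  return 0;
}
----------------------------------------------------------------------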
