SINGA-80 New Blob Level and Address Level Math Operation Interface

Update the math functions for GPU. Fix a compile bug in Makefile.gpu caused by the ordering of the 32-bit and 64-bit CUDA library directories.
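As a minimal usage sketch of the new element-wise interface (it mirrors the updated TestEopGPU test below; CUDA error checking omitted), an op such as singa::op::Sigmoid is applied to device memory through the new gpu_e_f overload:

    // B = sigmoid(A), computed element-wise on the GPU.
    float A[10] = {};
    float B[10] = {};
    for (int i = 0; i < 10; i++) A[i] = i;
    float* A_gpu = NULL;
    float* B_gpu = NULL;
    cudaMalloc((void**)&A_gpu, 10 * sizeof(float));
    cudaMalloc((void**)&B_gpu, 10 * sizeof(float));
    cudaMemcpy(A_gpu, A, 10 * sizeof(float), cudaMemcpyHostToDevice);
    gpu_e_f<singa::op::Sigmoid<float>, float>(10, A_gpu, B_gpu);
    cudaMemcpy(B, B_gpu, 10 * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(A_gpu);
    cudaFree(B_gpu);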
close #74

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a65a9535
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a65a9535
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a65a9535

Branch: refs/heads/master
Commit: a65a9535e4d3df21faf0d67b7891df065a006fff
Parents: 86284f1
Author: seaok <[email protected]>
Authored: Sun Nov 15 21:35:10 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Mon Nov 16 10:18:51 2015 +0800

----------------------------------------------------------------------
 Makefile.gpu                      |   2 +-
 include/singa/utils/math_addr.h   |  12 +-
 include/singa/utils/math_kernel.h |  87 +++--
 include/singa/utils/singa_op.h    |  54 +--
 src/test/test_math.cc             |  60 +---
 src/utils/math_kernel.cu          | 590 +++++++++++++++------------------
 6 files changed, 379 insertions(+), 426 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/Makefile.gpu
----------------------------------------------------------------------
diff --git a/Makefile.gpu b/Makefile.gpu
index 9c9051c..2dc2a71 100644
--- a/Makefile.gpu
+++ b/Makefile.gpu
@@ -27,7 +27,7 @@ CUDA_DIR := /usr/local/cuda
 #CUDA_DIR :=

 # Lib folder for system and external libs. You may need to change it.
-LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib $(CUDA_DIR)/lib $(CUDA_DIR)/lib64
+LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib $(CUDA_DIR)/lib64 $(CUDA_DIR)/lib

 # Header folder for system and external libs. You may need to change it.
 INCLUDE_DIRS := $(HOME_DIR)/include ./include $(HOME_DIR)/local/include/zookeeper $(CUDA_DIR)/include
 # g++ location, should support c++11, tested with 4.8.1

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
index fe19f14..9b91e70 100644
--- a/include/singa/utils/math_addr.h
+++ b/include/singa/utils/math_addr.h
@@ -29,7 +29,7 @@ extern "C" {
 #endif
 #include "singa/utils/singa_op.h"
 #ifdef USE_GPU
-#include "cublas_v2.h"
+#include <cublas_v2.h>
 #endif

@@ -177,6 +177,16 @@ void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
 }

 template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, Dtype * B) {
+  Op::CudaMap(A, B, n);
+}
+
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype * B, Dtype * C) {
+  Op::CudaMap(A, B, C, n);
+}
+
+template<typename Op, typename Dtype>
 void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
   Op::CudaMap(alpha, A, B, n);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/include/singa/utils/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_kernel.h b/include/singa/utils/math_kernel.h
index f5d3e34..d763283 100644
--- a/include/singa/utils/math_kernel.h
+++ b/include/singa/utils/math_kernel.h
@@ -1,59 +1,78 @@
-#ifndef MATH_KERNEL_H
-#define MATH_KERNEL_H
-
-namespace singa{
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. 
See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ +#ifndef SINGA_UTILS_MATH_KERNEL_H_ +#define SINGA_UTILS_MATH_KERNEL_H_ + +namespace singa { extern "C" { - void singa_gpu_sum_vec(float *data, float *sum , long n); - - void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride); - - void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data, long rows, long cols, long stride); - - void singa_gpu_set_value(float *data, float value, long n); - - void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_sum_vec(float *data, float *sum , int n); - void singa_gpu_scale_grad(float *data, float alpha, long n); + void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, + int rows, int cols, int stride); - void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_add_vec_row(const float *src_vec_data, + const float *src_mat_data, float *des_mat_data, + int rows, int cols, int stride); - void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_exp(const float *src_data, float *des_data, int n); - void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_log(const float *src_data, float *des_data, int n); - void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_sigmoid(const float *src_data, float *des_data, int n); - void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, int n); - void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_relu(const float *src_data, float *des_data, int n); - void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_relu_grad(const float *src_data, float *des_data, int n); - void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_tanh(const float *src_data, float *des_data, int n); - void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_tanh_grad(const float *src_data, float *des_data, int n); - void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_softplus(const float *src_data, float *des_data, int n); - void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_softplus_grad(const float *src_data, float *des_data, int n); - void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_square(const float *src_data, float *des_data, int 
n); - void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_square_grad(const float *src_data, float *des_data, int n); - void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n); + void singa_gpu_sqrt(const float *src_data, float *des_data, int n); - void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + void singa_gpu_pow(const float *src_data_a, const float *src_data_b, + float *des_data, int n); - void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + void singa_gpu_mult(const float *src_data_a, const float *src_data_b, + float *des_data, int n); - void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + void singa_gpu_div(const float *src_data_a, const float *src_data_b, + float *des_data, int n); - void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n); + void singa_gpu_set_value(float *data, float value, int n); + void singa_gpu_threshold(const float *src_data, float *des_data, int n); }; -} +} // namespace singa -#endif +#endif // SINGA_UTILS_MATH_KERNEL_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/include/singa/utils/singa_op.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h index 78fe955..c934050 100644 --- a/include/singa/utils/singa_op.h +++ b/include/singa/utils/singa_op.h @@ -27,7 +27,7 @@ #ifdef USE_GPU #include <cuda_runtime.h> -#include "cublas_v2.h" +#include <cublas_v2.h> #include "singa/utils/math_kernel.h" #endif // USE_GPU @@ -44,9 +44,8 @@ struct Exp { *b = exp(a); } #ifdef USE_GPU - inline static void CudaMap(Dtype alpha, const Dtype * a, - Dtype * b, int n) { - singa::singa_gpu_exp(a, b, alpha, n); + inline static void CudaMap(const Dtype * a, Dtype * b, int n) { + singa::singa_gpu_exp(a, b, n); } #endif // USE_GPU }; @@ -59,6 +58,9 @@ struct Log { *b = log(a); } #ifdef USE_GPU + inline static void CudaMap(const Dtype * a, Dtype * b, int n) { + singa::singa_gpu_log(a, b, n); + } #endif // USE_GPU }; @@ -68,9 +70,8 @@ struct Sigmoid { *b = 1.0f / (1.0f + expf(-a)); } #ifdef USE_GPU - inline static void CudaMap(const Dtype * a, - Dtype * b, int n) { - singa::singa_gpu_sigmoid(a, b, 1, n); + inline static void CudaMap(const Dtype * a, Dtype * b, int n) { + singa::singa_gpu_sigmoid(a, b, n); } #endif // USE_GPU }; @@ -80,8 +81,8 @@ struct SigmoidGrad { *b = a * (1.0f - a); } #ifdef USE_GPU - inline static void CudaMap(Dtype alpha, const Dtype * a, Dtype * b, int n) { - singa::singa_gpu_sigmoid_grad(a, b, 1, n); + inline static void CudaMap(const Dtype * a, Dtype * b, int n) { + singa::singa_gpu_sigmoid_grad(a, b, n); } #endif // USE_GPU }; @@ -93,7 +94,7 @@ struct Relu { } #ifdef USE_GPU inline static void CudaMap(const Dtype * a, Dtype * b, int n) { - singa::singa_gpu_relu(a, b, 1, n); + singa::singa_gpu_relu(a, b, n); } #endif // USE_GPU }; @@ -105,7 +106,7 @@ struct ReluGrad { } #ifdef USE_GPU inline static void CudaMap(const Dtype * a, Dtype * b, int n) { - singa::singa_gpu_relu_grad(a, b, 1, n); + singa::singa_gpu_relu_grad(a, b, n); } #endif // USE_GPU }; @@ -117,7 +118,7 @@ struct Tanh { } #ifdef USE_GPU inline static void CudaMap(const Dtype * a, Dtype * b, int n) { - singa::singa_gpu_tanh(a, b, 1, n); + 
singa::singa_gpu_tanh(a, b, n);
   }
 #endif  // USE_GPU
 };

@@ -129,7 +130,7 @@ struct TanhGrad {
   }
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_tanh_grad(a, b, 1, n);
+    singa::singa_gpu_tanh_grad(a, b, n);
   }
 #endif  // USE_GPU
 };

@@ -141,7 +142,7 @@ struct Softplus {
   }
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_softplus(a, b, 1, n);
+    singa::singa_gpu_softplus(a, b, n);
   }
 #endif  // USE_GPU
 };

@@ -152,8 +153,7 @@ struct SoftplusGrad {
     *b = 1.0f / (1.0f + expf(-a));
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
     singa::singa_gpu_softplus_grad(a, b, n);
   }
 #endif  // USE_GPU

@@ -165,8 +165,7 @@ struct Square {
     *b = a * a;
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
     singa::singa_gpu_square(a, b, n);
   }
 #endif  // USE_GPU

@@ -178,8 +177,7 @@ struct SquareGrad {
     *b = 2 * sqrt(a);
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
-    singa::singa_gpu_square_grad(a, b, 1, n);
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
+    singa::singa_gpu_square_grad(a, b, n);
   }
 #endif  // USE_GPU

@@ -191,8 +189,7 @@ struct Sqrt {
     *b = sqrt(a);
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
     singa::singa_gpu_sqrt(a, b, n);
   }
 #endif  // USE_GPU

@@ -207,6 +204,12 @@ struct Pow {
   inline static void Map(const Dtype & a, const Dtype &b, Dtype * c) {
     *c = pow(a, b);
   }
+#ifdef USE_GPU
+  inline static void CudaMap(const Dtype * a,
+      const Dtype * b, Dtype * c, int n) {
+    singa::singa_gpu_pow(a, b, c, n);
+  }
+#endif  // USE_GPU
 };
 template<typename Dtype>
 struct Mult {
@@ -214,8 +217,9 @@ struct Mult {
     *c = a * b;
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype* a, const Dtype* b, Dtype* c, int n) {
-    singa::singa_gpu_mult(a, b, c, 1, 1, n);
+  inline static void CudaMap(const Dtype * a,
+      const Dtype * b, Dtype * c, int n) {
+    singa::singa_gpu_mult(a, b, c, n);
   }
 #endif  // USE_GPU
 };
@@ -228,7 +232,7 @@ struct Div {
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a,
       const Dtype * b, Dtype * c, int n) {
-    singa::singa_gpu_div(a, b, c, 1, 1, n);
+    singa::singa_gpu_div(a, b, c, n);
   }
 #endif  // USE_GPU
 };

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index e64d567..a5bf180 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -43,9 +43,9 @@ TEST(MathTest, TestGemvCPU) {
   float C[3] = {};
   float D[3] = {};

-  for(int i = 0; i < 4; i++)
+  for(int i = 0; i < 3; i++)
   {
-    for(int j = 0; j < 3; j++)
+    for(int j = 0; j < 4; j++)
     {
       A[j][i] = i-j + i*j;
     }
   }

@@ -153,7 +153,7 @@ TEST(MathTest, TestGemmGPU) {
   cudaMemcpy(A_gpu,A,3*2*sizeof(float),cudaMemcpyHostToDevice);
   cudaMemcpy(B_gpu,B,3*2*sizeof(float),cudaMemcpyHostToDevice);

-  gpu_gemm(A_gpu, B_gpu, 2, 2, 3 , 1, 0, true, false, C_gpu);
+  gpu_gemm<float>(A_gpu, B_gpu, 2, 2, 3 , 1, 0, true, false, C_gpu);

   cudaMemcpy(C,C_gpu,2*2*sizeof(float),cudaMemcpyDeviceToHost);

@@ -213,7 +213,7 @@ TEST(MathTest, TestGemvGPU) {
   cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice);
   cudaMemcpy(C_gpu,C,3*sizeof(float),cudaMemcpyHostToDevice);

-  gpu_gemv(A_gpu, B_gpu, 4, 3, 1, 1, true, 
C_gpu); + gpu_gemv<float>(A_gpu, B_gpu, 4, 3, 1, 1, true, C_gpu); cudaMemcpy(C,C_gpu,3*sizeof(float),cudaMemcpyDeviceToHost); @@ -262,17 +262,18 @@ TEST(MathTest, TestAxpyGPU) { cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(B_gpu,B,3*4*sizeof(float),cudaMemcpyHostToDevice); - gpu_axpy(A_gpu, 12, 2, B_gpu); + gpu_axpy<float>(A_gpu, 12, 2, B_gpu); cudaMemcpy(A,A_gpu,4*3*sizeof(float),cudaMemcpyDeviceToHost); cudaMemcpy(B,B_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost); - for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i]; + //for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i]; - for(int i = 0; i < 3; i++) + for(int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) + for(int j = 0; j < 3; j++) { + D[i][j] += C[i][j]; ASSERT_EQ(B[i][j],D[i][j]); } } @@ -300,7 +301,7 @@ TEST(MathTest, TestDotGPU) { cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(B_gpu,B,12*sizeof(float),cudaMemcpyHostToDevice); - float gpu_ret=gpu_dot(A_gpu,B_gpu,12); + float gpu_ret=gpu_dot<float>(A_gpu,B_gpu,12); float cpu_ret=0.0f; for(int i = 0; i < 12; i++) @@ -418,7 +419,6 @@ TEST(MathTest, TestSingaSetValueGPU) { float A[3][4]; float* A_gpu=NULL; - float* B_gpu=NULL; cudaMalloc((void**)&A_gpu, 3*4*sizeof(float)); @@ -444,59 +444,25 @@ TEST(MathTest, TestEopGPU) { float A[10] = {}; float B[10] = {}; - float C[10] = {}; - float D[10] = {}; - float O[10] = {}; for(int i = 0; i < 10; i++) { A[i] = i; B[i] = -i; - C[i] = i; - O[i] = 0.0f; - } float* A_gpu=NULL; float* B_gpu=NULL; - float* C_gpu=NULL; - float* O_gpu=NULL; cudaMalloc((void**)&A_gpu, 10*sizeof(float)); cudaMalloc((void**)&B_gpu, 10*sizeof(float)); - cudaMalloc((void**)&C_gpu, 10*sizeof(float)); - cudaMalloc((void**)&O_gpu, 10*sizeof(float)); cudaMemcpy(A_gpu,A,10*sizeof(float),cudaMemcpyHostToDevice); cudaMemcpy(B_gpu,B,10*sizeof(float),cudaMemcpyHostToDevice); - cudaMemcpy(C_gpu,C,10*sizeof(float),cudaMemcpyHostToDevice); - cudaMemcpy(O_gpu,O,10*sizeof(float),cudaMemcpyHostToDevice); - - gpu_e_f<singa::op::Set>(5, 15, O_gpu); - cudaMemcpy(O,O_gpu,10*sizeof(float),cudaMemcpyDeviceToHost); - - for(int i = 0; i < 5; i++) - { - ASSERT_EQ(O[i]-15,0); - } - for(int i = 5; i < 10; i++) - { - ASSERT_EQ(O[i],0); - } - gpu_e_f<singa::op::Scale>(10, C_gpu, 2, C_gpu); - cudaMemcpy(C,C_gpu,10*sizeof(float),cudaMemcpyDeviceToHost); - for(int i = 0; i < 10; i++) - { - ASSERT_EQ(C[i]-2*i,0); - } - - gpu_e_f<singa::op::Add>(10, A_gpu, B_gpu, 0, 0, O_gpu); - cudaMemcpy(O,O_gpu,10*sizeof(float),cudaMemcpyDeviceToHost); + gpu_e_f<singa::op::Sigmoid<float>, float>(10, A_gpu, B_gpu); - for(int i = 0; i < 10; i++) - { - ASSERT_EQ(O[i],0); - } + cudaFree(A_gpu); + cudaFree(B_gpu); } #endif // USE_GPU http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/src/utils/math_kernel.cu ---------------------------------------------------------------------- diff --git a/src/utils/math_kernel.cu b/src/utils/math_kernel.cu index 203f261..4dd91e0 100644 --- a/src/utils/math_kernel.cu +++ b/src/utils/math_kernel.cu @@ -1,4 +1,25 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. 
You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ #include <cmath> +#include <algorithm> #include "singa/utils/math_kernel.h" #define CU2DBLOCK_X 32 @@ -7,433 +28,366 @@ #define CU1DBLOCK 1024 #define CU1DBLOCKF 1024.0 - -//Cuda Kernel Functions +// Cuda Kernel Functions __global__ -void kernel_sum_vec(float *data, float *sum , long n) -{ - int THREADS = blockDim.x; - - __shared__ float aux[CU1DBLOCK]; - int steps = (n - 1) / THREADS + 1; - aux[threadIdx.x] = data[threadIdx.x]; - - for(int i=1; i<steps; ++i) { - if(threadIdx.x+i*THREADS < n) { - aux[threadIdx.x] += data[threadIdx.x+i*THREADS]; - } - } - - int total_threads = THREADS; - __syncthreads(); - - while(total_threads > 1) { - int half_point = ((1+total_threads) >> 1); - if (threadIdx.x < half_point) { - if(threadIdx.x+half_point < total_threads) { - aux[threadIdx.x] += aux[threadIdx.x + half_point]; - } - } - __syncthreads(); - total_threads = ((total_threads+1) >> 1); - } - - __syncthreads(); - *sum = aux[0]; -} +void kernel_sum_vec(float *data, float *sum , int n) { + int THREADS = blockDim.x; -__global__ -void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) -{ - int j = blockIdx.x; - int THREADS = blockDim.x; - if(j >= cols) { - return; - } - - __shared__ float aux[CU1DBLOCK]; - int steps = (rows - 1) / THREADS + 1; - aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride]; - for(int i=1; i<steps; ++i) { - if(threadIdx.x+i*THREADS < rows) { - aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride]; - } - } - - int total_threads = THREADS; - __syncthreads(); - while(total_threads > 1) { - int half_point = ((1+total_threads) >> 1); - if (threadIdx.x < half_point) { - if(threadIdx.x+half_point < total_threads) { - aux[threadIdx.x] += aux[threadIdx.x + half_point]; - } - } - __syncthreads(); - total_threads = ((total_threads+1) >> 1); - } - - __syncthreads(); - dst_vec_data[j] = aux[0]; -} + __shared__ float aux[CU1DBLOCK]; + int steps = (n - 1) / THREADS + 1; + aux[threadIdx.x] = data[threadIdx.x]; -__global__ -void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, float* des_mat_data,long rows, long cols, long stride) -{ - long i = blockIdx.x * blockDim.x + threadIdx.x; - long j = blockIdx.y * blockDim.y + threadIdx.y; - long num_threads_x = blockDim.x * gridDim.x; - long num_threads_y = blockDim.y * gridDim.y; - long index = 0; - for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) { - index = j * stride + i; - des_mat_data[index] = src_mat_data[index] + src_vec_data[i]; - } -} + for (int i = 1; i < steps; ++i) { + if (threadIdx.x + i * THREADS < n) { + aux[threadIdx.x] += data[threadIdx.x+i*THREADS]; + } + } -__global__ static -void kernel_set_value(float *data, float value, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - data[index] = value; - } + int total_threads = THREADS; + __syncthreads(); + + while (total_threads > 1) { + int half_point = ((1+total_threads) >> 1); + if (threadIdx.x < half_point) { + if 
(threadIdx.x+half_point < total_threads) { + aux[threadIdx.x] += aux[threadIdx.x + half_point]; + } + } + __syncthreads(); + total_threads = ((total_threads+1) >> 1); + } + + __syncthreads(); + *sum = aux[0]; } __global__ -void kernel_scale(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] * alpha; - } +void kernel_sum_col(const float *src_mat_data, + float *dst_vec_data, int rows, int cols, int stride) { + int j = blockIdx.x; + int THREADS = blockDim.x; + if (j >= cols) { + return; + } + + __shared__ float aux[CU1DBLOCK]; + int steps = (rows - 1) / THREADS + 1; + aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride]; + for (int i = 1; i < steps; ++i) { + if (threadIdx.x+i*THREADS < rows) { + aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride]; + } + } + + int total_threads = THREADS; + __syncthreads(); + while (total_threads > 1) { + int half_point = ((1+total_threads) >> 1); + if (threadIdx.x < half_point) { + if (threadIdx.x+half_point < total_threads) { + aux[threadIdx.x] += aux[threadIdx.x + half_point]; + } + } + __syncthreads(); + total_threads = ((total_threads+1) >> 1); + } + + __syncthreads(); + dst_vec_data[j] = aux[0]; } __global__ -void kernel_scale_grad(float *data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - data[index] = alpha; - } +void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, + float* des_mat_data, int rows, int cols, int stride) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int num_threads_x = blockDim.x * gridDim.x; + int num_threads_y = blockDim.y * gridDim.y; + int index = 0; + for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) { + index = j * stride + i; + des_mat_data[index] = src_mat_data[index] + src_vec_data[i]; + } } __global__ -void kernel_exp(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = pow(-src_data[index],alpha); - } +void kernel_exp(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = exp(src_data[index]); + } } __global__ -void kernel_exp_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] * log(alpha); - } +void kernel_log(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = log(src_data[index]); + } } __global__ -void kernel_sigmoid(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = 1.0f / (1.0f + expf(-src_data[index]) * alpha); - } +void kernel_sigmoid(const float *src_data, float *des_data, int 
n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = 1.0f / (1.0f + expf(-src_data[index])); + } } __global__ -void kernel_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] * (1.0f - src_data[index]) * alpha; - } +void kernel_sigmoid_grad(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = src_data[index] * (1.0f - src_data[index]); + } } __global__ -void kernel_relu(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = 1.0f / ( 1 - alpha ) * max( src_data[index], 0.0f ) + alpha * src_data[index]; - } +void kernel_relu(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = max(src_data[index], 0.0f); + } } __global__ -void kernel_relu_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] > 0.0f ? 1.0f : alpha; - } +void kernel_relu_grad(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = src_data[index] > 0.0f ? 
1.0f : 0.0f; + } } - __global__ -void kernel_tanh(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = tanhf( src_data[index] * alpha ); - } +void kernel_tanh(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = tanhf(src_data[index]); + } } __global__ -void kernel_tanh_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = alpha * (1.0f - src_data[index] * src_data[index] ); - } +void kernel_tanh_grad(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = (1.0f - src_data[index] * src_data[index]); + } } __global__ -void kernel_softplus(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = logf(1 + expf(src_data[index])); - } +void kernel_softplus(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = logf(1 + expf(src_data[index])); + } } __global__ -void kernel_softplus_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = 1.0f / (1.0f + expf(-src_data[index])); - } +void kernel_softplus_grad(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = 1.0f / (1.0f + expf(-src_data[index])); + } } __global__ -void kernel_square(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] * src_data[index]; - } +void kernel_square(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = src_data[index] * src_data[index]; + } } __global__ -void kernel_square_grad(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = 2 * sqrt(src_data[index]); - } +void kernel_square_grad(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = 2 * sqrt(src_data[index]); + } } __global__ -void kernel_sqrt(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * 
blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = sqrt(src_data[index]); - } +void kernel_sqrt(const float *src_data, float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = sqrt(src_data[index]); + } } __global__ -void kernel_threshold(const float *src_data, float *des_data, float alpha, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f; - } +void kernel_pow(const float *src_data_a, const float *src_data_b, + float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = pow(src_data_a[index], src_data_b[index]); + } } __global__ -void kernel_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data_a[index] + src_data_b[index]; - } +void kernel_mult(const float *src_data_a, const float *src_data_b, + float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = src_data_a[index] * src_data_b[index]; + } } __global__ -void kernel_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data_a[index] - src_data_b[index]; - } +void kernel_div(const float *src_data_a, const float *src_data_b, + float *des_data, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = src_data_a[index] / src_data_b[index]; + } } -__global__ -void kernel_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data_a[index] * src_data_b[index]; - } +__global__ static +void kernel_set_value(float *data, float value, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + data[index] = value; + } } __global__ -void kernel_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - long index = blockIdx.x * blockDim.x + threadIdx.x; - long num_threads = blockDim.x * gridDim.x; - for(; index<n; index+=num_threads) { - des_data[index] = src_data_a[index] / src_data_b[index]; - } +void kernel_threshold(const float *src_data, float *des_data, + float alpha, int n) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int num_threads = blockDim.x * gridDim.x; + for (; index < n; index += num_threads) { + des_data[index] = src_data[index] < alpha ? 
1.0f : 0.0f; + } } // -namespace singa{ +namespace singa { -void singa_gpu_sum_vec(float *data, float *sum , long n) -{ - long threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n; - // here, we only need one block - long num_blocks = 1; +void singa_gpu_sum_vec(float *data, float *sum , int n) { + int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n; + // here, we only need one block + int num_blocks = 1; - kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n); + kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n); } -void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) -{ - long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows; - long num_blocks = cols; +void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, + int rows, int cols, int stride) { + int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows; + int num_blocks = cols; - kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride); + kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, + rows, cols, stride); } -void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride) -{ - dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y); - dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1)); - kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride); +void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, + float *des_mat_data , int rows, int cols, int stride) { + dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y); + dim3 num_blocks(cols/threads_per_block.x + + (cols%threads_per_block.x == 0 ? 0 : 1), + rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 
0 : 1)); + kernel_add_vec_row<<<num_blocks, threads_per_block>>> + (src_vec_data, src_mat_data, des_mat_data, rows, cols, stride); } -void singa_gpu_set_value(float *data, float value, long n) -{ - kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n); +void singa_gpu_exp(const float *src_data, float *des_data, int n) { + kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_scale<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_log(const float *src_data, float *des_data, int n) { + kernel_log<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_scale_grad(float *data, float alpha, long n) -{ - kernel_scale_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, alpha, n); +void singa_gpu_sigmoid(const float *src_data, float *des_data, int n) { + kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, + int n) { + kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>> + (src_data, des_data, n); } -void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_exp_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_relu(const float *src_data, float *des_data, int n) { + kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_relu_grad(const float *src_data, float *des_data, int n) { + kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_tanh(const float *src_data, float *des_data, int n) { + kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_tanh_grad(const float *src_data, float *des_data, int n) { + kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_softplus(const float *src_data, float *des_data, int n) { + kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_softplus_grad(const float *src_data, float *des_data, int n) { + kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>> + (src_data, des_data, n); } -void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_square(const float *src_data, float *des_data, int n) { + 
kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_square_grad(const float *src_data, float *des_data, int n) { + kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_sqrt(const float *src_data, float *des_data, int n) { + kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n); } -void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_pow(const float *src_data_a, const float *src_data_b, + float *des_data, int n) { + kernel_pow<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>> + (src_data_a, src_data_b, des_data, n); } -void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_mult(const float *src_data_a, const float *src_data_b, + float *des_data, int n) { + kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>> + (src_data_a, src_data_b, des_data, n); } -void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_div(const float *src_data_a, const float *src_data_b, + float *des_data, int n) { + kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>> + (src_data_a, src_data_b, des_data, n); } -void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n) -{ - kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n); +void singa_gpu_set_value(float *data, float value, int n) { + kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n); } -void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - kernel_add<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n); +void singa_gpu_threshold(const float *src_data, float *des_data, + float alpha, int n) { + kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>> + (src_data, des_data, alpha, n); } -void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - kernel_sub<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n); -} - -void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n); -} - -void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n) -{ - kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n); -} - - -}//namespace singa_gpu +} // namespace singa
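All of the element-wise kernels above share one launch convention: the host wrappers launch ceil(n/CU1DBLOCKF) blocks, and each kernel walks the data with a grid-stride loop so any n is covered regardless of the grid size. A minimal self-contained sketch of that pattern (the kernel_copy and launch_copy names are illustrative, not part of this patch):

    #include <math.h>
    #include <cuda_runtime.h>

    #define CU1DBLOCK  1024
    #define CU1DBLOCKF 1024.0

    // Illustrative kernel using the same grid-stride loop as math_kernel.cu:
    // each thread starts at its global index and advances by the total
    // number of launched threads (blockDim.x * gridDim.x).
    __global__ void kernel_copy(const float *src, float *dst, int n) {
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      int num_threads = blockDim.x * gridDim.x;
      for (; index < n; index += num_threads) {
        dst[index] = src[index];
      }
    }

    void launch_copy(const float *src, float *dst, int n) {
      // Same config as the wrappers above: e.g. n = 2500 launches
      // ceil(2500/1024.0) = 3 blocks of 1024 threads.
      kernel_copy<<<ceil(n / CU1DBLOCKF), CU1DBLOCK>>>(src, dst, n);
    }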
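The two reductions (kernel_sum_vec, kernel_sum_col) work differently: each block first accumulates strided partial sums into the shared array aux[CU1DBLOCK], then repeatedly halves total_threads with a __syncthreads() barrier per step until aux[0] holds the block's total. A host-side usage sketch for singa_gpu_sum_vec, assuming the declarations in math_kernel.h (note the sum output must also be a device pointer; error checks omitted):

    // Sum eight values on the GPU with the single-block reduction.
    float data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float sum = 0.0f;
    float *data_gpu = NULL, *sum_gpu = NULL;
    cudaMalloc((void**)&data_gpu, 8 * sizeof(float));
    cudaMalloc((void**)&sum_gpu, sizeof(float));
    cudaMemcpy(data_gpu, data, 8 * sizeof(float), cudaMemcpyHostToDevice);
    singa::singa_gpu_sum_vec(data_gpu, sum_gpu, 8);  // one block, min(n, 1024) threads
    cudaMemcpy(&sum, sum_gpu, sizeof(float), cudaMemcpyDeviceToHost);
    // sum is now 36.0f
    cudaFree(data_gpu);
    cudaFree(sum_gpu);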
