http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/blob/test.cc
----------------------------------------------------------------------
diff --git a/src/blob/test.cc b/src/blob/test.cc
deleted file mode 100644
index d13ed5e..0000000
--- a/src/blob/test.cc
+++ /dev/null
@@ -1,165 +0,0 @@
-#include <iostream>
-
-#include "singa_op.h"
-#include "math_addr.h"
-
-using namespace std;
-
-void test_gemm1()
-{
-  float A[3][2] = {};
-  float B[3][2] = {};
-  float C[2][2] = {};
-  for(int i = 0; i < 3; i++)
-    for(int j = 0; j < 2; j++)
-    {
-      A[i][j] = i+j;
-      B[i][j] = i+j - i*j;
-    }
-  cpu_gemm(A[0], B[0], 2, 2, 3 , 1, 0, true, false, C[0]);
-  float D[2][2] = {};
-  for(int i = 0; i < 2; i++)
-    for(int j = 0; j < 2; j++)
-    {
-      D[i][j] = 0;
-      for(int k = 0; k < 3; k++)
-        D[i][j] += A[k][i]*B[k][j];
-    }
-  for(int i = 0; i < 2; i++)
-    for(int j = 0; j < 2; j++)
-    {
-      cout<<C[i][j] - D[i][j]<<endl;
-    }
-}
-
-
-void test_gemm2()
-{
-  float A[2][3] = {};
-  float B[3][2] = {};
-  float C[2][2] = {};
-  for(int i = 0; i < 3; i++)
-    for(int j = 0; j < 2; j++)
-    {
-      A[j][i] = i-j;
-      B[i][j] = i+j + i*j;
-    }
-  cpu_gemm(A[0], B[0], 2, 2, 3 , 1, 0, false, false, C[0]);
-  float D[2][2] = {};
-  for(int i = 0; i < 2; i++)
-    for(int j = 0; j < 2; j++)
-    {
-      D[i][j] = 0;
-      for(int k = 0; k < 3; k++)
-        D[i][j] += A[i][k]*B[k][j];
-    }
-  for(int i = 0; i < 2; i++)
-    for(int j = 0; j < 2; j++)
-    {
-      cout<<C[i][j] - D[i][j]<<endl;
-    }
-}
-
-
-void test_gemv()
-{
-  float A[4][3] = {};
-  float B[4]= {};
-  float C[3] = {};
-  float D[3] = {};
-  for(int i = 0; i < 4; i++)
-  {
-    for(int j = 0; j < 3; j++)
-    {
-      A[j][i] = i-j + i*j;
-    }
-  }
-  for(int i = 0; i < 4; i++)B[i] = i;
-  for(int i = 0; i < 3; i++)C[i] = 10;
-  cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);
-  for(int i = 0; i < 3; i++)
-    for(int j = 0; j < 4; j++)
-    {
-      D[i] += A[j][i]*B[j];
-    }
-  for(int i = 0; i < 3; i++)cout<<C[i] - D[i] - 10<<endl;
-}
-
-void test_axpy()
-{
-  float A[4][3] = {};
-  float C[4][3] = {};
-  float B[3][4] = {};
-  float D[3][4] = {};
-  for(int i = 0; i < 4; i++)
-  {
-    for(int j = 0; j < 3; j++)
-    {
-      A[i][j] = i-j + i*j;
-      B[j][i] = i-j + i*j;
-      C[i][j] = A[i][j];
-      D[j][i] = B[j][i];
-    }
-  }
-  cpu_axpy(A[0], 12, 2, B[0]);
-  for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i];
-  for(int i = 0; i < 3; i++)
-  {
-    for(int j = 0; j < 4; j++)
-    {
-      cout<<B[i][j] - D[i][j]<<endl;
-    }
-  }
-}
-
-void test_eop()
-{
-  float A[10] = {};
-  float B[10] = {};
-  float C[10] = {};
-  float D[10] = {};
-  float O[10] = {};
-  for(int i = 0; i < 10; i++)
-  {
-    A[i] = i;
-    B[i] = -i;
-    C[i] = i;
-  }
-  cpu_e_f<op::Set>(5, 15, O);
-  for(int i = 0; i < 5; i++)cout<<O[i] - 15<<endl;
-  for(int i = 5; i < 10; i++)cout<<O[i]<<endl;
-  cpu_e_f<op::Scale>(10, C, 2, C);
-  for(int i = 0; i < 10; i++)cout<<C[i] - 2* i<<endl;
-  cpu_e_f<op::Add>(10, A, B, 0, 0, O);
-  for(int i = 0; i < 10; i++)cout<<O[i]<<endl;
-}
-
-void test_exrd()
-{
-  float A[3][10] = {};
-  float B[3] = {};
-  for(int i = 0; i < 3; i++)
-    for(int j = 0; j < 10; j++)
-    {
-      A[i][j] = (i + 1)*j;
-    }
-  cpu_reduce_f<op::Sum>(A[0], 3, 10, B);
-  for(int i = 0; i < 3; i++) B[i] -= 45*(i+1);
-  for(int i = 0; i < 3; i++)cout<<B[i]<<endl;
-  cpu_expand_f<op::Repmat>(B, 3, 10, A[0]);
-  cpu_reduce_f<op::Sum>(A[0], 3, 10, B);
-  for(int i = 0; i < 3; i++)cout<<B[i]<<endl;
-}
-
-int main()
-{
-  test_gemm1();
-  test_gemm2();
-  test_gemv();
-  test_axpy();
-  test_eop();
-  test_exrd();
-  return 0;
-}
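The deleted file above checked results by printing C[i][j] - D[i][j] to stdout and relying on the reader to spot non-zero output. The gtest-based suite in src/test/test_math.cc (next diff) asserts instead. A minimal sketch of how test_gemm1() maps onto that style, assuming the cpu_gemm signature used above; the test name is hypothetical:

    #include "gtest/gtest.h"
    #include "singa/utils/math_addr.h"

    TEST(MathTest, TestGemmTransACPU) {
      float A[3][2] = {};
      float B[3][2] = {};
      float C[2][2] = {};
      for (int i = 0; i < 3; i++)
        for (int j = 0; j < 2; j++) {
          A[i][j] = i + j;
          B[i][j] = i + j - i * j;
        }
      // C = A^T * B, with alpha = 1, beta = 0, transA = true
      cpu_gemm(A[0], B[0], 2, 2, 3, 1, 0, true, false, C[0]);
      // Reference result computed with plain loops.
      for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++) {
          float d = 0;
          for (int k = 0; k < 3; k++)
            d += A[k][i] * B[k][j];
          EXPECT_FLOAT_EQ(d, C[i][j]);  // replaces cout << C[i][j] - D[i][j]
        }
    }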
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index a8a9490..8043168 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -1,8 +1,7 @@
 #include "gtest/gtest.h"
-#include "singa/blob/math_addr.h"
-#include "singa/blob/math_blob.h"
-#include "singa/blob/math_kernel.h"
-#include "singa/blob/singa_op.h"
+#include "singa/utils/math_addr.h"
+#include "singa/utils/math_kernel.h"
+#include "singa/utils/singa_op.h"
 #include <cuda_runtime.h>
 #include "cublas_v2.h"
@@ -37,10 +36,10 @@ TEST(MathTest, TestGemmCPU) {
 }
 
 TEST(MathTest, TestGemvCPU) {
-  float A[4][3] = {};
-  float B[4]= {};
-  float C[3] = {};
-  float D[3] = {};
+  float A[4][3] = {};
+  float B[4]= {};
+  float C[3] = {};
+  float D[3] = {};
 
   for(int i = 0; i < 4; i++)
   {
@@ -51,8 +50,8 @@ TEST(MathTest, TestGemvCPU) {
     for(int j = 0; j < 3; j++)
    {
      A[j][i] = i-j + i*j;
    }
  }
 
  for(int i = 0; i < 4; i++)B[i] = i;
-  for(int i = 0; i < 3; i++)C[i] = 10;
-  cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);
+  for(int i = 0; i < 3; i++)C[i] = 10;
+  cpu_gemv(A[0], B, 4, 3, 1, 1, true, C);
 
  for(int i = 0; i < 3; i++)
  {
@@ -69,9 +68,9 @@ TEST(MathTest, TestGemvCPU) {
 
 TEST(MathTest, TestAxpyCPU) {
-  float A[4][3] = {};
-  float C[4][3] = {};
-  float B[3][4] = {};
+  float A[4][3] = {};
+  float C[4][3] = {};
+  float B[3][4] = {};
   float D[3][4] = {};
 
   for(int i = 0; i < 4; i++)
@@ -113,7 +112,7 @@ TEST(MathTest, TestEopCPU) {
     A[i] = i;
     B[i] = -i;
     C[i] = i;
-    
+
   }
 
   cpu_e_f<singa_op::Set>(5, 15, O);
@@ -336,7 +335,7 @@ TEST(MathTest, TestSingaSumColGPU) {
       A[i][j]=i+j;
     }
   }
-  
+
   for(int i = 0; i < 4; i++)
   {
     B[i]=0.0f;
@@ -462,7 +461,7 @@ TEST(MathTest, TestEopGPU) {
     B[i] = -i;
     C[i] = i;
     O[i] = 0.0f;
-    
+
   }
 
   float* A_gpu=NULL;
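The GPU tests visible in the last hunk follow the usual allocate/copy/launch/copy-back pattern around the wrappers added in src/utils/math_kernel.cu below. A sketch of that scaffolding, assuming only the wrapper signatures from this commit; the test name and the choice of singa_gpu_scale are illustrative:

    TEST(MathTest, TestScaleGPU) {  // hypothetical test
      float A[10];
      float O[10];
      for (int i = 0; i < 10; i++) {
        A[i] = i;
        O[i] = 0.0f;
      }

      float* A_gpu = NULL;
      float* O_gpu = NULL;
      cudaMalloc((void**)&A_gpu, 10 * sizeof(float));
      cudaMalloc((void**)&O_gpu, 10 * sizeof(float));
      cudaMemcpy(A_gpu, A, 10 * sizeof(float), cudaMemcpyHostToDevice);

      singa::singa_gpu_scale(A_gpu, O_gpu, 2.0f, 10);  // O = 2 * A on the device

      cudaMemcpy(O, O_gpu, 10 * sizeof(float), cudaMemcpyDeviceToHost);
      for (int i = 0; i < 10; i++)
        EXPECT_FLOAT_EQ(2.0f * A[i], O[i]);

      cudaFree(A_gpu);
      cudaFree(O_gpu);
    }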
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/utils/blob.cc
----------------------------------------------------------------------
diff --git a/src/utils/blob.cc b/src/utils/blob.cc
index f720fae..cd164e7 100644
--- a/src/utils/blob.cc
+++ b/src/utils/blob.cc
@@ -7,9 +7,9 @@
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
-*
+*
 *   http://www.apache.org/licenses/LICENSE-2.0
-*
+*
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -277,6 +277,12 @@ void Blob<Dtype>::ToProto(singa::BlobProto* proto) const {
 }
 
 template <typename Dtype>
+void Blob<Dtype>::SetValue(Dtype v) {
+  Dtype* ptr = mutable_cpu_data();
+  for (int i = 0; i < count(); i++)
+    ptr[i] = v;
+}
+template <typename Dtype>
 void Blob<Dtype>::ShareData(const Blob& other) {
   CHECK_EQ(count_, other.count());
   data_ = other.data_;
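The new SetValue fills the Blob's CPU buffer with a constant. A minimal usage sketch; the header path and the Reshape overload are assumptions about the surrounding Blob API, not shown in this diff:

    #include <vector>
    #include "singa/utils/blob.h"  // assumed location of Blob<Dtype>

    singa::Blob<float> b;
    b.Reshape(std::vector<int>{4, 3});  // assumed Reshape overload; 12 elements
    b.SetValue(1.0f);                   // every entry of the CPU buffer becomes 1.0f
    // Equivalent by hand:
    //   float* p = b.mutable_cpu_data();
    //   for (int i = 0; i < b.count(); i++) p[i] = 1.0f;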
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d3379cba/src/utils/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/utils/math_kernel.cu b/src/utils/math_kernel.cu
new file mode 100644
index 0000000..203f261
--- /dev/null
+++ b/src/utils/math_kernel.cu
@@ -0,0 +1,439 @@
+#include <cmath>
+#include "singa/utils/math_kernel.h"
+
+#define CU2DBLOCK_X 32
+#define CU2DBLOCK_Y 32
+
+#define CU1DBLOCK 1024
+#define CU1DBLOCKF 1024.0
+
+
+//Cuda Kernel Functions
+
+__global__
+void kernel_sum_vec(float *data, float *sum , long n)
+{
+  int THREADS = blockDim.x;
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (n - 1) / THREADS + 1;
+  aux[threadIdx.x] = data[threadIdx.x];
+
+  for(int i=1; i<steps; ++i) {
+    if(threadIdx.x+i*THREADS < n) {
+      aux[threadIdx.x] += data[threadIdx.x+i*THREADS];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+
+  while(total_threads > 1) {
+    int half_point = ((1+total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if(threadIdx.x+half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads+1) >> 1);
+  }
+
+  __syncthreads();
+  *sum = aux[0];
+}
+
+__global__
+void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
+{
+  int j = blockIdx.x;
+  int THREADS = blockDim.x;
+  if(j >= cols) {
+    return;
+  }
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (rows - 1) / THREADS + 1;
+  aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
+  for(int i=1; i<steps; ++i) {
+    if(threadIdx.x+i*THREADS < rows) {
+      aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+  while(total_threads > 1) {
+    int half_point = ((1+total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if(threadIdx.x+half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads+1) >> 1);
+  }
+
+  __syncthreads();
+  dst_vec_data[j] = aux[0];
+}
+
+__global__
+void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, float* des_mat_data,long rows, long cols, long stride)
+{
+  long i = blockIdx.x * blockDim.x + threadIdx.x;
+  long j = blockIdx.y * blockDim.y + threadIdx.y;
+  long num_threads_x = blockDim.x * gridDim.x;
+  long num_threads_y = blockDim.y * gridDim.y;
+  long index = 0;
+  for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) {
+    index = j * stride + i;
+    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+  }
+}
+
+__global__ static
+void kernel_set_value(float *data, float value, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    data[index] = value;
+  }
+}
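For reference, kernel_sum_vec and kernel_sum_col use a block-level tree reduction: each thread first strides over the input accumulating a partial sum into aux[threadIdx.x], then the number of live threads is halved each step until aux[0] holds the block's total. A host-side check of the singa_gpu_sum_col wrapper (declared later in this file) against a plain CPU sum, as an illustration only:

    const int rows = 3, cols = 10;
    float A[rows][cols], B[cols];
    for (int i = 0; i < rows; i++)
      for (int j = 0; j < cols; j++)
        A[i][j] = (i + 1) * j;          // same fill as test_exrd() above

    float* A_gpu = NULL;
    float* B_gpu = NULL;
    cudaMalloc((void**)&A_gpu, sizeof(A));
    cudaMalloc((void**)&B_gpu, sizeof(B));
    cudaMemcpy(A_gpu, A[0], sizeof(A), cudaMemcpyHostToDevice);

    // One block per column; each block tree-reduces its column in shared memory.
    singa::singa_gpu_sum_col(A_gpu, B_gpu, rows, cols, cols /* stride = row length */);

    cudaMemcpy(B, B_gpu, sizeof(B), cudaMemcpyDeviceToHost);
    // Expect B[j] == sum over i of A[i][j], e.g. B[9] == 9 + 18 + 27 == 54.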
+
+__global__
+void kernel_scale(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data[index] * alpha;
+  }
+}
+
+__global__
+void kernel_scale_grad(float *data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    data[index] = alpha;
+  }
+}
+
+__global__
+void kernel_exp(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = pow(-src_data[index],alpha);
+  }
+}
+
+__global__
+void kernel_exp_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data[index] * log(alpha);
+  }
+}
+
+__global__
+void kernel_sigmoid(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]) * alpha);
+  }
+}
+
+__global__
+void kernel_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data[index] * (1.0f - src_data[index]) * alpha;
+  }
+}
+
+__global__
+void kernel_relu(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = 1.0f / ( 1 - alpha ) * max( src_data[index], 0.0f ) + alpha * src_data[index];
+  }
+}
+
+__global__
+void kernel_relu_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data[index] > 0.0f ? 1.0f : alpha;
+  }
+}
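Every elementwise kernel in this file shares the same grid-stride loop idiom, so correctness never depends on the grid covering n exactly. A hypothetical refactoring sketch (not part of this commit) showing the common shape written once:

    // Grid-stride loop: each thread handles index, index + num_threads, ...
    template <typename Op>
    __global__ void kernel_apply(const float* src, float* des, float alpha, long n) {
      long index = blockIdx.x * blockDim.x + threadIdx.x;  // global thread id
      long num_threads = blockDim.x * gridDim.x;           // total threads in grid
      for (; index < n; index += num_threads)
        des[index] = Op::Map(src[index], alpha);           // e.g. tanhf(src * alpha)
    }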
+
+
+__global__
+void kernel_tanh(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = tanhf( src_data[index] * alpha );
+  }
+}
+
+__global__
+void kernel_tanh_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = alpha * (1.0f - src_data[index] * src_data[index] );
+  }
+}
+
+__global__
+void kernel_softplus(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = logf(1 + expf(src_data[index]));
+  }
+}
+
+__global__
+void kernel_softplus_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
+}
+
+__global__
+void kernel_square(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data[index] * src_data[index];
+  }
+}
+
+__global__
+void kernel_square_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = 2 * sqrt(src_data[index]);
+  }
+}
+
+__global__
+void kernel_sqrt(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = sqrt(src_data[index]);
+  }
+}
+
+__global__
+void kernel_threshold(const float *src_data, float *des_data, float alpha, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
+  }
+}
+
+__global__
+void kernel_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data_a[index] + src_data_b[index];
+  }
+}
+
+__global__
+void kernel_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data_a[index] - src_data_b[index];
+  }
+}
+
+__global__
+void kernel_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data_a[index] * src_data_b[index];
+  }
+}
+
+__global__
+void kernel_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
+{
+  long index = blockIdx.x * blockDim.x + threadIdx.x;
+  long num_threads = blockDim.x * gridDim.x;
+  for(; index<n; index+=num_threads) {
+    des_data[index] = src_data_a[index] / src_data_b[index];
+  }
+}
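Note that kernel_add, kernel_sub, kernel_mult and kernel_div accept alpha and beta but never read them; as committed they compute the plain elementwise result. If a scaled combination is the eventual intent (an assumption, not stated in the commit), the add body would read:

    // hypothetical scaled variant, NOT what this commit implements
    des_data[index] = alpha * src_data_a[index] + beta * src_data_b[index];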
+
+//
+namespace singa{
+
+void singa_gpu_sum_vec(float *data, float *sum , long n)
+{
+  long threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+  // here, we only need one block
+  long num_blocks = 1;
+
+  kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n);
+}
+
+void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride)
+{
+  long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
+  long num_blocks = cols;
+
+  kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride);
+}
+
+void singa_gpu_add_vec_row(const float *src_vec_data, const float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride)
+{
+  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
+  dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
+  kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride);
+}
+
+void singa_gpu_set_value(float *data, float value, long n)
+{
+  kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n);
+}
+
+void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_scale<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_scale_grad(float *data, float alpha, long n)
+{
+  kernel_scale_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, alpha, n);
+}
+
+void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_exp_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_softplus_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_square(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_square_grad(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, long n)
+{
+  kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, alpha, n);
+}
+
+void singa_gpu_add(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
+{
+  kernel_add<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
+}
+
+void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
+{
+  kernel_sub<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
+}
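The 1-D wrappers in this namespace all size their grid as ceil(n/CU1DBLOCKF). The float constant CU1DBLOCKF (1024.0) matters: plain n/CU1DBLOCK would truncate under integer division and drop the final partial block. An equivalent integer-only form, for reference:

    long num_blocks = (n + CU1DBLOCK - 1) / CU1DBLOCK;  // == ceil(n / 1024.0)
    kernel_scale<<<num_blocks, CU1DBLOCK>>>(src_data, des_data, alpha, n);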
+
+void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
+{
+  kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
+}
+
+void singa_gpu_div(const float *src_data_a, const float *src_data_b, float *des_data, float alpha, float beta, long n)
+{
+  kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, des_data, alpha, beta, n);
+}
+
+
+}//namespace singa_gpu
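Putting the pieces together, a minimal host program chaining several of the new wrappers; illustrative only, assuming the declarations in singa/utils/math_kernel.h match the definitions above:

    #include <cstdio>
    #include <cuda_runtime.h>
    #include "singa/utils/math_kernel.h"

    int main() {
      const long n = 10;
      float host[n];
      float *a = NULL, *b = NULL;
      cudaMalloc((void**)&a, n * sizeof(float));
      cudaMalloc((void**)&b, n * sizeof(float));

      singa::singa_gpu_set_value(a, 3.0f, n);   // a[i] = 3
      singa::singa_gpu_square(a, b, 0.0f, n);   // b[i] = a[i]*a[i] = 9 (alpha unused)
      singa::singa_gpu_scale(b, b, 2.0f, n);    // b[i] = 18, in place

      cudaMemcpy(host, b, n * sizeof(float), cudaMemcpyDeviceToHost);
      for (long i = 0; i < n; i++)
        printf("%f\n", host[i]);                // expect 18.0 on every line
      cudaFree(a);
      cudaFree(b);
      return 0;
    }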
