SINGA-80 New Blob Level and Address Level Math Operation Interface

Update math functions for gpu.
fix compile bug from Makefile.gpu due to 32bit and 64bit

close #74


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a65a9535
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a65a9535
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a65a9535

Branch: refs/heads/master
Commit: a65a9535e4d3df21faf0d67b7891df065a006fff
Parents: 86284f1
Author: seaok <[email protected]>
Authored: Sun Nov 15 21:35:10 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Mon Nov 16 10:18:51 2015 +0800

----------------------------------------------------------------------
 Makefile.gpu                      |   2 +-
 include/singa/utils/math_addr.h   |  12 +-
 include/singa/utils/math_kernel.h |  87 +++--
 include/singa/utils/singa_op.h    |  54 +--
 src/test/test_math.cc             |  60 +---
 src/utils/math_kernel.cu          | 590 +++++++++++++++------------------
 6 files changed, 379 insertions(+), 426 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/Makefile.gpu
----------------------------------------------------------------------
diff --git a/Makefile.gpu b/Makefile.gpu
index 9c9051c..2dc2a71 100644
--- a/Makefile.gpu
+++ b/Makefile.gpu
@@ -27,7 +27,7 @@ CUDA_DIR := /usr/local/cuda
 #CUDA_DIR :=
 
 # Lib folder for system and external libs. You may need to change it.
-LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib 
$(CUDA_DIR)/lib $(CUDA_DIR)/lib64
+LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib 
$(CUDA_DIR)/lib64 $(CUDA_DIR)/lib
 # Header folder for system and external libs. You may need to change it.
 INCLUDE_DIRS := $(HOME_DIR)/include ./include 
$(HOME_DIR)/local/include/zookeeper $(CUDA_DIR)/include
 # g++ location, should support c++11, tested with 4.8.1

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/include/singa/utils/math_addr.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_addr.h b/include/singa/utils/math_addr.h
index fe19f14..9b91e70 100644
--- a/include/singa/utils/math_addr.h
+++ b/include/singa/utils/math_addr.h
@@ -29,7 +29,7 @@ extern "C" {
 #endif
 #include "singa/utils/singa_op.h"
 #ifdef USE_GPU
-#include "cublas_v2.h"
+#include <cublas_v2.h>
 #endif
 
 
@@ -177,6 +177,16 @@ void gpu_e_f(const int n, const Dtype alpha, Dtype * A) {
 }
 
 template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, Dtype * B) {
+  Op::CudaMap(A, B, n);
+}
+
+template<typename Op, typename Dtype>
+void gpu_e_f(const int n, const Dtype * A, const Dtype * B, const Dtype * C) {
+  Op::CudaMap(A, B, C, n);
+}
+
+template<typename Op, typename Dtype>
 void gpu_e_f(const int n, const Dtype * A, const Dtype alpha, Dtype * B) {
   Op::CudaMap(alpha, A, B, n);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/include/singa/utils/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/math_kernel.h 
b/include/singa/utils/math_kernel.h
index f5d3e34..d763283 100644
--- a/include/singa/utils/math_kernel.h
+++ b/include/singa/utils/math_kernel.h
@@ -1,59 +1,78 @@
-#ifndef MATH_KERNEL_H
-#define MATH_KERNEL_H
-
-namespace singa{
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#ifndef SINGA_UTILS_MATH_KERNEL_H_
+#define SINGA_UTILS_MATH_KERNEL_H_
+
+namespace singa {
 
 extern "C" {
-       void singa_gpu_sum_vec(float *data, float *sum , long n);
-
-       void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, 
long rows, long cols, long stride);
-
-       void singa_gpu_add_vec_row(const float *src_vec_data, const float 
*src_mat_data, float *des_mat_data, long rows, long cols, long stride);
-
-       void singa_gpu_set_value(float *data, float value, long n);
-
-       void singa_gpu_scale(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_sum_vec(float *data, float *sum , int n);
 
-       void singa_gpu_scale_grad(float *data, float alpha, long n);
+  void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data,
+    int rows, int cols, int stride);
 
-       void singa_gpu_exp(const float *src_data, float *des_data, float alpha, 
long n);
+  void singa_gpu_add_vec_row(const float *src_vec_data,
+    const float *src_mat_data, float *des_mat_data,
+    int rows, int cols, int stride);
 
-       void singa_gpu_exp_grad(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_exp(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_sigmoid(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_log(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, 
float alpha, long n);
+  void singa_gpu_sigmoid(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_relu(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_relu_grad(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_relu(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_tanh(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_relu_grad(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_tanh_grad(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_tanh(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_softplus(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_tanh_grad(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_softplus_grad(const float *src_data, float *des_data, 
float alpha, long n);
+  void singa_gpu_softplus(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_square(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_softplus_grad(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_square_grad(const float *src_data, float *des_data, 
float alpha, long n);
+  void singa_gpu_square(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_sqrt(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_square_grad(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_threshold(const float *src_data, float *des_data, float 
alpha, long n);
+  void singa_gpu_sqrt(const float *src_data, float *des_data, int n);
 
-       void singa_gpu_add(const float *src_data_a, const float *src_data_b, 
float *des_data, float alpha, float beta, long n);
+  void singa_gpu_pow(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n);
 
-       void singa_gpu_sub(const float *src_data_a, const float *src_data_b, 
float *des_data, float alpha, float beta, long n);
+  void singa_gpu_mult(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n);
 
-       void singa_gpu_mult(const float *src_data_a, const float *src_data_b, 
float *des_data, float alpha, float beta, long n);
+  void singa_gpu_div(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n);
 
-       void singa_gpu_div(const float *src_data_a, const float *src_data_b, 
float *des_data, float alpha, float beta, long n);
+  void singa_gpu_set_value(float *data, float value, int n);
 
+  void singa_gpu_threshold(const float *src_data, float *des_data, int n);
 };
 
-}
+}  // namespace singa
 
-#endif
+#endif  // SINGA_UTILS_MATH_KERNEL_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/include/singa/utils/singa_op.h
----------------------------------------------------------------------
diff --git a/include/singa/utils/singa_op.h b/include/singa/utils/singa_op.h
index 78fe955..c934050 100644
--- a/include/singa/utils/singa_op.h
+++ b/include/singa/utils/singa_op.h
@@ -27,7 +27,7 @@
 
 #ifdef USE_GPU
 #include <cuda_runtime.h>
-#include "cublas_v2.h"
+#include <cublas_v2.h>
 #include "singa/utils/math_kernel.h"
 #endif  // USE_GPU
 
@@ -44,9 +44,8 @@ struct Exp {
     *b = exp(a);
   }
 #ifdef USE_GPU
-  inline static void CudaMap(Dtype alpha,  const Dtype * a,
-      Dtype * b, int n) {
-    singa::singa_gpu_exp(a, b, alpha, n);
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
+    singa::singa_gpu_exp(a, b, n);
   }
 #endif  // USE_GPU
 };
@@ -59,6 +58,9 @@ struct Log {
     *b = log(a);
   }
 #ifdef USE_GPU
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
+    singa::singa_gpu_log(a, b, n);
+  }
 #endif  // USE_GPU
 };
 
@@ -68,9 +70,8 @@ struct Sigmoid {
     *b = 1.0f / (1.0f + expf(-a));
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
-    singa::singa_gpu_sigmoid(a, b, 1, n);
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
+    singa::singa_gpu_sigmoid(a, b, n);
   }
 #endif  // USE_GPU
 };
@@ -80,8 +81,8 @@ struct SigmoidGrad {
     *b = a * (1.0f - a);
   }
 #ifdef USE_GPU
-  inline static void CudaMap(Dtype alpha,  const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_sigmoid_grad(a, b, 1, n);
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
+    singa::singa_gpu_sigmoid_grad(a, b, n);
   }
 #endif  // USE_GPU
 };
@@ -93,7 +94,7 @@ struct Relu {
   }
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_relu(a, b, 1, n);
+    singa::singa_gpu_relu(a, b, n);
   }
 #endif  // USE_GPU
 };
@@ -105,7 +106,7 @@ struct ReluGrad {
   }
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_relu_grad(a, b, 1, n);
+    singa::singa_gpu_relu_grad(a, b, n);
   }
 #endif  // USE_GPU
 };
@@ -117,7 +118,7 @@ struct Tanh {
   }
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_tanh(a, b, 1, n);
+    singa::singa_gpu_tanh(a, b, n);
   }
 #endif  // USE_GPU
 };
@@ -129,7 +130,7 @@ struct TanhGrad {
   }
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_tanh_grad(a, b, 1, n);
+    singa::singa_gpu_tanh_grad(a, b, n);
   }
 #endif  // USE_GPU
 };
@@ -141,7 +142,7 @@ struct Softplus {
   }
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
-    singa::singa_gpu_softplus(a, b, 1, n);
+    singa::singa_gpu_softplus(a, b, n);
   }
 #endif  // USE_GPU
 };
@@ -152,8 +153,7 @@ struct SoftplusGrad {
     *b = 1.0f / (1.0f + expf(-a));
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
     singa::singa_gpu_softplus_grad(a, b, n);
   }
 #endif  // USE_GPU
@@ -165,8 +165,7 @@ struct Square {
     *b = a * a;
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
     singa::singa_gpu_square(a, b, n);
   }
 #endif  // USE_GPU
@@ -178,8 +177,7 @@ struct SquareGrad {
     *b = 2 * sqrt(a);
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
     singa::singa_gpu_square_grad(a, b, 1, n);
   }
 #endif  // USE_GPU
@@ -191,8 +189,7 @@ struct Sqrt {
     *b = sqrt(a);
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype * a,
-      Dtype * b, int n) {
+  inline static void CudaMap(const Dtype * a, Dtype * b, int n) {
     singa::singa_gpu_sqrt(a, b, n);
   }
 #endif  // USE_GPU
@@ -207,6 +204,12 @@ struct Pow {
   inline static void Map(const Dtype & a, const Dtype &b, Dtype * c) {
     *c = pow(a, b);
   }
+#ifdef USE_GPU
+  inline static void CudaMap(const Dtype * a,
+      const Dtype * b, Dtype * c, int n) {
+    singa::singa_gpu_pow(a, b, c, n);
+  }
+#endif  // USE_GPU
 };
 template<typename Dtype>
 struct Mult {
@@ -214,8 +217,9 @@ struct Mult {
     *c =  a * b;
   }
 #ifdef USE_GPU
-  inline static void CudaMap(const Dtype* a, const Dtype* b, Dtype* c, int n) {
-    singa::singa_gpu_mult(a, b, c, 1, 1, n);
+  inline static void CudaMap(const Dtype * a,
+      const Dtype * b, Dtype * c, int n) {
+    singa::singa_gpu_mult(a, b, c, n);
   }
 #endif  // USE_GPU
 };
@@ -228,7 +232,7 @@ struct Div {
 #ifdef USE_GPU
   inline static void CudaMap(const Dtype * a,
       const Dtype * b, Dtype * c, int n) {
-    singa::singa_gpu_div(a, b, c, 1, 1, n);
+    singa::singa_gpu_div(a, b, c, n);
   }
 #endif  // USE_GPU
 };

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
index e64d567..a5bf180 100644
--- a/src/test/test_math.cc
+++ b/src/test/test_math.cc
@@ -43,9 +43,9 @@ TEST(MathTest, TestGemvCPU) {
        float C[3] = {};
        float D[3] = {};
 
-       for(int i = 0; i < 4; i++)
+       for(int i = 0; i < 3; i++)
        {
-               for(int j = 0; j < 3; j++)
+               for(int j = 0; j < 4; j++)
                {
                        A[j][i] = i-j + i*j;
                }
@@ -153,7 +153,7 @@ TEST(MathTest, TestGemmGPU) {
        cudaMemcpy(A_gpu,A,3*2*sizeof(float),cudaMemcpyHostToDevice);
        cudaMemcpy(B_gpu,B,3*2*sizeof(float),cudaMemcpyHostToDevice);
 
-       gpu_gemm(A_gpu, B_gpu, 2, 2, 3 , 1, 0, true, false, C_gpu);
+       gpu_gemm<float>(A_gpu, B_gpu, 2, 2, 3 , 1, 0, true, false, C_gpu);
 
        cudaMemcpy(C,C_gpu,2*2*sizeof(float),cudaMemcpyDeviceToHost);
 
@@ -213,7 +213,7 @@ TEST(MathTest, TestGemvGPU) {
        cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice);
        cudaMemcpy(C_gpu,C,3*sizeof(float),cudaMemcpyHostToDevice);
 
-       gpu_gemv(A_gpu, B_gpu, 4, 3, 1, 1, true, C_gpu);
+       gpu_gemv<float>(A_gpu, B_gpu, 4, 3, 1, 1, true, C_gpu);
 
        cudaMemcpy(C,C_gpu,3*sizeof(float),cudaMemcpyDeviceToHost);
 
@@ -262,17 +262,18 @@ TEST(MathTest, TestAxpyGPU) {
        cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice);
        cudaMemcpy(B_gpu,B,3*4*sizeof(float),cudaMemcpyHostToDevice);
 
-       gpu_axpy(A_gpu, 12, 2, B_gpu);
+       gpu_axpy<float>(A_gpu, 12, 2, B_gpu);
 
        cudaMemcpy(A,A_gpu,4*3*sizeof(float),cudaMemcpyDeviceToHost);
        cudaMemcpy(B,B_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost);
 
-       for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i];
+       //for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i];
 
-       for(int i = 0; i < 3; i++)
+       for(int i = 0; i < 4; i++)
        {
-               for(int j = 0; j < 4; j++)
+               for(int j = 0; j < 3; j++)
                {
+                       D[i][j] += C[i][j];
                        ASSERT_EQ(B[i][j],D[i][j]);
                }
        }
@@ -300,7 +301,7 @@ TEST(MathTest, TestDotGPU) {
 
        cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice);
        cudaMemcpy(B_gpu,B,12*sizeof(float),cudaMemcpyHostToDevice);
-       float gpu_ret=gpu_dot(A_gpu,B_gpu,12);
+       float gpu_ret=gpu_dot<float>(A_gpu,B_gpu,12);
 
        float cpu_ret=0.0f;
        for(int i = 0; i < 12; i++)
@@ -418,7 +419,6 @@ TEST(MathTest, TestSingaSetValueGPU) {
        float A[3][4];
 
        float* A_gpu=NULL;
-       float* B_gpu=NULL;
 
        cudaMalloc((void**)&A_gpu, 3*4*sizeof(float));
 
@@ -444,59 +444,25 @@ TEST(MathTest, TestEopGPU) {
 
        float A[10] = {};
        float B[10] = {};
-       float C[10] = {};
-       float D[10] = {};
-       float O[10] = {};
 
        for(int i = 0; i < 10; i++)
        {
                A[i] = i;
                B[i] = -i;
-               C[i] = i;
-               O[i] = 0.0f;
-
        }
 
        float* A_gpu=NULL;
        float* B_gpu=NULL;
-       float* C_gpu=NULL;
-       float* O_gpu=NULL;
 
        cudaMalloc((void**)&A_gpu, 10*sizeof(float));
        cudaMalloc((void**)&B_gpu, 10*sizeof(float));
-       cudaMalloc((void**)&C_gpu, 10*sizeof(float));
-       cudaMalloc((void**)&O_gpu, 10*sizeof(float));
 
        cudaMemcpy(A_gpu,A,10*sizeof(float),cudaMemcpyHostToDevice);
        cudaMemcpy(B_gpu,B,10*sizeof(float),cudaMemcpyHostToDevice);
-       cudaMemcpy(C_gpu,C,10*sizeof(float),cudaMemcpyHostToDevice);
-       cudaMemcpy(O_gpu,O,10*sizeof(float),cudaMemcpyHostToDevice);
-
-       gpu_e_f<singa::op::Set>(5, 15, O_gpu);
-       cudaMemcpy(O,O_gpu,10*sizeof(float),cudaMemcpyDeviceToHost);
-
-       for(int i = 0; i < 5; i++)
-       {
-               ASSERT_EQ(O[i]-15,0);
-       }
-       for(int i = 5; i < 10; i++)
-       {
-               ASSERT_EQ(O[i],0);
-       }
-       gpu_e_f<singa::op::Scale>(10, C_gpu, 2, C_gpu);
-       cudaMemcpy(C,C_gpu,10*sizeof(float),cudaMemcpyDeviceToHost);
 
-       for(int i = 0; i < 10; i++)
-       {
-               ASSERT_EQ(C[i]-2*i,0);
-       }
-
-       gpu_e_f<singa::op::Add>(10, A_gpu, B_gpu, 0, 0, O_gpu);
-       cudaMemcpy(O,O_gpu,10*sizeof(float),cudaMemcpyDeviceToHost);
+       gpu_e_f<singa::op::Sigmoid<float>, float>(10, A_gpu, B_gpu);
 
-       for(int i = 0; i < 10; i++)
-       {
-               ASSERT_EQ(O[i],0);
-       }
+       cudaFree(A_gpu);
+       cudaFree(B_gpu);
 }
 #endif  // USE_GPU

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a65a9535/src/utils/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/utils/math_kernel.cu b/src/utils/math_kernel.cu
index 203f261..4dd91e0 100644
--- a/src/utils/math_kernel.cu
+++ b/src/utils/math_kernel.cu
@@ -1,4 +1,25 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
 #include <cmath>
+#include <algorithm>
 #include "singa/utils/math_kernel.h"
 
 #define CU2DBLOCK_X 32
@@ -7,433 +28,366 @@
 #define CU1DBLOCK 1024
 #define CU1DBLOCKF 1024.0
 
-
-//Cuda Kernel Functions
+// Cuda Kernel Functions
 
 __global__
-void kernel_sum_vec(float *data, float *sum , long n)
-{
-       int THREADS = blockDim.x;
-
-       __shared__ float aux[CU1DBLOCK];
-       int steps = (n - 1) / THREADS + 1;
-       aux[threadIdx.x] = data[threadIdx.x];
-
-       for(int i=1; i<steps; ++i) {
-               if(threadIdx.x+i*THREADS < n) {
-                       aux[threadIdx.x] += data[threadIdx.x+i*THREADS];
-               }
-       }
-
-       int total_threads = THREADS;
-       __syncthreads();
-
-       while(total_threads > 1) {
-               int half_point = ((1+total_threads) >> 1);
-               if (threadIdx.x < half_point) {
-                       if(threadIdx.x+half_point < total_threads) {
-                               aux[threadIdx.x] += aux[threadIdx.x + 
half_point];
-                       }
-               }
-               __syncthreads();
-               total_threads = ((total_threads+1) >> 1);
-       }
-
-       __syncthreads();
-       *sum = aux[0];
-}
+void kernel_sum_vec(float *data, float *sum , int n) {
+  int THREADS = blockDim.x;
 
-__global__
-void kernel_sum_col(const float *src_mat_data, float *dst_vec_data, long rows, 
long cols, long stride)
-{
-           int j = blockIdx.x;
-               int THREADS = blockDim.x;
-               if(j >= cols) {
-                       return;
-               }
-
-               __shared__ float aux[CU1DBLOCK];
-               int steps = (rows - 1) / THREADS + 1;
-               aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
-               for(int i=1; i<steps; ++i) {
-                       if(threadIdx.x+i*THREADS < rows) {
-                               aux[threadIdx.x] += 
src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
-                       }
-               }
-
-               int total_threads = THREADS;
-               __syncthreads();
-               while(total_threads > 1) {
-                       int half_point = ((1+total_threads) >> 1);
-                       if (threadIdx.x < half_point) {
-                               if(threadIdx.x+half_point < total_threads) {
-                                       aux[threadIdx.x] += aux[threadIdx.x + 
half_point];
-                               }
-                       }
-                       __syncthreads();
-                       total_threads = ((total_threads+1) >> 1);
-               }
-
-               __syncthreads();
-               dst_vec_data[j] = aux[0];
-}
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (n - 1) / THREADS + 1;
+  aux[threadIdx.x] = data[threadIdx.x];
 
-__global__
-void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data, 
float* des_mat_data,long rows, long cols, long stride)
-{
-       long i = blockIdx.x * blockDim.x + threadIdx.x;
-       long j = blockIdx.y * blockDim.y + threadIdx.y;
-       long num_threads_x = blockDim.x * gridDim.x;
-       long num_threads_y = blockDim.y * gridDim.y;
-       long index = 0;
-       for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) {
-               index = j * stride + i;
-               des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
-       }
-}
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x + i * THREADS < n) {
+      aux[threadIdx.x] += data[threadIdx.x+i*THREADS];
+    }
+  }
 
-__global__ static
-void kernel_set_value(float *data, float value, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               data[index] = value;
-       }
+  int total_threads = THREADS;
+  __syncthreads();
+
+  while (total_threads > 1) {
+    int half_point = ((1+total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x+half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads+1) >> 1);
+  }
+
+  __syncthreads();
+  *sum = aux[0];
 }
 
 __global__
-void kernel_scale(const float *src_data, float *des_data, float alpha, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data[index] * alpha;
-       }
+void kernel_sum_col(const float *src_mat_data,
+    float *dst_vec_data, int rows, int cols, int stride) {
+  int j = blockIdx.x;
+  int THREADS = blockDim.x;
+  if (j >= cols) {
+    return;
+  }
+
+  __shared__ float aux[CU1DBLOCK];
+  int steps = (rows - 1) / THREADS + 1;
+  aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
+  for (int i = 1; i < steps; ++i) {
+    if (threadIdx.x+i*THREADS < rows) {
+      aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
+    }
+  }
+
+  int total_threads = THREADS;
+  __syncthreads();
+  while (total_threads > 1) {
+    int half_point = ((1+total_threads) >> 1);
+    if (threadIdx.x < half_point) {
+      if (threadIdx.x+half_point < total_threads) {
+        aux[threadIdx.x] += aux[threadIdx.x + half_point];
+      }
+    }
+    __syncthreads();
+    total_threads = ((total_threads+1) >> 1);
+  }
+
+  __syncthreads();
+  dst_vec_data[j] = aux[0];
 }
 
 __global__
-void kernel_scale_grad(float *data, float alpha, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               data[index] = alpha;
-       }
+void kernel_add_vec_row(const float *src_vec_data, const float *src_mat_data,
+    float* des_mat_data, int rows, int cols, int stride) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+  int num_threads_x = blockDim.x * gridDim.x;
+  int num_threads_y = blockDim.y * gridDim.y;
+  int index = 0;
+  for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
+    index = j * stride + i;
+    des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+  }
 }
 
 __global__
-void kernel_exp(const float *src_data, float *des_data, float alpha, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = pow(-src_data[index],alpha);
-       }
+void kernel_exp(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = exp(src_data[index]);
+  }
 }
 
 __global__
-void kernel_exp_grad(const float *src_data, float *des_data, float alpha, long 
n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data[index] * log(alpha);
-       }
+void kernel_log(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = log(src_data[index]);
+  }
 }
 
 __global__
-void kernel_sigmoid(const float *src_data, float *des_data, float alpha, long 
n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = 1.0f / (1.0f + expf(-src_data[index]) * 
alpha);
-       }
+void kernel_sigmoid(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
 }
 
 __global__
-void kernel_sigmoid_grad(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data[index] * (1.0f - src_data[index]) * 
alpha;
-       }
+void kernel_sigmoid_grad(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+  des_data[index] = src_data[index] * (1.0f - src_data[index]);
+  }
 }
 
 __global__
-void kernel_relu(const float *src_data, float *des_data, float alpha, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = 1.0f / ( 1 - alpha ) * max( src_data[index], 
0.0f ) + alpha * src_data[index];
-       }
+void kernel_relu(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = max(src_data[index], 0.0f);
+  }
 }
 
 __global__
-void kernel_relu_grad(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data[index] > 0.0f ? 1.0f : alpha;
-       }
+void kernel_relu_grad(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+  }
 }
 
-
 __global__
-void kernel_tanh(const float *src_data, float *des_data, float alpha, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = tanhf( src_data[index] * alpha );
-       }
+void kernel_tanh(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = tanhf(src_data[index]);
+  }
 }
 
 __global__
-void kernel_tanh_grad(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = alpha * (1.0f - src_data[index] * 
src_data[index] );
-       }
+void kernel_tanh_grad(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = (1.0f - src_data[index] * src_data[index]);
+  }
 }
 
 __global__
-void kernel_softplus(const float *src_data, float *des_data, float alpha, long 
n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = logf(1 + expf(src_data[index]));
-       }
+void kernel_softplus(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = logf(1 + expf(src_data[index]));
+  }
 }
 
 __global__
-void kernel_softplus_grad(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
-       }
+void kernel_softplus_grad(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+  }
 }
 
 __global__
-void kernel_square(const float *src_data, float *des_data, float alpha, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data[index] * src_data[index];
-       }
+void kernel_square(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] * src_data[index];
+  }
 }
 
 __global__
-void kernel_square_grad(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = 2 * sqrt(src_data[index]);
-       }
+void kernel_square_grad(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = 2 * sqrt(src_data[index]);
+  }
 }
 
 __global__
-void kernel_sqrt(const float *src_data, float *des_data, float alpha, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = sqrt(src_data[index]);
-       }
+void kernel_sqrt(const float *src_data, float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = sqrt(src_data[index]);
+  }
 }
 
 __global__
-void kernel_threshold(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
-       }
+void kernel_pow(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = pow(src_data_a[index], src_data_b[index]);
+  }
 }
 
 __global__
-void kernel_add(const float *src_data_a, const float *src_data_b, float 
*des_data, float alpha, float beta, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data_a[index] + src_data_b[index];
-       }
+void kernel_mult(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data_a[index] * src_data_b[index];
+  }
 }
 
 __global__
-void kernel_sub(const float *src_data_a, const float *src_data_b, float 
*des_data, float alpha, float beta, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data_a[index] - src_data_b[index];
-       }
+void kernel_div(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data_a[index] / src_data_b[index];
+  }
 }
 
-__global__
-void kernel_mult(const float *src_data_a, const float *src_data_b, float 
*des_data, float alpha, float beta, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data_a[index] * src_data_b[index];
-       }
+__global__ static
+void kernel_set_value(float *data, float value, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    data[index] = value;
+  }
 }
 
 __global__
-void kernel_div(const float *src_data_a, const float *src_data_b, float 
*des_data, float alpha, float beta, long n)
-{
-       long index = blockIdx.x * blockDim.x + threadIdx.x;
-       long num_threads = blockDim.x * gridDim.x;
-       for(; index<n; index+=num_threads) {
-               des_data[index] = src_data_a[index] / src_data_b[index];
-       }
+void kernel_threshold(const float *src_data, float *des_data,
+    float alpha, int n) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int num_threads = blockDim.x * gridDim.x;
+  for (; index < n; index += num_threads) {
+    des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
+  }
 }
 
 //
-namespace singa{
+namespace singa {
 
-void singa_gpu_sum_vec(float *data, float *sum , long n)
-{
-       long threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
-       // here, we only need one block
-       long num_blocks = 1;
+void singa_gpu_sum_vec(float *data, float *sum , int n) {
+  int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+  //  here, we only need one block
+  int num_blocks = 1;
 
-       kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n);
+  kernel_sum_vec<<<num_blocks, threads_per_block>>>(data, sum, n);
 }
 
-void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data, long 
rows, long cols, long stride)
-{
-       long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
-       long num_blocks = cols;
+void singa_gpu_sum_col(const float *src_mat_data, float *dst_vec_data,
+    int rows, int cols, int stride) {
+  int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
+  int num_blocks = cols;
 
-       kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, 
dst_vec_data, rows, cols, stride);
+  kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data,
+      rows, cols, stride);
 }
 
-void singa_gpu_add_vec_row(const float *src_vec_data, const float 
*src_mat_data, float *des_mat_data ,long rows, long cols, long stride)
-{
-       dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
-       dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 
0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
-       kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, 
src_mat_data, des_mat_data,rows, cols, stride);
+void singa_gpu_add_vec_row(const float *src_vec_data, const float 
*src_mat_data,
+    float *des_mat_data , int rows, int cols, int stride) {
+  dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
+  dim3 num_blocks(cols/threads_per_block.x +
+    (cols%threads_per_block.x == 0 ? 0 : 1),
+    rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
+  kernel_add_vec_row<<<num_blocks, threads_per_block>>>
+    (src_vec_data, src_mat_data, des_mat_data, rows, cols, stride);
 }
 
-void singa_gpu_set_value(float *data, float value, long n)
-{
-       kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n);
+void singa_gpu_exp(const float *src_data, float *des_data, int n) {
+  kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_scale(const float *src_data, float *des_data, float alpha, long 
n)
-{
-       kernel_scale<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_log(const float *src_data, float *des_data, int n) {
+  kernel_log<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_scale_grad(float *data, float alpha, long n)
-{
-       kernel_scale_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, alpha, n);
+void singa_gpu_sigmoid(const float *src_data, float *des_data, int n) {
+  kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_exp(const float *src_data, float *des_data, float alpha, long n)
-{
-       kernel_exp<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_sigmoid_grad(const float *src_data, float *des_data,
+    int n) {
+  kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
+    (src_data, des_data, n);
 }
 
-void singa_gpu_exp_grad(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       kernel_exp_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_relu(const float *src_data, float *des_data, int n) {
+  kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_sigmoid(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       kernel_sigmoid<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_relu_grad(const float *src_data, float *des_data, int n) {
+  kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_sigmoid_grad(const float *src_data, float *des_data, float 
alpha, long n)
-{
-       kernel_sigmoid_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, 
des_data, alpha, n);
+void singa_gpu_tanh(const float *src_data, float *des_data, int n) {
+  kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_relu(const float *src_data, float *des_data, float alpha, long 
n)
-{
-       kernel_relu<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_tanh_grad(const float *src_data, float *des_data, int n) {
+  kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_relu_grad(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       kernel_relu_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, 
des_data, alpha, n);
+void singa_gpu_softplus(const float *src_data, float *des_data, int n) {
+  kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_tanh(const float *src_data, float *des_data, float alpha, long 
n)
-{
-       kernel_tanh<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_softplus_grad(const float *src_data, float *des_data, int n) {
+  kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
+    (src_data, des_data, n);
 }
 
-void singa_gpu_tanh_grad(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       kernel_tanh_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, 
des_data, alpha, n);
+void singa_gpu_square(const float *src_data, float *des_data, int n) {
+  kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_softplus(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       kernel_softplus<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_square_grad(const float *src_data, float *des_data, int n) {
+  kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
n);
 }
 
-void singa_gpu_softplus_grad(const float *src_data, float *des_data, float 
alpha, long n)
-{
-       kernel_softplus_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, 
des_data, alpha, n);
+void singa_gpu_sqrt(const float *src_data, float *des_data, int n) {
+  kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, n);
 }
 
-void singa_gpu_square(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       kernel_square<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_pow(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n) {
+  kernel_pow<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
+    (src_data_a, src_data_b, des_data, n);
 }
 
-void singa_gpu_square_grad(const float *src_data, float *des_data, float 
alpha, long n)
-{
-       kernel_square_grad<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, 
des_data, alpha, n);
+void singa_gpu_mult(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n) {
+  kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
+    (src_data_a, src_data_b, des_data, n);
 }
 
-void singa_gpu_sqrt(const float *src_data, float *des_data, float alpha, long 
n)
-{
-       kernel_sqrt<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, des_data, 
alpha, n);
+void singa_gpu_div(const float *src_data_a, const float *src_data_b,
+    float *des_data, int n) {
+  kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
+    (src_data_a, src_data_b, des_data, n);
 }
 
-void singa_gpu_threshold(const float *src_data, float *des_data, float alpha, 
long n)
-{
-       kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data, 
des_data, alpha, n);
+void singa_gpu_set_value(float *data, float value, int n) {
+  kernel_set_value<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(data, value, n);
 }
 
-void singa_gpu_add(const float *src_data_a, const float *src_data_b, float 
*des_data, float alpha, float beta, long n)
-{
-       kernel_add<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, 
des_data, alpha, beta, n);
+void singa_gpu_threshold(const float *src_data, float *des_data,
+    float alpha, int n) {
+  kernel_threshold<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>
+    (src_data, des_data, alpha, n);
 }
 
-void singa_gpu_sub(const float *src_data_a, const float *src_data_b, float 
*des_data, float alpha, float beta, long n)
-{
-       kernel_sub<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, 
des_data, alpha, beta, n);
-}
-
-void singa_gpu_mult(const float *src_data_a, const float *src_data_b, float 
*des_data, float alpha, float beta, long n)
-{
-       kernel_mult<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, 
des_data, alpha, beta, n);
-}
-
-void singa_gpu_div(const float *src_data_a, const float *src_data_b, float 
*des_data, float alpha, float beta, long n)
-{
-       kernel_div<<<ceil(n/CU1DBLOCKF), CU1DBLOCKF>>>(src_data_a, src_data_b, 
des_data, alpha, beta, n);
-}
-
-
-}//namespace singa_gpu
+}  // namespace singa


Reply via email to