[03/19] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

kaiping Sun, 15 Nov 2015 22:09:21 -0800

SINGA-80 New Blob Level and Address Level Math Operation Interface


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c13e0370
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c13e0370
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c13e0370

Branch: refs/heads/master
Commit: c13e0370906371ca001e755268c51e656dfde0fa
Parents: bbb7dbc
Author: seaok <[email protected]>
Authored: Fri Oct 30 10:02:22 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Mon Nov 9 17:04:48 2015 +0800

----------------------------------------------------------------------
 Makefile.gpu                     | 153 ++++++++++++++
 include/singa/blob/math_kernel.h |  14 ++
 src/blob/math_addr.cc            |  64 ++++++
 src/blob/math_blob.cc            | 148 +++++++------
 src/blob/math_kernel.cu          |  78 +++++++
 src/test/test_math.cc            | 385 ++++++++++++++++++++++++++++++++++
 6 files changed, 775 insertions(+), 67 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/Makefile.gpu
----------------------------------------------------------------------
diff --git a/Makefile.gpu b/Makefile.gpu
new file mode 100644
index 0000000..9c9051c
--- /dev/null
+++ b/Makefile.gpu
@@ -0,0 +1,153 @@
+#/**
+# * Copyright 2015 The Apache Software Foundation
+# *
+# * Licensed to the Apache Software Foundation (ASF) under one
+# * or more contributor license agreements.  See the NOTICE file
+# * distributed with this work for additional information
+# * regarding copyright ownership.  The ASF licenses this file
+# * to you under the Apache License, Version 2.0 (the
+# * "License"); you may not use this file except in compliance
+# * with the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+###################User Config Varaibles #############################
+# third-party library installation folder
+HOME_DIR := /usr
+
+CUDA_DIR := /usr/local/cuda
+
+#CUDA_DIR :=
+
+# Lib folder for system and external libs. You may need to change it.
+LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib 
$(CUDA_DIR)/lib $(CUDA_DIR)/lib64
+# Header folder for system and external libs. You may need to change it.
+INCLUDE_DIRS := $(HOME_DIR)/include ./include 
$(HOME_DIR)/local/include/zookeeper $(CUDA_DIR)/include
+# g++ location, should support c++11, tested with 4.8.1
+CXX := g++
+CUCXX := nvcc
+
+######################Setting Varialbes#######################################
+LIBRARIES := glog protobuf openblas zmq czmq zookeeper_mt
+
+ifneq ($(CUDA_DIR),)
+       LIBRARIES := $(LIBRARIES) cublas cudart curand
+endif
+
+LDFLAGS := $(foreach librarydir, $(LIBRARY_DIRS), -L$(librarydir))\
+       $(foreach library, $(LIBRARIES), -l$(library))
+# Folder to store compiled files
+BUILD_DIR := .libs
+MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
+ZK_FLAGS :=-DTHREADED -fpermissive
+CXXFLAGS := -O2 -msse3 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \
+       $(MSHADOW_FLAGS) -DCPU_ONLY=1 $(ZK_FLAGS)\
+       -funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
+CUCXXFLAGS := $(MSHADOW_FLAGS) -std=c++11 -G $(CUDA_ARCH) \
+       $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir))
+
+#Add device compile option
+ifeq ($(CUDA_DIR),)
+       MSHADOW_FLAGS := $(MSHADOW_FLAGS) -DCPU_ONLY
+       CXXFLAGS := $(CXXFLAGS) -DCPU_ONLY
+endif
+
+# find user defined .proto file, and then compute the corresponding .h, .cc
+# files, which cannot be found by shell find, because they haven't been
+# generated currently
+PROTOS := $(shell find src/proto/ -name "*.proto")
+PROTO_SRCS :=$(PROTOS:.proto=.pb.cc)
+PROTO_HDRS :=$(patsubst src%, include%, $(PROTOS:.proto=.pb.h))
+PROTO_OBJS :=$(addprefix $(BUILD_DIR)/, $(PROTO_SRCS:.cc=.o))
+
+# each singa src file will generate a .o file
+SINGA_SRCS := $(shell find src/ \( -path "src/test" -o -path "src/main.cc" -o 
-path "src/utils/tool.cc" \) \
+       -prune -o \( -name "*.cc" -type f \) -print )
+SINGA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_SRCS:.cc=.o)) \
+       $(PROTO_OBJS) )
+-include $(SINGA_OBJS:%.o=%.P)
+
+TEST_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cc")
+TEST_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_SRCS:.cc=.o)))
+-include $(TEST_OBJS:%.o=%.P)
+
+TEST_CUDA_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cu")
+TEST_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_CUDA_SRCS:.cu=.o)))
+-include $(TEST_CUDA_OBJS:%.o=%.P)
+
+SINGA_CUDA_SRCS :=$(shell find src/ -maxdepth 2 -name "*.cu")
+SINGA_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, 
$(SINGA_CUDA_SRCS:.cu=.o)))
+-include $(SINGA_CUDA_OBJS:%.o=%.P)
+
+GTEST_SRC := include/gtest/gtest-all.cc
+GTEST_HDR := include/gtest/gtest.h
+GTEST_LIB := $(BUILD_DIR)/libgtest.a
+
+OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) )
+CUOBJS := $(sort $(SINGA_CUDA_OBJS) $(TEST_CUDA_OBJS) )
+
+########################Compilation Section###################################
+.PHONY: singa test
+
+singa: $(PROTO_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS)
+       $(CXX) -shared -o $(BUILD_DIR)/libsinga.so $(SINGA_OBJS)
+       $(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/main.cc -o singa 
$(CXXFLAGS) $(LDFLAGS)
+       @echo
+       $(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/utils/tool.cc -o singatool 
$(CXXFLAGS) $(LDFLAGS)
+       @echo
+
+loader: proto $(LOADER_OBJS)
+       $(CXX) $(LOADER_OBJS) -o $(BUILD_DIR)/loader $(CXXFLAGS) $(LDFLAGS)
+       @echo
+
+test:  proto $(GTEST_LIB) $(TEST_OBJS) $(TEST_CUDA_OBJS) $(SINGA_OBJS) 
$(SINGA_CUDA_OBJS)
+       $(CXX) $(TEST_OBJS) $(TEST_CUDA_OBJS) include/gtest/gtest_main.cc 
$(GTEST_LIB) \
+               $(SINGA_OBJS) $(SINGA_CUDA_OBJS) -o $(BUILD_DIR)/test 
$(CXXFLAGS) $(LDFLAGS)
+       @echo
+
+$(GTEST_LIB): $(GTEST_HDR) $(GTEST_SRC)
+       $(CXX) $(GTEST_SRC) -c -o $(BUILD_DIR)/gtest-all.o $(CXXFLAGS)
+       ar -rv $(GTEST_LIB) $(BUILD_DIR)/gtest-all.o
+
+# compile all files
+$(OBJS):$(BUILD_DIR)/%.o : %.cc
+       @mkdir -p $(dir $@)
+       $(CXX) $<  $(CXXFLAGS) -MMD -c -o $@
+       cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
+       sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
+               -e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> 
$(BUILD_DIR)/$*.P; \
+       rm -f $*.d
+
+$(CUOBJS):$(BUILD_DIR)/%.o : %.cu
+       @mkdir -p $(dir $@)
+       $(CUCXX) $< -c -o $@ $(CUCXXFLAGS)
+       cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \
+       sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \
+               -e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> 
$(BUILD_DIR)/$*.P; \
+       rm -f $*.d
+
+proto: $(PROTO_OBJS)
+
+$(PROTO_SRCS): $(PROTOS)
+       protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS)
+       mkdir -p include/proto/
+       cp src/proto/*.pb.h include/proto/
+       mkdir -p tool/pb2/
+       touch tool/pb2/__init__.py
+       protoc --proto_path=src/proto --python_out=tool/pb2/ $(PROTOS)
+       @echo
+
+clean:
+       rm -rf *.a *.so
+       rm -rf include/proto/*
+       rm -rf src/proto/*.pb.h src/proto/*.pb.cc
+       rm -rf tool/pb2/*
+       rm -rf $(BUILD_DIR)
+       @echo

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/include/singa/blob/math_kernel.h
----------------------------------------------------------------------
diff --git a/include/singa/blob/math_kernel.h b/include/singa/blob/math_kernel.h
new file mode 100644
index 0000000..9aaf4c2
--- /dev/null
+++ b/include/singa/blob/math_kernel.h
@@ -0,0 +1,14 @@
+#ifndef MATH_KERNEL_H
+#define MATH_KERNEL_H
+
+namespace singa{
+
+extern "C" {
+       void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, 
long cols, long stride);
+
+       void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float 
*des_mat_data, long rows, long cols, long stride);
+};
+
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_addr.cc
----------------------------------------------------------------------
diff --git a/src/blob/math_addr.cc b/src/blob/math_addr.cc
index f28fdcb..799a749 100644
--- a/src/blob/math_addr.cc
+++ b/src/blob/math_addr.cc
@@ -3,6 +3,9 @@ extern "C"
    #include <cblas.h>
 }
 
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+
 #include "singa/blob/math_addr.h"
 #include "singa/blob/singa_op.h"
 
@@ -47,5 +50,66 @@ float cpu_dot(const float * A, const float * B, const int n)
        return sum;
 }
 
+//Trick: swap A and B
+//
+void gpu_gemm(const float * A, const float * B, const int m, const int n, 
const int k, const float alpha, const float beta, const bool TranA, const bool 
TranB, float * C)
+{
+    int lda = TranA ? m : k;
+    int ldb = TranB ? k : n;
+    int ldc = n;
+
+    cublasOperation_t tA= (TranA==false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+       cublasOperation_t tB= (TranB==false) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+    cublasHandle_t handle;
+    cublasCreate(&handle);
+
+    cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, A, lda, &beta, C, 
ldc);
+
+    cublasDestroy(handle);
+}
+
+void gpu_gemv(const float * A, const float * B, const int m, const int n, 
const float alpha, const float beta, const bool TranA, float * C)
+{
+       int lda = n ;
+       cublasOperation_t tA= (TranA==true) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+       cublasHandle_t handle;
+       cublasCreate(&handle);
+
+       cublasSgemv(handle, tA , n , m ,&alpha , A , lda , B , 1 ,&beta , C , 
1);
+
+       cublasDestroy(handle);
+
+}
+
+
+void gpu_axpy(const float * A, const int n, const float alpha, float * B)
+{
+
+       cublasHandle_t handle;
+       cublasCreate(&handle);
+
+       cublasSaxpy(handle,n,&alpha,A,1,B,1);
+
+       cublasDestroy(handle);
+
+}
+
+
+float gpu_dot(const float * A, const float * B, const int n)
+{
+       cublasHandle_t handle;
+       cublasCreate(&handle);
+
+       float result=0.0;
+
+       cublasSdot(handle,n,A,1,B,1,&result);
+
+       cublasDestroy(handle);
+
+       return result;
+
+}
 
 } // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_blob.cc
----------------------------------------------------------------------
diff --git a/src/blob/math_blob.cc b/src/blob/math_blob.cc
index ff81667..9421367 100644
--- a/src/blob/math_blob.cc
+++ b/src/blob/math_blob.cc
@@ -1,4 +1,5 @@
 #include "singa/blob/math_blob.h"
+#include "singa/blob/math_kernel.h"
 
 namespace singa {
 
@@ -18,24 +19,26 @@ int get_size(const std::vector<int>& shape)
 
 void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * 
C, float alpha, float beta)
 {
-       if(xpu == cpu)
+       if(check_shape_mmm(A, B, *C))
        {
-         if(check_shape_mmm(A, B, *C))
-         {
-           int m = C->shape().at(0);
+               int m = C->shape().at(0);
            int n = C->shape().at(1);
            int k = A.isTranspose() ? A.shape().at(0) : A.shape().at(1);
            bool TranA = A.isTranspose();
            bool TranB = B.isTranspose();
-           cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, 
TranB, C->mutable_cpu_data());
-         }
-         else{
-         // report errors here
-         }
+
+               if(xpu == cpu)
+               {
+                       cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, 
beta, TranA, TranB, C->mutable_cpu_data());
+               }
+               if(xpu == gpu)
+               {
+                       //gpu part
+                       gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, 
beta, TranA, TranB, C->mutable_gpu_data());
+               }
        }
-       if(xpu == gpu)
-       {
-         //gpu part
+       else{
+         // report errors here
        }
 }
 //C = alpha*A*B+beta*C, A, B and C are matrix
@@ -50,22 +53,24 @@ void MMDot(XPU xpu, const Blob<float> & A, const 
Blob<float> & B, Blob<float> *
 
 void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> 
* C)
 {
-       if(xpu == cpu)
+       if(check_shape_mvv(A, B, *C))
        {
-               if(check_shape_mvv(A, B, *C))
+               int m = B.shape().at(0);
+               int n = C->shape().at(0);
+               bool TranA = A.isTranspose();
+
+               if(xpu == cpu)
                {
-                       int m = B.shape().at(0);
-                       int n = C->shape().at(0);
-                       bool TranA = A.isTranspose();
                        cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA, 
C->mutable_cpu_data());
                }
-               else{
-                       // report errors here
+               if(xpu == gpu)
+               {
+                       //gpu part
+                       gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA, 
C->mutable_gpu_data());
                }
        }
-       if(xpu == gpu)
-       {
-         //gpu part
+       else{
+               // report errors here
        }
        
 }
@@ -74,21 +79,23 @@ void MVDot(XPU xpu, const Blob<float> & A, const 
Blob<float> & B, Blob<float> *
  
 void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> 
* C)
 {
-       if(xpu == cpu)
+       if(check_shape_vvm(A, B, *C))
        {
-               if(check_shape_vvm(A, B, *C))
+               int m = C->shape().at(0);
+               int n = C->shape().at(1);
+
+               if(xpu == cpu)
                {
-                       int m = C->shape().at(0);
-                       int n = C->shape().at(1);
                        cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, 
false, false, C->mutable_cpu_data());
                }
-               else{
-               // report errors here
+               if(xpu == gpu)
+               {
+                       //gpu part
+                       gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, 
false, false, C->mutable_gpu_data());
                }
        }
-       if(xpu == gpu)
-       {
-         //gpu part
+       else{
+       // report errors here
        }
 }
 // C is matrix,A and B are vector
@@ -97,20 +104,21 @@ void VVDot(XPU xpu, const Blob<float> & A, const 
Blob<float> & B, Blob<float> *
 float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B)
 {
        float res = 0;
-       if(xpu == cpu)
+       if(check_shape_equal(A, B, B))
        {
-               if(check_shape_equal(A, B, B))
-               {
-                       int n = get_size(A.shape());
+               int n = get_size(A.shape());
+               if(xpu == cpu)
+               {       
                        res = cpu_dot(A.cpu_data(), B.cpu_data(), n);
                }
-               else{
-               // report errors here
+               if(xpu == gpu)
+               {
+               //gpu part
+                       res = gpu_dot(A.gpu_data(), B.gpu_data(), n);
                }
        }
-       if(xpu == gpu)
-       {
-         //gpu part
+       else{
+       // report errors here
        }
        return res;
 }
@@ -118,19 +126,20 @@ float VVdot(XPU xpu, const Blob<float> & A, const 
Blob<float> & B)
 
 void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha)
 {
-       if(xpu == cpu)
+       if(check_shape_equal(A, *B, *B))
        {
-               if(check_shape_equal(A, *B, *B))
+
+               if(xpu == cpu)
                {
                        cpu_axpy(A.cpu_data(), get_size(A.shape()), alpha, 
B->mutable_cpu_data());
                }
-               else{
-               // report errors here
+               if(xpu == gpu)
+               {
+                       gpu_axpy(A.gpu_data(), get_size(A.shape()), alpha, 
B->mutable_gpu_data());
                }
        }
-       if(xpu == gpu)
-       {
-         //gpu part
+       else{
+       // report errors here
        }
 }
 // element-wise operation: Bi = alpha*Ai+Bi  A and B should have the same size
@@ -143,47 +152,52 @@ inline void Repmat(XPU xpu, const Blob<float> & A, 
Blob<float> * B)
 
 void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float 
beta)
 {
-       if(xpu == cpu)
+       if(check_shape_mv(*B, A))
        {
-               if(check_shape_mv(*B, A))
+               int m = get_size(A.shape());
+               int n = get_size(B->shape()) / m;
+
+               if(xpu == cpu)
                {
-                       int m = get_size(A.shape());
-                       int n = get_size(B->shape()) / m;
                        const float * univ = cpu_uni_vec(n);
                        cpu_gemm(A.cpu_data(), univ, m, n, 1, alpha, beta, 
false, false, B->mutable_cpu_data());
                        delete univ;
                }
-               else{
-               // report errors here
-               }
+
+               if(xpu == gpu)
+               {
+                       
singa_add_vec_row(B->gpu_data(),A.gpu_data(),A.gpu_data(),m,n,n);
+               //gpu part
+               }       
+       }
+       else{
+       // report errors here
        }
-       if(xpu == gpu)
-       {
-         //gpu part
-       }       
 }
 // A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij
 // will use gemm. faster than general expand_f
 
 void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float 
beta)
 {
-       if(xpu == cpu)
+       if(check_shape_mv(A, *B))
        {
-               if(check_shape_mv(A, *B))
+               int m = get_size(B->shape());
+               int n = get_size(A.shape()) / m;
+
+               if(xpu == cpu)
                {
-                       int m = get_size(B->shape());
-                       int n = get_size(A.shape()) / m;
                        const float * univ = cpu_uni_vec(n);
                        cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta, 
false, false, B->mutable_cpu_data());
                        delete univ;
                }
-               else{
-               // report errors here
+               if(xpu == gpu)
+               {
+                       singa_sum_col(A.gpu_data(),B->gpu_data(),m,n,n);
+               //gpu part
                }
        }
-       if(xpu == gpu)
-       {
-         //gpu part
+       else{
+       // report errors here
        }
 }
 // B is a vector, A is a matrix , Bi = \sigma_j_{alpha*Aij}+beta*Bi

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/blob/math_kernel.cu b/src/blob/math_kernel.cu
new file mode 100644
index 0000000..6b2a709
--- /dev/null
+++ b/src/blob/math_kernel.cu
@@ -0,0 +1,78 @@
+#include "singa/blob/math_kernel.h"
+
+#define CU2DBLOCK_X 32
+#define CU2DBLOCK_Y 32
+
+#define CU1DBLOCK 1024 
+#define CU1DBLOCKF 1024.0 
+
+
+//Cuda Kernel Functions
+__global__
+void kernel_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long 
cols, long stride)
+{
+           int j = blockIdx.x;
+               int THREADS = blockDim.x;
+               if(j >= cols) {
+                       return;
+               }
+
+               __shared__ float aux[CU1DBLOCK];
+               int steps = (rows - 1) / THREADS + 1;
+               aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride];
+               for(int i=1; i<steps; ++i) {
+                       if(threadIdx.x+i*THREADS < rows) {
+                               aux[threadIdx.x] += 
src_mat_data[j+(threadIdx.x+i*THREADS)*stride];
+                       }
+               }
+
+               int total_threads = THREADS;
+               __syncthreads();
+               while(total_threads > 1) {
+                       int half_point = ((1+total_threads) >> 1);
+                       if (threadIdx.x < half_point) {
+                               if(threadIdx.x+half_point < total_threads) {
+                                       aux[threadIdx.x] += aux[threadIdx.x + 
half_point];                                                                    
                                                              
+                               }
+                       }
+                       __syncthreads();
+                       total_threads = ((total_threads+1) >> 1);
+               }
+
+               __syncthreads();
+               dst_vec_data[j] = aux[0];
+}
+
+__global__ 
+void kernel_add_vec_row(float *src_vec_data, float *src_mat_data, float* 
des_mat_data,long rows, long cols, long stride)
+{
+       long i = blockIdx.x * blockDim.x + threadIdx.x;
+       long j = blockIdx.y * blockDim.y + threadIdx.y;
+       long num_threads_x = blockDim.x * gridDim.x;
+       long num_threads_y = blockDim.y * gridDim.y;
+       long index = 0;
+       for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) {
+               index = j * stride + i;
+               des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+       }
+}
+
+//
+namespace singa{
+
+void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long 
cols, long stride)
+{
+       long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
+       long num_blocks = cols;
+
+       kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, 
dst_vec_data, rows, cols, stride);
+}
+
+void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float 
*des_mat_data ,long rows, long cols, long stride)
+{
+       dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
+       dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 
0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1));
+       kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, 
src_mat_data, des_mat_data,rows, cols, stride);
+}
+
+}//namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/test/test_math.cc
----------------------------------------------------------------------
diff --git a/src/test/test_math.cc b/src/test/test_math.cc
new file mode 100644
index 0000000..3856d1d
--- /dev/null
+++ b/src/test/test_math.cc
@@ -0,0 +1,385 @@
+#include "gtest/gtest.h"
+#include "singa/blob/math_addr.h"
+#include "singa/blob/math_blob.h"
+#include "singa/blob/math_kernel.h"
+#include "singa/blob/singa_op.h"
+
+#include <cuda_runtime.h>
+#include "cublas_v2.h"
+
+using namespace singa;
+using namespace std;
+
+TEST(MathTest, TestGemmCPU) {
+       float A[3][2] = {};
+       float B[3][2] = {};
+       float C[2][2] = {};
+       for(int i = 0; i < 3; i++)
+               for(int j = 0; j < 2; j++)
+               {
+                       A[i][j] = i+j;
+                       B[i][j] = i+j - i*j;
+               }
+       cpu_gemm(A[0], B[0], 2, 2, 3 , 1, 0, true, false, C[0]);
+       float D[2][2] = {};
+       for(int i = 0; i < 2; i++)
+               for(int j = 0; j < 2; j++)
+               {
+                       D[i][j] = 0;
+                       for(int k = 0; k < 3; k++)
+                               D[i][j] += A[k][i]*B[k][j];
+               }
+    for(int i = 0; i < 2; i++)
+        for(int j = 0; j < 2; j++)
+        {
+                       ASSERT_EQ(C[i][j], D[i][j]);
+               }
+}
+
+TEST(MathTest, TestGemvCPU) {
+       float A[4][3] = {}; 
+       float B[4]= {}; 
+       float C[3] = {}; 
+       float D[3] = {}; 
+
+       for(int i = 0; i < 4; i++)
+       {
+               for(int j = 0; j < 3; j++)
+               {
+                       A[j][i] = i-j + i*j;
+               }
+       }
+
+       for(int i = 0; i < 4; i++)B[i] = i;
+       for(int i = 0; i < 3; i++)C[i] = 10; 
+       cpu_gemv(A[0], B, 4, 3, 1, 1, true, C); 
+
+       for(int i = 0; i < 3; i++)
+       {
+               for(int j = 0; j < 4; j++)
+               {
+                       D[i] += A[j][i]*B[j];
+               }
+       }
+       for(int i = 0; i < 3; i++)
+       {
+               ASSERT_EQ(C[i], D[i]+10);
+       }
+}
+
+
+TEST(MathTest, TestAxpyCPU) {
+       float A[4][3] = {}; 
+       float C[4][3] = {}; 
+       float B[3][4] = {}; 
+       float D[3][4] = {};
+
+       for(int i = 0; i < 4; i++)
+       {
+               for(int j = 0; j < 3; j++)
+               {
+                       A[i][j] = i-j + i*j;
+                       B[j][i] = i-j + i*j;
+                       C[i][j] = A[i][j];
+                       D[j][i] = B[j][i];
+               }
+       }
+
+       cpu_axpy(A[0], 12, 2, B[0]);
+       for(int i = 0; i < 12; i++)
+       {
+               D[0][i] += 2*C[0][i];
+       }
+
+       for(int i = 0; i < 3; i++)
+       {
+               for(int j = 0; j < 4; j++)
+               {
+                       ASSERT_EQ(B[i][j],D[i][j]);
+               }
+       }
+}
+
+
+TEST(MathTest, TestGemmGPU) {
+       float A[3][2] = {};
+       float B[3][2] = {};
+       float C[2][2] = {};
+       for(int i = 0; i < 3; i++)
+       {
+               for(int j = 0; j < 2; j++)
+               {
+                       A[i][j] = i+j;
+                       B[i][j] = i+j - i*j;
+               }
+       }
+
+       float* A_gpu=NULL;
+       float* B_gpu=NULL;
+       float* C_gpu=NULL;
+
+       cudaMalloc((void**)&A_gpu, 3*2*sizeof(float));
+       cudaMalloc((void**)&B_gpu, 3*2*sizeof(float));
+       cudaMalloc((void**)&C_gpu, 2*2*sizeof(float));
+
+       cudaMemcpy(A_gpu,A,3*2*sizeof(float),cudaMemcpyHostToDevice);
+       cudaMemcpy(B_gpu,B,3*2*sizeof(float),cudaMemcpyHostToDevice);
+
+       gpu_gemm(A_gpu, B_gpu, 2, 2, 3 , 1, 0, true, false, C_gpu);
+
+       cudaMemcpy(C,C_gpu,2*2*sizeof(float),cudaMemcpyDeviceToHost);
+
+       float D[2][2] = {};
+       for(int i = 0; i < 2; i++)
+       {
+               for(int j = 0; j < 2; j++)
+               {
+                       D[i][j] = 0;
+                       for(int k = 0; k < 3; k++)
+                       {
+                               D[i][j] += A[k][i]*B[k][j];
+                       }
+               }
+       }
+
+       for(int i = 0; i < 2; i++)
+       {
+               for(int j = 0; j < 2; j++)
+               {
+                       ASSERT_EQ(C[i][j],D[i][j]);
+               }
+       }
+
+       cudaFree(A_gpu);
+       cudaFree(B_gpu);
+       cudaFree(C_gpu);
+}
+
+
+TEST(MathTest, TestGemvGPU) {
+       float A[4][3] = {};
+       float B[4]= {};
+       float C[3] = {};
+       float D[3] = {};
+
+       for(int i = 0; i < 4; i++)
+       {
+               for(int j = 0; j < 3; j++)
+               {
+                       A[i][j] = i-j + i*j;
+               }
+       }
+
+       for(int i = 0; i < 4; i++)B[i] = i;
+       for(int i = 0; i < 3; i++)C[i] = 10;
+
+       float* A_gpu=NULL;
+       float* B_gpu=NULL;
+       float* C_gpu=NULL;
+
+       cudaMalloc((void**)&A_gpu, 4*3*sizeof(float));
+       cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+       cudaMalloc((void**)&C_gpu, 3*sizeof(float));
+
+       cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice);
+       cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice);
+       cudaMemcpy(C_gpu,C,3*sizeof(float),cudaMemcpyHostToDevice);
+
+       gpu_gemv(A_gpu, B_gpu, 4, 3, 1, 1, true, C_gpu);
+
+       cudaMemcpy(C,C_gpu,3*sizeof(float),cudaMemcpyDeviceToHost);
+
+       for(int i = 0; i < 3; i++)
+       {
+               for(int j = 0; j < 4; j++)
+               {
+                       D[i] += A[j][i]*B[j];
+               }
+       }
+
+       for(int i = 0; i < 3; i++)
+       {
+               ASSERT_EQ(C[i],D[i]+10);
+       }
+
+       cudaFree(A_gpu);
+       cudaFree(B_gpu);
+       cudaFree(C_gpu);
+}
+
+
+TEST(MathTest, TestAxpyGPU) {
+       float A[4][3] = {};
+       float C[4][3] = {};
+       float B[3][4] = {};
+       float D[3][4] = {};
+
+       for(int i = 0; i < 4; i++)
+       {
+               for(int j = 0; j < 3; j++)
+               {
+                       A[i][j] = i-j + i*j;
+                       B[j][i] = i-j + i*j;
+                       C[i][j] = A[i][j];
+                       D[j][i] = B[j][i];
+               }
+       }
+
+       float* A_gpu=NULL;
+       float* B_gpu=NULL;
+
+       cudaMalloc((void**)&A_gpu, 4*3*sizeof(float));
+       cudaMalloc((void**)&B_gpu, 3*4*sizeof(float));
+
+       cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice);
+       cudaMemcpy(B_gpu,B,3*4*sizeof(float),cudaMemcpyHostToDevice);
+
+       gpu_axpy(A_gpu, 12, 2, B_gpu);
+
+       cudaMemcpy(A,A_gpu,4*3*sizeof(float),cudaMemcpyDeviceToHost);
+       cudaMemcpy(B,B_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost);
+
+       for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i];
+
+       for(int i = 0; i < 3; i++)
+       {
+               for(int j = 0; j < 4; j++)
+               {
+                       ASSERT_EQ(B[i][j],D[i][j]);
+               }
+       }
+
+       cudaFree(A_gpu);
+       cudaFree(B_gpu);
+}
+
+
+TEST(MathTest, TestDotGPU) {
+       float A[12];
+       float B[12];
+
+       for(int i = 0; i < 12; i++)
+       {
+               A[i]=i-1;
+               B[i]=i+1;
+       }
+
+       float* A_gpu=NULL;
+       float* B_gpu=NULL;
+
+       cudaMalloc((void**)&A_gpu, 12*sizeof(float));
+       cudaMalloc((void**)&B_gpu, 12*sizeof(float));
+
+       cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice);
+       cudaMemcpy(B_gpu,B,12*sizeof(float),cudaMemcpyHostToDevice);
+       float gpu_ret=gpu_dot(A_gpu,B_gpu,12);
+
+       float cpu_ret=0.0f;
+       for(int i = 0; i < 12; i++)
+       {
+               cpu_ret+=A[i]*B[i];
+       }
+
+       ASSERT_EQ(gpu_ret,cpu_ret);
+
+       cudaFree(A_gpu);
+       cudaFree(B_gpu);
+
+}
+
+TEST(MathTest, TestSingaSumColGPU) {
+
+       float A[3][4];
+       float B[4];
+       float C[4];
+
+       for(int i = 0; i < 3; i++)
+       {
+               for(int j = 0; j < 4; j++)
+               {
+                       A[i][j]=i+j;
+               }
+       }
+       
+       for(int i = 0; i < 4; i++)
+       {
+               B[i]=0.0f;
+               C[i]=0.0f;
+       }
+
+       float* A_gpu=NULL;
+       float* B_gpu=NULL;
+
+       cudaMalloc((void**)&A_gpu, 12*sizeof(float));
+       cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+       cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice);
+
+       singa_sum_col(A_gpu,B_gpu,3,4,4);
+
+       cudaMemcpy(B,B_gpu,4*sizeof(float),cudaMemcpyDeviceToHost);
+
+       for(int i = 0; i < 4; i++)
+       {
+               for(int j = 0; j < 3; j++)
+               {
+                       C[i]+=A[j][i];
+               }
+       }
+
+       for(int i = 0; i <4; i++)
+       {
+               ASSERT_EQ(B[i],C[i]);
+       }
+
+       cudaFree(A_gpu);
+       cudaFree(B_gpu);
+}
+
+TEST(MathTest, TestSingaAddVecRowGPU) {
+
+       float A[3][4];
+       float B[4];
+       float C[3][4];
+       float D[3][4];
+
+       for(int i = 0; i < 4; i++)
+       {
+               B[i]=i;
+       }
+
+       for(int i = 0; i < 3; i++)
+       {
+               for(int j = 0; j < 4; j++)
+               {
+                       A[i][j]=i+j;
+                       D[i][j]=A[i][j]+B[j];
+               }
+       }
+
+
+       float* A_gpu=NULL;
+       float* B_gpu=NULL;
+       float* C_gpu=NULL;
+
+       cudaMalloc((void**)&A_gpu, 3*4*sizeof(float));
+       cudaMalloc((void**)&B_gpu, 4*sizeof(float));
+       cudaMalloc((void**)&C_gpu, 3*4*sizeof(float));
+       cudaMemcpy(A_gpu,A,3*4*sizeof(float),cudaMemcpyHostToDevice);
+       cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice);
+
+       singa_add_vec_row(B_gpu,A_gpu,C_gpu,3,4,4);
+
+       cudaMemcpy(C,C_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost);
+
+       for(int i = 0; i < 3; i++)
+       {
+               for(int j = 0; j < 4; j++)
+               {
+                       ASSERT_EQ(C[i][j],D[i][j]);
+               }
+       }
+
+       cudaFree(A_gpu);
+       cudaFree(B_gpu);
+       cudaFree(C_gpu);
+}

[03/19] incubator-singa git commit: SINGA-80 New Blob Level and Address Level Math Operation Interface

Reply via email to