SINGA-80 New Blob Level and Address Level Math Operation Interface
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c13e0370 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c13e0370 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c13e0370 Branch: refs/heads/master Commit: c13e0370906371ca001e755268c51e656dfde0fa Parents: bbb7dbc Author: seaok <[email protected]> Authored: Fri Oct 30 10:02:22 2015 +0800 Committer: Wei Wang <[email protected]> Committed: Mon Nov 9 17:04:48 2015 +0800 ---------------------------------------------------------------------- Makefile.gpu | 153 ++++++++++++++ include/singa/blob/math_kernel.h | 14 ++ src/blob/math_addr.cc | 64 ++++++ src/blob/math_blob.cc | 148 +++++++------ src/blob/math_kernel.cu | 78 +++++++ src/test/test_math.cc | 385 ++++++++++++++++++++++++++++++++++ 6 files changed, 775 insertions(+), 67 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/Makefile.gpu ---------------------------------------------------------------------- diff --git a/Makefile.gpu b/Makefile.gpu new file mode 100644 index 0000000..9c9051c --- /dev/null +++ b/Makefile.gpu @@ -0,0 +1,153 @@ +#/** +# * Copyright 2015 The Apache Software Foundation +# * +# * Licensed to the Apache Software Foundation (ASF) under one +# * or more contributor license agreements. See the NOTICE file +# * distributed with this work for additional information +# * regarding copyright ownership. The ASF licenses this file +# * to you under the Apache License, Version 2.0 (the +# * "License"); you may not use this file except in compliance +# * with the License. You may obtain a copy of the License at +# * +# * http://www.apache.org/licenses/LICENSE-2.0 +# * +# * Unless required by applicable law or agreed to in writing, software +# * distributed under the License is distributed on an "AS IS" BASIS, +# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# * See the License for the specific language governing permissions and +# * limitations under the License. +# */ + +###################User Config Varaibles ############################# +# third-party library installation folder +HOME_DIR := /usr + +CUDA_DIR := /usr/local/cuda + +#CUDA_DIR := + +# Lib folder for system and external libs. You may need to change it. +LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib $(CUDA_DIR)/lib $(CUDA_DIR)/lib64 +# Header folder for system and external libs. You may need to change it. +INCLUDE_DIRS := $(HOME_DIR)/include ./include $(HOME_DIR)/local/include/zookeeper $(CUDA_DIR)/include +# g++ location, should support c++11, tested with 4.8.1 +CXX := g++ +CUCXX := nvcc + +######################Setting Varialbes####################################### +LIBRARIES := glog protobuf openblas zmq czmq zookeeper_mt + +ifneq ($(CUDA_DIR),) + LIBRARIES := $(LIBRARIES) cublas cudart curand +endif + +LDFLAGS := $(foreach librarydir, $(LIBRARY_DIRS), -L$(librarydir))\ + $(foreach library, $(LIBRARIES), -l$(library)) +# Folder to store compiled files +BUILD_DIR := .libs +MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 +ZK_FLAGS :=-DTHREADED -fpermissive +CXXFLAGS := -O2 -msse3 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \ + $(MSHADOW_FLAGS) -DCPU_ONLY=1 $(ZK_FLAGS)\ + -funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir)) +CUCXXFLAGS := $(MSHADOW_FLAGS) -std=c++11 -G $(CUDA_ARCH) \ + $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir)) + +#Add device compile option +ifeq ($(CUDA_DIR),) + MSHADOW_FLAGS := $(MSHADOW_FLAGS) -DCPU_ONLY + CXXFLAGS := $(CXXFLAGS) -DCPU_ONLY +endif + +# find user defined .proto file, and then compute the corresponding .h, .cc +# files, which cannot be found by shell find, because they haven't been +# generated currently +PROTOS := $(shell find src/proto/ -name "*.proto") +PROTO_SRCS :=$(PROTOS:.proto=.pb.cc) +PROTO_HDRS :=$(patsubst src%, include%, $(PROTOS:.proto=.pb.h)) +PROTO_OBJS :=$(addprefix $(BUILD_DIR)/, $(PROTO_SRCS:.cc=.o)) + +# each singa src file will generate a .o file +SINGA_SRCS := $(shell find src/ \( -path "src/test" -o -path "src/main.cc" -o -path "src/utils/tool.cc" \) \ + -prune -o \( -name "*.cc" -type f \) -print ) +SINGA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_SRCS:.cc=.o)) \ + $(PROTO_OBJS) ) +-include $(SINGA_OBJS:%.o=%.P) + +TEST_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cc") +TEST_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_SRCS:.cc=.o))) +-include $(TEST_OBJS:%.o=%.P) + +TEST_CUDA_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cu") +TEST_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_CUDA_SRCS:.cu=.o))) +-include $(TEST_CUDA_OBJS:%.o=%.P) + +SINGA_CUDA_SRCS :=$(shell find src/ -maxdepth 2 -name "*.cu") +SINGA_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_CUDA_SRCS:.cu=.o))) +-include $(SINGA_CUDA_OBJS:%.o=%.P) + +GTEST_SRC := include/gtest/gtest-all.cc +GTEST_HDR := include/gtest/gtest.h +GTEST_LIB := $(BUILD_DIR)/libgtest.a + +OBJS := $(sort $(SINGA_OBJS) $(TEST_OBJS) ) +CUOBJS := $(sort $(SINGA_CUDA_OBJS) $(TEST_CUDA_OBJS) ) + +########################Compilation Section################################### +.PHONY: singa test + +singa: $(PROTO_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) + $(CXX) -shared -o $(BUILD_DIR)/libsinga.so $(SINGA_OBJS) + $(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/main.cc -o singa $(CXXFLAGS) $(LDFLAGS) + @echo + $(CXX) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) src/utils/tool.cc -o singatool $(CXXFLAGS) $(LDFLAGS) + @echo + +loader: proto $(LOADER_OBJS) + $(CXX) $(LOADER_OBJS) -o $(BUILD_DIR)/loader $(CXXFLAGS) $(LDFLAGS) + @echo + +test: proto $(GTEST_LIB) $(TEST_OBJS) $(TEST_CUDA_OBJS) $(SINGA_OBJS) $(SINGA_CUDA_OBJS) + $(CXX) $(TEST_OBJS) $(TEST_CUDA_OBJS) include/gtest/gtest_main.cc $(GTEST_LIB) \ + $(SINGA_OBJS) $(SINGA_CUDA_OBJS) -o $(BUILD_DIR)/test $(CXXFLAGS) $(LDFLAGS) + @echo + +$(GTEST_LIB): $(GTEST_HDR) $(GTEST_SRC) + $(CXX) $(GTEST_SRC) -c -o $(BUILD_DIR)/gtest-all.o $(CXXFLAGS) + ar -rv $(GTEST_LIB) $(BUILD_DIR)/gtest-all.o + +# compile all files +$(OBJS):$(BUILD_DIR)/%.o : %.cc + @mkdir -p $(dir $@) + $(CXX) $< $(CXXFLAGS) -MMD -c -o $@ + cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \ + sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ + -e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \ + rm -f $*.d + +$(CUOBJS):$(BUILD_DIR)/%.o : %.cu + @mkdir -p $(dir $@) + $(CUCXX) $< -c -o $@ $(CUCXXFLAGS) + cp $(BUILD_DIR)/$*.d $(BUILD_DIR)/$*.P; \ + sed -e 's/#.*//' -e 's/^[^:]*: *//' -e 's/ *\\$$//' \ + -e '/^$$/ d' -e 's/$$/ :/' < $(BUILD_DIR)/$*.d >> $(BUILD_DIR)/$*.P; \ + rm -f $*.d + +proto: $(PROTO_OBJS) + +$(PROTO_SRCS): $(PROTOS) + protoc --proto_path=src/proto --cpp_out=src/proto $(PROTOS) + mkdir -p include/proto/ + cp src/proto/*.pb.h include/proto/ + mkdir -p tool/pb2/ + touch tool/pb2/__init__.py + protoc --proto_path=src/proto --python_out=tool/pb2/ $(PROTOS) + @echo + +clean: + rm -rf *.a *.so + rm -rf include/proto/* + rm -rf src/proto/*.pb.h src/proto/*.pb.cc + rm -rf tool/pb2/* + rm -rf $(BUILD_DIR) + @echo http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/include/singa/blob/math_kernel.h ---------------------------------------------------------------------- diff --git a/include/singa/blob/math_kernel.h b/include/singa/blob/math_kernel.h new file mode 100644 index 0000000..9aaf4c2 --- /dev/null +++ b/include/singa/blob/math_kernel.h @@ -0,0 +1,14 @@ +#ifndef MATH_KERNEL_H +#define MATH_KERNEL_H + +namespace singa{ + +extern "C" { + void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride); + + void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float *des_mat_data, long rows, long cols, long stride); +}; + +} + +#endif http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_addr.cc ---------------------------------------------------------------------- diff --git a/src/blob/math_addr.cc b/src/blob/math_addr.cc index f28fdcb..799a749 100644 --- a/src/blob/math_addr.cc +++ b/src/blob/math_addr.cc @@ -3,6 +3,9 @@ extern "C" #include <cblas.h> } +#include <cuda_runtime.h> +#include "cublas_v2.h" + #include "singa/blob/math_addr.h" #include "singa/blob/singa_op.h" @@ -47,5 +50,66 @@ float cpu_dot(const float * A, const float * B, const int n) return sum; } +//Trick: swap A and B +// +void gpu_gemm(const float * A, const float * B, const int m, const int n, const int k, const float alpha, const float beta, const bool TranA, const bool TranB, float * C) +{ + int lda = TranA ? m : k; + int ldb = TranB ? k : n; + int ldc = n; + + cublasOperation_t tA= (TranA==false) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t tB= (TranB==false) ? CUBLAS_OP_N : CUBLAS_OP_T; + + cublasHandle_t handle; + cublasCreate(&handle); + + cublasSgemm(handle, tB, tA, n, m, k, &alpha, B, ldb, A, lda, &beta, C, ldc); + + cublasDestroy(handle); +} + +void gpu_gemv(const float * A, const float * B, const int m, const int n, const float alpha, const float beta, const bool TranA, float * C) +{ + int lda = n ; + cublasOperation_t tA= (TranA==true) ? CUBLAS_OP_N : CUBLAS_OP_T; + + cublasHandle_t handle; + cublasCreate(&handle); + + cublasSgemv(handle, tA , n , m ,&alpha , A , lda , B , 1 ,&beta , C , 1); + + cublasDestroy(handle); + +} + + +void gpu_axpy(const float * A, const int n, const float alpha, float * B) +{ + + cublasHandle_t handle; + cublasCreate(&handle); + + cublasSaxpy(handle,n,&alpha,A,1,B,1); + + cublasDestroy(handle); + +} + + +float gpu_dot(const float * A, const float * B, const int n) +{ + cublasHandle_t handle; + cublasCreate(&handle); + + float result=0.0; + + cublasSdot(handle,n,A,1,B,1,&result); + + cublasDestroy(handle); + + return result; + +} } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_blob.cc ---------------------------------------------------------------------- diff --git a/src/blob/math_blob.cc b/src/blob/math_blob.cc index ff81667..9421367 100644 --- a/src/blob/math_blob.cc +++ b/src/blob/math_blob.cc @@ -1,4 +1,5 @@ #include "singa/blob/math_blob.h" +#include "singa/blob/math_kernel.h" namespace singa { @@ -18,24 +19,26 @@ int get_size(const std::vector<int>& shape) void GEMM(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C, float alpha, float beta) { - if(xpu == cpu) + if(check_shape_mmm(A, B, *C)) { - if(check_shape_mmm(A, B, *C)) - { - int m = C->shape().at(0); + int m = C->shape().at(0); int n = C->shape().at(1); int k = A.isTranspose() ? A.shape().at(0) : A.shape().at(1); bool TranA = A.isTranspose(); bool TranB = B.isTranspose(); - cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_cpu_data()); - } - else{ - // report errors here - } + + if(xpu == cpu) + { + cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_cpu_data()); + } + if(xpu == gpu) + { + //gpu part + gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, k, alpha, beta, TranA, TranB, C->mutable_gpu_data()); + } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } } //C = alpha*A*B+beta*C, A, B and C are matrix @@ -50,22 +53,24 @@ void MMDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) { - if(xpu == cpu) + if(check_shape_mvv(A, B, *C)) { - if(check_shape_mvv(A, B, *C)) + int m = B.shape().at(0); + int n = C->shape().at(0); + bool TranA = A.isTranspose(); + + if(xpu == cpu) { - int m = B.shape().at(0); - int n = C->shape().at(0); - bool TranA = A.isTranspose(); cpu_gemv(A.cpu_data(), B.cpu_data(), m, n, 1, 0, TranA, C->mutable_cpu_data()); } - else{ - // report errors here + if(xpu == gpu) + { + //gpu part + gpu_gemv(A.gpu_data(), B.gpu_data(), m, n, 1, 0, TranA, C->mutable_gpu_data()); } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } } @@ -74,21 +79,23 @@ void MVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * C) { - if(xpu == cpu) + if(check_shape_vvm(A, B, *C)) { - if(check_shape_vvm(A, B, *C)) + int m = C->shape().at(0); + int n = C->shape().at(1); + + if(xpu == cpu) { - int m = C->shape().at(0); - int n = C->shape().at(1); cpu_gemm(A.cpu_data(), B.cpu_data(), m, n, 1, 1, 0, false, false, C->mutable_cpu_data()); } - else{ - // report errors here + if(xpu == gpu) + { + //gpu part + gpu_gemm(A.gpu_data(), B.gpu_data(), m, n, 1, 1, 0, false, false, C->mutable_gpu_data()); } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } } // C is matrix,A and B are vector @@ -97,20 +104,21 @@ void VVDot(XPU xpu, const Blob<float> & A, const Blob<float> & B, Blob<float> * float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B) { float res = 0; - if(xpu == cpu) + if(check_shape_equal(A, B, B)) { - if(check_shape_equal(A, B, B)) - { - int n = get_size(A.shape()); + int n = get_size(A.shape()); + if(xpu == cpu) + { res = cpu_dot(A.cpu_data(), B.cpu_data(), n); } - else{ - // report errors here + if(xpu == gpu) + { + //gpu part + res = gpu_dot(A.gpu_data(), B.gpu_data(), n); } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } return res; } @@ -118,19 +126,20 @@ float VVdot(XPU xpu, const Blob<float> & A, const Blob<float> & B) void AXPY(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha) { - if(xpu == cpu) + if(check_shape_equal(A, *B, *B)) { - if(check_shape_equal(A, *B, *B)) + + if(xpu == cpu) { cpu_axpy(A.cpu_data(), get_size(A.shape()), alpha, B->mutable_cpu_data()); } - else{ - // report errors here + if(xpu == gpu) + { + gpu_axpy(A.gpu_data(), get_size(A.shape()), alpha, B->mutable_gpu_data()); } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } } // element-wise operation: Bi = alpha*Ai+Bi A and B should have the same size @@ -143,47 +152,52 @@ inline void Repmat(XPU xpu, const Blob<float> & A, Blob<float> * B) void MVAdd(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta) { - if(xpu == cpu) + if(check_shape_mv(*B, A)) { - if(check_shape_mv(*B, A)) + int m = get_size(A.shape()); + int n = get_size(B->shape()) / m; + + if(xpu == cpu) { - int m = get_size(A.shape()); - int n = get_size(B->shape()) / m; const float * univ = cpu_uni_vec(n); cpu_gemm(A.cpu_data(), univ, m, n, 1, alpha, beta, false, false, B->mutable_cpu_data()); delete univ; } - else{ - // report errors here - } + + if(xpu == gpu) + { + singa_add_vec_row(B->gpu_data(),A.gpu_data(),A.gpu_data(),m,n,n); + //gpu part + } + } + else{ + // report errors here } - if(xpu == gpu) - { - //gpu part - } } // A is a vector, B is a matrix , Bij = alpha*Ai+beta*Bij // will use gemm. faster than general expand_f void MVSum(XPU xpu, const Blob<float> & A, Blob<float> * B, float alpha, float beta) { - if(xpu == cpu) + if(check_shape_mv(A, *B)) { - if(check_shape_mv(A, *B)) + int m = get_size(B->shape()); + int n = get_size(A.shape()) / m; + + if(xpu == cpu) { - int m = get_size(B->shape()); - int n = get_size(A.shape()) / m; const float * univ = cpu_uni_vec(n); cpu_gemm(A.cpu_data(), univ, m, 1, n, alpha, beta, false, false, B->mutable_cpu_data()); delete univ; } - else{ - // report errors here + if(xpu == gpu) + { + singa_sum_col(A.gpu_data(),B->gpu_data(),m,n,n); + //gpu part } } - if(xpu == gpu) - { - //gpu part + else{ + // report errors here } } // B is a vector, A is a matrix , Bi = \sigma_j_{alpha*Aij}+beta*Bi http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/blob/math_kernel.cu ---------------------------------------------------------------------- diff --git a/src/blob/math_kernel.cu b/src/blob/math_kernel.cu new file mode 100644 index 0000000..6b2a709 --- /dev/null +++ b/src/blob/math_kernel.cu @@ -0,0 +1,78 @@ +#include "singa/blob/math_kernel.h" + +#define CU2DBLOCK_X 32 +#define CU2DBLOCK_Y 32 + +#define CU1DBLOCK 1024 +#define CU1DBLOCKF 1024.0 + + +//Cuda Kernel Functions +__global__ +void kernel_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) +{ + int j = blockIdx.x; + int THREADS = blockDim.x; + if(j >= cols) { + return; + } + + __shared__ float aux[CU1DBLOCK]; + int steps = (rows - 1) / THREADS + 1; + aux[threadIdx.x] = src_mat_data[j+threadIdx.x*stride]; + for(int i=1; i<steps; ++i) { + if(threadIdx.x+i*THREADS < rows) { + aux[threadIdx.x] += src_mat_data[j+(threadIdx.x+i*THREADS)*stride]; + } + } + + int total_threads = THREADS; + __syncthreads(); + while(total_threads > 1) { + int half_point = ((1+total_threads) >> 1); + if (threadIdx.x < half_point) { + if(threadIdx.x+half_point < total_threads) { + aux[threadIdx.x] += aux[threadIdx.x + half_point]; + } + } + __syncthreads(); + total_threads = ((total_threads+1) >> 1); + } + + __syncthreads(); + dst_vec_data[j] = aux[0]; +} + +__global__ +void kernel_add_vec_row(float *src_vec_data, float *src_mat_data, float* des_mat_data,long rows, long cols, long stride) +{ + long i = blockIdx.x * blockDim.x + threadIdx.x; + long j = blockIdx.y * blockDim.y + threadIdx.y; + long num_threads_x = blockDim.x * gridDim.x; + long num_threads_y = blockDim.y * gridDim.y; + long index = 0; + for(; i<cols && j<rows; i+=num_threads_x, j+=num_threads_y) { + index = j * stride + i; + des_mat_data[index] = src_mat_data[index] + src_vec_data[i]; + } +} + +// +namespace singa{ + +void singa_sum_col(float *src_mat_data, float *dst_vec_data, long rows, long cols, long stride) +{ + long threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows; + long num_blocks = cols; + + kernel_sum_col<<<num_blocks, threads_per_block>>>(src_mat_data, dst_vec_data, rows, cols, stride); +} + +void singa_add_vec_row(float *src_vec_data, float *src_mat_data, float *des_mat_data ,long rows, long cols, long stride) +{ + dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y); + dim3 num_blocks(cols/threads_per_block.x + (cols%threads_per_block.x == 0 ? 0 : 1), rows/threads_per_block.y + (rows%threads_per_block.y == 0 ? 0 : 1)); + kernel_add_vec_row<<<num_blocks, threads_per_block>>>(src_vec_data, src_mat_data, des_mat_data,rows, cols, stride); +} + +}//namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c13e0370/src/test/test_math.cc ---------------------------------------------------------------------- diff --git a/src/test/test_math.cc b/src/test/test_math.cc new file mode 100644 index 0000000..3856d1d --- /dev/null +++ b/src/test/test_math.cc @@ -0,0 +1,385 @@ +#include "gtest/gtest.h" +#include "singa/blob/math_addr.h" +#include "singa/blob/math_blob.h" +#include "singa/blob/math_kernel.h" +#include "singa/blob/singa_op.h" + +#include <cuda_runtime.h> +#include "cublas_v2.h" + +using namespace singa; +using namespace std; + +TEST(MathTest, TestGemmCPU) { + float A[3][2] = {}; + float B[3][2] = {}; + float C[2][2] = {}; + for(int i = 0; i < 3; i++) + for(int j = 0; j < 2; j++) + { + A[i][j] = i+j; + B[i][j] = i+j - i*j; + } + cpu_gemm(A[0], B[0], 2, 2, 3 , 1, 0, true, false, C[0]); + float D[2][2] = {}; + for(int i = 0; i < 2; i++) + for(int j = 0; j < 2; j++) + { + D[i][j] = 0; + for(int k = 0; k < 3; k++) + D[i][j] += A[k][i]*B[k][j]; + } + for(int i = 0; i < 2; i++) + for(int j = 0; j < 2; j++) + { + ASSERT_EQ(C[i][j], D[i][j]); + } +} + +TEST(MathTest, TestGemvCPU) { + float A[4][3] = {}; + float B[4]= {}; + float C[3] = {}; + float D[3] = {}; + + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 3; j++) + { + A[j][i] = i-j + i*j; + } + } + + for(int i = 0; i < 4; i++)B[i] = i; + for(int i = 0; i < 3; i++)C[i] = 10; + cpu_gemv(A[0], B, 4, 3, 1, 1, true, C); + + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 4; j++) + { + D[i] += A[j][i]*B[j]; + } + } + for(int i = 0; i < 3; i++) + { + ASSERT_EQ(C[i], D[i]+10); + } +} + + +TEST(MathTest, TestAxpyCPU) { + float A[4][3] = {}; + float C[4][3] = {}; + float B[3][4] = {}; + float D[3][4] = {}; + + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 3; j++) + { + A[i][j] = i-j + i*j; + B[j][i] = i-j + i*j; + C[i][j] = A[i][j]; + D[j][i] = B[j][i]; + } + } + + cpu_axpy(A[0], 12, 2, B[0]); + for(int i = 0; i < 12; i++) + { + D[0][i] += 2*C[0][i]; + } + + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 4; j++) + { + ASSERT_EQ(B[i][j],D[i][j]); + } + } +} + + +TEST(MathTest, TestGemmGPU) { + float A[3][2] = {}; + float B[3][2] = {}; + float C[2][2] = {}; + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 2; j++) + { + A[i][j] = i+j; + B[i][j] = i+j - i*j; + } + } + + float* A_gpu=NULL; + float* B_gpu=NULL; + float* C_gpu=NULL; + + cudaMalloc((void**)&A_gpu, 3*2*sizeof(float)); + cudaMalloc((void**)&B_gpu, 3*2*sizeof(float)); + cudaMalloc((void**)&C_gpu, 2*2*sizeof(float)); + + cudaMemcpy(A_gpu,A,3*2*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(B_gpu,B,3*2*sizeof(float),cudaMemcpyHostToDevice); + + gpu_gemm(A_gpu, B_gpu, 2, 2, 3 , 1, 0, true, false, C_gpu); + + cudaMemcpy(C,C_gpu,2*2*sizeof(float),cudaMemcpyDeviceToHost); + + float D[2][2] = {}; + for(int i = 0; i < 2; i++) + { + for(int j = 0; j < 2; j++) + { + D[i][j] = 0; + for(int k = 0; k < 3; k++) + { + D[i][j] += A[k][i]*B[k][j]; + } + } + } + + for(int i = 0; i < 2; i++) + { + for(int j = 0; j < 2; j++) + { + ASSERT_EQ(C[i][j],D[i][j]); + } + } + + cudaFree(A_gpu); + cudaFree(B_gpu); + cudaFree(C_gpu); +} + + +TEST(MathTest, TestGemvGPU) { + float A[4][3] = {}; + float B[4]= {}; + float C[3] = {}; + float D[3] = {}; + + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 3; j++) + { + A[i][j] = i-j + i*j; + } + } + + for(int i = 0; i < 4; i++)B[i] = i; + for(int i = 0; i < 3; i++)C[i] = 10; + + float* A_gpu=NULL; + float* B_gpu=NULL; + float* C_gpu=NULL; + + cudaMalloc((void**)&A_gpu, 4*3*sizeof(float)); + cudaMalloc((void**)&B_gpu, 4*sizeof(float)); + cudaMalloc((void**)&C_gpu, 3*sizeof(float)); + + cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(C_gpu,C,3*sizeof(float),cudaMemcpyHostToDevice); + + gpu_gemv(A_gpu, B_gpu, 4, 3, 1, 1, true, C_gpu); + + cudaMemcpy(C,C_gpu,3*sizeof(float),cudaMemcpyDeviceToHost); + + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 4; j++) + { + D[i] += A[j][i]*B[j]; + } + } + + for(int i = 0; i < 3; i++) + { + ASSERT_EQ(C[i],D[i]+10); + } + + cudaFree(A_gpu); + cudaFree(B_gpu); + cudaFree(C_gpu); +} + + +TEST(MathTest, TestAxpyGPU) { + float A[4][3] = {}; + float C[4][3] = {}; + float B[3][4] = {}; + float D[3][4] = {}; + + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 3; j++) + { + A[i][j] = i-j + i*j; + B[j][i] = i-j + i*j; + C[i][j] = A[i][j]; + D[j][i] = B[j][i]; + } + } + + float* A_gpu=NULL; + float* B_gpu=NULL; + + cudaMalloc((void**)&A_gpu, 4*3*sizeof(float)); + cudaMalloc((void**)&B_gpu, 3*4*sizeof(float)); + + cudaMemcpy(A_gpu,A,4*3*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(B_gpu,B,3*4*sizeof(float),cudaMemcpyHostToDevice); + + gpu_axpy(A_gpu, 12, 2, B_gpu); + + cudaMemcpy(A,A_gpu,4*3*sizeof(float),cudaMemcpyDeviceToHost); + cudaMemcpy(B,B_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost); + + for(int i = 0; i < 12; i++)D[0][i] += 2*C[0][i]; + + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 4; j++) + { + ASSERT_EQ(B[i][j],D[i][j]); + } + } + + cudaFree(A_gpu); + cudaFree(B_gpu); +} + + +TEST(MathTest, TestDotGPU) { + float A[12]; + float B[12]; + + for(int i = 0; i < 12; i++) + { + A[i]=i-1; + B[i]=i+1; + } + + float* A_gpu=NULL; + float* B_gpu=NULL; + + cudaMalloc((void**)&A_gpu, 12*sizeof(float)); + cudaMalloc((void**)&B_gpu, 12*sizeof(float)); + + cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(B_gpu,B,12*sizeof(float),cudaMemcpyHostToDevice); + float gpu_ret=gpu_dot(A_gpu,B_gpu,12); + + float cpu_ret=0.0f; + for(int i = 0; i < 12; i++) + { + cpu_ret+=A[i]*B[i]; + } + + ASSERT_EQ(gpu_ret,cpu_ret); + + cudaFree(A_gpu); + cudaFree(B_gpu); + +} + +TEST(MathTest, TestSingaSumColGPU) { + + float A[3][4]; + float B[4]; + float C[4]; + + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 4; j++) + { + A[i][j]=i+j; + } + } + + for(int i = 0; i < 4; i++) + { + B[i]=0.0f; + C[i]=0.0f; + } + + float* A_gpu=NULL; + float* B_gpu=NULL; + + cudaMalloc((void**)&A_gpu, 12*sizeof(float)); + cudaMalloc((void**)&B_gpu, 4*sizeof(float)); + cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice); + + singa_sum_col(A_gpu,B_gpu,3,4,4); + + cudaMemcpy(B,B_gpu,4*sizeof(float),cudaMemcpyDeviceToHost); + + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 3; j++) + { + C[i]+=A[j][i]; + } + } + + for(int i = 0; i <4; i++) + { + ASSERT_EQ(B[i],C[i]); + } + + cudaFree(A_gpu); + cudaFree(B_gpu); +} + +TEST(MathTest, TestSingaAddVecRowGPU) { + + float A[3][4]; + float B[4]; + float C[3][4]; + float D[3][4]; + + for(int i = 0; i < 4; i++) + { + B[i]=i; + } + + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 4; j++) + { + A[i][j]=i+j; + D[i][j]=A[i][j]+B[j]; + } + } + + + float* A_gpu=NULL; + float* B_gpu=NULL; + float* C_gpu=NULL; + + cudaMalloc((void**)&A_gpu, 3*4*sizeof(float)); + cudaMalloc((void**)&B_gpu, 4*sizeof(float)); + cudaMalloc((void**)&C_gpu, 3*4*sizeof(float)); + cudaMemcpy(A_gpu,A,3*4*sizeof(float),cudaMemcpyHostToDevice); + cudaMemcpy(B_gpu,B,4*sizeof(float),cudaMemcpyHostToDevice); + + singa_add_vec_row(B_gpu,A_gpu,C_gpu,3,4,4); + + cudaMemcpy(C,C_gpu,3*4*sizeof(float),cudaMemcpyDeviceToHost); + + for(int i = 0; i < 3; i++) + { + for(int j = 0; j < 4; j++) + { + ASSERT_EQ(C[i][j],D[i][j]); + } + } + + cudaFree(A_gpu); + cudaFree(B_gpu); + cudaFree(C_gpu); +}
