SINGA-104 Add Context Class. Update the Context class: 1. rename functions and variables; 2. add random generators for CPU threads.
TODO run test for test_context.cu. Add implicit/automatic init (using device 0). Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/9aff30aa Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/9aff30aa Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/9aff30aa Branch: refs/heads/master Commit: 9aff30aab69f81e45d3986c0337346e0e9170936 Parents: 35de4f9 Author: Wei Wang <[email protected]> Authored: Thu Nov 26 11:53:10 2015 +0800 Committer: Wei Wang <[email protected]> Committed: Thu Nov 26 11:56:37 2015 +0800 ---------------------------------------------------------------------- Makefile.gpu | 10 +-- include/singa/utils/context.h | 140 +++++++++++++++++++++++++++---------- src/test/test_context.cc | 66 ----------------- src/test/test_context.cu | 55 +++++++++++++++ src/utils/context.cc | 82 +++++++++++----------- 5 files changed, 203 insertions(+), 150 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9aff30aa/Makefile.gpu ---------------------------------------------------------------------- diff --git a/Makefile.gpu b/Makefile.gpu index 2fea3b2..35b81b9 100644 --- a/Makefile.gpu +++ b/Makefile.gpu @@ -20,16 +20,16 @@ ###################User Config Varaibles ############################# # third-party library installation folder -HOME_DIR := /usr +HOME_DIR := /home/wangwei/local CUDA_DIR := /usr/local/cuda #CUDA_DIR := # Lib folder for system and external libs. You may need to change it. -LIBRARY_DIRS := $(HOME_DIR)/lib64 $(HOME_DIR)/lib $(HOME_DIR)/local/lib $(CUDA_DIR)/lib64 $(CUDA_DIR)/lib +LIBRARY_DIRS := $(CUDA_DIR)/lib64 $(CUDA_DIR)/lib $(HOME_DIR)/lib64 $(HOME_DIR)/lib # Header folder for system and external libs. You may need to change it. 
-INCLUDE_DIRS := $(HOME_DIR)/include ./include $(HOME_DIR)/local/include/zookeeper $(CUDA_DIR)/include +INCLUDE_DIRS := $(CUDA_DIR)/include $(HOME_DIR)/include ./include # g++ location, should support c++11, tested with 4.8.1 CXX := g++ CUCXX := nvcc @@ -50,7 +50,7 @@ ZK_FLAGS :=-DTHREADED -fpermissive CXXFLAGS := -O2 -msse3 -Wall -pthread -fPIC -std=c++11 -Wno-unknown-pragmas \ $(MSHADOW_FLAGS) -DCPU_ONLY=1 $(ZK_FLAGS)\ -funroll-loops $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir)) -CUCXXFLAGS := $(MSHADOW_FLAGS) -std=c++11 -G $(CUDA_ARCH) \ +CUCXXFLAGS := $(MSHADOW_FLAGS) -DUSE_GPU -std=c++11 -G $(CUDA_ARCH) \ $(foreach includedir, $(INCLUDE_DIRS), -I$(includedir)) #Add device compile option @@ -84,7 +84,7 @@ TEST_CUDA_SRCS :=$(shell find src/test/ -maxdepth 1 -name "*.cu") TEST_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(TEST_CUDA_SRCS:.cu=.o))) -include $(TEST_CUDA_OBJS:%.o=%.P) -SINGA_CUDA_SRCS :=$(shell find src/ -maxdepth 2 -name "*.cu") +SINGA_CUDA_SRCS := $(shell find src/ \( -path "src/test" \) -prune -o \( -name "*.cu" -type f \) -print ) SINGA_CUDA_OBJS := $(sort $(addprefix $(BUILD_DIR)/, $(SINGA_CUDA_SRCS:.cu=.o))) -include $(SINGA_CUDA_OBJS:%.o=%.P) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9aff30aa/include/singa/utils/context.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/context.h b/include/singa/utils/context.h index 762ae75..7a41dac 100644 --- a/include/singa/utils/context.h +++ b/include/singa/utils/context.h @@ -23,63 +23,129 @@ #define SINGA_UTILS_CONTEXT_H_ #include <vector> +#include <random> +#include <chrono> +#include <thread> +#include <unordered_map> +#include <glog/logging.h> + #ifdef USE_GPU -#include <cublas_v2.h> -#include <cuda_runtime.h> -#include <curand.h> +#include "singa/utils/cuda_utils.h" #endif namespace singa { -const int kDefaultDevice = 20; +// max num of threads per process +const int kNumMaxThreads = 1024; +/** + * Context is used 
as a global singleton, which stores the mapping from CPU + * thread id to GPU device id. It manages the handlers for GPU + * devices. It also manages the GPU and CPU random generators, which are created + * when accessed. One CPU thread has a CPU random generator. A CPU device + * has a GPU random generator. + */ class Context { - public: - + public: + /** + * Destructor, release random generators and handlers. + */ ~Context(); - - void Setup(); - -#ifdef USE_GPU - int DeviceID(const int index) { - return device_ids_[index]; - } - - void SetDeviceID(const int index, const int id) { - device_ids_[index] = id; - } - - void SetDevice(const int index) { - cudaSetDevice(device_ids_[index]); + /** + * Constructor, init arrays for random generators and handlers. + */ + Context(); + + /** + * @return the ID of the device attached to a given CPU thread: + * if the device is a GPU card, then returns the GPU device ID; + * Else return -1. + */ + int device_id(const std::thread::id tid) { + CHECK(device_id_.find(tid) != device_id_.end()); + return device_id_[tid]; } - cublasHandle_t Handle(const int index) { - return handles_[index]; + /** + * Setup the CPU thread, which may be assigned a GPU device. + * Set the random seed to -1. + * A GPU handler will be created for the GPU device. + * @param[in] thread::id CPU thread ID + * @param[in] device_id GPU device ID + */ + void SetupDevice(const std::thread::id tid, const int did); + + /** + * @copy SetupDevice(const int, const int); + * @param[in] seed random seed + */ + void SetupDevice(const std::thread::id tid, const int did, long long seed); + + /** + * Get the CPU random generator. + * If the generator does not exist, then create it now. + * If the seed is not set, i.e., seed=-1, then get a seed from system time. 
+ * @param[in] thread::id CPU thread ID + * @return the CPU random generator + */ + std::mt19937* rand_generator(const std::thread::id tid) { + if (rand_generator_.find(tid) == rand_generator_.end()) { + CHECK(seed_.find(tid) != seed_.end()); + auto seed = static_cast<unsigned>(seed_[tid]); + if (seed_[tid] == -1) + seed = std::chrono::system_clock::now().time_since_epoch().count(); + rand_generator_[tid] = new std::mt19937(seed); + } + return rand_generator_[tid]; + } +#ifdef USE_GPU + /** + * Get the handler of the GPU device attached to a CPU thread. + * @param[in] thread::id + * @return the GPU handler, or nullptr if this thread does not have any GPU. + */ + cublasHandle_t cublas_handle(const std::thread::id tid) { + CHECK(cublas_handle_.find(tid) != cublas_handle_.end()); + return cublas_handle_[tid]; } - - void CreateHandle(const int index); - - void DestoryHandle(const int index); - - curandGenerator_t GpuRandGenerator(const int index) { - return gpu_rand_generators_[index]; + /** + * Get the random generator of the GPU device assigned to the given thread. + * @param[in] thread::id + * @return random generator. If it does not exist, then create one. + * The random seed will be set to CURAND_RNG_PSEUDO_DEFAULT if it is not set. 
+ */ + curandGenerator_t curand_generator(const std::thread::id tid) { + if (curand_generator_.find(tid) == curand_generator_.end()) { + CHECK(seed_.find(tid) != seed_.end()); + auto seed = seed_[tid]; + // TODO handle user set seed + cudaSetDevice(device_id_[tid]); + curandCreateGenerator(&curand_generator_[tid], CURAND_RNG_PSEUDO_DEFAULT); + } + return curand_generator_[tid]; } - void CreateGpuRandGenerator(const int index); - - void DestoryGpuRandGenerator(const int index); + /* + protected: + void CreateHandle(const int thread::id); + void DestoryHandle(const int thread::id); + void CreateGpuRandGenerator(const int thread::id); + void DestoryGpuRandGenerator(const int thread::id); + */ #endif - protected: - std::vector<int> device_ids_; + protected: + + std::unordered_map<std::thread::id, int> device_id_; + std::unordered_map<std::thread::id, std::mt19937 *> rand_generator_; + std::unordered_map<std::thread::id, int> seed_; #ifdef USE_GPU - std::vector<cublasHandle_t> handles_; - std::vector<curandGenerator_t> gpu_rand_generators_; + std::unordered_map<std::thread::id, cublasHandle_t> cublas_handle_; + std::unordered_map<std::thread::id, curandGenerator_t> curand_generator_; #endif - }; } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9aff30aa/src/test/test_context.cc ---------------------------------------------------------------------- diff --git a/src/test/test_context.cc b/src/test/test_context.cc deleted file mode 100644 index 3a23b23..0000000 --- a/src/test/test_context.cc +++ /dev/null @@ -1,66 +0,0 @@ -#include "gtest/gtest.h" -#include "singa/utils/singleton.h" -#include "singa/utils/context.h" - -//#include <cuda_runtime.h> -//#include "cublas_v2.h" - -using namespace singa; -using namespace std; - -TEST(ContextTest, TestDevice) { - auto context = Singleton<Context>::Instance(); - context->Setup(); - - int index = 4; - int device_id = context->DeviceID(index); - ASSERT_EQ(4,device_id); - - context->SetDeviceID(index,6); 
- device_id = context->DeviceID(index); - ASSERT_EQ(6,device_id); -} - -TEST(ContextTest, TestHandle) { - auto context = Singleton<Context>::Instance(); - context->Setup(); - - int index = 2; - context->CreateHandle(index); - - float cpu_ret = 0.0f; - float gpu_ret = 0.0f; - - float A[12]; - float B[12]; - - for(int i = 0; i < 12; i++) { - A[i]=i-1; - B[i]=i+1; - } - - float* A_gpu = NULL; - float* B_gpu = NULL; - - cudaMalloc((void**)&A_gpu, 12*sizeof(float)); - cudaMalloc((void**)&B_gpu, 12*sizeof(float)); - - cudaMemcpy(A_gpu,A,12*sizeof(float),cudaMemcpyHostToDevice); - cudaMemcpy(B_gpu,B,12*sizeof(float),cudaMemcpyHostToDevice); - - cublasHandle_t handle = context->Handle(index); - /*cublasHandle_t handle; - cudaSetDevice(0); - cublasCreate(&handle);*/ - - cublasSdot(handle, 12, A_gpu, 1, B_gpu, 1, &gpu_ret); - - for(int i = 0; i < 12;++i) { - cpu_ret += A[i] * B[i]; - } - - ASSERT_EQ(gpu_ret,cpu_ret); - - cudaFree(A_gpu); - cudaFree(B_gpu); -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9aff30aa/src/test/test_context.cu ---------------------------------------------------------------------- diff --git a/src/test/test_context.cu b/src/test/test_context.cu new file mode 100644 index 0000000..88ab06b --- /dev/null +++ b/src/test/test_context.cu @@ -0,0 +1,55 @@ +#include <thread> +#include "gtest/gtest.h" +#include "singa/utils/singleton.h" +#include "singa/utils/context.h" +#include "singa/utils/cuda_utils.h" + +using namespace singa; +using namespace std; + +TEST(ContextTest, TestDevice) { + auto context = Singleton<Context>::Instance(); + + auto id = std::this_thread::get_id(); + context->SetupDevice(id, 0); + auto device_id = context->device_id(id); + ASSERT_EQ(1,device_id); +} + +TEST(ContextTest, TestHandle) { + auto context = Singleton<Context>::Instance(); + + float cpu_ret = 0.0f; + float gpu_ret = 0.0f; + + float A[12]; + float B[12]; + + for(int i = 0; i < 12; i++) { + A[i]=i-1; + B[i]=i+1; + } + + float* A_gpu = NULL; + float* B_gpu = 
NULL; + context->SetupDevice(std::this_thread::get_id(), 0); + + cudaMalloc((void**)&A_gpu, 12 * sizeof(float)); + cudaMalloc((void**)&B_gpu, 12 * sizeof(float)); + + cudaMemcpy(A_gpu, A, 12 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(B_gpu, B, 12 * sizeof(float), cudaMemcpyHostToDevice); + + cublasHandle_t handle = context->cublas_handle(std::this_thread::get_id()); + + cublasSdot(handle, 12, A_gpu, 1, B_gpu, 1, &gpu_ret); + + for(int i = 0; i < 12;++i) { + cpu_ret += A[i] * B[i]; + } + + ASSERT_EQ(gpu_ret,cpu_ret); + + cudaFree(A_gpu); + cudaFree(B_gpu); +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/9aff30aa/src/utils/context.cc ---------------------------------------------------------------------- diff --git a/src/utils/context.cc b/src/utils/context.cc index 671bec0..37c8f39 100644 --- a/src/utils/context.cc +++ b/src/utils/context.cc @@ -23,66 +23,64 @@ #include "singa/utils/singleton.h" namespace singa { - + Context::~Context() { #ifdef USE_GPU - for(int i = 0; i < kDefaultDevice; ++i) { - SetDevice(i); - - if(handles_[i] != NULL) { - cublasDestroy(handles_[i]); - } - - if(gpu_rand_generators_[i] != NULL) { - curandDestroyGenerator(gpu_rand_generators_[i]); - } + for (auto& entry ： device_id_) { + if (entry.second != -1) { + cudaSetDevice(entry.second); + if (cublas_handle_[entry.first] != nullptr) { + cublasDestroy(cublas_handle_[entry.first]); + cublas_handle_[entry.first] = nullptr; + } + if(curand_generator_[entry.first] != nullptr) { + curandDestroyGenerator(curand_generator_[entry.first]); + curand_generator_[entry.first] = nullptr; + } + } } #endif + for (auto& entry : rand_generator_) { + if (entry.second != nullptr) { + delete entry.second; + entry.second = nullptr; + } + } } -void Context::Setup() { +Context::Context() { } - for(int i = 0; i < kDefaultDevice; ++i) { - //init device index - device_ids_.push_back(i); - } +void Context::SetupDevice(const std::thread::id thread, const int did) { + SetupDevice(thread, did, 
-1); +} +void Context::SetupDevice(const std::thread::id thread, const int did, + long long seed) { + device_id_[thread] = did; #ifdef USE_GPU - for(int i = 0; i < kDefaultDevice; ++i) { - //init handle - cublasHandle_t handle = NULL; - handles_.push_back(handle); - - curandGenerator_t gpu_rand_generator = NULL; - gpu_rand_generators_.push_back(gpu_rand_generator); + if (did > -1) { + cudaSetDevice(did); + cublasCreate(&handle_[thread]); } #endif + seed_[thread] = seed; } +/* #ifdef USE_GPU -void Context::CreateHandle(const int index) { - SetDevice(device_ids_[index]); - cublasCreate(&handles_[index]); -} - -void Context::DestoryHandle(const int index) { - SetDevice(device_ids_[index]); - cublasDestroy(handles_[index]); - handles_[index] = NULL; +void Context::DestoryHandle(const int thread::id) { + cudaSetDevice(device_id_[thread::id]); + cublasDestroy(handle_[thread::id]); + handle_[thread::id] = nullptr; } -void Context::CreateGpuRandGenerator(const int index) { - SetDevice(device_ids_[index]); - curandCreateGenerator(&gpu_rand_generators_[index], CURAND_RNG_PSEUDO_DEFAULT); +void Context::DestoryGpuRandGenerator(const int thread::id) { + cudaSetDevice(device_id_[thread::id]); + curandDestroyGenerator(curand_generator_[thread::id]); + curand_generator_[thread::id] = nullptr; } - -void Context::DestoryGpuRandGenerator(const int index) { - SetDevice(device_ids_[index]); - curandDestroyGenerator(gpu_rand_generators_[index]); - gpu_rand_generators_[index] = NULL; -} - #endif +*/ } // namespace singa
