Repository: incubator-singa Updated Branches: refs/heads/dev 28678ae83 -> dfc422e5b
SINGA-218 Implementation for RNN CUDNN version

Finish the CudnnRNN layer. Pass the test for the tanh RNN.

RNN forward accepts a vector of input tensors: <x0, x1, ... x(n-1), hx, cx>. x(i) is the i-th input tensor; hx is the initial hidden tensor, which could be a dummy tensor. A dummy tensor is a tensor created without shape/device/data_type; during computation, cudnnRNN uses 0s for such a tensor. cx is not necessary for relu/tanh/gru RNNs. For lstm, it could also be a dummy tensor like hx. The output is <y0, y1, ... y(n-1), hy, cy>; relu/tanh/gru RNNs do not have cy, while lstm has both hy and cy.

RNN backward accepts a vector of input gradient tensors: <dy0, dy1, ... dy(n-1), dhy, dcy>. dhy is necessary for all RNNs, but it could be a dummy tensor, in which case a tensor of 0s is used for dhy during computation. dcy is used only for lstm and could also be a dummy tensor. The output is <<dx0, dx1, ... dx(n-1), dhx, dcx>, <dw>>, where dhx is the gradient tensor of hx; dcx is used only for lstm.

The CudnnRNN layer must be moved onto a CUDA device; otherwise a memory error occurs, because the weight would remain on the CPU. (A minimal usage sketch is appended after the diff below.)

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/8e0b1083
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/8e0b1083
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/8e0b1083

Branch: refs/heads/dev
Commit: 8e0b1083992f471849bb80b0a8e869767ee9edc0
Parents: c51f944
Author: Wei Wang <[email protected]>
Authored: Fri Aug 5 16:43:40 2016 +0800
Committer: Wei Wang <[email protected]>
Committed: Wed Aug 10 00:43:11 2016 +0800

----------------------------------------------------------------------
 CMakeLists.txt                     | 2 +-
 cmake/Thirdparty/FindCUDNN.cmake   | 2 +-
 include/singa/core/common.h        | 15 +-
 include/singa/core/device.h        | 9 +-
 include/singa/model/layer.h        | 9 -
 include/singa/utils/context.h      | 291 ---------------
 src/CMakeLists.txt                 | 1 +
 src/core/tensor/tensor.cc          | 22 +-
 src/core/tensor/tensor_math.h      | 2 +-
 src/core/tensor/tensor_math_cpp.h  | 2 +-
 src/core/tensor/tensor_math_cuda.h | 2 +-
 src/model/layer/cudnn_rnn.cc       | 610 +++++++++++++++++++-------------
 src/model/layer/cudnn_rnn.h        | 44 +--
 src/model/layer/rnn.cc             | 59 ++-
 src/model/layer/rnn.h              | 31 +-
 src/model/optimizer/adagrad.cc     | 4 +-
 src/model/optimizer/nesterov.cc    | 4 +-
 src/model/optimizer/rmsprop.cc     | 1 +
 src/model/optimizer/sgd.cc         | 4 +-
 src/proto/model.proto              | 18 +-
 test/singa/test_cudnn_rnn.cc       | 273 +++++++-------
 21 files changed, 625 insertions(+), 780 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index 23f8ef6..38014ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ CMAKE_MINIMUM_REQUIRED(VERSION 2.6) PROJECT(singa) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -g") LIST(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Thirdparty) #message(STATUS "module path: ${CMAKE_MODULE_PATH}") http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/cmake/Thirdparty/FindCUDNN.cmake ---------------------------------------------------------------------- diff --git a/cmake/Thirdparty/FindCUDNN.cmake b/cmake/Thirdparty/FindCUDNN.cmake index eefab9d..cefc4fe 100644 --- a/cmake/Thirdparty/FindCUDNN.cmake +++ 
b/cmake/Thirdparty/FindCUDNN.cmake @@ -27,7 +27,7 @@ IF(CUDNN_FOUND) ELSE() SET(CUDNN_VERSION "${CUDNN_VERSION_MAJOR}.${CUDNN_VERSION_MINOR}.${CUDNN_VERSION_PATCH}") ENDIF() - MESSAGE(STATUS "Found Cudnn_v${CUDNN_VERSION} at ${CUDNN_INCLUDE_DIR}") + MESSAGE(STATUS "Found Cudnn_v${CUDNN_VERSION} at ${CUDNN_INCLUDE_DIR} ${CUDNN_LIBRARIES}") MARK_AS_ADVANCED(CUDNN_INCLUDE_DIR CUDNN_LIBRARIES) ENDIF() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/include/singa/core/common.h ---------------------------------------------------------------------- diff --git a/include/singa/core/common.h b/include/singa/core/common.h index caa7c67..53a9726 100644 --- a/include/singa/core/common.h +++ b/include/singa/core/common.h @@ -65,8 +65,14 @@ class Block { // Disabled as it is not used currently. // Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>> // ref) : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {} - void* mutable_data() const { return static_cast<char*>(data_) + offset_; } - const void* data() const { return static_cast<char*>(data_) + offset_; } + void* mutable_data() { + initialized_ = true; + return static_cast<char*>(data_) + offset_; + } + const void* data() const { + CHECK(initialized_) << "Must initialize data before reading it"; + return static_cast<char*>(data_) + offset_; + } size_t size() const { return size_; } size_t offset() const { return offset_; } int IncRefCount() { @@ -77,11 +83,16 @@ class Block { } int ref_count() const { return ref_count_.load(); } + bool initialized() const { + return initialized_; + } + private: Block() {} void* data_ = nullptr; size_t size_ = 0; size_t offset_ = 0; + bool initialized_ = false; // Disabled as it is not used currently. // std::shared_ptr<std::atomic<int>> ref_count_ = nullptr; std::atomic<int> ref_count_; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/include/singa/core/device.h ---------------------------------------------------------------------- diff --git a/include/singa/core/device.h b/include/singa/core/device.h index cd9a811..778a130 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -100,7 +100,7 @@ class Device { return lang_; } - std::shared_ptr<Device> host() const { return host_;} + virtual std::shared_ptr<Device> host() const { return host_;} Context* context(int k) { return &ctx_; @@ -140,6 +140,9 @@ class Device { Context ctx_; }; +/// a singleton CppDevice as the host for all devices. +extern std::shared_ptr<Device> defaultDevice; + /// Represent a CPU device which may have multiple threads/executors. /// It runs cpp code. class CppCPU : public Device { @@ -147,6 +150,7 @@ class CppCPU : public Device { ~CppCPU() {}; CppCPU(); + std::shared_ptr<Device> host() const override { return defaultDevice;} void SetRandSeed(unsigned seed) override; protected: void DoExec(function<void(Context*)>&& fn, int executor) override; @@ -161,9 +165,6 @@ class CppCPU : public Device { void Free(void* ptr) override; }; -/// a singleton CppDevice as the host for all devices. -extern std::shared_ptr<Device> defaultDevice; - // Implement Device using OpenCL libs. 
// class OpenclDevice : public Device { }; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/include/singa/model/layer.h ---------------------------------------------------------------------- diff --git a/include/singa/model/layer.h b/include/singa/model/layer.h index c35f9b8..d31bd95 100644 --- a/include/singa/model/layer.h +++ b/include/singa/model/layer.h @@ -158,12 +158,10 @@ class Layer { /// Move the layer (including its parameters and other internal Tensor) onto /// the given device virtual void ToDevice(std::shared_ptr<Device> device) { - //for (auto p : param_values_) p->ToDevice(device); } /// Set the data type of Tensor in this layer. virtual void AsType(DataType dtype) { - //for (auto p : param_values_) p->AsType(dtype); } /// Serialize the layer info (including params) into a LayerConf proto message @@ -202,12 +200,6 @@ class Layer { return vector<Tensor>{}; } - /// Return a pointer to the 'i'-th parameter Tensor. - Tensor param_value(size_t i) { - CHECK_LT(i, param_values_.size()); - return param_values().at(i); - } - /// Return names of all parmaeters. const vector<string> param_names() { vector<string> pname; @@ -227,7 +219,6 @@ class Layer { protected: std::string name_; - vector<Tensor*> param_values_; vector<ParamSpec> param_specs_; }; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/include/singa/utils/context.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/context.h b/include/singa/utils/context.h deleted file mode 100644 index 6e897e8..0000000 --- a/include/singa/utils/context.h +++ /dev/null @@ -1,291 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -#ifndef SINGA_UTILS_CONTEXT_H_ -#define SINGA_UTILS_CONTEXT_H_ - -#include <chrono> -#include <random> -#include <thread> -#include <unordered_map> -#include <vector> - -#include "singa/utils/logging.h" - -#ifdef USE_GPU -#include <cublas_v2.h> -#include <cuda.h> -#include <cuda_runtime.h> -#include <curand.h> -// CUDA: various checks for different function calls. -#define CUDA_CHECK(condition) \ -/* Code block avoids redefinition of cudaError_t error */ \ -do { \ -cudaError_t error = condition; \ -CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ -} while (0) - -#ifdef USE_CUDNN -#include <cudnn.h> -#endif - -#endif // USE_GPU - -namespace singa { - -/** - * Context is used as a global singleton, which stores the mapping from CPU - * thread id to GPU device id. If a thread has no GPU, then its associated - * device id is -1. It manages (e.g., creating) the handlers for GPU - * devices. 
It also manages the GPU and CPU random generators, which are created - * when accessed. One CPU thread has a CPU random generator. A GPU device - * has a GPU random generator, which is accessible after assigning the GPU - * device with a CPU thread via SetupDevice. - */ -class Context { - public: - /** - * Destructor, release random generators and handlers. - */ - ~Context() { -#ifdef USE_GPU - for (auto& entry : device_id_) { - if (entry.second != -1) { - cudaSetDevice(entry.second); - if (cublas_handle_[entry.second] != nullptr) { - cublasDestroy(cublas_handle_[entry.second]); - cublas_handle_[entry.second] = nullptr; - } - if (curand_generator_[entry.second] != nullptr) { - curandDestroyGenerator(curand_generator_[entry.second]); - curand_generator_[entry.second] = nullptr; - } - } - } -#ifdef USE_CUDNN - for (auto& handle : cudnn_handle_) { - if (handle != nullptr) - CHECK_EQ(cudnnDestroy(handle), CUDNN_STATUS_SUCCESS); - handle = nullptr; - } -#endif -#endif - for (auto& entry : rand_generator_) { - if (entry.second != nullptr) { - delete entry.second; - entry.second = nullptr; - } - } - } - /** - * Constructor, init handlers and GPU rand generators to nullptr. - */ - Context() { - for (int i = 0; i < kMaxNumGPU; i++) { -#ifdef USE_GPU - cublas_handle_.push_back(nullptr); - curand_generator_.push_back(nullptr); -#ifdef USE_CUDNN - cudnn_handle_.push_back(nullptr); -#endif -#endif - } - } - - /** - * @return the device ID of the current thread. - */ - int device_id() { - return device_id(std::this_thread::get_id()); - } - /** - * @return the ID of the device attached to a given CPU thread, or -1 if this - * thread has not been attached GPU device. - */ - int device_id(const std::thread::id& tid) { - if (device_id_.find(tid) != device_id_.end()) - return device_id_[tid]; - else - return -2; - } - /** - * Setup the CPU thread, which may be assigned a GPU device. - * If there is no GPU device, then set did to -1. - * Set the random seed to -1. - * @param[in] thread::id CPU thread ID - * @param[in] device_id GPU device ID - */ - void SetupDevice(const std::thread::id& tid, const int did) { - SetupDevice(tid, did, -1); - } - /** - * @copy SetupDevice(const int, const int); - * @param[in] seed random seed - */ - void SetupDevice(const std::thread::id& tid, const int did, const int seed) { - device_id_[tid] = did; - seed_[tid] = seed; - } - - /** - * Activate the GPU device by calling cudaSetDevice. - */ - void ActivateDevice(const int device_id) { - CHECK_GE(device_id, 0); -#ifdef USE_GPU - cudaSetDevice(device_id); -#endif - } - - /** - * \copybreif rand_generator(const std::thread::id&); - * @return the CPU random generator for the calling thread. - */ - std::mt19937* rand_generator() { - return rand_generator(std::this_thread::get_id()); - } - /** - * Get the CPU random generator. - * If the generator does not exist, then create it now. - * If the seed is not set, i.e., seed=-1, then get a seed from system time. 
- * @param[in] thread::id CPU thread ID - * @return the CPU random generator - */ - std::mt19937* rand_generator(const std::thread::id& tid) { - if (rand_generator_.find(tid) == rand_generator_.end()) { - // CHECK(seed_.find(tid) != seed_.end()); - auto seed = static_cast<unsigned>(seed_[tid]); - if (seed_.find(tid) == seed_.end() || seed_.at(tid) == -1) - seed = std::chrono::system_clock::now().time_since_epoch().count(); - rand_generator_[tid] = new std::mt19937(seed); - } - return rand_generator_[tid]; - } -#ifdef USE_GPU - /** - * \copybreif cublas_handle_(const std::thread::id&); - * @return cublas handle for the calling thread. - */ - cublasHandle_t cublas_handle() { - return cublas_handle(std::this_thread::get_id()); - } - /** - * Get the handler of the GPU which is assigned to the given thread. - * Calls cublas_handle(const int); - */ - cublasHandle_t cublas_handle(const std::thread::id thread_id) { - return cublas_handle(device_id(thread_id)); - } - /** - * Get the handler of the GPU device given its device ID. The device - * must be set up via SetupDevice(const std::thread::id, const int) before - * calling this function. - * @param[in] device_id GPU device ID - * @return the GPU handler - */ - cublasHandle_t cublas_handle(const int device_id) { - CHECK_GE(device_id, 0); - if (cublas_handle_.at(device_id) == nullptr) { - cudaSetDevice(device_id); - cublasCreate(&cublas_handle_[device_id]); - } - return cublas_handle_[device_id]; - } - /** - * Get the rand generator of the GPU device assigned to the given thread. - */ - curandGenerator_t curand_generator(const std::thread::id thread_id) { - return curand_generator(device_id(thread_id)); - } - /** - * Get the random generator of the GPU device given the device id. - * @param[in] device_id GPU device ID - * @return random generator. If it does not exist, then create one. - * The random seed will be set to CURAND_RNG_PSEUDO_DEFAULT if it is not set. 
- */ - curandGenerator_t curand_generator(const int device_id) { - CHECK_GE(device_id, 0); - CHECK_LT(device_id, cudnn_handle_.size()); - if (curand_generator_.at(device_id) == nullptr) { - // TODO(wangwei) handle user set seed - /* - CHECK(seed_.find(tid) != seed_.end()); - auto seed = seed_[tid]; - */ - ActivateDevice(device_id); - curandCreateGenerator(&curand_generator_[device_id], - CURAND_RNG_PSEUDO_DEFAULT); - } - return curand_generator_[device_id]; - } - -#ifdef USE_CUDNN - cudnnHandle_t cudnn_handle() { - return cudnn_handle(std::this_thread::get_id()); - } - - cudnnHandle_t cudnn_handle(const std::thread::id thread_id) { - return cudnn_handle(device_id(thread_id)); - } - - cudnnHandle_t cudnn_handle(const int device_id) { - CHECK_GE(device_id, 0); - CHECK_LT(device_id, cudnn_handle_.size()); - } -#endif // USE_CUDNN - - protected: - //!< max num of GPUs per process - const int kMaxNumGPU = 64; - //!< map from thread id to device id - std::unordered_map<std::thread::id, int> device_id_; - //!< map from thread id to cpu rand generator - std::unordered_map<std::thread::id, std::mt19937 *> rand_generator_; - //!< map from thread id to cpu rand generator seed - std::unordered_map<std::thread::id, int> seed_; -#ifdef USE_GPU - //!< cublas handler indexed by GPU device ID - std::vector<cublasHandle_t> cublas_handle_; - //!< cublas rand generator indexed by GPU device ID - std::vector<curandGenerator_t> curand_generator_; - -#ifdef USE_CUDNN - std::vector<cudnnHandle_t> cudnn_handle_; -#endif -#endif // USE_GPU -}; - -} // namespace singa - -#endif // SINGA_UTILS_CONTEXT_H_ - if (cudnn_handle_.at(device_id) == nullptr) { - ActivateDevice(device_id); - // LOG(ERROR) << "create cudnn handle for device " << device_id; - CHECK_EQ(cudnnCreate(&cudnn_handle_[device_id]), CUDNN_STATUS_SUCCESS); - } - // LOG(ERROR) << "use cudnn handle from device " << device_id; - return cudnn_handle_[device_id]; - } -#endif - -#endif // USE_GPU - -#ifdef USE_OPENCL http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 65a81fc..38e6aa3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -64,6 +64,7 @@ AUX_SOURCE_DIRECTORY(model/metric model_source) AUX_SOURCE_DIRECTORY(model/updater model_source) #MESSAGE(STATUS "MODEL ${model_source}") ADD_LIBRARY(singa_model SHARED ${model_source}) +MESSAGE(STATUS "model linker libs ${SINGA_LINKER_LIBS}") TARGET_LINK_LIBRARIES(singa_model ${SINGA_LINKER_LIBS}) LIST(APPEND SINGA_LINKER_LIBS singa_model) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/core/tensor/tensor.cc ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc index c16bd29..bd3bc70 100644 --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@ -35,21 +35,29 @@ Tensor::Tensor() { device_ = defaultDevice; } Tensor::Tensor(const Shape &shape, DataType dtype) : data_type_(dtype), device_(defaultDevice), shape_(shape) { device_ = defaultDevice; - block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_)); + size_t size = Product(shape_) * SizeOf(data_type_); + if (size) + block_ = device_->NewBlock(size); } Tensor::Tensor(Shape &&shape, DataType dtype) : data_type_(dtype), device_(defaultDevice), shape_(shape) { device_ = defaultDevice; - block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_)); + size_t size = 
Product(shape_) * SizeOf(data_type_); + if (size) + block_ = device_->NewBlock(size); } Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, DataType dtype) : data_type_(dtype), device_(device), shape_(shape) { - block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_)); + size_t size = Product(shape_) * SizeOf(data_type_); + if (size) + block_ = device_->NewBlock(size); } Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype) : data_type_(dtype), device_(device), shape_(shape) { - block_ = device_->NewBlock(Product(shape_) * SizeOf(data_type_)); + size_t size = Product(shape_) * SizeOf(data_type_); + if (size) + block_ = device_->NewBlock(size); } Tensor::Tensor(const Tensor &in) : transpose_(in.transpose_), @@ -57,7 +65,8 @@ Tensor::Tensor(const Tensor &in) device_(in.device_), block_(in.block()), shape_(in.shape_) { - block_->IncRefCount(); + if (block_ != nullptr) + block_->IncRefCount(); } Tensor::Tensor(Tensor &&in) @@ -118,7 +127,8 @@ void Tensor::ToDevice(std::shared_ptr<Device> dst) { // TODO(wangwei) the comparison is very strict. May compare against device ID? if (device_ != dst) { Tensor tmp(shape_, dst, data_type_); - if (block_ != nullptr && Size()) tmp.CopyData(*this); + if (block_ != nullptr && Size() && block_->initialized()) + tmp.CopyData(*this); if (block_ != nullptr && block_->DecRefCount() == 0) device_->FreeBlock(block_); block_ = tmp.block_; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/core/tensor/tensor_math.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h index 7732dd2..1914ca6 100644 --- a/src/core/tensor/tensor_math.h +++ b/src/core/tensor/tensor_math.h @@ -341,7 +341,7 @@ void SoftmaxCrossEntropyBwd(const size_t batchsize, const size_t dim, template <typename DType, typename Lang> void RowMax(const size_t nrow, const size_t ncol, const Block *in, - const Block *ret, Context* ctx) { + Block *ret, Context* ctx) { LOG(FATAL) << "Not Implemented"; } // ************************************** http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/core/tensor/tensor_math_cpp.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h index 3e0c8ad..941931d 100644 --- a/src/core/tensor/tensor_math_cpp.h +++ b/src/core/tensor/tensor_math_cpp.h @@ -551,7 +551,7 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(const size_t batchsize, template <> void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol, - const Block *in, const Block *out, Context *ctx) { + const Block *in, Block *out, Context *ctx) { const float *inPtr = static_cast<const float *>(in->data()); float *outPtr = static_cast<float *>(out->mutable_data()); for (size_t r = 0; r < nrow; r++) { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/core/tensor/tensor_math_cuda.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h index 43bfa1b..8b6e939 100644 --- a/src/core/tensor/tensor_math_cuda.h +++ b/src/core/tensor/tensor_math_cuda.h @@ -424,7 +424,7 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(const size_t batchsize, template <> void RowMax<float, lang::Cuda>(const size_t nrow, const size_t ncol, - const Block* in, const Block* out, + const Block* in, Block* out, Context* ctx) { const float* 
inPtr = static_cast<const float*>(in->data()); float* outPtr = static_cast<float*>(out->mutable_data()); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/layer/cudnn_rnn.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/cudnn_rnn.cc b/src/model/layer/cudnn_rnn.cc index 6f04e5c..242a342 100644 --- a/src/model/layer/cudnn_rnn.cc +++ b/src/model/layer/cudnn_rnn.cc @@ -30,12 +30,6 @@ CudnnRNN::~CudnnRNN() { CUDNN_CHECK(cudnnDestroyDropoutDescriptor(dropout_desc_)); if (rnn_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyRNNDescriptor(rnn_desc_)); - if (x_descs_ != nullptr) - for (size_t i = 0; i < seqLength_; i++) - CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_descs_[i])); - if (y_descs_ != nullptr) - for (size_t i = 0; i < seqLength_; i++) - CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_descs_[i])); if (hx_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(hx_desc_)); if (hy_desc_ != nullptr) @@ -44,284 +38,392 @@ CudnnRNN::~CudnnRNN() { CUDNN_CHECK(cudnnDestroyTensorDescriptor(cx_desc_)); if (cy_desc_ != nullptr) CUDNN_CHECK(cudnnDestroyTensorDescriptor(cy_desc_)); -} - -void CudnnRNN::Setup(const Shape& in_sample, const LayerConf &conf) { - RNN::Setup(in_sample, conf); - RNNConf rnn_conf = conf.rnn_conf(); - // convert MB to bytes - workspace_byte_limit_ = rnn_conf.workspace_byte_limit() << 20; - inputMode_ = ToLowerCase(rnn_conf.inputmode()); - direction_ = ToLowerCase(rnn_conf.direction()); - mode_ = ToLowerCase(rnn_conf.mode()); - CHECK(inputMode_ == "cudnn_linear_input" || inputMode_ == "cudnn_skip_input") - << "CudnnRNN only supports two inputmodes: cudnn_linear_input, " - "cudnn_skip_input"; - CHECK(direction_ == "cudnn_undirectional" || direction_ == "cudnn_bidirectional") - << "CudnnRNN only supports two directions: cudnn_undirectional, " - "cudnn_bidirectional"; - CHECK(mode_ == "cudnn_rnn_relu" || mode_ == "cudnn_rnn_tanh" || - mode_ == "cudnn_lstm" || mode_ == "cudnn_gru") - << "CudnnRNN only supports four modes: cudnn_rnn_relu, " - "cudnn_rnn_tanh, cudnn_lstm and cudnn_gru"; - // the first constant (4) is the size of float - // the second constant (2, 8, 6) is the number of sets of params - if (mode_ == "cudnn_rnn_relu" || mode_ == "cudnn_rnn_tanh") - weightSize_ = 4 * 2 * (hiddenSize_ * in_sample[2] + hiddenSize_); - else if (mode_ == "cudnn_lstm") - weightSize_ = 4 * 8 * (hiddenSize_ * in_sample[2] + hiddenSize_); - else if (mode_ == "cudnn_gru") - weightSize_ = 4 * 6 * (hiddenSize_ * in_sample[2] + hiddenSize_); - if (direction_ == "cudnn_bidirectional") - weightSize_ = weightSize_ * 2; + if (dhx_desc_ != nullptr) + CUDNN_CHECK(cudnnDestroyTensorDescriptor(dhx_desc_)); + if (dhy_desc_ != nullptr) + CUDNN_CHECK(cudnnDestroyTensorDescriptor(dhy_desc_)); + if (dcx_desc_ != nullptr) + CUDNN_CHECK(cudnnDestroyTensorDescriptor(dcx_desc_)); + if (dcy_desc_ != nullptr) + CUDNN_CHECK(cudnnDestroyTensorDescriptor(dcy_desc_)); + DestroyIODescriptors(); } void CudnnRNN::ToDevice(std::shared_ptr<Device> device) { - weight_.ToDevice(device); + RNN::ToDevice(device); workspace_.ToDevice(device); + reserve_space_.ToDevice(device); } -void CudnnRNN::InitCudnn(const Tensor &input) { - CHECK(!has_init_cudnn_); - DataType dtype = input.data_type(); - auto dev = input.device(); - Context *ctx = dev->context(0); - seqLength_ = input.shape(0); - size_t batchsize = input.shape(1); /*(seqLength, minibatch, inputSize) !!! 
*/ - size_t inputSize = input.shape(2); - size_t numDirections; - if (direction_ == "cudnn_undirectional") - numDirections = 1; - else - numDirections = 2; - x_descs_ = new cudnnTensorDescriptor_t[seqLength_]; - y_descs_ = new cudnnTensorDescriptor_t[seqLength_]; - for (size_t i = 0; i < seqLength_; i++) - CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_descs_[i])); - for (size_t i = 0; i < seqLength_; i++) - CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_descs_[i])); - CUDNN_CHECK(cudnnCreateTensorDescriptor(&hx_desc_)); - CUDNN_CHECK(cudnnCreateTensorDescriptor(&cx_desc_)); - CUDNN_CHECK(cudnnCreateTensorDescriptor(&hy_desc_)); - CUDNN_CHECK(cudnnCreateTensorDescriptor(&cy_desc_)); - CUDNN_CHECK(cudnnCreateFilterDescriptor(&weight_desc_)); +void CudnnRNN::DestroyIODescriptors() { + if (x_descs_ != nullptr) { + for (size_t i = 0; i < seq_length_; i++) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(x_descs_[i])); + CUDNN_CHECK(cudnnDestroyTensorDescriptor(dx_descs_[i])); + } + delete [] x_descs_; + delete [] dx_descs_; + } + if (y_descs_ != nullptr) { + for (size_t i = 0; i < seq_length_; i++) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(y_descs_[i])); + CUDNN_CHECK(cudnnDestroyTensorDescriptor(dy_descs_[i])); + } + delete [] y_descs_; + delete [] dy_descs_; + } +} + +void CudnnRNN::UpdateIODescriptors(size_t len, const vector<Tensor> &inputs) { + bool reset = false; + if (seq_length_ < len) { + DestroyIODescriptors(); + x_descs_ = new cudnnTensorDescriptor_t[len]; + dx_descs_ = new cudnnTensorDescriptor_t[len]; + y_descs_ = new cudnnTensorDescriptor_t[len]; + dy_descs_ = new cudnnTensorDescriptor_t[len]; + for (size_t i = 0; i < len; i++) { + CUDNN_CHECK(cudnnCreateTensorDescriptor(&x_descs_[i])); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&dx_descs_[i])); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&y_descs_[i])); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&dy_descs_[i])); + } + reset = true; + } + + for (size_t i = 0; i < len; i++) { + CHECK_EQ(inputs[i].shape(1), input_dim_); + if (inputs[i].shape(0) != batch_size_ || reset) { + int d[3] = {1, 1, 1}, s[3] = {1, 1, 1}; + d[0] = static_cast<int>(inputs[i].shape(0)); + CHECK_GT(d[0], 0); + d[1] = static_cast<int>(inputs[i].shape(1)); + s[0] = d[1] * d[2]; + s[1] = d[2]; + CUDNN_CHECK(cudnnSetTensorNdDescriptor(x_descs_[i], dtype_, 3, d, s)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(dx_descs_[i], dtype_, 3, d, s)); + + d[0] = static_cast<int>(inputs[i].shape(0)); + d[1] = static_cast<int>(hidden_dim_ * num_directions_); + s[0] = d[1] * d[2]; + s[1] = d[2]; + CUDNN_CHECK(cudnnSetTensorNdDescriptor(y_descs_[i], dtype_, 3, d, s)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(dy_descs_[i], dtype_, 3, d, s)); + } + } +} + +// must be called after setting IO descriptors +void CudnnRNN::SetRNNDescriptor(shared_ptr<Device> dev) { + auto ctx = dev->context(0); CUDNN_CHECK(cudnnCreateDropoutDescriptor(&dropout_desc_)); + size_t state_size; + CUDNN_CHECK(cudnnDropoutGetStatesSize(ctx->cudnn_handle, &state_size)); + dropout_state_ = Tensor(Shape{state_size}, dev, kChar); + CUDNN_CHECK(cudnnSetDropoutDescriptor( + dropout_desc_, ctx->cudnn_handle, dropout_, + dropout_state_.block()->mutable_data(), state_size, seed_)); + CUDNN_CHECK(cudnnCreateRNNDescriptor(&rnn_desc_)); + cudnnRNNInputMode_t input_mode; + if (input_mode_ == "linear") + input_mode = CUDNN_LINEAR_INPUT; + else if (input_mode_ == "skip") + input_mode = CUDNN_SKIP_INPUT; + cudnnDirectionMode_t direction; + if (direction_ == "unidirectional") + direction = CUDNN_UNIDIRECTIONAL; + else if (direction_ == 
"bidirectional") + direction = CUDNN_BIDIRECTIONAL; + + cudnnRNNMode_t rnn_mode; + if (rnn_mode_ == "relu") + rnn_mode = CUDNN_RNN_RELU; + else if (rnn_mode_ == "tanh") + rnn_mode = CUDNN_RNN_TANH; + else if (rnn_mode_ == "lstm") + rnn_mode = CUDNN_LSTM; + else if (rnn_mode_ == "gru") + rnn_mode = CUDNN_GRU; + CUDNN_CHECK(cudnnSetRNNDescriptor(rnn_desc_, hidden_dim_, num_stacks_, + dropout_desc_, input_mode, direction, + rnn_mode, dtype_)); + + size_t weight_size; + CUDNN_CHECK(cudnnGetRNNParamsSize(ctx->cudnn_handle, rnn_desc_, x_descs_[0], + &weight_size, dtype_)); + // check the size manually calculated + CHECK_EQ(weight_size, weight_.Size() * sizeof(float)); + int filter_dim[3] = {static_cast<int>(weight_size), 1, 1}; + CUDNN_CHECK(cudnnCreateFilterDescriptor(&weight_desc_)); + CUDNN_CHECK(cudnnSetFilterNdDescriptor(weight_desc_, dtype_, + CUDNN_TENSOR_NCHW, 3, filter_dim)); +} - int dimA[3] = {batchsize, inputSize, 1}; - int strideA[3] = {dimA[2] * dimA[1], dimA[2], 1}; - for (size_t i = 0; i < seqLength_; i++){ - dimA[0] = batchsize; - dimA[1] = inputSize; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - CUDNN_CHECK(cudnnSetTensorNdDescriptor(x_descs_[i], GetCudnnDataType(dtype), 3, - dimA, strideA)); - dimA[0] = batchsize; - dimA[1] = hiddenSize_ * numDirections; - dimA[2] = 1; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - CUDNN_CHECK(cudnnSetTensorNdDescriptor(y_descs_[i], GetCudnnDataType(dtype), 3, - dimA, strideA)); +void CudnnRNN::ResetHiddenAndCellDescriptors(size_t batch_size) { + if (batch_size_ == 0) { + CUDNN_CHECK(cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&dcy_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_CHECK(cudnnCreateTensorDescriptor(&dhy_desc_)); } - - dimA[0] = numLayers_; - dimA[1] = batchsize; - dimA[2] = hiddenSize_ * numDirections; - strideA[0] = dimA[2] * dimA[1]; - strideA[1] = dimA[2]; - strideA[2] = 1; - CUDNN_CHECK(cudnnSetTensorNdDescriptor(hx_desc_, GetCudnnDataType(dtype), 3, - dimA, strideA)); - CUDNN_CHECK(cudnnSetTensorNdDescriptor(cx_desc_, GetCudnnDataType(dtype), 3, - dimA, strideA)); - CUDNN_CHECK(cudnnSetTensorNdDescriptor(hy_desc_, GetCudnnDataType(dtype), 3, - dimA, strideA)); - CUDNN_CHECK(cudnnSetTensorNdDescriptor(cy_desc_, GetCudnnDataType(dtype), 3, - dimA, strideA)); - size_t dropoutStatesSize; - CUDNN_CHECK(cudnnDropoutGetStatesSize(ctx->cudnn_handle, &dropoutStatesSize)); - dropoutStates_ = Tensor(Shape{dropoutStatesSize}, dev, dtype); - CUDNN_CHECK(cudnnSetDropoutDescriptor(dropout_desc_, ctx->cudnn_handle, dropout_, this->dropoutStates_.block()->mutable_data(), dropoutStatesSize, 0x01234567)); - - cudnnRNNInputMode_t inputMode; - cudnnDirectionMode_t direction; - cudnnRNNMode_t mode; - - if (inputMode_ == "cudnn_linear_input" || inputMode_ == "cudnn_skip_input"){ - if (inputMode_ == "cudnn_linear_input") - inputMode = CUDNN_LINEAR_INPUT; - else if (inputMode_ == "cudnn_skip_input") - inputMode = CUDNN_SKIP_INPUT; + int dim[3] = {1, 1, 1}; + dim[0] = static_cast<int>(num_stacks_ * num_directions_); + dim[1] = static_cast<int>(batch_size); + dim[2] = static_cast<int>(hidden_dim_); + int stride[3] = {1, 1, 1}; + stride[0] = dim[1] * dim[2]; + stride[1] = dim[2]; + 
CUDNN_CHECK(cudnnSetTensorNdDescriptor(hx_desc_, dtype_, 3, dim, stride)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(dhx_desc_, dtype_, 3, dim, stride)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(hy_desc_, dtype_, 3, dim, stride)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(dhy_desc_, dtype_, 3, dim, stride)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(cx_desc_, dtype_, 3, dim, stride)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(dcx_desc_, dtype_, 3, dim, stride)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(cy_desc_, dtype_, 3, dim, stride)); + CUDNN_CHECK(cudnnSetTensorNdDescriptor(dcy_desc_, dtype_, 3, dim, stride)); +} + +void CudnnRNN::UpdateSpaces(size_t seq_length, shared_ptr<Device> dev) { + size_t count; + auto ctx = dev->context(0); + CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx->cudnn_handle, rnn_desc_, + seq_length, x_descs_, &count)); + if (workspace_.Size() != count) { + workspace_ = Tensor(Shape{count}, dev, kChar); + // workspace_.SetValue(0); } - if (direction_ == "cudnn_undirectional" || direction_ == "cudnn_bidirectional"){ - if (direction_ == "cudnn_undirectional") - direction = CUDNN_UNIDIRECTIONAL; - else if (direction_ == "cudnn_bidirectional") - direction = CUDNN_BIDIRECTIONAL; + + CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx->cudnn_handle, rnn_desc_, + seq_length, x_descs_, &count)); + if (reserve_space_.Size() != count) { + reserve_space_ = Tensor(Shape{count}, dev, kChar); + // reserve_space_.SetValue(0); } - if (mode_ == "cudnn_rnn_relu" || mode_ == "cudnn_rnn_tanh" || - mode_ == "cudnn_lstm" || mode_ == "cudnn_gru"){ - if (mode_ == "cudnn_rnn_relu") - mode = CUDNN_RNN_RELU; - else if (mode_ == "cudnn_rnn_tanh") - mode = CUDNN_RNN_TANH; - else if (mode_ == "cudnn_lstm") - mode = CUDNN_LSTM; - else if (mode_ == "cudnn_gru") - mode = CUDNN_GRU; +} + +void CudnnRNN::UpdateStates(size_t num_x, const vector<Tensor> &inputs) { + UpdateIODescriptors(num_x, inputs); + size_t new_batch_size = inputs.at(0).shape(0); + if (batch_size_ != new_batch_size) + ResetHiddenAndCellDescriptors(new_batch_size); + if (rnn_desc_ == nullptr) + SetRNNDescriptor(inputs.at(0).device()); + UpdateSpaces(num_x, inputs.at(0).device()); + batch_size_ = new_batch_size; + seq_length_ = num_x; +} + +Tensor CudnnRNN::MergeInputs(size_t num, const vector<Tensor> &in) { + if (num == 1) + return in.at(0); + size_t size = 0; + for (size_t i = 0; i < num; i++) size += in.at(i).Size(); + Tensor out(Shape{size}, in.at(0).device(), in.at(0).data_type()); + for (size_t i = 0, offset = 0; i < num; i++) { + CopyDataToFrom(&out, in.at(i), in.at(i).Size(), offset); + offset += in.at(i).Size(); + } + return out; +} + +vector<Tensor> CudnnRNN::SplitOutput(size_t num, size_t dim, + const vector<Tensor> &in, + const Tensor output) { + vector<Tensor> outputs; + if (num == 1) { + outputs.push_back(output); + } else { + for (size_t i = 0, offset = 0; offset < output.Size(); i++) { + Shape s{in.at(i).shape(0), dim}; + Tensor out(s, output.device(), output.data_type()); + CopyDataToFrom(&out, output, out.Size(), 0, offset); + outputs.push_back(out); + offset += out.Size(); + } + CHECK_EQ(num, outputs.size()); } - CUDNN_CHECK(cudnnSetRNNDescriptor(rnn_desc_, hiddenSize_, numLayers_, dropout_desc_, inputMode, direction, mode, GetCudnnDataType(dtype))); + return outputs; +} - size_t weightSize; - CUDNN_CHECK(cudnnGetRNNParamsSize(ctx->cudnn_handle, rnn_desc_, x_descs_[0], &weightSize, GetCudnnDataType(dtype))); - CHECK_EQ(weightSize, weightSize_); +const vector<Tensor> CudnnRNN::Forward(int flag, const vector<Tensor> &inputs) { + 
DataType dtype = inputs.at(0).data_type(); + auto dev = inputs.at(0).device(); - int filterDimA[3] = {weightSize_, 1, 1}; - CUDNN_CHECK(cudnnSetFilterNdDescriptor(weight_desc_, GetCudnnDataType(dtype), CUDNN_TENSOR_NCHW, 3, filterDimA)); + // copy input data into a block of contiguous memory + // hx (and cx) is at the end of inputs + CHECK_GT(inputs.size(), 1u + has_cell_); + size_t num_x = inputs.size() - has_cell_ - 1; + Tensor input = MergeInputs(num_x, inputs); + LOG(INFO) << "input size " << input.Size() << " value " << input.L1(); - - CUDNN_CHECK(cudnnGetRNNWorkspaceSize(ctx->cudnn_handle, rnn_desc_, seqLength_, x_descs_, &workspace_count_)); - workspace_ = Tensor(Shape{workspace_count_}, dev, dtype); + if (rnn_desc_ != nullptr) + CHECK_EQ(dtype_, GetCudnnDataType(dtype)) + << "Cannot change cudnn data type during training from " << dtype_ + << " to " << GetCudnnDataType(dtype); + else + dtype_ = GetCudnnDataType(dtype); - CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(ctx->cudnn_handle, rnn_desc_, seqLength_, x_descs_, &ReserveSize_)); - reserve_ = Tensor(Shape{ReserveSize_}, dev, dtype); - has_init_cudnn_ = true; -} + UpdateStates(num_x, inputs); + // CheckFowardShapes(); -const vector<Tensor> CudnnRNN::Forward(int flag, const vector<Tensor>& inputs) { - /*(seqLength, minibatch, inputSize) !!! */ - singa::Tensor input = inputs[0]; - singa::Tensor hx = inputs[1]; - singa:: Tensor cx = inputs[2]; - CHECK_EQ(input.device()->lang(), kCuda); - CHECK_EQ(input.device()->lang(), this->weight_.device()->lang()); - CHECK_EQ(input.nDim(), 3u); - vector<Tensor> data_output; - if (flag & kTrain) buf_.push(input); // buffer the input for backward - size_t batchsize = input.shape(1); /*(seqLength, minibatch, inputSize) !!! */ - DataType dtype = input.data_type(); - auto dev = input.device(); - - if (!has_init_cudnn_) InitCudnn(input); - - - size_t numDirections; - if (direction_ == "cudnn_undirectional") - numDirections = 1; - else - numDirections = 2; - - Shape shape{seqLength_, batchsize, hiddenSize_ * numDirections}; - Tensor output(shape, dev, dtype); - Shape shape1{numLayers_, batchsize, hiddenSize_ * numDirections}; - Tensor hy(shape1, dev, dtype); - Tensor cy(shape1, dev, dtype); - - output.device()->Exec([input, output, hx, hy, cx, cy, this](Context *ctx) { - Block *inblock = input.block(), *outblock = output.block(), - *wblock = this->weight_.block(), *hxblock = hx.block(), - *hyblock = hy.block(), *cxblock = cx.block(), - *cyblock = cy.block(); - cudnnRNNForwardTraining( - ctx->cudnn_handle, this->rnn_desc_, seqLength_, this->x_descs_, - inblock->data(), this->hx_desc_, hxblock->data(), this->cx_desc_, - cxblock->data(), this->weight_desc_, wblock->data(), this->y_descs_, - outblock->mutable_data(), this->hy_desc_, hyblock->mutable_data(), - cy_desc_, cyblock->mutable_data(), this->workspace_.block()->mutable_data(), - this->workspace_count_ * sizeof(float), this->reserve_.block()->mutable_data(), - this->ReserveSize_ * sizeof(float)); -}, {input.block(), weight_.block(), hx.block(), cx.block()}, - {output.block(), hy.block(), cy.block()}, workspace_.block()); - buf_.push(output); - buf_.push(hx); - buf_.push(hy); // in order to assign shape to dhy - buf_.push(cx); - buf_.push(cy); // in order to assign shape to dcy - data_output.push_back(output); - data_output.push_back(hy); - data_output.push_back(cy); - return data_output; + Shape outshape{input.Size() * hidden_dim_ / input_dim_ * num_directions_}; + Tensor output(outshape, dev, dtype); + LOG(INFO) << "output size " << output.Size(); + 
Tensor hx = inputs.at(num_x); + Shape state_shape{num_stacks_ * num_directions_, batch_size_, hidden_dim_}; + Tensor hy(state_shape, dev, dtype); + Tensor cy, cx; + if (has_cell_) { + cx = inputs.at(num_x + 1); + cy.ResetLike(hy); + } + + LOG(INFO) << "hidden size " << hy.Size(); + LOG(INFO) << "weight size " << weight_.Size() << " value " << weight_.L1(); + Block *inb = input.block(), *outb = output.block(), + *wb = this->weight_.block(), *hxb = hx.block(), *cxb = cx.block(), + *hyb = hy.block(), *cyb = cy.block(), + *wspace = this->workspace_.block(), + *rspace = this->reserve_space_.block(); + if (flag & kTrain) { + dev->Exec( + [inb, outb, wb, hxb, cxb, hyb, cyb, wspace, rspace, this](Context *ctx) { + // clang-format off + cudnnRNNForwardTraining( + ctx->cudnn_handle, + this->rnn_desc_, + this->seq_length_, + this->x_descs_, inb->data(), + this->hx_desc_, hxb == nullptr ? nullptr : hxb->data(), + this->cx_desc_, cxb == nullptr ? nullptr : cxb->data(), + this->weight_desc_, wb->data(), + this->y_descs_, outb->mutable_data(), + this->hy_desc_, hyb->mutable_data(), + this->cy_desc_, cyb == nullptr ? nullptr : cyb->mutable_data(), + wspace->mutable_data(), + this->workspace_.Size(), rspace->mutable_data(), + this->reserve_space_.Size()); + // clang-format on + }, + {inb, wb, hxb, cxb}, {outb, hyb, cyb, wspace, rspace}); + buf_.push(input); + buf_.push(output); + buf_.push(hx); + buf_.push(cx); + } else { + dev->Exec([inb, outb, wb, hxb, cxb, hyb, cyb, wspace, this](Context *ctx) { + // clang-format off + cudnnRNNForwardInference( + ctx->cudnn_handle, + this->rnn_desc_, + this->seq_length_, + this->x_descs_, inb->data(), + this->hx_desc_, hxb == nullptr ? nullptr : hxb->data(), + this->cx_desc_, cxb == nullptr ? nullptr : cxb->data(), + this->weight_desc_, wb->data(), + this->y_descs_, outb->mutable_data(), + this->hy_desc_, hyb->mutable_data(), + this->cy_desc_, cyb == nullptr ? nullptr : cyb->mutable_data(), + wspace->mutable_data(), this->workspace_.Size()); + // clang-format on + }, {inb, wb, hxb, cxb}, {outb, hyb, cyb, wspace}); + } + auto outputs = + SplitOutput(num_x, hidden_dim_ * num_directions_, inputs, output); + outputs.push_back(hy); + if (has_cell_) outputs.push_back(cy); + return outputs; } +// TODO(wangwei) check Tensor device to be on cuda? 
const std::pair<vector<Tensor>, vector<Tensor>> CudnnRNN::Backward( - int flag, const vector<Tensor>& grads) { - CHECK(has_init_cudnn_); - singa::Tensor grad = grads[0]; - singa::Tensor dhy = grads[1]; - singa::Tensor dcy = grads[2]; - CHECK_EQ(grad.device()->lang(), kCuda); - CHECK_EQ(grad.nDim(), 3u); - CHECK(!buf_.empty()); - Tensor cy = buf_.top(); + int flag, const vector<Tensor> &grads) { + // dhy (and dcy) is at last + const Tensor cx = buf_.top(); // cannot use const Tensor& due to pop() buf_.pop(); - CHECK(!buf_.empty()); - Tensor cx = buf_.top(); + const Tensor hx = buf_.top(); buf_.pop(); - CHECK(!buf_.empty()); - Tensor hy = buf_.top(); + const Tensor y = buf_.top(); buf_.pop(); - CHECK(!buf_.empty()); - Tensor hx = buf_.top(); + const Tensor x = buf_.top(); buf_.pop(); - CHECK(!buf_.empty()); - Tensor src_output = buf_.top(); - buf_.pop(); - CHECK(!buf_.empty()); - Tensor src_data = buf_.top(); - buf_.pop(); - vector<Tensor> param_grad; - vector<Tensor> data_grad; - Tensor dx; - dx.ResetLike(src_data); - Tensor dw; - dw.ResetLike(weight_); - Tensor dhx; - dhx.ResetLike(hx); + + auto dev = y.device(); + auto dtype = y.data_type(); + + CHECK_GT(grads.size(), 1u + has_cell_); + size_t num_dy = grads.size() - has_cell_ - 1; + CHECK_EQ(num_dy, seq_length_); + const Tensor dy = MergeInputs(num_dy, grads); + CHECK_EQ(dy.Size(), y.Size()); + const Tensor dhy = grads.at(num_dy); + Tensor dcy; + if (has_cell_) + dcy = grads.at(num_dy + 1); + + Shape xshape{y.Size() * input_dim_ / hidden_dim_ / num_directions_}; + Tensor dx(xshape, dev, dtype); + Tensor dw(weight_.shape(), dev, dtype); + Shape state_shape{num_stacks_ * num_directions_, batch_size_, hidden_dim_}; + Tensor dhx(state_shape, dev, dtype); Tensor dcx; - dcx.ResetLike(cx); + if (has_cell_) + dcx.ResetLike(dhx); + dw.SetValue(0.0f); + Block *yb = y.block(), *dyb = dy.block(), *dhyb = dhy.block(), + *dcyb = dcy.block(), *xb = x.block(), *cxb = cx.block(), + *wb = weight_.block(), *dwb = dw.block(), *hxb = hx.block(), + *dxb = dx.block(), *dhxb = dhx.block(), *dcxb = dcx.block(), + *wspace = workspace_.block(), *rspace = reserve_space_.block(); + y.device()->Exec( + [yb, dyb, dhyb, dcyb, xb, cxb, wb, dwb, hxb, dxb, dhxb, dcxb, wspace, + rspace, this](Context *ctx) { + // clang-format off + cudnnRNNBackwardData( + ctx->cudnn_handle, + this->rnn_desc_, + this->seq_length_, + this->y_descs_, yb->data(), + this->dy_descs_, dyb->data(), + this->dhy_desc_, dhyb == nullptr ? nullptr : dhyb->data(), + this->dcy_desc_, dcyb == nullptr ? nullptr : dcyb->data(), + this->weight_desc_, wb->data(), + this->hx_desc_, hxb == nullptr ? nullptr : hxb->data(), + this->cx_desc_, cxb == nullptr ? nullptr : cxb->data(), + this->dx_descs_, dxb->mutable_data(), + this->dhx_desc_, dhxb->mutable_data(), + this->dcx_desc_, dcxb == nullptr ? nullptr : dcxb->mutable_data(), + wspace->mutable_data(), this->workspace_.Size(), + rspace->mutable_data(), this->reserve_space_.Size()); + cudnnRNNBackwardWeights( + ctx->cudnn_handle, + this->rnn_desc_, + this->seq_length_, + this->x_descs_, xb->data(), + this->hx_desc_, hxb == nullptr ? 
nullptr : hxb->data(), + this->y_descs_, yb->data(), + wspace->data(), this->workspace_.Size(), + this->dweight_desc_, dwb->mutable_data(), + rspace->data(), this->reserve_space_.Size()); + // clang-format on + }, + {yb, dyb, dhyb, dcyb, xb, wb, wspace, rspace}, + {dxb, dwb, dhxb, dcxb, wspace, rspace}); - dx.device()->Exec([grad, dw, src_data, src_output, hx, this](Context *ctx) { - Block *inblock = src_data.block(), *srcoutblock = src_output.block(), - *dwblock = dw.block(), *hxblock = hx.block(); - cudnnRNNBackwardWeights( - ctx->cudnn_handle, this->rnn_desc_, seqLength_, this->x_descs_, - inblock->data(), this->hx_desc_, hxblock->data(), this->y_descs_, - srcoutblock->data(), this->workspace_.block()->mutable_data(), - this->workspace_count_ * sizeof(float), this->weight_desc_, - dwblock->mutable_data(), this->reserve_.block()->mutable_data(), - this->ReserveSize_ * sizeof(float)); - }, {src_data.block(), hx.block(), src_output.block()}, {dw.block(), workspace_.block()}); - - // LOG(ERROR) << "backward src"; - dx.device()->Exec([grad, dw, src_output, dx, cy, cx, hy, hx, dhy, dcy, dhx, dcx, this](Context *ctx) { - Block *srcoutblock = src_output.block(), *wblock = this->weight_.block(), *dxblock = dx.block(), - *dyblock = grad.block(), *cxblock = cx.block(), *hxblock = hx.block(), *dhyblock = dhy.block(), - *dcyblock = dcy.block(), *dhxblock = dhx.block(), *dcxblock = dcx.block(); - cudnnRNNBackwardData( - ctx->cudnn_handle, this->rnn_desc_, seqLength_, this->y_descs_, srcoutblock->data(), - this->y_descs_, dyblock->data(), this->hy_desc_, dhyblock->data(), - this->cy_desc_, dcyblock->data(), this->weight_desc_, wblock->data(), - this->hx_desc_, hxblock->data(), this->cx_desc_, cxblock->data(), - this->x_descs_, dxblock->mutable_data(), this->hx_desc_, dhxblock->mutable_data(), - this->cx_desc_, dcxblock->mutable_data(), this->workspace_.block()->mutable_data(), - this->workspace_count_ * sizeof(float), this->reserve_.block()->mutable_data(), - this->ReserveSize_ * sizeof(float)); - }, {hx.block(), src_output.block(), grad.block(), grad.block(), dhy.block(), dcy.block(), - this->weight_.block(), hx.block(), cx.block()}, - {dx.block(), dhx.block(), dcx.block(), reserve_.block(), workspace_.block()}); - param_grad.push_back(dw); - data_grad.push_back(dx); - data_grad.push_back(dhx); - data_grad.push_back(dcx); - return std::make_pair(data_grad, param_grad); + vector <Tensor> param_grad{dw}; + auto data_grads = SplitOutput(num_dy, input_dim_, grads, dx); + data_grads.push_back(dhx); + if (has_cell_) + data_grads.push_back(dcx); + return std::make_pair(data_grads, param_grad); } } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/layer/cudnn_rnn.h ---------------------------------------------------------------------- diff --git a/src/model/layer/cudnn_rnn.h b/src/model/layer/cudnn_rnn.h index b1e9f43..d2f8db5 100644 --- a/src/model/layer/cudnn_rnn.h +++ b/src/model/layer/cudnn_rnn.h @@ -20,6 +20,7 @@ #define SRC_MODEL_LAYER_CUDNN_RNN_H_ #include "singa/singa_config.h" #ifdef USE_CUDNN +#if CUDNN_VERSION_MAJOR >= 5 #include <string> #include <utility> #include <vector> @@ -41,45 +42,46 @@ class CudnnRNN : public RNN { const std::string layer_type() const override { return "CudnnRNN"; } const vector<Tensor> Forward(int flag, const vector<Tensor>& inputs) override; - const std::pair<vector<Tensor>, vector<Tensor>> Backward(int flag, const vector<Tensor>& grads) override; - - /// \copydoc Layer::Setup(const LayerConf&); - void Setup(const Shape& 
in_sample, const LayerConf &conf) override; + const std::pair<vector<Tensor>, vector<Tensor>> Backward( + int flag, const vector<Tensor>& grads) override; void ToDevice(std::shared_ptr<Device> device) override; - size_t workspace_byte_limit() { return workspace_byte_limit_; } - // string prefer() { return prefer_; } - string inputMode() const { return inputMode_; } - string direction() const { return direction_; } - string mode() const { return mode_; } - - protected: - /// Init cudnn related data structures. - void InitCudnn(const Tensor& input); + void SetRNNDescriptor(shared_ptr<Device> dev); + void ResetHiddenAndCellDescriptors(size_t batch_size); + void DestroyIODescriptors(); + void UpdateIODescriptors(size_t num, const vector<Tensor>& inputs); + void UpdateSpaces(size_t num, shared_ptr<Device> dev); + void UpdateStates(size_t num, const vector<Tensor>& inputs); + Tensor MergeInputs(size_t num, const vector<Tensor>& in); + vector<Tensor> SplitOutput(size_t num, size_t dim, const vector<Tensor>& in, + const Tensor output); protected: - bool has_init_cudnn_ = false; cudnnTensorDescriptor_t* x_descs_ = nullptr; + cudnnTensorDescriptor_t* dx_descs_ = nullptr; cudnnTensorDescriptor_t* y_descs_ = nullptr; + cudnnTensorDescriptor_t* dy_descs_ = nullptr; cudnnTensorDescriptor_t hx_desc_ = nullptr; + cudnnTensorDescriptor_t dhx_desc_ = nullptr; cudnnTensorDescriptor_t cx_desc_ = nullptr; + cudnnTensorDescriptor_t dcx_desc_ = nullptr; cudnnTensorDescriptor_t hy_desc_ = nullptr; + cudnnTensorDescriptor_t dhy_desc_ = nullptr; cudnnTensorDescriptor_t cy_desc_ = nullptr; + cudnnTensorDescriptor_t dcy_desc_ = nullptr; cudnnFilterDescriptor_t weight_desc_ = nullptr; + cudnnFilterDescriptor_t dweight_desc_ = nullptr; cudnnRNNDescriptor_t rnn_desc_ = nullptr; cudnnDropoutDescriptor_t dropout_desc_ = nullptr; - size_t workspace_byte_limit_, workspace_count_; - size_t ReserveSize_; + cudnnDataType_t dtype_ = CUDNN_DATA_FLOAT; Tensor workspace_; - string inputMode_; - string direction_; - string mode_; - Tensor reserve_; - Tensor dropoutStates_; + Tensor reserve_space_; + Tensor dropout_state_; }; } // namespace singa +#endif // CUDNN_VERSION_MAJOR >= 5 #endif // USE_CUDNN #endif // SRC_MODEL_LAYER_CUDNN_RNN_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/layer/rnn.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/rnn.cc b/src/model/layer/rnn.cc index 493a5e4..6b831a7 100644 --- a/src/model/layer/rnn.cc +++ b/src/model/layer/rnn.cc @@ -19,20 +19,64 @@ #include "./rnn.h" #include <vector> #include "singa/model/layer.h" +#include "singa/utils/string.h" namespace singa { void RNN::Setup(const Shape& in_sample, const LayerConf &conf) { Layer::Setup(in_sample, conf); + RNNConf rnn_conf = conf.rnn_conf(); - hiddenSize_ = rnn_conf.hiddensize(); - CHECK_GT(hiddenSize_, 0u); + hidden_dim_ = rnn_conf.hidden_dim(); + CHECK_GT(hidden_dim_, 0u); + num_stacks_ = rnn_conf.num_stacks(); + CHECK_GT(num_stacks_, 0u); + input_dim_ = Product(in_sample); + CHECK_GT(input_dim_, 0u); + dropout_ = rnn_conf.dropout(); + CHECK_GE(dropout_, 0); - numLayers_ = rnn_conf.numlayers(); - CHECK_GT(numLayers_, 0u); + input_mode_ = ToLowerCase(rnn_conf.input_mode()); + CHECK(input_mode_ == "linear" || input_mode_ == "skip") + << "Input mode of " << input_mode_ << " is not supported; Please use " + << "'linear' and 'skip'"; - dropout_ = rnn_conf.dropout(); - CHECK_GE(dropout_, 0u); + direction_ = ToLowerCase(rnn_conf.direction()); + if (direction_ == 
"unidirectional") + num_directions_ = 1; + else if (direction_ == "bidirectional") + num_directions_ = 2; + else + LOG(FATAL) << "Direction of " << direction_ + << " is not supported; Please use unidirectional or bidirectional"; + + rnn_mode_ = ToLowerCase(rnn_conf.rnn_mode()); + if (rnn_mode_ == "lstm") { + has_cell_ = true; + } else if (rnn_mode_ !="relu" && rnn_mode_ != "tanh" && rnn_mode_ != "gru") { + LOG(FATAL) << "RNN memory unit (mode) of " << rnn_mode_ + << " is not supported Please use 'relu', 'tanh', 'lstm' and 'gru'"; + } + // the first constant (4) is the size of float + // the second constant (2, 8, 6) is the number of sets of params + int mult = 1; + if (rnn_mode_ == "relu" || rnn_mode_ == "tanh") + mult *= 1; + else if (rnn_mode_ == "lstm") + mult *= 4; + else if (rnn_mode_ == "gru") + mult *= 3; + if (direction_ == "bidirectional") + mult *= 2; + + size_t weight_size = 0; + for (size_t i = 0; i < num_stacks_; i++) { + size_t dim = hidden_dim_ * (in_sample[0] + hidden_dim_ + 2); + if (i > 0) + dim = hidden_dim_ * (hidden_dim_ + hidden_dim_ + 2); + weight_size += mult * dim; + } + weight_.Reshape(Shape{weight_size}); } const vector<Tensor> RNN::Forward(int flag, const vector<Tensor>& inputs) { @@ -40,7 +84,8 @@ const vector<Tensor> RNN::Forward(int flag, const vector<Tensor>& inputs) { return data_output; } -const std::pair<vector<Tensor>, vector<Tensor>> RNN::Backward(int flag, const vector<Tensor>& grads) { +const std::pair<vector<Tensor>, vector<Tensor>> RNN::Backward(int flag, + const vector<Tensor>& grads) { vector<Tensor> param_grad; vector<Tensor> data_grad; return std::make_pair(data_grad, param_grad); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/layer/rnn.h ---------------------------------------------------------------------- diff --git a/src/model/layer/rnn.h b/src/model/layer/rnn.h index ec5a35d..3750021 100644 --- a/src/model/layer/rnn.h +++ b/src/model/layer/rnn.h @@ -47,38 +47,37 @@ class RNN : public Layer { const std::pair<vector<Tensor>, vector<Tensor>> Backward( int flag, const vector<Tensor>& grads) override; - - size_t hiddenSize() const { return hiddenSize_; } - size_t numLayers() const { return numLayers_; } - size_t weightSize() const { return weightSize_; } - float dropout() const { return dropout_; } - void set_weight(Tensor w) { weight_.ResetLike(w); weight_.CopyData(w); } - + const vector<Tensor> param_values() override { + return vector<Tensor>{weight_}; + } void ToDevice(std::shared_ptr<Device> device) override; /// Return the internal state stack, which should be empty at the beginning - /// of - /// one iteration. + /// of one iteration. // std::stack<Tensor> states() const { return states_; } + string input_mode() const { return input_mode_; } + string direction() const { return direction_; } + string rnn_mode() const { return rnn_mode_; } + protected: /// Storing input or output from Forward(), which are used in Backward(). /// Rules: /// 1. push the 'input' or 'output' into states_ if the flag of Forward() is /// for kTrain and 'input' or 'output' is necessary for Backward(). /// 2. pop data out in Backward(). 
- // std::stack<Tensor*> states_; std::stack<Tensor> buf_; - size_t hiddenSize_; - size_t numLayers_; - size_t numLinearLayer_; - size_t seqLength_; - size_t weightSize_; /*all the weights and biases*/ - float dropout_; + bool has_cell_ = false; + size_t num_directions_ = 1; + size_t input_dim_ = 0, hidden_dim_ = 0, num_stacks_ = 0, seq_length_ = 0; + size_t batch_size_ = 0; + size_t seed_ = 0x1234567; + float dropout_ = 0.0f; + string input_mode_, direction_, rnn_mode_; Tensor weight_; }; } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/optimizer/adagrad.cc ---------------------------------------------------------------------- diff --git a/src/model/optimizer/adagrad.cc b/src/model/optimizer/adagrad.cc index 3ed1855..cdb3fac 100644 --- a/src/model/optimizer/adagrad.cc +++ b/src/model/optimizer/adagrad.cc @@ -27,8 +27,10 @@ void AdaGrad::Setup(const OptimizerConf& conf) { delta_ = conf.delta(); } // value = value - lr*grad/sqrt(history+delta) void AdaGrad::Apply(int step, float lr, const string& name, const Tensor& grad, Tensor& value) { - if (history_gradient_.find(name) == history_gradient_.end()) + if (history_gradient_.find(name) == history_gradient_.end()) { history_gradient_[name].ResetLike(value); + history_gradient_[name].SetValue(0.0f); + } Tensor& history = history_gradient_[name]; Tensor tmp = Square(grad); history += tmp; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/optimizer/nesterov.cc ---------------------------------------------------------------------- diff --git a/src/model/optimizer/nesterov.cc b/src/model/optimizer/nesterov.cc index e5354b1..051499b 100644 --- a/src/model/optimizer/nesterov.cc +++ b/src/model/optimizer/nesterov.cc @@ -34,8 +34,10 @@ void Nesterov::Apply(int step, float lr, const string& name, const Tensor& grad, Tensor& value) { if (momentum_generator_) { float mom = momentum_generator_(step); - if (history_gradient_.find(name) == history_gradient_.end()) + if (history_gradient_.find(name) == history_gradient_.end()) { history_gradient_[name].ResetLike(value); + history_gradient_[name].SetValue(0.0f); + } Tensor& history = history_gradient_[name]; Tensor tmp = history.Clone(); history *= mom; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/optimizer/rmsprop.cc ---------------------------------------------------------------------- diff --git a/src/model/optimizer/rmsprop.cc b/src/model/optimizer/rmsprop.cc index 6d77ccd..13e2a75 100644 --- a/src/model/optimizer/rmsprop.cc +++ b/src/model/optimizer/rmsprop.cc @@ -32,6 +32,7 @@ void RMSProp::Apply(int step, float lr, const string& name, const Tensor& grad, Tensor& value) { if (history_gradient_.find(name) == history_gradient_.end()) { history_gradient_[name].ResetLike(value); + history_gradient_[name].SetValue(0.0f); } Tensor& history = history_gradient_[name]; history *= rho_; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/optimizer/sgd.cc ---------------------------------------------------------------------- diff --git a/src/model/optimizer/sgd.cc b/src/model/optimizer/sgd.cc index 2797fc6..d78d5b8 100644 --- a/src/model/optimizer/sgd.cc +++ b/src/model/optimizer/sgd.cc @@ -36,8 +36,10 @@ void SGD::Apply(int step, float lr, const string& name, const Tensor& grad, if (momentum_generator_) { float mom = momentum_generator_(step); if (mom != 0) { - if (history_gradient_.find(name) == history_gradient_.end()) + if (history_gradient_.find(name) == 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/model/optimizer/sgd.cc
----------------------------------------------------------------------
diff --git a/src/model/optimizer/sgd.cc b/src/model/optimizer/sgd.cc
index 2797fc6..d78d5b8 100644
--- a/src/model/optimizer/sgd.cc
+++ b/src/model/optimizer/sgd.cc
@@ -36,8 +36,10 @@ void SGD::Apply(int step, float lr, const string& name, const Tensor& grad,
   if (momentum_generator_) {
     float mom = momentum_generator_(step);
     if (mom != 0) {
-      if (history_gradient_.find(name) == history_gradient_.end())
+      if (history_gradient_.find(name) == history_gradient_.end()) {
         history_gradient_[name].ResetLike(value);
+        history_gradient_[name].SetValue(0.0f);
+      }
       Tensor& history = history_gradient_[name];
       history *= mom;
       Axpy(lr, grad, &history);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/src/proto/model.proto
----------------------------------------------------------------------
diff --git a/src/proto/model.proto b/src/proto/model.proto
index d8193f1..31ebfc3 100644
--- a/src/proto/model.proto
+++ b/src/proto/model.proto
@@ -393,19 +393,19 @@ message ConvolutionConf {
 }
 
 message RNNConf {
-  optional uint32 hiddensize = 1;  // The number of hiddensize
-  optional uint32 numlayers = 2;  // The number of stacked RNN layers
+  optional uint32 hidden_dim = 1;  // The hidden feature dimension
+  optional uint32 num_stacks = 2;  // The number of stacked RNN layers
   optional float dropout = 3 [default = 0];
-  optional int32 workspace_byte_limit = 50 [default = 512];
+  optional bool remember_state = 4 [default = false];
   // cudnn inputmode
-  // options: "cudnn_linear_input", "cudnn_skip_input"
-  optional string inputmode = 51 [default = "cudnn_linear_input"];
+  // options: "linear", "skip"
+  optional string input_mode = 7 [default = "linear"];
   // cudnn direction
-  // options: "cudnn_undirectional", "cudnn_bidirectional"
-  optional string direction = 52 [default = "cudnn_undirectional"];
+  // options: "unidirectional", "bidirectional"
+  optional string direction = 8 [default = "unidirectional"];
   // cudnn RNN mode
-  // options: "cudnn_rnn_relu", "cudnn_rnn_tanh", "cudnn_lstm", "cudnn_gru"
-  optional string mode = 53 [default = "cudnn_rnn_relu"];
+  // options: "relu", "tanh", "lstm", "gru"
+  optional string rnn_mode = 9 [default = "relu"];
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/8e0b1083/test/singa/test_cudnn_rnn.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_cudnn_rnn.cc b/test/singa/test_cudnn_rnn.cc
index 1a79d7b..ebbf0aa 100644
--- a/test/singa/test_cudnn_rnn.cc
+++ b/test/singa/test_cudnn_rnn.cc
@@ -26,187 +26,154 @@ using singa::CudnnRNN;
 using singa::Shape;
-TEST(CudnnRNN, Setup) {
+using singa::Tensor;
+class TestCudnnRNN : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    singa::RNNConf *rnnconf = conf.mutable_rnn_conf();
+    rnnconf->set_hidden_dim(hidden_dim);
+    rnnconf->set_num_stacks(1);
+    rnnconf->set_dropout(1);
+    rnnconf->set_input_mode("linear");
+    rnnconf->set_direction("unidirectional");
+    rnnconf->set_rnn_mode("tanh");
+  }
+  singa::LayerConf conf;
+  size_t hidden_dim = 4;
+};
+
+TEST_F(TestCudnnRNN, Setup) {
   CudnnRNN rnn;
   EXPECT_EQ("CudnnRNN", rnn.layer_type());
-
-  singa::LayerConf conf;
-  singa::RNNConf *rnnconf = conf.mutable_rnn_conf();
-  rnnconf->set_hiddensize(2);
-  rnnconf->set_numlayers(1);
-  rnnconf->set_dropout(0);
-  rnnconf->set_inputmode("cudnn_linear_input");
-  rnnconf->set_direction("cudnn_undirectional");
-  rnnconf->set_mode("cudnn_rnn_tanh");
-  // MB
-  rnnconf->set_workspace_byte_limit(256);
-  rnn.Setup(Shape{4, 1, 2}, conf);
-
-  EXPECT_EQ(2u, rnn.hiddenSize());
-  EXPECT_EQ(1u, rnn.numLayers());
-  EXPECT_EQ(0u, rnn.dropout());
-  EXPECT_EQ("cudnn_linear_input", rnn.inputMode());
-  EXPECT_EQ("cudnn_undirectional", rnn.direction());
-  EXPECT_EQ("cudnn_rnn_tanh", rnn.mode());
-  EXPECT_EQ(256u << 20, rnn.workspace_byte_limit());
+  rnn.Setup(Shape{2}, conf);
+  auto weight = rnn.param_values().at(0);
+  EXPECT_EQ(weight.Size(), hidden_dim * (2 + hidden_dim + 2));
 }
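The Forward test that follows checks cuDNN against a hand-unrolled tanh recurrence. Because every weight and bias is set to the same scalar w (wvalue = 0.1) and the dummy hx starts the expected hidden state at zero, all hidden units carry the same value, and each step reduces to h_t[j] = tanh(w * (sum_k x_t[k] + sum_k h_{t-1}[k] + 2)). A standalone sketch of that oracle (hypothetical helper, not in the patch):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Oracle for a single-stack tanh RNN whose weights and biases all equal
    // `w`: every hidden unit then carries the same value at each step.
    std::vector<float> TanhRnnOracle(const std::vector<float>& x,
                                     size_t seq_len, size_t dim,
                                     size_t hidden_dim, float w) {
      std::vector<float> y(seq_len);  // shared per-unit activation per step
      float h = 0.0f;                 // every hidden unit starts at 0
      for (size_t t = 0; t < seq_len; t++) {
        float sum_x = 0.0f;
        for (size_t k = 0; k < dim; k++) sum_x += x[t * dim + k];
        // pre-activation: w * (inputs + hidden units + two biases)
        float a = w * (sum_x + hidden_dim * h + 2.0f);
        h = std::tanh(a);
        y[t] = h;
      }
      return y;
    }

    int main() {
      std::vector<float> x(8, 1.0f);  // the test's input: 4 steps of [1, 1]
      auto y = TanhRnnOracle(x, 4, 2, 4, 0.1f);
      for (float v : y) std::printf("%f\n", v);  // ~0.380, 0.502, 0.538, 0.548
    }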
-TEST(CudnnRNN, Forward) {
+TEST_F(TestCudnnRNN, Forward) {
   auto cuda = std::make_shared<singa::CudaGPU>();
   const size_t seqLength = 4, batchsize = 1, dim = 2;
-  const size_t numLayers = 1, hiddensize = 2, numDirections = 1;
   const float x[seqLength * batchsize * dim] = {1.0f, 1.0f, 1.0f, 1.0f,
                                                 1.0f, 1.0f, 1.0f, 1.0f};
-  singa::Tensor in(singa::Shape{seqLength, batchsize, dim}, cuda);
-  in.CopyDataFromHostPtr(x, seqLength * batchsize * dim);
+  vector<Tensor> inputs;
+  for (size_t i = 0; i < seqLength; i++) {
+    Tensor t(Shape{batchsize, dim}, cuda);
+    t.CopyDataFromHostPtr(x + i * t.Size(), t.Size());
+    inputs.push_back(t);
+  }
 
-
-  const float hx_data[numLayers * batchsize * hiddensize * numDirections] = {1.0f, 1.0f};
-  singa::Tensor hx(singa::Shape{numLayers, batchsize, hiddensize * numDirections}, cuda);
-  hx.CopyDataFromHostPtr(hx_data, numLayers * batchsize * hiddensize * numDirections);
+  singa::Tensor hx;
+  inputs.push_back(hx);
 
-  const float cx_data[numLayers * batchsize * hiddensize * numDirections] = {1.0f, 1.0f};
-  singa::Tensor cx(singa::Shape{numLayers, batchsize, hiddensize * numDirections}, cuda);
-  cx.CopyDataFromHostPtr(cx_data, numLayers * batchsize * hiddensize * numDirections);
-
   CudnnRNN rnn;
-
-  singa::LayerConf conf;
-  singa::RNNConf *rnnconf = conf.mutable_rnn_conf();
-  rnnconf->set_hiddensize(2);
-  rnnconf->set_numlayers(1);
-  rnnconf->set_dropout(0);
-  rnnconf->set_inputmode("cudnn_linear_input");
-  rnnconf->set_direction("cudnn_undirectional");
-  rnnconf->set_mode("cudnn_rnn_tanh");
-  // MB
-  rnnconf->set_workspace_byte_limit(256);
-  rnn.Setup(Shape{4, 1, 2}, conf);
-
-
-  size_t weightSize = rnn.weightSize();
+  rnn.Setup(Shape{dim}, conf);
+  rnn.ToDevice(cuda);
+
+  auto weight = rnn.param_values().at(0);
+  size_t weightSize = weight.Size();
   float we[weightSize];
+  float wvalue = 0.1f;
   for (size_t i = 0; i < weightSize; i++)
-    we[i] = 1.0f;
-  singa::Tensor weight(singa::Shape{weightSize, 1, 1}, cuda);
+    we[i] = wvalue;
   weight.CopyDataFromHostPtr(we, weightSize);
-  rnn.set_weight(weight);
-
-  vector<singa::Tensor> input_array;
-  input_array.push_back(in);
-  input_array.push_back(hx);
-  input_array.push_back(cx);
-  const auto ret = rnn.Forward(singa::kTrain, input_array);
-  // singa::CppCPU host(0, 1);
-  singa::Tensor out1 = ret[0];
-  out1.ToHost();
-  const float *outptr1 = out1.data<float>();
-  EXPECT_EQ(8u, out1.Size());
-  EXPECT_NEAR(1.0f, outptr1[0], 0.0001);  // tanh 6
-  EXPECT_NEAR(1.0f, outptr1[1], 0.0001);
-  EXPECT_NEAR(1.0f, outptr1[2], 0.0001);
-  EXPECT_NEAR(1.0f, outptr1[3], 0.0001);
-  EXPECT_NEAR(1.0f, outptr1[4], 0.0001);
-  EXPECT_NEAR(1.0f, outptr1[5], 0.0001);
-  EXPECT_NEAR(1.0f, outptr1[6], 0.0001);
-  EXPECT_NEAR(1.0f, outptr1[7], 0.0001);
-
-  singa::Tensor hy1 = ret[1];
-  hy1.ToHost();
-  const float *hyptr1 = hy1.data<float>();
-  EXPECT_EQ(2u, hy1.Size());
-  EXPECT_NEAR(1.0f, hyptr1[0], 0.0001);
-  EXPECT_NEAR(1.0f, hyptr1[1], 0.0001);
+
+  const auto ret = rnn.Forward(singa::kEval, inputs);
+  EXPECT_EQ(ret.size(), seqLength + 1);
+  vector<float> hxptr(hidden_dim, 0.0f);
+  for (size_t i = 0; i < seqLength; i++) {
+    auto y = ret[i];
+    y.ToHost();
+    auto yptr = y.data<float>();
+    vector<float> tmp;
+    for (size_t j = 0; j < hidden_dim; j++) {
+      float ty = 0;
+      for (size_t k = 0; k < dim; k++) {
+        ty += x[i * dim + k] * wvalue;
+      }
+      ty += wvalue;
+      for (size_t k = 0; k < hidden_dim; k++) {
+        ty += hxptr[k] * wvalue;
+      }
+      ty += wvalue;
+      ty = tanh(ty);
+      EXPECT_NEAR(ty, yptr[j], 1e-4);
+      tmp.push_back(ty);
+    }
+    std::copy(tmp.begin(), tmp.end(), hxptr.begin());
+  }
 }
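The Backward test below validates cuDNN's gradients with a hand-unrolled backward pass through time under the same all-equal-w parameterization. Writing g_t for the gradient at step t's pre-activation and carry_t for the gradient flowing into h_t from later steps (the carry for the last step is dhy, a dummy tensor here, hence zeros), the checks reduce to:

    g_t            = (dy_t + carry_t) * (1 - y_t * y_t)   // tanh'(a) = 1 - tanh(a)^2
    dx_t[k]        = w * sum_j g_t[j]                     // back through the input weights
    carry_{t-1}[j] = w * sum_j g_t[j]                     // back through the recurrent weights

The dhyptr vector in the loop plays the role of both carry_t and g_t, updated in place from the last step backwards.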
-TEST(CudnnRNN, Backward) {
-  // src_data
+TEST_F(TestCudnnRNN, Backward) {
   auto cuda = std::make_shared<singa::CudaGPU>();
   const size_t seqLength = 4, batchsize = 1, dim = 2;
-  const size_t numLayers = 1, hiddensize = 2, numDirections = 1;
   const float x[seqLength * batchsize * dim] = {1.0f, 1.0f, 1.0f, 1.0f,
                                                 1.0f, 1.0f, 1.0f, 1.0f};
-  singa::Tensor in(singa::Shape{seqLength, batchsize, dim}, cuda);
-  in.CopyDataFromHostPtr(x, seqLength * batchsize * dim);
-  const float hx_data[numLayers * batchsize * hiddensize * numDirections] = {1.0f, 1.0f};
-  singa::Tensor hx(singa::Shape{numLayers, batchsize, hiddensize * numDirections}, cuda);
-  hx.CopyDataFromHostPtr(hx_data, numLayers * batchsize * hiddensize * numDirections);
+  vector<Tensor> inputs;
+  for (size_t i = 0; i < seqLength; i++) {
+    Tensor t(Shape{batchsize, dim}, cuda);
+    t.CopyDataFromHostPtr(x + i * t.Size(), t.Size());
+    inputs.push_back(t);
+  }
 
-  const float cx_data[numLayers * batchsize * hiddensize * numDirections] = {1.0f, 1.0f};
-  singa::Tensor cx(singa::Shape{numLayers, batchsize, hiddensize * numDirections}, cuda);
-  cx.CopyDataFromHostPtr(cx_data, numLayers * batchsize * hiddensize * numDirections);
+  singa::Tensor hx;
+  inputs.push_back(hx);
 
   CudnnRNN rnn;
+  rnn.Setup(Shape{dim}, conf);
+  rnn.ToDevice(cuda);
 
-  singa::LayerConf conf;
-  singa::RNNConf *rnnconf = conf.mutable_rnn_conf();
-  rnnconf->set_hiddensize(2);
-  rnnconf->set_numlayers(1);
-  rnnconf->set_dropout(0);
-  rnnconf->set_inputmode("cudnn_linear_input");
-  rnnconf->set_direction("cudnn_undirectional");
-  rnnconf->set_mode("cudnn_rnn_tanh");
-  // MB
-  rnnconf->set_workspace_byte_limit(256);
-  rnn.Setup(Shape{4, 1, 2}, conf);
-
-  size_t weightSize = rnn.weightSize();
+  auto weight = rnn.param_values().at(0);
+  size_t weightSize = weight.Size();
   float we[weightSize];
+  float wvalue = 0.1f;
   for (size_t i = 0; i < weightSize; i++)
-    we[i] = 1.0f;
-  singa::Tensor weight(singa::Shape{weightSize, 1, 1}, cuda);
+    we[i] = wvalue;
   weight.CopyDataFromHostPtr(we, weightSize);
-  rnn.set_weight(weight);
-
-
-  vector<singa::Tensor> input_array;
-  input_array.push_back(in);
-  input_array.push_back(hx);
-  input_array.push_back(cx);
-  const auto ret = rnn.Forward(singa::kTrain, input_array);
-
-  // grad
-  const float dy[seqLength * batchsize * hiddensize * numDirections] = {1.0f, 1.0f, 1.0f, 1.0f,
-                                                                        1.0f, 1.0f, 1.0f, 1.0f};
-  singa::Tensor grad(singa::Shape{seqLength, batchsize, hiddensize * numDirections}, cuda);
-  grad.CopyDataFromHostPtr(dy, seqLength * batchsize * hiddensize * numDirections);
-
-  const float dhy_data[numLayers * batchsize * hiddensize * numDirections] = {1.0f, 1.0f};
-  singa::Tensor dhy(singa::Shape{numLayers, batchsize, hiddensize * numDirections}, cuda);
-  dhy.CopyDataFromHostPtr(dhy_data, numLayers * batchsize * hiddensize * numDirections);
-
-  const float dcy_data[numLayers * batchsize * hiddensize * numDirections] = {1.0f, 1.0f};
-  singa::Tensor dcy(singa::Shape{numLayers, batchsize, hiddensize * numDirections}, cuda);
-  dcy.CopyDataFromHostPtr(dcy_data, numLayers * batchsize * hiddensize * numDirections);
-
-  vector<singa::Tensor> grad_array;
-  grad_array.push_back(grad);
-  grad_array.push_back(dhy);
-  grad_array.push_back(dcy);
-  const auto ret_back = rnn.Backward(singa::kTrain, grad_array);
-  // singa::CppCPU host(0, 1);
-  singa::Tensor in_grad = ret_back.first[0];
-  in_grad.ToHost();
-  const float *dx = in_grad.data<float>();
-  EXPECT_EQ(8u, in_grad.Size());
-  EXPECT_NEAR(0.14, dx[0], 0.0001);
-  EXPECT_NEAR(0.14, dx[1], 0.0001);
-  EXPECT_NEAR(0.1596, dx[2], 0.0001);
-  EXPECT_NEAR(0.1596, dx[3], 0.0001);
-  EXPECT_NEAR(0.1623, dx[4], 0.0001);
-  EXPECT_NEAR(0.1623, dx[5], 0.0001);
-  EXPECT_NEAR(0.1627, dx[6], 0.0001);
-  EXPECT_NEAR(0.1627, dx[7], 0.0001);
-
-  singa::Tensor dhx_grad = ret_back.first[1];
-  dhx_grad.ToHost();
-  const float *dhx = dhx_grad.data<float>();
-  EXPECT_EQ(2u, dhx_grad.Size());
-  EXPECT_NEAR(0.1627, dhx[0], 0.0001);
-  EXPECT_NEAR(0.1627, dhx[1], 0.0001);
+
+  const auto outs = rnn.Forward(singa::kTrain, inputs);
+
+  float dyptr[seqLength * batchsize * hidden_dim];
+  for (size_t i = 0; i < seqLength * batchsize * hidden_dim; i++)
+    dyptr[i] = i * 0.1f;
+  vector<Tensor> grads;
+  for (size_t i = 0; i < seqLength; i++) {
+    Tensor dy(Shape{batchsize, hidden_dim}, cuda);
+    dy.CopyDataFromHostPtr(dyptr + i * dy.Size(), dy.Size());
+    grads.push_back(dy);
+  }
+  Tensor dhy;
+  grads.push_back(dhy);
+  vector<float> dhyptr(hidden_dim, 0.0f);
+  const auto ret = rnn.Backward(singa::kTrain, grads);
+  for (size_t i = seqLength - 1; i > 0; i--) {
+    auto dx = ret.first[i];
+    auto y = outs[i].Clone();
+    y.ToHost();
+    dx.ToHost();
+    auto dxptr = dx.data<float>();
+    auto yptr = y.data<float>();
+    for (size_t j = 0; j < hidden_dim; j++) {
+      dhyptr[j] += dyptr[i * hidden_dim + j];
+      dhyptr[j] *= 1 - yptr[j] * yptr[j];
+    }
+    for (size_t k = 0; k < dim; k++) {
+      float tdx = 0;
+      for (size_t j = 0; j < hidden_dim; j++) {
+        tdx += dhyptr[j] * wvalue;
+      }
+      EXPECT_NEAR(tdx, dxptr[k], 1e-4);
+    }
+    vector<float> tmp;
+    for (size_t k = 0; k < hidden_dim; k++) {
+      float tdhy = 0;
+      for (size_t j = 0; j < hidden_dim; j++) {
+        tdhy += dhyptr[j] * wvalue;
+      }
+      tmp.push_back(tdhy);
+    }
+    std::copy(tmp.begin(), tmp.end(), dhyptr.begin());
+  }
 }
 #endif  // USE_CUDNN
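Putting the pieces together, the call sequence the new API expects looks as follows. This is a sketch condensed from the two tests above, not code from the patch; the include paths and the helper name RunOnce are illustrative:

    #include <memory>
    #include <vector>
    #include "singa/core/device.h"
    #include "../src/model/layer/cudnn_rnn.h"  // path assumed, as in the tests

    using singa::Shape;
    using singa::Tensor;

    // conf is a LayerConf carrying an RNNConf, filled as in TestCudnnRNN::SetUp().
    void RunOnce(const singa::LayerConf& conf, size_t seq_len, size_t batch,
                 size_t dim, size_t hidden_dim) {
      auto cuda = std::make_shared<singa::CudaGPU>();
      singa::CudnnRNN rnn;
      rnn.Setup(Shape{dim}, conf);
      rnn.ToDevice(cuda);  // required: the weight must live on the GPU

      Tensor weight = rnn.param_values().at(0);
      weight.SetValue(0.1f);  // initialize the packed parameter tensor

      // Forward: one tensor per step, then the initial hidden state;
      // a default-constructed Tensor is a dummy, as in the tests above.
      std::vector<Tensor> inputs;
      for (size_t i = 0; i < seq_len; i++) {
        Tensor x(Shape{batch, dim}, cuda);
        x.SetValue(1.0f);
        inputs.push_back(x);
      }
      inputs.push_back(Tensor());  // dummy hx
      auto outputs = rnn.Forward(singa::kTrain, inputs);

      // Backward: one gradient per step, then dhy (a dummy is again allowed).
      std::vector<Tensor> grads;
      for (size_t i = 0; i < seq_len; i++) {
        Tensor dy(Shape{batch, hidden_dim}, cuda);
        dy.SetValue(0.1f);
        grads.push_back(dy);
      }
      grads.push_back(Tensor());  // dummy dhy
      auto ret = rnn.Backward(singa::kTrain, grads);
      // ret.first holds the input gradients plus dhx; ret.second the
      // parameter gradients, per the Backward stub in rnn.cc above.
    }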
