SINGA-173 OpenCL device support and implementation Implemented OpenclDevice Created Opencl kernels for usage Added Opencl support to the build process Added Opencl support to Travis Unit test additions and bug fixes
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/35c89308 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/35c89308 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/35c89308 Branch: refs/heads/dev Commit: 35c89308588f175bbe9bbe6b5d3f6b82e410e2c9 Parents: 4e7f3c1 Author: Tan Li Boon <[email protected]> Authored: Sun Jul 10 21:02:08 2016 +0800 Committer: Tan Li Boon <[email protected]> Committed: Tue Jul 26 18:39:08 2016 +0800 ---------------------------------------------------------------------- .travis.yml | 3 + CMakeLists.txt | 1 + cmake/Dependencies.cmake | 16 + cmake/Templates/singa_config.h.in | 2 + cmake/Thirdparty/FindOpenCL.cmake | 170 +++++ include/singa/core/common.h | 18 + include/singa/core/device.h | 52 +- include/singa/core/opencl_device.h | 132 ++++ include/singa/core/platform.h | 105 +++ include/singa/utils/context.h | 28 +- include/singa/utils/opencl_utils.h | 144 ++++ src/core/device/cpp_cpu.cc | 11 + src/core/device/opencl_device.cc | 227 +++++- src/core/device/platform.cc | 4 + src/core/tensor/tensor_math_opencl.cl | 569 +++++++++++++++ src/core/tensor/tensor_math_opencl.h | 1074 +++++++++++++++++++++++++++- src/utils/opencl_utils.cc | 63 ++ test/singa/test_opencl.cc | 179 +++++ 18 files changed, 2734 insertions(+), 64 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/.travis.yml ---------------------------------------------------------------------- diff --git a/.travis.yml b/.travis.yml index effde83..f7434d8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,9 @@ dist: trusty before_install: - sudo apt-get -qq update - sudo apt-get install -qq -y libopenblas-dev libgoogle-glog-dev libprotobuf-dev protobuf-compiler + - sudo apt-get install -qq -y opencl-headers ocl-icd-* + - wget 
https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.9/cl2.hpp + - sudo mv cl2.hpp /usr/include/CL/ #- sudo apt-get install -qq libgtest-dev before_script: http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index d3cd776..bd39a0a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ OPTION(USE_CUDNN "Use Cudnn libs" ON) OPTION(USE_OPENCV "Use opencv" OFF) OPTION(USE_LMDB "Use LMDB libs" OFF) OPTION(USE_PYTHON "Generate py wrappers" ON) +OPTION(USE_OPENCL "Use OpenCL" ON) INCLUDE("cmake/Dependencies.cmake") INCLUDE("cmake/Utils.cmake") http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/cmake/Dependencies.cmake ---------------------------------------------------------------------- diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 3345399..68d0bfc 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -28,6 +28,22 @@ IF(USE_CBLAS) MESSAGE(STATUS "FOUND cblas at ${CBLAS_LIBRARIES}") ENDIF() +IF(USE_OPENCL) + FIND_PACKAGE(OpenCL REQUIRED) + IF(NOT OPENCL_FOUND) + MESSAGE(SEND_ERROR "OpenCL was requested, but not found.") + ELSE() + INCLUDE_DIRECTORIES(SYSTEM ${OpenCL_INCPATH}) + LIST(APPEND SINGA_LINKER_LIBS ${OPENCL_LIBRARIES}) + MESSAGE(STATUS "Found OpenCL at ${OPENCL_INCLUDE_DIRS}") + IF(NOT OPENCL_HAS_CPP_BINDINGS) + MESSAGE(SEND_ERROR "OpenCL C++ bindings cl2.hpp was not found.") + ELSE() + MESSAGE(STATUS "Found OpenCL C++ bindings.") + ENDIF() + ENDIF() +ENDIF() + FIND_PACKAGE(Glog REQUIRED) INCLUDE_DIRECTORIES(SYSTEM ${GLOG_INCLUDE_DIRS}) LIST(APPEND SINGA_LINKER_LIBS ${GLOG_LIBRARIES}) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/cmake/Templates/singa_config.h.in ---------------------------------------------------------------------- diff --git a/cmake/Templates/singa_config.h.in 
b/cmake/Templates/singa_config.h.in index d6d3eeb..0220d18 100644 --- a/cmake/Templates/singa_config.h.in +++ b/cmake/Templates/singa_config.h.in @@ -15,6 +15,8 @@ #cmakedefine USE_CUDNN #cmakedefine CUDNN_VERSION_MAJOR @CUDNN_VERSION_MAJOR@ +#cmakedefine USE_OPENCL + // lmdb #cmakedefine USE_LMDB http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/cmake/Thirdparty/FindOpenCL.cmake ---------------------------------------------------------------------- diff --git a/cmake/Thirdparty/FindOpenCL.cmake b/cmake/Thirdparty/FindOpenCL.cmake new file mode 100644 index 0000000..3c7daeb --- /dev/null +++ b/cmake/Thirdparty/FindOpenCL.cmake @@ -0,0 +1,170 @@ +# This script was taken from https://github.com/elhigu/cmake-findopencl +# and modified to support finding OpenCL 2.x C++ bindings. + +# Find OpenCL +# +# To set manually the paths, define these environment variables: +# OpenCL_INCPATH - Include path (e.g. OpenCL_INCPATH=/opt/cuda/4.0/cuda/include) +# OpenCL_LIBPATH - Library path (e.h. 
OpenCL_LIBPATH=/usr/lib64/nvidia) +# +# Once done this will define +# OPENCL_FOUND - system has OpenCL +# OPENCL_INCLUDE_DIRS - the OpenCL include directory +# OPENCL_LIBRARIES - link these to use OpenCL +# OPENCL_HAS_CPP_BINDINGS - system has also cl2.hpp + +FIND_PACKAGE(PackageHandleStandardArgs) + +SET (OPENCL_VERSION_STRING "0.1.0") +SET (OPENCL_VERSION_MAJOR 0) +SET (OPENCL_VERSION_MINOR 1) +SET (OPENCL_VERSION_PATCH 0) + +IF (APPLE) + + # IF OpenCL_LIBPATH is given use it and don't use default path + IF (DEFINED ENV{OpenCL_LIBPATH}) + FIND_LIBRARY(OPENCL_LIBRARIES OpenCL PATHS ENV OpenCL_LIBPATH NO_DEFAULT_PATH) + ELSE () + FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX") + ENDIF () + + # IF OpenCL_INCPATH is given use it and find for CL/cl.h and OpenCL/cl.h do not try to find default paths + IF (DEFINED ENV{OpenCL_INCPATH}) + FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h OpenCL/cl.h PATHS ENV OpenCL_INCPATH NO_DEFAULT_PATH) + FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl2.hpp OpenCL/cl2.hpp PATHS ${OPENCL_INCLUDE_DIRS} NO_DEFAULT_PATH) + ELSE () + FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX") + FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS OpenCL/cl2.hpp DOC "Include for OpenCL CPP bindings on OSX") + ENDIF () + +ELSE (APPLE) + + IF (WIN32) + + # Find OpenCL includes and libraries from environment variables provided by vendor + SET(OPENCL_INCLUDE_SEARCH_PATHS) + SET(OPENCL_LIBRARY_SEARCH_PATHS) + SET(OPENCL_LIBRARY_64_SEARCH_PATHS) + + # Nvidia + IF (DEFINED ENV{CUDA_INC_PATH}) + SET(OPENCL_INCLUDE_SEARCH_PATHS ${OPENCL_INCLUDE_SEARCH_PATHS} $ENV{CUDA_INC_PATH}) + SET(OPENCL_LIBRARY_64_SEARCH_PATHS ${OPENCL_LIBRARY_64_SEARCH_PATHS} $ENV{CUDA_LIB_PATH}/../lib64) + SET(OPENCL_LIBRARY_SEARCH_PATHS ${OPENCL_LIBRARY_SEARCH_PATHS} $ENV{CUDA_LIB_PATH}/../lib) + ENDIF() + + # Intel SDK + IF (DEFINED ENV{INTELOCSDKROOT}) + SET(OPENCL_INCLUDE_SEARCH_PATHS ${OPENCL_INCLUDE_SEARCH_PATHS} $ENV{INTELOCSDKROOT}/include) + 
SET(OPENCL_LIBRARY_64_SEARCH_PATHS ${OPENCL_LIBRARY_64_SEARCH_PATHS} $ENV{INTELOCSDKROOT}/lib/x64) + SET(OPENCL_LIBRARY_SEARCH_PATHS ${OPENCL_LIBRARY_SEARCH_PATHS} $ENV{INTELOCSDKROOT}/lib/x86) + ENDIF() + + # AMD SDK + IF (DEFINED ENV{AMDAPPSDKROOT}) + SET(OPENCL_INCLUDE_SEARCH_PATHS ${OPENCL_INCLUDE_SEARCH_PATHS} $ENV{AMDAPPSDKROOT}/include) + SET(OPENCL_LIBRARY_64_SEARCH_PATHS ${OPENCL_LIBRARY_64_SEARCH_PATHS} $ENV{AMDAPPSDKROOT}/lib/x86_64) + SET(OPENCL_LIBRARY_SEARCH_PATHS ${OPENCL_LIBRARY_SEARCH_PATHS} $ENV{AMDAPPSDKROOT}/lib/x86) + ENDIF() + + # Override search paths with OpenCL_INCPATH env variable + IF (DEFINED ENV{OpenCL_INCPATH}) + SET(OPENCL_INCLUDE_SEARCH_PATHS $ENV{OpenCL_INCPATH}) + ENDIF () + + # Override search paths with OpenCL_LIBPATH env variable + IF (DEFINED ENV{OpenCL_INCPATH}) + SET(OPENCL_LIBRARY_SEARCH_PATHS $ENV{OpenCL_LIBPATH}) + SET(OPENCL_LIBRARY_64_SEARCH_PATHS $ENV{OpenCL_LIBPATH}) + ENDIF () + + FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS ${OPENCL_INCLUDE_SEARCH_PATHS}) + FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl2.hpp PATHS ${OPENCL_INCLUDE_SEARCH_PATHS}) + + FIND_LIBRARY(_OPENCL_32_LIBRARIES OpenCL.lib HINTS ${OPENCL_LIBRARY_SEARCH_PATHS} PATHS ${OPENCL_LIB_DIR} ENV PATH) + FIND_LIBRARY(_OPENCL_64_LIBRARIES OpenCL.lib HINTS ${OPENCL_LIBRARY_64_SEARCH_PATHS} PATHS ${OPENCL_LIB_DIR} ENV PATH) + + # Check if 64bit or 32bit versions links fine + SET (_OPENCL_VERSION_SOURCE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/openclversion.c") + #SET (_OPENCL_VERSION_SOURCE "${CMAKE_BINARY_DIR}/test.c") + FILE (WRITE "${_OPENCL_VERSION_SOURCE}" + " + #if __APPLE__ + #include <OpenCL/cl.h> + #else /* !__APPLE__ */ + #include <CL/cl.h> + #endif /* __APPLE__ */ + int main() + { + cl_int result; + cl_platform_id id; + result = clGetPlatformIDs(1, &id, NULL); + return result != CL_SUCCESS; + } + ") + + TRY_COMPILE(_OPENCL_64_COMPILE_SUCCESS ${CMAKE_BINARY_DIR} "${_OPENCL_VERSION_SOURCE}" + CMAKE_FLAGS + 
"-DINCLUDE_DIRECTORIES:STRING=${OPENCL_INCLUDE_DIRS}" + CMAKE_FLAGS + "-DLINK_LIBRARIES:STRING=${_OPENCL_64_LIBRARIES}" + ) + + IF(_OPENCL_64_COMPILE_SUCCESS) + message(STATUS "OpenCL 64bit lib found.") + SET(OPENCL_LIBRARIES ${_OPENCL_64_LIBRARIES}) + ELSE() + TRY_COMPILE(_OPENCL_32_COMPILE_SUCCESS ${CMAKE_BINARY_DIR} "${_OPENCL_VERSION_SOURCE}" + CMAKE_FLAGS + "-DINCLUDE_DIRECTORIES:STRING=${OPENCL_INCLUDE_DIRS}" + CMAKE_FLAGS + "-DLINK_LIBRARIES:STRING=${_OPENCL_32_LIBRARIES}" + ) + IF(_OPENCL_32_COMPILE_SUCCESS) + message(STATUS "OpenCL 32bit lib found.") + SET(OPENCL_LIBRARIES ${_OPENCL_32_LIBRARIES}) + ELSE() + message(STATUS "Couldn't link opencl..") + ENDIF() + ENDIF() + + + ELSE (WIN32) + + IF (CYGWIN) + SET (CMAKE_FIND_LIBRARY_SUFFIXES .lib) + SET (OCL_LIB_SUFFIX .lib) + ENDIF (CYGWIN) + + # Unix style platforms + FIND_LIBRARY(OPENCL_LIBRARIES OpenCL${OCL_LIB_SUFFIX} + PATHS ENV LD_LIBRARY_PATH ENV OpenCL_LIBPATH + ) + + GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH) + GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE) + + # The AMD SDK currently does not place its headers + # in /usr/include, therefore also search relative + # to the library + FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include" "/opt/AMDAPP/include" ENV OpenCL_INCPATH) + FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl2.hpp PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include" "/opt/AMDAPP/include" ENV OpenCL_INCPATH) + + ENDIF (WIN32) + +ENDIF (APPLE) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS(OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS) + +IF(_OPENCL_CPP_INCLUDE_DIRS) + SET( OPENCL_HAS_CPP_BINDINGS TRUE ) + LIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} ) + # This is often the same, so clean up + LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS ) +ENDIF(_OPENCL_CPP_INCLUDE_DIRS) + +MARK_AS_ADVANCED( + OPENCL_INCLUDE_DIRS +) + 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/include/singa/core/common.h ---------------------------------------------------------------------- diff --git a/include/singa/core/common.h b/include/singa/core/common.h index 0a52a05..9586286 100644 --- a/include/singa/core/common.h +++ b/include/singa/core/common.h @@ -32,9 +32,20 @@ #ifdef USE_CUDNN #include <cudnn.h> #endif +#endif // USE_CUDA + + +#ifdef USE_OPENCL +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#include <CL/cl2.hpp> +#include <map> #endif + using std::atomic; + namespace singa { + namespace lang { /// To implemente functions using cpp libraries typedef struct _Cpp { } Cpp; @@ -85,7 +96,14 @@ typedef struct _Context { #ifdef USE_CUDNN cudnnHandle_t cudnn_handle; #endif +#endif // USE_CUDA + +#ifdef USE_OPENCL + std::shared_ptr<std::map<std::string, cl::Kernel>> kernels; + cl::CommandQueue ocl_cmdq; + cl::Context ocl_ctx; #endif + } Context; } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/include/singa/core/device.h ---------------------------------------------------------------------- diff --git a/include/singa/core/device.h b/include/singa/core/device.h index e0e8676..4c775a1 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -34,7 +34,9 @@ using std::vector; using std::string; using std::function; using std::shared_ptr; + namespace singa { + /// Allocate memory and execute Tensor operations. /// There are three types of devices distinguished by their programming /// languages, namely cpp, cuda and opencl. @@ -187,56 +189,6 @@ class CudaGPU : public Device { #endif // USE_CUDA -/// For querying physical devices and creating singa::Device instances. -class Platform { - public: - /// Return the number of total avaiable GPUs - static int GetNumGPUs(); - - /// Return the device IDs of available GPUs. 
- /// TODO(wangwei) return the IDs according to free memory in decending order - static const vector<int> GetGPUIDs(); - - static const std::pair<size_t, size_t> GetGPUMemSize(const int device); - /// Return the memory of a GPU <free, total> - static const vector<std::pair<size_t, size_t>> GetGPUMemSize(); - - /// Return a string containing all hardware info, e.g., version, memory size. - static const string DeviceQuery(int id, bool verbose = false); - - /// Create a set of CudaGPU Device using 'num_devices' free GPUs. - static const vector<shared_ptr<Device> > - CreateCudaGPUs(const size_t num_devices, size_t init_size = 0); - - /// Create a set of CudaGPU Device using given GPU IDs. - static const vector<shared_ptr<Device> > - CreateCudaGPUs(const vector<int> &devices, size_t init_size = 0); - - /// Create a set of OpenclGPU Device using 'num_devices' free GPUs. - const vector<shared_ptr<Device>> CreateOpenclGPUs(const size_t num_devices); - - /// Create a set of OpenclGPU Device using given GPU IDs. - const vector<shared_ptr<Device>> CreateOpenclGPUs(const vector<int>& id); - /// This function is implementd by Caffe (http://caffe.berkeleyvision.org/). - /// This function checks the availability of GPU #device_id. - /// It attempts to create a context on the device by calling cudaFree(0). - /// cudaSetDevice() alone is not sufficient to check the availability. - /// It lazily records device_id, however, does not initialize a - /// context. So it does not know if the host thread has the permission to use - /// the device or not. - /// - /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS - /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess - /// even if the device is exclusively occupied by another process or thread. - /// Cuda operations that initialize the context are needed to check - /// the permission. cudaFree(0) is one of those with no side effect, - /// except the context initialization. 
- static bool CheckDevice(const int device_id); - -// private: - Platform() {}; // No need to construct an instance as it has no member fields -}; - } // namespace singa #endif // SINGA_CORE_DEVICE_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/include/singa/core/opencl_device.h ---------------------------------------------------------------------- diff --git a/include/singa/core/opencl_device.h b/include/singa/core/opencl_device.h new file mode 100644 index 0000000..14b6fe7 --- /dev/null +++ b/include/singa/core/opencl_device.h @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SINGA_CORE_OPENCL_DEVICE_H_ +#define SINGA_CORE_OPENCL_DEVICE_H_ + +#include "singa/core/device.h" + +#ifdef USE_OPENCL +// http://github.khronos.org/OpenCL-CLHPP/ +// cl2.hpp includes cl.h, do not re-include. +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#include <map> +#include <memory> +#include <CL/cl2.hpp> + +#include "singa/utils/opencl_utils.h" + +namespace singa { + +// Implement Device using OpenCL libs. +class OpenclDevice : public singa::Device { +public: + + // TODO: Constructor arguments to consider: + // Path to kernel sources? 
+ // Select only certain device types? + OpenclDevice(int id = 0, int num_executors = 1); + ~OpenclDevice(); + + /// Get the specified kernel. + cl::Kernel GetKernel(const std::string& kname, cl_int* status = nullptr); + + /// Get the command queue associated with this device. + cl::CommandQueue GetCmdQ() { return cmdq; } + + /// Prints information about all Devices in each Platform. + void PrintAllDeviceInfo(); + + /// Prints status about CL source code builds. + void PrintClBuildInfo(cl::Program &p); + +// Overridden, inherited methods + void SetRandSeed(unsigned seed) override; + + void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, + CopyDirection direction, int dst_offset = 0, + int src_offset = 0); +/* + void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes = 0, + size_t dst_offset = 0) override;*/ + +protected: + /// The OpenCL device that this object represents. + /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the + /// object. + cl::Device this_device; + + /// Each OpenclDevice has one OpenCL context. It is created along with the + /// creation of this object. + cl::Context ocl_ctx; + + /// The CommandQueue that is associated with this device. + /// Since each OpenclDevice contains only one cl::Device and one cl::Context, + /// it naturally also contains one cl::CommandQueue that is associated + /// with said Device and Context. + cl::CommandQueue cmdq; + + /// A list of kernels that has been compiled on this device. + std::shared_ptr<std::map<std::string, cl::Kernel>> kernels; + + /// Searches the given paths for all .cl files and builds + /// OpenCL programs, then stores them in the Kernels map. + void BuildPrograms(const std::string &kdir = cl_src_path); + +// Overridden, inherited methods. 
+ + void DoExec(function<void(Context*)>&& fn, int executor) override; + + void CopyToFrom(void* dst, const void* src, size_t nBytes, + CopyDirection direction, Context* ctx = nullptr) override; + + /// Allocates memory on this OpenCL device + /// by creating and returning an empty cl::Buffer object. + /// with the indicated size. + void* Malloc(int size) override; + + /// Converts the void pointer into a Buffer object, then deletes the object. + /// This has the effect of freeing up device memory. + void Free(void* ptr) override; + +private: + + /// Copies a data block from host to device. + /// src: a pointer to an array of data. + /// dst: a pointer to a cl::Buffer object. + void WriteToDevice(cl::Buffer* dst, const void* src, const size_t size); + + /// Reads a data block from device to host. + /// src: a pointer to an cl::Buffer object. + /// dst: a pointer to an malloc'ed empty array. + void ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size); + + /// Duplicates a block of data on the device. + /// src: a pointer to the original cl::Buffer object. + /// dst: a pointer to the new cl::Buffer object to copy the data into. + void CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size); + + static const std::string cl_src_path; +}; + +} // namespace singa + +#endif // USE_OPENCL + +#endif // SINGA_CORE_OPENCL_DEVICE_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/include/singa/core/platform.h ---------------------------------------------------------------------- diff --git a/include/singa/core/platform.h b/include/singa/core/platform.h new file mode 100644 index 0000000..ff1bbea --- /dev/null +++ b/include/singa/core/platform.h @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SINGA_CORE_PLATFORM_H_ +#define SINGA_CORE_PLATFORM_H_ + +#include <memory> +#include <vector> + +#include "singa/core/device.h" +#include "singa/singa_config.h" + +#ifdef USE_CUDA +#include "singa/utils/cuda_utils.h" +#endif // USE_CUDA + +#ifdef USE_OPENCL +#include <cl/cl2.hpp> +#endif // USE_OPENCL + +namespace singa { + +/// This class queries all available calculating devices on a given machine +/// grouped according to manufacturer or device drivers. All methods should be static. +/// If CUDA or OPENCL are not enabled, then the respective related methods should +/// return something that indicates their absence (for example, 0 devices); +/// however they should always be available regardless of compile-time switches. +class Platform { +public: + + /// Constructor. + Platform(); + + /// Return the number of total available GPUs + static int GetNumGPUs(); + + /// Return the device IDs of available GPUs. + /// TODO(wangwei) return the IDs according to free memory in decending order + static const std::vector<int> GetGPUIDs(); + + static const std::pair<size_t, size_t> GetGPUMemSize(const int device); + + /// Return the memory of a GPU <free, total> + static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize(); + + /// Return a string containing all hardware info, e.g., version, memory size. 
+ static const std::string DeviceQuery(int id, bool verbose = false); + + /// Create a set of CudaGPU Device using 'num_devices' free GPUs. + static const std::vector<std::shared_ptr<Device>> + CreateCudaGPUs(const size_t num_devices, size_t init_size = 0); + + /// Create a set of CudaGPU Device using given GPU IDs. + static const std::vector<std::shared_ptr<Device>> + CreateCudaGPUs(const std::vector<int> &devices, size_t init_size = 0); + + /// Create a \p num_devices set of valid OpenCL devices, regardless of platforms. + /// If there are fewer valid devices than requested, then this method will return as many as possible. + /// If OpenCL is not in use, this method will return an empty array. + const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const size_t num_devices); + + /// Create a set of valid OpenCL devices, regardless of platforms, assigning \p id to each device in sequence. + /// If there are fewer valid devices than requested, then this method will return as many as possible. + /// If OpenCL is not in use, this method will return an empty array. + const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const vector<int>& id); + + /// This function is implementd by Caffe (http://caffe.berkeleyvision.org/). + /// This function checks the availability of GPU #device_id. + /// It attempts to create a context on the device by calling cudaFree(0). + /// cudaSetDevice() alone is not sufficient to check the availability. + /// It lazily records device_id, however, does not initialize a + /// context. So it does not know if the host thread has the permission to use + /// the device or not. + /// + /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS + /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess + /// even if the device is exclusively occupied by another process or thread. + /// Cuda operations that initialize the context are needed to check + /// the permission. 
cudaFree(0) is one of those with no side effect, + /// except the context initialization. + static bool CheckDevice(const int device_id); + + +private: + cl::Platform clPlatform; +}; + +} // namespace singa + +#endif // SINGA_CORE_PLATFORM_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/include/singa/utils/context.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/context.h b/include/singa/utils/context.h index 1d47215..6e897e8 100644 --- a/include/singa/utils/context.h +++ b/include/singa/utils/context.h @@ -47,7 +47,7 @@ CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ #include <cudnn.h> #endif -#endif +#endif // USE_GPU namespace singa { @@ -249,17 +249,8 @@ class Context { cudnnHandle_t cudnn_handle(const int device_id) { CHECK_GE(device_id, 0); CHECK_LT(device_id, cudnn_handle_.size()); - if (cudnn_handle_.at(device_id) == nullptr) { - ActivateDevice(device_id); - // LOG(ERROR) << "create cudnn handle for device " << device_id; - CHECK_EQ(cudnnCreate(&cudnn_handle_[device_id]), CUDNN_STATUS_SUCCESS); - } - // LOG(ERROR) << "use cudnn handle from device " << device_id; - return cudnn_handle_[device_id]; } -#endif - -#endif +#endif // USE_CUDNN protected: //!< max num of GPUs per process @@ -279,9 +270,22 @@ class Context { #ifdef USE_CUDNN std::vector<cudnnHandle_t> cudnn_handle_; #endif -#endif +#endif // USE_GPU }; } // namespace singa #endif // SINGA_UTILS_CONTEXT_H_ + if (cudnn_handle_.at(device_id) == nullptr) { + ActivateDevice(device_id); + // LOG(ERROR) << "create cudnn handle for device " << device_id; + CHECK_EQ(cudnnCreate(&cudnn_handle_[device_id]), CUDNN_STATUS_SUCCESS); + } + // LOG(ERROR) << "use cudnn handle from device " << device_id; + return cudnn_handle_[device_id]; + } +#endif + +#endif // USE_GPU + +#ifdef USE_OPENCL http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/include/singa/utils/opencl_utils.h 
---------------------------------------------------------------------- diff --git a/include/singa/utils/opencl_utils.h b/include/singa/utils/opencl_utils.h new file mode 100644 index 0000000..664a9e1 --- /dev/null +++ b/include/singa/utils/opencl_utils.h @@ -0,0 +1,144 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#ifndef SINGA_UTILS_OPENCL_UTILS_H_ +#define SINGA_UTILS_OPENCL_UTILS_H_ + +#ifdef USE_OPENCL + +#include <iostream> + +// http://github.khronos.org/OpenCL-CLHPP/ +// cl2.hpp includes cl.h, do not re-include. 
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#include <CL/cl2.hpp> + +#define CL_BREAK_ON_FAILURE if (status != CL_SUCCESS) return; + + +inline const char* clGetBuildInfoString(const cl_build_status status) { + switch (status) { + case CL_BUILD_NONE: return "CL_BUILD_NONE"; + case CL_BUILD_ERROR: return "CL_BUILD_ERROR"; + case CL_BUILD_SUCCESS: return "CL_BUILD_SUCCESS"; + case CL_BUILD_IN_PROGRESS: return "CL_BUILD_IN_PROGRESS"; + default: return ""; + } +} + + +inline const char* clGetErrorString(const cl_int status) { + + switch(status) { + + // Run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // Compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case 
-39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // Extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + + default: return "Unknown OpenCL status"; + } +} + + +/// Special function used to perform error checking and logging. +inline bool OCL_CHECK(const cl_int status, const char* what) { + if (status == CL_SUCCESS) return true; // Nothing wrong. 
+ LOG(ERROR) << status << ": " << clGetErrorString(status) << " " << what << std::endl; + return false; +} + +/// Prints information about the specified Platform. +void PrintPlatformInfo(const cl::Platform &p); + +/// Prints information about the specified Device. +void PrintDeviceInfo(const cl::Device &dev); + +#endif // USE_OPENCL + +#endif // SINGA_UTILS_OPENCL_UTILS_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/src/core/device/cpp_cpu.cc ---------------------------------------------------------------------- diff --git a/src/core/device/cpp_cpu.cc b/src/core/device/cpp_cpu.cc index 401645d..2b3e63b 100644 --- a/src/core/device/cpp_cpu.cc +++ b/src/core/device/cpp_cpu.cc @@ -15,22 +15,30 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include "singa/core/device.h" + namespace singa { + std::shared_ptr<Device> defaultDevice=std::make_shared<CppCPU>(); + CppCPU::CppCPU() : Device(0, 1) { lang_ = kCpp; //host_ = nullptr; } + void CppCPU::SetRandSeed(unsigned seed) { ctx_.random_generator.seed(seed); } + + void CppCPU::DoExec(function<void(Context*)>&& fn, int executor) { CHECK_EQ(executor, 0); fn(&ctx_); } + void* CppCPU::Malloc(int size) { if (size > 0) { void *ptr = malloc(size); @@ -41,13 +49,16 @@ void* CppCPU::Malloc(int size) { } } + void CppCPU::Free(void* ptr) { if (ptr != nullptr) free(ptr); } + void CppCPU::CopyToFrom(void* dst, const void* src, size_t nBytes, CopyDirection direction, Context* ctx) { memcpy(dst, src, nBytes); } + } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/src/core/device/opencl_device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc index 76c646e..053ac4f 100644 --- a/src/core/device/opencl_device.cc +++ b/src/core/device/opencl_device.cc @@ -15,10 +15,235 @@ * See the License for the specific 
language governing permissions and * limitations under the License. */ -#include "singa/core/device.h" + +#include <iostream> +#include <fstream> +#include <sstream> +#include <string> + +#include "singa/core/opencl_device.h" +#include "singa/utils/tinydir.h" + +#ifdef USE_OPENCL + +using std::string; + namespace singa { +const string OpenclDevice::cl_src_path = "../src/core/tensor"; + +OpenclDevice::OpenclDevice(int id, int num_executors) + : Device(id, num_executors) { + lang_ = kOpencl; + this->kernels = std::make_shared<std::map<std::string, cl::Kernel>>(); + + // Create the OpenCL Device, Context, and CommandQueue. + /// TODO: This merely chooses the first device on the first platform. + cl_int status = CL_SUCCESS; + + std::vector<cl::Platform> platforms; + status = cl::Platform::get(&platforms); + OCL_CHECK(status, "Failed to find any OpenCL platforms!"); + + std::vector<cl::Device> devices; + status = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices); + OCL_CHECK(status, "Failed to get list of devices from platform!"); + + this->this_device = cl::Device(devices[0]); + this->ocl_ctx = cl::Context(this_device, nullptr, nullptr, nullptr, &status); + OCL_CHECK(status, "Failed to create context!"); + this->cmdq = cl::CommandQueue(ocl_ctx, this_device, CL_QUEUE_PROFILING_ENABLE, &status); + OCL_CHECK(status, "Failed to create a command queue!"); + BuildPrograms(); + + ctx_.kernels = kernels; + ctx_.ocl_cmdq = cmdq; + ctx_.ocl_ctx = ocl_ctx; } + + +OpenclDevice::~OpenclDevice() { + + // Flush and finish the command queue. 
+ cmdq.flush(); + cmdq.finish(); +} + + +cl::Kernel OpenclDevice::GetKernel(const std::string& kname, cl_int* status) { + if (!status) *status = CL_SUCCESS; + if (kernels->find(kname) == kernels->end()) { + // TODO: Not found + LOG(ERROR) << "Error: Kernel " << kname << " could not be found!"; + if (!status) *status = CL_INVALID_KERNEL; + } + return kernels->at(kname); +} + +/* +void OpenclDevice::PrintAllDeviceInfo() { + cl_int status = CL_SUCCESS; + + for (auto dev : devices) { + PrintDeviceInfo(d); + } +} +*/ + + +void OpenclDevice::PrintClBuildInfo(cl::Program &p) { + cl_int status = CL_SUCCESS; + + auto buildStatus = p.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(&status); + for (auto pair : buildStatus) + std::cout << clGetBuildInfoString(pair.second) << std::endl; + + auto buildLog = p.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&status); + for (auto pair : buildLog) + std::cout << pair.second << std::endl; +} + + +void OpenclDevice::SetRandSeed(unsigned seed) { seed = seed; } + + +void OpenclDevice::CopyDataToFrom(Block* dst, Block* src, size_t nBytes, + CopyDirection direction, int dst_offset, int src_offset) { + // Pointers must be valid. 
+ if (!dst || !src) return; + + CopyToFrom(dst->mutable_data(), src->data(), nBytes, direction); +} + +/* +void OpenclDevice::CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes, size_t dst_offset) { + CopyToFrom(dst->mutable_data(), src, 4, kHostToDevice); +} +*/ + +void OpenclDevice::BuildPrograms(const std::string &kdir) { + cl_int status = CL_SUCCESS; + + tinydir_dir dir; + tinydir_open(&dir, kdir.c_str()); + + while (dir.has_next) { + tinydir_file file; + tinydir_readfile(&dir, &file); + std::string ext(file.extension); + if (ext.compare("cl") != 0) { + tinydir_next(&dir); + continue; + } + + std::ifstream clFile(file.path, std::ios_base::binary); + std::stringstream buffer; + buffer << clFile.rdbuf(); + std::string clSrc(buffer.str()); + + cl::Program program(this->ocl_ctx, clSrc, false, &status); + OCL_CHECK(status, "Program creation failed."); + status = program.build({this_device}, "-cl-std=CL1.2"); + if (status == CL_SUCCESS) { + std::vector<cl::Kernel> built_kernels; + status = program.createKernels(&built_kernels); + OCL_CHECK(status, "Failed to create kernels in built program."); + + for (auto k : built_kernels) { + std::string name = k.getInfo<CL_KERNEL_FUNCTION_NAME>(&status); + this->kernels->insert(std::make_pair(name, k)); + } + } else { + OCL_CHECK(status, "Build failed on source path"); + LOG(ERROR) << file.path << std::endl; + PrintClBuildInfo(program); + } + + tinydir_next(&dir); + } +} + +// Device IO functions. +// TODO: +// Research - MapBuffers can improve performance when the device uses shared memory +// but is more complex to understand. http://stackoverflow.com/questions/22057692/whats-the-difference-between-clenqueuemapbuffer-and-clenqueuewritebuffer +// Intel graphics (and possibly AMD APUs) should use MapBuffers? 
+// https://software.intel.com/en-us/articles/getting-the-most-from-opencl-12-how-to-increase-performance-by-minimizing-buffer-copies-on-intel-processor-graphics + + +void OpenclDevice::DoExec(function<void(Context*)>&& fn, int executor) { + fn(&ctx_); +} + +// NOTE: ASSUMES dst AND/OR src POINTERS CAN BE CAST TO cl::Buffer POINTERS! +void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes, + CopyDirection direction, Context* ctx) { + // Pointers must be valid. + if (!dst || !src) return; + + switch(direction) { + case kHostToDevice: { + WriteToDevice(static_cast<cl::Buffer*>(dst), src, nBytes); + return; + } + case kDeviceToHost: { + ReadFromDevice(dst, static_cast<const cl::Buffer*>(src), nBytes); + return; + } + case kDeviceToDevice: { + CopyDeviceBuffer(static_cast<cl::Buffer*>(dst), static_cast<const cl::Buffer*>(src), nBytes); + return; + } + default: + return; + } +} + + +void* OpenclDevice::Malloc(int size) { + cl_int status = CL_SUCCESS; + + cl::Buffer* buffer = new cl::Buffer(ocl_ctx, CL_MEM_READ_WRITE, size, nullptr, &status); + OCL_CHECK(status, "Unable to allocate memory in OpenCL device."); + + return static_cast<void*>(buffer); +} + + +void OpenclDevice::Free(void* p) { + if (!p) return; + cl::Buffer* buffer = static_cast<cl::Buffer*>(p); + delete buffer; +} + + +void OpenclDevice::WriteToDevice(cl::Buffer* dst, const void* src, const size_t size) { + cl_int status = CL_SUCCESS; + + status = cmdq.enqueueWriteBuffer(*dst, CL_TRUE, 0, size, src); + OCL_CHECK(status, "Unable to write data to OpenCL device."); +} + + +void OpenclDevice::ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size) { + cl_int status = CL_SUCCESS; + + status = cmdq.enqueueReadBuffer(*src, CL_TRUE, 0, size, dst); + OCL_CHECK(status, "Unable to read data from OpenCL device."); +} + + +// dst: cl::Buffer pointer src: cl::Buffer pointer +void OpenclDevice::CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size) { + cl_int status = 
CL_SUCCESS; + + status = cmdq.enqueueCopyBuffer(*src, *dst, 0, 0, size); + OCL_CHECK(status, "Unable to copy buffer in OpenCL device."); +} + +} // namespace singa + +#endif // USE_OPENCL http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/src/core/device/platform.cc ---------------------------------------------------------------------- diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc index 1e2dc4a..984df69 100644 --- a/src/core/device/platform.cc +++ b/src/core/device/platform.cc @@ -18,9 +18,12 @@ #include "singa/core/device.h" #include "singa/singa_config.h" + #ifdef USE_CUDA #include "singa/utils/cuda_utils.h" + namespace singa { + int Platform::GetNumGPUs() { int count; CUDA_CHECK(cudaGetDeviceCount(&count)); @@ -135,4 +138,5 @@ Platform::CreateCudaGPUs(const vector<int> &devices, size_t init_size) { } } // namespace singa + #endif // USE_CUDA http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/src/core/tensor/tensor_math_opencl.cl ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.cl b/src/core/tensor/tensor_math_opencl.cl new file mode 100644 index 0000000..56eef44 --- /dev/null +++ b/src/core/tensor/tensor_math_opencl.cl @@ -0,0 +1,569 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// **************************************
+// Element-wise functions
+// **************************************
+// Each kernel below maps one work-item to one array element; work-items
+// whose global id falls beyond `num` return immediately, so callers may
+// round the global size up to any convenient multiple.
+
+// Sum is basically reduction.
+// This reduction code is serial reduction modified from AMD's example.
+// http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/
+
+// out[i] = |in[i]|
+__kernel
+void clkernel_abs(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = fabs(in[i]);
+}
+
+// out[i] = in[i] + x
+__kernel
+void clkernel_add_scalar(const int num, float x, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in[i] + x;
+}
+
+// out[i] = in1[i] + in2[i]
+__kernel
+void clkernel_add(const int num, __global const float* in1, __global const float* in2,
+                  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] + in2[i];
+}
+
+// out[i] = min(max(in[i], low), high)
+__kernel
+void clkernel_clamp(const int num, float low, float high, __global const float* in,
+                    __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = clamp(in[i], low, high);
+}
+
+// out[i] = in1[i] / x
+// NOTE(review): in2 is unused here — presumably kept so the host-side
+// dispatcher can set arguments uniformly; confirm against the caller.
+__kernel
+void clkernel_divide_scalar_matx(const int num, __global const float* in1, const float x,
+                                 __global const float* in2, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] / x;
+}
+
+// out[i] = x / in1[i]
+// NOTE(review): in2 is unused here as well — see note above.
+__kernel
+void clkernel_divide_scalar_xmat(const int num, const float x, __global const float* in1,
+                                 __global const float* in2, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = x / in1[i];
+}
+
+// out[i] = in1[i] / in2[i]
+__kernel
+void clkernel_divide(const int num, __global const float* in1, __global const float* in2,
+                     __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] / in2[i];
+}
+
+// out[i] = in[i] * x
+__kernel
+void clkernel_eltmult_scalar(const int num, const float x, __global const float* in,
+                             __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in[i] * x;
+}
+
+// out[i] = in1[i] * in2[i]
+__kernel
+void clkernel_eltmult(const int num, __global const float* in1, __global const float* in2,
+                      __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] * in2[i];
+}
+
+// out[i] = e^(in[i])
+__kernel
+void clkernel_exp(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = exp(in[i]);
+}
+
+// out[i] = 1 if in[i] <= x, else 0
+__kernel
+void clkernel_le(const int num, __global const float* in, const float x,
+                 __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] <= x) ? 1.0f : 0.0f;
+}
+
+// out[i] = natural log of in[i]
+__kernel
+void clkernel_log(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = log(in[i]);
+}
+
+// out[i] = 1 if in[i] < x, else 0
+__kernel
+void clkernel_lt(const int num, __global const float* in, const float x,
+                 __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] < x) ? 1.0f : 0.0f;
+}
+
+// out[i] = 1 if in[i] >= x, else 0
+__kernel
+void clkernel_ge(const int num, __global const float* in, const float x,
+                 __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] >= x) ? 1.0f : 0.0f;
+}
+
+// out[i] = 1 if in[i] > x, else 0
+__kernel
+void clkernel_gt(const int num, __global const float* in, const float x,
+                 __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] > x) ? 1.0f : 0.0f;
+}
+
+// out[i] = in[i] ^ x
+__kernel
+void clkernel_pow_scalar(const int num, const float x, __global const float* in,
+                         __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = pow(in[i], x);
+}
+
+// out[i] = in1[i] ^ in2[i]
+__kernel
+void clkernel_pow(const int num, __global const float* in1, __global const float* in2,
+                  __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = pow(in1[i], in2[i]);
+}
+
+// out[i] = max(in[i], 0)
+__kernel
+void clkernel_relu(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = (in[i] > 0) ? in[i] : 0.0f;
+}
+
+// out[i] = x (fill)
+__kernel
+void clkernel_set(const int num, const float x, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = x;
+}
+
+// out[i] = logistic sigmoid of in[i]
+__kernel
+void clkernel_sigmoid(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = 1 / (1 + exp(-(in[i])));
+}
+
+// out[i] = -1, 0 or +1 depending on the sign of in[i]
+__kernel
+void clkernel_sign(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = sign(in[i]);
+}
+
+// out[i] = square root of in[i]
+__kernel
+void clkernel_sqrt(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = sqrt(in[i]);
+}
+
+// kernel for square is called pow(2).
+
+// out[i] = in[i] - x
+__kernel
+void clkernel_subtract_scalar(const int num, __global const float* in, const float x,
+                              __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in[i] - x;
+}
+
+// out[i] = in1[i] - in2[i]
+__kernel
+void clkernel_subtract(const int num, __global const float* in1, __global const float* in2,
+                       __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = in1[i] - in2[i];
+}
+
+// Per-work-group tree reduction; each group writes its partial sum to
+// out[group_id], which the host must reduce further.
+// reduce3 kernel from
+// https://github.com/sschaetz/nvidia-opencl-examples/blob/master/OpenCL/src/oclReduction/oclReduction_kernel.cl
+__kernel
+void clkernel_sum(const int num, __global const float* in, __global float* out,
+                  __local float* sdata) {
+  const int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+  const int tid = get_local_id(0);
+  sdata[tid] = (i < num) ? in[i] : 0.0f;
+
+  // Perform the first level of reduction: each work-item folds in a second
+  // element half a stride away, halving the number of groups needed.
+  if (i + get_local_size(0) < num) {
+    sdata[tid] += in[i + get_local_size(0)];
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    // Bug fix: was `tid > s`, which made the lower half of the work-group
+    // (the half that must accumulate) skip the add and produced a wrong
+    // partial sum; a tree reduction requires `tid < s`.
+    if (tid < s) {
+      sdata[tid] += sdata[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  if (tid == 0) {
+    out[get_group_id(0)] = sdata[0];
+  }
+}
+
+// out[i] = tanh(in[i])
+__kernel
+void clkernel_tanh(const int num, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = tanh(in[i]);
+}
+
+// **************************************
+// Random functions
+// **************************************
+
+// See: distribution.cl
+
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
+// iamax: index of the element with the largest absolute value (BLAS isamax).
+// NOTE(review): `temp` is unused; kept for host-side argument compatibility.
+__kernel
+void clkernel_amax(const int num, __global const float* in, __global int* ret,
+                   __local uint* sdata, __local size_t* temp) {
+  const int gid = get_global_id(0);
+  const int tid = get_local_id(0);
+
+  // Bug fix: sdata was never initialized before the reduction loop (the
+  // kernel reduced over uninitialized local memory), and the loop compared
+  // an index slot against a raw value (`in[sdata[tid]] > in[tid+s]`) and
+  // stored `tid` instead of the winning index. Seed each slot with its own
+  // global index, then keep the index whose |value| is larger.
+  sdata[tid] = (gid < num) ? (uint)gid : 0u;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    if (tid < s) {
+      sdata[tid] = (fabs(in[sdata[tid]]) >= fabs(in[sdata[tid + s]]))
+                 ? sdata[tid] : sdata[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  if (tid == 0) {
+    ret[0] = sdata[0];
+  }
+}
+
+
+/* TODO: Fix line 284:20.
+__kernel
+void clkernel_amin(const int num, __global const float* in, __global int* ret,
+                   __local float* sdata, __local size_t* temp) {
+  const int gid = get_global_id(0);
+  const int tid = get_local_id(0);
+
+  // Initialize the values to pos infinity.
+  sdata[tid] = (gid < num) ? in[gid] : INFINITY;
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    if (tid < s) {
+      sdata[tid] = (in[sdata[tid]] < in[tid+s]) ? sdata[tid] : tid;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  if (tid == 0) {
+    ret[0] = sdata[0];
+  }
+}*/
+
+
+// asum: sum of absolute values (BLAS sasum), reduced per work-group.
+// NOTE(review): out[0] is written by every work-group — looks racy when
+// launched with more than one group; confirm the host launches one group
+// or reduces partials itself.
+__kernel
+void clkernel_asum(const int num, __global const float* in, __global float* out,
+                   __local float* sdata) {
+  const int tid = get_local_id(0);
+  const int i = get_global_id(0);
+
+  // Bug fix: out-of-range slots were seeded with INFINITY, which poisons a
+  // sum (0 is the additive identity); in-range slots must hold |in[i]|.
+  sdata[tid] = (i < num) ? fabs(in[i]) : 0.0f;
+  // Perform the first level of reduction (also on the absolute value).
+  if (i + get_local_size(0) < num) {
+    sdata[tid] += fabs(in[i + get_local_size(0)]);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for(int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    if (tid < s) {
+      // Bug fix: was `sdata[tid] = fabs(sdata[tid + s]);`, which overwrote
+      // the accumulator instead of adding to it. All slots are already
+      // non-negative, so a plain add completes the reduction.
+      sdata[tid] += sdata[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+  if (tid == 0) {
+    out[0] = sdata[0];
+  }
+}
+
+// out[i] = alpha * in[i] + out[i] (BLAS saxpy)
+__kernel
+void clkernel_axpy(const int num, float alpha, __global const float* in,
+                   __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = fma(alpha, in[i], out[i]);
+}
+
+// This kernel is essentially the same as Sum, except that during the process
+// of reading in data to the local memory, the value is also squared.
+// Then, just before submitting the sum to out, we do a square-root on it.
+__kernel
+void clkernel_nrm2(const int num, __global const float* in, __global float* out,
+                   __local float* sdata) {
+  const int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+  const int tid = get_local_id(0);
+  sdata[tid] = (i < num) ? (in[i] * in[i]) : 0.0f;
+
+  // Perform the first level of reduction.
+  if (i + get_local_size(0) < num) {
+    // Bug fix: the second element must also be squared before it is
+    // accumulated; the old code added the raw value.
+    const float w = in[i + get_local_size(0)];
+    sdata[tid] += w * w;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int s = get_local_size(0)/2; s > 0; s >>= 1) {
+    // Bug fix: was `tid > s`; a tree reduction requires `tid < s`.
+    if (tid < s) {
+      sdata[tid] += sdata[tid + s];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+  }
+
+  if (tid == 0) {
+    out[get_group_id(0)] = sqrt(sdata[0]);
+  }
+}
+
+// out[i] = x * out[i] (in-place BLAS sscal)
+__kernel
+void clkernel_scale(const int num, float x, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  out[i] = x * out[i];
+}
+
+// Dot product, stage 1: element-wise products into local scratch.
+__kernel
+void clkernel_dot(const int num, __global const float* in1, __global const float* in2,
+                  __global float* out, __local float* scratch) {
+  const int i = get_global_id(0);
+  if (i >= num) return;
+  // Bug fix: the old code indexed with `i << 2` (4*i), reading out of
+  // bounds for every work-item with i >= num/4. Each work-item multiplies
+  // its own element.
+  scratch[i] = in1[i] * in2[i];
+  // TODO: the partial products in scratch still need to be reduced into
+  // out (see clkernel_sum); this kernel is incomplete as written.
+}
+
+// Third kernel from http://www.bealto.com/gpu-gemv_intro.html
+// y = alpha*A*v + beta*y, with A column-major: element (i,k) at A[i + m*k].
+__kernel
+void clkernel_gemv(const int m, const int n, const float alpha,
+                   __global const float* A, __global const float* v,
+                   const float beta, __global float* out) {
+  const int i = get_global_id(0);
+  // Bug fix: the old loop computed fma(alpha, A, v) — adding v[k] instead
+  // of multiplying by it — and folded beta*out into the sum once per
+  // column (n times). Correct GEMV accumulates the dot product of row i
+  // with v, then applies alpha and beta exactly once.
+  float acc = 0.0f;
+  for (int k = 0; k < n; k++) {
+    acc = fma(A[i + m * k], v[k], acc);
+  }
+  out[i] = fma(alpha, acc, beta * out[i]);
+}
+
+// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-dgmm
+// X[j] = x[j*inc(x)]                     if inc(x) >= 0
+//      = x[(n - 1)*|inc(x)| - j*|inc(x)|] if inc(x) < 0
+
+// C = diag( X )*A  (each row of M scaled by the matching element of v)
+__kernel
+void clkernel_dgmm_left(const int nrow, const int ncol,
+                        __global const float* M, __global const float* v,
+                        __global float* out) {
+  const uint gidx = get_global_id(0);
+
+  uint offset = gidx * ncol;
+  for (uint i = 0; i < ncol; i++) {
+    out[offset + i] = M[offset + i] * v[i];
+  }
+}
+
+// C = A*diag( X )  (each column of M scaled by the matching element of v)
+__kernel
+void clkernel_dgmm_right(const int nrow, const int ncol,
+                         __global const float* M, __global const float* v,
+                         __global float* out) {
+  const uint gidx = get_global_id(0);
+
+  uint offset = gidx * ncol;
+  for (uint i = 0; i < ncol; i++) {
+    out[offset + i] = M[offset + i] * v[gidx];
+  }
+}
+
+// TODO: Optimize with Reference from http://www.cedricnugteren.nl/tutorial.php?page=1
+// C = alpha*A*B + beta*C; one work-item per output element (gidx, gidy),
+// matrices stored column-major.
+__kernel
+void clkernel_gemm(const int nrowA, const int ncolB, const int ncolA, const float alpha,
+                   __global const float *A, __global const float* B, const float beta,
+                   __global float* C) {
+  const uint gidx = get_global_id(0);
+  const uint gidy = get_global_id(1);
+
+  float acc = 0.0f;
+  for (uint i = 0; i < ncolA; i++) {
+    acc = fma(A[i * nrowA + gidx], B[gidy * ncolA + i] * alpha, acc);
+  }
+
+  C[gidy * nrowA + gidx] = fma(C[gidy * nrowA + gidx], beta, acc);
+}
+
+
+// loss[i] = -log(p[i][t[i]]) for each sample i in the batch; p is the
+// (batchsize x dim) row-major probability matrix, t the integer labels.
+__kernel
+void clkernel_crossentropy(const uint batchsize, const uint dim,
+                           __global const float* p, __global const int* t,
+                           __global float* loss) {
+  const uint gidx = get_global_id(0);
+  if (gidx >= batchsize) return;
+
+  int truth_idx = t[gidx];
+  // Bug fix: label 0 is a valid class; only negative labels are invalid
+  // (the old `<= 0` silently skipped every sample of class 0).
+  if (truth_idx < 0) return;
+  // Bug fix: index row gidx of the probability matrix; the old
+  // `p[gidx + truth_idx]` read the wrong element for every dim > 1.
+  float prob_of_truth = p[gidx * dim + truth_idx];
+  // Bug fix: clamp with +FLT_MIN (smallest positive normal float);
+  // -FLT_MIN is negative, so the old clamp allowed log(0) = -inf.
+  loss[gidx] = -log(fmax(prob_of_truth, FLT_MIN));
+}
+
+
+// Softmax + cross-entropy backward: subtract 1 from the gradient at the
+// true class of each sample.
+__kernel
+void clkernel_softmaxentropy(const uint batchsize, const uint dim,
+                             __global const float* p, __global const int* t,
+                             __global float* grad) {
+  const uint gidx = get_global_id(0);
+  if (gidx >= batchsize) return;
+
+  int truth_idx = t[gidx];
+  // Bug fix: label 0 is valid; only reject negative labels.
+  if (truth_idx < 0) return;
+  // Use a float literal: a double constant would require fp64 support.
+  grad[gidx * dim + truth_idx] -= 1.0f;
+}
+
+
+// **************************************
+// Matrix functions
+// **************************************
+/*
+__kernel
+void clkernel_addcol(int nrow, int ncol, __global const float* A, __global const float* v, __global float* out) {
+  const int i = get_global_id(0);
+  const int j = get_global_id(1);
+  if (i >= nrow) return;
+  if (j >= ncol) return;
+  out[j] = A[j + nrow * i] + v[j];
+}
+
+__kernel
+void clkernel_addrow(int nrow, int ncol, __global const float* A, __global const float* v, __global float* out) {
+  const int i = get_global_id(0);
+  const int j = get_global_id(1);
+  if (i >= nrow) return;
+  if (j >= ncol) return;
+  out[i] = A[i + ncol * j] + v[i];
+}
+
+__kernel
+void clkernel_outerproduct(int m, const int n, __global const float* in1, __global const float* in2, __global float* out) {
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
+
+  // TODO: This
+}
+
+__kernel
+void clkernel_sumcol(int nrow, int ncol, __global const float* in, __global float* out) {
+  const int i = get_global_id(0);
+  if (i >= nrow) return;
+
+  float sum = 0.0f;
+  for (int j = 0; j < nrow; j++) {
+    sum += in[nrow * i + j];
+  }
+  out[i] = sum;
+}
+*/
+// out[idx] = sum of row idx of the (nrow x ncol) row-major matrix `in`.
+__kernel
+void clkernel_sumrow(int nrow, int ncol, __global const float* in, __global float* out) {
+  const int idx = get_global_id(0);
+  if (idx >= nrow) return;
+
+  float sum = 0.0f;
+  for (int j = 0; j < ncol; j++) {
+    sum += in[j + ncol * idx];
+  }
+  out[idx] = sum;
+}
+
+
+// Tiled matrix transpose through local memory; requires BLOCK_DIM x
+// BLOCK_DIM work-groups (the +1 padding avoids local-memory bank conflicts).
+// Adapted from http://code.haskell.org/HsOpenCL/tests/bench/transpose.cl
+#define BLOCK_DIM 16
+__kernel
+void clkernel_transpose(uint nrow, uint ncol,
+                        __global const float* in, __global float* out,
+                        __local float* sdata) {
+  uint gidx = get_global_id(0);
+  uint gidy = get_global_id(1);
+
+  if ((gidx < ncol) && (gidy < nrow)) {
+    uint id_in = gidy * ncol + gidx;
+    sdata[get_local_id(1) * (BLOCK_DIM+1) + get_local_id(0)] = in[id_in];
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  gidx = get_group_id(1) * BLOCK_DIM + get_local_id(0);
+  gidy = get_group_id(0) * BLOCK_DIM + get_local_id(1);
+  if ((gidx < nrow) && (gidy < ncol)) {
+    uint id_out = gidy * nrow + gidx;
+    out[id_out] = sdata[get_local_id(0) * (BLOCK_DIM + 1) + get_local_id(1)];
+  }
+}
+/*
+__kernel
+void clkernel_transpose2(uint nrow, uint ncol, __global const float* in, __global float* out, __local float* sdata) {
+  const uint lidx = get_local_id(0);
+  const uint lidy = get_local_id(1);
+  const uint id0 = get_group_id(0) * ncol * lidx;
+  const uint id1 = get_group_id(1) * nrow * lidy;
+
+  if (id0 < nrow && id1 < ncol) {
+    sdata[lidx][lidy] = in[id1 * nrow + id0];
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  const uint new_id0 = get_group_id(1) * nrow + lidx;
+  const uint new_id1 = get_group_id(0) * ncol + lidy;
+
+  if (new_id0 < ncol && new_id1 < nrow) {
+    out[new_id1 * ncol + new_id0] = sdata[lidx][lidy];
+  }
+}*/
+
+// Expands vector vin into the diagonal of a vsize x vsize matrix; each
+// work-item fills one whole row.
+__kernel
+void clkernel_diagvec_left(uint vsize, __global const float* vin, __global float* out) {
+  const uint gid = get_global_id(0);
+  // Guard against a global size rounded above vsize.
+  if (gid >= vsize) return;
+
+  for (uint i = 0; i < vsize; i++)
+    out[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f;
+}
