SINGA-243 ViennaCL backend for OpenCL support - Replaced current OpenCL backend with ViennaCL libs - Removed need for OpenCL C++ headers - Updated unit tests files to match changes - Updated Dependencies.cmake to match changes
- Added license information. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/595302a3 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/595302a3 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/595302a3 Branch: refs/heads/master Commit: 595302a36e9fee2ef63b57ddf81fe75a5fe00a40 Parents: 65bf582 Author: Tan Li Boon <[email protected]> Authored: Wed Aug 17 12:48:18 2016 +0800 Committer: Tan Li Boon <[email protected]> Committed: Wed Aug 31 10:53:21 2016 +0800 ---------------------------------------------------------------------- .travis.yml | 6 +- CMakeLists.txt | 3 +- LICENSE | 4 +- cmake/Dependencies.cmake | 25 +- cmake/Thirdparty/FindViennaCL.cmake | 45 ++ include/singa/core/common.h | 10 +- include/singa/core/device.h | 56 +- include/singa/utils/opencl_utils.h | 142 +--- src/core/device/opencl_device.cc | 182 ++--- src/core/tensor/tensor_math_opencl.cl | 19 +- src/core/tensor/tensor_math_opencl.h | 1007 ++++++++-------------------- src/utils/opencl_utils.cc | 63 -- test/singa/test_opencl.cc | 700 ++++++++----------- test/singa/test_opencl_device.cc | 108 +++ 14 files changed, 822 insertions(+), 1548 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/.travis.yml ---------------------------------------------------------------------- diff --git a/.travis.yml b/.travis.yml index 8b1f89c..d9bba51 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,13 +7,13 @@ before_install: - sudo apt-get -qq update - sudo apt-get install -qq -y libopenblas-dev libgoogle-glog-dev libprotobuf-dev protobuf-compiler - sudo apt-get install -qq -y opencl-headers ocl-icd-* - - wget https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.9/cl2.hpp - - sudo mv cl2.hpp /usr/include/CL/ +#- wget 
https://github.com/KhronosGroup/OpenCL-CLHPP/releases/download/v2.0.9/cl2.hpp +#- sudo mv cl2.hpp /usr/include/CL/ #- sudo apt-get install -qq libgtest-dev before_script: - mkdir build && cd build - - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF -DBUILD_OPENCL_TESTS=OFF + - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF script: - make http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f6bea2..611cee4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,9 +43,8 @@ OPTION(USE_CUDNN "Use Cudnn libs" ON) OPTION(USE_OPENCV "Use opencv" OFF) OPTION(USE_LMDB "Use LMDB libs" OFF) OPTION(USE_PYTHON "Generate py wrappers" OFF) -#OPTION(USE_OPENCL "Use OpenCL" OFF) +OPTION(USE_OPENCL "Use OpenCL" OFF) OPTION(ENABLE_DIST "enable distributed training" OFF) -#OPTION(BUILD_OPENCL_TESTS "Build OpenCL tests" OFF) INCLUDE("cmake/Dependencies.cmake") INCLUDE("cmake/Utils.cmake") http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/LICENSE ---------------------------------------------------------------------- diff --git a/LICENSE b/LICENSE index f658def..4f9d1e7 100644 --- a/LICENSE +++ b/LICENSE @@ -303,7 +303,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
=========================================================================== SINGA bundles the following under BSD 2-clause license: -include/singa/utils/cuda_utils.h, src/core/tensor/distribution.cl +include/singa/utils/cuda_utils.h +src/core/tensor/distribution.cl +cmake/ThirdParty/FindViennaCL.cmake COPYRIGHT http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/cmake/Dependencies.cmake ---------------------------------------------------------------------- diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index b5fda6d..aa2212b 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -50,23 +50,26 @@ IF(USE_CBLAS) FIND_PACKAGE(CBLAS REQUIRED) INCLUDE_DIRECTORIES(SYSTEM ${CBLAS_INCLUDE_DIR}) LIST(APPEND SINGA_LINKER_LIBS ${CBLAS_LIBRARIES}) - MESSAGE(STATUS "FOUND cblas at ${CBLAS_LIBRARIES}") + MESSAGE(STATUS "Found cblas at ${CBLAS_LIBRARIES}") ENDIF() IF(USE_OPENCL) FIND_PACKAGE(OpenCL REQUIRED) - IF(NOT OPENCL_FOUND) + IF(NOT OpenCL_FOUND) MESSAGE(SEND_ERROR "OpenCL was requested, but not found.") ELSE() - INCLUDE_DIRECTORIES(SYSTEM ${OpenCL_INCPATH}) - LIST(APPEND SINGA_LINKER_LIBS ${OPENCL_LIBRARIES}) - MESSAGE(STATUS "Found OpenCL at ${OPENCL_INCLUDE_DIRS}") - IF(NOT OPENCL_HAS_CPP_BINDINGS) - MESSAGE(SEND_ERROR "OpenCL C++ bindings cl2.hpp was not found.") - ELSE() - MESSAGE(STATUS "Found OpenCL C++ bindings.") - ENDIF() - ENDIF() + MESSAGE(STATUS "Found OpenCL headers at ${OpenCL_INCLUDE_DIRS}") + INCLUDE_DIRECTORIES(SYSTEM ${OpenCL_INCLUDE_DIR}) + LIST(APPEND SINGA_LINKER_LIBS ${OpenCL_LIBRARIES}) + FIND_PACKAGE(ViennaCL REQUIRED) + IF(NOT ViennaCL_FOUND) + MESSAGE(SEND_ERROR "ViennaCL is required if OpenCL is enabled.") + ELSE() + MESSAGE(STATUS "Found ViennaCL headers at ${ViennaCL_INCLUDE_DIR}") + INCLUDE_DIRECTORIES(SYSTEM ${ViennaCL_INCLUDE_DIR}) + LIST(APPEND SINGA_LINKER_LIBS ${ViennaCL_LIBRARIES}) + ENDIF() + ENDIF() ENDIF() FIND_PACKAGE(Glog REQUIRED) 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/cmake/Thirdparty/FindViennaCL.cmake ---------------------------------------------------------------------- diff --git a/cmake/Thirdparty/FindViennaCL.cmake b/cmake/Thirdparty/FindViennaCL.cmake new file mode 100644 index 0000000..263c80f --- /dev/null +++ b/cmake/Thirdparty/FindViennaCL.cmake @@ -0,0 +1,45 @@ +# This file is retrieved from caffe/cmake/Modules/FindViennaCL.cmake. + +SET(ViennaCL_WITH_OPENCL TRUE) + +SET(VIENNACL_INCLUDE_SEARCH_PATHS + .. + /usr/include + /usr/local/include + /opt/ViennaCL/include + $ENV{VIENNACL_HOME} + $ENV{VIENNACL_HOME}/include +) + +FIND_PATH(ViennaCL_INCLUDE_DIR NAMES viennacl/forwards.h PATHS ${VIENNACL_INCLUDE_SEARCH_PATHS}) + +SET(ViennaCL_FOUND ON) + +# Check include files +IF(NOT ViennaCL_INCLUDE_DIR) + SET(ViennaCL_FOUND OFF) + MESSAGE(STATUS "Could not find ViennaCL include. Turning ViennaCL_FOUND off") +ENDIF() + +IF (ViennaCL_FOUND) + IF (NOT ViennaCL_FIND_QUIETLY) + MESSAGE(STATUS "Found ViennaCL include: ${ViennaCL_INCLUDE_DIR}") + ENDIF (NOT ViennaCL_FIND_QUIETLY) +ELSE (ViennaCL_FOUND) + IF (ViennaCL_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find ViennaCL") + ENDIF (ViennaCL_FIND_REQUIRED) +ENDIF (ViennaCL_FOUND) + +IF(ViennaCL_WITH_OPENCL) + find_package(OpenCL REQUIRED) +ENDIF(ViennaCL_WITH_OPENCL) + +set(ViennaCL_INCLUDE_DIRS ${ViennaCL_INCLUDE_DIR} ${OPENCL_INCLUDE_DIRS}) +set(ViennaCL_LIBRARIES ${OPENCL_LIBRARIES}) + +MARK_AS_ADVANCED( + ViennaCL_INCLUDE_DIR + ViennaCL_INCLUDE_DIRS + ViennaCL_LIBRARIES +) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/core/common.h ---------------------------------------------------------------------- diff --git a/include/singa/core/common.h b/include/singa/core/common.h index 53a9726..dc552c1 100644 --- a/include/singa/core/common.h +++ b/include/singa/core/common.h @@ -36,10 +36,7 @@ #ifdef USE_OPENCL -#define CL_HPP_MINIMUM_OPENCL_VERSION 120 -#define 
CL_HPP_TARGET_OPENCL_VERSION 120 -#include <CL/cl2.hpp> -#include <unordered_map> +#include "singa/utils/opencl_utils.h" #endif // USE_OPENCL using std::atomic; @@ -110,9 +107,8 @@ typedef struct _Context { #endif // USE_CUDA #ifdef USE_OPENCL - std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels; - cl::CommandQueue ocl_cmdq; - cl::Context ocl_ctx; + // This stores the context ID of the OpenCL context controlled by ViennaCL. + long vcl_ctx_id; #endif } Context; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/core/device.h ---------------------------------------------------------------------- diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 810d41f..62fa250 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -36,12 +36,6 @@ #endif // USE_CUDA #ifdef USE_OPENCL -// http://github.khronos.org/OpenCL-CLHPP/ -// cl2.hpp includes cl.h, do not re-include. -#define CL_HPP_MINIMUM_OPENCL_VERSION 120 -#define CL_HPP_TARGET_OPENCL_VERSION 120 -#include <unordered_map> -#include <CL/cl2.hpp> #include "singa/utils/opencl_utils.h" #endif // USE_OPENCL @@ -217,50 +211,26 @@ public: OpenclDevice(int id = 0, int num_executors = 1); ~OpenclDevice(); - /// Get the specified kernel. - cl::Kernel GetKernel(const std::string& kname, cl_int* status = nullptr); - - /// Get the command queue associated with this device. - cl::CommandQueue GetCmdQ() { return cmdq; } - - /// Prints information about all Devices in each Platform. - void PrintAllDeviceInfo(); - - /// Prints status about CL source code builds. 
- void PrintClBuildInfo(cl::Program &p); - // Overridden, inherited methods void SetRandSeed(unsigned seed) override; void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, CopyDirection direction, int dst_offset = 0, int src_offset = 0); -/* - void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes = 0, - size_t dst_offset = 0) override;*/ protected: /// The OpenCL device that this object represents. /// Each OpenclDevice contains exactly one cl::Device for the lifetime of the /// object. - cl::Device this_device; + viennacl::ocl::device this_device; /// Each OpenclDevice has one OpenCL context. It is created along with the /// creation of this object. - cl::Context ocl_ctx; - - /// The CommandQueue that is associated with this device. - /// Since each OpenclDevice contains only one cl::Device and one cl::Context, - /// it naturally also contains one cl::CommandQueue that is associated - /// with said Device and Context. - cl::CommandQueue cmdq; - - /// A list of kernels that has been compiled on this device. - std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels; + viennacl::ocl::context vcl_ctx; /// Searches the given paths for all .cl files and builds /// OpenCL programs, then stores them in the Kernels map. - void BuildPrograms(const std::string &kdir = cl_src_path); + void BuildPrograms(const std::string &kdir); // Overridden, inherited methods. @@ -280,21 +250,6 @@ protected: private: - /// Copies a data block from host to device. - /// src: a pointer to an array of data. - /// dst: a pointer to a cl::Buffer object. - void WriteToDevice(cl::Buffer* dst, const void* src, const size_t size); - - /// Reads a data block from device to host. - /// src: a pointer to an cl::Buffer object. - /// dst: a pointer to an malloc'ed empty array. - void ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size); - - /// Duplicates a block of data on the device. - /// src: a pointer to the original cl::Buffer object. 
- /// dst: a pointer to the new cl::Buffer object to copy the data into. - void CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size); - static const std::string cl_src_path; }; #endif // USE_OPENCL @@ -368,11 +323,6 @@ public: /// except the context initialization. static bool CheckDevice(const int device_id); - -private: -#ifdef USE_OPENCL - cl::Platform clPlatform; -#endif // USE_OPENCL }; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/include/singa/utils/opencl_utils.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/opencl_utils.h b/include/singa/utils/opencl_utils.h index 664a9e1..8c05643 100644 --- a/include/singa/utils/opencl_utils.h +++ b/include/singa/utils/opencl_utils.h @@ -24,121 +24,47 @@ #ifdef USE_OPENCL -#include <iostream> +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -// http://github.khronos.org/OpenCL-CLHPP/ -// cl2.hpp includes cl.h, do not re-include. #define CL_HPP_MINIMUM_OPENCL_VERSION 120 #define CL_HPP_TARGET_OPENCL_VERSION 120 -#include <CL/cl2.hpp> -#define CL_BREAK_ON_FAILURE if (status != CL_SUCCESS) return; - - -inline const char* clGetBuildInfoString(const cl_build_status status) { - switch (status) { - case CL_BUILD_NONE: return "CL_BUILD_NONE"; - case CL_BUILD_ERROR: return "CL_BUILD_ERROR"; - case CL_BUILD_SUCCESS: return "CL_BUILD_SUCCESS"; - case CL_BUILD_IN_PROGRESS: return "CL_BUILD_IN_PROGRESS"; - default: return ""; - } -} - - -inline const char* clGetErrorString(const cl_int status) { - - switch(status) { - - // Run-time and JIT compiler errors - case 0: return "CL_SUCCESS"; - case -1: return "CL_DEVICE_NOT_FOUND"; - case -2: return "CL_DEVICE_NOT_AVAILABLE"; - case -3: return "CL_COMPILER_NOT_AVAILABLE"; - case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case -5: return "CL_OUT_OF_RESOURCES"; - case -6: return "CL_OUT_OF_HOST_MEMORY"; - case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; - case -8: return 
"CL_MEM_COPY_OVERLAP"; - case -9: return "CL_IMAGE_FORMAT_MISMATCH"; - case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case -11: return "CL_BUILD_PROGRAM_FAILURE"; - case -12: return "CL_MAP_FAILURE"; - case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; - case -15: return "CL_COMPILE_PROGRAM_FAILURE"; - case -16: return "CL_LINKER_NOT_AVAILABLE"; - case -17: return "CL_LINK_PROGRAM_FAILURE"; - case -18: return "CL_DEVICE_PARTITION_FAILED"; - case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - - // Compile-time errors - case -30: return "CL_INVALID_VALUE"; - case -31: return "CL_INVALID_DEVICE_TYPE"; - case -32: return "CL_INVALID_PLATFORM"; - case -33: return "CL_INVALID_DEVICE"; - case -34: return "CL_INVALID_CONTEXT"; - case -35: return "CL_INVALID_QUEUE_PROPERTIES"; - case -36: return "CL_INVALID_COMMAND_QUEUE"; - case -37: return "CL_INVALID_HOST_PTR"; - case -38: return "CL_INVALID_MEM_OBJECT"; - case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case -40: return "CL_INVALID_IMAGE_SIZE"; - case -41: return "CL_INVALID_SAMPLER"; - case -42: return "CL_INVALID_BINARY"; - case -43: return "CL_INVALID_BUILD_OPTIONS"; - case -44: return "CL_INVALID_PROGRAM"; - case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; - case -46: return "CL_INVALID_KERNEL_NAME"; - case -47: return "CL_INVALID_KERNEL_DEFINITION"; - case -48: return "CL_INVALID_KERNEL"; - case -49: return "CL_INVALID_ARG_INDEX"; - case -50: return "CL_INVALID_ARG_VALUE"; - case -51: return "CL_INVALID_ARG_SIZE"; - case -52: return "CL_INVALID_KERNEL_ARGS"; - case -53: return "CL_INVALID_WORK_DIMENSION"; - case -54: return "CL_INVALID_WORK_GROUP_SIZE"; - case -55: return "CL_INVALID_WORK_ITEM_SIZE"; - case -56: return "CL_INVALID_GLOBAL_OFFSET"; - case -57: return "CL_INVALID_EVENT_WAIT_LIST"; - case -58: return "CL_INVALID_EVENT"; - case -59: return "CL_INVALID_OPERATION"; - case -60: return "CL_INVALID_GL_OBJECT"; - case -61: 
return "CL_INVALID_BUFFER_SIZE"; - case -62: return "CL_INVALID_MIP_LEVEL"; - case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; - case -64: return "CL_INVALID_PROPERTY"; - case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; - case -66: return "CL_INVALID_COMPILER_OPTIONS"; - case -67: return "CL_INVALID_LINKER_OPTIONS"; - case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; - - // Extension errors - case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; - case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; - case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; - case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; - case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; - case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; - - default: return "Unknown OpenCL status"; +#ifndef VIENNACL_WITH_OPENCL + #define VIENNACL_WITH_OPENCL +#endif + +#ifndef __APPLE__ + #include "CL/cl.h" +#else + #include "OpenCL/cl.h" +#endif + +#include <viennacl/backend/opencl.hpp> + +#include <viennacl/ocl/device.hpp> +#include <viennacl/ocl/platform.hpp> +#include <viennacl/ocl/backend.hpp> +#include <viennacl/ocl/device_utils.hpp> +#include <viennacl/ocl/utils.hpp> +#include <viennacl/ocl/program.hpp> +#include <viennacl/ocl/kernel.hpp> + + +inline viennacl::ocl::handle<cl_mem> +WrapHandle(cl_mem in, viennacl::ocl::context *ctx) { + if (in != nullptr) { + viennacl::ocl::handle<cl_mem> memhandle(in, *ctx); + memhandle.inc(); + return memhandle; + } else { + cl_int err; + cl_mem dummy = clCreateBuffer(ctx->handle().get(), CL_MEM_READ_WRITE, 0, + nullptr, &err); + viennacl::ocl::handle<cl_mem> memhandle(dummy, *ctx); + return memhandle; } } - -/// Special function used to perform error checking and logging. -inline bool OCL_CHECK(const cl_int status, const char* what) { - if (status == CL_SUCCESS) return true; // Nothing wrong. 
- LOG(ERROR) << status << ": " << clGetErrorString(status) << " " << what << std::endl; - return false; -} - -/// Prints information about the specified Platform. -void PrintPlatformInfo(const cl::Platform &p); - -/// Prints information about the specified Device. -void PrintDeviceInfo(const cl::Device &dev); - #endif // USE_OPENCL #endif // SINGA_UTILS_OPENCL_UTILS_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/device/opencl_device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc index b941cd2..6b371c4 100644 --- a/src/core/device/opencl_device.cc +++ b/src/core/device/opencl_device.cc @@ -23,89 +23,41 @@ #include "singa/core/device.h" #include "singa/utils/tinydir.h" +#include "singa/utils/opencl_utils.h" #ifdef USE_OPENCL -using std::string; +using namespace viennacl; +using namespace viennacl::backend::opencl; namespace singa { -const string OpenclDevice::cl_src_path = "../src/core/tensor"; +const std::string OpenclDevice::cl_src_path = "../src/core/tensor"; OpenclDevice::OpenclDevice(int id, int num_executors) : Device(id, num_executors) { + CHECK_GE(id, 0); lang_ = kOpencl; - this->kernels = std::make_shared<std::unordered_map<string, cl::Kernel>>(); - - // Create the OpenCL Device, Context, and CommandQueue. - /// TODO: This merely chooses the first device on the first platform. 
- cl_int status = CL_SUCCESS; - - std::vector<cl::Platform> platforms; - status = cl::Platform::get(&platforms); - OCL_CHECK(status, "Failed to find any OpenCL platforms!"); - - std::vector<cl::Device> devices; - status = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices); - OCL_CHECK(status, "Failed to get list of devices from platform!"); - - this->this_device = cl::Device(devices[0]); - this->ocl_ctx = cl::Context(this_device, nullptr, nullptr, nullptr, &status); - OCL_CHECK(status, "Failed to create context!"); - - this->cmdq = cl::CommandQueue(ocl_ctx, this_device, CL_QUEUE_PROFILING_ENABLE, &status); - OCL_CHECK(status, "Failed to create a command queue!"); - - BuildPrograms(); - - ctx_.kernels = kernels; - ctx_.ocl_cmdq = cmdq; - ctx_.ocl_ctx = ocl_ctx; + + ocl::current_context().build_options("-cl-std=CL1.2"); + + ctx_.vcl_ctx_id = 0; + this->this_device = ocl::current_device(); + + BuildPrograms(cl_src_path); } OpenclDevice::~OpenclDevice() { // Flush and finish the command queue. 
+ auto cmdq = ocl::current_context().get_queue(); + cmdq.flush(); cmdq.finish(); } -cl::Kernel OpenclDevice::GetKernel(const std::string& kname, cl_int* status) { - if (!status) *status = CL_SUCCESS; - if (kernels->find(kname) == kernels->end()) { - // TODO: Not found - LOG(ERROR) << "Error: Kernel " << kname << " could not be found!"; - if (!status) *status = CL_INVALID_KERNEL; - } - return kernels->at(kname); -} - -/* -void OpenclDevice::PrintAllDeviceInfo() { - cl_int status = CL_SUCCESS; - - for (auto dev : devices) { - PrintDeviceInfo(d); - } -} -*/ - - -void OpenclDevice::PrintClBuildInfo(cl::Program &p) { - cl_int status = CL_SUCCESS; - - auto buildStatus = p.getBuildInfo<CL_PROGRAM_BUILD_STATUS>(&status); - for (auto pair : buildStatus) - std::cout << clGetBuildInfoString(pair.second) << std::endl; - - auto buildLog = p.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&status); - for (auto pair : buildLog) - std::cout << pair.second << std::endl; -} - - void OpenclDevice::SetRandSeed(unsigned seed) { seed = seed; } @@ -113,19 +65,33 @@ void OpenclDevice::CopyDataToFrom(Block* dst, Block* src, size_t nBytes, CopyDirection direction, int dst_offset, int src_offset) { // Pointers must be valid. 
if (!dst || !src) return; + + auto ocl_ctx = viennacl::ocl::get_context(ctx_.vcl_ctx_id); - CopyToFrom(dst->mutable_data(), src->data(), nBytes, direction); + switch(direction) { + case kHostToDevice: { + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx); + memory_write(dst_handle, dst_offset, nBytes, src->data()); + return; + } + case kDeviceToHost: { + auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx); + memory_read(src_handle, src_offset, nBytes, dst->mutable_data()); + return; + } + case kDeviceToDevice: { + auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx); + memory_copy(src_handle, dst_handle, src_offset, dst_offset, nBytes); + return; + } + default: + return; + } } -/* -void OpenclDevice::CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes, size_t dst_offset) { - CopyToFrom(dst->mutable_data(), src, 4, kHostToDevice); -} -*/ void OpenclDevice::BuildPrograms(const std::string &kdir) { - cl_int status = CL_SUCCESS; - tinydir_dir dir; tinydir_open(&dir, kdir.c_str()); @@ -137,63 +103,47 @@ void OpenclDevice::BuildPrograms(const std::string &kdir) { tinydir_next(&dir); continue; } - + std::ifstream clFile(file.path, std::ios_base::binary); std::stringstream buffer; buffer << clFile.rdbuf(); std::string clSrc(buffer.str()); - cl::Program program(this->ocl_ctx, clSrc, false, &status); - OCL_CHECK(status, "Program creation failed."); - status = program.build({this_device}, "-cl-std=CL1.2"); - if (status == CL_SUCCESS) { - std::vector<cl::Kernel> built_kernels; - status = program.createKernels(&built_kernels); - OCL_CHECK(status, "Failed to create kernels in built program."); - - for (auto k : built_kernels) { - std::string name = k.getInfo<CL_KERNEL_FUNCTION_NAME>(&status); - this->kernels->insert(std::make_pair(name, k)); - } - } else { - OCL_CHECK(status, "Build failed on source 
path"); - LOG(ERROR) << file.path << std::endl; - PrintClBuildInfo(program); - } + std::string name(file.name); + ocl::current_context().add_program(clSrc, name); tinydir_next(&dir); } } -// Device IO functions. -// TODO: -// Research - MapBuffers can improve performance when the device uses shared memory -// but is more complex to understand. http://stackoverflow.com/questions/22057692/whats-the-difference-between-clenqueuemapbuffer-and-clenqueuewritebuffer -// Intel graphics (and possibly AMD APUs) should use MapBuffers? -// https://software.intel.com/en-us/articles/getting-the-most-from-opencl-12-how-to-increase-performance-by-minimizing-buffer-copies-on-intel-processor-graphics - void OpenclDevice::DoExec(function<void(Context*)>&& fn, int executor) { fn(&ctx_); } -// NOTE: ASSUMES dst AND/OR src POINTERS CAN BE CAST TO cl::Buffer POINTERS! + void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes, CopyDirection direction, Context* ctx) { // Pointers must be valid. if (!dst || !src) return; + + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); switch(direction) { case kHostToDevice: { - WriteToDevice(static_cast<cl::Buffer*>(dst), src, nBytes); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx); + memory_write(dst_handle, 0, nBytes, src); return; } case kDeviceToHost: { - ReadFromDevice(dst, static_cast<const cl::Buffer*>(src), nBytes); + auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx); + memory_read(src_handle, 0, nBytes, dst); return; } case kDeviceToDevice: { - CopyDeviceBuffer(static_cast<cl::Buffer*>(dst), static_cast<const cl::Buffer*>(src), nBytes); + auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx); + memory_copy(src_handle, dst_handle, 0, 0, nBytes); return; } default: @@ -203,10 +153,7 @@ void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes, void* OpenclDevice::Malloc(int size) { - cl_int status = 
CL_SUCCESS; - - cl::Buffer* buffer = new cl::Buffer(ocl_ctx, CL_MEM_READ_WRITE, size, nullptr, &status); - OCL_CHECK(status, "Unable to allocate memory in OpenCL device."); + cl_mem buffer = memory_create(ocl::current_context(), size, nullptr); return static_cast<void*>(buffer); } @@ -214,33 +161,8 @@ void* OpenclDevice::Malloc(int size) { void OpenclDevice::Free(void* p) { if (!p) return; - cl::Buffer* buffer = static_cast<cl::Buffer*>(p); - delete buffer; -} - - -void OpenclDevice::WriteToDevice(cl::Buffer* dst, const void* src, const size_t size) { - cl_int status = CL_SUCCESS; - - status = cmdq.enqueueWriteBuffer(*dst, CL_TRUE, 0, size, src); - OCL_CHECK(status, "Unable to write data to OpenCL device."); -} - - -void OpenclDevice::ReadFromDevice(void* dst, const cl::Buffer* src, const size_t size) { - cl_int status = CL_SUCCESS; - - status = cmdq.enqueueReadBuffer(*src, CL_TRUE, 0, size, dst); - OCL_CHECK(status, "Unable to read data from OpenCL device."); -} - - -// dst: cl::Buffer pointer src: cl::Buffer pointer -void OpenclDevice::CopyDeviceBuffer(cl::Buffer* dst, const cl::Buffer* src, const size_t size) { - cl_int status = CL_SUCCESS; - - status = cmdq.enqueueCopyBuffer(*src, *dst, 0, 0, size); - OCL_CHECK(status, "Unable to copy buffer in OpenCL device."); + cl_mem buffer = static_cast<cl_mem>(p); + clReleaseMemObject(buffer); } } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/tensor/tensor_math_opencl.cl ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.cl b/src/core/tensor/tensor_math_opencl.cl index f9cf96e..7b89970 100644 --- a/src/core/tensor/tensor_math_opencl.cl +++ b/src/core/tensor/tensor_math_opencl.cl @@ -24,7 +24,7 @@ // This reduction code is serial reduction modified from AMD's example. 
// http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/ __kernel -void clkernel_abs(const int num, __global const float* in, __global float* out) { +void clkernel_fabs(const int num, __global const float* in, __global float* out) { const int i = get_global_id(0); if (i >= num) return; out[i] = fabs(in[i]); @@ -462,7 +462,7 @@ void clkernel_crossentropy(const uint batchsize, const uint dim, int truth_idx = t[gidx]; if (truth_idx <= 0) return; - float prob_of_truth = p[gidx + truth_idx]; + float prob_of_truth = p[gidx * dim + truth_idx]; loss[gidx] = -log(fmax(prob_of_truth, -FLT_MIN)); } @@ -480,6 +480,21 @@ void clkernel_softmaxentropy(const uint batchsize, const uint dim, } +__kernel +void clkernel_rowmax(const uint nrow, const uint ncol, + __global const float* in, __global float* out) { + const uint row_id = get_global_id(0); + if (row_id >= nrow) return; + + float row_max_val = -FLT_MAX; + for (uint i = 0; i < ncol; i++) { + row_max_val = fmax(row_max_val, in[row_id * ncol + i]); + } + + out[row_id] = row_max_val; +} + + // ************************************** // Matrix functions // ************************************** http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/core/tensor/tensor_math_opencl.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h index c289a56..c387031 100644 --- a/src/core/tensor/tensor_math_opencl.h +++ b/src/core/tensor/tensor_math_opencl.h @@ -19,17 +19,27 @@ #ifndef SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_ #ifdef USE_OPENCL -#include <limits> -#include "singa/utils/opencl_utils.h" #include "tensor_math.h" +#include "singa/utils/opencl_utils.h" -namespace singa { +#include <viennacl/scalar.hpp> +#include <viennacl/vector.hpp> +#include <viennacl/matrix.hpp> + +#include <viennacl/linalg/inner_prod.hpp> +#include 
<viennacl/linalg/norm_2.hpp> +#include <viennacl/linalg/sum.hpp> +#include <viennacl/linalg/scalar_operations.hpp> +#include <viennacl/linalg/vector_operations.hpp> +#include <viennacl/linalg/matrix_operations.hpp> -// Some forward declarations of utility functions that only exist here. -void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx); -void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx); -void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx); +#include <viennacl/ocl/kernel.hpp> + +using viennacl::ocl::get_context; +using viennacl::ocl::enqueue; + +namespace singa { // ************************************** // Element-wise functions @@ -37,436 +47,250 @@ void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* template<> void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_abs"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_fabs"); + + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); + + v_out = v_in; + enqueue(kernel((cl_int)num, v_in, v_out)); } template<> void Add<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_add_scalar"; - auto 
kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, x); - kernel.setArg(2, inbuf); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); + + v_out = v_in + x_in; } template<> void Add<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_add"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data())); - cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, in1buf); - kernel.setArg(2, in2buf); - kernel.setArg(3, outbuf); + viennacl::vector<float> v_in1((const cl_mem)in1->data(), num); + viennacl::vector<float> v_in2((const cl_mem)in2->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = v_in1 + v_in2; } template<> -void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_clamp"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = 
*(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, low); - kernel.setArg(2, high); - kernel.setArg(3, inbuf); - kernel.setArg(4, outbuf); +void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high, + const Block* in, Block* out, Context* ctx) { + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_clamp"); + + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + enqueue(kernel((cl_int)num, low, high, v_in, v_out)); } template<> void Div<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_divide_scalar_matx"; - auto kernel = ctx->kernels->at(kname); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, x); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_div(v_in, x_in); } template<> void Div<float, lang::Opencl>(const size_t num, const float x, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - 
std::string kname = "clkernel_divide_scalar_xmat"; - auto kernel = ctx->kernels->at(kname); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, x); - kernel.setArg(2, inbuf); - kernel.setArg(3, outbuf); + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_div(x_in, v_in); } template<> void Div<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_divide"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data())); - cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, in1buf); - kernel.setArg(2, in2buf); - kernel.setArg(3, outbuf); + viennacl::vector<float> v_in1((const cl_mem)in1->data(), num); + viennacl::vector<float> v_in2((const cl_mem)in2->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_div(v_in1, v_in2); } template<> void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = 
"clkernel_eltmult_scalar"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, x); - kernel.setArg(2, inbuf); - kernel.setArg(3, outbuf); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_prod(v_in, x_in); } template<> void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; + viennacl::vector<float> v_in1((const cl_mem)in1->data(), num); + viennacl::vector<float> v_in2((const cl_mem)in2->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - std::string kname = "clkernel_eltmult"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data())); - cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, in1buf); - kernel.setArg(2, in2buf); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_prod(v_in1, v_in2); } template<> void Exp<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_exp"; - auto kernel = 
ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); + + v_out = viennacl::linalg::element_exp(v_in); } template<> void LE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_le"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, x); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_le"); + + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); + viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); + + enqueue(kernel((cl_int)num, in_buf, x, out_buf)); } template<> void Log<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_log"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - 
kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); + + v_out = viennacl::linalg::element_log(v_in); } template<> void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_lt"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, x); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_lt"); + + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); + viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); + + enqueue(kernel((cl_int)num, in_buf, x, out_buf)); } template<> void GE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_ge"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, x); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = 
get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_ge"); + + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); + viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); + + enqueue(kernel((cl_int)num, in_buf, x, out_buf)); } template<> void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_gt"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, x); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_gt"); + + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); + viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); + + enqueue(kernel((cl_int)num, in_buf, x, out_buf)); } template<> void Pow<float, lang::Opencl>(const size_t num, const Block* in, float x, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_pow_scalar"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, x); - kernel.setArg(2, inbuf); - kernel.setArg(3, outbuf); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> 
v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_pow(v_in, x_in); } template<> void Pow<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_pow"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data())); - cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, in1buf); - kernel.setArg(2, in2buf); - kernel.setArg(3, outbuf); + viennacl::vector<float> v_in1((const cl_mem)in1->data(), num); + viennacl::vector<float> v_in2((const cl_mem)in2->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_pow(v_in1, v_in2); } template<> void ReLU<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_relu"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_relu"); + + viennacl::vector<float> 
in_buf((const cl_mem)in->data(), num); + viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); + + enqueue(kernel((cl_int)num, in_buf, out_buf)); } + template<> void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; + auto ocl_ctx = get_context(ctx->vcl_ctx_id); - std::string kname = "clkernel_set"; - auto kernel = ctx->kernels->at(kname); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, x); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::scalar_vector<float>(num, x, ocl_ctx); } template<> void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_sigmoid"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + + const viennacl::vector<float> zero = viennacl::zero_vector<float>(num, ocl_ctx); + const viennacl::vector<float> one = viennacl::scalar_vector<float>(num, 1.0f, ocl_ctx); + + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); + + v_out = viennacl::linalg::element_div(one, viennacl::linalg::element_exp(zero - v_in) + one); } template<> void Sign<float, 
lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_sign"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_abs"); + + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); + viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); + + enqueue(kernel(num, in_buf, out_buf)); } template<> void Sqrt<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_sqrt"; - auto kernel = ctx->kernels->at(kname); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_sqrt(v_in); } @@ -478,168 +302,85 @@ void Square<float, lang::Opencl>(const size_t num, const Block* in, Block* out, template<> void Sub<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_subtract_scalar"; - auto 
kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, x); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + Add<float, lang::Opencl>(num, in, -x, out, ctx); } template<> void Sub<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; + viennacl::vector<float> v_in1((const cl_mem)in1->data(), num); + viennacl::vector<float> v_in2((const cl_mem)in2->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - std::string kname = "clkernel_subtract"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data())); - cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, in1buf); - kernel.setArg(2, in2buf); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = v_in1 - v_in2; } template<> void Sum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) { - cl_int status = CL_SUCCESS; + viennacl::vector<float> v_in((const cl_mem)in->data(), num); - std::string kname = "clkernel_reduce"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - - size_t size = sizeof(float) * num; - cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status); - OCL_CHECK(status, "Failed to create buffer!"); - - kernel.setArg(0, 
(cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outval); - kernel.setArg(3, cl::Local(size)); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); - - float* temp = new float[num]; - status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp); - OCL_CHECK(status, "Failed to read from buffer!"); - out[0] = temp[0]; - delete temp; + out[0] = viennacl::linalg::sum(v_in); } template<> void Tanh<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_tanh"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); + viennacl::vector<float> v_in((const cl_mem)in->data(), num); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_tanh(v_in); } // ************************************** // Random functions // ************************************** -/// Seed value required for generating distributions. -static unsigned int seed[4] = {0, 32, 42, 888}; /// Number of generation rounds used in the current algorithm. 
static cl_uint rounds = 8; template<> void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "PRNG_threefry4x32_bernoulli"; - auto kernel = ctx->kernels->at(kname); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_bernoulli"); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, outbuf); - kernel.setArg(1, seed); - kernel.setArg(2, 0.0f); // inf - kernel.setArg(3, 1.0f); // sup - kernel.setArg(4, p); // threshold - kernel.setArg(5, rounds); - kernel.setArg(6, cl_uint(num) / 4); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888}; + + enqueue(kernel(v_out, seed, 0.0f, 1.0f, p, rounds, cl_uint(num / 4))); } template<> void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "PRNG_threefry4x32_gaussian"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, outbuf); - kernel.setArg(1, seed); - kernel.setArg(2, mean); // E - kernel.setArg(3, std); // V - kernel.setArg(4, rounds); - kernel.setArg(5, cl_uint(num) / 4); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_gaussian"); + + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); + + viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888}; + + 
enqueue(kernel(v_out, seed, mean, std, rounds, cl_uint(num/4))); } template<> void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "PRNG_threefry4x32_uniform"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_uniform"); - status = kernel.setArg(0, outbuf); OCL_CHECK(status, "kernel arg 0"); - status = kernel.setArg(1, seed); OCL_CHECK(status, "kernel arg 1"); - status = kernel.setArg(2, low); OCL_CHECK(status, "kernel arg 2"); - status = kernel.setArg(3, high); OCL_CHECK(status, "kernel arg 3"); - status = kernel.setArg(4, rounds); OCL_CHECK(status, "kernel arg 4"); - status = kernel.setArg(5, cl_uint(num) / 4); OCL_CHECK(status, "kernel arg 5"); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888}; + + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); + + enqueue(kernel(v_out, seed, low, high, rounds, cl_uint(num/4))); } // ********************************************************* // BLAS functions, ref to http://docs.nvidia.com/cuda/cublas // ********************************************************* - +/* template<> void Amax<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, Context* ctx) { cl_int status = CL_SUCCESS; @@ -699,7 +440,7 @@ void Amin<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, C delete temp; } - + template<> void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) { cl_int status = CL_SUCCESS; @@ -727,256 +468,141 @@ void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Co 
out[0] = temp[0]; delete temp; } - - +*/ +/// out = alpha * in + out template<> void Axpy<float, lang::Opencl>(const size_t num, const float alpha, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_axpy"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, alpha); - kernel.setArg(2, inbuf); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + viennacl::vector<float> inbuf((const cl_mem)in->data(), num); + viennacl::vector<float> outbuf(static_cast<cl_mem>(out->mutable_data()), num); + + outbuf += alpha * inbuf; } - +/// out = ||in||_2^2, i.e, L2 norm. template<> void Nrm2<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_nrm2"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - - size_t size = sizeof(float) * num; - cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status); - OCL_CHECK(status, "Failed to create buffer!"); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, inbuf); - kernel.setArg(2, outval); - kernel.setArg(3, cl::Local(sizeof(float) * (std::pow(2, num)))); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); - - float* temp = new float[num]; - status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp); - OCL_CHECK(status, "Failed to read from buffer!"); - out[0] = temp[0]; - delete temp; + viennacl::vector<float> inbuf((const cl_mem)in->data(), num); + + out[0] = viennacl::linalg::norm_2(inbuf); 
} template<> void Scale<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_scale"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, x); - kernel.setArg(2, outbuf); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::element_prod(v_out, x_in); } template<> void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block *in2, float *out, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_dot"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data())); - cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data())); - - size_t size = sizeof(float) * num; - cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status); - OCL_CHECK(status, "Failed to create buffer!"); - - kernel.setArg(0, (cl_int)num); - kernel.setArg(1, in1buf); - kernel.setArg(2, in2buf); - kernel.setArg(3, outval); - kernel.setArg(4, cl::Local(size)); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); - - float* temp = new float[num]; - status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp); - OCL_CHECK(status, "Failed to read from buffer!"); - out[0] = temp[0]; - delete temp; + viennacl::vector<float> in1_buf((const cl_mem)in1->data(), num); + viennacl::vector<float> in2_buf((const cl_mem)in2->data(), num); + + out[0] = 
viennacl::linalg::inner_prod(in1_buf, in2_buf); } - +/// out = alpha * A * v + beta * out. template<> void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, const float alpha, const Block *A, const Block *v, const float beta, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_gemv"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data())); - cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)m); - kernel.setArg(1, (cl_int)n); - kernel.setArg(2, alpha); - kernel.setArg(3, Abuf); - kernel.setArg(4, vbuf); - kernel.setArg(5, beta); - kernel.setArg(6, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + + viennacl::matrix<float> A_in((const cl_mem)A->data(), m, n); + viennacl::vector<float> v_in((const cl_mem)v->data(), trans ? m : n); + viennacl::vector<float> o_in(static_cast<cl_mem>(out->mutable_data()), trans ? n : m); + + if (trans) viennacl::trans(A_in); + + o_in *= beta; + o_in += alpha * viennacl::linalg::prod(A_in, v_in); } +/// multiply a matrix with a diagnoal matrix constructed using values from 'v'. 
+/// if matrix_lef_side is true, do M*v; else do v*M template<> void DGMM<float, lang::Opencl>(bool side_right, const size_t nrow, const size_t ncol, const Block *M, const Block *v, Block *out, Context *ctx) { - cl_int status = CL_SUCCESS; - - cl::Buffer Mbuf = *(static_cast<cl::Buffer*>(M->mutable_data())); - cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - std::string kname; + viennacl::matrix<float> M_buf((const cl_mem)M->data(), nrow, ncol); + viennacl::vector<float> v_buf((const cl_mem)v->data(), nrow); + viennacl::matrix<float> out_buf(static_cast<cl_mem>(out->mutable_data()), nrow, ncol); + + auto diag = viennacl::diag(v_buf); + if (side_right) { - DiagVec_Right(ncol, vbuf, vbuf, ctx); - kname = "clkernel_dgmm_right"; + out_buf = viennacl::linalg::prod(diag, M_buf); } else { - DiagVec_Left(nrow, vbuf, vbuf, ctx); - kname = "clkernel_dgmm_left"; + out_buf = viennacl::linalg::prod(M_buf, diag); } - - auto kernel = ctx->kernels->at(kname); - - kernel.setArg(0, (cl_int)nrow); - kernel.setArg(1, (cl_int)ncol); - kernel.setArg(2, Mbuf); - kernel.setArg(3, vbuf); - kernel.setArg(4, outbuf); - kernel.setArg(5, cl::Local(sizeof(float) * nrow * ncol)); - - cl::NDRange global(nrow); // Only nrow because current implementation is 1 dimensional -// cl::NDRange local(); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global); - OCL_CHECK(status, "Failed to enqueue kernel function!"); } - +/// C = alpha * A * B + beta * C. 
template<> void GEMM<float, lang::Opencl>(const bool transA, const bool transB, const size_t nrowA, const size_t ncolB, const size_t ncolA, const float alpha, const Block *A, const Block *B, const float beta, Block *C, Context *ctx) { - cl_int status = CL_SUCCESS; - std::string kname = "clkernel_gemm"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data())); - cl::Buffer Bbuf = *(static_cast<cl::Buffer*>(B->mutable_data())); - cl::Buffer Cbuf = *(static_cast<cl::Buffer*>(C->mutable_data())); - - // If matrix A needs to be transposed, do it. - if (transA) - Transpose(nrowA, ncolA, Abuf, Abuf, ctx); - - // If vector B needs to be transposed, do it. - if (transB) - Transpose(nrowA, ncolB, Bbuf, Bbuf, ctx); - - kernel.setArg(0, (cl_int)nrowA); - kernel.setArg(1, (cl_int)ncolB); - kernel.setArg(2, (cl_int)ncolA); - kernel.setArg(3, alpha); - kernel.setArg(4, Abuf); - kernel.setArg(5, Bbuf); - kernel.setArg(6, beta); - kernel.setArg(7, Cbuf); - kernel.setArg(8, cl::Local(sizeof(float) * nrowA * ncolB)); - kernel.setArg(9, cl::Local(sizeof(float) * nrowA * ncolB)); - -// TODO: Try to make the work group size a power of 2 given an arbitrary matrix. 
- cl::NDRange global(nrowA, ncolB); - cl::NDRange local(nrowA, ncolB); + viennacl::matrix<float> A_buf((const cl_mem)A->data(), nrowA, ncolA); + viennacl::matrix<float> B_buf((const cl_mem)B->data(), ncolA, ncolB); + viennacl::matrix<float> C_buf(static_cast<cl_mem>(C->mutable_data()), nrowA, ncolB); + + if (transA) viennacl::trans(A_buf); + if (transB) viennacl::trans(B_buf); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global, local); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + C_buf *= beta; + C_buf += alpha * viennacl::linalg::prod(A_buf, B_buf); } + template <> void ComputeCrossEntropy<float, lang::Opencl>(const size_t batchsize, const size_t dim, const Block *p, const Block *t, Block *loss, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_crossentropy"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data())); - cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data())); - cl::Buffer lossbuf = *(static_cast<cl::Buffer*>(loss->mutable_data())); - - kernel.setArg(0, (cl_uint)batchsize); - kernel.setArg(1, (cl_uint)dim); - kernel.setArg(2, pbuf); - kernel.setArg(3, tbuf); - kernel.setArg(4, lossbuf); - - cl::NDRange global(batchsize); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_crossentropy"); + + viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize); + viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize); + viennacl::vector<float> loss_buf(static_cast<cl_mem>(loss->mutable_data()), batchsize); + + enqueue(kernel((cl_uint)batchsize, (cl_uint)dim, p_buf, t_buf, loss_buf)); } + template <> void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const size_t dim, const Block *p, const 
Block *t, Block *grad, Context *ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_softmaxentropy"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data())); - cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data())); - cl::Buffer gradbuf = *(static_cast<cl::Buffer*>(grad->mutable_data())); - - kernel.setArg(0, (cl_uint)batchsize); - kernel.setArg(1, (cl_uint)dim); - kernel.setArg(2, pbuf); - kernel.setArg(3, tbuf); - kernel.setArg(4, gradbuf); + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_softmaxentropy"); + + viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize); + viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize); + viennacl::vector<float> grad_buf(static_cast<cl_mem>(grad->mutable_data()), batchsize); + + enqueue(kernel((cl_uint)batchsize, (cl_uint)dim, p_buf, t_buf, grad_buf)); +} - cl::NDRange global(batchsize); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global); - OCL_CHECK(status, "Failed to enqueue kernel function!"); +template<> +void RowMax<float, lang::Opencl>(const size_t nrow, const size_t ncol, + const Block *in, Block *out, Context *ctx) { + auto ocl_ctx = get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_rowmax"); + +// kernel.global_work_size(0, nrow); + + viennacl::matrix<float> in_buf((const cl_mem)in->data(), nrow, ncol); + viennacl::vector<float> outbuf(static_cast<cl_mem>(out->mutable_data()), nrow); + + enqueue(kernel((cl_uint)nrow, (cl_uint)ncol, in_buf, outbuf)); } // ************************************** @@ -985,129 +611,46 @@ void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const s /* template<> void AddCol<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) { - std::string kname = 
"clkernel_addcol"; - auto kernel = ctx->kernels->at(kname); - kernel.setArg(0, (cl_int)nrow); - kernel.setArg(1, (cl_int)ncol); - kernel.setArg(2, static_cast<const float*>(A->mutable_data())); - kernel.setArg(3, static_cast<const float*>(v->mutable_data())); - kernel.setArg(3, static_cast<float*>(out->mutable_data())); - ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol)); } + template<> void AddRow<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) { - std::string kname = "clkernel_addrow"; - auto kernel = ctx->kernels->at(kname); - kernel.setArg(0, (cl_int)nrow); - kernel.setArg(1, (cl_int)ncol); - kernel.setArg(2, static_cast<const float*>(A->mutable_data())); - kernel.setArg(3, static_cast<const float*>(v->mutable_data())); - kernel.setArg(3, static_cast<float*>(out->mutable_data())); - ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol)); } + template<> void Outer<float, lang::Opencl>(const size_t m, const size_t n, const Block* lhs, const Block* rhs, Block* out, Context* ctx) { - std::string kname = "clkernel_outerproduct"; - auto kernel = ctx->kernels->at(kname); - kernel.setArg(0, (cl_int)m); - kernel.setArg(1, (cl_int)n); - kernel.setArg(2, static_cast<const float*>(lhs->data())); - kernel.setArg(3, static_cast<const float*>(rhs->data())); - kernel.setArg(4, static_cast<float*>(out->mutable_data())); - - ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n)); + viennacl::vector<float> lhs_in((const cl_mem)lhs->data(), m); + viennacl::vector<float> rhs_in((const cl_mem)rhs->data(), n); + viennacl::matrix<float> out_buf(static_cast<cl_mem>(out->mutable_data()), m, n); + + out_buf = viennacl::linalg::outer_prod(lhs_in, rhs_in); } -template<> -void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) { - std::string kname = "clkernel_sumcol"; - auto 
kernel = ctx->kernels->at(kname); - kernel.setArg(0, (cl_int)nrow); - kernel.setArg(1, (cl_int)ncol); - kernel.setArg(2, static_cast<const float*>(in->mutable_data())); - kernel.setArg(3, static_cast<float*>(out->mutable_data())); - ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol)); -}*/ -/* template<> -void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_sumrow"; - auto kernel = ctx->kernels->at(kname); - - cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)nrow); - kernel.setArg(1, (cl_int)ncol); - kernel.setArg(2, inbuf); - kernel.setArg(3, outbuf); - kernel.setArg(4, cl::Local(sizeof(float) * nrow * ncol)); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol)); -} -*/ - - -#define BLOCK_DIM 16 - -void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_transpose"; - auto kernel = ctx->kernels->at(kname); - - kernel.setArg(0, (cl_uint)nrow); - kernel.setArg(1, (cl_uint)ncol); - kernel.setArg(2, in); - kernel.setArg(3, out); - kernel.setArg(4, cl::Local((BLOCK_DIM + 1) * BLOCK_DIM)); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); -} - -#undef BLOCK_DIM - - -/// This is a utility function that transforms a single-row vector into a diagonal matrix. -/// For example, a vector of size n will become a matrix of size n*n where only the positions nx == ny will have values. 
-void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) { - cl_int status = CL_SUCCESS; - - std::string kname = "clkernel_diagvec_left"; - auto kernel = ctx->kernels->at(kname); +void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) { + viennacl::matrix<float> m_in((const cl_mem)in->data(), nrow, ncol); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), nrow); - kernel.setArg(0, (cl_uint)size); - kernel.setArg(1, in); - kernel.setArg(2, out); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); + v_out = viennacl::linalg::column_sum(m_in); } -void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) { - cl_int status = CL_SUCCESS; - std::string kname = "clkernel_diagvec_right"; - auto kernel = ctx->kernels->at(kname); - - kernel.setArg(0, (cl_uint)size); - kernel.setArg(1, in); - kernel.setArg(2, out); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size)); - OCL_CHECK(status, "Failed to enqueue kernel function!"); +template<> +void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) { + viennacl::matrix<float> m_in((const cl_mem)in->data(), nrow, ncol); + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), ncol); + + v_out = viennacl::linalg::column_sum(m_in); } +*/ } // namespace singa #endif // USE_OPENCL -#endif // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_ +#endif // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/595302a3/src/utils/opencl_utils.cc ---------------------------------------------------------------------- diff --git a/src/utils/opencl_utils.cc b/src/utils/opencl_utils.cc deleted file mode 100644 index e4fe69b..0000000 --- 
a/src/utils/opencl_utils.cc +++ /dev/null @@ -1,63 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -#include "singa/utils/opencl_utils.h" - -#ifdef USE_OPENCL - -void PrintDeviceInfo(const cl::Device &dev) { - cl_int status = CL_SUCCESS; - - LOG(INFO) << "\tDevice type: " << dev.getInfo<CL_DEVICE_TYPE>(&status); - LOG(INFO) << "\tUnified memory: " << dev.getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>(&status); - LOG(INFO) << "\tClock speed (MHz): " << dev.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(&status); - LOG(INFO) << "\tECC memory: " << dev.getInfo<CL_DEVICE_ERROR_CORRECTION_SUPPORT>(&status); - LOG(INFO) << "\tLittle endian: " << dev.getInfo<CL_DEVICE_ENDIAN_LITTLE>(&status); - LOG(INFO) << "\tCompute units: " << dev.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&status); - LOG(INFO) << "\tMax work grp size: " << dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&status); -//LOG(INFO) << "\tMax work item size: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(&status); - LOG(INFO) << "\tMax item dimension: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>(&status); - LOG(INFO) << "\tQueue properties: " << 
dev.getInfo<CL_DEVICE_QUEUE_PROPERTIES>(&status); - LOG(INFO) << "\tExecution capabilities: " << dev.getInfo<CL_DEVICE_EXECUTION_CAPABILITIES>(&status); - LOG(INFO) << "\tMax mem alloc size: " << dev.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>(&status); - LOG(INFO) << "\tGlobal mem size: " << dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>(&status); - LOG(INFO) << "\tLocal mem size: " << dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(&status); - LOG(INFO) << "\n"; - - OCL_CHECK(status, "Failed to retrieve device information!"); -} - - -void PrintPlatformInfo(const cl::Platform &p) { - cl_int status = CL_SUCCESS; - - LOG(INFO) << "\tName: " << p.getInfo<CL_PLATFORM_NAME>(&status); - LOG(INFO) << "\tProfile: " << p.getInfo<CL_PLATFORM_PROFILE>(&status); - LOG(INFO) << "\tVersion: " << p.getInfo<CL_PLATFORM_VERSION>(&status); - LOG(INFO) << "\tVendor: " << p.getInfo<CL_PLATFORM_VENDOR>(&status); - LOG(INFO) << "\tExtensions: " << p.getInfo<CL_PLATFORM_EXTENSIONS>(&status); - LOG(INFO) << "\n"; - - OCL_CHECK(status, "Failed to retrieve platform information!"); -} - - -#endif // USE_OPENCL
