Repository: incubator-singa Updated Branches: refs/heads/dev 4e7f3c13b -> 464dcda63
SINGA-173 OpenCL device support and implementation. Added option to Cmake to skip building tests for OpenCL. More unit tests, more bugfixes. Adding distribution.cl, and unit tests. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/57f3a1eb Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/57f3a1eb Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/57f3a1eb Branch: refs/heads/dev Commit: 57f3a1eb3fb90ff3806ba7eade33b7d24170bf4d Parents: 35c8930 Author: Tan Li Boon <[email protected]> Authored: Sun Jul 17 15:44:58 2016 +0800 Committer: Tan Li Boon <[email protected]> Committed: Tue Jul 26 18:39:08 2016 +0800 ---------------------------------------------------------------------- .travis.yml | 2 +- CMakeLists.txt | 3 +- include/singa/core/device.h | 67 ++ include/singa/core/platform.h | 105 --- src/core/device/opencl_device.cc | 3 +- src/core/device/platform.cc | 8 + src/core/tensor/distribution.cl | 1020 ++++++++++++++++++++++++++++ src/core/tensor/tensor_math_opencl.cl | 57 +- src/core/tensor/tensor_math_opencl.h | 79 ++- test/CMakeLists.txt | 5 + test/singa/test_opencl.cc | 546 +++++++++++++-- 11 files changed, 1690 insertions(+), 205 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/.travis.yml ---------------------------------------------------------------------- diff --git a/.travis.yml b/.travis.yml index f7434d8..8b1f89c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ before_install: before_script: - mkdir build && cd build - - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF + - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF -DBUILD_OPENCL_TESTS=OFF script: - make http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index bd39a0a..8f862f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,8 @@ OPTION(USE_CUDNN "Use Cudnn libs" ON) OPTION(USE_OPENCV "Use opencv" OFF) OPTION(USE_LMDB "Use LMDB libs" OFF) OPTION(USE_PYTHON "Generate py wrappers" ON) -OPTION(USE_OPENCL "Use OpenCL" ON) +OPTION(USE_OPENCL "Use OpenCL" OFF) +OPTION(BUILD_OPENCL_TESTS "Build OpenCL tests" OFF) INCLUDE("cmake/Dependencies.cmake") INCLUDE("cmake/Utils.cmake") http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/include/singa/core/device.h ---------------------------------------------------------------------- diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 4c775a1..a7f6488 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -189,6 +189,73 @@ class CudaGPU : public Device { #endif // USE_CUDA + + +/// This class queries all available calculating devices on a given machine +/// grouped according to manufacturer or device drivers. All methods should be static. +/// If CUDA or OPENCL are not enabled, then the respective related methods should +/// return something that indicates their absence (for example, 0 devices); +/// however they should always be available regardless of compile-time switches. +class Platform { +public: + + /// Constructor. + Platform(); + + /// Return the number of total available GPUs + static int GetNumGPUs(); + + /// Return the device IDs of available GPUs. + /// TODO(wangwei) return the IDs according to free memory in decending order + static const std::vector<int> GetGPUIDs(); + + static const std::pair<size_t, size_t> GetGPUMemSize(const int device); + + /// Return the memory of a GPU <free, total> + static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize(); + + /// Return a string containing all hardware info, e.g., version, memory size. + static const std::string DeviceQuery(int id, bool verbose = false); + + /// Create a set of CudaGPU Device using 'num_devices' free GPUs. + static const std::vector<std::shared_ptr<Device>> + CreateCudaGPUs(const size_t num_devices, size_t init_size = 0); + + /// Create a set of CudaGPU Device using given GPU IDs. + static const std::vector<std::shared_ptr<Device>> + CreateCudaGPUs(const std::vector<int> &devices, size_t init_size = 0); + + /// Create a \p num_devices set of valid OpenCL devices, regardless of platforms. + /// If there are fewer valid devices than requested, then this method will return as many as possible. + /// If OpenCL is not in use, this method will return an empty array. + const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const size_t num_devices); + + /// Create a set of valid OpenCL devices, regardless of platforms, assigning \p id to each device in sequence. + /// If there are fewer valid devices than requested, then this method will return as many as possible. + /// If OpenCL is not in use, this method will return an empty array. + const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const vector<int>& id); + + /// This function is implementd by Caffe (http://caffe.berkeleyvision.org/). + /// This function checks the availability of GPU #device_id. + /// It attempts to create a context on the device by calling cudaFree(0). + /// cudaSetDevice() alone is not sufficient to check the availability. + /// It lazily records device_id, however, does not initialize a + /// context. So it does not know if the host thread has the permission to use + /// the device or not. + /// + /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS + /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess + /// even if the device is exclusively occupied by another process or thread. + /// Cuda operations that initialize the context are needed to check + /// the permission. cudaFree(0) is one of those with no side effect, + /// except the context initialization. + static bool CheckDevice(const int device_id); + + +private: + cl::Platform clPlatform; +}; + } // namespace singa #endif // SINGA_CORE_DEVICE_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/include/singa/core/platform.h ---------------------------------------------------------------------- diff --git a/include/singa/core/platform.h b/include/singa/core/platform.h deleted file mode 100644 index ff1bbea..0000000 --- a/include/singa/core/platform.h +++ /dev/null @@ -1,105 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SINGA_CORE_PLATFORM_H_ -#define SINGA_CORE_PLATFORM_H_ - -#include <memory> -#include <vector> - -#include "singa/core/device.h" -#include "singa/singa_config.h" - -#ifdef USE_CUDA -#include "singa/utils/cuda_utils.h" -#endif // USE_CUDA - -#ifdef USE_OPENCL -#include <cl/cl2.hpp> -#endif // USE_OPENCL - -namespace singa { - -/// This class queries all available calculating devices on a given machine -/// grouped according to manufacturer or device drivers. All methods should be static. -/// If CUDA or OPENCL are not enabled, then the respective related methods should -/// return something that indicates their absence (for example, 0 devices); -/// however they should always be available regardless of compile-time switches. -class Platform { -public: - - /// Constructor. - Platform(); - - /// Return the number of total available GPUs - static int GetNumGPUs(); - - /// Return the device IDs of available GPUs. - /// TODO(wangwei) return the IDs according to free memory in decending order - static const std::vector<int> GetGPUIDs(); - - static const std::pair<size_t, size_t> GetGPUMemSize(const int device); - - /// Return the memory of a GPU <free, total> - static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize(); - - /// Return a string containing all hardware info, e.g., version, memory size. - static const std::string DeviceQuery(int id, bool verbose = false); - - /// Create a set of CudaGPU Device using 'num_devices' free GPUs. - static const std::vector<std::shared_ptr<Device>> - CreateCudaGPUs(const size_t num_devices, size_t init_size = 0); - - /// Create a set of CudaGPU Device using given GPU IDs. - static const std::vector<std::shared_ptr<Device>> - CreateCudaGPUs(const std::vector<int> &devices, size_t init_size = 0); - - /// Create a \p num_devices set of valid OpenCL devices, regardless of platforms. - /// If there are fewer valid devices than requested, then this method will return as many as possible. - /// If OpenCL is not in use, this method will return an empty array. - const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const size_t num_devices); - - /// Create a set of valid OpenCL devices, regardless of platforms, assigning \p id to each device in sequence. - /// If there are fewer valid devices than requested, then this method will return as many as possible. - /// If OpenCL is not in use, this method will return an empty array. - const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const vector<int>& id); - - /// This function is implementd by Caffe (http://caffe.berkeleyvision.org/). - /// This function checks the availability of GPU #device_id. - /// It attempts to create a context on the device by calling cudaFree(0). - /// cudaSetDevice() alone is not sufficient to check the availability. - /// It lazily records device_id, however, does not initialize a - /// context. So it does not know if the host thread has the permission to use - /// the device or not. - /// - /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS - /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess - /// even if the device is exclusively occupied by another process or thread. - /// Cuda operations that initialize the context are needed to check - /// the permission. cudaFree(0) is one of those with no side effect, - /// except the context initialization. - static bool CheckDevice(const int device_id); - - -private: - cl::Platform clPlatform; -}; - -} // namespace singa - -#endif // SINGA_CORE_PLATFORM_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/device/opencl_device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc index 053ac4f..d4d1fe5 100644 --- a/src/core/device/opencl_device.cc +++ b/src/core/device/opencl_device.cc @@ -44,13 +44,12 @@ OpenclDevice::OpenclDevice(int id, int num_executors) std::vector<cl::Platform> platforms; status = cl::Platform::get(&platforms); OCL_CHECK(status, "Failed to find any OpenCL platforms!"); - + std::vector<cl::Device> devices; status = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices); OCL_CHECK(status, "Failed to get list of devices from platform!"); this->this_device = cl::Device(devices[0]); - this->ocl_ctx = cl::Context(this_device, nullptr, nullptr, nullptr, &status); OCL_CHECK(status, "Failed to create context!"); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/device/platform.cc ---------------------------------------------------------------------- diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc index 984df69..80826c0 100644 --- a/src/core/device/platform.cc +++ b/src/core/device/platform.cc @@ -16,6 +16,14 @@ * limitations under the License. */ +#ifdef USE_CUDA +#include "singa/utils/cuda_utils.h" +#endif // USE_CUDA + +#ifdef USE_OPENCL +#include <cl/cl2.hpp> +#endif // USE_OPENCL + #include "singa/core/device.h" #include "singa/singa_config.h" http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/tensor/distribution.cl ---------------------------------------------------------------------- diff --git a/src/core/tensor/distribution.cl b/src/core/tensor/distribution.cl new file mode 100644 index 0000000..ce298c0 --- /dev/null +++ b/src/core/tensor/distribution.cl @@ -0,0 +1,1020 @@ +// This code is adapted from https://github.com/amd/OpenCL-caffe/blob/stable/src/caffe/ocl/random.cl + +//Note: random generator has two parts +//first part: the open sourced threefy random generator kernel from DE Shaw Research +//second part. we wrap the kernel up to generate uniform, bernoulli and gaussion distribution generators. + +//begin: the open sourced random generator from DE Shaw Research +//https://www.deshawresearch.com/resources_random123.html +typedef uint uint32_t; + +struct r123array4x32 { + uint32_t v[4]; +}; + +enum r123_enum_threefry32x4 { + R_32x4_0_0 = 10, + R_32x4_0_1 = 26, + R_32x4_1_0 = 11, + R_32x4_1_1 = 21, + R_32x4_2_0 = 13, + R_32x4_2_1 = 27, + R_32x4_3_0 = 23, + R_32x4_3_1 = 5, + R_32x4_4_0 = 6, + R_32x4_4_1 = 20, + R_32x4_5_0 = 17, + R_32x4_5_1 = 11, + R_32x4_6_0 = 25, + R_32x4_6_1 = 10, + R_32x4_7_0 = 18, + R_32x4_7_1 = 20 +}; + +inline uint32_t RotL_32(uint32_t x, unsigned int N) { + return (x << (N & 31)) | (x >> ((32 - N) & 31)); +} + +typedef struct r123array4x32 threefry4x32_ctr_t; +typedef struct r123array4x32 threefry4x32_key_t; +typedef struct r123array4x32 threefry4x32_ukey_t; + +inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, threefry4x32_ctr_t in, threefry4x32_key_t k) { + threefry4x32_ctr_t X; + uint32_t ks[4 + 1]; + int i; + ks[4] = 0x1BD11BDA; + + { + ks[0] = k.v[0]; + X.v[0] = in.v[0]; + ks[4] ^= k.v[0]; + + ks[1] = k.v[1]; + X.v[1] = in.v[1]; + ks[4] ^= k.v[1]; + + ks[2] = k.v[2]; + X.v[2] = in.v[2]; + ks[4] ^= k.v[2]; + + ks[3] = k.v[3]; + X.v[3] = in.v[3]; + ks[4] ^= k.v[3]; + } + + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + + if (Nrounds > 0) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 1) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 2) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 3) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 3) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 1; + } + + if (Nrounds > 4) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 5) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 6) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 7) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 7) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 2; + } + + if (Nrounds > 8) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 9) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 10) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 11) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 11) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 3; + } + + if (Nrounds > 12) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 13) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 14) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 15) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 15) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 4; + } + + if (Nrounds > 16) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 17) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 18) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 19) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 19) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 5; + } + + if (Nrounds > 20) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 21) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 22) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 23) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 23) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 6; + } + + if (Nrounds > 24) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 25) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 26) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 27) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 27) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 7; + } + + if (Nrounds > 28) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 29) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 30) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 31) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 31) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 8; + } + + if (Nrounds > 32) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 33) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 34) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 35) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 35) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 9; + } + + if (Nrounds > 36) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 37) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 38) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 39) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 39) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 10; + } + + if (Nrounds > 40) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 41) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + if (Nrounds > 42) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 43) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 43) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 11; + } + + if (Nrounds > 44) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 45) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 46) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 47) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 47) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 12; + } + + if (Nrounds > 48) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 49) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 50) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 51) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 51) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 13; + } + + if (Nrounds > 52) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 53) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 54) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 55) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 55) { + X.v[0] += ks[4]; + X.v[1] += ks[0]; + X.v[2] += ks[1]; + X.v[3] += ks[2]; + X.v[4 - 1] += 14; + } + + if (Nrounds > 56) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 57) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 58) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 59) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 59) { + X.v[0] += ks[0]; + X.v[1] += ks[1]; + X.v[2] += ks[2]; + X.v[3] += ks[3]; + X.v[4 - 1] += 15; + } + + if (Nrounds > 60) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 61) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 62) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 63) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 63) { + X.v[0] += ks[1]; + X.v[1] += ks[2]; + X.v[2] += ks[3]; + X.v[3] += ks[4]; + X.v[4 - 1] += 16; + } + + if (Nrounds > 64) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_0_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_0_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 65) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_1_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_1_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 66) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_2_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_2_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 67) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_3_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_3_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 67) { + X.v[0] += ks[2]; + X.v[1] += ks[3]; + X.v[2] += ks[4]; + X.v[3] += ks[0]; + X.v[4 - 1] += 17; + } + + if (Nrounds > 68) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_4_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_4_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 69) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_5_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_5_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 70) { + X.v[0] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_6_0); + X.v[1] ^= X.v[0]; + X.v[2] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_6_1); + X.v[3] ^= X.v[2]; + } + + if (Nrounds > 71) { + X.v[0] += X.v[3]; + X.v[3] = RotL_32(X.v[3], R_32x4_7_0); + X.v[3] ^= X.v[0]; + X.v[2] += X.v[1]; + X.v[1] = RotL_32(X.v[1], R_32x4_7_1); + X.v[1] ^= X.v[2]; + } + + if (Nrounds > 71) { + X.v[0] += ks[3]; + X.v[1] += ks[4]; + X.v[2] += ks[0]; + X.v[3] += ks[1]; + X.v[4 - 1] += 18; + } + return X; +} +//end: the open sourced random generator from DE Shaw Research + +// ************************** +// BERNOULLI DISTRIBUTION +// ************************** + +__kernel void PRNG_threefry4x32_bernoulli( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + float inf, float sup, + float threshold, + uint nrounds, uint numrandom) { + + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) { + random4 = threefry4x32_R(nrounds, ctr, ukey); + float4 frnd; + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f; + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f; + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f; + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) < threshold ? 1.0f : 0.0f; + randomnumber[gdx] = frnd; + } +} + +// ************************** +// UNIFORM DISTRIBUTION (float) +// ************************** + +__kernel void PRNG_threefry4x32_uniform( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + float inf, float sup, + uint nrounds, uint numrandom) { + + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) { + random4 = threefry4x32_R(nrounds, ctr, ukey); + float4 frnd; + frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ); + frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ); + frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ); + frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ); + randomnumber[gdx] = frnd; + } +} + +// ************************** +// UNIFORM DISTRIBUTION (uint) +// ************************** + +__kernel void PRNG_threefry4x32_uint_uniform( + __global uint4 *randomnumber, + threefry4x32_ctr_t ctr_i, + uint inf, uint sup, + uint nrounds, uint numrandom) { + + size_t gdx = get_global_id(0); + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey; + + ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx; + + threefry4x32_ctr_t random4; + + if ( gdx < numrandom ) { + random4 = threefry4x32_R(nrounds, ctr, ukey); + uint4 frnd; + frnd.x = random4.v[0] % (sup - inf) + inf; + frnd.y = random4.v[1] % (sup - inf) + inf; + frnd.z = random4.v[2] % (sup - inf) + inf; + frnd.w = random4.v[3] % (sup - inf) + inf; + randomnumber[gdx] = frnd; + } +} + +// ************************** +// GAUSSIAN DISTRIBUTION +// ************************** + +__kernel void PRNG_threefry4x32_gaussian( + __global float4 *randomnumber, + threefry4x32_ctr_t ctr_i, + float E, float V, + uint nrounds, uint numrandom) { + + size_t gdx = get_global_id(0); + + uint maxUint = 0; + maxUint--; + float r = (float)maxUint; + + threefry4x32_ctr_t ctr = ctr_i; + threefry4x32_ukey_t ukey1, ukey2; + + ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx; + ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0; + + threefry4x32_ctr_t random1, random2; + + if ( gdx < numrandom ) { + random1 = threefry4x32_R(nrounds, ctr, ukey1); + random2 = threefry4x32_R(nrounds, ctr, ukey2); + float4 frnd1; + + float r1 = (((float)random1.v[0]) / r); // generate a random sequence of uniform distribution + float r2 = (((float)random2.v[0]) / r); + float r3 = (((float)random1.v[1]) / r); + float r4 = (((float)random2.v[1]) / r); + float r5 = (((float)random1.v[2]) / r); + float r6 = (((float)random2.v[2]) / r); + float r7 = (((float)random1.v[3]) / r); + float r8 = (((float)random2.v[3]) / r); + + if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) { + r2 += 0.0001; + r4 += 0.0001; + r6 += 0.0001; + r8 += 0.0001; + } + + frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo sequence of normal distribution using two above uniform noise data + //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8)); // return the quadrature counterpart of the foregoing pseudo normal distribution sequence + + randomnumber[gdx] = frnd1; + } +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/tensor/tensor_math_opencl.cl ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.cl b/src/core/tensor/tensor_math_opencl.cl index 56eef44..f9cf96e 100644 --- a/src/core/tensor/tensor_math_opencl.cl +++ b/src/core/tensor/tensor_math_opencl.cl @@ -55,7 +55,7 @@ void clkernel_clamp(const int num, float low, float high, __global const float* __kernel void clkernel_divide_scalar_matx(const int num, __global const float* in1, const float x, - __global const float* in2, __global float* out) { + __global float* out) { const int i = get_global_id(0); if (i >= num) return; out[i] = in1[i] / x; @@ -63,7 +63,7 @@ void clkernel_divide_scalar_matx(const int num, __global const float* in1, const __kernel void clkernel_divide_scalar_xmat(const int num, const float x, __global const float* in1, - __global const float* in2, __global float* out) { + __global float* out) { const int i = get_global_id(0); if (i >= num) return; out[i] = x / in1[i]; @@ -159,7 +159,7 @@ __kernel void clkernel_relu(const int num, __global const float* in, __global float* out) { const int i = get_global_id(0); if (i >= num) return; - out[i] = (in[i] > 0) ? in[i] : 0.0f; + out[i] = (in[i] >= 0.0f) ? in[i] : 0.0f; } __kernel @@ -371,8 +371,9 @@ void clkernel_dot(const int num, __global const float* in1, __global const float } -// Third kernel from http://www.bealto.com/gpu-gemv_intro.html +// First kernel from http://www.bealto.com/gpu-gemv_intro.html // y = α*A*v + β*y +// fma(a, b, c) == (a * b) + c with infinite precision __kernel void clkernel_gemv(const int m, const int n, const float alpha, __global const float* A, __global const float* v, @@ -380,7 +381,7 @@ void clkernel_gemv(const int m, const int n, const float alpha, const int i = get_global_id(0); float sum = 0.0f; for (int k = 0; k < n; k++) { - sum += fma(alpha, A[i + m * k], v[k]) + beta * out[i + m * k]; + sum += fma(beta, out[i + m * k], alpha * A[i + m * k] * v[k]); } out[i] = sum; } @@ -418,18 +419,37 @@ void clkernel_dgmm_right(const int nrow, const int ncol, // TODO: Optimize with Reference from http://www.cedricnugteren.nl/tutorial.php?page=1 // C = α*A*B + β*C __kernel -void clkernel_gemm(const int nrowA, const int ncolB, const int ncolA, const float alpha, - __global const float *A, __global const float* B, const float beta, - __global float* C) { - const uint gidx = get_global_id(0); - const uint gidy = get_global_id(1); +void clkernel_gemm(const uint nrowA, const uint ncolB, const uint ncolA, const float alpha, + __global const float* A, __global const float* B, const float beta, + __global float* C, __local float* Asub, __local float* Bsub) { + const uint lidx = get_local_id(0); + const uint lidy = get_local_id(1); + const uint TS = get_local_size(0); // Tile size + const uint gidx = TS * get_group_id(0) + lidx; // Row ID of C (0..M) + const uint gidy = TS * get_group_id(1) + lidy; // Row ID of C (0..N) + + // Initialise the accumulation register float acc = 0.0f; - for (uint i = 0; i < ncolA; i++) { - acc = fma(A[i * nrowA + gidx], B[gidy * ncolA + i] * alpha, acc); + + // Loop over all tiles + const int numtiles = ncolA / TS; + for (int t = 0; t < numtiles; t++) { + const int tiledRow = TS * t + lidx; + const int tiledCol = TS * t + lidy; + Asub[lidy * TS + lidx] = A[tiledCol * nrowA + gidx]; + Bsub[lidy * TS + lidx] = B[gidy * ncolA + tiledRow]; + + barrier(CLK_LOCAL_MEM_FENCE); + + for(int k = 0; k < TS; k++) { + acc += Asub[k * TS + lidx] * Bsub[lidy * TS + k] * alpha; + } + + barrier(CLK_LOCAL_MEM_FENCE); } - - C[gidy * nrowA + gidx] = fma(C[gidy * nrowA + gidx], beta, acc); + + C[gidy * nrowA + gidx] = fma(beta, C[gidy * nrowA + gidx], acc); } @@ -567,3 +587,12 @@ void clkernel_diagvec_left(uint vsize, __global const float* vin, __global float for (uint i = 0; i < vsize; i++) out[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f; } + + +__kernel +void clkernel_diagvec_right(uint vsize, __global const float* vin, __global float* out) { + const uint gid = get_global_id(0); + + for (uint i = 0; i < vsize; i++) + out[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f; +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/tensor/tensor_math_opencl.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h index bfc051d..c289a56 100644 --- a/src/core/tensor/tensor_math_opencl.h +++ b/src/core/tensor/tensor_math_opencl.h @@ -19,7 +19,7 @@ #ifndef SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_ #ifdef USE_OPENCL -//#include <CL/cl2.hpp> +#include <limits> #include "singa/utils/opencl_utils.h" #include "tensor_math.h" @@ -122,7 +122,7 @@ void Div<float, lang::Opencl>(const size_t num, const Block* in, const float x, std::string kname = "clkernel_divide_scalar_matx"; auto kernel = ctx->kernels->at(kname); - + cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); @@ -142,7 +142,7 @@ void Div<float, lang::Opencl>(const size_t num, const float x, const Block* in, std::string kname = "clkernel_divide_scalar_xmat"; auto kernel = ctx->kernels->at(kname); - + cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); @@ -162,7 +162,7 @@ void Div<float, lang::Opencl>(const size_t num, const Block* in1, const Block* i std::string kname = "clkernel_divide"; auto kernel = ctx->kernels->at(kname); - + cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data())); cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data())); cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); @@ -401,7 +401,7 @@ void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, Conte std::string kname = "clkernel_set"; auto kernel = ctx->kernels->at(kname); - + cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); kernel.setArg(0, (cl_int)num); @@ -568,21 +568,29 @@ void Tanh<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Co // Random functions // ************************************** +/// Seed value required for generating distributions. +static unsigned int seed[4] = {0, 32, 42, 888}; +/// Number of generation rounds used in the current algorithm. +static cl_uint rounds = 8; + template<> void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) { cl_int status = CL_SUCCESS; -//std::string kname = "clkernel_bernoulli"; std::string kname = "PRNG_threefry4x32_bernoulli"; auto kernel = ctx->kernels->at(kname); cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - kernel.setArg(0, (cl_int)(num / 4)); // Divide by 4 because kernel uses float4 as argument. - kernel.setArg(1, p); - kernel.setArg(2, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); + kernel.setArg(0, outbuf); + kernel.setArg(1, seed); + kernel.setArg(2, 0.0f); // inf + kernel.setArg(3, 1.0f); // sup + kernel.setArg(4, p); // threshold + kernel.setArg(5, rounds); + kernel.setArg(6, cl_uint(num) / 4); + + status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4)); OCL_CHECK(status, "Failed to enqueue kernel function!"); } @@ -591,18 +599,19 @@ template<> void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) { cl_int status = CL_SUCCESS; -//std::string kname = "clkernel_gaussian"; std::string kname = "PRNG_threefry4x32_gaussian"; auto kernel = ctx->kernels->at(kname); cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - kernel.setArg(0, (cl_int)(num / 4)); - kernel.setArg(1, mean); - kernel.setArg(2, std); - kernel.setArg(3, outbuf); + kernel.setArg(0, outbuf); + kernel.setArg(1, seed); + kernel.setArg(2, mean); // E + kernel.setArg(3, std); // V + kernel.setArg(4, rounds); + kernel.setArg(5, cl_uint(num) / 4); - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); + status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4)); OCL_CHECK(status, "Failed to enqueue kernel function!"); } @@ -611,18 +620,19 @@ template<> void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) { cl_int status = CL_SUCCESS; -//std::string kname = "clkernel_uniform"; std::string kname = "PRNG_threefry4x32_uniform"; auto kernel = ctx->kernels->at(kname); cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data())); - - kernel.setArg(0, (cl_int)(num / 4)); - kernel.setArg(1, low); - kernel.setArg(2, high); - kernel.setArg(3, outbuf); - - status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num)); + + status = kernel.setArg(0, outbuf); OCL_CHECK(status, "kernel arg 0"); + status = kernel.setArg(1, seed); OCL_CHECK(status, "kernel arg 1"); + status = kernel.setArg(2, low); OCL_CHECK(status, "kernel arg 2"); + status = kernel.setArg(3, high); OCL_CHECK(status, "kernel arg 3"); + status = kernel.setArg(4, rounds); OCL_CHECK(status, "kernel arg 4"); + status = kernel.setArg(5, cl_uint(num) / 4); OCL_CHECK(status, "kernel arg 5"); + + status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num/4)); OCL_CHECK(status, "Failed to enqueue kernel function!"); } @@ -887,17 +897,17 @@ void GEMM<float, lang::Opencl>(const bool transA, const bool transB, std::string kname = "clkernel_gemm"; auto kernel = ctx->kernels->at(kname); - + cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data())); cl::Buffer Bbuf = *(static_cast<cl::Buffer*>(B->mutable_data())); cl::Buffer Cbuf = *(static_cast<cl::Buffer*>(C->mutable_data())); // If matrix A needs to be transposed, do it. - if (!transA) + if (transA) Transpose(nrowA, ncolA, Abuf, Abuf, ctx); // If vector B needs to be transposed, do it. - if (!transB) + if (transB) Transpose(nrowA, ncolB, Bbuf, Bbuf, ctx); kernel.setArg(0, (cl_int)nrowA); @@ -908,10 +918,13 @@ void GEMM<float, lang::Opencl>(const bool transA, const bool transB, kernel.setArg(5, Bbuf); kernel.setArg(6, beta); kernel.setArg(7, Cbuf); - - cl::NDRange global(nrowA, ncolA); - cl::NDRange local(32, 32); - + kernel.setArg(8, cl::Local(sizeof(float) * nrowA * ncolB)); + kernel.setArg(9, cl::Local(sizeof(float) * nrowA * ncolB)); + +// TODO: Try to make the work group size a power of 2 given an arbitrary matrix. + cl::NDRange global(nrowA, ncolB); + cl::NDRange local(nrowA, ncolB); + status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global, local); OCL_CHECK(status, "Failed to enqueue kernel function!"); } @@ -1070,7 +1083,7 @@ void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* c std::string kname = "clkernel_diagvec_left"; auto kernel = ctx->kernels->at(kname); - + kernel.setArg(0, (cl_uint)size); kernel.setArg(1, in); kernel.setArg(2, out); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/test/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 95c3cf7..632a2cd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -4,6 +4,11 @@ ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" "gtest/gtest-all.cc") AUX_SOURCE_DIRECTORY(singa singa_test_source) +IF(NOT BUILD_OPENCL_TESTS) + MESSAGE(STATUS "Skipping OpenCL tests") + LIST(REMOVE_ITEM singa_test_source "singa/test_opencl.cc") +ENDIF() + ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source}) ADD_DEPENDENCIES(test_singa singa_core singa_utils) MESSAGE(STATUS "link libs" ${singa_linker_libs}) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/test/singa/test_opencl.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_opencl.cc b/test/singa/test_opencl.cc index f426559..0a335e5 100644 --- a/test/singa/test_opencl.cc +++ b/test/singa/test_opencl.cc @@ -30,6 +30,50 @@ using singa::Block; using singa::Shape; using singa::Tensor; +class OpenCL_TensorMath : public ::testing::Test { +protected: + + OpenCL_TensorMath() { + for (int i = 0; i < 4; i++) { + float4[i] = (float)i; + float4zero[i] = 0.0f; + } + + for (int i = 0; i < 16; i++) { + float16[i] = (float)i; + float16zero[i] = 0.0f; + } + + auto ocl_dev = std::make_shared<OpenclDevice>(); + + tf4in = Tensor(Shape{1, 4}, ocl_dev); + tf4in.CopyDataFromHostPtr(float4, 4); + + tf4zin = Tensor(Shape{1, 4}, ocl_dev); + tf4zin.CopyDataFromHostPtr(float4zero, 4); + + tf16in = Tensor(Shape{4, 4}, ocl_dev); + tf16in.CopyDataFromHostPtr(float16, 16); + + tf16zin = Tensor(Shape{4, 4}, ocl_dev); + tf16zin.CopyDataFromHostPtr(float16zero, 16); + + float empty[10000] = {}; + empty10k = Tensor(Shape{10000}, ocl_dev); + empty10k.CopyDataFromHostPtr(empty, 10000); + } + + float float4[4]; + float float4zero[4]; + float float16[16]; + float float16zero[16]; + + Tensor tf4in, tf16in; + Tensor tf4zin, tf16zin; + Tensor empty10k; +}; + + // Makes a float array and fills it with increasing values from 0. float* MakeMatrix(const int size) { float* mat = new float[size]; @@ -44,6 +88,7 @@ TEST(OpenclDevice, Constructor) { EXPECT_EQ(0, dev.id()); } + TEST(OpenclDevice, MemoryAllocFree) { OpenclDevice dev; Block* b = dev.NewBlock(4); @@ -75,6 +120,7 @@ TEST(OpenclDevice, CopyDataToFrom) { EXPECT_EQ('x', astr[3]); } + TEST(OpenclDevice, DuplicateDataOnDevice) { OpenclDevice dev; CppCPU host; @@ -99,81 +145,483 @@ TEST(OpenclDevice, DuplicateDataOnDevice) { EXPECT_EQ('x', astr[3]); } -// Tensor tests - -TEST(OpenCL_TensorMath, TensorMath_CopyDataToDevice) { - auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice()); +// Tensor tests, uses OpenCL_TensorMath class defined above. - Tensor t(Shape{1, 4}, ocl_dev); - float a[] = {0.0f, 1.0f, 2.0f, 3.0f}; - t.CopyDataFromHostPtr(a, 4); - - CppCPU host; - Block* host_out = host.NewBlock(sizeof(float) * 4); - ocl_dev->CopyDataToFrom(host_out, t.block(), sizeof(float) * 4, singa::kDeviceToHost); +TEST_F(OpenCL_TensorMath, CopyDataToDevice) { + tf4in.ToHost(); + const float* out = tf4in.data<float>(); - float* out = static_cast<float*>(host_out->mutable_data()); EXPECT_EQ(1.0f, out[1]); EXPECT_EQ(3.0f, out[3]); } -TEST(OpenCL_TensorMath, TensorMath_Abs) { - auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice()); - Tensor in(Shape{1, 4}, ocl_dev); - float a[] = {0.0f, -1.0f, -2.0f, -3.0f}; - in.CopyDataFromHostPtr(a, 4); +TEST_F(OpenCL_TensorMath, MemberAbs) { + tf4in = Abs(tf4in); - in = Abs(in); + tf4in.ToHost(); + const float* out = tf4in.data<float>(); - CppCPU host; - Block* host_out = host.NewBlock(sizeof(float) * 4); - ocl_dev->CopyDataToFrom(host_out, in.block(), sizeof(float) * 4, singa::kDeviceToHost); - - float* out = static_cast<float*>(host_out->mutable_data()); EXPECT_EQ(0.0f, out[0]); EXPECT_EQ(1.0f, out[1]); EXPECT_EQ(2.0f, out[2]); EXPECT_EQ(3.0f, out[3]); } -TEST(OpenCL_TensorMath, TensorMath_ScalarAdd) { - auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice()); - Tensor in(Shape{1, 4}, ocl_dev); - float a[] = {0.0f, 1.0f, 2.0f, 3.0f}; - in.CopyDataFromHostPtr(a, 4); +TEST_F(OpenCL_TensorMath, MemberExp) { + tf4in = Exp(tf4in); - in += 1.0f; + tf4in.ToHost(); + const float* out = tf4in.data<float>(); - CppCPU host; - Block* host_out = host.NewBlock(sizeof(float) * 4); - ocl_dev->CopyDataToFrom(host_out, in.block(), sizeof(float) * 4, singa::kDeviceToHost); + EXPECT_NEAR(exp(0.0f), out[0], 1e-5); + EXPECT_NEAR(exp(1.0f), out[1], 1e-5); + EXPECT_NEAR(exp(2.0f), out[2], 1e-5); + EXPECT_NEAR(exp(3.0f), out[3], 1e-5); +} + + +TEST_F(OpenCL_TensorMath, MemberLog) { + tf4in = Log(tf4in); - float* out = static_cast<float*>(host_out->mutable_data()); - EXPECT_EQ(1.0f, out[0]); - EXPECT_EQ(2.0f, out[1]); - EXPECT_EQ(3.0f, out[2]); - EXPECT_EQ(4.0f, out[3]); + tf4in.ToHost(); + const float* out = tf4in.data<float>(); + +// EXPECT_NEAR(log(0.0f), out[0], 1e-5); // Evaluates to neg infinity. + EXPECT_NEAR(log(1.0f), out[1], 1e-5); + EXPECT_NEAR(log(2.0f), out[2], 1e-5); + EXPECT_NEAR(log(3.0f), out[3], 1e-5); } -TEST(OpenCL_TensorMath, TensorMath_EltwiseAdd) { - auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice()); - Tensor in_1(Shape{1, 4}, ocl_dev); - float a[] = {0.0f, 1.0f, 2.0f, 3.0f}; - in_1.CopyDataFromHostPtr(a, 4); - Tensor in_2 = in_1.Clone(); +TEST_F(OpenCL_TensorMath, MemberReLU) { + tf4in -= 2.0f; + Tensor result = ReLU(tf4in); - in_2 += in_1; + result.ToHost(); + const float* out = result.data<float>(); - CppCPU host; - Block* host_out = host.NewBlock(sizeof(float) * 4); - ocl_dev->CopyDataToFrom(host_out, in_2.block(), sizeof(float) * 4, singa::kDeviceToHost); + EXPECT_NEAR(0.0f, out[0], 1e-5); + EXPECT_NEAR(0.0f, out[1], 1e-5); + EXPECT_NEAR(0.0f, out[2], 1e-5); + EXPECT_NEAR(1.0f, out[3], 1e-5); +} + + +TEST_F(OpenCL_TensorMath, MemberSigmoid) { + tf4in = Sigmoid(tf4in); + + tf4in.ToHost(); + const float* out = tf4in.data<float>(); + + EXPECT_NEAR(1.0f / (1.0f + exp(-0.0f)), out[0], 1e-5); + EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), out[1], 1e-5); + EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), out[2], 1e-5); +} + + +TEST_F(OpenCL_TensorMath, MemberSign) { + tf4in -= 1.0f; + + tf4in.ToHost(); + const float* out = tf4in.data<float>(); + + EXPECT_NEAR(-1.0f, out[0], 1e-5); + EXPECT_NEAR(0.0f, out[1], 1e-5); + EXPECT_NEAR(1.0f, out[2], 1e-5); + EXPECT_NEAR(2.0f, out[3], 1e-5); +} + + +TEST_F(OpenCL_TensorMath, MemberSqrt) { + tf4in = Sqrt(tf4in); + + tf4in.ToHost(); + const float* out = tf4in.data<float>(); + + EXPECT_NEAR(0.0f, out[0], 1e-5); + EXPECT_NEAR(1.0f, out[1], 1e-5); + EXPECT_NEAR(sqrt(2.0f), out[2], 1e-5); + EXPECT_NEAR(sqrt(3.0f), out[3], 1e-5); +} + + +TEST_F(OpenCL_TensorMath, MemberSquare) { + tf4in = Square(tf4in); + + tf4in.ToHost(); + const float* out = tf4in.data<float>(); + + EXPECT_NEAR(0.0f, out[0], 1e-5); + EXPECT_NEAR(1.0f, out[1], 1e-5); + EXPECT_NEAR(4.0f, out[2], 1e-5); + EXPECT_NEAR(9.0f, out[3], 1e-5); +} + + +TEST_F(OpenCL_TensorMath, MemberTanh) { + tf4in = Tanh(tf4in); + + tf4in.ToHost(); + const float* out = tf4in.data<float>(); + + EXPECT_NEAR(0.0f, out[0], 1e-5); + EXPECT_NEAR(tanh(1.0f), out[1], 1e-5); + EXPECT_NEAR(tanh(2.0f), out[2], 1e-5); + EXPECT_NEAR(tanh(3.0f), out[3], 1e-5); +} + + +TEST_F(OpenCL_TensorMath, Sum) { + Tensor result = Sum(tf4in, 0); + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_NEAR(0.0f, out[0], 1e-5); + EXPECT_NEAR(1.0f, out[1], 1e-5); + EXPECT_NEAR(2.0f, out[2], 1e-5); + EXPECT_NEAR(3.0f, out[3], 1e-5); +} + +TEST_F(OpenCL_TensorMath, MemberLT) { + Tensor result = tf4in < 2.0f; + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_FLOAT_EQ(1.0f, out[0]); + EXPECT_FLOAT_EQ(1.0f, out[1]); + EXPECT_FLOAT_EQ(0.0f, out[2]); + EXPECT_FLOAT_EQ(0.0f, out[3]); +} + + +TEST_F(OpenCL_TensorMath, MemberLE) { + Tensor result = tf4in <= 2.0f; + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_FLOAT_EQ(1.0f, out[0]); + EXPECT_FLOAT_EQ(1.0f, out[1]); + EXPECT_FLOAT_EQ(1.0f, out[2]); + EXPECT_FLOAT_EQ(0.0f, out[3]); +} + + +TEST_F(OpenCL_TensorMath, MemberGT) { + Tensor result = tf4in > 2.0f; + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_FLOAT_EQ(0.0f, out[0]); + EXPECT_FLOAT_EQ(0.0f, out[1]); + EXPECT_FLOAT_EQ(0.0f, out[2]); + EXPECT_FLOAT_EQ(1.0f, out[3]); +} + + +TEST_F(OpenCL_TensorMath, MemberGE) { + Tensor result = tf4in >= 2.0f; + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_FLOAT_EQ(0.0f, out[0]); + EXPECT_FLOAT_EQ(0.0f, out[1]); + EXPECT_FLOAT_EQ(1.0f, out[2]); + EXPECT_FLOAT_EQ(1.0f, out[3]); +} + + +TEST_F(OpenCL_TensorMath, MemberPow) { + Tensor result = Pow(tf4in, 2.0f); + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_FLOAT_EQ(0.0f, out[0]); + EXPECT_FLOAT_EQ(1.0f, out[1]); + EXPECT_FLOAT_EQ(4.0f, out[2]); + EXPECT_FLOAT_EQ(9.0f, out[3]); + + result = Pow(tf4in, tf4in); + + result.ToHost(); + const float* out1 = result.data<float>(); + + EXPECT_FLOAT_EQ(1.0f, out1[0]); // 0 ^ 0 is 1, apparently. + EXPECT_FLOAT_EQ(1.0f, out1[1]); + EXPECT_FLOAT_EQ(4.0f, out1[2]); + EXPECT_FLOAT_EQ(27.0f, out1[3]); +} + + +TEST_F(OpenCL_TensorMath, MemberSub) { + Tensor result = tf4in - tf4zin; + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_FLOAT_EQ(0.0f, out[0]); + EXPECT_FLOAT_EQ(1.0f, out[1]); + EXPECT_FLOAT_EQ(2.0f, out[2]); + EXPECT_FLOAT_EQ(3.0f, out[3]); + + result = tf4in - 0.0f; + + result.ToHost(); + const float* out1 = result.data<float>(); + + EXPECT_FLOAT_EQ(0.0f, out1[0]); + EXPECT_FLOAT_EQ(1.0f, out1[1]); + EXPECT_FLOAT_EQ(2.0f, out1[2]); + EXPECT_FLOAT_EQ(3.0f, out1[3]); +} + + +TEST_F(OpenCL_TensorMath, MemberEltwiseMult) { + Tensor result = tf4in * tf4zin; + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_FLOAT_EQ(0.0f, out[0]); + EXPECT_FLOAT_EQ(0.0f, out[1]); + EXPECT_FLOAT_EQ(0.0f, out[2]); + EXPECT_FLOAT_EQ(0.0f, out[3]); + + result = tf4in * 10.0f; + + result.ToHost(); + const float* out1 = result.data<float>(); + + EXPECT_FLOAT_EQ(0.0f, out1[0]); + EXPECT_FLOAT_EQ(10.0f, out1[1]); + EXPECT_FLOAT_EQ(20.0f, out1[2]); + EXPECT_FLOAT_EQ(30.0f, out1[3]); +} + + +TEST_F(OpenCL_TensorMath, MemberDiv) { + Tensor result = tf4in / tf4in; + + result.ToHost(); + const float* out = result.data<float>(); + +// EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by zero. + EXPECT_FLOAT_EQ(1.0f, out[1]); + EXPECT_FLOAT_EQ(1.0f, out[2]); + EXPECT_FLOAT_EQ(1.0f, out[3]); + + result = tf4in / 10.0f; + + result.ToHost(); + const float* out1 = result.data<float>(); + + EXPECT_FLOAT_EQ(0.0f, out1[0]); + EXPECT_FLOAT_EQ(0.1f, out1[1]); + EXPECT_FLOAT_EQ(0.2f, out1[2]); + EXPECT_FLOAT_EQ(0.3f, out1[3]); + + result = Div(10.0f, tf4in); + + result.ToHost(); + const float* out2 = result.data<float>(); + +// EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by 0. + EXPECT_FLOAT_EQ(10.0f, out2[1]); + EXPECT_FLOAT_EQ(5.0f, out2[2]); + EXPECT_NEAR((10.0f / 3.0f), out2[3], 1e-5); +} + +// ************************************** +// Random functions +// ************************************** + +TEST_F(OpenCL_TensorMath, Bernoulli) { + const float p = 0.3f; + + Bernoulli(p, &empty10k); + + empty10k.ToHost(); + const float* out = empty10k.data<float>(); + + float sum = 0.0f; + for (int i = 0; i < 10000; i++) sum += out[i]; + + float mean = sum / 10000; + + EXPECT_NEAR(mean, p, 1e-2); + + sum = 0.0f; + for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean); + float variance = sum / 9999; + + EXPECT_NEAR(variance, p * (1 - p), 1e-2); +} + + +TEST_F(OpenCL_TensorMath, Gaussian) { + Gaussian(0.0f, 1.0f, &empty10k); + + empty10k.ToHost(); + const float* out = empty10k.data<float>(); + + float sum = 0.0f; + for (int i = 0; i < 10000; i++) sum += out[i]; + float mean = sum / 10000; + + EXPECT_NEAR(mean, 0.0f, 1e-2); + + sum = 0.0f; + for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean); + float variance = sum / 9999; + + EXPECT_NEAR(variance, 1.0f, 1e-2); +} + + +TEST_F(OpenCL_TensorMath, Uniform) { + Uniform(0.1f, 0.2f, &empty10k); + + empty10k.ToHost(); + const float* out = empty10k.data<float>(); + + float sum = 0.0f; + for (int i = 0; i < 10000; i++) sum += out[i]; + float mean = sum / 10000; + + EXPECT_NEAR(mean, 0.15f, 1e-2); + + sum = 0.0f; + for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean); + float variance = sum / 9999; + + EXPECT_NEAR(variance, 0.01f, 1e-2); +} + +// ********************************************************* +// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas +// ********************************************************* + + +TEST_F(OpenCL_TensorMath, EltwiseAdd) { + Tensor result = tf4in + tf4in; + + result.ToHost(); + const float* out = result.data<float>(); - float* out = static_cast<float*>(host_out->mutable_data()); EXPECT_EQ(0.0f, out[0]); EXPECT_EQ(2.0f, out[1]); EXPECT_EQ(4.0f, out[2]); - EXPECT_EQ(6.0f, out[3]); + EXPECT_EQ(6.0f, out[3]); + + result = tf4in + tf4zin; + + result.ToHost(); + const float* out1 = result.data<float>(); + + EXPECT_EQ(0.0f, out1[0]); + EXPECT_EQ(1.0f, out1[1]); + EXPECT_EQ(2.0f, out1[2]); + EXPECT_EQ(3.0f, out1[3]); + + result = Tensor(tf4in.shape(), tf4in.device(), tf4in.data_type()); + Add(tf4in, tf4in, &result); + + result.ToHost(); + const float* out2 = result.data<float>(); + + EXPECT_EQ(0.0f, out2[0]); + EXPECT_EQ(2.0f, out2[1]); + EXPECT_EQ(4.0f, out2[2]); + EXPECT_EQ(6.0f, out2[3]); + + result = tf4in + 1.0f; + + result.ToHost(); + const float* out3 = result.data<float>(); + + EXPECT_EQ(1.0f, out3[0]); + EXPECT_EQ(2.0f, out3[1]); + EXPECT_EQ(3.0f, out3[2]); + EXPECT_EQ(4.0f, out3[3]); +} + + +TEST_F(OpenCL_TensorMath, SetValue) { + const float one_third = 1.0f / 3.0f; + empty10k.SetValue(one_third); + + empty10k.ToHost(); + const float* out = empty10k.data<float>(); + + EXPECT_EQ(one_third, out[0]); + EXPECT_EQ(one_third, out[1]); + EXPECT_EQ(one_third, out[1024]); + EXPECT_EQ(one_third, out[4096]); + EXPECT_EQ(one_third, out[9998]); + EXPECT_EQ(one_third, out[9999]); +} + + +TEST_F(OpenCL_TensorMath, Axpy) { + Axpy(10.0f, tf4in, &tf4in); + + tf4in.ToHost(); + const float* out = tf4in.data<float>(); + + EXPECT_EQ(0.0f, out[0]); // 0 * 10 + 0 = 0 + EXPECT_EQ(11.0f, out[1]); // 1 * 10 + 1 = 11 + EXPECT_EQ(22.0f, out[2]); // 2 * 10 + 2 = 22 + EXPECT_EQ(33.0f, out[3]); // 3 * 10 + 3 = 33 } + +TEST_F(OpenCL_TensorMath, Mult) { + Tensor result = Mult(tf4in, tf4zin.T()); // Multiply with zero. + + result.ToHost(); + const float* out = result.data<float>(); + + EXPECT_EQ(0.0f, out[0]); // 1x4 * 4x1 = 1x1. + + result = Mult(tf4in, tf4in.T()); + + result.ToHost(); + const float* out0 = result.data<float>(); + + EXPECT_EQ(14.0f, out0[0]); // 1x4 * 4x1 = 1x1. + + tf16zin.SetValue(10.0f); // Multiply with 10.0. + result = Mult(tf16in, tf16zin); // 4x4 * 4x4 = 4x4. + + result.ToHost(); + const float* out1 = result.data<float>(); + EXPECT_EQ(240.0f, out1[0]); + EXPECT_EQ(280.0f, out1[1]); + EXPECT_EQ(320.0f, out1[2]); + EXPECT_EQ(360.0f, out1[3]); + + EXPECT_EQ(240.0f, out1[4]); + EXPECT_EQ(280.0f, out1[5]); + EXPECT_EQ(320.0f, out1[6]); + EXPECT_EQ(360.0f, out1[7]); + + EXPECT_EQ(240.0f, out1[8]); + EXPECT_EQ(280.0f, out1[9]); + EXPECT_EQ(320.0f, out1[10]); + EXPECT_EQ(360.0f, out1[11]); + + EXPECT_EQ(240.0f, out1[12]); + EXPECT_EQ(280.0f, out1[13]); + EXPECT_EQ(320.0f, out1[14]); + EXPECT_EQ(360.0f, out1[15]); +} + + + +// TODO: ComputeCrossEntropy, SoftmaxCrossEntropy
