[1/6] incubator-singa git commit: SINGA-173 OpenCL device support and implementation.

wangwei Fri, 29 Jul 2016 22:11:57 -0700

Repository: incubator-singa
Updated Branches:
  refs/heads/dev 4e7f3c13b -> 464dcda63



SINGA-173 OpenCL device support and implementation.

Added option to Cmake to skip building tests for OpenCL.
More unit tests, more bugfixes.
Adding distribution.cl, and unit tests.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/57f3a1eb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/57f3a1eb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/57f3a1eb

Branch: refs/heads/dev
Commit: 57f3a1eb3fb90ff3806ba7eade33b7d24170bf4d
Parents: 35c8930
Author: Tan Li Boon <[email protected]>
Authored: Sun Jul 17 15:44:58 2016 +0800
Committer: Tan Li Boon <[email protected]>
Committed: Tue Jul 26 18:39:08 2016 +0800

----------------------------------------------------------------------
 .travis.yml                           |    2 +-
 CMakeLists.txt                        |    3 +-
 include/singa/core/device.h           |   67 ++
 include/singa/core/platform.h         |  105 ---
 src/core/device/opencl_device.cc      |    3 +-
 src/core/device/platform.cc           |    8 +
 src/core/tensor/distribution.cl       | 1020 ++++++++++++++++++++++++++++
 src/core/tensor/tensor_math_opencl.cl |   57 +-
 src/core/tensor/tensor_math_opencl.h  |   79 ++-
 test/CMakeLists.txt                   |    5 +
 test/singa/test_opencl.cc             |  546 +++++++++++++--
 11 files changed, 1690 insertions(+), 205 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index f7434d8..8b1f89c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,7 +13,7 @@ before_install:
 
 before_script:
  - mkdir build && cd build
- - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF
+ - cmake .. -DUSE_CUDA=OFF -DUSE_CUDNN=OFF -DUSE_PYTHON=OFF 
-DBUILD_OPENCL_TESTS=OFF
 
 script:
  - make

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd39a0a..8f862f0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,8 @@ OPTION(USE_CUDNN "Use Cudnn libs" ON)
 OPTION(USE_OPENCV "Use opencv" OFF)
 OPTION(USE_LMDB "Use LMDB libs" OFF)
 OPTION(USE_PYTHON "Generate py wrappers" ON)
-OPTION(USE_OPENCL "Use OpenCL" ON)
+OPTION(USE_OPENCL "Use OpenCL" OFF)
+OPTION(BUILD_OPENCL_TESTS "Build OpenCL tests" OFF)
 
 INCLUDE("cmake/Dependencies.cmake")
 INCLUDE("cmake/Utils.cmake")

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/include/singa/core/device.h
----------------------------------------------------------------------
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 4c775a1..a7f6488 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -189,6 +189,73 @@ class CudaGPU : public Device {
 
 #endif  // USE_CUDA
 
+
+
+/// This class queries all available calculating devices on a given machine
+/// grouped according to manufacturer or device drivers. All methods should be 
static.
+/// If CUDA or OPENCL are not enabled, then the respective related methods 
should
+/// return something that indicates their absence (for example, 0 devices); 
+/// however they should always be available regardless of compile-time 
switches.
+class Platform {
+public:
+
+  /// Constructor.
+  Platform();
+
+  /// Return the number of total available GPUs
+  static int GetNumGPUs();
+
+  /// Return the device IDs of available GPUs.
+  /// TODO(wangwei) return the IDs according to free memory in decending order
+  static const std::vector<int> GetGPUIDs();
+
+  static const std::pair<size_t, size_t> GetGPUMemSize(const int device);
+
+  /// Return the memory of a GPU <free, total>
+  static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize();
+
+  /// Return a string containing all hardware info, e.g., version, memory size.
+  static const std::string DeviceQuery(int id, bool verbose = false);
+
+  /// Create a set of CudaGPU Device using 'num_devices' free GPUs.
+  static const std::vector<std::shared_ptr<Device>>
+  CreateCudaGPUs(const size_t num_devices, size_t init_size = 0);
+
+  /// Create a set of CudaGPU Device using given GPU IDs.
+  static const std::vector<std::shared_ptr<Device>>
+  CreateCudaGPUs(const std::vector<int> &devices, size_t init_size = 0);
+
+  /// Create a \p num_devices set of valid OpenCL devices, regardless of 
platforms.
+  /// If there are fewer valid devices than requested, then this method will 
return as many as possible.
+  /// If OpenCL is not in use, this method will return an empty array.
+  const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const size_t 
num_devices);
+
+  /// Create a set of valid OpenCL devices, regardless of platforms, assigning 
\p id to each device in sequence.
+  /// If there are fewer valid devices than requested, then this method will 
return as many as possible.
+  /// If OpenCL is not in use, this method will return an empty array.
+  const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const 
vector<int>& id);
+
+  /// This function is implementd by Caffe (http://caffe.berkeleyvision.org/).
+  /// This function checks the availability of GPU #device_id.
+  /// It attempts to create a context on the device by calling cudaFree(0).
+  /// cudaSetDevice() alone is not sufficient to check the availability.
+  /// It lazily records device_id, however, does not initialize a
+  /// context. So it does not know if the host thread has the permission to use
+  /// the device or not.
+  ///
+  /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
+  /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
+  /// even if the device is exclusively occupied by another process or thread.
+  /// Cuda operations that initialize the context are needed to check
+  /// the permission. cudaFree(0) is one of those with no side effect,
+  /// except the context initialization.
+  static bool CheckDevice(const int device_id);
+
+
+private:
+  cl::Platform clPlatform;
+};
+
 }  // namespace singa
 
 #endif  // SINGA_CORE_DEVICE_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/include/singa/core/platform.h
----------------------------------------------------------------------
diff --git a/include/singa/core/platform.h b/include/singa/core/platform.h
deleted file mode 100644
index ff1bbea..0000000
--- a/include/singa/core/platform.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef SINGA_CORE_PLATFORM_H_
-#define SINGA_CORE_PLATFORM_H_
-
-#include <memory>
-#include <vector>
-
-#include "singa/core/device.h"
-#include "singa/singa_config.h"
-
-#ifdef USE_CUDA
-#include "singa/utils/cuda_utils.h"
-#endif // USE_CUDA
-
-#ifdef USE_OPENCL
-#include <cl/cl2.hpp>
-#endif // USE_OPENCL
-
-namespace singa {
-
-/// This class queries all available calculating devices on a given machine
-/// grouped according to manufacturer or device drivers. All methods should be 
static.
-/// If CUDA or OPENCL are not enabled, then the respective related methods 
should
-/// return something that indicates their absence (for example, 0 devices); 
-/// however they should always be available regardless of compile-time 
switches.
-class Platform {
-public:
-
-  /// Constructor.
-  Platform();
-
-  /// Return the number of total available GPUs
-  static int GetNumGPUs();
-
-  /// Return the device IDs of available GPUs.
-  /// TODO(wangwei) return the IDs according to free memory in decending order
-  static const std::vector<int> GetGPUIDs();
-
-  static const std::pair<size_t, size_t> GetGPUMemSize(const int device);
-
-  /// Return the memory of a GPU <free, total>
-  static const std::vector<std::pair<size_t, size_t>> GetGPUMemSize();
-
-  /// Return a string containing all hardware info, e.g., version, memory size.
-  static const std::string DeviceQuery(int id, bool verbose = false);
-
-  /// Create a set of CudaGPU Device using 'num_devices' free GPUs.
-  static const std::vector<std::shared_ptr<Device>>
-  CreateCudaGPUs(const size_t num_devices, size_t init_size = 0);
-
-  /// Create a set of CudaGPU Device using given GPU IDs.
-  static const std::vector<std::shared_ptr<Device>>
-  CreateCudaGPUs(const std::vector<int> &devices, size_t init_size = 0);
-
-  /// Create a \p num_devices set of valid OpenCL devices, regardless of 
platforms.
-  /// If there are fewer valid devices than requested, then this method will 
return as many as possible.
-  /// If OpenCL is not in use, this method will return an empty array.
-  const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const size_t 
num_devices);
-
-  /// Create a set of valid OpenCL devices, regardless of platforms, assigning 
\p id to each device in sequence.
-  /// If there are fewer valid devices than requested, then this method will 
return as many as possible.
-  /// If OpenCL is not in use, this method will return an empty array.
-  const std::vector<std::shared_ptr<Device>> CreateOpenclDevices(const 
vector<int>& id);
-
-  /// This function is implementd by Caffe (http://caffe.berkeleyvision.org/).
-  /// This function checks the availability of GPU #device_id.
-  /// It attempts to create a context on the device by calling cudaFree(0).
-  /// cudaSetDevice() alone is not sufficient to check the availability.
-  /// It lazily records device_id, however, does not initialize a
-  /// context. So it does not know if the host thread has the permission to use
-  /// the device or not.
-  ///
-  /// In a shared environment where the devices are set to EXCLUSIVE_PROCESS
-  /// or EXCLUSIVE_THREAD mode, cudaSetDevice() returns cudaSuccess
-  /// even if the device is exclusively occupied by another process or thread.
-  /// Cuda operations that initialize the context are needed to check
-  /// the permission. cudaFree(0) is one of those with no side effect,
-  /// except the context initialization.
-  static bool CheckDevice(const int device_id);
-
-
-private:
-  cl::Platform clPlatform;
-};
-
-} // namespace singa
-
-#endif // SINGA_CORE_PLATFORM_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/device/opencl_device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc
index 053ac4f..d4d1fe5 100644
--- a/src/core/device/opencl_device.cc
+++ b/src/core/device/opencl_device.cc
@@ -44,13 +44,12 @@ OpenclDevice::OpenclDevice(int id, int num_executors)
   std::vector<cl::Platform> platforms;
   status = cl::Platform::get(&platforms);
   OCL_CHECK(status, "Failed to find any OpenCL platforms!");
-
+  
   std::vector<cl::Device> devices;
   status = platforms[0].getDevices(CL_DEVICE_TYPE_ALL, &devices);
   OCL_CHECK(status, "Failed to get list of devices from platform!");
 
   this->this_device = cl::Device(devices[0]);
-
   this->ocl_ctx = cl::Context(this_device, nullptr, nullptr, nullptr, &status);
   OCL_CHECK(status, "Failed to create context!");
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/device/platform.cc
----------------------------------------------------------------------
diff --git a/src/core/device/platform.cc b/src/core/device/platform.cc
index 984df69..80826c0 100644
--- a/src/core/device/platform.cc
+++ b/src/core/device/platform.cc
@@ -16,6 +16,14 @@
  * limitations under the License.
  */
 
+#ifdef USE_CUDA
+#include "singa/utils/cuda_utils.h"
+#endif // USE_CUDA
+
+#ifdef USE_OPENCL
+#include <cl/cl2.hpp>
+#endif // USE_OPENCL
+
 #include "singa/core/device.h"
 #include "singa/singa_config.h"
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/tensor/distribution.cl
----------------------------------------------------------------------
diff --git a/src/core/tensor/distribution.cl b/src/core/tensor/distribution.cl
new file mode 100644
index 0000000..ce298c0
--- /dev/null
+++ b/src/core/tensor/distribution.cl
@@ -0,0 +1,1020 @@
+// This code is adapted from 
https://github.com/amd/OpenCL-caffe/blob/stable/src/caffe/ocl/random.cl
+
+//Note: random generator has two parts
+//first part: the open sourced threefy random generator kernel from DE Shaw 
Research
+//second part. we wrap the kernel up to generate uniform, bernoulli and 
gaussion distribution generators.
+
+//begin: the open sourced random generator from DE Shaw Research
+//https://www.deshawresearch.com/resources_random123.html
+typedef uint uint32_t;
+
+struct r123array4x32 {
+  uint32_t v[4];
+};
+
+enum r123_enum_threefry32x4 {
+  R_32x4_0_0 = 10,
+  R_32x4_0_1 = 26,
+  R_32x4_1_0 = 11,
+  R_32x4_1_1 = 21,
+  R_32x4_2_0 = 13,
+  R_32x4_2_1 = 27,
+  R_32x4_3_0 = 23,
+  R_32x4_3_1 = 5,
+  R_32x4_4_0 = 6,
+  R_32x4_4_1 = 20,
+  R_32x4_5_0 = 17,
+  R_32x4_5_1 = 11,
+  R_32x4_6_0 = 25,
+  R_32x4_6_1 = 10,
+  R_32x4_7_0 = 18,
+  R_32x4_7_1 = 20
+};
+
+inline uint32_t RotL_32(uint32_t x, unsigned int N) {
+  return (x << (N & 31)) | (x >> ((32 - N) & 31));
+}
+
+typedef struct r123array4x32 threefry4x32_ctr_t;
+typedef struct r123array4x32 threefry4x32_key_t;
+typedef struct r123array4x32 threefry4x32_ukey_t;
+
+inline threefry4x32_ctr_t threefry4x32_R(unsigned int Nrounds, 
threefry4x32_ctr_t in, threefry4x32_key_t k) {
+  threefry4x32_ctr_t X;
+  uint32_t ks[4 + 1];
+  int i;
+  ks[4] = 0x1BD11BDA;
+
+  {
+    ks[0] = k.v[0];
+    X.v[0] = in.v[0];
+    ks[4] ^= k.v[0];
+
+    ks[1] = k.v[1];
+    X.v[1] = in.v[1];
+    ks[4] ^= k.v[1];
+
+    ks[2] = k.v[2];
+    X.v[2] = in.v[2];
+    ks[4] ^= k.v[2];
+
+    ks[3] = k.v[3];
+    X.v[3] = in.v[3];
+    ks[4] ^= k.v[3];
+  }
+
+  X.v[0] += ks[0];
+  X.v[1] += ks[1];
+  X.v[2] += ks[2];
+  X.v[3] += ks[3];
+
+  if (Nrounds > 0) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 1) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 2) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 3) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 3) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 1;
+  }
+
+  if (Nrounds > 4) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 5) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 6) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 7) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 7) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 2;
+  }
+
+  if (Nrounds > 8) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 9) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 10) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 11) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 11) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 3;
+  }
+
+  if (Nrounds > 12) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 13) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 14) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 15) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 15) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 4;
+  }
+
+  if (Nrounds > 16) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 17) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 18) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 19) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 19) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 5;
+  }
+
+  if (Nrounds > 20) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 21) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 22) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 23) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 23) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 6;
+  }
+
+  if (Nrounds > 24) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 25) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 26) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 27) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 27) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 7;
+  }
+
+  if (Nrounds > 28) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 29) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 30) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 31) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 31) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 8;
+  }
+
+  if (Nrounds > 32) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 33) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 34) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 35) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 35) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 9;
+  }
+
+  if (Nrounds > 36) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 37) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 38) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 39) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 39) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 10;
+  }
+
+  if (Nrounds > 40) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 41) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+  if (Nrounds > 42) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 43) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 43) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 11;
+  }
+
+  if (Nrounds > 44) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 45) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 46) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 47) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 47) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 12;
+  }
+
+  if (Nrounds > 48) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 49) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 50) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 51) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 51) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 13;
+  }
+
+  if (Nrounds > 52) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 53) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 54) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 55) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 55) {
+    X.v[0] += ks[4];
+    X.v[1] += ks[0];
+    X.v[2] += ks[1];
+    X.v[3] += ks[2];
+    X.v[4 - 1] += 14;
+  }
+
+  if (Nrounds > 56) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 57) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 58) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 59) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 59) {
+    X.v[0] += ks[0];
+    X.v[1] += ks[1];
+    X.v[2] += ks[2];
+    X.v[3] += ks[3];
+    X.v[4 - 1] += 15;
+  }
+
+  if (Nrounds > 60) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 61) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 62) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 63) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 63) {
+    X.v[0] += ks[1];
+    X.v[1] += ks[2];
+    X.v[2] += ks[3];
+    X.v[3] += ks[4];
+    X.v[4 - 1] += 16;
+  }
+
+  if (Nrounds > 64) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_0_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_0_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 65) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_1_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_1_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 66) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_2_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_2_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 67) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_3_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_3_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 67) {
+    X.v[0] += ks[2];
+    X.v[1] += ks[3];
+    X.v[2] += ks[4];
+    X.v[3] += ks[0];
+    X.v[4 - 1] += 17;
+  }
+
+  if (Nrounds > 68) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_4_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_4_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 69) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_5_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_5_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 70) {
+    X.v[0] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_6_0);
+    X.v[1] ^= X.v[0];
+    X.v[2] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_6_1);
+    X.v[3] ^= X.v[2];
+  }
+
+  if (Nrounds > 71) {
+    X.v[0] += X.v[3];
+    X.v[3] = RotL_32(X.v[3], R_32x4_7_0);
+    X.v[3] ^= X.v[0];
+    X.v[2] += X.v[1];
+    X.v[1] = RotL_32(X.v[1], R_32x4_7_1);
+    X.v[1] ^= X.v[2];
+  }
+
+  if (Nrounds > 71) {
+    X.v[0] += ks[3];
+    X.v[1] += ks[4];
+    X.v[2] += ks[0];
+    X.v[3] += ks[1];
+    X.v[4 - 1] += 18;
+  }
+  return X;
+}
+//end: the open sourced random generator from DE Shaw Research
+
+// **************************
+// BERNOULLI DISTRIBUTION
+// **************************
+
+__kernel void PRNG_threefry4x32_bernoulli(
+       __global float4 *randomnumber,
+       threefry4x32_ctr_t ctr_i,
+       float inf, float sup,
+       float threshold,
+       uint nrounds, uint numrandom) {
+
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom ) {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    float4 frnd;
+    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf ) < threshold ? 
1.0f : 0.0f;
+    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf ) < threshold ? 
1.0f : 0.0f;
+    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf ) < threshold ? 
1.0f : 0.0f;
+    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf ) < threshold ? 
1.0f : 0.0f;
+    randomnumber[gdx] = frnd;
+  }
+}
+
+// **************************
+// UNIFORM DISTRIBUTION (float)
+// **************************
+
+__kernel void PRNG_threefry4x32_uniform(
+       __global float4 *randomnumber,
+       threefry4x32_ctr_t ctr_i,
+       float inf, float sup,
+       uint nrounds, uint numrandom) {
+
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom ) {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    float4 frnd;
+    frnd.x = ( (((float)random4.v[0]) / r) * (sup - inf) + inf );
+    frnd.y = ( (((float)random4.v[1]) / r) * (sup - inf) + inf );
+    frnd.z = ( (((float)random4.v[2]) / r) * (sup - inf) + inf );
+    frnd.w = ( (((float)random4.v[3]) / r) * (sup - inf) + inf );
+    randomnumber[gdx] = frnd;
+  }
+}
+
+// **************************
+// UNIFORM DISTRIBUTION (uint)
+// **************************
+
+__kernel void PRNG_threefry4x32_uint_uniform(
+       __global uint4 *randomnumber,
+       threefry4x32_ctr_t ctr_i,
+       uint inf, uint sup,
+       uint nrounds, uint numrandom) {
+
+  size_t gdx = get_global_id(0);
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey;
+
+  ukey.v[0] = ukey.v[1] = ukey.v[2] = ukey.v[3] = gdx;
+
+  threefry4x32_ctr_t random4;
+
+  if ( gdx < numrandom ) {
+    random4 = threefry4x32_R(nrounds, ctr, ukey);
+    uint4 frnd;
+    frnd.x = random4.v[0] % (sup - inf) + inf;
+    frnd.y = random4.v[1] % (sup - inf) + inf;
+    frnd.z = random4.v[2] % (sup - inf) + inf;
+    frnd.w = random4.v[3] % (sup - inf) + inf;
+    randomnumber[gdx] = frnd;
+  }
+}
+
+// **************************
+// GAUSSIAN DISTRIBUTION
+// **************************
+
+__kernel void PRNG_threefry4x32_gaussian(
+       __global float4 *randomnumber,
+       threefry4x32_ctr_t ctr_i,
+       float E, float V,
+       uint nrounds, uint numrandom) {
+
+  size_t gdx = get_global_id(0);
+
+  uint maxUint = 0;
+  maxUint--;
+  float r = (float)maxUint;
+
+  threefry4x32_ctr_t ctr = ctr_i;
+  threefry4x32_ukey_t ukey1, ukey2;
+
+  ukey1.v[0] = ukey2.v[1] = ukey1.v[2] = ukey2.v[3] = gdx;
+  ukey2.v[0] = ukey1.v[1] = ukey2.v[2] = ukey1.v[3] = 0;
+
+  threefry4x32_ctr_t random1, random2;
+
+  if ( gdx < numrandom ) {
+    random1 = threefry4x32_R(nrounds, ctr, ukey1);
+    random2 = threefry4x32_R(nrounds, ctr, ukey2);
+    float4 frnd1;
+
+    float r1 = (((float)random1.v[0]) / r); // generate a random sequence of 
uniform distribution
+    float r2 = (((float)random2.v[0]) / r);
+    float r3 = (((float)random1.v[1]) / r);
+    float r4 = (((float)random2.v[1]) / r);
+    float r5 = (((float)random1.v[2]) / r);
+    float r6 = (((float)random2.v[2]) / r);
+    float r7 = (((float)random1.v[3]) / r);
+    float r8 = (((float)random2.v[3]) / r);
+
+    if(r2 == 0 || r4 == 0 || r6 == 0 || r8 == 0) {
+      r2 += 0.0001;
+      r4 += 0.0001;
+      r6 += 0.0001;
+      r8 += 0.0001;
+    }
+
+    frnd1.x = cos(2*M_PI*r1)*sqrt(-2.0*log(r2)) * V + E;// return a pseudo 
sequence of normal distribution using two above uniform noise data
+    //frnd2.x = sin(2*M_PI*r1)*sqrt(-2.0*log(r2));      // return the 
quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.y = cos(2*M_PI*r3)*sqrt(-2.0*log(r4)) * V + E;// return a pseudo 
sequence of normal distribution using two above uniform noise data
+    //frnd2.y = sin(2*M_PI*r3)*sqrt(-2.0*log(r4));      // return the 
quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.z = cos(2*M_PI*r5)*sqrt(-2.0*log(r6)) * V + E;// return a pseudo 
sequence of normal distribution using two above uniform noise data
+    //frnd2.z = sin(2*M_PI*r5)*sqrt(-2.0*log(r6));      // return the 
quadrature counterpart of the foregoing pseudo normal distribution sequence
+    frnd1.w = cos(2*M_PI*r7)*sqrt(-2.0*log(r8)) * V + E;// return a pseudo 
sequence of normal distribution using two above uniform noise data
+    //frnd2.w = sin(2*M_PI*r7)*sqrt(-2.0*log(r8));      // return the 
quadrature counterpart of the foregoing pseudo normal distribution sequence
+
+    randomnumber[gdx] = frnd1;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/tensor/tensor_math_opencl.cl
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.cl 
b/src/core/tensor/tensor_math_opencl.cl
index 56eef44..f9cf96e 100644
--- a/src/core/tensor/tensor_math_opencl.cl
+++ b/src/core/tensor/tensor_math_opencl.cl
@@ -55,7 +55,7 @@ void clkernel_clamp(const int num, float low, float high, 
__global const float*
 
 __kernel
 void clkernel_divide_scalar_matx(const int num, __global const float* in1, 
const float x,
-                 __global const float* in2, __global float* out) {
+                 __global float* out) {
   const int i = get_global_id(0);
   if (i >= num) return;
   out[i] = in1[i] / x;
@@ -63,7 +63,7 @@ void clkernel_divide_scalar_matx(const int num, __global 
const float* in1, const
 
 __kernel
 void clkernel_divide_scalar_xmat(const int num, const float x, __global const 
float* in1, 
-                 __global const float* in2, __global float* out) {
+                 __global float* out) {
   const int i = get_global_id(0);
   if (i >= num) return;
   out[i] = x / in1[i];
@@ -159,7 +159,7 @@ __kernel
 void clkernel_relu(const int num, __global const float* in, __global float* 
out) {
   const int i = get_global_id(0);
   if (i >= num) return;
-  out[i] = (in[i] > 0) ? in[i] : 0.0f;
+  out[i] = (in[i] >= 0.0f) ? in[i] : 0.0f;
 }
 
 __kernel
@@ -371,8 +371,9 @@ void clkernel_dot(const int num, __global const float* in1, 
__global const float
   
 }
 
-// Third kernel from http://www.bealto.com/gpu-gemv_intro.html
+// First kernel from http://www.bealto.com/gpu-gemv_intro.html
 // y = Î±*A*v + Î²*y
+// fma(a, b, c) == (a * b) + c with infinite precision
 __kernel
 void clkernel_gemv(const int m, const int n, const float alpha,
                                   __global const float* A, __global const 
float* v, 
@@ -380,7 +381,7 @@ void clkernel_gemv(const int m, const int n, const float 
alpha,
   const int i = get_global_id(0);
   float sum  = 0.0f;
   for (int k = 0; k < n; k++) {
-       sum += fma(alpha, A[i + m * k], v[k]) + beta * out[i + m * k];
+    sum += fma(beta, out[i + m * k], alpha * A[i + m * k] * v[k]);
   }
   out[i] = sum;
 }
@@ -418,18 +419,37 @@ void clkernel_dgmm_right(const int nrow, const int ncol,
 // TODO: Optimize with Reference from 
http://www.cedricnugteren.nl/tutorial.php?page=1
 //  C = Î±*A*B + Î²*C
 __kernel
-void clkernel_gemm(const int nrowA, const int ncolB, const int ncolA, const 
float alpha,
-                                  __global const float *A, __global const 
float* B, const float beta, 
-                                  __global float* C) {
-  const uint gidx = get_global_id(0);
-  const uint gidy = get_global_id(1);
+void clkernel_gemm(const uint nrowA, const uint ncolB, const uint ncolA, const 
float alpha,
+                                  __global const float* A, __global const 
float* B, const float beta, 
+                                  __global float* C, __local float* Asub, 
__local float* Bsub) {
 
+  const uint lidx = get_local_id(0);
+  const uint lidy = get_local_id(1);
+  const uint TS = get_local_size(0); // Tile size
+  const uint gidx = TS * get_group_id(0) + lidx; // Row ID of C (0..M)
+  const uint gidy = TS * get_group_id(1) + lidy; // Row ID of C (0..N)
+  
+  // Initialise the accumulation register
   float acc = 0.0f;
-  for (uint i = 0; i < ncolA; i++) {
-       acc = fma(A[i * nrowA + gidx], B[gidy * ncolA + i] * alpha, acc);
+  
+  // Loop over all tiles
+  const int numtiles = ncolA / TS;
+  for (int t = 0; t < numtiles; t++) {
+    const int tiledRow = TS * t + lidx;
+    const int tiledCol = TS * t + lidy;
+    Asub[lidy * TS + lidx] = A[tiledCol * nrowA + gidx];
+    Bsub[lidy * TS + lidx] = B[gidy * ncolA + tiledRow];
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    for(int k = 0; k < TS; k++) {
+      acc += Asub[k * TS + lidx] * Bsub[lidy * TS + k] * alpha;
+    }
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
   }
-
-  C[gidy * nrowA + gidx] = fma(C[gidy * nrowA + gidx], beta, acc);
+  
+  C[gidy * nrowA + gidx] = fma(beta, C[gidy * nrowA + gidx], acc);
 }
 
 
@@ -567,3 +587,12 @@ void clkernel_diagvec_left(uint vsize, __global const 
float* vin, __global float
   for (uint i = 0; i < vsize; i++)
        out[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f;
 }
+
+
+__kernel
+void clkernel_diagvec_right(uint vsize, __global const float* vin, __global 
float* out) {
+  const uint gid = get_global_id(0);
+
+  for (uint i = 0; i < vsize; i++)
+       out[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f;
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/src/core/tensor/tensor_math_opencl.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.h 
b/src/core/tensor/tensor_math_opencl.h
index bfc051d..c289a56 100644
--- a/src/core/tensor/tensor_math_opencl.h
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -19,7 +19,7 @@
 #ifndef  SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
 
 #ifdef USE_OPENCL
-//#include <CL/cl2.hpp>
+#include <limits>
 
 #include "singa/utils/opencl_utils.h"
 #include "tensor_math.h"
@@ -122,7 +122,7 @@ void Div<float, lang::Opencl>(const size_t num, const 
Block* in, const float x,
 
   std::string kname = "clkernel_divide_scalar_matx";
   auto kernel = ctx->kernels->at(kname);
-
+  
   cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
   cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
 
@@ -142,7 +142,7 @@ void Div<float, lang::Opencl>(const size_t num, const float 
x, const Block* in,
 
   std::string kname = "clkernel_divide_scalar_xmat";
   auto kernel = ctx->kernels->at(kname);
-
+  
   cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
   cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
 
@@ -162,7 +162,7 @@ void Div<float, lang::Opencl>(const size_t num, const 
Block* in1, const Block* i
 
   std::string kname = "clkernel_divide";
   auto kernel = ctx->kernels->at(kname);
-
+  
   cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
   cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
   cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
@@ -401,7 +401,7 @@ void Set<float, lang::Opencl>(const size_t num, const float 
x, Block* out, Conte
 
   std::string kname = "clkernel_set";
   auto kernel = ctx->kernels->at(kname);
-
+  
   cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
 
   kernel.setArg(0, (cl_int)num);
@@ -568,21 +568,29 @@ void Tanh<float, lang::Opencl>(const size_t num, const 
Block* in, Block* out, Co
 // Random functions
 // **************************************
 
+/// Seed value required for generating distributions.
+static unsigned int seed[4] = {0, 32, 42, 888};
+/// Number of generation rounds used in the current algorithm.
+static cl_uint rounds = 8;
+
 template<>
 void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* 
out, Context *ctx) {
   cl_int status = CL_SUCCESS;
 
-//std::string kname = "clkernel_bernoulli";
   std::string kname = "PRNG_threefry4x32_bernoulli";
   auto kernel = ctx->kernels->at(kname);
   
   cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
 
-  kernel.setArg(0, (cl_int)(num / 4)); // Divide by 4 because kernel uses 
float4 as argument.
-  kernel.setArg(1, p);
-  kernel.setArg(2, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  kernel.setArg(0, outbuf);
+  kernel.setArg(1, seed);
+  kernel.setArg(2, 0.0f); // inf
+  kernel.setArg(3, 1.0f); // sup
+  kernel.setArg(4, p); // threshold
+  kernel.setArg(5, rounds);
+  kernel.setArg(6, cl_uint(num) / 4);
+  
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num/4));
   OCL_CHECK(status, "Failed to enqueue kernel function!");
 }
 
@@ -591,18 +599,19 @@ template<>
 void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const 
float std, Block* out, Context *ctx) {
   cl_int status = CL_SUCCESS;
 
-//std::string kname = "clkernel_gaussian";
   std::string kname = "PRNG_threefry4x32_gaussian";
   auto kernel = ctx->kernels->at(kname);
 
   cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
 
-  kernel.setArg(0, (cl_int)(num / 4));
-  kernel.setArg(1, mean);
-  kernel.setArg(2, std);
-  kernel.setArg(3, outbuf);
+  kernel.setArg(0, outbuf);
+  kernel.setArg(1, seed);
+  kernel.setArg(2, mean); // E
+  kernel.setArg(3, std);  // V
+  kernel.setArg(4, rounds);
+  kernel.setArg(5, cl_uint(num) / 4);
 
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num/4));
   OCL_CHECK(status, "Failed to enqueue kernel function!");
 }
 
@@ -611,18 +620,19 @@ template<>
 void Uniform<float, lang::Opencl>(const size_t num, const float low, const 
float high, Block* out, Context *ctx) {
   cl_int status = CL_SUCCESS;
 
-//std::string kname = "clkernel_uniform";
   std::string kname = "PRNG_threefry4x32_uniform";
   auto kernel = ctx->kernels->at(kname);
 
   cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
-
-  kernel.setArg(0, (cl_int)(num / 4));
-  kernel.setArg(1, low);
-  kernel.setArg(2, high);
-  kernel.setArg(3, outbuf);
-
-  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  
+  status = kernel.setArg(0, outbuf); OCL_CHECK(status, "kernel arg 0");
+  status = kernel.setArg(1, seed); OCL_CHECK(status, "kernel arg 1");
+  status = kernel.setArg(2, low); OCL_CHECK(status, "kernel arg 2");
+  status = kernel.setArg(3, high); OCL_CHECK(status, "kernel arg 3");
+  status = kernel.setArg(4, rounds); OCL_CHECK(status, "kernel arg 4");
+  status = kernel.setArg(5, cl_uint(num) / 4); OCL_CHECK(status, "kernel arg 
5");
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num/4));
   OCL_CHECK(status, "Failed to enqueue kernel function!");
 }
 
@@ -887,17 +897,17 @@ void GEMM<float, lang::Opencl>(const bool transA, const 
bool transB,
 
   std::string kname = "clkernel_gemm";
   auto kernel = ctx->kernels->at(kname);
-
+  
   cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
   cl::Buffer Bbuf = *(static_cast<cl::Buffer*>(B->mutable_data()));
   cl::Buffer Cbuf = *(static_cast<cl::Buffer*>(C->mutable_data()));
 
   // If matrix A needs to be transposed, do it.
-  if (!transA)
+  if (transA)
        Transpose(nrowA, ncolA, Abuf, Abuf, ctx);
 
   // If vector B needs to be transposed, do it.
-  if (!transB)
+  if (transB)
        Transpose(nrowA, ncolB, Bbuf, Bbuf, ctx);
 
   kernel.setArg(0, (cl_int)nrowA);
@@ -908,10 +918,13 @@ void GEMM<float, lang::Opencl>(const bool transA, const 
bool transB,
   kernel.setArg(5, Bbuf);
   kernel.setArg(6, beta);
   kernel.setArg(7, Cbuf);
-
-  cl::NDRange global(nrowA, ncolA);
-  cl::NDRange local(32, 32);
-
+  kernel.setArg(8, cl::Local(sizeof(float) * nrowA * ncolB));
+  kernel.setArg(9, cl::Local(sizeof(float) * nrowA * ncolB));
+  
+// TODO: Try to make the work group size a power of 2 given an arbitrary 
matrix.
+  cl::NDRange global(nrowA, ncolB);
+  cl::NDRange local(nrowA, ncolB);
+  
   status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global, 
local);
   OCL_CHECK(status, "Failed to enqueue kernel function!");
 }
@@ -1070,7 +1083,7 @@ void DiagVec_Left(const size_t size, cl::Buffer& in, 
cl::Buffer& out, Context* c
 
   std::string kname = "clkernel_diagvec_left";
   auto kernel = ctx->kernels->at(kname);
-
+  
   kernel.setArg(0, (cl_uint)size);
   kernel.setArg(1, in);
   kernel.setArg(2, out);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/test/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 95c3cf7..632a2cd 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -4,6 +4,11 @@ ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" 
"gtest/gtest-all.cc")
 
 AUX_SOURCE_DIRECTORY(singa singa_test_source)
 
+IF(NOT BUILD_OPENCL_TESTS)
+    MESSAGE(STATUS "Skipping OpenCL tests")
+    LIST(REMOVE_ITEM singa_test_source "singa/test_opencl.cc")
+ENDIF()
+
 ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source}) 
 ADD_DEPENDENCIES(test_singa singa_core singa_utils)
 MESSAGE(STATUS "link libs" ${singa_linker_libs})

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/57f3a1eb/test/singa/test_opencl.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_opencl.cc b/test/singa/test_opencl.cc
index f426559..0a335e5 100644
--- a/test/singa/test_opencl.cc
+++ b/test/singa/test_opencl.cc
@@ -30,6 +30,50 @@ using singa::Block;
 using singa::Shape;
 using singa::Tensor;
 
+class OpenCL_TensorMath : public ::testing::Test {
+protected:
+
+  OpenCL_TensorMath() {
+    for (int i = 0; i < 4; i++) {
+      float4[i] = (float)i;
+      float4zero[i] = 0.0f;
+    }
+    
+    for (int i = 0; i < 16; i++) {
+      float16[i] = (float)i;
+      float16zero[i] = 0.0f;
+    }
+    
+    auto ocl_dev = std::make_shared<OpenclDevice>();
+    
+    tf4in = Tensor(Shape{1, 4}, ocl_dev);
+    tf4in.CopyDataFromHostPtr(float4, 4);
+    
+    tf4zin = Tensor(Shape{1, 4}, ocl_dev);
+    tf4zin.CopyDataFromHostPtr(float4zero, 4);
+
+    tf16in = Tensor(Shape{4, 4}, ocl_dev);
+    tf16in.CopyDataFromHostPtr(float16, 16);
+    
+    tf16zin = Tensor(Shape{4, 4}, ocl_dev);
+    tf16zin.CopyDataFromHostPtr(float16zero, 16);
+    
+    float empty[10000] = {};
+    empty10k = Tensor(Shape{10000}, ocl_dev);
+    empty10k.CopyDataFromHostPtr(empty, 10000);
+  }
+  
+  float float4[4];
+  float float4zero[4];
+  float float16[16];
+  float float16zero[16];
+  
+  Tensor tf4in, tf16in;
+  Tensor tf4zin, tf16zin;
+  Tensor empty10k;
+};
+
+
 // Makes a float array and fills it with increasing values from 0.
 float* MakeMatrix(const int size) {
   float* mat = new float[size];
@@ -44,6 +88,7 @@ TEST(OpenclDevice, Constructor) {
   EXPECT_EQ(0, dev.id());
 }
 
+
 TEST(OpenclDevice, MemoryAllocFree) {
   OpenclDevice dev;
   Block* b = dev.NewBlock(4);
@@ -75,6 +120,7 @@ TEST(OpenclDevice, CopyDataToFrom) {
   EXPECT_EQ('x', astr[3]);
 }
 
+
 TEST(OpenclDevice, DuplicateDataOnDevice) {
   OpenclDevice dev;
   CppCPU host;
@@ -99,81 +145,483 @@ TEST(OpenclDevice, DuplicateDataOnDevice) {
   EXPECT_EQ('x', astr[3]);
 }
 
-// Tensor tests
-
-TEST(OpenCL_TensorMath, TensorMath_CopyDataToDevice) {
-  auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice());
+// Tensor tests, uses OpenCL_TensorMath class defined above.
 
-  Tensor t(Shape{1, 4}, ocl_dev);
-  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
-  t.CopyDataFromHostPtr(a, 4);
-  
-  CppCPU host;
-  Block* host_out = host.NewBlock(sizeof(float) * 4);
-  ocl_dev->CopyDataToFrom(host_out, t.block(), sizeof(float) * 4, 
singa::kDeviceToHost);
+TEST_F(OpenCL_TensorMath, CopyDataToDevice) {
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
   
-  float* out = static_cast<float*>(host_out->mutable_data());
   EXPECT_EQ(1.0f, out[1]);
   EXPECT_EQ(3.0f, out[3]);
 }
 
-TEST(OpenCL_TensorMath, TensorMath_Abs) {
-  auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice());
 
-  Tensor in(Shape{1, 4}, ocl_dev);
-  float a[] = {0.0f, -1.0f, -2.0f, -3.0f};
-  in.CopyDataFromHostPtr(a, 4);
+TEST_F(OpenCL_TensorMath, MemberAbs) {
+  tf4in = Abs(tf4in);
   
-  in = Abs(in);
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
   
-  CppCPU host;
-  Block* host_out = host.NewBlock(sizeof(float) * 4);
-  ocl_dev->CopyDataToFrom(host_out, in.block(), sizeof(float) * 4, 
singa::kDeviceToHost);
-  
-  float* out = static_cast<float*>(host_out->mutable_data());
   EXPECT_EQ(0.0f, out[0]);
   EXPECT_EQ(1.0f, out[1]);
   EXPECT_EQ(2.0f, out[2]);
   EXPECT_EQ(3.0f, out[3]);
 }
 
-TEST(OpenCL_TensorMath, TensorMath_ScalarAdd) {
-  auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice());
 
-  Tensor in(Shape{1, 4}, ocl_dev);
-  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
-  in.CopyDataFromHostPtr(a, 4);
+TEST_F(OpenCL_TensorMath, MemberExp) {
+  tf4in = Exp(tf4in);
   
-  in += 1.0f;
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
   
-  CppCPU host;
-  Block* host_out = host.NewBlock(sizeof(float) * 4);
-  ocl_dev->CopyDataToFrom(host_out, in.block(), sizeof(float) * 4, 
singa::kDeviceToHost);
+  EXPECT_NEAR(exp(0.0f), out[0], 1e-5);
+  EXPECT_NEAR(exp(1.0f), out[1], 1e-5);
+  EXPECT_NEAR(exp(2.0f), out[2], 1e-5);
+  EXPECT_NEAR(exp(3.0f), out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberLog) {
+  tf4in = Log(tf4in);
   
-  float* out = static_cast<float*>(host_out->mutable_data());
-  EXPECT_EQ(1.0f, out[0]);
-  EXPECT_EQ(2.0f, out[1]);
-  EXPECT_EQ(3.0f, out[2]);
-  EXPECT_EQ(4.0f, out[3]);
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+  
+//  EXPECT_NEAR(log(0.0f), out[0], 1e-5); // Evaluates to neg infinity.
+  EXPECT_NEAR(log(1.0f), out[1], 1e-5);
+  EXPECT_NEAR(log(2.0f), out[2], 1e-5);
+  EXPECT_NEAR(log(3.0f), out[3], 1e-5);
 }
 
-TEST(OpenCL_TensorMath, TensorMath_EltwiseAdd) {
-  auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice());
 
-  Tensor in_1(Shape{1, 4}, ocl_dev);
-  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
-  in_1.CopyDataFromHostPtr(a, 4);
-  Tensor in_2 = in_1.Clone();
+TEST_F(OpenCL_TensorMath, MemberReLU) {
+  tf4in -= 2.0f;
+  Tensor result = ReLU(tf4in);
   
-  in_2 += in_1;
+  result.ToHost();
+  const float* out = result.data<float>();
   
-  CppCPU host;
-  Block* host_out = host.NewBlock(sizeof(float) * 4);
-  ocl_dev->CopyDataToFrom(host_out, in_2.block(), sizeof(float) * 4, 
singa::kDeviceToHost);
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(0.0f, out[1], 1e-5);
+  EXPECT_NEAR(0.0f, out[2], 1e-5);
+  EXPECT_NEAR(1.0f, out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSigmoid) {
+  tf4in = Sigmoid(tf4in);
+  
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+  
+  EXPECT_NEAR(1.0f / (1.0f + exp(-0.0f)), out[0], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), out[1], 1e-5);
+  EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), out[2], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSign) {
+  tf4in -= 1.0f;
+  
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+  
+  EXPECT_NEAR(-1.0f, out[0], 1e-5);
+  EXPECT_NEAR(0.0f, out[1], 1e-5);
+  EXPECT_NEAR(1.0f, out[2], 1e-5);
+  EXPECT_NEAR(2.0f, out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSqrt) {
+  tf4in = Sqrt(tf4in);
+  
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+  
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(1.0f, out[1], 1e-5);
+  EXPECT_NEAR(sqrt(2.0f), out[2], 1e-5);
+  EXPECT_NEAR(sqrt(3.0f), out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSquare) {
+  tf4in = Square(tf4in);
+  
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+  
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(1.0f, out[1], 1e-5);
+  EXPECT_NEAR(4.0f, out[2], 1e-5);
+  EXPECT_NEAR(9.0f, out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberTanh) {
+  tf4in = Tanh(tf4in);
+  
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+  
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(tanh(1.0f), out[1], 1e-5);
+  EXPECT_NEAR(tanh(2.0f), out[2], 1e-5);
+  EXPECT_NEAR(tanh(3.0f), out[3], 1e-5);
+}
+
+
+TEST_F(OpenCL_TensorMath, Sum) {
+  Tensor result = Sum(tf4in, 0);
+
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_NEAR(0.0f, out[0], 1e-5);
+  EXPECT_NEAR(1.0f, out[1], 1e-5);
+  EXPECT_NEAR(2.0f, out[2], 1e-5);
+  EXPECT_NEAR(3.0f, out[3], 1e-5);
+}
+
+TEST_F(OpenCL_TensorMath, MemberLT) {
+  Tensor result = tf4in < 2.0f;
+  
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(1.0f, out[0]);
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(0.0f, out[2]);
+  EXPECT_FLOAT_EQ(0.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberLE) {
+  Tensor result = tf4in <= 2.0f;
+  
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(1.0f, out[0]);
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(1.0f, out[2]);
+  EXPECT_FLOAT_EQ(0.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberGT) {
+  Tensor result = tf4in > 2.0f;
+  
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(0.0f, out[1]);
+  EXPECT_FLOAT_EQ(0.0f, out[2]);
+  EXPECT_FLOAT_EQ(1.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberGE) {
+  Tensor result = tf4in >= 2.0f;
+  
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(0.0f, out[1]);
+  EXPECT_FLOAT_EQ(1.0f, out[2]);
+  EXPECT_FLOAT_EQ(1.0f, out[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberPow) {
+  Tensor result = Pow(tf4in, 2.0f);
+
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(4.0f, out[2]);
+  EXPECT_FLOAT_EQ(9.0f, out[3]);
+  
+  result = Pow(tf4in, tf4in);
+  
+  result.ToHost();
+  const float* out1 = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(1.0f, out1[0]); // 0 ^ 0 is 1, apparently.
+  EXPECT_FLOAT_EQ(1.0f, out1[1]);
+  EXPECT_FLOAT_EQ(4.0f, out1[2]);
+  EXPECT_FLOAT_EQ(27.0f, out1[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberSub) {
+  Tensor result = tf4in - tf4zin;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(2.0f, out[2]);
+  EXPECT_FLOAT_EQ(3.0f, out[3]);
+  
+  result = tf4in - 0.0f;
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(0.0f, out1[0]);
+  EXPECT_FLOAT_EQ(1.0f, out1[1]);
+  EXPECT_FLOAT_EQ(2.0f, out1[2]);
+  EXPECT_FLOAT_EQ(3.0f, out1[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberEltwiseMult) {
+  Tensor result = tf4in * tf4zin;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(0.0f, out[0]);
+  EXPECT_FLOAT_EQ(0.0f, out[1]);
+  EXPECT_FLOAT_EQ(0.0f, out[2]);
+  EXPECT_FLOAT_EQ(0.0f, out[3]);
+  
+  result = tf4in * 10.0f;
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(0.0f, out1[0]);
+  EXPECT_FLOAT_EQ(10.0f, out1[1]);
+  EXPECT_FLOAT_EQ(20.0f, out1[2]);
+  EXPECT_FLOAT_EQ(30.0f, out1[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, MemberDiv) {
+  Tensor result = tf4in / tf4in;
+
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+//  EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by zero.
+  EXPECT_FLOAT_EQ(1.0f, out[1]);
+  EXPECT_FLOAT_EQ(1.0f, out[2]);
+  EXPECT_FLOAT_EQ(1.0f, out[3]);
+  
+  result = tf4in / 10.0f;
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+  
+  EXPECT_FLOAT_EQ(0.0f, out1[0]);
+  EXPECT_FLOAT_EQ(0.1f, out1[1]);
+  EXPECT_FLOAT_EQ(0.2f, out1[2]);
+  EXPECT_FLOAT_EQ(0.3f, out1[3]);
+  
+  result = Div(10.0f, tf4in);
+
+  result.ToHost();
+  const float* out2 = result.data<float>();
+  
+//  EXPECT_FLOAT_EQ(0.0f, out[0]); // Divide by 0.
+  EXPECT_FLOAT_EQ(10.0f, out2[1]);
+  EXPECT_FLOAT_EQ(5.0f, out2[2]);
+  EXPECT_NEAR((10.0f / 3.0f), out2[3], 1e-5);
+}
+
+// **************************************
+// Random functions
+// **************************************
+
+TEST_F(OpenCL_TensorMath, Bernoulli) {
+  const float p = 0.3f;
+
+  Bernoulli(p, &empty10k);
+  
+  empty10k.ToHost();
+  const float* out = empty10k.data<float>();
+
+  float sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += out[i];
+
+  float mean = sum / 10000;
+  
+  EXPECT_NEAR(mean, p, 1e-2);
+  
+  sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
+  float variance = sum / 9999;
+  
+  EXPECT_NEAR(variance, p * (1 - p), 1e-2);
+}
+
+
+TEST_F(OpenCL_TensorMath, Gaussian) {
+  Gaussian(0.0f, 1.0f, &empty10k);
+  
+  empty10k.ToHost();
+  const float* out = empty10k.data<float>();
+  
+  float sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += out[i];
+  float mean = sum / 10000;
+  
+  EXPECT_NEAR(mean, 0.0f, 1e-2);
+  
+  sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
+  float variance = sum / 9999;
+  
+  EXPECT_NEAR(variance, 1.0f, 1e-2);
+}
+
+
+TEST_F(OpenCL_TensorMath, Uniform) {
+  Uniform(0.1f, 0.2f, &empty10k);
+  
+  empty10k.ToHost();
+  const float* out = empty10k.data<float>();
+  
+  float sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += out[i];
+  float mean = sum / 10000;
+  
+  EXPECT_NEAR(mean, 0.15f, 1e-2);
+  
+  sum = 0.0f;
+  for (int i = 0; i < 10000; i++) sum += (out[i] - mean) * (out[i] - mean);
+  float variance = sum / 9999;
+  
+  EXPECT_NEAR(variance, 0.01f, 1e-2);
+}
+
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
+
+TEST_F(OpenCL_TensorMath, EltwiseAdd) {
+  Tensor result = tf4in + tf4in;
+
+  result.ToHost();
+  const float* out = result.data<float>();
   
-  float* out = static_cast<float*>(host_out->mutable_data());
   EXPECT_EQ(0.0f, out[0]);
   EXPECT_EQ(2.0f, out[1]);
   EXPECT_EQ(4.0f, out[2]);
-  EXPECT_EQ(6.0f, out[3]); 
+  EXPECT_EQ(6.0f, out[3]);
+  
+  result = tf4in + tf4zin;
+
+  result.ToHost();
+  const float* out1 = result.data<float>();
+  
+  EXPECT_EQ(0.0f, out1[0]);
+  EXPECT_EQ(1.0f, out1[1]);
+  EXPECT_EQ(2.0f, out1[2]);
+  EXPECT_EQ(3.0f, out1[3]);
+  
+  result = Tensor(tf4in.shape(), tf4in.device(), tf4in.data_type());
+  Add(tf4in, tf4in, &result);
+
+  result.ToHost();
+  const float* out2 = result.data<float>();
+  
+  EXPECT_EQ(0.0f, out2[0]);
+  EXPECT_EQ(2.0f, out2[1]);
+  EXPECT_EQ(4.0f, out2[2]);
+  EXPECT_EQ(6.0f, out2[3]);
+  
+  result = tf4in + 1.0f;
+  
+  result.ToHost();
+  const float* out3 = result.data<float>();
+  
+  EXPECT_EQ(1.0f, out3[0]);
+  EXPECT_EQ(2.0f, out3[1]);
+  EXPECT_EQ(3.0f, out3[2]);
+  EXPECT_EQ(4.0f, out3[3]);
+}
+
+
+TEST_F(OpenCL_TensorMath, SetValue) {
+  const float one_third = 1.0f / 3.0f;
+  empty10k.SetValue(one_third);
+  
+  empty10k.ToHost();
+  const float* out = empty10k.data<float>();
+  
+  EXPECT_EQ(one_third, out[0]);
+  EXPECT_EQ(one_third, out[1]);
+  EXPECT_EQ(one_third, out[1024]);
+  EXPECT_EQ(one_third, out[4096]);
+  EXPECT_EQ(one_third, out[9998]);
+  EXPECT_EQ(one_third, out[9999]);
+}
+
+
+TEST_F(OpenCL_TensorMath, Axpy) {
+  Axpy(10.0f, tf4in, &tf4in);
+  
+  tf4in.ToHost();
+  const float* out = tf4in.data<float>();
+  
+  EXPECT_EQ(0.0f, out[0]);  // 0 * 10 + 0 = 0
+  EXPECT_EQ(11.0f, out[1]); // 1 * 10 + 1 = 11
+  EXPECT_EQ(22.0f, out[2]); // 2 * 10 + 2 = 22
+  EXPECT_EQ(33.0f, out[3]); // 3 * 10 + 3 = 33
 }
+
+TEST_F(OpenCL_TensorMath, Mult) {
+  Tensor result = Mult(tf4in, tf4zin.T()); // Multiply with zero.
+  
+  result.ToHost();
+  const float* out = result.data<float>();
+  
+  EXPECT_EQ(0.0f, out[0]); // 1x4 * 4x1 = 1x1.
+  
+  result = Mult(tf4in, tf4in.T());
+  
+  result.ToHost();
+  const float* out0 = result.data<float>();
+  
+  EXPECT_EQ(14.0f, out0[0]); // 1x4 * 4x1 = 1x1.
+  
+  tf16zin.SetValue(10.0f); // Multiply with 10.0.
+  result = Mult(tf16in, tf16zin); // 4x4 * 4x4 = 4x4.
+  
+  result.ToHost();
+  const float* out1 = result.data<float>();
+  EXPECT_EQ(240.0f, out1[0]);
+  EXPECT_EQ(280.0f, out1[1]);
+  EXPECT_EQ(320.0f, out1[2]);
+  EXPECT_EQ(360.0f, out1[3]);
+  
+  EXPECT_EQ(240.0f, out1[4]);
+  EXPECT_EQ(280.0f, out1[5]);
+  EXPECT_EQ(320.0f, out1[6]);
+  EXPECT_EQ(360.0f, out1[7]);
+  
+  EXPECT_EQ(240.0f, out1[8]);
+  EXPECT_EQ(280.0f, out1[9]);
+  EXPECT_EQ(320.0f, out1[10]);
+  EXPECT_EQ(360.0f, out1[11]);
+  
+  EXPECT_EQ(240.0f, out1[12]);
+  EXPECT_EQ(280.0f, out1[13]);
+  EXPECT_EQ(320.0f, out1[14]);
+  EXPECT_EQ(360.0f, out1[15]);
+}
+
+
+
+// TODO: ComputeCrossEntropy, SoftmaxCrossEntropy

[1/6] incubator-singa git commit: SINGA-173 OpenCL device support and implementation.

Reply via email to