http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/src/core/tensor/tensor_math_opencl.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.h 
b/src/core/tensor/tensor_math_opencl.h
index c4b1347..bfc051d 100644
--- a/src/core/tensor/tensor_math_opencl.h
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -17,12 +17,1084 @@
  */
 
#ifndef SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
#define SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_

#ifdef USE_OPENCL
//#include <CL/cl2.hpp>

#include "singa/utils/opencl_utils.h"
#include "tensor_math.h"
 
 namespace singa {
 
+// Some forward declarations of utility functions that only exist here.
+void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, 
cl::Buffer& out, Context* ctx);
+void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* 
ctx);
+void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, 
Context* ctx);
+
+// **************************************
+// Element-wise functions
+// **************************************
+
+template<>
+void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_abs";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Add<float, lang::Opencl>(const size_t num, const Block* in, const float 
x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_add_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Add<float, lang::Opencl>(const size_t num, const Block* in1, const Block* 
in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_add";
+  auto kernel = ctx->kernels->at(kname);
+  
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Clamp<float, lang::Opencl>(const size_t num, const float low, const float 
high, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_clamp";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, low);
+  kernel.setArg(2, high);
+  kernel.setArg(3, inbuf);
+  kernel.setArg(4, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Div<float, lang::Opencl>(const size_t num, const Block* in, const float 
x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide_scalar_matx";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Div<float, lang::Opencl>(const size_t num, const float x, const Block* 
in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide_scalar_xmat";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Div<float, lang::Opencl>(const size_t num, const Block* in1, const Block* 
in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in, const 
float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_eltmult_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in1, 
const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_eltmult";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Exp<float, lang::Opencl>(const size_t num, const Block* in, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_exp";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void LE<float, lang::Opencl>(const size_t num, const Block *in, const float x, 
Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_le";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Log<float, lang::Opencl>(const size_t num, const Block* in, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_log";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, 
Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_lt";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void GE<float, lang::Opencl>(const size_t num, const Block *in, const float x, 
Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_ge";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, 
Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_gt";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Pow<float, lang::Opencl>(const size_t num, const Block* in, float x, 
Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_pow_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Pow<float, lang::Opencl>(const size_t num, const Block* in1, const Block* 
in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_pow";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void ReLU<float, lang::Opencl>(const size_t num, const Block* in, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_relu";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+template<>
+void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_set";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* 
out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_sigmoid";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sign<float, lang::Opencl>(const size_t num, const Block* in, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_sign";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sqrt<float, lang::Opencl>(const size_t num, const Block* in, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_sqrt";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Square<float, lang::Opencl>(const size_t num, const Block* in, Block* 
out, Context* ctx) {
+  Pow<float, lang::Opencl>(num, in, 2, out, ctx);
+}
+
+
+template<>
+void Sub<float, lang::Opencl>(const size_t num, const Block* in, const float 
x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_subtract_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sub<float, lang::Opencl>(const size_t num, const Block* in1, const Block* 
in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_subtract";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
 
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
 }
 
 
+template<>
+void Sum<float, lang::Opencl>(const size_t num, const Block* in, float* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_reduce";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+  
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  float* temp = new float[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete temp;
+}
+
+
+template<>
+void Tanh<float, lang::Opencl>(const size_t num, const Block* in, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_tanh";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+// **************************************
+// Random functions
+// **************************************
+
template<>
void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) {
  // Fill `out` with Bernoulli(p) samples from the threefry4x32 counter-based
  // PRNG kernel (each work item produces a float4, i.e. 4 samples).
  cl_int status = CL_SUCCESS;

//std::string kname = "clkernel_bernoulli";
  std::string kname = "PRNG_threefry4x32_bernoulli";
  auto kernel = ctx->kernels->at(kname);

  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));

  kernel.setArg(0, (cl_int)(num / 4)); // Divide by 4 because kernel uses float4 as argument.
  kernel.setArg(1, p);
  kernel.setArg(2, outbuf);

  // NOTE(review): the global range is `num` work items while the kernel is
  // told there are only num/4 float4 elements, and `num` not divisible by 4
  // truncates the count — confirm against the kernel source.
  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
  OCL_CHECK(status, "Failed to enqueue kernel function!");
}
+
+
template<>
void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) {
  // Fill `out` with N(mean, std) samples from the threefry4x32 counter-based
  // PRNG kernel (each work item produces a float4, i.e. 4 samples).
  cl_int status = CL_SUCCESS;

//std::string kname = "clkernel_gaussian";
  std::string kname = "PRNG_threefry4x32_gaussian";
  auto kernel = ctx->kernels->at(kname);

  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));

  // num/4: the kernel consumes/produces float4 vectors.
  kernel.setArg(0, (cl_int)(num / 4));
  kernel.setArg(1, mean);
  kernel.setArg(2, std);
  kernel.setArg(3, outbuf);

  // NOTE(review): global range `num` vs num/4 float4 elements — looks 4x
  // oversubscribed; confirm against the kernel source.
  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
  OCL_CHECK(status, "Failed to enqueue kernel function!");
}
+
+
template<>
void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) {
  // Fill `out` with Uniform[low, high) samples from the threefry4x32
  // counter-based PRNG kernel (each work item produces a float4).
  cl_int status = CL_SUCCESS;

//std::string kname = "clkernel_uniform";
  std::string kname = "PRNG_threefry4x32_uniform";
  auto kernel = ctx->kernels->at(kname);

  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));

  // num/4: the kernel consumes/produces float4 vectors.
  kernel.setArg(0, (cl_int)(num / 4));
  kernel.setArg(1, low);
  kernel.setArg(2, high);
  kernel.setArg(3, outbuf);

  // NOTE(review): global range `num` vs num/4 float4 elements — looks 4x
  // oversubscribed; confirm against the kernel source.
  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
  OCL_CHECK(status, "Failed to enqueue kernel function!");
}
+
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
+template<>
+void Amax<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_amax";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(size_t) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+  kernel.setArg(4, cl::Local(sizeof(size_t)));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  size_t* temp = new size_t[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete temp;
+}
+
+
+template<>
+void Amin<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_amin";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(size_t) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+  kernel.setArg(4, cl::Local(sizeof(size_t)));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  size_t* temp = new size_t[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete temp;
+}
+
+
+template<>
+void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_asum";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  float* temp = new float[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete temp;
+}
+
+
+template<>
+void Axpy<float, lang::Opencl>(const size_t num, const float alpha, const 
Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_axpy";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, alpha);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Nrm2<float, lang::Opencl>(const size_t num, const Block* in, float* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_nrm2";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(sizeof(float) * (std::pow(2, num))));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  float* temp = new float[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete temp;
+}
+
+
+template<>
+void Scale<float, lang::Opencl>(const size_t num, const float x, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_scale";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block 
*in2, float *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_dot";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outval);
+  kernel.setArg(4, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  float* temp = new float[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete temp;
+}
+
+
+template<>
+void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, 
const float alpha,
+                 const Block *A, const Block *v, const float beta, Block* out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_gemv";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
+  cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)m);
+  kernel.setArg(1, (cl_int)n);
+  kernel.setArg(2, alpha);
+  kernel.setArg(3, Abuf);
+  kernel.setArg(4, vbuf);
+  kernel.setArg(5, beta);
+  kernel.setArg(6, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(m, n));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
template<>
void DGMM<float, lang::Opencl>(bool side_right,
                 const size_t nrow, const size_t ncol,
                 const Block *M, const Block *v, Block *out, Context *ctx) {
  // Diagonal-matrix multiply: out = M * diag(v) when side_right, else
  // diag(v) * M. The vector is first expanded by DiagVec_{Right,Left}.
  cl_int status = CL_SUCCESS;

  cl::Buffer Mbuf = *(static_cast<cl::Buffer*>(M->mutable_data()));
  cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));

  std::string kname;
  if (side_right) {
       // NOTE(review): DiagVec_* is called with the same buffer as input and
       // output — confirm the expansion kernel tolerates in-place use, since
       // a size-n vector expanded to n*n would overrun a size-n buffer.
       DiagVec_Right(ncol, vbuf, vbuf, ctx);
       kname = "clkernel_dgmm_right";
  } else {
       DiagVec_Left(nrow, vbuf, vbuf, ctx);
       kname = "clkernel_dgmm_left";
  }

  auto kernel = ctx->kernels->at(kname);

  kernel.setArg(0, (cl_int)nrow);
  kernel.setArg(1, (cl_int)ncol);
  kernel.setArg(2, Mbuf);
  kernel.setArg(3, vbuf);
  kernel.setArg(4, outbuf);
  // NOTE(review): nrow*ncol floats of local memory can exceed the device's
  // CL_DEVICE_LOCAL_MEM_SIZE for non-trivial matrices — verify.
  kernel.setArg(5, cl::Local(sizeof(float) * nrow * ncol));

  cl::NDRange global(nrow); // Only nrow because current implementation is 1 dimensional
//  cl::NDRange local();

  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
  OCL_CHECK(status, "Failed to enqueue kernel function!");
}
+
+
template<>
void GEMM<float, lang::Opencl>(const bool transA, const bool transB,
                 const size_t nrowA, const size_t ncolB, const size_t ncolA,
                 const float alpha, const Block *A, const Block *B, const float beta,
                 Block *C, Context *ctx) {
  // General matrix multiply: C = alpha * op(A) * op(B) + beta * C,
  // where op() transposes depending on transA/transB.
  cl_int status = CL_SUCCESS;

  std::string kname = "clkernel_gemm";
  auto kernel = ctx->kernels->at(kname);

  cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
  cl::Buffer Bbuf = *(static_cast<cl::Buffer*>(B->mutable_data()));
  cl::Buffer Cbuf = *(static_cast<cl::Buffer*>(C->mutable_data()));

  // If matrix A needs to be transposed, do it.
  // NOTE(review): the condition is `!transA` — it transposes precisely when
  // the caller says A is NOT transposed, and in-place (same buffer as in and
  // out). Presumably this adapts the layout the kernel expects, but both
  // points need confirming against clkernel_gemm and Transpose().
  if (!transA)
       Transpose(nrowA, ncolA, Abuf, Abuf, ctx);

  // If vector B needs to be transposed, do it.
  // NOTE(review): B is ncolA x ncolB, so transposing with dimensions
  // (nrowA, ncolB) looks wrong unless nrowA == ncolA — verify.
  if (!transB)
       Transpose(nrowA, ncolB, Bbuf, Bbuf, ctx);

  kernel.setArg(0, (cl_int)nrowA);
  kernel.setArg(1, (cl_int)ncolB);
  kernel.setArg(2, (cl_int)ncolA);
  kernel.setArg(3, alpha);
  kernel.setArg(4, Abuf);
  kernel.setArg(5, Bbuf);
  kernel.setArg(6, beta);
  kernel.setArg(7, Cbuf);

  // NOTE(review): C is nrowA x ncolB, yet the global range is
  // (nrowA, ncolA); and a fixed 32x32 work-group (1024 items) exceeds
  // CL_DEVICE_MAX_WORK_GROUP_SIZE on many devices, and requires global
  // dimensions divisible by 32 — confirm all three.
  cl::NDRange global(nrowA, ncolA);
  cl::NDRange local(32, 32);

  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global, local);
  OCL_CHECK(status, "Failed to enqueue kernel function!");
}
+
+template <>
+void ComputeCrossEntropy<float, lang::Opencl>(const size_t batchsize, const 
size_t dim,
+                         const Block *p, const Block *t, Block *loss,
+                         Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_crossentropy";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data()));
+  cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data()));
+  cl::Buffer lossbuf = *(static_cast<cl::Buffer*>(loss->mutable_data()));
+
+  kernel.setArg(0, (cl_uint)batchsize);
+  kernel.setArg(1, (cl_uint)dim);
+  kernel.setArg(2, pbuf);
+  kernel.setArg(3, tbuf);
+  kernel.setArg(4, lossbuf);
+
+  cl::NDRange global(batchsize);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const 
size_t dim,
+                            const Block *p, const Block *t, Block *grad,
+                            Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_softmaxentropy";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data()));
+  cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data()));
+  cl::Buffer gradbuf = *(static_cast<cl::Buffer*>(grad->mutable_data()));
+
+  kernel.setArg(0, (cl_uint)batchsize);
+  kernel.setArg(1, (cl_uint)dim);
+  kernel.setArg(2, pbuf);
+  kernel.setArg(3, tbuf);
+  kernel.setArg(4, gradbuf);
+
+  cl::NDRange global(batchsize);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+// **************************************
+// Matrix functions
+// **************************************
+/*
+template<>
+void AddCol<float, lang::Opencl>(const size_t nrow, const size_t ncol, const 
Block* A, const Block* v, Block* out, Context* ctx) {
+  std::string kname = "clkernel_addcol";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
+  kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
+  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, 
ncol));
+}
+
+template<>
+void AddRow<float, lang::Opencl>(const size_t nrow, const size_t ncol, const 
Block* A, const Block* v, Block* out, Context* ctx) {
+  std::string kname = "clkernel_addrow";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
+  kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
+  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, 
ncol));
+}
+
+template<>
+void Outer<float, lang::Opencl>(const size_t m, const size_t n, const Block* 
lhs, const Block* rhs, Block* out, Context* ctx) {
+  std::string kname = "clkernel_outerproduct";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)m);
+  kernel.setArg(1, (cl_int)n);
+  kernel.setArg(2, static_cast<const float*>(lhs->data()));
+  kernel.setArg(3, static_cast<const float*>(rhs->data()));
+  kernel.setArg(4, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, 
n));
+}
+
+template<>
+void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, 
const Block* in, Block* out, Context* ctx) {
+  std::string kname = "clkernel_sumcol";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(in->mutable_data()));
+  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, 
ncol));
+}*/
/*
// NOTE(review): Disabled pending OpenCL kernel support. The original
// assigned `status` but never checked it; report enqueue failures.
template<>
void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
  cl_int status = CL_SUCCESS;

  std::string kname = "clkernel_sumrow";
  auto kernel = ctx->kernels->at(kname);

  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));

  kernel.setArg(0, (cl_int)nrow);
  kernel.setArg(1, (cl_int)ncol);
  kernel.setArg(2, inbuf);
  kernel.setArg(3, outbuf);
  // Scratch space: one float per matrix element; size is in bytes.
  kernel.setArg(4, cl::Local(sizeof(float) * nrow * ncol));

  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
  OCL_CHECK(status, "Failed to enqueue kernel function!");
}
*/
+
+
+#define BLOCK_DIM 16
+
+void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, 
cl::Buffer& out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_transpose";
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_uint)nrow);
+  kernel.setArg(1, (cl_uint)ncol);
+  kernel.setArg(2, in);
+  kernel.setArg(3, out);
+  kernel.setArg(4, cl::Local((BLOCK_DIM + 1) * BLOCK_DIM));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(nrow, ncol));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+#undef BLOCK_DIM
+
+
+/// This is a utility function that transforms a single-row vector into a 
diagonal matrix.
+/// For example, a vector of size n will become a matrix of size n*n where 
only the positions nx == ny will have values.
+void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* 
ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_diagvec_left";
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_uint)size);
+  kernel.setArg(1, in);
+  kernel.setArg(2, out);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(size));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, 
Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_diagvec_right";
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_uint)size);
+  kernel.setArg(1, in);
+  kernel.setArg(2, out);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), 
cl::NDRange(size));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+} // namespace singa
+
+#endif // USE_OPENCL
+
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/src/utils/opencl_utils.cc
----------------------------------------------------------------------
diff --git a/src/utils/opencl_utils.cc b/src/utils/opencl_utils.cc
new file mode 100644
index 0000000..e4fe69b
--- /dev/null
+++ b/src/utils/opencl_utils.cc
@@ -0,0 +1,63 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/utils/opencl_utils.h"
+
+#ifdef USE_OPENCL
+
+void PrintDeviceInfo(const cl::Device &dev) {
+  cl_int status = CL_SUCCESS;
+
+  LOG(INFO) << "\tDevice type: " << dev.getInfo<CL_DEVICE_TYPE>(&status);
+  LOG(INFO) << "\tUnified memory: " << 
dev.getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>(&status);
+  LOG(INFO) << "\tClock speed (MHz): " << 
dev.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(&status);
+  LOG(INFO) << "\tECC memory: " << 
dev.getInfo<CL_DEVICE_ERROR_CORRECTION_SUPPORT>(&status);
+  LOG(INFO) << "\tLittle endian: " << 
dev.getInfo<CL_DEVICE_ENDIAN_LITTLE>(&status);
+  LOG(INFO) << "\tCompute units: " << 
dev.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&status);
+  LOG(INFO) << "\tMax work grp size: " << 
dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&status);
+//LOG(INFO) << "\tMax work item size: " << 
dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(&status);
+  LOG(INFO) << "\tMax item dimension: " << 
dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>(&status);
+  LOG(INFO) << "\tQueue properties: " << 
dev.getInfo<CL_DEVICE_QUEUE_PROPERTIES>(&status);
+  LOG(INFO) << "\tExecution capabilities: " << 
dev.getInfo<CL_DEVICE_EXECUTION_CAPABILITIES>(&status);
+  LOG(INFO) << "\tMax mem alloc size: " << 
dev.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>(&status);
+  LOG(INFO) << "\tGlobal mem size: " << 
dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>(&status);
+  LOG(INFO) << "\tLocal mem size: " << 
dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(&status);
+  LOG(INFO) << "\n";
+
+  OCL_CHECK(status, "Failed to retrieve device information!");
+}
+
+
+void PrintPlatformInfo(const cl::Platform &p) {
+  cl_int status = CL_SUCCESS;
+
+  LOG(INFO) << "\tName:         " << p.getInfo<CL_PLATFORM_NAME>(&status);
+  LOG(INFO) << "\tProfile: " << p.getInfo<CL_PLATFORM_PROFILE>(&status);
+  LOG(INFO) << "\tVersion: " << p.getInfo<CL_PLATFORM_VERSION>(&status);
+  LOG(INFO) << "\tVendor:  " << p.getInfo<CL_PLATFORM_VENDOR>(&status);
+  LOG(INFO) << "\tExtensions: " << p.getInfo<CL_PLATFORM_EXTENSIONS>(&status);
+  LOG(INFO) << "\n";
+
+  OCL_CHECK(status, "Failed to retrieve platform information!");
+}
+
+
+#endif // USE_OPENCL

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/test/singa/test_opencl.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_opencl.cc b/test/singa/test_opencl.cc
new file mode 100644
index 0000000..f426559
--- /dev/null
+++ b/test/singa/test_opencl.cc
@@ -0,0 +1,179 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/opencl_device.h"
+#include "singa/core/tensor.h"
+#include "singa/proto/core.pb.h"
+
+using singa::OpenclDevice;
+using singa::CppCPU;
+using singa::Block;
+using singa::Shape;
+using singa::Tensor;
+
+// Makes a float array and fills it with increasing values from 0.
// Allocates a float array of `size` elements filled with 0, 1, 2, ...
// The caller owns the returned array and must release it with delete[].
float* MakeMatrix(const int size) {
  float* values = new float[size];
  for (int idx = 0; idx < size; ++idx)
    values[idx] = static_cast<float>(idx);
  return values;
}
+
+
+TEST(OpenclDevice, Constructor) {
+  OpenclDevice dev;
+  EXPECT_EQ(0, dev.id());
+}
+
+TEST(OpenclDevice, MemoryAllocFree) {
+  OpenclDevice dev;
+  Block* b = dev.NewBlock(4);
+  EXPECT_NE(nullptr, b);
+  EXPECT_EQ(4u, b->size());
+  dev.FreeBlock(b);
+}
+
+// Tests for integrity of one round of data transfer to an OpenCL device and 
back.
+TEST(OpenclDevice, CopyDataToFrom) {
+  OpenclDevice dev;
+  CppCPU host;
+  
+  Block* a = host.NewBlock(4);
+  Block* b = dev.NewBlock(4);
+  Block* c = host.NewBlock(4);
+  
+  // Allocate the Block object on the host.
+  char s[] = {'a', 'b', 'c', 'x'};
+  host.CopyDataFromHostPtr(a, s, 4);
+  
+  // Copy back and forth.
+  dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
+  dev.CopyDataToFrom(c, b, 4, singa::kDeviceToHost);
+  
+  const char* astr = static_cast<const char*>(c->data());
+  EXPECT_EQ('a', astr[0]);
+  EXPECT_EQ('b', astr[1]);
+  EXPECT_EQ('x', astr[3]);
+}
+
+TEST(OpenclDevice, DuplicateDataOnDevice) {
+  OpenclDevice dev;
+  CppCPU host;
+  
+  Block* a = host.NewBlock(4);
+  Block* b = dev.NewBlock(4);
+  Block* c = dev.NewBlock(4);
+  Block* d = host.NewBlock(4);
+  
+  // Allocate the Block object on the host.
+  char s[] = {'a', 'b', 'c', 'x'};
+  host.CopyDataFromHostPtr(a, s, 4);
+  
+  // Copy to device and duplicate.
+  dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
+  dev.CopyDataToFrom(c, b, 4, singa::kDeviceToDevice);
+  dev.CopyDataToFrom(d, c, 4, singa::kDeviceToHost);
+  
+  const char* astr = static_cast<const char*>(d->data());
+  EXPECT_EQ('a', astr[0]);
+  EXPECT_EQ('b', astr[1]);
+  EXPECT_EQ('x', astr[3]);
+}
+
+// Tensor tests
+
+TEST(OpenCL_TensorMath, TensorMath_CopyDataToDevice) {
+  auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice());
+
+  Tensor t(Shape{1, 4}, ocl_dev);
+  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
+  t.CopyDataFromHostPtr(a, 4);
+  
+  CppCPU host;
+  Block* host_out = host.NewBlock(sizeof(float) * 4);
+  ocl_dev->CopyDataToFrom(host_out, t.block(), sizeof(float) * 4, 
singa::kDeviceToHost);
+  
+  float* out = static_cast<float*>(host_out->mutable_data());
+  EXPECT_EQ(1.0f, out[1]);
+  EXPECT_EQ(3.0f, out[3]);
+}
+
+TEST(OpenCL_TensorMath, TensorMath_Abs) {
+  auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice());
+
+  Tensor in(Shape{1, 4}, ocl_dev);
+  float a[] = {0.0f, -1.0f, -2.0f, -3.0f};
+  in.CopyDataFromHostPtr(a, 4);
+  
+  in = Abs(in);
+  
+  CppCPU host;
+  Block* host_out = host.NewBlock(sizeof(float) * 4);
+  ocl_dev->CopyDataToFrom(host_out, in.block(), sizeof(float) * 4, 
singa::kDeviceToHost);
+  
+  float* out = static_cast<float*>(host_out->mutable_data());
+  EXPECT_EQ(0.0f, out[0]);
+  EXPECT_EQ(1.0f, out[1]);
+  EXPECT_EQ(2.0f, out[2]);
+  EXPECT_EQ(3.0f, out[3]);
+}
+
+TEST(OpenCL_TensorMath, TensorMath_ScalarAdd) {
+  auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice());
+
+  Tensor in(Shape{1, 4}, ocl_dev);
+  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
+  in.CopyDataFromHostPtr(a, 4);
+  
+  in += 1.0f;
+  
+  CppCPU host;
+  Block* host_out = host.NewBlock(sizeof(float) * 4);
+  ocl_dev->CopyDataToFrom(host_out, in.block(), sizeof(float) * 4, 
singa::kDeviceToHost);
+  
+  float* out = static_cast<float*>(host_out->mutable_data());
+  EXPECT_EQ(1.0f, out[0]);
+  EXPECT_EQ(2.0f, out[1]);
+  EXPECT_EQ(3.0f, out[2]);
+  EXPECT_EQ(4.0f, out[3]);
+}
+
+TEST(OpenCL_TensorMath, TensorMath_EltwiseAdd) {
+  auto ocl_dev = std::make_shared<OpenclDevice>(OpenclDevice());
+
+  Tensor in_1(Shape{1, 4}, ocl_dev);
+  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
+  in_1.CopyDataFromHostPtr(a, 4);
+  Tensor in_2 = in_1.Clone();
+  
+  in_2 += in_1;
+  
+  CppCPU host;
+  Block* host_out = host.NewBlock(sizeof(float) * 4);
+  ocl_dev->CopyDataToFrom(host_out, in_2.block(), sizeof(float) * 4, 
singa::kDeviceToHost);
+  
+  float* out = static_cast<float*>(host_out->mutable_data());
+  EXPECT_EQ(0.0f, out[0]);
+  EXPECT_EQ(2.0f, out[1]);
+  EXPECT_EQ(4.0f, out[2]);
+  EXPECT_EQ(6.0f, out[3]); 
+}


Reply via email to