SINGA-230 OpenCL Convolution and Pooling

- Added implementation files.
- Added relevant unit test files.
- Bug fixes in OpenCL GEMV and GEMM.
- Added licensing details to the LICENSE file and to the kernels retrieved from Caffe.
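For context on the GEMV/GEMM fixes below: `viennacl::trans()` returns a lazily evaluated expression and does not modify its argument, so the old code's bare `viennacl::trans(A_buf);` statement had no effect, and the raw buffer was also wrapped with the wrong dimensions whenever a transpose was requested. The following is a minimal standalone sketch, not SINGA code; it assumes ViennaCL with its OpenCL backend is available and uses ordinary `viennacl::matrix` objects filled from host data instead of wrapping existing `cl_mem` buffers via `WrapHandle`. It contrasts discarding the transpose expression with materialising it before `viennacl::linalg::prod`, which is the pattern the new GEMM code adopts (the GEMV fix follows the same idea, wrapping the buffer with swapped (n, m) dimensions before transposing).

#include <iostream>
#include <vector>
#include <viennacl/matrix.hpp>
#include <viennacl/linalg/prod.hpp>

int main() {
  // Host-side data: A is 2x3, B is 2x2; we want C = A^T * B, which is 3x2.
  std::vector<std::vector<float> > a = {{1, 2, 3}, {4, 5, 6}};
  std::vector<std::vector<float> > b = {{1, 0}, {0, 1}};

  viennacl::matrix<float> A(2, 3), B(2, 2), C(3, 2);
  viennacl::copy(a, A);  // host -> device
  viennacl::copy(b, B);

  // Old (buggy) pattern: the transposed expression is discarded, so a later
  // prod() still sees the untransposed 2x3 matrix.
  //   viennacl::trans(A);  // no effect on A itself

  // Fixed pattern: materialise the transpose into a matrix of the correct
  // shape, then multiply.
  viennacl::matrix<float> A_t(3, 2);
  A_t = viennacl::trans(A);                  // 3x2 copy of A^T
  C = viennacl::linalg::prod(A_t, B);        // 3x2 result

  std::cout << "C is " << C.size1() << " x " << C.size2() << std::endl;
  return 0;
}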
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/e3df3bd7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/e3df3bd7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/e3df3bd7 Branch: refs/heads/master Commit: e3df3bd763b9cfa41b584790f5eed89cdd19684b Parents: f3665e5 Author: Tan Li Boon <[email protected]> Authored: Tue Aug 30 15:35:33 2016 +0800 Committer: Tan Li Boon <[email protected]> Committed: Sun Sep 25 14:46:05 2016 +0800 ---------------------------------------------------------------------- LICENSE | 2 + cmake/Thirdparty/FindViennaCL.cmake | 3 - include/singa/core/device.h | 6 +- include/singa/utils/opencl_utils.h | 8 +- src/core/device/opencl_device.cc | 17 +- src/core/tensor/tensor.cc | 7 + src/core/tensor/tensor_math_opencl.h | 43 +++-- src/model/layer/convolution.cc | 35 ++-- src/model/layer/convolution.h | 2 +- src/model/layer/im2col.cl | 85 +++++++++ src/model/layer/opencl_convolution.cc | 220 +++++++++++++++++++++++ src/model/layer/opencl_convolution.h | 75 ++++++++ src/model/layer/opencl_pooling.cc | 272 +++++++++++++++++++++++++++++ src/model/layer/opencl_pooling.h | 109 ++++++++++++ src/model/layer/pooling.cc | 117 +++++++------ src/model/layer/pooling.cl | 264 ++++++++++++++++++++++++++++ src/model/layer/pooling.h | 27 ++- test/CMakeLists.txt | 6 - test/singa/test_opencl_convolution.cc | 223 +++++++++++++++++++++++ test/singa/test_opencl_pooling.cc | 156 +++++++++++++++++ 20 files changed, 1559 insertions(+), 118 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/LICENSE ---------------------------------------------------------------------- diff --git a/LICENSE b/LICENSE index 4f9d1e7..62a3430 100644 --- a/LICENSE +++ b/LICENSE @@ -305,6 +305,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SINGA bundles the following under BSD 2-clause license: include/singa/utils/cuda_utils.h src/core/tensor/distribution.cl +src/model/layer/im2col.cl +src/model/layer/pooling.cl cmake/ThirdParty/FindViennaCL.cmake COPYRIGHT http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/cmake/Thirdparty/FindViennaCL.cmake ---------------------------------------------------------------------- diff --git a/cmake/Thirdparty/FindViennaCL.cmake b/cmake/Thirdparty/FindViennaCL.cmake index c0addf8..263c80f 100644 --- a/cmake/Thirdparty/FindViennaCL.cmake +++ b/cmake/Thirdparty/FindViennaCL.cmake @@ -1,8 +1,5 @@ -<<<<<<< HEAD # This file is retrieved from caffe/cmake/Modules/FindViennaCL.cmake. -======= ->>>>>>> 8ac95cb... SINGA-243 ViennaCL backend for OpenCL support SET(ViennaCL_WITH_OPENCL TRUE) SET(VIENNACL_INCLUDE_SEARCH_PATHS http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/include/singa/core/device.h ---------------------------------------------------------------------- diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 62fa250..0fecc6d 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -72,7 +72,7 @@ class Device { } /// Copy data within or across devices. 
- void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, + virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, CopyDirection direction, int dst_offset, int src_offset); void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes, @@ -214,9 +214,9 @@ public: // Overridden, inherited methods void SetRandSeed(unsigned seed) override; - void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, + virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, CopyDirection direction, int dst_offset = 0, - int src_offset = 0); + int src_offset = 0) override; protected: /// The OpenCL device that this object represents. http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/include/singa/utils/opencl_utils.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/opencl_utils.h b/include/singa/utils/opencl_utils.h index 8c05643..0445f13 100644 --- a/include/singa/utils/opencl_utils.h +++ b/include/singa/utils/opencl_utils.h @@ -51,16 +51,16 @@ inline viennacl::ocl::handle<cl_mem> -WrapHandle(cl_mem in, viennacl::ocl::context *ctx) { +WrapHandle(cl_mem in, viennacl::ocl::context &ctx) { if (in != nullptr) { - viennacl::ocl::handle<cl_mem> memhandle(in, *ctx); + viennacl::ocl::handle<cl_mem> memhandle(in, ctx); memhandle.inc(); return memhandle; } else { cl_int err; - cl_mem dummy = clCreateBuffer(ctx->handle().get(), CL_MEM_READ_WRITE, 0, + cl_mem dummy = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, 0, nullptr, &err); - viennacl::ocl::handle<cl_mem> memhandle(dummy, *ctx); + viennacl::ocl::handle<cl_mem> memhandle(dummy, ctx); return memhandle; } } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/core/device/opencl_device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc index 6b371c4..0c8f010 100644 --- a/src/core/device/opencl_device.cc +++ b/src/core/device/opencl_device.cc @@ -45,6 +45,7 @@ OpenclDevice::OpenclDevice(int id, int num_executors) this->this_device = ocl::current_device(); BuildPrograms(cl_src_path); + BuildPrograms("../src/model/layer"); } @@ -70,18 +71,18 @@ void OpenclDevice::CopyDataToFrom(Block* dst, Block* src, size_t nBytes, switch(direction) { case kHostToDevice: { - auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); memory_write(dst_handle, dst_offset, nBytes, src->data()); return; } case kDeviceToHost: { - auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx); + auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); memory_read(src_handle, src_offset, nBytes, dst->mutable_data()); return; } case kDeviceToDevice: { - auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx); - auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx); + auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); memory_copy(src_handle, dst_handle, src_offset, dst_offset, nBytes); return; } @@ -131,18 +132,18 @@ void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes, switch(direction) { case kHostToDevice: { - auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), 
ocl_ctx); memory_write(dst_handle, 0, nBytes, src); return; } case kDeviceToHost: { - auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx); + auto src_handle = WrapHandle((const cl_mem)src, ocl_ctx); memory_read(src_handle, 0, nBytes, dst); return; } case kDeviceToDevice: { - auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx); - auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx); + auto src_handle = WrapHandle((const cl_mem)src, ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), ocl_ctx); memory_copy(src_handle, dst_handle, 0, 0, nBytes); return; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/core/tensor/tensor.cc ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc index 670b27e..d7e8f86 100644 --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@ -787,6 +787,7 @@ Tensor ConcatenateColumns(const vector<Tensor> &in) { } return out; } + Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) { CHECK_LT(start, end); CHECK_GE(in.shape(0), end); @@ -797,6 +798,7 @@ Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) { CopyDataToFrom(&out, in, out.Size(), 0, start * sample_size); return out; } + Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end) { CHECK_EQ(in.nDim(), 2u); CHECK_LT(start, end); @@ -865,6 +867,7 @@ Tensor SliceRows(const Tensor &in, const size_t start, const size_t end) { */ return ret; } + void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); } void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); } @@ -910,6 +913,7 @@ void Bernoulli(const SType p, Tensor *out) { }, {}, {out->block()}, true); }); } + template void Bernoulli<float>(const float p, Tensor *out); template <typename SType> @@ -922,6 +926,7 @@ void Uniform(const SType low, const SType high, Tensor *out) { }, {}, {out->block()}, true); }); } + template void Uniform<float>(const float low, const float high, Tensor *out); template <typename SType> @@ -947,6 +952,7 @@ void Axpy(const SType alpha, const Tensor &in, Tensor *out) { }, {in.block(), out->block()}, {out->block()}); }); } + template void Axpy<float>(const float alpha, const Tensor &in, Tensor *out); @@ -1006,6 +1012,7 @@ void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) { }, {p.block(), t.block()}, {loss->block()}); }); } + void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) { CHECK_LE(p->nDim(), 2u); CHECK_LE(t.nDim(), 2u); // TODO(wangwei) consider multi-labels. 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/core/tensor/tensor_math_opencl.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h index c387031..a209de4 100644 --- a/src/core/tensor/tensor_math_opencl.h +++ b/src/core/tensor/tensor_math_opencl.h @@ -27,6 +27,7 @@ #include <viennacl/vector.hpp> #include <viennacl/matrix.hpp> +#include <viennacl/linalg/prod.hpp> #include <viennacl/linalg/inner_prod.hpp> #include <viennacl/linalg/norm_2.hpp> #include <viennacl/linalg/sum.hpp> @@ -510,19 +511,23 @@ void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block *i template<> void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, const float alpha, const Block *A, const Block *v, const float beta, Block* out, Context* ctx) { - - viennacl::matrix<float> A_in((const cl_mem)A->data(), m, n); - viennacl::vector<float> v_in((const cl_mem)v->data(), trans ? m : n); - viennacl::vector<float> o_in(static_cast<cl_mem>(out->mutable_data()), trans ? n : m); + viennacl::vector<float> v_buf((const cl_mem)v->data(), n); + viennacl::vector<float> o_buf(static_cast<cl_mem>(out->mutable_data()), m); - if (trans) viennacl::trans(A_in); + viennacl::matrix<float> A_buf; - o_in *= beta; - o_in += alpha * viennacl::linalg::prod(A_in, v_in); -} + if (trans) { + A_buf = viennacl::matrix<float>((const cl_mem)A->data(), n, m); + A_buf = viennacl::trans(A_buf); + } else { + A_buf = viennacl::matrix<float>((const cl_mem)A->data(), m, n); + } + o_buf *= beta; + o_buf += alpha * viennacl::linalg::prod(A_buf, v_buf); +} -/// multiply a matrix with a diagnoal matrix constructed using values from 'v'. +/// multiply a matrix with a diagonal matrix constructed using values from 'v'. 
/// if matrix_lef_side is true, do M*v; else do v*M template<> void DGMM<float, lang::Opencl>(bool side_right, @@ -549,12 +554,22 @@ void GEMM<float, lang::Opencl>(const bool transA, const bool transB, const float alpha, const Block *A, const Block *B, const float beta, Block *C, Context *ctx) { - viennacl::matrix<float> A_buf((const cl_mem)A->data(), nrowA, ncolA); - viennacl::matrix<float> B_buf((const cl_mem)B->data(), ncolA, ncolB); + viennacl::matrix<float> A_buf, B_buf; viennacl::matrix<float> C_buf(static_cast<cl_mem>(C->mutable_data()), nrowA, ncolB); - - if (transA) viennacl::trans(A_buf); - if (transB) viennacl::trans(B_buf); + + if (transA) { + A_buf = viennacl::matrix<float>((const cl_mem)A->data(), ncolA, nrowA); + A_buf = viennacl::trans(A_buf); + } else { + A_buf = viennacl::matrix<float>((const cl_mem)A->data(), nrowA, ncolA); + } + + if (transB) { + B_buf = viennacl::matrix<float>((const cl_mem)B->data(), ncolB, ncolA); + B_buf = viennacl::trans(B_buf); + } else { + B_buf = viennacl::matrix<float>((const cl_mem)B->data(), ncolA, ncolB); + } C_buf *= beta; C_buf += alpha * viennacl::linalg::prod(A_buf, B_buf); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/convolution.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc index 52e9d93..bd7cc00 100644 --- a/src/model/layer/convolution.cc +++ b/src/model/layer/convolution.cc @@ -142,14 +142,16 @@ const std::pair<Tensor, vector<Tensor>> Convolution::Backward( size_t batchsize = grad.shape(0); size_t imagesize = src_data.Size() / batchsize; if (bias_term_) { - Tensor tmp1 = - Reshape(grad, Shape{batchsize * num_filters_, - grad.Size() / (batchsize * num_filters_)}); + auto tmpshp = Shape{batchsize * num_filters_, grad.Size() / (batchsize * num_filters_)}; + Tensor tmp1 = Reshape(grad, tmpshp); + Tensor tmp2(Shape{batchsize * num_filters_}); SumColumns(tmp1, &tmp2); Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_}); + SumRows(tmp3, &db); } + auto in_data = src_data.data<float>(); Tensor col_data(Shape{col_height_, col_width_}); float *data_col = new float[col_height_ * col_width_]; @@ -157,14 +159,17 @@ const std::pair<Tensor, vector<Tensor>> Convolution::Backward( for (size_t b = 0; b < batchsize; b++) { Im2col(in_data + b * imagesize, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data_col); + col_data.CopyDataFromHostPtr(data_col, col_height_ * col_width_); Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_}); CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size()); dw += Mult(grad_b, col_data.T()); Tensor dcol_b = Mult(weight_.T(), grad_b); auto dcol_data = dcol_b.data<float>(); + Col2im(dcol_data, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, dx_b); + dx.CopyDataFromHostPtr(dx_b, imagesize, b * imagesize); } param_grad.push_back(dw); @@ -180,12 +185,13 @@ void Convolution::ToDevice(std::shared_ptr<Device> device) { } void Convolution::Im2col(const float *data_im, const int channels, - const int height, const int width, const int kernel_h, - const int kernel_w, const int pad_h, const int pad_w, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float *data_col) { int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / 
stride_w + 1; + int width_col = ( width + 2 * pad_w - kernel_w) / stride_w + 1; int channels_col = channels * kernel_h * kernel_w; for (int c = 0; c < channels_col; ++c) { int w_offset = c % kernel_w; @@ -206,18 +212,19 @@ void Convolution::Im2col(const float *data_im, const int channels, } void Convolution::Col2im(const float *data_col, const int channels, - const int height, const int width, const int patch_h, - const int patch_w, const int pad_h, const int pad_w, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float *data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int channels_col = channels * patch_h * patch_w; + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = ( width + 2 * pad_w - kernel_w) / stride_w + 1; + int channels_col = channels * kernel_h * kernel_w; for (int c = 0; c < channels_col; ++c) { - int w_offset = c % patch_w; - int h_offset = (c / patch_w) % patch_h; - int c_im = c / patch_h / patch_w; + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; for (int h = 0; h < height_col; ++h) { for (int w = 0; w < width_col; ++w) { int h_pad = h * stride_h - pad_h + h_offset; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/convolution.h ---------------------------------------------------------------------- diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h index d85a17b..7b7fd00 100644 --- a/src/model/layer/convolution.h +++ b/src/model/layer/convolution.h @@ -52,7 +52,7 @@ class Convolution : public Layer { const int stride_w, float* data_col); void Col2im(const float* data_col, const int channels, const int height, - const int width, const int patch_h, const int patch_w, + const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_im); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/im2col.cl ---------------------------------------------------------------------- diff --git a/src/model/layer/im2col.cl b/src/model/layer/im2col.cl new file mode 100644 index 0000000..e977dd6 --- /dev/null +++ b/src/model/layer/im2col.cl @@ -0,0 +1,85 @@ +// This file is modified from the file located at +// https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/im2col.cl +// and is covered under the BSD 2-Clause License, as indicated in the LICENSE +// file at the root of this repository. 
+ +__kernel void im2col(const int n, __global const float* data_im, + const int data_im_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int height_col, const int width_col, + __global float* data_col, const int data_col_off) { + + for (int index = get_global_id(0); index < n; + index += get_global_size(0)) { + const int h_index = index / width_col; + const int h_col = h_index % height_col; + const int w_col = index % width_col; + const int c_im = h_index / height_col; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col * stride_h - pad_h; + const int w_offset = w_col * stride_w - pad_w; + + __global float* data_col_ptr = data_col + data_col_off; + data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; + __global const float* data_im_ptr = data_im + data_im_off; + data_im_ptr += (c_im * height + h_offset) * width + w_offset; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h_im = h_offset + i * dilation_h; + int w_im = w_offset + j * dilation_w; + *data_col_ptr = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? + data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +__kernel void col2im(const int n, __global const float* data_col, + const int data_col_off, const int channels, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int height_col, const int width_col, + __global float* data_im, const int data_im_off) { + + for (int index = get_global_id(0); index < n; index += get_global_size(0)) { + float val = 0; + const int w_im = index % width + pad_w; + const int h_im = (index / width) % height + pad_h; + const int c_im = index / (width * height); + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + // compute the start and end of the output + const int w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_im / stride_w + 1, width_col); + const int h_col_start = + (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; + const int h_col_end = min(h_im / stride_h + 1, height_col); + + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int h_k = (h_im - h_col * stride_h); + int w_k = (w_im - w_col * stride_w); + if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { + h_k /= dilation_h; + w_k /= dilation_w; + int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * + height_col + h_col) * width_col + w_col; + val += data_col[data_col_off + data_col_index]; + } + } + } + data_im[data_im_off + index] = val; + } +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/opencl_convolution.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/opencl_convolution.cc b/src/model/layer/opencl_convolution.cc new file mode 100644 index 0000000..c43719f --- /dev/null +++ b/src/model/layer/opencl_convolution.cc @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "opencl_convolution.h" + +#ifdef USE_OPENCL + +namespace singa { + +RegisterLayerClass(opencl_convolution, OpenclConvolution); + +/// \copydoc Layer::Forward(int flag, const Tensor&) +const Tensor OpenclConvolution::Forward(int flag, const Tensor &input) { + CHECK(buf_.empty()); + CHECK_EQ(input.device()->lang(), kOpencl); + CHECK_EQ(input.nDim(), 4u); + + if (flag & kTrain) buf_.push(input); + + auto batchsize = input.shape(0); + auto imagesize = input.Size() / batchsize; + auto data_type = input.data_type(); + auto device = input.device(); + + Shape shape{batchsize, num_filters_, conv_height_, conv_width_}; + Tensor output(shape, device, data_type); + Tensor col_data(Shape{col_height_, col_width_}, device, data_type); + + for (size_t b = 0; b < batchsize; b++) { + int offset = b * imagesize; + + col_data.device()->Exec([input, offset, col_data, this](Context* ctx) mutable { + + this->Im2Col(input.block(), offset, + height_, width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + conv_height_, conv_width_, + 0, channels_, + col_data.block(), ctx); + }, + {input.block()}, + {col_data.block()}); + + Tensor each = Mult(weight_, col_data); + + if (bias_term_) { + AddColumn(bias_, &each); + } + + CopyDataToFrom(&output, each, each.Size(), b * each.Size()); + } + + return output; +} + + +/// \copydoc Layer::Backward(int, const Tensor&, const Tensor&); +const std::pair<Tensor, std::vector<Tensor>> +OpenclConvolution::Backward(int flag, const Tensor &grad) { + CHECK(!buf_.empty()); + CHECK_EQ(grad.device()->lang(), kOpencl); + CHECK_EQ(grad.nDim(), 4u); + + std::vector<Tensor> param_grad; + + Tensor src_data = buf_.top(); + buf_.pop(); + + Tensor dx, db, dw; + dx.ResetLike(src_data); + db.ResetLike(bias_); + dw.ResetLike(weight_); + dw.SetValue(0.0f); + + size_t batchsize = grad.shape(0); + size_t imagesize = src_data.Size() / batchsize; + + if (bias_term_) { + auto tmpshp = Shape{batchsize * num_filters_, grad.Size() / (batchsize * num_filters_)}; + Tensor tmp1 = Reshape(grad, tmpshp); + + Tensor tmp2(Shape{batchsize * num_filters_}, + grad.device(), grad.data_type()); + SumColumns(tmp1, &tmp2); + Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_}); + + SumRows(tmp3, &db); + } + + Tensor col_data(Shape{col_height_, col_width_}, + grad.device(), grad.data_type()); + + for (size_t b = 0; b < batchsize; b++) { + + int im_offset = b * imagesize; + int col_offset = 0; // Always keep this to zero. 
+ + col_data.device()->Exec([src_data, col_data, im_offset, col_offset, this](Context* ctx) mutable { + + this->Im2Col(src_data.block(), im_offset, + height_, width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + conv_height_, conv_width_, + col_offset, channels_, + col_data.block(), ctx); + }, + {src_data.block()}, + {col_data.block()}); + + Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_}, + grad.device(), grad.data_type()); + CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size()); + + dw += Mult(grad_b, col_data.T()); + Tensor dcol_b = Mult(weight_.T(), grad_b); + + dx.device()->Exec([dcol_b, dx, im_offset, col_offset, this](Context* ctx) mutable { + + this->Col2Im(dcol_b.block(), col_offset, + height_, width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + conv_height_, conv_width_, + im_offset, channels_, + dx.block(), ctx); + }, + {dcol_b.block()}, + {dx.block()}); + } + + param_grad.push_back(dw); + param_grad.push_back(db); + + return std::make_pair(dx, param_grad); +} + + +void OpenclConvolution::Setup(const Shape &in_sample, const LayerConf &conf) { + Convolution::Setup(in_sample, conf); +} + + +void OpenclConvolution::ToDevice(std::shared_ptr<Device> device) { + Convolution::ToDevice(device); +} + + +void OpenclConvolution::Im2Col(Block* src, int data_im_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int conv_h, const int conv_w, + const int col_data_off, const int channels, + Block* dst, Context* ctx) { + + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("im2col.cl", "im2col"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + + int num_kernels = channels * conv_h * conv_w; + + viennacl::ocl::enqueue(kernel(num_kernels, src_buf, data_im_off, + height, width, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + 1, 1, conv_h, conv_w, + dst_buf, col_data_off)); +} + + +void OpenclConvolution::Col2Im(Block* src, const int col_data_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int conv_h, const int conv_w, + const int data_im_off, const int channels, + Block* dst, Context* ctx) { + + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("im2col.cl", "col2im"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + + int num_kernels = channels * height * width; + + viennacl::ocl::enqueue(kernel(num_kernels, src_buf, col_data_off, channels, + height, width, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + 1, 1, conv_h, conv_w, + dst_buf, data_im_off)); +} + + +} // namespace singa + +#endif // USE_OPENCL http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/opencl_convolution.h ---------------------------------------------------------------------- diff --git a/src/model/layer/opencl_convolution.h b/src/model/layer/opencl_convolution.h new file mode 100644 index 0000000..a25acd2 --- /dev/null +++ b/src/model/layer/opencl_convolution.h @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or 
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_MODEL_LAYER_OPENCL_CONVOLUTION_H_ +#define SRC_MODEL_LAYER_OPENCL_CONVOLUTION_H_ + +#include "singa/singa_config.h" +#include "singa/core/common.h" +#include "singa/model/layer.h" +#include "singa/utils/opencl_utils.h" +#include "singa/proto/core.pb.h" +#include "convolution.h" + +#ifdef USE_OPENCL + +namespace singa { + +class OpenclConvolution : public Convolution { +public: + + /// \copydoc Layer::layer_type() + const std::string layer_type() const override { return "OpenclConvolution"; } + + const Tensor Forward(int flag, const Tensor &input) override; + + const std::pair<Tensor, std::vector<Tensor>> + Backward(int flag, const Tensor &grad) override; + + /// \copydoc Layer::Setup(const LayerConf&); + void Setup(const Shape &in_sample, const LayerConf &conf) override; + + void ToDevice(std::shared_ptr<Device> device) override; + +private: + + void Im2Col(Block* src, int data_im_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int conv_h, const int conv_w, + const int data_col_off, const int channels, + Block* dst, Context* ctx); + + void Col2Im(Block* src, const int data_col_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int conv_h, const int conv_w, + const int data_im_off, const int channels, + Block* dst, Context* ctx); + +}; + +} // namespace singa + +#endif // USE_OPENCL + +#endif // SRC_MODEL_LAYER_OPENCL_CONVOLUTION_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/opencl_pooling.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/opencl_pooling.cc b/src/model/layer/opencl_pooling.cc new file mode 100644 index 0000000..2e35330 --- /dev/null +++ b/src/model/layer/opencl_pooling.cc @@ -0,0 +1,272 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "opencl_pooling.h" + +#ifdef USE_OPENCL + +namespace singa { + +RegisterLayerClass(opencl_pooling, OpenclPooling); + +const Tensor OpenclPooling::Forward(int flag, const Tensor &input) { + CHECK(buf_.empty()); + CHECK_EQ(input.device()->lang(), kOpencl); + CHECK_EQ(input.nDim(), 4u); + + auto batchsize = input.shape(0); + auto data_type = input.data_type(); + auto device = input.device(); + + Shape shape{batchsize, channels_, pooled_height_, pooled_width_}; + Tensor output = Tensor(shape, device, data_type); + + output.device()->Exec([input, output, flag, this](Context *ctx) { + Block* in_block = input.block(); + Block* outblock = output.block(); + + if (pool_ == PoolingConf_PoolMethod_MAX) { + Tensor mask; + mask.ResetLike(output); + + Pooling_Forward_Max((int)output.Size(), in_block, mask.block(), + height_, width_, + pooled_height_, pooled_width_, + kernel_h_, kernel_w_, + stride_h_, stride_w_, + pad_h_, pad_w_, + outblock, channels_, ctx); + + if (flag & kTrain) + buf_.push(mask); + + } else if (pool_ == PoolingConf_PoolMethod_AVE) { + Pooling_Forward_Ave((int)output.Size(), in_block, outblock, + height_, width_, pooled_height_, pooled_width_, + kernel_h_, kernel_w_, stride_h_, stride_w_, + pad_h_, pad_w_, channels_, ctx); + } else + LOG(FATAL) << "Unknown pooling method."; + + }, {input.block()}, {output.block()}); + + return output; +} + + +const std::pair<Tensor, std::vector<Tensor>> +OpenclPooling::Backward(int flag, const Tensor &grad) { + CHECK_EQ(grad.device()->lang(), kOpencl); + CHECK_EQ(grad.nDim(), 4u); + + std::vector<Tensor> param_grad; + + auto batchsize = grad.shape(0); + auto data_type = grad.data_type(); + auto device = grad.device(); + Shape shape{batchsize, channels_, height_, width_}; + + Tensor dx(shape, device, data_type); + + dx.device()->Exec([dx, grad, this](Context *ctx) { + if (pool_ == PoolingConf_PoolMethod_MAX) { + CHECK(!buf_.empty()); + Tensor mask = buf_.top(); + buf_.pop(); + + Pooling_Backward_Max(grad.block(), mask.block(), + dx.Size(), channels_, + height_, width_, + pooled_height_, pooled_width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + dx.block(), ctx); + + } else if (pool_ == PoolingConf_PoolMethod_AVE) { + Pooling_Backward_Ave(grad.block(), grad.shape(0), channels_, + height_, width_, + pooled_height_, pooled_width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + dx.block(), ctx); + + } else + LOG(FATAL) << "Unknown pooling method."; + + }, {grad.block()}, {dx.block()}); + + return std::make_pair(dx, param_grad); +} + + +void OpenclPooling::Setup(const Shape& in_sample, const LayerConf &conf) { + Pooling::Setup(in_sample, conf); + auto pool_conf = conf.pooling_conf(); +} + + +void OpenclPooling::Pooling_Forward_Max(const int num, Block* src, Block* mask, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + Block* dst, const int channels, + Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "max_pool_forward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + auto maskbuf = WrapHandle(static_cast<cl_mem>(mask->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(num, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, 
kernel_w, stride_h, stride_w, + pad_h, pad_w, dst_buf, maskbuf)); +} + + +void OpenclPooling::Pooling_Forward_Ave(const int num, Block* src, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + const int channels, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "ave_pool_forward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(num, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + pad_h, pad_w, dst_buf)); +} + + +void OpenclPooling::Pooling_Forward_Sto_Train(Block* src, Block* rand, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, + Block* dst, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_forward_train"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + auto randbuf = WrapHandle(static_cast<cl_mem>(rand->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(height * width, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + randbuf, dst_buf)); +} + + +void OpenclPooling::Pooling_Forward_Sto_Test(Block* src, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_forward_test"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(height * width, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + dst_buf)); +} + + +void OpenclPooling::Pooling_Backward_Max(Block* top, Block* mask, + const int num, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Block* bottom, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "max_pool_backward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(top->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(bottom->mutable_data()), ocl_ctx); + auto mask_buf = WrapHandle(static_cast<cl_mem>(mask->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(num, src_buf, mask_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + pad_h, pad_w, dst_buf)); +} + + +void OpenclPooling::Pooling_Backward_Ave(Block* bottom, + const int num, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + 
const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Block* top, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "ave_pool_backward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(bottom->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(top->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(num, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + pad_h, pad_w, dst_buf)); +} + + +void OpenclPooling::Pooling_Backward_Sto(Block* src, Block* rand, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_backward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + auto randbuf = WrapHandle(static_cast<cl_mem>(rand->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(height * width, randbuf, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + dst_buf)); +} + + +} // namespace singa + +#endif // USE_OPENCL http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/opencl_pooling.h ---------------------------------------------------------------------- diff --git a/src/model/layer/opencl_pooling.h b/src/model/layer/opencl_pooling.h new file mode 100644 index 0000000..01e447c --- /dev/null +++ b/src/model/layer/opencl_pooling.h @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SRC_MODEL_LAYER_OPENCL_POOLING_H_ +#define SRC_MODEL_LAYER_OPENCL_POOLING_H_ + +#include "pooling.h" +#include "singa/core/common.h" +#include "singa/model/layer.h" +#include "singa/utils/opencl_utils.h" +#include "singa/proto/core.pb.h" + +#ifdef USE_OPENCL + +namespace singa { + +class OpenclPooling : public Pooling { +public: + + /// \copydoc Layer::layer_type() + const std::string layer_type() const override { return "OpenclPooling"; } + + const Tensor Forward(int flag, const Tensor &input) override; + + const std::pair<Tensor, std::vector<Tensor>> + Backward(int flag, const Tensor &grad) override; + + /// \copydoc Layer::Setup(const LayerConf&); + void Setup(const Shape &in_sample, const LayerConf &conf) override; + +private: + void Pooling_Forward_Max(const int num, Block* src, Block* mask, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + Block* dst, const int channels, + Context* ctx); + + void Pooling_Forward_Ave(const int num, Block* src, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + const int channels, Context* ctx); + + void Pooling_Forward_Sto_Train(Block* src, Block* rand, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, + Block* dst, Context* ctx); + + void Pooling_Forward_Sto_Test(Block* src, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, Context* ctx); + + void Pooling_Backward_Max(Block* top, Block* mask, + const int num, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Block* bottom, Context* ctx); + + void Pooling_Backward_Ave(Block* bottom, const int num, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Block* top, Context* ctx); + + void Pooling_Backward_Sto(Block* src, Block* rand, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, Context* ctx); + +}; + +} // namespace singa + +#endif // USE_OPENCL + +#endif // SRC_MODEL_LAYER_OPENCL_POOLING_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/pooling.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/pooling.cc b/src/model/layer/pooling.cc index 1312776..ff8d58e 100644 --- a/src/model/layer/pooling.cc +++ b/src/model/layer/pooling.cc @@ -85,49 +85,55 @@ const Tensor Pooling::Forward(int flag, const Tensor& input) { Tensor mask; mask.ResetLike(output); float* maskptr = new float[mask.Size()]; - ForwardMaxPooling(inptr, batchsize, channels_, height_, width_, kernel_h_, - kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, outptr, + 
ForwardMaxPooling(inptr, batchsize, channels_, height_, width_, + pooled_height_, pooled_width_, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, outptr, maskptr); mask.CopyDataFromHostPtr(maskptr, mask.Size()); if (flag & kTrain) buf_.push(mask); delete[] maskptr; } else if (pool_ == PoolingConf_PoolMethod_AVE) - ForwardAvgPooling(inptr, batchsize, channels_, height_, width_, kernel_h_, - kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, outptr); + ForwardAvgPooling(inptr, batchsize, channels_, height_, width_, + pooled_height_, pooled_width_, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, outptr); else - LOG(FATAL) << "Unknow pooling method"; + LOG(FATAL) << "Unknown pooling method"; output.CopyDataFromHostPtr(outptr, output.Size()); delete[] outptr; return output; } -const std::pair<Tensor, vector<Tensor>> Pooling::Backward(int flag, - const Tensor& grad) { +const std::pair<Tensor, vector<Tensor>> +Pooling::Backward(int flag, const Tensor& grad) { CHECK_EQ(grad.device()->lang(), kCpp); CHECK_EQ(grad.nDim(), 4u); + vector<Tensor> param_grad; - size_t batchsize = grad.shape(0); - Shape shape{batchsize, channels_, height_, width_}; + + auto batchsize = grad.shape(0); + auto dtype = grad.data_type(); auto dev = grad.device(); - DataType dtype = grad.data_type(); + Shape shape{batchsize, channels_, height_, width_}; + Tensor dx(shape, dev, dtype); auto gradptr = grad.data<float>(); float* dxptr = new float[dx.Size()]; + if (pool_ == PoolingConf_PoolMethod_MAX) { CHECK(!buf_.empty()); Tensor mask = buf_.top(); buf_.pop(); auto maskptr = mask.data<float>(); BackwardMaxPooling(gradptr, maskptr, batchsize, channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, - stride_w_, dxptr); + pooled_height_, pooled_width_, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, dxptr); } else if (pool_ == PoolingConf_PoolMethod_AVE) { BackwardAvgPooling(gradptr, batchsize, channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, - stride_w_, dxptr); + pooled_height_, pooled_width_, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, dxptr); } else { - LOG(FATAL) << "Unknow pooling method"; + LOG(FATAL) << "Unknown pooling method"; } dx.CopyDataFromHostPtr(dxptr, dx.Size()); @@ -136,32 +142,32 @@ const std::pair<Tensor, vector<Tensor>> Pooling::Backward(int flag, } void Pooling::ForwardMaxPooling(const float* bottom, const int num, - const int channels, const int height, - const int width, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, - const int stride_w, float* top, float* mask) { - int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; - int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; - int top_count = num * top_height * top_width * channels; + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* top, float* mask) { + int top_count = num * pooled_h * pooled_w * channels; for (int i = 0; i < top_count; i++) { mask[i] = -1; top[i] = -FLT_MAX; } const int bottom_offset = height * width; - const int top_offset = top_height * top_width; + const int top_offset = pooled_h * pooled_w; // The main loop for (int n = 0; n < num; ++n) { for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < top_height; ++ph) { - for (int pw = 0; pw < top_width; ++pw) { + for (int ph = 0; ph 
< pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; int hend = std::min(hstart + kernel_h, height); int wend = std::min(wstart + kernel_w, width); hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); - const int top_index = ph * top_width + pw; + const int top_index = ph * pooled_w + pw; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = h * width + w; @@ -184,20 +190,19 @@ void Pooling::ForwardMaxPooling(const float* bottom, const int num, void Pooling::BackwardMaxPooling(const float* top, const float* mask, const int num, const int channels, const int height, const int width, + const int pooled_h, const int pooled_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* bottom) { - int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; - int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; - const int top_offset = top_height * top_width; + const int top_offset = pooled_h * pooled_w; const int bottom_offset = height * width; memset(bottom, 0, sizeof(float) * num * channels * bottom_offset); for (int n = 0; n < num; ++n) { for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < top_height; ++ph) { - for (int pw = 0; pw < top_width; ++pw) { - const int top_idx = ph * top_width + pw; + for (int ph = 0; ph < pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { + const int top_idx = ph * pooled_w + pw; const int bottom_idx = static_cast<int>(mask[top_idx]); bottom[bottom_idx] += top[top_idx]; } @@ -210,24 +215,24 @@ void Pooling::BackwardMaxPooling(const float* top, const float* mask, } void Pooling::ForwardAvgPooling(const float* bottom, const int num, - const int channels, const int height, - const int width, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, - const int stride_w, float* top) { - int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; - int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; - int top_count = num * top_height * top_width * channels; + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* top) { + int top_count = num * pooled_h * pooled_w * channels; for (int i = 0; i < top_count; i++) { top[i] = 0; } const int bottom_offset = height * width; - const int top_offset = top_height * top_width; + const int top_offset = pooled_h * pooled_w; // The main loop for (int n = 0; n < num; ++n) { for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < top_height; ++ph) { - for (int pw = 0; pw < top_width; ++pw) { + for (int ph = 0; ph < pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; int hend = std::min(hstart + kernel_h, height + pad_h); @@ -237,7 +242,7 @@ void Pooling::ForwardAvgPooling(const float* bottom, const int num, wstart = std::max(wstart, 0); hend = std::min(hend, height); wend = std::min(wend, width); - const int top_index = ph * top_width + pw; + const int top_index = ph * pooled_w + pw; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = h * width + w; @@ -255,20 +260,20 @@ void Pooling::ForwardAvgPooling(const float* bottom, const int num, } void 
Pooling::BackwardAvgPooling(const float* top, const int num, - const int channels, const int height, - const int width, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, - const int stride_w, float* bottom) { - int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; - int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; - const int top_offset = top_height * top_width; + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* bottom) { + const int top_offset = pooled_h * pooled_w; const int bottom_offset = height * width; memset(bottom, 0, sizeof(float) * num * channels * bottom_offset); for (int n = 0; n < num; ++n) { for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < top_height; ++ph) { - for (int pw = 0; pw < top_width; ++pw) { + for (int ph = 0; ph < pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; int hend = std::min(hstart + kernel_h, height + pad_h); @@ -278,7 +283,7 @@ void Pooling::BackwardAvgPooling(const float* top, const int num, wstart = std::max(wstart, 0); hend = std::min(hend, height); wend = std::min(wend, width); - const int top_index = ph * top_width + pw; + const int top_index = ph * pooled_w + pw; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = h * width + w; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/pooling.cl ---------------------------------------------------------------------- diff --git a/src/model/layer/pooling.cl b/src/model/layer/pooling.cl new file mode 100644 index 0000000..3ea4ecd --- /dev/null +++ b/src/model/layer/pooling.cl @@ -0,0 +1,264 @@ +// This file is modified from the file located at +// https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/pooling.cl +// and is covered under the BSD 2-Clause License, as indicated in the LICENSE +// file at the root of this repository. 
+ +__kernel void max_pool_forward( + const int nthreads, __global const float* bottom, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + __global float* top, __global float* mask) { + +// printf("%d ", get_global_size(0)); + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + const int pw = i % pooled_w; + const int ph = (i / pooled_w) % pooled_h; + const int c = (i / pooled_w / pooled_h) % channels; + const int n = i / pooled_w / pooled_h / channels; + + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + + float maxval = -FLT_MAX; + int maxidx = -1; + __global const float* bottom_slice = bottom + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (bottom_slice[index] > maxval) { + maxidx = index; + maxval = bottom_slice[maxidx]; + } + } + } + top[i] = maxval; + mask[i] = (float)maxidx; + } +} + +__kernel void ave_pool_forward( + const int nthreads, __global const float* const bottom, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, __global float* top) { + + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + const int pw = i % pooled_w; + const int ph = (i / pooled_w) % pooled_h; + const int c = (i / pooled_w / pooled_h) % channels; + const int n = i / pooled_w / pooled_h / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + hend = min(hend, height); + wend = min(wend, width); + float aveval = 0; + __global const float* bottom_slice = bottom + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + top[i] = aveval / pool_size; + } +} + +__kernel void sto_pool_forward_train( + const int nthreads, __global const float* bottom, + const int channels, const int height, const int width, + const int pooled_h, const int pooled_w, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + __global float* rand_idx, __global float* top) { + + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + const int pw = i % pooled_w; + const int ph = (i / pooled_w) % pooled_h; + const int c = (i / pooled_w / pooled_h) % channels; + const int n = i / pooled_w / pooled_h / channels; + + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + float cumsum = 0.; + __global const float* bottom_slice = bottom + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + } + } + const float thres 
= rand_idx[i] * cumsum; + // Second pass: get value, and set index. + cumsum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + if (cumsum >= thres) { + rand_idx[i] = ((n * channels + c) * height + h) * width + w; + top[i] = bottom_slice[h * width + w]; + h = hend; + w = wend; + } + } + } + } +} + +__kernel void sto_pool_forward_test( + const int nthreads, __global const float* const bottom, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + __global float* top) { + + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + const int pw = i % pooled_w; + const int ph = (i / pooled_w) % pooled_h; + const int c = (i / pooled_w / pooled_h) % channels; + const int n = i / pooled_w / pooled_h / channels; + + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // Initialize cumsum to FLT_MIN to avoid division by zero. + float cumsum = FLT_MIN; + float cumvalues = 0.; + __global const float* bottom_slice = bottom + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; + } + } + top[i] = cumvalues / cumsum; + } +} + +__kernel void max_pool_backward(const int nthreads, + __global const float* top_diff, + __global const float* mask, + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + __global float* bottom_diff) { + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + // find out the local i + // find out the local offset + const int w = i % width; + const int h = (i / width) % height; + const int c = (i / width / height) % channels; + const int n = i / width / height / channels; + + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_h); + const int pwstart = + (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_w); + float gradient = 0.0f; + const int offset = (n * channels + c) * pooled_h * pooled_w; + __global const float* top_diff_slice = top_diff + offset; + __global const float* mask_slice = mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (mask_slice[ph * pooled_w + pw] == (float)(h * width + w)) { + gradient += top_diff_slice[ph * pooled_w + pw]; + } + } + } + bottom_diff[i] = gradient; + } +} + +__kernel void ave_pool_backward(const int nthreads, + __global const float* top_diff, + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + __global float* bottom_diff) { + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + // find out the local i + // find out the local offset + const int w = i % width + pad_w; + const int h = (i / width) % height + pad_h; + const int c = (i / width / height) % channels; + const int n = i / width / height / channels; + + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_h); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_w); + float gradient = 0.0; + __global const float* const top_diff_slice = top_diff + (n * channels + c) * pooled_h * pooled_w; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + gradient += top_diff_slice[ph * pooled_w + pw] / pool_size; + } + } + bottom_diff[i] = gradient; + } +} + +__kernel void sto_pool_backward( + const int nthreads, __global const float* rand_idx, + __global const float* const top_diff, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + __global float* bottom_diff) { + + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + // find out the local i + // find out the local offset + const int w = i % width; + const int h = (i / width) % height; + const int c = (i / width / height) % channels; + const int n = i / width / height / channels; + + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_h); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_w); + float gradient = 0.0; + __global const float* rand_idx_slice = rand_idx + (n * channels + c) * pooled_h * pooled_w; + __global const float* top_diff_slice = top_diff + (n * channels + c) * pooled_h * pooled_w; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + gradient += top_diff_slice[ph * pooled_w + pw] + * (i == (int) (rand_idx_slice[ph * pooled_w + pw])?1.0:0.0); + } + } + bottom_diff[i] = gradient; + } +} + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/pooling.h ---------------------------------------------------------------------- diff --git a/src/model/layer/pooling.h b/src/model/layer/pooling.h index f844799..d16db27 100644 --- a/src/model/layer/pooling.h +++ b/src/model/layer/pooling.h @@ -44,25 +44,34 @@ class Pooling : public Layer { const Tensor& grad) override; void ForwardMaxPooling(const float* bottom, const int num, const int channels, - const int height, const int width, const int kernel_h, - const int kernel_w, const int pad_h, const int pad_w, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* top, float* mask); void BackwardMaxPooling(const float* top, const float* mask, const int num, const int channels, const int height, const int width, + const int pooled_h, const int pooled_w, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* bottom); + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* bottom); void ForwardAvgPooling(const float* bottom, const int num, const int channels, - const int height, const int width, const int kernel_h, - const int kernel_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, float* top); + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* top); void BackwardAvgPooling(const float* top, const int num, const int channels, - const int height, const int width, const int kernel_h, - const int kernel_w, const int pad_h, const int pad_w, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* bottom); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/test/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index efc1983..e1487d2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -30,12 +30,6 @@ ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" "gtest/gtest-all.cc") AUX_SOURCE_DIRECTORY(singa singa_test_source) LIST(REMOVE_ITEM singa_test_source "singa/test_ep.cc") -IF(NOT USE_OPENCL) - MESSAGE(STATUS "Skipping OpenCL tests") - LIST(REMOVE_ITEM singa_test_source "singa/test_opencl.cc") -ENDIF() - - ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source}) ADD_DEPENDENCIES(test_singa singa) #MESSAGE(STATUS "link libs" ${singa_linker_libs}) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/test/singa/test_opencl_convolution.cc 
---------------------------------------------------------------------- diff --git a/test/singa/test_opencl_convolution.cc b/test/singa/test_opencl_convolution.cc new file mode 100644 index 0000000..972756d --- /dev/null +++ b/test/singa/test_opencl_convolution.cc @@ -0,0 +1,223 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#include "../src/model/layer/convolution.h" +#include "../src/model/layer/opencl_convolution.h" + +#include "gtest/gtest.h" + +#ifdef USE_OPENCL + +using singa::OpenclConvolution; +using singa::OpenclDevice; +using singa::Shape; + + +TEST(OpenclConvolution, Setup) { + OpenclConvolution conv; + EXPECT_EQ("OpenclConvolution", conv.layer_type()); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(2); + convconf->set_kernel_w(2); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(1); + convconf->set_stride_w(1); + convconf->set_num_output(2); + convconf->set_bias_term(true); + conv.Setup(Shape{1, 3, 3}, conf); + + EXPECT_EQ(2u, conv.kernel_h()); + EXPECT_EQ(2u, conv.kernel_w()); + EXPECT_EQ(1u, conv.pad_h()); + EXPECT_EQ(1u, conv.pad_w()); + EXPECT_EQ(1u, conv.stride_h()); + EXPECT_EQ(1u, conv.stride_w()); + EXPECT_EQ(2u, conv.num_filters()); + EXPECT_EQ(true, conv.bias_term()); + EXPECT_EQ(1u, conv.channels()); + EXPECT_EQ(3u, conv.height()); + EXPECT_EQ(3u, conv.width()); +} + + +TEST(OpenclConvolution, Forward) { + const size_t batchsize = 2, c = 1, h = 3, w = 3; + const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + + auto ocl = std::make_shared<OpenclDevice>(); + singa::Tensor in(singa::Shape{batchsize, c, h, w}, ocl); + in.CopyDataFromHostPtr(x, batchsize * c * h * w); + + // Set weight and bias manually + const size_t num_filters = 1; + const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h + const float we[num_filters * col_height] = {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 1.0f, 0.0f}; + singa::Tensor weight(singa::Shape{num_filters, col_height}, ocl); + weight.CopyDataFromHostPtr(we, num_filters * col_height); + const float b[num_filters] = {1.0f}; + singa::Tensor bias(singa::Shape{num_filters}, ocl); + bias.CopyDataFromHostPtr(b, num_filters); + OpenclConvolution conv; + conv.set_weight(weight); + conv.set_bias(bias); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(3); + convconf->set_kernel_w(3); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(2); + convconf->set_stride_w(2); 
+ convconf->set_num_output(1); + convconf->set_bias_term(true); + conv.Setup(Shape{1, 3, 3}, conf); + + // Parameter "flag" does not influence convolution + singa::Tensor out1 = conv.Forward(singa::kTrain, in); + out1.ToHost(); + const float *outptr1 = out1.data<float>(); + // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1. + EXPECT_EQ(8u, out1.Size()); + + EXPECT_EQ(3.0f, outptr1[0]); + EXPECT_EQ(7.0f, outptr1[1]); + EXPECT_EQ(-3.0f, outptr1[2]); + EXPECT_EQ(12.0f, outptr1[3]); + EXPECT_EQ(3.0f, outptr1[4]); + EXPECT_EQ(7.0f, outptr1[5]); + EXPECT_EQ(-3.0f, outptr1[6]); + EXPECT_EQ(12.0f, outptr1[7]); +} + + +TEST(OpenclConvolution, Backward) { + // src_data + const size_t batchsize = 2, c = 1, src_h = 3, src_w = 3; + const float x[batchsize * c * src_h * src_w] = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + auto ocl = std::make_shared<OpenclDevice>(); + singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, ocl); + in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w); + + // Set weight_ and bias_ manually + const size_t num_filters = 1; + const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h + const float we[num_filters * col_height] = {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 1.0f, 0.0f}; + singa::Tensor weight(singa::Shape{num_filters, col_height}, ocl); + weight.CopyDataFromHostPtr(we, num_filters * col_height); + const float b[num_filters] = {1.0f}; + singa::Tensor bias(singa::Shape{num_filters}, ocl); + bias.CopyDataFromHostPtr(b, num_filters); + OpenclConvolution conv; + conv.set_weight(weight); + conv.set_bias(bias); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(3); + convconf->set_kernel_w(3); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(2); + convconf->set_stride_w(2); + convconf->set_num_output(1); + convconf->set_bias_term(true); + convconf->set_workspace_byte_limit(256); + convconf->set_prefer("fastest"); + conv.Setup(Shape{1, 3, 3}, conf); + + singa::Tensor out1 = conv.Forward(singa::kTrain, in); + + // grad + const size_t grad_h = 2, grad_w = 2; + const float dy[batchsize * num_filters * grad_h * grad_w] = { + 0.1f, 0.2f, 0.3f, 0.4f, 0.1f, 0.2f, 0.3f, 0.4f}; + singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w}, ocl); + grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w); + + const auto ret = conv.Backward(singa::kTrain, grad); + singa::Tensor in_grad = ret.first; + in_grad.ToHost(); + const float *dx = in_grad.data<float>(); + const float *wptr = we; + EXPECT_EQ(18u, in_grad.Size()); + EXPECT_EQ(dy[0] * wptr[4], dx[0]); + EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]); + EXPECT_EQ(dy[1] * wptr[4], dx[2]); + EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]); + EXPECT_EQ( + dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0], + dx[4]); + EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]); + EXPECT_EQ(dy[2] * wptr[4], dx[6]); + EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]); + EXPECT_EQ(dy[3] * wptr[4], dx[8]); + EXPECT_EQ(dy[4] * wptr[4], dx[9]); + EXPECT_EQ(dy[4] * wptr[5] + dy[1] * wptr[3], dx[10]); + EXPECT_EQ(dy[5] * wptr[4], dx[11]); + EXPECT_EQ(dy[4] * wptr[7] + dy[2] * wptr[1], dx[12]); + EXPECT_EQ( + dy[4] * wptr[8] + dy[5] * wptr[6] + dy[6] * wptr[2] + dy[7] * wptr[0], + dx[13]); + EXPECT_EQ(dy[5] * wptr[7] + dy[7] * wptr[1], dx[14]); + EXPECT_EQ(dy[6] * wptr[4], dx[15]); + 
EXPECT_EQ(dy[6] * wptr[5] + dy[7] * wptr[3], dx[16]); + EXPECT_EQ(dy[7] * wptr[4], dx[17]); + + singa::Tensor dw = ret.second[0]; + singa::Tensor db = ret.second[1]; + dw.ToHost(); + db.ToHost(); + const float *dbptr = db.data<float>(); + EXPECT_FLOAT_EQ(dy[0] + dy[1] + dy[2] + dy[3] + dy[4] + dy[5] + dy[6] + dy[7], + dbptr[0]); + + const float *dwptr = dw.data<float>(); + EXPECT_EQ(9u, dw.Size()); + EXPECT_FLOAT_EQ(dy[3] * x[4] + dy[7] * x[13], dwptr[0]); + EXPECT_FLOAT_EQ(dy[3] * x[5] + dy[7] * x[14] + dy[2] * x[3] + dy[6] * x[12], + dwptr[1]); + EXPECT_FLOAT_EQ(dy[2] * x[4] + dy[6] * x[13], dwptr[2]); + EXPECT_FLOAT_EQ(dy[1] * x[1] + dy[5] * x[10] + dy[3] * x[7] + dy[7] * x[16], + dwptr[3]); + EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[4] * x[9] + dy[1] * x[2] + dy[5] * x[11] + + dy[2] * x[6] + dy[6] * x[15] + dy[3] * x[8] + + dy[7] * x[17], + dwptr[4]); + EXPECT_FLOAT_EQ(dy[0] * x[1] + dy[4] * x[10] + dy[2] * x[7] + dy[6] * x[16], + dwptr[5]); + EXPECT_FLOAT_EQ(dy[1] * x[4] + dy[5] * x[13], dwptr[6]); + EXPECT_FLOAT_EQ(dy[0] * x[3] + dy[4] * x[12] + dy[1] * x[5] + dy[5] * x[14], + dwptr[7]); + EXPECT_FLOAT_EQ(dy[0] * x[4] + dy[4] * x[13], dwptr[8]); +} + + +#endif // USE_OPENCL
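For reference, the output sizes asserted in the tests above follow the usual relation out = (in + 2 * pad - kernel) / stride + 1, with integer division. The standalone C++ sketch below is illustrative only and is not taken from the patch; the helper name conv_out_dim is made up. It reproduces the 8-element output checked by EXPECT_EQ(8u, out1.Size()) for the 3x3 input with a 3x3 kernel, stride 2 and padding 1, over batch size 2 and a single filter. The new pooling kernels likewise receive pooled_h and pooled_w precomputed on the host rather than deriving them in-kernel.

#include <cassert>
#include <cstddef>

// Hypothetical helper (not from the patch): standard output-size formula,
//   out = (in + 2 * pad - kernel) / stride + 1, using integer division.
static std::size_t conv_out_dim(std::size_t in, std::size_t kernel,
                                std::size_t pad, std::size_t stride) {
  return (in + 2 * pad - kernel) / stride + 1;
}

int main() {
  const std::size_t batchsize = 2, num_filters = 1, h = 3, w = 3;
  const std::size_t out_h = conv_out_dim(h, 3, 1, 2);  // (3 + 2 - 3) / 2 + 1 = 2
  const std::size_t out_w = conv_out_dim(w, 3, 1, 2);  // 2
  // Matches EXPECT_EQ(8u, out1.Size()) in the Forward test above.
  assert(batchsize * num_filters * out_h * out_w == 8u);
  return 0;
}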
