SINGA-230 OpenCL Convolution and Pooling

- Added implementation files.
- Added relevant unit test files.
- Bug fixes in OpenCL GEMV and GEMM.
- Added licensing details to the LICENSE file and to the kernels retrieved from Caffe.
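For context on the GEMV/GEMM fixes below: `viennacl::trans()` returns a lazily evaluated expression and does not modify its argument, so the old code's bare `viennacl::trans(A_buf);` statement had no effect, and the raw buffer was also wrapped with the wrong dimensions whenever a transpose was requested. The following is a minimal standalone sketch, not SINGA code; it assumes ViennaCL with its OpenCL backend is available and uses ordinary `viennacl::matrix` objects filled from host data instead of wrapping existing `cl_mem` buffers via `WrapHandle`. It contrasts discarding the transpose expression with materialising it before `viennacl::linalg::prod`, which is the pattern the new GEMM code adopts (the GEMV fix follows the same idea, wrapping the buffer with swapped (n, m) dimensions before transposing).

#include <iostream>
#include <vector>
#include <viennacl/matrix.hpp>
#include <viennacl/linalg/prod.hpp>

int main() {
  // Host-side data: A is 2x3, B is 2x2; we want C = A^T * B, which is 3x2.
  std::vector<std::vector<float> > a = {{1, 2, 3}, {4, 5, 6}};
  std::vector<std::vector<float> > b = {{1, 0}, {0, 1}};

  viennacl::matrix<float> A(2, 3), B(2, 2), C(3, 2);
  viennacl::copy(a, A);  // host -> device
  viennacl::copy(b, B);

  // Old (buggy) pattern: the transposed expression is discarded, so a later
  // prod() still sees the untransposed 2x3 matrix.
  //   viennacl::trans(A);  // no effect on A itself

  // Fixed pattern: materialise the transpose into a matrix of the correct
  // shape, then multiply.
  viennacl::matrix<float> A_t(3, 2);
  A_t = viennacl::trans(A);                  // 3x2 copy of A^T
  C = viennacl::linalg::prod(A_t, B);        // 3x2 result

  std::cout << "C is " << C.size1() << " x " << C.size2() << std::endl;
  return 0;
}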
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/e3df3bd7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/e3df3bd7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/e3df3bd7 Branch: refs/heads/master Commit: e3df3bd763b9cfa41b584790f5eed89cdd19684b Parents: f3665e5 Author: Tan Li Boon <[email protected]> Authored: Tue Aug 30 15:35:33 2016 +0800 Committer: Tan Li Boon <[email protected]> Committed: Sun Sep 25 14:46:05 2016 +0800 ---------------------------------------------------------------------- LICENSE | 2 + cmake/Thirdparty/FindViennaCL.cmake | 3 - include/singa/core/device.h | 6 +- include/singa/utils/opencl_utils.h | 8 +- src/core/device/opencl_device.cc | 17 +- src/core/tensor/tensor.cc | 7 + src/core/tensor/tensor_math_opencl.h | 43 +++-- src/model/layer/convolution.cc | 35 ++-- src/model/layer/convolution.h | 2 +- src/model/layer/im2col.cl | 85 +++++++++ src/model/layer/opencl_convolution.cc | 220 +++++++++++++++++++++++ src/model/layer/opencl_convolution.h | 75 ++++++++ src/model/layer/opencl_pooling.cc | 272 +++++++++++++++++++++++++++++ src/model/layer/opencl_pooling.h | 109 ++++++++++++ src/model/layer/pooling.cc | 117 +++++++------ src/model/layer/pooling.cl | 264 ++++++++++++++++++++++++++++ src/model/layer/pooling.h | 27 ++- test/CMakeLists.txt | 6 - test/singa/test_opencl_convolution.cc | 223 +++++++++++++++++++++++ test/singa/test_opencl_pooling.cc | 156 +++++++++++++++++ 20 files changed, 1559 insertions(+), 118 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/LICENSE ---------------------------------------------------------------------- diff --git a/LICENSE b/LICENSE index 4f9d1e7..62a3430 100644 --- a/LICENSE +++ b/LICENSE @@ -305,6 +305,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SINGA bundles the following under BSD 2-clause license: include/singa/utils/cuda_utils.h src/core/tensor/distribution.cl +src/model/layer/im2col.cl +src/model/layer/pooling.cl cmake/ThirdParty/FindViennaCL.cmake COPYRIGHT http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/cmake/Thirdparty/FindViennaCL.cmake ---------------------------------------------------------------------- diff --git a/cmake/Thirdparty/FindViennaCL.cmake b/cmake/Thirdparty/FindViennaCL.cmake index c0addf8..263c80f 100644 --- a/cmake/Thirdparty/FindViennaCL.cmake +++ b/cmake/Thirdparty/FindViennaCL.cmake @@ -1,8 +1,5 @@ -<<<<<<< HEAD # This file is retrieved from caffe/cmake/Modules/FindViennaCL.cmake. -======= ->>>>>>> 8ac95cb... SINGA-243 ViennaCL backend for OpenCL support SET(ViennaCL_WITH_OPENCL TRUE) SET(VIENNACL_INCLUDE_SEARCH_PATHS http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/include/singa/core/device.h ---------------------------------------------------------------------- diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 62fa250..0fecc6d 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -72,7 +72,7 @@ class Device { } /// Copy data within or across devices. 
- void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, + virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, CopyDirection direction, int dst_offset, int src_offset); void CopyDataFromHostPtr(Block* dst, const void* src, size_t nBytes, @@ -214,9 +214,9 @@ public: // Overridden, inherited methods void SetRandSeed(unsigned seed) override; - void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, + virtual void CopyDataToFrom(Block* dst, Block* src, size_t nBytes, CopyDirection direction, int dst_offset = 0, - int src_offset = 0); + int src_offset = 0) override; protected: /// The OpenCL device that this object represents. http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/include/singa/utils/opencl_utils.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/opencl_utils.h b/include/singa/utils/opencl_utils.h index 8c05643..0445f13 100644 --- a/include/singa/utils/opencl_utils.h +++ b/include/singa/utils/opencl_utils.h @@ -51,16 +51,16 @@ inline viennacl::ocl::handle<cl_mem> -WrapHandle(cl_mem in, viennacl::ocl::context *ctx) { +WrapHandle(cl_mem in, viennacl::ocl::context &ctx) { if (in != nullptr) { - viennacl::ocl::handle<cl_mem> memhandle(in, *ctx); + viennacl::ocl::handle<cl_mem> memhandle(in, ctx); memhandle.inc(); return memhandle; } else { cl_int err; - cl_mem dummy = clCreateBuffer(ctx->handle().get(), CL_MEM_READ_WRITE, 0, + cl_mem dummy = clCreateBuffer(ctx.handle().get(), CL_MEM_READ_WRITE, 0, nullptr, &err); - viennacl::ocl::handle<cl_mem> memhandle(dummy, *ctx); + viennacl::ocl::handle<cl_mem> memhandle(dummy, ctx); return memhandle; } } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/core/device/opencl_device.cc ---------------------------------------------------------------------- diff --git a/src/core/device/opencl_device.cc b/src/core/device/opencl_device.cc index 6b371c4..0c8f010 100644 --- a/src/core/device/opencl_device.cc +++ b/src/core/device/opencl_device.cc @@ -45,6 +45,7 @@ OpenclDevice::OpenclDevice(int id, int num_executors) this->this_device = ocl::current_device(); BuildPrograms(cl_src_path); + BuildPrograms("../src/model/layer"); } @@ -70,18 +71,18 @@ void OpenclDevice::CopyDataToFrom(Block* dst, Block* src, size_t nBytes, switch(direction) { case kHostToDevice: { - auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); memory_write(dst_handle, dst_offset, nBytes, src->data()); return; } case kDeviceToHost: { - auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx); + auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); memory_read(src_handle, src_offset, nBytes, dst->mutable_data()); return; } case kDeviceToDevice: { - auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), &ocl_ctx); - auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), &ocl_ctx); + auto src_handle = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); memory_copy(src_handle, dst_handle, src_offset, dst_offset, nBytes); return; } @@ -131,18 +132,18 @@ void OpenclDevice::CopyToFrom(void* dst, const void* src, size_t nBytes, switch(direction) { case kHostToDevice: { - auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), 
ocl_ctx); memory_write(dst_handle, 0, nBytes, src); return; } case kDeviceToHost: { - auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx); + auto src_handle = WrapHandle((const cl_mem)src, ocl_ctx); memory_read(src_handle, 0, nBytes, dst); return; } case kDeviceToDevice: { - auto src_handle = WrapHandle((const cl_mem)src, &ocl_ctx); - auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), &ocl_ctx); + auto src_handle = WrapHandle((const cl_mem)src, ocl_ctx); + auto dst_handle = WrapHandle(static_cast<cl_mem>(dst), ocl_ctx); memory_copy(src_handle, dst_handle, 0, 0, nBytes); return; } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/core/tensor/tensor.cc ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc index 670b27e..d7e8f86 100644 --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@ -787,6 +787,7 @@ Tensor ConcatenateColumns(const vector<Tensor> &in) { } return out; } + Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) { CHECK_LT(start, end); CHECK_GE(in.shape(0), end); @@ -797,6 +798,7 @@ Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) { CopyDataToFrom(&out, in, out.Size(), 0, start * sample_size); return out; } + Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end) { CHECK_EQ(in.nDim(), 2u); CHECK_LT(start, end); @@ -865,6 +867,7 @@ Tensor SliceRows(const Tensor &in, const size_t start, const size_t end) { */ return ret; } + void SubColumn(const Tensor &v, Tensor *M) { AddColumn(-1, 1, v, M); } void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); } @@ -910,6 +913,7 @@ void Bernoulli(const SType p, Tensor *out) { }, {}, {out->block()}, true); }); } + template void Bernoulli<float>(const float p, Tensor *out); template <typename SType> @@ -922,6 +926,7 @@ void Uniform(const SType low, const SType high, Tensor *out) { }, {}, {out->block()}, true); }); } + template void Uniform<float>(const float low, const float high, Tensor *out); template <typename SType> @@ -947,6 +952,7 @@ void Axpy(const SType alpha, const Tensor &in, Tensor *out) { }, {in.block(), out->block()}, {out->block()}); }); } + template void Axpy<float>(const float alpha, const Tensor &in, Tensor *out); @@ -1006,6 +1012,7 @@ void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) { }, {p.block(), t.block()}, {loss->block()}); }); } + void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) { CHECK_LE(p->nDim(), 2u); CHECK_LE(t.nDim(), 2u); // TODO(wangwei) consider multi-labels. 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/core/tensor/tensor_math_opencl.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h index c387031..a209de4 100644 --- a/src/core/tensor/tensor_math_opencl.h +++ b/src/core/tensor/tensor_math_opencl.h @@ -27,6 +27,7 @@ #include <viennacl/vector.hpp> #include <viennacl/matrix.hpp> +#include <viennacl/linalg/prod.hpp> #include <viennacl/linalg/inner_prod.hpp> #include <viennacl/linalg/norm_2.hpp> #include <viennacl/linalg/sum.hpp> @@ -510,19 +511,23 @@ void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block *i template<> void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, const float alpha, const Block *A, const Block *v, const float beta, Block* out, Context* ctx) { - - viennacl::matrix<float> A_in((const cl_mem)A->data(), m, n); - viennacl::vector<float> v_in((const cl_mem)v->data(), trans ? m : n); - viennacl::vector<float> o_in(static_cast<cl_mem>(out->mutable_data()), trans ? n : m); + viennacl::vector<float> v_buf((const cl_mem)v->data(), n); + viennacl::vector<float> o_buf(static_cast<cl_mem>(out->mutable_data()), m); - if (trans) viennacl::trans(A_in); + viennacl::matrix<float> A_buf; - o_in *= beta; - o_in += alpha * viennacl::linalg::prod(A_in, v_in); -} + if (trans) { + A_buf = viennacl::matrix<float>((const cl_mem)A->data(), n, m); + A_buf = viennacl::trans(A_buf); + } else { + A_buf = viennacl::matrix<float>((const cl_mem)A->data(), m, n); + } + o_buf *= beta; + o_buf += alpha * viennacl::linalg::prod(A_buf, v_buf); +} -/// multiply a matrix with a diagnoal matrix constructed using values from 'v'. +/// multiply a matrix with a diagonal matrix constructed using values from 'v'. 
/// if matrix_lef_side is true, do M*v; else do v*M template<> void DGMM<float, lang::Opencl>(bool side_right, @@ -549,12 +554,22 @@ void GEMM<float, lang::Opencl>(const bool transA, const bool transB, const float alpha, const Block *A, const Block *B, const float beta, Block *C, Context *ctx) { - viennacl::matrix<float> A_buf((const cl_mem)A->data(), nrowA, ncolA); - viennacl::matrix<float> B_buf((const cl_mem)B->data(), ncolA, ncolB); + viennacl::matrix<float> A_buf, B_buf; viennacl::matrix<float> C_buf(static_cast<cl_mem>(C->mutable_data()), nrowA, ncolB); - - if (transA) viennacl::trans(A_buf); - if (transB) viennacl::trans(B_buf); + + if (transA) { + A_buf = viennacl::matrix<float>((const cl_mem)A->data(), ncolA, nrowA); + A_buf = viennacl::trans(A_buf); + } else { + A_buf = viennacl::matrix<float>((const cl_mem)A->data(), nrowA, ncolA); + } + + if (transB) { + B_buf = viennacl::matrix<float>((const cl_mem)B->data(), ncolB, ncolA); + B_buf = viennacl::trans(B_buf); + } else { + B_buf = viennacl::matrix<float>((const cl_mem)B->data(), ncolA, ncolB); + } C_buf *= beta; C_buf += alpha * viennacl::linalg::prod(A_buf, B_buf); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/convolution.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/convolution.cc b/src/model/layer/convolution.cc index 52e9d93..bd7cc00 100644 --- a/src/model/layer/convolution.cc +++ b/src/model/layer/convolution.cc @@ -142,14 +142,16 @@ const std::pair<Tensor, vector<Tensor>> Convolution::Backward( size_t batchsize = grad.shape(0); size_t imagesize = src_data.Size() / batchsize; if (bias_term_) { - Tensor tmp1 = - Reshape(grad, Shape{batchsize * num_filters_, - grad.Size() / (batchsize * num_filters_)}); + auto tmpshp = Shape{batchsize * num_filters_, grad.Size() / (batchsize * num_filters_)}; + Tensor tmp1 = Reshape(grad, tmpshp); + Tensor tmp2(Shape{batchsize * num_filters_}); SumColumns(tmp1, &tmp2); Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_}); + SumRows(tmp3, &db); } + auto in_data = src_data.data<float>(); Tensor col_data(Shape{col_height_, col_width_}); float *data_col = new float[col_height_ * col_width_]; @@ -157,14 +159,17 @@ const std::pair<Tensor, vector<Tensor>> Convolution::Backward( for (size_t b = 0; b < batchsize; b++) { Im2col(in_data + b * imagesize, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data_col); + col_data.CopyDataFromHostPtr(data_col, col_height_ * col_width_); Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_}); CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size()); dw += Mult(grad_b, col_data.T()); Tensor dcol_b = Mult(weight_.T(), grad_b); auto dcol_data = dcol_b.data<float>(); + Col2im(dcol_data, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, dx_b); + dx.CopyDataFromHostPtr(dx_b, imagesize, b * imagesize); } param_grad.push_back(dw); @@ -180,12 +185,13 @@ void Convolution::ToDevice(std::shared_ptr<Device> device) { } void Convolution::Im2col(const float *data_im, const int channels, - const int height, const int width, const int kernel_h, - const int kernel_w, const int pad_h, const int pad_w, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float *data_col) { int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - kernel_w) / 
stride_w + 1; + int width_col = ( width + 2 * pad_w - kernel_w) / stride_w + 1; int channels_col = channels * kernel_h * kernel_w; for (int c = 0; c < channels_col; ++c) { int w_offset = c % kernel_w; @@ -206,18 +212,19 @@ void Convolution::Im2col(const float *data_im, const int channels, } void Convolution::Col2im(const float *data_col, const int channels, - const int height, const int width, const int patch_h, - const int patch_w, const int pad_h, const int pad_w, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float *data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); - int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; - int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; - int channels_col = channels * patch_h * patch_w; + int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; + int width_col = ( width + 2 * pad_w - kernel_w) / stride_w + 1; + int channels_col = channels * kernel_h * kernel_w; for (int c = 0; c < channels_col; ++c) { - int w_offset = c % patch_w; - int h_offset = (c / patch_w) % patch_h; - int c_im = c / patch_h / patch_w; + int w_offset = c % kernel_w; + int h_offset = (c / kernel_w) % kernel_h; + int c_im = c / kernel_h / kernel_w; for (int h = 0; h < height_col; ++h) { for (int w = 0; w < width_col; ++w) { int h_pad = h * stride_h - pad_h + h_offset; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/convolution.h ---------------------------------------------------------------------- diff --git a/src/model/layer/convolution.h b/src/model/layer/convolution.h index d85a17b..7b7fd00 100644 --- a/src/model/layer/convolution.h +++ b/src/model/layer/convolution.h @@ -52,7 +52,7 @@ class Convolution : public Layer { const int stride_w, float* data_col); void Col2im(const float* data_col, const int channels, const int height, - const int width, const int patch_h, const int patch_w, + const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* data_im); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/im2col.cl ---------------------------------------------------------------------- diff --git a/src/model/layer/im2col.cl b/src/model/layer/im2col.cl new file mode 100644 index 0000000..e977dd6 --- /dev/null +++ b/src/model/layer/im2col.cl @@ -0,0 +1,85 @@ +// This file is modified from the file located at +// https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/im2col.cl +// and is covered under the BSD 2-Clause License, as indicated in the LICENSE +// file at the root of this repository. 
+ +__kernel void im2col(const int n, __global const float* data_im, + const int data_im_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int height_col, const int width_col, + __global float* data_col, const int data_col_off) { + + for (int index = get_global_id(0); index < n; + index += get_global_size(0)) { + const int h_index = index / width_col; + const int h_col = h_index % height_col; + const int w_col = index % width_col; + const int c_im = h_index / height_col; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col * stride_h - pad_h; + const int w_offset = w_col * stride_w - pad_w; + + __global float* data_col_ptr = data_col + data_col_off; + data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; + __global const float* data_im_ptr = data_im + data_im_off; + data_im_ptr += (c_im * height + h_offset) * width + w_offset; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h_im = h_offset + i * dilation_h; + int w_im = w_offset + j * dilation_w; + *data_col_ptr = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? + data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +__kernel void col2im(const int n, __global const float* data_col, + const int data_col_off, const int channels, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int height_col, const int width_col, + __global float* data_im, const int data_im_off) { + + for (int index = get_global_id(0); index < n; index += get_global_size(0)) { + float val = 0; + const int w_im = index % width + pad_w; + const int h_im = (index / width) % height + pad_h; + const int c_im = index / (width * height); + int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + int kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + // compute the start and end of the output + const int w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const int w_col_end = min(w_im / stride_w + 1, width_col); + const int h_col_start = + (h_im < kernel_extent_h) ? 
0 : (h_im - kernel_extent_h) / stride_h + 1; + const int h_col_end = min(h_im / stride_h + 1, height_col); + + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (int h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { + int h_k = (h_im - h_col * stride_h); + int w_k = (w_im - w_col * stride_w); + if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { + h_k /= dilation_h; + w_k /= dilation_w; + int data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) * + height_col + h_col) * width_col + w_col; + val += data_col[data_col_off + data_col_index]; + } + } + } + data_im[data_im_off + index] = val; + } +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/opencl_convolution.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/opencl_convolution.cc b/src/model/layer/opencl_convolution.cc new file mode 100644 index 0000000..c43719f --- /dev/null +++ b/src/model/layer/opencl_convolution.cc @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "opencl_convolution.h" + +#ifdef USE_OPENCL + +namespace singa { + +RegisterLayerClass(opencl_convolution, OpenclConvolution); + +/// \copydoc Layer::Forward(int flag, const Tensor&) +const Tensor OpenclConvolution::Forward(int flag, const Tensor &input) { + CHECK(buf_.empty()); + CHECK_EQ(input.device()->lang(), kOpencl); + CHECK_EQ(input.nDim(), 4u); + + if (flag & kTrain) buf_.push(input); + + auto batchsize = input.shape(0); + auto imagesize = input.Size() / batchsize; + auto data_type = input.data_type(); + auto device = input.device(); + + Shape shape{batchsize, num_filters_, conv_height_, conv_width_}; + Tensor output(shape, device, data_type); + Tensor col_data(Shape{col_height_, col_width_}, device, data_type); + + for (size_t b = 0; b < batchsize; b++) { + int offset = b * imagesize; + + col_data.device()->Exec([input, offset, col_data, this](Context* ctx) mutable { + + this->Im2Col(input.block(), offset, + height_, width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + conv_height_, conv_width_, + 0, channels_, + col_data.block(), ctx); + }, + {input.block()}, + {col_data.block()}); + + Tensor each = Mult(weight_, col_data); + + if (bias_term_) { + AddColumn(bias_, &each); + } + + CopyDataToFrom(&output, each, each.Size(), b * each.Size()); + } + + return output; +} + + +/// \copydoc Layer::Backward(int, const Tensor&, const Tensor&); +const std::pair<Tensor, std::vector<Tensor>> +OpenclConvolution::Backward(int flag, const Tensor &grad) { + CHECK(!buf_.empty()); + CHECK_EQ(grad.device()->lang(), kOpencl); + CHECK_EQ(grad.nDim(), 4u); + + std::vector<Tensor> param_grad; + + Tensor src_data = buf_.top(); + buf_.pop(); + + Tensor dx, db, dw; + dx.ResetLike(src_data); + db.ResetLike(bias_); + dw.ResetLike(weight_); + dw.SetValue(0.0f); + + size_t batchsize = grad.shape(0); + size_t imagesize = src_data.Size() / batchsize; + + if (bias_term_) { + auto tmpshp = Shape{batchsize * num_filters_, grad.Size() / (batchsize * num_filters_)}; + Tensor tmp1 = Reshape(grad, tmpshp); + + Tensor tmp2(Shape{batchsize * num_filters_}, + grad.device(), grad.data_type()); + SumColumns(tmp1, &tmp2); + Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_}); + + SumRows(tmp3, &db); + } + + Tensor col_data(Shape{col_height_, col_width_}, + grad.device(), grad.data_type()); + + for (size_t b = 0; b < batchsize; b++) { + + int im_offset = b * imagesize; + int col_offset = 0; // Always keep this to zero. 
+ + col_data.device()->Exec([src_data, col_data, im_offset, col_offset, this](Context* ctx) mutable { + + this->Im2Col(src_data.block(), im_offset, + height_, width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + conv_height_, conv_width_, + col_offset, channels_, + col_data.block(), ctx); + }, + {src_data.block()}, + {col_data.block()}); + + Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_}, + grad.device(), grad.data_type()); + CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size()); + + dw += Mult(grad_b, col_data.T()); + Tensor dcol_b = Mult(weight_.T(), grad_b); + + dx.device()->Exec([dcol_b, dx, im_offset, col_offset, this](Context* ctx) mutable { + + this->Col2Im(dcol_b.block(), col_offset, + height_, width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + conv_height_, conv_width_, + im_offset, channels_, + dx.block(), ctx); + }, + {dcol_b.block()}, + {dx.block()}); + } + + param_grad.push_back(dw); + param_grad.push_back(db); + + return std::make_pair(dx, param_grad); +} + + +void OpenclConvolution::Setup(const Shape &in_sample, const LayerConf &conf) { + Convolution::Setup(in_sample, conf); +} + + +void OpenclConvolution::ToDevice(std::shared_ptr<Device> device) { + Convolution::ToDevice(device); +} + + +void OpenclConvolution::Im2Col(Block* src, int data_im_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int conv_h, const int conv_w, + const int col_data_off, const int channels, + Block* dst, Context* ctx) { + + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("im2col.cl", "im2col"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + + int num_kernels = channels * conv_h * conv_w; + + viennacl::ocl::enqueue(kernel(num_kernels, src_buf, data_im_off, + height, width, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + 1, 1, conv_h, conv_w, + dst_buf, col_data_off)); +} + + +void OpenclConvolution::Col2Im(Block* src, const int col_data_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int conv_h, const int conv_w, + const int data_im_off, const int channels, + Block* dst, Context* ctx) { + + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("im2col.cl", "col2im"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + + int num_kernels = channels * height * width; + + viennacl::ocl::enqueue(kernel(num_kernels, src_buf, col_data_off, channels, + height, width, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, + 1, 1, conv_h, conv_w, + dst_buf, data_im_off)); +} + + +} // namespace singa + +#endif // USE_OPENCL http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/opencl_convolution.h ---------------------------------------------------------------------- diff --git a/src/model/layer/opencl_convolution.h b/src/model/layer/opencl_convolution.h new file mode 100644 index 0000000..a25acd2 --- /dev/null +++ b/src/model/layer/opencl_convolution.h @@ -0,0 +1,75 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or 
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef SRC_MODEL_LAYER_OPENCL_CONVOLUTION_H_ +#define SRC_MODEL_LAYER_OPENCL_CONVOLUTION_H_ + +#include "singa/singa_config.h" +#include "singa/core/common.h" +#include "singa/model/layer.h" +#include "singa/utils/opencl_utils.h" +#include "singa/proto/core.pb.h" +#include "convolution.h" + +#ifdef USE_OPENCL + +namespace singa { + +class OpenclConvolution : public Convolution { +public: + + /// \copydoc Layer::layer_type() + const std::string layer_type() const override { return "OpenclConvolution"; } + + const Tensor Forward(int flag, const Tensor &input) override; + + const std::pair<Tensor, std::vector<Tensor>> + Backward(int flag, const Tensor &grad) override; + + /// \copydoc Layer::Setup(const LayerConf&); + void Setup(const Shape &in_sample, const LayerConf &conf) override; + + void ToDevice(std::shared_ptr<Device> device) override; + +private: + + void Im2Col(Block* src, int data_im_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int conv_h, const int conv_w, + const int data_col_off, const int channels, + Block* dst, Context* ctx); + + void Col2Im(Block* src, const int data_col_off, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int conv_h, const int conv_w, + const int data_im_off, const int channels, + Block* dst, Context* ctx); + +}; + +} // namespace singa + +#endif // USE_OPENCL + +#endif // SRC_MODEL_LAYER_OPENCL_CONVOLUTION_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/opencl_pooling.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/opencl_pooling.cc b/src/model/layer/opencl_pooling.cc new file mode 100644 index 0000000..2e35330 --- /dev/null +++ b/src/model/layer/opencl_pooling.cc @@ -0,0 +1,272 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "opencl_pooling.h" + +#ifdef USE_OPENCL + +namespace singa { + +RegisterLayerClass(opencl_pooling, OpenclPooling); + +const Tensor OpenclPooling::Forward(int flag, const Tensor &input) { + CHECK(buf_.empty()); + CHECK_EQ(input.device()->lang(), kOpencl); + CHECK_EQ(input.nDim(), 4u); + + auto batchsize = input.shape(0); + auto data_type = input.data_type(); + auto device = input.device(); + + Shape shape{batchsize, channels_, pooled_height_, pooled_width_}; + Tensor output = Tensor(shape, device, data_type); + + output.device()->Exec([input, output, flag, this](Context *ctx) { + Block* in_block = input.block(); + Block* outblock = output.block(); + + if (pool_ == PoolingConf_PoolMethod_MAX) { + Tensor mask; + mask.ResetLike(output); + + Pooling_Forward_Max((int)output.Size(), in_block, mask.block(), + height_, width_, + pooled_height_, pooled_width_, + kernel_h_, kernel_w_, + stride_h_, stride_w_, + pad_h_, pad_w_, + outblock, channels_, ctx); + + if (flag & kTrain) + buf_.push(mask); + + } else if (pool_ == PoolingConf_PoolMethod_AVE) { + Pooling_Forward_Ave((int)output.Size(), in_block, outblock, + height_, width_, pooled_height_, pooled_width_, + kernel_h_, kernel_w_, stride_h_, stride_w_, + pad_h_, pad_w_, channels_, ctx); + } else + LOG(FATAL) << "Unknown pooling method."; + + }, {input.block()}, {output.block()}); + + return output; +} + + +const std::pair<Tensor, std::vector<Tensor>> +OpenclPooling::Backward(int flag, const Tensor &grad) { + CHECK_EQ(grad.device()->lang(), kOpencl); + CHECK_EQ(grad.nDim(), 4u); + + std::vector<Tensor> param_grad; + + auto batchsize = grad.shape(0); + auto data_type = grad.data_type(); + auto device = grad.device(); + Shape shape{batchsize, channels_, height_, width_}; + + Tensor dx(shape, device, data_type); + + dx.device()->Exec([dx, grad, this](Context *ctx) { + if (pool_ == PoolingConf_PoolMethod_MAX) { + CHECK(!buf_.empty()); + Tensor mask = buf_.top(); + buf_.pop(); + + Pooling_Backward_Max(grad.block(), mask.block(), + dx.Size(), channels_, + height_, width_, + pooled_height_, pooled_width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + dx.block(), ctx); + + } else if (pool_ == PoolingConf_PoolMethod_AVE) { + Pooling_Backward_Ave(grad.block(), grad.shape(0), channels_, + height_, width_, + pooled_height_, pooled_width_, + kernel_h_, kernel_w_, + pad_h_, pad_w_, + stride_h_, stride_w_, + dx.block(), ctx); + + } else + LOG(FATAL) << "Unknown pooling method."; + + }, {grad.block()}, {dx.block()}); + + return std::make_pair(dx, param_grad); +} + + +void OpenclPooling::Setup(const Shape& in_sample, const LayerConf &conf) { + Pooling::Setup(in_sample, conf); + auto pool_conf = conf.pooling_conf(); +} + + +void OpenclPooling::Pooling_Forward_Max(const int num, Block* src, Block* mask, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + Block* dst, const int channels, + Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "max_pool_forward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + auto maskbuf = WrapHandle(static_cast<cl_mem>(mask->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(num, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, 
kernel_w, stride_h, stride_w, + pad_h, pad_w, dst_buf, maskbuf)); +} + + +void OpenclPooling::Pooling_Forward_Ave(const int num, Block* src, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + const int channels, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "ave_pool_forward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(num, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + pad_h, pad_w, dst_buf)); +} + + +void OpenclPooling::Pooling_Forward_Sto_Train(Block* src, Block* rand, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, + Block* dst, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_forward_train"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + auto randbuf = WrapHandle(static_cast<cl_mem>(rand->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(height * width, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + randbuf, dst_buf)); +} + + +void OpenclPooling::Pooling_Forward_Sto_Test(Block* src, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_forward_test"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(height * width, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + dst_buf)); +} + + +void OpenclPooling::Pooling_Backward_Max(Block* top, Block* mask, + const int num, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Block* bottom, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "max_pool_backward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(top->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(bottom->mutable_data()), ocl_ctx); + auto mask_buf = WrapHandle(static_cast<cl_mem>(mask->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(num, src_buf, mask_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + pad_h, pad_w, dst_buf)); +} + + +void OpenclPooling::Pooling_Backward_Ave(Block* bottom, + const int num, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + 
const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Block* top, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "ave_pool_backward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(bottom->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(top->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(num, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + pad_h, pad_w, dst_buf)); +} + + +void OpenclPooling::Pooling_Backward_Sto(Block* src, Block* rand, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, Context* ctx) { + auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id); + auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_backward"); + + auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx); + auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx); + auto randbuf = WrapHandle(static_cast<cl_mem>(rand->mutable_data()), ocl_ctx); + + viennacl::ocl::enqueue(kernel(height * width, randbuf, src_buf, channels, + height, width, pooled_h, pooled_w, + kernel_h, kernel_w, stride_h, stride_w, + dst_buf)); +} + + +} // namespace singa + +#endif // USE_OPENCL http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/opencl_pooling.h ---------------------------------------------------------------------- diff --git a/src/model/layer/opencl_pooling.h b/src/model/layer/opencl_pooling.h new file mode 100644 index 0000000..01e447c --- /dev/null +++ b/src/model/layer/opencl_pooling.h @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef SRC_MODEL_LAYER_OPENCL_POOLING_H_ +#define SRC_MODEL_LAYER_OPENCL_POOLING_H_ + +#include "pooling.h" +#include "singa/core/common.h" +#include "singa/model/layer.h" +#include "singa/utils/opencl_utils.h" +#include "singa/proto/core.pb.h" + +#ifdef USE_OPENCL + +namespace singa { + +class OpenclPooling : public Pooling { +public: + + /// \copydoc Layer::layer_type() + const std::string layer_type() const override { return "OpenclPooling"; } + + const Tensor Forward(int flag, const Tensor &input) override; + + const std::pair<Tensor, std::vector<Tensor>> + Backward(int flag, const Tensor &grad) override; + + /// \copydoc Layer::Setup(const LayerConf&); + void Setup(const Shape &in_sample, const LayerConf &conf) override; + +private: + void Pooling_Forward_Max(const int num, Block* src, Block* mask, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + Block* dst, const int channels, + Context* ctx); + + void Pooling_Forward_Ave(const int num, Block* src, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + const int channels, Context* ctx); + + void Pooling_Forward_Sto_Train(Block* src, Block* rand, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, + Block* dst, Context* ctx); + + void Pooling_Forward_Sto_Test(Block* src, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, Context* ctx); + + void Pooling_Backward_Max(Block* top, Block* mask, + const int num, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Block* bottom, Context* ctx); + + void Pooling_Backward_Ave(Block* bottom, const int num, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + Block* top, Context* ctx); + + void Pooling_Backward_Sto(Block* src, Block* rand, Block* dst, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int channels, Context* ctx); + +}; + +} // namespace singa + +#endif // USE_OPENCL + +#endif // SRC_MODEL_LAYER_OPENCL_POOLING_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/pooling.cc ---------------------------------------------------------------------- diff --git a/src/model/layer/pooling.cc b/src/model/layer/pooling.cc index 1312776..ff8d58e 100644 --- a/src/model/layer/pooling.cc +++ b/src/model/layer/pooling.cc @@ -85,49 +85,55 @@ const Tensor Pooling::Forward(int flag, const Tensor& input) { Tensor mask; mask.ResetLike(output); float* maskptr = new float[mask.Size()]; - ForwardMaxPooling(inptr, batchsize, channels_, height_, width_, kernel_h_, - kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, outptr, + 
ForwardMaxPooling(inptr, batchsize, channels_, height_, width_, + pooled_height_, pooled_width_, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, outptr, maskptr); mask.CopyDataFromHostPtr(maskptr, mask.Size()); if (flag & kTrain) buf_.push(mask); delete[] maskptr; } else if (pool_ == PoolingConf_PoolMethod_AVE) - ForwardAvgPooling(inptr, batchsize, channels_, height_, width_, kernel_h_, - kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, outptr); + ForwardAvgPooling(inptr, batchsize, channels_, height_, width_, + pooled_height_, pooled_width_, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, outptr); else - LOG(FATAL) << "Unknow pooling method"; + LOG(FATAL) << "Unknown pooling method"; output.CopyDataFromHostPtr(outptr, output.Size()); delete[] outptr; return output; } -const std::pair<Tensor, vector<Tensor>> Pooling::Backward(int flag, - const Tensor& grad) { +const std::pair<Tensor, vector<Tensor>> +Pooling::Backward(int flag, const Tensor& grad) { CHECK_EQ(grad.device()->lang(), kCpp); CHECK_EQ(grad.nDim(), 4u); + vector<Tensor> param_grad; - size_t batchsize = grad.shape(0); - Shape shape{batchsize, channels_, height_, width_}; + + auto batchsize = grad.shape(0); + auto dtype = grad.data_type(); auto dev = grad.device(); - DataType dtype = grad.data_type(); + Shape shape{batchsize, channels_, height_, width_}; + Tensor dx(shape, dev, dtype); auto gradptr = grad.data<float>(); float* dxptr = new float[dx.Size()]; + if (pool_ == PoolingConf_PoolMethod_MAX) { CHECK(!buf_.empty()); Tensor mask = buf_.top(); buf_.pop(); auto maskptr = mask.data<float>(); BackwardMaxPooling(gradptr, maskptr, batchsize, channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, - stride_w_, dxptr); + pooled_height_, pooled_width_, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, dxptr); } else if (pool_ == PoolingConf_PoolMethod_AVE) { BackwardAvgPooling(gradptr, batchsize, channels_, height_, width_, - kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, - stride_w_, dxptr); + pooled_height_, pooled_width_, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, dxptr); } else { - LOG(FATAL) << "Unknow pooling method"; + LOG(FATAL) << "Unknown pooling method"; } dx.CopyDataFromHostPtr(dxptr, dx.Size()); @@ -136,32 +142,32 @@ const std::pair<Tensor, vector<Tensor>> Pooling::Backward(int flag, } void Pooling::ForwardMaxPooling(const float* bottom, const int num, - const int channels, const int height, - const int width, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, - const int stride_w, float* top, float* mask) { - int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; - int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; - int top_count = num * top_height * top_width * channels; + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* top, float* mask) { + int top_count = num * pooled_h * pooled_w * channels; for (int i = 0; i < top_count; i++) { mask[i] = -1; top[i] = -FLT_MAX; } const int bottom_offset = height * width; - const int top_offset = top_height * top_width; + const int top_offset = pooled_h * pooled_w; // The main loop for (int n = 0; n < num; ++n) { for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < top_height; ++ph) { - for (int pw = 0; pw < top_width; ++pw) { + for (int ph = 0; ph 
< pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; int hend = std::min(hstart + kernel_h, height); int wend = std::min(wstart + kernel_w, width); hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); - const int top_index = ph * top_width + pw; + const int top_index = ph * pooled_w + pw; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = h * width + w; @@ -184,20 +190,19 @@ void Pooling::ForwardMaxPooling(const float* bottom, const int num, void Pooling::BackwardMaxPooling(const float* top, const float* mask, const int num, const int channels, const int height, const int width, + const int pooled_h, const int pooled_w, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* bottom) { - int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; - int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; - const int top_offset = top_height * top_width; + const int top_offset = pooled_h * pooled_w; const int bottom_offset = height * width; memset(bottom, 0, sizeof(float) * num * channels * bottom_offset); for (int n = 0; n < num; ++n) { for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < top_height; ++ph) { - for (int pw = 0; pw < top_width; ++pw) { - const int top_idx = ph * top_width + pw; + for (int ph = 0; ph < pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { + const int top_idx = ph * pooled_w + pw; const int bottom_idx = static_cast<int>(mask[top_idx]); bottom[bottom_idx] += top[top_idx]; } @@ -210,24 +215,24 @@ void Pooling::BackwardMaxPooling(const float* top, const float* mask, } void Pooling::ForwardAvgPooling(const float* bottom, const int num, - const int channels, const int height, - const int width, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, - const int stride_w, float* top) { - int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; - int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; - int top_count = num * top_height * top_width * channels; + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* top) { + int top_count = num * pooled_h * pooled_w * channels; for (int i = 0; i < top_count; i++) { top[i] = 0; } const int bottom_offset = height * width; - const int top_offset = top_height * top_width; + const int top_offset = pooled_h * pooled_w; // The main loop for (int n = 0; n < num; ++n) { for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < top_height; ++ph) { - for (int pw = 0; pw < top_width; ++pw) { + for (int ph = 0; ph < pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; int hend = std::min(hstart + kernel_h, height + pad_h); @@ -237,7 +242,7 @@ void Pooling::ForwardAvgPooling(const float* bottom, const int num, wstart = std::max(wstart, 0); hend = std::min(hend, height); wend = std::min(wend, width); - const int top_index = ph * top_width + pw; + const int top_index = ph * pooled_w + pw; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = h * width + w; @@ -255,20 +260,20 @@ void Pooling::ForwardAvgPooling(const float* bottom, const int num, } void 
Pooling::BackwardAvgPooling(const float* top, const int num, - const int channels, const int height, - const int width, const int kernel_h, - const int kernel_w, const int pad_h, - const int pad_w, const int stride_h, - const int stride_w, float* bottom) { - int top_height = (height + pad_h * 2 - kernel_h) / stride_h + 1; - int top_width = (width + pad_w * 2 - kernel_w) / stride_w + 1; - const int top_offset = top_height * top_width; + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* bottom) { + const int top_offset = pooled_h * pooled_w; const int bottom_offset = height * width; memset(bottom, 0, sizeof(float) * num * channels * bottom_offset); for (int n = 0; n < num; ++n) { for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < top_height; ++ph) { - for (int pw = 0; pw < top_width; ++pw) { + for (int ph = 0; ph < pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { int hstart = ph * stride_h - pad_h; int wstart = pw * stride_w - pad_w; int hend = std::min(hstart + kernel_h, height + pad_h); @@ -278,7 +283,7 @@ void Pooling::BackwardAvgPooling(const float* top, const int num, wstart = std::max(wstart, 0); hend = std::min(hend, height); wend = std::min(wend, width); - const int top_index = ph * top_width + pw; + const int top_index = ph * pooled_w + pw; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { const int index = h * width + w; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/pooling.cl ---------------------------------------------------------------------- diff --git a/src/model/layer/pooling.cl b/src/model/layer/pooling.cl new file mode 100644 index 0000000..3ea4ecd --- /dev/null +++ b/src/model/layer/pooling.cl @@ -0,0 +1,264 @@ +// This file is modified from the file located at +// https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/pooling.cl +// and is covered under the BSD 2-Clause License, as indicated in the LICENSE +// file at the root of this repository. 
+ +__kernel void max_pool_forward( + const int nthreads, __global const float* bottom, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + __global float* top, __global float* mask) { + +// printf("%d ", get_global_size(0)); + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + const int pw = i % pooled_w; + const int ph = (i / pooled_w) % pooled_h; + const int c = (i / pooled_w / pooled_h) % channels; + const int n = i / pooled_w / pooled_h / channels; + + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = min(hstart + kernel_h, height); + const int wend = min(wstart + kernel_w, width); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + + float maxval = -FLT_MAX; + int maxidx = -1; + __global const float* bottom_slice = bottom + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (bottom_slice[index] > maxval) { + maxidx = index; + maxval = bottom_slice[maxidx]; + } + } + } + top[i] = maxval; + mask[i] = (float)maxidx; + } +} + +__kernel void ave_pool_forward( + const int nthreads, __global const float* const bottom, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, __global float* top) { + + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + const int pw = i % pooled_w; + const int ph = (i / pooled_w) % pooled_h; + const int c = (i / pooled_w / pooled_h) % channels; + const int n = i / pooled_w / pooled_h / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = max(hstart, (int)0); + wstart = max(wstart, (int)0); + hend = min(hend, height); + wend = min(wend, width); + float aveval = 0; + __global const float* bottom_slice = bottom + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + top[i] = aveval / pool_size; + } +} + +__kernel void sto_pool_forward_train( + const int nthreads, __global const float* bottom, + const int channels, const int height, const int width, + const int pooled_h, const int pooled_w, const int kernel_h, + const int kernel_w, const int stride_h, const int stride_w, + __global float* rand_idx, __global float* top) { + + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + const int pw = i % pooled_w; + const int ph = (i / pooled_w) % pooled_h; + const int c = (i / pooled_w / pooled_h) % channels; + const int n = i / pooled_w / pooled_h / channels; + + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + float cumsum = 0.; + __global const float* bottom_slice = bottom + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + } + } + const float thres 
= rand_idx[i] * cumsum; + // Second pass: get value, and set index. + cumsum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + if (cumsum >= thres) { + rand_idx[i] = ((n * channels + c) * height + h) * width + w; + top[i] = bottom_slice[h * width + w]; + h = hend; + w = wend; + } + } + } + } +} + +__kernel void sto_pool_forward_test( + const int nthreads, __global const float* const bottom, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + __global float* top) { + + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + const int pw = i % pooled_w; + const int ph = (i / pooled_w) % pooled_h; + const int c = (i / pooled_w / pooled_h) % channels; + const int n = i / pooled_w / pooled_h / channels; + + const int hstart = ph * stride_h; + const int hend = min(hstart + kernel_h, height); + const int wstart = pw * stride_w; + const int wend = min(wstart + kernel_w, width); + // Initialize cumsum to FLT_MIN to avoid division by zero. + float cumsum = FLT_MIN; + float cumvalues = 0.; + __global const float* bottom_slice = bottom + (n * channels + c) * height * width; + // First pass: get sum + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + cumsum += bottom_slice[h * width + w]; + cumvalues += bottom_slice[h * width + w] * bottom_slice[h * width + w]; + } + } + top[i] = cumvalues / cumsum; + } +} + +__kernel void max_pool_backward(const int nthreads, + __global const float* top_diff, + __global const float* mask, + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + __global float* bottom_diff) { + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + // find out the local i + // find out the local offset + const int w = i % width; + const int h = (i / width) % height; + const int c = (i / width / height) % channels; + const int n = i / width / height / channels; + + const int phstart = + (h + pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1; + const int phend = min((h + pad_h) / stride_h + 1, pooled_h); + const int pwstart = + (w + pad_w < kernel_w) ? 
0 : (w + pad_w - kernel_w) / stride_w + 1; + const int pwend = min((w + pad_w) / stride_w + 1, pooled_w); + float gradient = 0.0f; + const int offset = (n * channels + c) * pooled_h * pooled_w; + __global const float* top_diff_slice = top_diff + offset; + __global const float* mask_slice = mask + offset; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (mask_slice[ph * pooled_w + pw] == (float)(h * width + w)) { + gradient += top_diff_slice[ph * pooled_w + pw]; + } + } + } + bottom_diff[i] = gradient; + } +} + +__kernel void ave_pool_backward(const int nthreads, + __global const float* top_diff, + const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w, + __global float* bottom_diff) { + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + // find out the local i + // find out the local offset + const int w = i % width + pad_w; + const int h = (i / width) % height + pad_h; + const int c = (i / width / height) % channels; + const int n = i / width / height / channels; + + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_h); + const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_w); + float gradient = 0.0; + __global const float* const top_diff_slice = top_diff + (n * channels + c) * pooled_h * pooled_w; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = min(hstart + kernel_h, height + pad_h); + int wend = min(wstart + kernel_w, width + pad_w); + int pool_size = (hend - hstart) * (wend - wstart); + gradient += top_diff_slice[ph * pooled_w + pw] / pool_size; + } + } + bottom_diff[i] = gradient; + } +} + +__kernel void sto_pool_backward( + const int nthreads, __global const float* rand_idx, + __global const float* const top_diff, const int channels, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + __global float* bottom_diff) { + + for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) { + // find out the local i + // find out the local offset + const int w = i % width; + const int h = (i / width) % height; + const int c = (i / width / height) % channels; + const int n = i / width / height / channels; + + const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; + const int phend = min(h / stride_h + 1, pooled_h); + const int pwstart = (w < kernel_w) ? 
0 : (w - kernel_w) / stride_w + 1; + const int pwend = min(w / stride_w + 1, pooled_w); + float gradient = 0.0; + __global const float* rand_idx_slice = rand_idx + (n * channels + c) * pooled_h * pooled_w; + __global const float* top_diff_slice = top_diff + (n * channels + c) * pooled_h * pooled_w; + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + gradient += top_diff_slice[ph * pooled_w + pw] + * (i == (int) (rand_idx_slice[ph * pooled_w + pw])?1.0:0.0); + } + } + bottom_diff[i] = gradient; + } +} + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/src/model/layer/pooling.h ---------------------------------------------------------------------- diff --git a/src/model/layer/pooling.h b/src/model/layer/pooling.h index f844799..d16db27 100644 --- a/src/model/layer/pooling.h +++ b/src/model/layer/pooling.h @@ -44,25 +44,34 @@ class Pooling : public Layer { const Tensor& grad) override; void ForwardMaxPooling(const float* bottom, const int num, const int channels, - const int height, const int width, const int kernel_h, - const int kernel_w, const int pad_h, const int pad_w, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* top, float* mask); void BackwardMaxPooling(const float* top, const float* mask, const int num, const int channels, const int height, const int width, + const int pooled_h, const int pooled_w, const int kernel_h, const int kernel_w, - const int pad_h, const int pad_w, const int stride_h, - const int stride_w, float* bottom); + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* bottom); void ForwardAvgPooling(const float* bottom, const int num, const int channels, - const int height, const int width, const int kernel_h, - const int kernel_w, const int pad_h, const int pad_w, - const int stride_h, const int stride_w, float* top); + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + float* top); void BackwardAvgPooling(const float* top, const int num, const int channels, - const int height, const int width, const int kernel_h, - const int kernel_w, const int pad_h, const int pad_w, + const int height, const int width, + const int pooled_h, const int pooled_w, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, float* bottom); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/test/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index efc1983..e1487d2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -30,12 +30,6 @@ ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" "gtest/gtest-all.cc") AUX_SOURCE_DIRECTORY(singa singa_test_source) LIST(REMOVE_ITEM singa_test_source "singa/test_ep.cc") -IF(NOT USE_OPENCL) - MESSAGE(STATUS "Skipping OpenCL tests") - LIST(REMOVE_ITEM singa_test_source "singa/test_opencl.cc") -ENDIF() - - ADD_EXECUTABLE(test_singa "gtest/gtest_main.cc" ${singa_test_source}) ADD_DEPENDENCIES(test_singa singa) #MESSAGE(STATUS "link libs" ${singa_linker_libs}) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/e3df3bd7/test/singa/test_opencl_convolution.cc 
---------------------------------------------------------------------- diff --git a/test/singa/test_opencl_convolution.cc b/test/singa/test_opencl_convolution.cc new file mode 100644 index 0000000..972756d --- /dev/null +++ b/test/singa/test_opencl_convolution.cc @@ -0,0 +1,223 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#include "../src/model/layer/convolution.h" +#include "../src/model/layer/opencl_convolution.h" + +#include "gtest/gtest.h" + +#ifdef USE_OPENCL + +using singa::OpenclConvolution; +using singa::OpenclDevice; +using singa::Shape; + + +TEST(OpenclConvolution, Setup) { + OpenclConvolution conv; + EXPECT_EQ("OpenclConvolution", conv.layer_type()); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(2); + convconf->set_kernel_w(2); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(1); + convconf->set_stride_w(1); + convconf->set_num_output(2); + convconf->set_bias_term(true); + conv.Setup(Shape{1, 3, 3}, conf); + + EXPECT_EQ(2u, conv.kernel_h()); + EXPECT_EQ(2u, conv.kernel_w()); + EXPECT_EQ(1u, conv.pad_h()); + EXPECT_EQ(1u, conv.pad_w()); + EXPECT_EQ(1u, conv.stride_h()); + EXPECT_EQ(1u, conv.stride_w()); + EXPECT_EQ(2u, conv.num_filters()); + EXPECT_EQ(true, conv.bias_term()); + EXPECT_EQ(1u, conv.channels()); + EXPECT_EQ(3u, conv.height()); + EXPECT_EQ(3u, conv.width()); +} + + +TEST(OpenclConvolution, Forward) { + const size_t batchsize = 2, c = 1, h = 3, w = 3; + const float x[batchsize * c * h * w] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + + auto ocl = std::make_shared<OpenclDevice>(); + singa::Tensor in(singa::Shape{batchsize, c, h, w}, ocl); + in.CopyDataFromHostPtr(x, batchsize * c * h * w); + + // Set weight and bias manually + const size_t num_filters = 1; + const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h + const float we[num_filters * col_height] = {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 1.0f, 0.0f}; + singa::Tensor weight(singa::Shape{num_filters, col_height}, ocl); + weight.CopyDataFromHostPtr(we, num_filters * col_height); + const float b[num_filters] = {1.0f}; + singa::Tensor bias(singa::Shape{num_filters}, ocl); + bias.CopyDataFromHostPtr(b, num_filters); + OpenclConvolution conv; + conv.set_weight(weight); + conv.set_bias(bias); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(3); + convconf->set_kernel_w(3); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(2); + convconf->set_stride_w(2); 
+ convconf->set_num_output(1); + convconf->set_bias_term(true); + conv.Setup(Shape{1, 3, 3}, conf); + + // Parameter "flag" does not influence convolution + singa::Tensor out1 = conv.Forward(singa::kTrain, in); + out1.ToHost(); + const float *outptr1 = out1.data<float>(); + // Input: 3*3; kernel: 3*3; stride: 2*2; padding: 1*1. + EXPECT_EQ(8u, out1.Size()); + + EXPECT_EQ(3.0f, outptr1[0]); + EXPECT_EQ(7.0f, outptr1[1]); + EXPECT_EQ(-3.0f, outptr1[2]); + EXPECT_EQ(12.0f, outptr1[3]); + EXPECT_EQ(3.0f, outptr1[4]); + EXPECT_EQ(7.0f, outptr1[5]); + EXPECT_EQ(-3.0f, outptr1[6]); + EXPECT_EQ(12.0f, outptr1[7]); +} + + +TEST(OpenclConvolution, Backward) { + // src_data + const size_t batchsize = 2, c = 1, src_h = 3, src_w = 3; + const float x[batchsize * c * src_h * src_w] = { + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, + 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + auto ocl = std::make_shared<OpenclDevice>(); + singa::Tensor in(singa::Shape{batchsize, c, src_h, src_w}, ocl); + in.CopyDataFromHostPtr(x, batchsize * c * src_h * src_w); + + // Set weight_ and bias_ manually + const size_t num_filters = 1; + const size_t col_height = 1 * 3 * 3; // channels * kernel_w * kernel_h + const float we[num_filters * col_height] = {1.0f, 1.0f, 0.0f, 0.0f, 0.0f, + -1.0f, 0.0f, 1.0f, 0.0f}; + singa::Tensor weight(singa::Shape{num_filters, col_height}, ocl); + weight.CopyDataFromHostPtr(we, num_filters * col_height); + const float b[num_filters] = {1.0f}; + singa::Tensor bias(singa::Shape{num_filters}, ocl); + bias.CopyDataFromHostPtr(b, num_filters); + OpenclConvolution conv; + conv.set_weight(weight); + conv.set_bias(bias); + + singa::LayerConf conf; + singa::ConvolutionConf *convconf = conf.mutable_convolution_conf(); + convconf->set_kernel_h(3); + convconf->set_kernel_w(3); + convconf->set_pad_h(1); + convconf->set_pad_w(1); + convconf->set_stride_h(2); + convconf->set_stride_w(2); + convconf->set_num_output(1); + convconf->set_bias_term(true); + convconf->set_workspace_byte_limit(256); + convconf->set_prefer("fastest"); + conv.Setup(Shape{1, 3, 3}, conf); + + singa::Tensor out1 = conv.Forward(singa::kTrain, in); + + // grad + const size_t grad_h = 2, grad_w = 2; + const float dy[batchsize * num_filters * grad_h * grad_w] = { + 0.1f, 0.2f, 0.3f, 0.4f, 0.1f, 0.2f, 0.3f, 0.4f}; + singa::Tensor grad(singa::Shape{batchsize, num_filters, grad_h, grad_w}, ocl); + grad.CopyDataFromHostPtr(dy, batchsize * num_filters * grad_h * grad_w); + + const auto ret = conv.Backward(singa::kTrain, grad); + singa::Tensor in_grad = ret.first; + in_grad.ToHost(); + const float *dx = in_grad.data<float>(); + const float *wptr = we; + EXPECT_EQ(18u, in_grad.Size()); + EXPECT_EQ(dy[0] * wptr[4], dx[0]); + EXPECT_EQ(dy[0] * wptr[5] + dy[1] * wptr[3], dx[1]); + EXPECT_EQ(dy[1] * wptr[4], dx[2]); + EXPECT_EQ(dy[0] * wptr[7] + dy[2] * wptr[1], dx[3]); + EXPECT_EQ( + dy[0] * wptr[8] + dy[1] * wptr[6] + dy[2] * wptr[2] + dy[3] * wptr[0], + dx[4]); + EXPECT_EQ(dy[1] * wptr[7] + dy[3] * wptr[1], dx[5]); + EXPECT_EQ(dy[2] * wptr[4], dx[6]); + EXPECT_EQ(dy[2] * wptr[5] + dy[3] * wptr[3], dx[7]); + EXPECT_EQ(dy[3] * wptr[4], dx[8]); + EXPECT_EQ(dy[4] * wptr[4], dx[9]); + EXPECT_EQ(dy[4] * wptr[5] + dy[1] * wptr[3], dx[10]); + EXPECT_EQ(dy[5] * wptr[4], dx[11]); + EXPECT_EQ(dy[4] * wptr[7] + dy[2] * wptr[1], dx[12]); + EXPECT_EQ( + dy[4] * wptr[8] + dy[5] * wptr[6] + dy[6] * wptr[2] + dy[7] * wptr[0], + dx[13]); + EXPECT_EQ(dy[5] * wptr[7] + dy[7] * wptr[1], dx[14]); + EXPECT_EQ(dy[6] * wptr[4], dx[15]); + 
EXPECT_EQ(dy[6] * wptr[5] + dy[7] * wptr[3], dx[16]); + EXPECT_EQ(dy[7] * wptr[4], dx[17]); + + singa::Tensor dw = ret.second[0]; + singa::Tensor db = ret.second[1]; + dw.ToHost(); + db.ToHost(); + const float *dbptr = db.data<float>(); + EXPECT_FLOAT_EQ(dy[0] + dy[1] + dy[2] + dy[3] + dy[4] + dy[5] + dy[6] + dy[7], + dbptr[0]); + + const float *dwptr = dw.data<float>(); + EXPECT_EQ(9u, dw.Size()); + EXPECT_FLOAT_EQ(dy[3] * x[4] + dy[7] * x[13], dwptr[0]); + EXPECT_FLOAT_EQ(dy[3] * x[5] + dy[7] * x[14] + dy[2] * x[3] + dy[6] * x[12], + dwptr[1]); + EXPECT_FLOAT_EQ(dy[2] * x[4] + dy[6] * x[13], dwptr[2]); + EXPECT_FLOAT_EQ(dy[1] * x[1] + dy[5] * x[10] + dy[3] * x[7] + dy[7] * x[16], + dwptr[3]); + EXPECT_FLOAT_EQ(dy[0] * x[0] + dy[4] * x[9] + dy[1] * x[2] + dy[5] * x[11] + + dy[2] * x[6] + dy[6] * x[15] + dy[3] * x[8] + + dy[7] * x[17], + dwptr[4]); + EXPECT_FLOAT_EQ(dy[0] * x[1] + dy[4] * x[10] + dy[2] * x[7] + dy[6] * x[16], + dwptr[5]); + EXPECT_FLOAT_EQ(dy[1] * x[4] + dy[5] * x[13], dwptr[6]); + EXPECT_FLOAT_EQ(dy[0] * x[3] + dy[4] * x[12] + dy[1] * x[5] + dy[5] * x[14], + dwptr[7]); + EXPECT_FLOAT_EQ(dy[0] * x[4] + dy[4] * x[13], dwptr[8]); +} + + +#endif // USE_OPENCL
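For reference, the output sizes asserted in the tests above follow the usual relation out = (in + 2 * pad - kernel) / stride + 1, with integer division. The standalone C++ sketch below is illustrative only and is not taken from the patch; the helper name conv_out_dim is made up. It reproduces the 8-element output checked by EXPECT_EQ(8u, out1.Size()) for the 3x3 input with a 3x3 kernel, stride 2 and padding 1, over batch size 2 and a single filter. The new pooling kernels likewise receive pooled_h and pooled_w precomputed on the host rather than deriving them in-kernel.

#include <cassert>
#include <cstddef>

// Hypothetical helper (not from the patch): standard output-size formula,
//   out = (in + 2 * pad - kernel) / stride + 1, using integer division.
static std::size_t conv_out_dim(std::size_t in, std::size_t kernel,
                                std::size_t pad, std::size_t stride) {
  return (in + 2 * pad - kernel) / stride + 1;
}

int main() {
  const std::size_t batchsize = 2, num_filters = 1, h = 3, w = 3;
  const std::size_t out_h = conv_out_dim(h, 3, 1, 2);  // (3 + 2 - 3) / 2 + 1 = 2
  const std::size_t out_w = conv_out_dim(w, 3, 1, 2);  // 2
  // Matches EXPECT_EQ(8u, out1.Size()) in the Forward test above.
  assert(batchsize * num_filters * out_h * out_w == 8u);
  return 0;
}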
