http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/src/core/tensor/tensor_math_opencl.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h
index c4b1347..bfc051d 100644
--- a/src/core/tensor/tensor_math_opencl.h
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -17,12 +17,1084 @@
  */
 #ifndef SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
-#include "./tensor_math.h"
+
+#ifdef USE_OPENCL
+//#include <CL/cl2.hpp>
+
+#include "singa/utils/opencl_utils.h"
+#include "tensor_math.h"
 
 namespace singa {
 
+// Forward declarations of utility functions that exist only in this file.
+void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx);
+void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx);
+void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx);
+
+// **************************************
+// Element-wise functions
+// **************************************
+
+template<>
+void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_abs";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Add<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_add_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Add<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_add";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_clamp";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, low);
+  kernel.setArg(2, high);
+  kernel.setArg(3, inbuf);
+  kernel.setArg(4, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
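Every wrapper in this file follows one pattern: look the kernel up in ctx->kernels, unwrap the cl::Buffer behind each Block, bind arguments in the order the device code expects, and enqueue a 1-D range of num work-items. The device-side kernels live in a separate .cl source that is not part of this hunk, so the following is only a sketch of what two of them plausibly look like, inferred from the setArg order above:

    // Hypothetical OpenCL C bodies; argument order mirrors the host-side
    // setArg calls above. Not the actual SINGA kernel source.
    __kernel void clkernel_add_scalar(const int num, const float x,
                                      __global const float* in,
                                      __global float* out) {
      const int i = get_global_id(0);
      if (i >= num) return;  // guard in case the global size exceeds num
      out[i] = in[i] + x;
    }

    __kernel void clkernel_clamp(const int num, const float low, const float high,
                                 __global const float* in, __global float* out) {
      const int i = get_global_id(0);
      if (i >= num) return;
      out[i] = clamp(in[i], low, high);  // OpenCL built-in clamp
    }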
+
+
+template<>
+void Div<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide_scalar_matx";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Div<float, lang::Opencl>(const size_t num, const float x, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide_scalar_xmat";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Div<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_divide";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_eltmult_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_eltmult";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
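Since the same five steps repeat verbatim for every binary element-wise op, a possible host-side refactoring (illustrative only, not part of this patch; the helper name LaunchBinaryEltwise is made up) would be:

    // Illustrative helper capturing the repeated launch pattern for binary
    // element-wise ops; Add, Sub, Div, EltwiseMult, Pow could all forward here.
    static void LaunchBinaryEltwise(const std::string& kname, const size_t num,
                                    const Block* in1, const Block* in2,
                                    Block* out, Context* ctx) {
      auto kernel = ctx->kernels->at(kname);
      cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
      cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
      cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
      kernel.setArg(0, (cl_int)num);
      kernel.setArg(1, in1buf);
      kernel.setArg(2, in2buf);
      kernel.setArg(3, outbuf);
      cl_int status = ctx->ocl_cmdq.enqueueNDRangeKernel(
          kernel, cl::NDRange(0), cl::NDRange(num));
      OCL_CHECK(status, "Failed to enqueue kernel function!");
    }

With it, Div<float, lang::Opencl>(num, in1, in2, out, ctx) would reduce to LaunchBinaryEltwise("clkernel_divide", num, in1, in2, out, ctx).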
+
+
+template<>
+void Exp<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_exp";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void LE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_le";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Log<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_log";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_lt";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void GE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_ge";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_gt";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
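The four comparison wrappers (LE, LT, GE, GT) write their result into a float buffer, so the kernels presumably emit a 0/1 mask per element. A plausible body, again an assumption inferred from the argument list rather than the shipped source:

    // Assumed element-wise comparison kernel: emits 1.0f where the predicate
    // holds, 0.0f elsewhere. The real kernel source is not in this hunk.
    __kernel void clkernel_gt(const int num, __global const float* in,
                              const float x, __global float* out) {
      const int i = get_global_id(0);
      if (i >= num) return;
      out[i] = (in[i] > x) ? 1.0f : 0.0f;
    }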
+
+
+template<>
+void Pow<float, lang::Opencl>(const size_t num, const Block* in, float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_pow_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Pow<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_pow";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void ReLU<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_relu";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_set";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_sigmoid";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sign<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_sign";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sqrt<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_sqrt";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Square<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  Pow<float, lang::Opencl>(num, in, 2, out, ctx);
+}
+
+
+template<>
+void Sub<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_subtract_scalar";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, x);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Sub<float, lang::Opencl>(const size_t num, const Block* in1, const Block* in2, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_subtract";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outbuf);
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
 }
 
+template<>
+void Sum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_reduce";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  float* temp = new float[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete[] temp;
+}
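Sum gives clkernel_reduce a local scratch buffer of num floats and afterwards reads back only element 0, which is consistent with a single-pass work-group tree reduction over the whole input. A sketch under that assumption (the shipped kernel source is in a separate .cl file):

    // Hedged sketch of a tree reduction matching Sum's argument list
    // (num, in, out, __local scratch). Each work-group folds its chunk into
    // out[group_id]; reading only out[0] on the host assumes one group
    // covers the entire input.
    __kernel void clkernel_reduce(const int num, __global const float* in,
                                  __global float* out, __local float* scratch) {
      const int gid = get_global_id(0);
      const int lid = get_local_id(0);
      scratch[lid] = (gid < num) ? in[gid] : 0.0f;
      barrier(CLK_LOCAL_MEM_FENCE);
      for (int stride = get_local_size(0) / 2; stride > 0; stride >>= 1) {
        if (lid < stride) scratch[lid] += scratch[lid + stride];
        barrier(CLK_LOCAL_MEM_FENCE);
      }
      if (lid == 0) out[get_group_id(0)] = scratch[0];
    }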
+
+
+template<>
+void Tanh<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_tanh";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+// **************************************
+// Random functions
+// **************************************
+
+template<>
+void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+//std::string kname = "clkernel_bernoulli";
+  std::string kname = "PRNG_threefry4x32_bernoulli";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)(num / 4)); // Divide by 4 because the kernel processes float4 elements.
+  kernel.setArg(1, p);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+//std::string kname = "clkernel_gaussian";
+  std::string kname = "PRNG_threefry4x32_gaussian";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)(num / 4));
+  kernel.setArg(1, mean);
+  kernel.setArg(2, std);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+//std::string kname = "clkernel_uniform";
+  std::string kname = "PRNG_threefry4x32_uniform";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)(num / 4));
+  kernel.setArg(1, low);
+  kernel.setArg(2, high);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
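All three samplers dispatch to Threefry4x32 kernels and pass num / 4 because each work-item emits a float4 (four values at once). Threefry is a counter-based PRNG: a (counter, key) pair is hashed into random bits, so no per-thread generator state has to live in device memory. A schematic of the mapping, with threefry4x32() standing in for the real round function (the shipped kernel and its key/counter management are not in this patch, so everything below is invented for illustration):

    // Schematic only: threefry4x32() is an assumed device helper; real key
    // and counter handling will differ.
    __kernel void PRNG_threefry4x32_uniform(const int num_quads, const float low,
                                            const float high,
                                            __global float4* out) {
      const int i = get_global_id(0);
      if (i >= num_quads) return;
      uint4 bits = threefry4x32((uint4)(i, 0, 0, 0), (uint4)(0));
      float4 u = convert_float4(bits) / 4294967296.0f;  // map 32-bit ints to [0, 1)
      out[i] = low + u * (high - low);
    }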
+
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
+template<>
+void Amax<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_amax";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(size_t) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+  kernel.setArg(4, cl::Local(sizeof(size_t)));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  size_t* temp = new size_t[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete[] temp;
+}
+
+
+template<>
+void Amin<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_amin";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(size_t) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+  kernel.setArg(4, cl::Local(sizeof(size_t)));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  size_t* temp = new size_t[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete[] temp;
+}
+
+
+template<>
+void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_asum";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  float* temp = new float[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete[] temp;
+}
+
+
+template<>
+void Axpy<float, lang::Opencl>(const size_t num, const float alpha, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_axpy";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, alpha);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Nrm2<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_nrm2";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, inbuf);
+  kernel.setArg(2, outval);
+  kernel.setArg(3, cl::Local(size)); // local scratch sized to the input, matching the other reductions
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  float* temp = new float[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete[] temp;
+}
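Axpy above is the BLAS saxpy update, out = alpha * in + out. The assumed kernel body is a one-liner (again inferred from the setArg order, not taken from the shipped .cl source):

    // Assumed body behind clkernel_axpy (BLAS saxpy semantics).
    __kernel void clkernel_axpy(const int num, const float alpha,
                                __global const float* in, __global float* out) {
      const int i = get_global_id(0);
      if (i < num) out[i] = alpha * in[i] + out[i];
    }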
+
+
+template<>
+void Scale<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_scale";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, x);
+  kernel.setArg(2, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block *in2, float *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_dot";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer in1buf = *(static_cast<cl::Buffer*>(in1->mutable_data()));
+  cl::Buffer in2buf = *(static_cast<cl::Buffer*>(in2->mutable_data()));
+
+  size_t size = sizeof(float) * num;
+  cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status);
+  OCL_CHECK(status, "Failed to create buffer!");
+
+  kernel.setArg(0, (cl_int)num);
+  kernel.setArg(1, in1buf);
+  kernel.setArg(2, in2buf);
+  kernel.setArg(3, outval);
+  kernel.setArg(4, cl::Local(size));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(num));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+
+  float* temp = new float[num];
+  status = ctx->ocl_cmdq.enqueueReadBuffer(outval, CL_TRUE, 0, size, temp);
+  OCL_CHECK(status, "Failed to read from buffer!");
+  out[0] = temp[0];
+  delete[] temp;
+}
+
+
+template<>
+void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, const float alpha,
+                               const Block *A, const Block *v, const float beta, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_gemv";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
+  cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)m);
+  kernel.setArg(1, (cl_int)n);
+  kernel.setArg(2, alpha);
+  kernel.setArg(3, Abuf);
+  kernel.setArg(4, vbuf);
+  kernel.setArg(5, beta);
+  kernel.setArg(6, outbuf);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+
+template<>
+void DGMM<float, lang::Opencl>(bool side_right,
+                               const size_t nrow, const size_t ncol,
+                               const Block *M, const Block *v, Block *out, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  cl::Buffer Mbuf = *(static_cast<cl::Buffer*>(M->mutable_data()));
+  cl::Buffer vbuf = *(static_cast<cl::Buffer*>(v->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  std::string kname;
+  if (side_right) {
+    DiagVec_Right(ncol, vbuf, vbuf, ctx);
+    kname = "clkernel_dgmm_right";
+  } else {
+    DiagVec_Left(nrow, vbuf, vbuf, ctx);
+    kname = "clkernel_dgmm_left";
+  }
+
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, Mbuf);
+  kernel.setArg(3, vbuf);
+  kernel.setArg(4, outbuf);
+  kernel.setArg(5, cl::Local(sizeof(float) * nrow * ncol));
+
+  cl::NDRange global(nrow); // Only nrow, because the current implementation is one-dimensional.
+//cl::NDRange local();
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
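DGMM follows the cuBLAS naming: side_right computes out = M * diag(v), scaling column j of M by v[j]; otherwise out = diag(v) * M, scaling row i by v[i]. A plain CPU loop stating the intended semantics (illustrative only, not code from this commit):

    // Reference semantics for DGMM as a host-side loop.
    void DgmmReference(bool side_right, size_t nrow, size_t ncol,
                       const float* M, const float* v, float* out) {
      for (size_t i = 0; i < nrow; i++)
        for (size_t j = 0; j < ncol; j++)
          out[i * ncol + j] = M[i * ncol + j] * (side_right ? v[j] : v[i]);
    }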
+
+
+template<>
+void GEMM<float, lang::Opencl>(const bool transA, const bool transB,
+                               const size_t nrowA, const size_t ncolB, const size_t ncolA,
+                               const float alpha, const Block *A, const Block *B, const float beta,
+                               Block *C, Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_gemm";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer Abuf = *(static_cast<cl::Buffer*>(A->mutable_data()));
+  cl::Buffer Bbuf = *(static_cast<cl::Buffer*>(B->mutable_data()));
+  cl::Buffer Cbuf = *(static_cast<cl::Buffer*>(C->mutable_data()));
+
+  // If matrix A needs to be transposed, do it.
+  if (!transA)
+    Transpose(nrowA, ncolA, Abuf, Abuf, ctx);
+
+  // If matrix B needs to be transposed, do it.
+  if (!transB)
+    Transpose(nrowA, ncolB, Bbuf, Bbuf, ctx);
+
+  kernel.setArg(0, (cl_int)nrowA);
+  kernel.setArg(1, (cl_int)ncolB);
+  kernel.setArg(2, (cl_int)ncolA);
+  kernel.setArg(3, alpha);
+  kernel.setArg(4, Abuf);
+  kernel.setArg(5, Bbuf);
+  kernel.setArg(6, beta);
+  kernel.setArg(7, Cbuf);
+
+  cl::NDRange global(nrowA, ncolA);
+  cl::NDRange local(32, 32);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global, local);
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+template <>
+void ComputeCrossEntropy<float, lang::Opencl>(const size_t batchsize, const size_t dim,
+                                              const Block *p, const Block *t, Block *loss,
+                                              Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_crossentropy";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data()));
+  cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data()));
+  cl::Buffer lossbuf = *(static_cast<cl::Buffer*>(loss->mutable_data()));
+
+  kernel.setArg(0, (cl_uint)batchsize);
+  kernel.setArg(1, (cl_uint)dim);
+  kernel.setArg(2, pbuf);
+  kernel.setArg(3, tbuf);
+  kernel.setArg(4, lossbuf);
+
+  cl::NDRange global(batchsize);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+template <>
+void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const size_t dim,
+                                                 const Block *p, const Block *t, Block *grad,
+                                                 Context *ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_softmaxentropy";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer pbuf = *(static_cast<cl::Buffer*>(p->mutable_data()));
+  cl::Buffer tbuf = *(static_cast<cl::Buffer*>(t->mutable_data()));
+  cl::Buffer gradbuf = *(static_cast<cl::Buffer*>(grad->mutable_data()));
+
+  kernel.setArg(0, (cl_uint)batchsize);
+  kernel.setArg(1, (cl_uint)dim);
+  kernel.setArg(2, pbuf);
+  kernel.setArg(3, tbuf);
+  kernel.setArg(4, gradbuf);
+
+  cl::NDRange global(batchsize);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), global);
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
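For a softmax output p and one-hot target t, the combined softmax-plus-cross-entropy gradient with respect to the logits collapses to p - t, and the per-sample loss is -log of the probability at the target index. Assuming the kernel implements exactly that textbook form (the .cl source is not in this hunk, and the real kernel may take integer labels rather than one-hot rows):

    // Assumed per-sample body of clkernel_softmaxentropy: for one-hot t,
    // d(loss)/d(logit) after softmax is simply p - t.
    __kernel void clkernel_softmaxentropy(const uint batchsize, const uint dim,
                                          __global const float* p,
                                          __global const float* t,
                                          __global float* grad) {
      const uint row = get_global_id(0);
      if (row >= batchsize) return;
      for (uint j = 0; j < dim; j++)
        grad[row * dim + j] = p[row * dim + j] - t[row * dim + j];
    }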
+
+// **************************************
+// Matrix functions
+// **************************************
+/*
+template<>
+void AddCol<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) {
+  std::string kname = "clkernel_addcol";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
+  kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
+  kernel.setArg(4, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+}
+
+template<>
+void AddRow<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* A, const Block* v, Block* out, Context* ctx) {
+  std::string kname = "clkernel_addrow";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(A->mutable_data()));
+  kernel.setArg(3, static_cast<const float*>(v->mutable_data()));
+  kernel.setArg(4, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+}
+
+template<>
+void Outer<float, lang::Opencl>(const size_t m, const size_t n, const Block* lhs, const Block* rhs, Block* out, Context* ctx) {
+  std::string kname = "clkernel_outerproduct";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)m);
+  kernel.setArg(1, (cl_int)n);
+  kernel.setArg(2, static_cast<const float*>(lhs->data()));
+  kernel.setArg(3, static_cast<const float*>(rhs->data()));
+  kernel.setArg(4, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(m, n));
+}
+
+template<>
+void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
+  std::string kname = "clkernel_sumcol";
+  auto kernel = ctx->kernels->at(kname);
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, static_cast<const float*>(in->mutable_data()));
+  kernel.setArg(3, static_cast<float*>(out->mutable_data()));
+
+  ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+}
+*/
+/*
+template<>
+void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_sumrow";
+  auto kernel = ctx->kernels->at(kname);
+
+  cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data()));
+  cl::Buffer outbuf = *(static_cast<cl::Buffer*>(out->mutable_data()));
+
+  kernel.setArg(0, (cl_int)nrow);
+  kernel.setArg(1, (cl_int)ncol);
+  kernel.setArg(2, inbuf);
+  kernel.setArg(3, outbuf);
+  kernel.setArg(4, cl::Local(sizeof(float) * nrow * ncol));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+}
+*/
+
+
+#define BLOCK_DIM 16
+
+void Transpose(const size_t nrow, const size_t ncol, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_transpose";
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_uint)nrow);
+  kernel.setArg(1, (cl_uint)ncol);
+  kernel.setArg(2, in);
+  kernel.setArg(3, out);
+  kernel.setArg(4, cl::Local(sizeof(float) * (BLOCK_DIM + 1) * BLOCK_DIM));
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(nrow, ncol));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+#undef BLOCK_DIM
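Transpose hands the kernel a (BLOCK_DIM + 1) * BLOCK_DIM local tile; the extra column of padding is the standard trick for staggering local-memory accesses across banks so that reading a tile column back out does not serialize. A tiled transpose consistent with that argument list (assumed, not the shipped kernel):

    // Assumed tiled transpose. in is nrow x ncol, out is ncol x nrow; the
    // (BLOCK_DIM + 1)-wide tile rows avoid local-memory bank conflicts.
    #define BLOCK_DIM 16
    __kernel void clkernel_transpose(const uint nrow, const uint ncol,
                                     __global const float* in,
                                     __global float* out,
                                     __local float* tile) {
      const uint gx = get_group_id(0) * BLOCK_DIM;  // row-block origin in `in`
      const uint gy = get_group_id(1) * BLOCK_DIM;  // col-block origin in `in`
      const uint lx = get_local_id(0), ly = get_local_id(1);
      // Stage a BLOCK_DIM x BLOCK_DIM tile of `in` in local memory.
      uint r = gx + lx, c = gy + ly;
      if (r < nrow && c < ncol)
        tile[lx * (BLOCK_DIM + 1) + ly] = in[r * ncol + c];
      barrier(CLK_LOCAL_MEM_FENCE);
      // Write the tile back with the block origins swapped, i.e. transposed.
      r = gy + lx; c = gx + ly;
      if (r < ncol && c < nrow)
        out[r * nrow + c] = tile[ly * (BLOCK_DIM + 1) + lx];
    }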
+
+
+/// This is a utility function that transforms a single-row vector into a diagonal matrix.
+/// For example, a vector of size n becomes an n*n matrix in which only the entries on the
+/// main diagonal (nx == ny) are non-zero.
+void DiagVec_Left(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_diagvec_left";
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_uint)size);
+  kernel.setArg(1, in);
+  kernel.setArg(2, out);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+void DiagVec_Right(const size_t size, cl::Buffer& in, cl::Buffer& out, Context* ctx) {
+  cl_int status = CL_SUCCESS;
+
+  std::string kname = "clkernel_diagvec_right";
+  auto kernel = ctx->kernels->at(kname);
+
+  kernel.setArg(0, (cl_uint)size);
+  kernel.setArg(1, in);
+  kernel.setArg(2, out);
+
+  status = ctx->ocl_cmdq.enqueueNDRangeKernel(kernel, cl::NDRange(0), cl::NDRange(size));
+  OCL_CHECK(status, "Failed to enqueue kernel function!");
+}
+
+} // namespace singa
+
+#endif // USE_OPENCL
+
 #endif  // SINGA_CORE_TENSOR_TENSOR_MATH_OPENCL_H_
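For concreteness, DiagVec_Left/DiagVec_Right expand v = [a, b, c] into [[a,0,0],[0,b,0],[0,0,c]]. A minimal kernel of the assumed shape (why the left and right cases need distinct kernels is not visible from this hunk):

    // Assumed expansion kernel: work-item i writes row i of the n x n output,
    // with in[i] on the diagonal and zeros elsewhere.
    __kernel void clkernel_diagvec_left(const uint size,
                                        __global const float* in,
                                        __global float* out) {
      const uint i = get_global_id(0);
      if (i >= size) return;
      for (uint j = 0; j < size; j++)
        out[i * size + j] = (i == j) ? in[i] : 0.0f;
    }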
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/src/utils/opencl_utils.cc
----------------------------------------------------------------------
diff --git a/src/utils/opencl_utils.cc b/src/utils/opencl_utils.cc
new file mode 100644
index 0000000..e4fe69b
--- /dev/null
+++ b/src/utils/opencl_utils.cc
@@ -0,0 +1,63 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/utils/opencl_utils.h"
+
+#ifdef USE_OPENCL
+
+void PrintDeviceInfo(const cl::Device &dev) {
+  cl_int status = CL_SUCCESS;
+
+  LOG(INFO) << "\tDevice type: " << dev.getInfo<CL_DEVICE_TYPE>(&status);
+  LOG(INFO) << "\tUnified memory: " << dev.getInfo<CL_DEVICE_HOST_UNIFIED_MEMORY>(&status);
+  LOG(INFO) << "\tClock speed (MHz): " << dev.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(&status);
+  LOG(INFO) << "\tECC memory: " << dev.getInfo<CL_DEVICE_ERROR_CORRECTION_SUPPORT>(&status);
+  LOG(INFO) << "\tLittle endian: " << dev.getInfo<CL_DEVICE_ENDIAN_LITTLE>(&status);
+  LOG(INFO) << "\tCompute units: " << dev.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&status);
+  LOG(INFO) << "\tMax work grp size: " << dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&status);
+//LOG(INFO) << "\tMax work item size: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(&status);
+  LOG(INFO) << "\tMax item dimension: " << dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>(&status);
+  LOG(INFO) << "\tQueue properties: " << dev.getInfo<CL_DEVICE_QUEUE_PROPERTIES>(&status);
+  LOG(INFO) << "\tExecution capabilities: " << dev.getInfo<CL_DEVICE_EXECUTION_CAPABILITIES>(&status);
+  LOG(INFO) << "\tMax mem alloc size: " << dev.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>(&status);
+  LOG(INFO) << "\tGlobal mem size: " << dev.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>(&status);
+  LOG(INFO) << "\tLocal mem size: " << dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(&status);
+  LOG(INFO) << "\n";
+
+  OCL_CHECK(status, "Failed to retrieve device information!");
+}
+
+
+void PrintPlatformInfo(const cl::Platform &p) {
+  cl_int status = CL_SUCCESS;
+
+  LOG(INFO) << "\tName: " << p.getInfo<CL_PLATFORM_NAME>(&status);
+  LOG(INFO) << "\tProfile: " << p.getInfo<CL_PLATFORM_PROFILE>(&status);
+  LOG(INFO) << "\tVersion: " << p.getInfo<CL_PLATFORM_VERSION>(&status);
+  LOG(INFO) << "\tVendor: " << p.getInfo<CL_PLATFORM_VENDOR>(&status);
+  LOG(INFO) << "\tExtensions: " << p.getInfo<CL_PLATFORM_EXTENSIONS>(&status);
+  LOG(INFO) << "\n";
+
+  OCL_CHECK(status, "Failed to retrieve platform information!");
+}
+
+
+#endif  // USE_OPENCL
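These two helpers take an already-selected platform or device; a typical way to drive them (usage sketch, not code from this commit) is to enumerate everything the OpenCL runtime exposes:

    // Usage sketch: enumerate all platforms and their devices, logging each.
    #include <vector>
    #include "singa/utils/opencl_utils.h"

    void LogAllOpenclHardware() {
      std::vector<cl::Platform> platforms;
      cl::Platform::get(&platforms);
      for (const auto& p : platforms) {
        PrintPlatformInfo(p);
        std::vector<cl::Device> devices;
        p.getDevices(CL_DEVICE_TYPE_ALL, &devices);
        for (const auto& d : devices) PrintDeviceInfo(d);
      }
    }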
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/35c89308/test/singa/test_opencl.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_opencl.cc b/test/singa/test_opencl.cc
new file mode 100644
index 0000000..f426559
--- /dev/null
+++ b/test/singa/test_opencl.cc
@@ -0,0 +1,179 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "gtest/gtest.h"
+#include "singa/core/opencl_device.h"
+#include "singa/core/tensor.h"
+#include "singa/proto/core.pb.h"
+
+using singa::OpenclDevice;
+using singa::CppCPU;
+using singa::Block;
+using singa::Shape;
+using singa::Tensor;
+
+// Makes a float array and fills it with increasing values from 0.
+float* MakeMatrix(const int size) {
+  float* mat = new float[size];
+  for (int i = 0; i < size; i++)
+    mat[i] = i;
+  return mat;
+}
+
+
+TEST(OpenclDevice, Constructor) {
+  OpenclDevice dev;
+  EXPECT_EQ(0, dev.id());
+}
+
+TEST(OpenclDevice, MemoryAllocFree) {
+  OpenclDevice dev;
+  Block* b = dev.NewBlock(4);
+  EXPECT_NE(nullptr, b);
+  EXPECT_EQ(4u, b->size());
+  dev.FreeBlock(b);
+}
+
+// Tests the integrity of one round of data transfer to an OpenCL device and back.
+TEST(OpenclDevice, CopyDataToFrom) {
+  OpenclDevice dev;
+  CppCPU host;
+
+  Block* a = host.NewBlock(4);
+  Block* b = dev.NewBlock(4);
+  Block* c = host.NewBlock(4);
+
+  // Allocate the Block object on the host.
+  char s[] = {'a', 'b', 'c', 'x'};
+  host.CopyDataFromHostPtr(a, s, 4);
+
+  // Copy back and forth.
+  dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
+  dev.CopyDataToFrom(c, b, 4, singa::kDeviceToHost);
+
+  const char* astr = static_cast<const char*>(c->data());
+  EXPECT_EQ('a', astr[0]);
+  EXPECT_EQ('b', astr[1]);
+  EXPECT_EQ('x', astr[3]);
+}
+
+TEST(OpenclDevice, DuplicateDataOnDevice) {
+  OpenclDevice dev;
+  CppCPU host;
+
+  Block* a = host.NewBlock(4);
+  Block* b = dev.NewBlock(4);
+  Block* c = dev.NewBlock(4);
+  Block* d = host.NewBlock(4);
+
+  // Allocate the Block object on the host.
+  char s[] = {'a', 'b', 'c', 'x'};
+  host.CopyDataFromHostPtr(a, s, 4);
+
+  // Copy to device and duplicate.
+  dev.CopyDataToFrom(b, a, 4, singa::kHostToDevice);
+  dev.CopyDataToFrom(c, b, 4, singa::kDeviceToDevice);
+  dev.CopyDataToFrom(d, c, 4, singa::kDeviceToHost);
+
+  const char* astr = static_cast<const char*>(d->data());
+  EXPECT_EQ('a', astr[0]);
+  EXPECT_EQ('b', astr[1]);
+  EXPECT_EQ('x', astr[3]);
+}
+
+// Tensor tests
+
+TEST(OpenCL_TensorMath, TensorMath_CopyDataToDevice) {
+  auto ocl_dev = std::make_shared<OpenclDevice>();
+
+  Tensor t(Shape{1, 4}, ocl_dev);
+  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
+  t.CopyDataFromHostPtr(a, 4);
+
+  CppCPU host;
+  Block* host_out = host.NewBlock(sizeof(float) * 4);
+  ocl_dev->CopyDataToFrom(host_out, t.block(), sizeof(float) * 4, singa::kDeviceToHost);
+
+  float* out = static_cast<float*>(host_out->mutable_data());
+  EXPECT_EQ(1.0f, out[1]);
+  EXPECT_EQ(3.0f, out[3]);
+}
+
+TEST(OpenCL_TensorMath, TensorMath_Abs) {
+  auto ocl_dev = std::make_shared<OpenclDevice>();
+
+  Tensor in(Shape{1, 4}, ocl_dev);
+  float a[] = {0.0f, -1.0f, -2.0f, -3.0f};
+  in.CopyDataFromHostPtr(a, 4);
+
+  in = Abs(in);
+
+  CppCPU host;
+  Block* host_out = host.NewBlock(sizeof(float) * 4);
+  ocl_dev->CopyDataToFrom(host_out, in.block(), sizeof(float) * 4, singa::kDeviceToHost);
+
+  float* out = static_cast<float*>(host_out->mutable_data());
+  EXPECT_EQ(0.0f, out[0]);
+  EXPECT_EQ(1.0f, out[1]);
+  EXPECT_EQ(2.0f, out[2]);
+  EXPECT_EQ(3.0f, out[3]);
+}
+
+TEST(OpenCL_TensorMath, TensorMath_ScalarAdd) {
+  auto ocl_dev = std::make_shared<OpenclDevice>();
+
+  Tensor in(Shape{1, 4}, ocl_dev);
+  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
+  in.CopyDataFromHostPtr(a, 4);
+
+  in += 1.0f;
+
+  CppCPU host;
+  Block* host_out = host.NewBlock(sizeof(float) * 4);
+  ocl_dev->CopyDataToFrom(host_out, in.block(), sizeof(float) * 4, singa::kDeviceToHost);
+
+  float* out = static_cast<float*>(host_out->mutable_data());
+  EXPECT_EQ(1.0f, out[0]);
+  EXPECT_EQ(2.0f, out[1]);
+  EXPECT_EQ(3.0f, out[2]);
+  EXPECT_EQ(4.0f, out[3]);
+}
+
+TEST(OpenCL_TensorMath, TensorMath_EltwiseAdd) {
+  auto ocl_dev = std::make_shared<OpenclDevice>();
+
+  Tensor in_1(Shape{1, 4}, ocl_dev);
+  float a[] = {0.0f, 1.0f, 2.0f, 3.0f};
+  in_1.CopyDataFromHostPtr(a, 4);
+  Tensor in_2 = in_1.Clone();
+
+  in_2 += in_1;
+
+  CppCPU host;
+  Block* host_out = host.NewBlock(sizeof(float) * 4);
+  ocl_dev->CopyDataToFrom(host_out, in_2.block(), sizeof(float) * 4, singa::kDeviceToHost);
+
+  float* out = static_cast<float*>(host_out->mutable_data());
+  EXPECT_EQ(0.0f, out[0]);
+  EXPECT_EQ(2.0f, out[1]);
+  EXPECT_EQ(4.0f, out[2]);
+  EXPECT_EQ(6.0f, out[3]);
+}
