http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/core/tensor/tensor_math_opencl.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h
index 55acb09..6ab248b 100644
--- a/src/core/tensor/tensor_math_opencl.h
+++ b/src/core/tensor/tensor_math_opencl.h
@@ -49,7 +49,7 @@ namespace singa {
 template<>
 void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_fabs");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_fabs");
 
   viennacl::vector<float> v_in((const cl_mem)in->data(), num);
   viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
@@ -85,7 +85,7 @@ template<>
 void Clamp<float, lang::Opencl>(const size_t num, const float low, const float high,
                                 const Block* in, Block* out, Context* ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_clamp");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_clamp");
 
   viennacl::vector<float> v_in((const cl_mem)in->data(), num);
   viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
@@ -162,7 +162,7 @@ void Exp<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Con
 template<>
 void LE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_le");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_le");
 
   viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
   viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
@@ -183,7 +183,7 @@ void Log<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Con
 template<>
 void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_lt");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_lt");
 
   viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
   viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
@@ -195,7 +195,7 @@ void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, B
 template<>
 void GE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_ge");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_ge");
 
   viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
   viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
@@ -207,7 +207,7 @@ void GE<float, lang::Opencl>(const size_t num, const Block *in, const float x, B
 template<>
 void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_gt");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_gt");
 
   viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
   viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
@@ -241,7 +241,7 @@ void Pow<float, lang::Opencl>(const size_t num, const Block* in1, const Block* i
 template<>
 void ReLU<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_relu");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_relu");
 
   viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
   viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
@@ -277,7 +277,7 @@ void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* out,
 template<>
 void Sign<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_sign");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_sign");
 
   viennacl::vector<float> in_buf((const cl_mem)in->data(), num);
   viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num);
@@ -343,7 +343,7 @@ static cl_uint rounds = 8;
 template<>
 void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("distribution.cl", 
"PRNG_threefry4x32_bernoulli");
+  auto kernel = ocl_ctx.get_kernel("opencl_distribution", 
"PRNG_threefry4x32_bernoulli");
 
   viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
 
@@ -356,7 +356,7 @@ void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out,
 template<>
 void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("distribution.cl", 
"PRNG_threefry4x32_gaussian");
+  auto kernel = ocl_ctx.get_kernel("opencl_distribution", 
"PRNG_threefry4x32_gaussian");
 
   viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num);
 
@@ -369,7 +369,7 @@ void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const flo
 template<>
 void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("distribution.cl", 
"PRNG_threefry4x32_uniform");
+  auto kernel = ocl_ctx.get_kernel("opencl_distribution", 
"PRNG_threefry4x32_uniform");
 
   viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888};
 
@@ -562,7 +562,7 @@ void ComputeCrossEntropy<float, lang::Opencl>(bool int_target, const size_t batc
                          const size_t dim, const Block *p, const Block *t,
                          Block *loss, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", 
"clkernel_crossentropy");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", 
"clkernel_crossentropy");
 
   viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize);
   viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize);
@@ -577,7 +577,7 @@ void SoftmaxCrossEntropyBwd<float, lang::Opencl>(bool int_target, const size_t b
                             const Block *p, const Block *t, Block *grad,
                             Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", 
"clkernel_softmaxentropy");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", 
"clkernel_softmaxentropy");
 
   viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize);
   viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize);
@@ -591,7 +591,7 @@ template<>
 void RowMax<float, lang::Opencl>(const size_t nrow, const size_t ncol,
                                  const Block *in, Block *out, Context *ctx) {
   auto ocl_ctx = get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_rowmax");
+  auto kernel = ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_rowmax");
 
 //  kernel.global_work_size(0, nrow);
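
Note on the rename above: the first argument of get_kernel() is now a ViennaCL program name ("opencl_tensor_math", "opencl_distribution") rather than a .cl file name. As a hedged sketch, not part of this patch, the names are presumably registered once per context during OpenCL device setup from the strings that tool/opencl/clsrc_to_str.py generates into src/core/device/opencl_func.h; the helper below is illustrative only.

#include <viennacl/ocl/context.hpp>
// The singa::opencl::*_str constants are declared in the generated
// opencl_func.h (actual include path may differ from this sketch).

// Hypothetical registration step: compile each embedded .cl source and
// register it under the program name used by get_kernel() in this diff.
void RegisterOpenclPrograms(viennacl::ocl::context &ocl_ctx) {
  ocl_ctx.add_program(singa::opencl::tensormath_str, "opencl_tensor_math");
  ocl_ctx.add_program(singa::opencl::distribution_str, "opencl_distribution");
  ocl_ctx.add_program(singa::opencl::im2col_str, "opencl_im2col");
  ocl_ctx.add_program(singa::opencl::pooling_str, "opencl_pooling");
}

// After registration, a lookup such as
//   ocl_ctx.get_kernel("opencl_tensor_math", "clkernel_fabs");
// resolves against the compiled program instead of a .cl file on disk.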
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/core/tensor/tensormath_str.cpp
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensormath_str.cpp b/src/core/tensor/tensormath_str.cpp
deleted file mode 100644
index 38bf7fe..0000000
--- a/src/core/tensor/tensormath_str.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// This file is auto-generated, do not edit manually.
-// If any error occurs during compilation, please refer to clsrc_to_str.py
-#include <string>
-
-namespace singa {
-
-std::string tensormath_str = "/**\n * Licensed to the Apache Software 
Foundation (ASF) under one\n * or more contributor license agreements.  See the 
NOTICE file\n * distributed with this work for additional information\n * 
regarding copyright ownership.  The ASF licenses this file\n * to you under the 
Apache License, Version 2.0 (the\n * \"License\"); you may not use this file 
except in compliance\n * with the License.  You may obtain a copy of the 
License at\n *\n *     http://www.apache.org/licenses/LICENSE-2.0\n *\n * 
Unless required by applicable law or agreed to in writing, software\n * 
distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT 
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the 
License for the specific language governing permissions and\n * limitations 
under the License.\n */\n\n// **************************************\n// 
Element-wise functions\n// **************************************\n\n// Sum is 
basically re
 duction.\n// This reduction code is serial reduction modified from AMD\'s 
example.\n// 
http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/\n__kernel\nvoid
 clkernel_fabs(const int num, __global const float* in, __global float* out) 
{\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = 
fabs(in[i]);\n}\n\n__kernel\nvoid clkernel_add_scalar(const int num, float x, 
__global const float* in, __global float* out) {\n  const int i = 
get_global_id(0);\n  if (i >= num) return;\n  out[i] = in[i] + 
x;\n}\n\n__kernel\nvoid clkernel_add(const int num, __global const float* in1, 
__global const float* in2,\n  __global float* out) {\n  const int i = 
get_global_id(0);\n  if (i >= num) return;\n  out[i] = in1[i] + 
in2[i];\n}\n\n__kernel\nvoid clkernel_clamp(const int num, float low, float 
high, __global const float* in,\n  __global float* out) {\n  const int i = 
get_global_id(0);\n  if (i >= num) return;\n
   out[i] = clamp(in[i], low, high);\n}\n\n__kernel\nvoid 
clkernel_divide_scalar_matx(const int num, __global const float* in1, const 
float x,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i 
>= num) return;\n  out[i] = in1[i] / x;\n}\n\n__kernel\nvoid 
clkernel_divide_scalar_xmat(const int num, const float x, __global const float* 
in1,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= 
num) return;\n  out[i] = x / in1[i];\n}\n\n__kernel\nvoid clkernel_divide(const 
int num, __global const float* in1, __global const float* in2,\n  __global 
float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  
out[i] = in1[i] / in2[i];\n}\n\n__kernel\nvoid clkernel_eltmult_scalar(const 
int num, const float x, __global const float* in,\n  __global float* out) {\n  
const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in[i] * 
x;\n}\n\n__kernel\nvoid clkernel_eltmult(const int num, __global const float* 
in1, __global const flo
 at* in2,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i 
>= num) return;\n  out[i] = in1[i] * in2[i];\n}\n\n__kernel\nvoid 
clkernel_exp(const int num, __global const float* in, __global float* out) {\n  
const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = 
exp(in[i]);\n}\n\n__kernel\nvoid clkernel_le(const int num, __global const 
float* in, const float x,\n  __global float* out) {\n  const int i = 
get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] <= x) ? 1.0f : 
0.0f;\n}\n\n__kernel\nvoid clkernel_log(const int num, __global const float* 
in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) 
return;\n  out[i] = log(in[i]);\n}\n\n__kernel\nvoid clkernel_lt(const int num, 
__global const float* in, const float x,\n  __global float* out) {\n  const int 
i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] < x) ? 1.0f : 
0.0f;\n}\n\n__kernel\nvoid clkernel_ge(const int num, __global const float* in, 
const 
 float x,\n  __global float* out) {\n  const int i = get_global_id(0);\n  if (i 
>= num) return;\n  out[i] = (in[i] >= x) ? 1.0f : 0.0f;\n}\n\n__kernel\nvoid 
clkernel_gt(const int num, __global const float* in, const float x,\n  __global 
float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  
out[i] = (in[i] > x) ? 1.0f : 0.0f;\n}\n\n__kernel\nvoid 
clkernel_pow_scalar(const int num, const float x, __global const float* in,\n  
__global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) 
return;\n  out[i] = pow(in[i], x);\n}\n\n__kernel\nvoid clkernel_pow(const int 
num, __global const float* in1, __global const float* in2,\n  __global float* 
out) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = 
pow(in1[i], in2[i]);\n}\n\n__kernel\nvoid clkernel_relu(const int num, __global 
const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  
if (i >= num) return;\n  out[i] = (in[i] >= 0.0f) ? in[i] : 0.0f;\n}\n\n__kerne
 l\nvoid clkernel_set(const int num, const float x, __global float* out) {\n  
const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = 
x;\n}\n\n__kernel\nvoid clkernel_sigmoid(const int num, __global const float* 
in, __global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) 
return;\n  out[i] = 1 / (1 + exp(-(in[i])));\n}\n\n__kernel\nvoid 
clkernel_sign(const int num, __global const float* in, __global float* out) {\n 
 const int i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = (in[i] > 
0) - (in[i] < 0);\n}\n\n__kernel\nvoid clkernel_sqrt(const int num, __global 
const float* in, __global float* out) {\n  const int i = get_global_id(0);\n  
if (i >= num) return;\n  out[i] = sqrt(in[i]);\n}\n\n// kernel for square is 
called pow(2).\n\n__kernel\nvoid clkernel_subtract_scalar(const int num, 
__global const float* in, const float x,\n  __global float* out) {\n  const int 
i = get_global_id(0);\n  if (i >= num) return;\n  out[i] = in[i] - 
x;\n}\n\n__kernel
 \nvoid clkernel_subtract(const int num, __global const float* in1, __global 
const float* in2,\n   __global float* out) {\n  const int i = 
get_global_id(0);\n  if (i >= num) return;\n  out[i] = in1[i] - 
in2[i];\n}\n\n// reduce3 kernel from\n// 
https://github.com/sschaetz/nvidia-opencl-examples/blob/master/OpenCL/src/oclReduction/oclReduction_kernel.cl\n__kernel\nvoid
 clkernel_sum(const int num, __global const float* in, __global float* out,\n  
__local float* sdata) {\n  const int i = get_group_id(0)*(get_local_size(0)*2) 
+ get_local_id(0);\n  const int tid = get_local_id(0);\n  sdata[tid] = (i < 
num) ? in[i] : 0.0f;\n\n  // Perform the first level of reduction.\n  if (i + 
get_local_size(0) < num) {\nsdata[tid] += in[i + get_local_size(0)];\n  }\n  
barrier(CLK_LOCAL_MEM_FENCE);\n\n  for (int s = get_local_size(0)/2; s > 0; s 
>>= 1) {\nif (tid > s) {\n  sdata[tid] += sdata[tid + 
s];\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n\n  if (tid == 0) 
{\nout[get_group_id(0)] = sdata[0];\n  }\n}\n\
 n__kernel\nvoid clkernel_tanh(const int num, __global const float* in, 
__global float* out) {\n  const int i = get_global_id(0);\n  if (i >= num) 
return;\n  out[i] = tanh(in[i]);\n}\n\n// 
**************************************\n// Random functions\n// 
**************************************\n\n// See: distribution.cl\n\n// 
*********************************************************\n// BLAS functions, 
ref to http://docs.nvidia.com/cuda/cublas\n// 
*********************************************************\n\n__kernel\nvoid 
clkernel_amax(const int num, __global const float* in, __global int* ret,\n   
__local uint* sdata, __local size_t* temp) {\n  const int gid = 
get_global_id(0);\n  const int tid = get_local_id(0);\n\n  for(int s = 
get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid < s) {\n  sdata[tid] = 
(in[sdata[tid]] > in[tid+s]) ? sdata[tid] : 
tid;\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n  if (tid == 0) {\nret[0] = 
sdata[0];\n  }\n}\n\n\n/* TODO: Fix line 284:20.\n__kernel\nvoid clkerne
 l_amin(const int num, __global const float* in, __global int* ret,\n   __local 
float* sdata, __local size_t* temp) {\n  const int gid = get_global_id(0);\n  
const int tid = get_local_id(0);\n\n  // Initialize the values to pos 
infinity.\n  sdata[tid] = (gid < num) ? in[gid] : INFINITY;\n  
barrier(CLK_LOCAL_MEM_FENCE);\n\n  for(int s = get_local_size(0)/2; s > 0; s 
>>= 1) {\nif (tid < s) {\n  sdata[tid] = (in[sdata[tid]] < in[tid+s]) ? 
sdata[tid] : tid;\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n  if (tid == 0) 
{\nret[0] = sdata[0];\n  }\n}*/\n\n\n__kernel\nvoid clkernel_asum(const int 
num, __global const float* in, __global float* out,\n   __local float* sdata) 
{\n  const int tid = get_local_id(0);\n  const int i = get_global_id(0);\n\n  
// Initialize\n  sdata[tid] = (i < num) ? in[i] : INFINITY;\n  // Perform the 
first level of reduction.\n  if (i + get_local_size(0) < num) {\nsdata[tid] += 
in[i + get_local_size(0)];\n  }\n  barrier(CLK_LOCAL_MEM_FENCE);\n\n  for(int s 
= get_local_siz
 e(0)/2; s > 0; s >>= 1) {\nif (tid < s) {\n  sdata[tid] = fabs(sdata[tid + 
s]);\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n  if (tid == 0) {\nout[0] = 
sdata[0];\n  }\n}\n\n__kernel\nvoid clkernel_axpy(const int num, float alpha, 
__global const float* in,\n   __global float* out) {\n  const int i = 
get_global_id(0);\n  if (i >= num) return;\n  out[i] = fma(alpha, in[i], 
out[i]);\n}\n\n// This kernel is essentially the same as Sum, except that 
during the process\n// of reading in data to the local memory, the value is 
also doubled.\n// Then, just before submitting the sum to out, we do a 
square-root on it.\n__kernel\nvoid clkernel_nrm2(const int num, __global const 
float* in, __global float* out,\n   __local float* sdata) {\n  const int i = 
get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);\n  const int tid = 
get_local_id(0);\n  sdata[tid] = (i < num) ? (in[i] * in[i]) : 0.0f;\n\n  // 
Perform the first level of reduction.\n  if (i + get_local_size(0) < num) 
{\nsdata[tid] += in[i + 
 get_local_size(0)];\n  }\n  barrier(CLK_LOCAL_MEM_FENCE);\n\n  for (int s = 
get_local_size(0)/2; s > 0; s >>= 1) {\nif (tid > s) {\n  sdata[tid] += 
sdata[tid + s];\n}\nbarrier(CLK_LOCAL_MEM_FENCE);\n  }\n\n  if (tid == 0) 
{\nout[get_group_id(0)] = sqrt(sdata[0]);\n  }\n}\n\n__kernel\nvoid 
clkernel_scale(const int num, float x, __global float* out) {\n  const int i = 
get_global_id(0);\n  if (i >= num) return;\n  out[i] = x * 
out[i];\n}\n\n__kernel\nvoid clkernel_dot(const int num, __global const float* 
in1, __global const float* in2,\n    __global float* out, __local float* 
scratch) {\n  const int i = get_global_id(0);\n  if (i >= num) return;\n  int 
offset = i << 2;\n  scratch[i] = in1[offset] * in2[offset];\n\n}\n\n// First 
kernel from http://www.bealto.com/gpu-gemv_intro.html\n// y = \xce\xb1*A*v + 
\xce\xb2*y\n// fma(a, b, c) == (a * b) + c with infinite 
precision\n__kernel\nvoid clkernel_gemv(const int m, const int n, const float 
alpha,\n   __global const float* A, __global const
  float* v,\n   const float beta, __global float* out) {\n  const int i = 
get_global_id(0);\n  float sum  = 0.0f;\n  for (int k = 0; k < n; k++) {\n    
sum += fma(beta, out[i + m * k], alpha * A[i + m * k] * v[k]);\n  }\n  out[i] = 
sum;\n}\n\n// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-dgmm\n// X[j] 
= x[j*inc(x)] if inc(x) \xe2\x89\xa5 0\n//= x[(\xcf\x87 \xe2\x88\x92 
1)*|inc(x)| \xe2\x88\x92 j*|inc(x)|] if inc(x) < 0\n\n// C = diag( X 
)*A\n__kernel\nvoid clkernel_dgmm_left(const int nrow, const int 
ncol,\n__global const float* M, __global const float* v,\n__global float* out) 
{\n  const uint gidx = get_global_id(0);\n\n  uint offset = gidx * ncol;\n  for 
(uint i = 0; i < ncol; i++) {\nout[offset + i] = M[offset + i] * v[i];\n  
}\n}\n\n// C = A*diag( X )\n__kernel\nvoid clkernel_dgmm_right(const int nrow, 
const int ncol,\n __global const float* M, __global const float* v,\n __global 
float* out) {\n  const uint gidx = get_global_id(0);\n\n  uint offset = gidx * 
ncol;\n  for (
 uint i = 0; i < ncol; i++) {\nout[offset + i] = M[offset + i] * v[gidx];\n  
}\n}\n\n// TODO: Optimize with Reference from 
http://www.cedricnugteren.nl/tutorial.php?page=1\n//  C = \xce\xb1*A*B + 
\xce\xb2*C\n__kernel\nvoid clkernel_gemm(const uint nrowA, const uint ncolB, 
const uint ncolA, const float alpha,\n    __global const float* A, __global 
const float* B, const float beta,\n     __global float* C, __local float* Asub, 
__local float* Bsub) {\n\n  const uint lidx = get_local_id(0);\n  const uint 
lidy = get_local_id(1);\n  const uint TS = get_local_size(0); // Tile size\n  
const uint gidx = TS * get_group_id(0) + lidx; // Row ID of C (0..M)\n  const 
uint gidy = TS * get_group_id(1) + lidy; // Row ID of C (0..N)\n\n  // 
Initialise the accumulation register\n  float acc = 0.0f;\n\n  // Loop over all 
tiles\n  const int numtiles = ncolA / TS;\n  for (int t = 0; t < numtiles; t++) 
{\n    const int tiledRow = TS * t + lidx;\n    const int tiledCol = TS * t + 
lidy;\n    Asub[lidy * TS +
  lidx] = A[tiledCol * nrowA + gidx];\n    Bsub[lidy * TS + lidx] = B[gidy * 
ncolA + tiledRow];\n\n    barrier(CLK_LOCAL_MEM_FENCE);\n\n    for(int k = 0; k 
< TS; k++) {\n      acc += Asub[k * TS + lidx] * Bsub[lidy * TS + k] * alpha;\n 
   }\n\n    barrier(CLK_LOCAL_MEM_FENCE);\n  }\n\n  C[gidy * nrowA + gidx] = 
fma(beta, C[gidy * nrowA + gidx], acc);\n}\n\n\n__kernel\nvoid 
clkernel_crossentropy(const uint batchsize, const uint dim,\n   __global const 
float* p, __global const int* t,\n   __global float* loss) {\n  const uint gidx 
= get_global_id(0);\n  if (gidx >= batchsize) return;\n\n  int truth_idx = 
t[gidx];\n  if (truth_idx <= 0) return;\n  float prob_of_truth = p[gidx * dim + 
truth_idx];\n  loss[gidx] = -log(fmax(prob_of_truth, 
-FLT_MIN));\n}\n\n\n__kernel\nvoid clkernel_softmaxentropy(const uint 
batchsize, const uint dim,\n __global const float* p, __global const int* t,\n 
__global float* grad) {\n  const uint gidx = get_global_id(0);\n  if (gidx >= 
batchsize) return;\n\n  int
  truth_idx = t[gidx];\n  if (truth_idx <= 0) return;\n  grad[gidx * dim + 
truth_idx] -= 1.0;\n}\n\n\n__kernel\nvoid clkernel_rowmax(const uint nrow, 
const uint ncol,\n                     __global const float* in, __global 
float* out) {\n  const uint row_id = get_global_id(0);\n  if (row_id >= nrow) 
return;\n\n  float row_max_val = -FLT_MAX;\n  for (uint i = 0; i < ncol; i++) 
{\n    row_max_val = fmax(row_max_val, in[row_id * ncol + i]);\n  }\n\n  
out[row_id] = row_max_val;\n}\n\n\n// 
**************************************\n// Matrix functions\n// 
**************************************\n/*\n__kernel\nvoid clkernel_addcol(int 
nrow, int ncol, __global const float* A, __global const float* v, __global 
float* out) {\n  const int i = get_global_id(0);\n  const int j = 
get_global_id(1);\n  if (i >= nrow) return;\n  if (j >= ncol) return;\n  ret[j] 
= A[j + nrow * i] + v[j];\n}\n\n__kernel\nvoid clkernel_addrow(int nrow, int 
ncol, __global const float* A, __global const float* v, __global f
 loat* out) {\n  const int i = get_global_id(0);\n  const int j = 
get_global_id(1);\n  if (i >= nrow) return;\n  if (j >= ncol) return;\n  out[i] 
= A[i + ncol * j] + v[i];\n}\n\n__kernel\nvoid clkernel_outerproduct(int m, 
const int n, __global const float* in1, __global const float* in2, __global 
float* out) {\n  const int col = get_global_id(0);\n  const int row = 
get_global_id(1);\n\n  // TODO: This\n}\n\n__kernel\nvoid clkernel_sumcol(int 
nrow, int ncol, __global const float* in, __global float* out) {\n  const int i 
= get_global_id(0);\n  if (i >= nrow) return;\n\n  float sum = 0.0f;\n  for 
(int j = 0; j < nrow; j++) {\nsum += input[nrow * i + j];\n  }\n  out[i] = 
sum;\n}\n*/\n__kernel\nvoid clkernel_sumrow(int nrow, int ncol, __global const 
float* in, __global float* out) {\n  const int idx = get_global_id(0);\n  if 
(idx >= nrow) return;\n\n  float sum = 0.0f;\n  for (int j = 0; j < ncol; j++) 
{\nsum += in[j + ncol * idx];\n  }\n  out[idx] = sum;\n}\n\n\n// Adapted from 
http://c
 ode.haskell.org/HsOpenCL/tests/bench/transpose.cl\n#define BLOCK_DIM 
16\n__kernel\nvoid clkernel_transpose(uint nrow, uint ncol,\n__global const 
float* in, __global float* out,\n__local float* sdata) {\n  uint gidx = 
get_global_id(0);\n  uint gidy = get_global_id(1);\n\n  if ((gidx < ncol) && 
(gidy < nrow)) {\nuint id_in = gidy * ncol + gidx;\nsdata[get_local_id(1) * 
(BLOCK_DIM+1) + get_local_id(0)] = in[id_in];\n  }\n\n  
barrier(CLK_LOCAL_MEM_FENCE);\n\n  gidx = get_group_id(1) * BLOCK_DIM + 
get_local_id(0);\n  gidy = get_group_id(0) * BLOCK_DIM + get_local_id(1);\n  if 
((gidx < nrow) && (gidy < ncol)) {\nuint id_out = gidy * nrow + 
gidx;\nout[id_out] = sdata[get_local_id(0) * (BLOCK_DIM + 1) + 
get_local_id(1)];\n  }\n}\n/*\n__kernel\nvoid clkernel_transpose2(uint nrow, 
uint ncol, __global const float* in, __global float* out, __local float* sdata) 
{\n  const uint lidx = get_local_id(0);\n  const uint lidy = get_local_id(1);\n 
 const uint id0 = get_group_id(0) * ncol * lidx;\n  con
 st uint id1 = get_group_id(1) * nrow * lidy;\n\n  if (id0 < nrow && id1 < 
ncol) {\nsdata[lidx][lidy] = in[id1 * nrow + id0];\n  }\n\n  
barrier(CLK_LOCAL_MEM_FENCE);\n\n  const uint new_id0 = get_group_id(1) * nrow 
+ lidx;\n  const uint new_id1 = get_group_id(0) * ncol + lidy;\n\n  if (new_id0 
< ncol && new_id1 < nrow) {\nout[new_id1 * ncol + new_id0] = 
sdata[lidx][lidy];\n  }\n}*/\n\n__kernel\nvoid clkernel_diagvec_left(uint 
vsize, __global const float* vin, __global float* out) {\n  const uint gid = 
get_global_id(0);\n\n  for (uint i = 0; i < vsize; i++)\nout[gid * vsize + i] = 
(i == gid) ? vin[gid] : 0.0f;\n}\n\n\n__kernel\nvoid 
clkernel_diagvec_right(uint vsize, __global const float* vin, __global float* 
out) {\n  const uint gid = get_global_id(0);\n\n  for (uint i = 0; i < vsize; 
i++)\nout[gid * vsize + i] = (i == gid) ? vin[gid] : 0.0f;\n}\n";
-
-} // namespace singa
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/model/layer/im2col_str.cpp
----------------------------------------------------------------------
diff --git a/src/model/layer/im2col_str.cpp b/src/model/layer/im2col_str.cpp
deleted file mode 100644
index 2245b82..0000000
--- a/src/model/layer/im2col_str.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// This file is auto-generated, do not edit manually.
-// If any error occurs during compilation, please refer to clsrc_to_str.py
-#include <string>
-
-namespace singa {
-
-std::string im2col_str = "// This file is modified from the file located 
at\n// 
https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/im2col.cl\n//
 and is covered under the BSD 2-Clause License, as indicated in the LICENSE\n// 
file at the root of this repository.\n\n__kernel void im2col(const int n, 
__global const float* data_im,\n                     const int data_im_off,\n   
                  const int height, const int width,\n                     
const int kernel_h, const int kernel_w,\n                     const int pad_h, 
const int pad_w,\n                     const int stride_h, const int 
stride_w,\n                     const int dilation_h, const int dilation_w,\n   
                  const int height_col, const int width_col,\n                  
   __global float* data_col, const int data_col_off) {\n\n  for (int index = 
get_global_id(0); index < n;\n      index += get_global_size(0)) {\n    const 
int h_index = index / width_col;\n    const int h_col = h_in
 dex % height_col;\n    const int w_col = index % width_col;\n    const int 
c_im = h_index / height_col;\n    const int c_col = c_im * kernel_h * 
kernel_w;\n    const int h_offset = h_col * stride_h - pad_h;\n    const int 
w_offset = w_col * stride_w - pad_w;\n    \n    __global float* data_col_ptr = 
data_col + data_col_off;\n    data_col_ptr += (c_col * height_col + h_col) * 
width_col + w_col;\n    __global const float* data_im_ptr = data_im + 
data_im_off;\n    data_im_ptr += (c_im * height + h_offset) * width + 
w_offset;\n    \n    for (int i = 0; i < kernel_h; ++i) {\n      for (int j = 
0; j < kernel_w; ++j) {\n        int h_im = h_offset + i * dilation_h;\n        
int w_im = w_offset + j * dilation_w;\n        *data_col_ptr =\n            
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?\n                
data_im_ptr[i * dilation_h * width + j * dilation_w] : 0;\n        data_col_ptr 
+= height_col * width_col;\n      }\n    }\n  }\n}\n\n__kernel void 
col2im(const int n, 
 __global const float* data_col,\n                     const int data_col_off, 
const int channels,\n                     const int height, const int width,\n  
                   const int kernel_h, const int kernel_w,\n                    
 const int pad_h, const int pad_w,\n                     const int stride_h, 
const int stride_w,\n                     const int dilation_h, const int 
dilation_w,\n                     const int height_col, const int width_col,\n  
                   __global float* data_im, const int data_im_off) {\n\n  for 
(int index = get_global_id(0); index < n; index += get_global_size(0)) {\n    
float val = 0;\n    const int w_im = index % width + pad_w;\n    const int h_im 
= (index / width) % height + pad_h;\n    const int c_im = index / (width * 
height);\n    int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;\n    int 
kernel_extent_h = (kernel_h - 1) * dilation_h + 1;\n    // compute the start 
and end of the output\n    const int w_col_start =\n        (w
 _im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1;\n    
const int w_col_end = min(w_im / stride_w + 1, width_col);\n    const int 
h_col_start =\n        (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) 
/ stride_h + 1;\n    const int h_col_end = min(h_im / stride_h + 1, 
height_col);\n    \n    // TODO: use LCM of stride and dilation to avoid 
unnecessary loops\n    for (int h_col = h_col_start; h_col < h_col_end; h_col 
+= 1) {\n      for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {\n 
       int h_k = (h_im - h_col * stride_h);\n        int w_k = (w_im - w_col * 
stride_w);\n        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {\n     
     h_k /= dilation_h;\n          w_k /= dilation_w;\n          int 
data_col_index = (((c_im * kernel_h + h_k) * kernel_w + w_k) *\n                
                height_col + h_col) * width_col + w_col;\n          val += 
data_col[data_col_off + data_col_index];\n        }\n      }\n    }\n    
data_im[data_i
 m_off + index] = val;\n  }\n}\n";
-
-} // namespace singa
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/model/layer/opencl_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/opencl_convolution.cc b/src/model/layer/opencl_convolution.cc
index 4b70a71..eb37236 100644
--- a/src/model/layer/opencl_convolution.cc
+++ b/src/model/layer/opencl_convolution.cc
@@ -175,7 +175,7 @@ void OpenclConvolution::Im2Col(Block* src, int data_im_off,
                                Block* dst, Context* ctx) {
 
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("im2col.cl", "im2col");
+  auto kernel = ocl_ctx.get_kernel("opencl_im2col", "im2col");
 
   auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx);
@@ -200,7 +200,7 @@ void OpenclConvolution::Col2Im(Block* src, const int col_data_off,
                                Block* dst, Context* ctx) {
                                
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("im2col.cl", "col2im");
+  auto kernel = ocl_ctx.get_kernel("opencl_im2col", "col2im");
   
   auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx);
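
For orientation, a hedged sketch of how such a kernel is presumably launched after the buffer wrapping shown above (these lines are not part of this hunk; the argument names mirror the im2col kernel signature embedded in im2col_str, and the actual locals inside Im2Col() may differ):

  // Hypothetical continuation, using ViennaCL's custom-kernel launch API.
  kernel.global_work_size(0, static_cast<size_t>(n));  // one work-item per column element
  viennacl::ocl::enqueue(kernel(n, src_buf, data_im_off, height, width,
                                kernel_h, kernel_w, pad_h, pad_w,
                                stride_h, stride_w, dilation_h, dilation_w,
                                height_col, width_col, dst_buf, data_col_off));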

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/model/layer/opencl_pooling.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/opencl_pooling.cc b/src/model/layer/opencl_pooling.cc
index f123270..155f2bb 100644
--- a/src/model/layer/opencl_pooling.cc
+++ b/src/model/layer/opencl_pooling.cc
@@ -131,7 +131,7 @@ void OpenclPooling::Pooling_Forward_Max(const int num, Block* src, Block* mask,
                                         Block* dst, const int channels,
                                         Context* ctx) {
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("pooling.cl", "max_pool_forward");
+  auto kernel = ocl_ctx.get_kernel("opencl_pooling", "max_pool_forward");
   
   auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx);
@@ -152,7 +152,7 @@ void OpenclPooling::Pooling_Forward_Ave(const int num, Block* src, Block* dst,
                                         const int pad_h, const int pad_w,
                                         const int channels, Context* ctx) {
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("pooling.cl", "ave_pool_forward");
+  auto kernel = ocl_ctx.get_kernel("opencl_pooling", "ave_pool_forward");
   
   auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx);
@@ -172,7 +172,7 @@ void OpenclPooling::Pooling_Forward_Sto_Train(Block* src, Block* rand,
                                               const int channels, 
                                               Block* dst, Context* ctx) {
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_forward_train");
+  auto kernel = ocl_ctx.get_kernel("opencl_pooling", "sto_pool_forward_train");
   
   auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx);
@@ -192,7 +192,7 @@ void OpenclPooling::Pooling_Forward_Sto_Test(Block* src, Block* dst,
                                              const int stride_h, const int stride_w,
                                              const int channels, Context* ctx) {
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_forward_test");
+  auto kernel = ocl_ctx.get_kernel("opencl_pooling", "sto_pool_forward_test");
   
   auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx);
@@ -213,7 +213,7 @@ void OpenclPooling::Pooling_Backward_Max(Block* top, Block* mask,
                                          const int stride_h, const int stride_w,
                                          Block* bottom, Context* ctx) {
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("pooling.cl", "max_pool_backward");
+  auto kernel = ocl_ctx.get_kernel("opencl_pooling", "max_pool_backward");
   
   auto src_buf = WrapHandle(static_cast<cl_mem>(top->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(bottom->mutable_data()), ocl_ctx);
@@ -235,7 +235,7 @@ void OpenclPooling::Pooling_Backward_Ave(Block* bottom,
                                          const int stride_h, const int stride_w,
                                          Block* top, Context* ctx) {
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("pooling.cl", "ave_pool_backward");
+  auto kernel = ocl_ctx.get_kernel("opencl_pooling", "ave_pool_backward");
   
   auto src_buf = WrapHandle(static_cast<cl_mem>(bottom->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(top->mutable_data()), ocl_ctx);
@@ -254,7 +254,7 @@ void OpenclPooling::Pooling_Backward_Sto(Block* src, Block* rand, Block* dst,
                                          const int stride_h, const int stride_w,
                                          const int channels, Context* ctx) {
   auto ocl_ctx = viennacl::ocl::get_context(ctx->vcl_ctx_id);
-  auto kernel = ocl_ctx.get_kernel("pooling.cl", "sto_pool_backward");
+  auto kernel = ocl_ctx.get_kernel("opencl_pooling", "sto_pool_backward");
   
   auto src_buf = WrapHandle(static_cast<cl_mem>(src->mutable_data()), ocl_ctx);
   auto dst_buf = WrapHandle(static_cast<cl_mem>(dst->mutable_data()), ocl_ctx);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/src/model/layer/pooling_str.cpp
----------------------------------------------------------------------
diff --git a/src/model/layer/pooling_str.cpp b/src/model/layer/pooling_str.cpp
deleted file mode 100644
index e306e09..0000000
--- a/src/model/layer/pooling_str.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// This file is auto-generated, do not edit manually.
-// If any error occurs during compilation, please refer to clsrc_to_str.py
-#include <string>
-
-namespace singa {
-
-std::string pooling_str = "// This file is modified from the file located 
at\n// 
https://github.com/BVLC/caffe/blob/opencl/src/caffe/greentea/cl_kernels/pooling.cl\n//
 and is covered under the BSD 2-Clause License, as indicated in the LICENSE\n// 
file at the root of this repository.\n\n__kernel void max_pool_forward(\n    
const int nthreads, __global const float* bottom, const int channels, \n    
const int height, const int width,\n    const int pooled_h, const int 
pooled_w,\n    const int kernel_h, const int kernel_w,\n    const int stride_h, 
const int stride_w,\n    const int pad_h, const int pad_w,\n    __global float* 
top, __global float* mask) {\n\n//  printf(\"%d \", get_global_size(0));\n  for 
(int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n    const 
int pw = i % pooled_w;\n    const int ph = (i / pooled_w) % pooled_h;\n    
const int c = (i / pooled_w / pooled_h) % channels;\n    const int n = i / 
pooled_w / pooled_h / channels;\n    \n    int hstart = ph
  * stride_h - pad_h;\n    int wstart = pw * stride_w - pad_w;\n    const int 
hend = min(hstart + kernel_h, height);\n    const int wend = min(wstart + 
kernel_w, width);\n    hstart = max(hstart, (int)0);\n    wstart = max(wstart, 
(int)0);\n    \n    float maxval = -FLT_MAX;\n    int maxidx = -1;\n    
__global const float* bottom_slice = bottom + (n * channels + c) * height * 
width;\n    for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w 
< wend; ++w) {\n        const int index = h * width + w;\n        if 
(bottom_slice[index] > maxval) {\n          maxidx = index;\n          maxval = 
bottom_slice[maxidx];\n        }\n      }\n    }\n    top[i] = maxval;\n    
mask[i] = (float)maxidx;\n  }\n}\n\n__kernel void ave_pool_forward(\n    const 
int nthreads, __global const float* const bottom, const int channels, \n    
const int height, const int width,\n    const int pooled_h, const int 
pooled_w,\n    const int kernel_h, const int kernel_w,\n    const int stride_h, 
const int
  stride_w, \n    const int pad_h, const int pad_w, __global float* top) {\n    
\n  for (int i = get_global_id(0); i < nthreads; i += get_global_size(0)) {\n   
 const int pw = i % pooled_w;\n    const int ph = (i / pooled_w) % pooled_h;\n  
  const int c = (i / pooled_w / pooled_h) % channels;\n    const int n = i / 
pooled_w / pooled_h / channels;\n    int hstart = ph * stride_h - pad_h;\n    
int wstart = pw * stride_w - pad_w;\n    int hend = min(hstart + kernel_h, 
height + pad_h);\n    int wend = min(wstart + kernel_w, width + pad_w);\n    
const int pool_size = (hend - hstart) * (wend - wstart);\n    hstart = 
max(hstart, (int)0);\n    wstart = max(wstart, (int)0);\n    hend = min(hend, 
height);\n    wend = min(wend, width);\n    float aveval = 0;\n    __global 
const float* bottom_slice = bottom + (n * channels + c) * height * width;\n    
for (int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; 
++w) {\n        aveval += bottom_slice[h * width + w];\n      }\n    }\
 n    top[i] = aveval / pool_size;\n  }\n}\n\n__kernel void 
sto_pool_forward_train(\n    const int nthreads, __global const float* 
bottom,\n    const int channels, const int height, const int width,\n    const 
int pooled_h, const int pooled_w, const int kernel_h,\n    const int kernel_w, 
const int stride_h, const int stride_w,\n    __global float* rand_idx, __global 
float* top) {\n    \n  for (int i = get_global_id(0); i < nthreads; i += 
get_global_size(0)) {\n    const int pw = i % pooled_w;\n    const int ph = (i 
/ pooled_w) % pooled_h;\n    const int c = (i / pooled_w / pooled_h) % 
channels;\n    const int n = i / pooled_w / pooled_h / channels;\n    \n    
const int hstart = ph * stride_h;\n    const int hend = min(hstart + kernel_h, 
height);\n    const int wstart = pw * stride_w;\n    const int wend = 
min(wstart + kernel_w, width);\n    float cumsum = 0.;\n    __global const 
float* bottom_slice = bottom + (n * channels + c) * height * width;\n    // 
First pass: get sum\n    for (
 int h = hstart; h < hend; ++h) {\n      for (int w = wstart; w < wend; ++w) 
{\n        cumsum += bottom_slice[h * width + w];\n      }\n    }\n    const 
float thres = rand_idx[i] * cumsum;\n    // Second pass: get value, and set 
i.\n    cumsum = 0;\n    for (int h = hstart; h < hend; ++h) {\n      for (int 
w = wstart; w < wend; ++w) {\n        cumsum += bottom_slice[h * width + w];\n  
      if (cumsum >= thres) {\n          rand_idx[i] = ((n * channels + c) * 
height + h) * width + w;\n          top[i] = bottom_slice[h * width + w];\n     
     h = hend;\n          w = wend;\n        }\n      }\n    }\n  
}\n}\n\n__kernel void sto_pool_forward_test(\n    const int nthreads, __global 
const float* const bottom, const int channels, \n    const int height, const 
int width,\n    const int pooled_h, const int pooled_w, \n    const int 
kernel_h, const int kernel_w, \n    const int stride_h, const int stride_w,\n   
 __global float* top) {\n    \n  for (int i = get_global_id(0); i < nthreads; i
  += get_global_size(0)) {\n    const int pw = i % pooled_w;\n    const int ph 
= (i / pooled_w) % pooled_h;\n    const int c = (i / pooled_w / pooled_h) % 
channels;\n    const int n = i / pooled_w / pooled_h / channels;\n    \n    
const int hstart = ph * stride_h;\n    const int hend = min(hstart + kernel_h, 
height);\n    const int wstart = pw * stride_w;\n    const int wend = 
min(wstart + kernel_w, width);\n    // We set cumsum to be 0 to avoid 
divide-by-zero problems\n    float cumsum = FLT_MIN;\n    float cumvalues = 
0.;\n    __global const float* bottom_slice = bottom + (n * channels + c) * 
height * width;\n    // First pass: get sum\n    for (int h = hstart; h < hend; 
++h) {\n      for (int w = wstart; w < wend; ++w) {\n        cumsum += 
bottom_slice[h * width + w];\n        cumvalues += bottom_slice[h * width + w] 
* bottom_slice[h * width + w];\n      }\n    }\n    top[i] = cumvalues / 
cumsum;\n  }\n}\n\n__kernel void max_pool_backward(const int nthreads,\n        
             
            __global const float* top_diff,\n                                
__global const float* mask,\n                                const int 
channels,\n                                const int height, const int width,\n 
                               const int pooled_h, const int pooled_w,\n        
                        const int kernel_h, const int kernel_w,\n               
                 const int stride_h, const int stride_w,\n                      
          const int pad_h, const int pad_w,\n                                
__global float* bottom_diff) {\n  for (int i = get_global_id(0); i < nthreads; 
i += get_global_size(0)) {\n    // find out the local i\n    // find out the 
local offset\n    const int w = i % width;\n    const int h = (i / width) % 
height;\n    const int c = (i / width / height) % channels;\n    const int n = 
i / width / height / channels;\n    \n    const int phstart =\n        (h + 
pad_h < kernel_h) ? 0 : (h + pad_h - kernel_h) / stride_h + 1;\n  
   const int phend = min((h + pad_h) / stride_h + 1, pooled_h);\n    const int 
pwstart =\n        (w + pad_w < kernel_w) ? 0 : (w + pad_w - kernel_w) / 
stride_w + 1;\n    const int pwend = min((w + pad_w) / stride_w + 1, 
pooled_w);\n    float gradient = 0.0f;\n    const int offset = (n * channels + 
c) * pooled_h * pooled_w;\n    __global const float* top_diff_slice = top_diff 
+ offset;\n    __global const float* mask_slice = mask + offset;\n    for (int 
ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < pwend; 
++pw) {\n        if (mask_slice[ph * pooled_w + pw] == (float)(h * width + w)) 
{\n          gradient += top_diff_slice[ph * pooled_w + pw];\n        }\n      
}\n    }\n    bottom_diff[i] = gradient;\n  }\n}\n\n__kernel void 
ave_pool_backward(const int nthreads,\n                                __global 
const float* top_diff,\n                                const int channels,\n   
                             const int height, const int width,\n               
                  const int pooled_h, const int pooled_w,\n                     
           const int kernel_h, const int kernel_w,\n                            
    const int stride_h, const int stride_w,\n                                
const int pad_h, const int pad_w,\n                                __global 
float* bottom_diff) {\n  for (int i = get_global_id(0); i < nthreads; i += 
get_global_size(0)) {\n    // find out the local i\n    // find out the local 
offset\n    const int w = i % width + pad_w;\n    const int h = (i / width) % 
height + pad_h;\n    const int c = (i / width / height) % channels;\n    const 
int n = i / width / height / channels;\n    \n    const int phstart = (h < 
kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n    const int phend = min(h / 
stride_h + 1, pooled_h);\n    const int pwstart = (w < kernel_w) ? 0 : (w - 
kernel_w) / stride_w + 1;\n    const int pwend = min(w / stride_w + 1, 
pooled_w);\n    float gradient = 0.0;\n    __global const float* const top_
 diff_slice = top_diff + (n * channels + c) * pooled_h * pooled_w;\n    for 
(int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < 
pwend; ++pw) {\n        // figure out the pooling size\n        int hstart = ph 
* stride_h - pad_h;\n        int wstart = pw * stride_w - pad_w;\n        int 
hend = min(hstart + kernel_h, height + pad_h);\n        int wend = min(wstart + 
kernel_w, width + pad_w);\n        int pool_size = (hend - hstart) * (wend - 
wstart);\n        gradient += top_diff_slice[ph * pooled_w + pw] / pool_size;\n 
     }\n    }\n    bottom_diff[i] = gradient;\n  }\n}\n\n__kernel void 
sto_pool_backward(\n    const int nthreads, __global const float* rand_idx,\n   
 __global const float* const top_diff, const int channels,\n    const int 
height, const int width,\n    const int pooled_h, const int pooled_w,\n    
const int kernel_h, const int kernel_w,\n    const int stride_h, const int 
stride_w,\n    __global float* bottom_diff) {\n\n  for (int i = get_global_id(
 0); i < nthreads; i += get_global_size(0)) {\n    // find out the local i\n    
// find out the local offset\n    const int w = i % width;\n    const int h = 
(i / width) % height;\n    const int c = (i / width / height) % channels;\n    
const int n = i / width / height / channels;\n    \n    const int phstart = (h 
< kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;\n    const int phend = min(h / 
stride_h + 1, pooled_h);\n    const int pwstart = (w < kernel_w) ? 0 : (w - 
kernel_w) / stride_w + 1;\n    const int pwend = min(w / stride_w + 1, 
pooled_w);\n    float gradient = 0.0;\n    __global const float* rand_idx_slice 
= rand_idx + (n * channels + c) * pooled_h * pooled_w;\n    __global const 
float* top_diff_slice = top_diff + (n * channels + c) * pooled_h * pooled_w;\n  
  for (int ph = phstart; ph < phend; ++ph) {\n      for (int pw = pwstart; pw < 
pwend; ++pw) {\n        gradient += top_diff_slice[ph * pooled_w + pw]\n        
    * (i == (int) (rand_idx_slice[ph * pooled_w + pw])?1.0:0
 .0);\n      }\n    }\n    bottom_diff[i] = gradient;\n  }\n}\n\n";
-
-} // namespace singa
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/f6cf8f5d/tool/opencl/clsrc_to_str.py
----------------------------------------------------------------------
diff --git a/tool/opencl/clsrc_to_str.py b/tool/opencl/clsrc_to_str.py
new file mode 100755
index 0000000..24400f7
--- /dev/null
+++ b/tool/opencl/clsrc_to_str.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+'''Extract Opencl source code into c++ strings, for runtime use.
+
+This file is executed only if .cl files are updated.
+It is executed in the ROOT folder of SINGA source repo.
+'''
+
+
+distribution = "./src/core/tensor/distribution.cl"
+tensormath = "./src/core/tensor/tensor_math_opencl.cl"
+im2col = "./src/model/layer/im2col.cl"
+pooling = "./src/model/layer/pooling.cl"
+
+files = {"distribution_str" : distribution, "tensormath_str" : tensormath, 
"im2col_str" : im2col, "pooling_str" : pooling}
+
+if __name__ == "__main__":
+    fullpath = './src/core/device/opencl_func.h'
+    with open(fullpath, 'w') as fout:
+        fout.write("// This file is auto-generated by 
tool/opencl/clsrc_to_str, do not edit manually.\n")
+        license = """
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+"""
+        fout.write(license)
+        fout.write("#include <string>\n\n")
+        fout.write("namespace singa {\n namespace opencl {\n")
+        for name, path in files.items():
+            with open(path, 'r') as fin:
+                src = fin.read()
+                src = repr(src)
+                src = src[1:-1]
+                src = src.replace('\"', '\\"')  # Escape double quotes
+                src = src.replace('\\t', '')  # Strip out tabs
+                fout.write("const std::string " + name + " = \"")
+                fout.write(src)
+                fout.write("\";")
+        fout.write("\n } //  namespace opencl \n} //  namespace singa")
+        fout.close()
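
For reference, the src/core/device/opencl_func.h produced by this script looks roughly like the sketch below (abridged and reformatted here for readability; each *_str constant holds the full escaped source of the corresponding .cl file, and the entries appear in whatever order the Python dict iterates):

// This file is auto-generated by tool/opencl/clsrc_to_str, do not edit manually.
/** ... Apache License 2.0 header ... */
#include <string>

namespace singa {
 namespace opencl {
const std::string distribution_str = "...contents of distribution.cl...";
const std::string tensormath_str = "...contents of tensor_math_opencl.cl...";
const std::string im2col_str = "...contents of im2col.cl...";
const std::string pooling_str = "...contents of pooling.cl...";
 } //  namespace opencl
} //  namespace singa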
