Repository: incubator-singa Updated Branches: refs/heads/master f647d685f -> 2d5f696bd
SINGA-296 - Add sign and to_host functions for pysinga tensor module add sign func for pysinga tensor; add tensor.to_host() which copies the tensor to a host tensor Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/3d407061 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/3d407061 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/3d407061 Branch: refs/heads/master Commit: 3d4070618022278902bea96d93eb914503b50ead Parents: f647d68 Author: wangwei <[email protected]> Authored: Sun Jan 22 12:02:35 2017 +0800 Committer: wangwei <[email protected]> Committed: Sun Jan 22 12:07:38 2017 +0800 ---------------------------------------------------------------------- python/singa/loss.py | 8 +- python/singa/metric.py | 1 + python/singa/net.py | 36 ++++++++- python/singa/snapshot.py | 12 ++- python/singa/tensor.py | 50 +++++++----- src/core/tensor/tensor_math_cpp.h | 2 +- src/core/tensor/tensor_math_opencl.cl | 86 ++++++++++----------- src/core/tensor/tensor_math_opencl.h | 120 ++++++++++++++--------------- test/singa/test_tensor_math.cc | 2 +- 9 files changed, 187 insertions(+), 130 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/python/singa/loss.py ---------------------------------------------------------------------- diff --git a/python/singa/loss.py b/python/singa/loss.py index f3330dc..60835fc 100644 --- a/python/singa/loss.py +++ b/python/singa/loss.py @@ -25,14 +25,13 @@ Example usage:: from singa import tensor from singa import loss - from singa.proto import model_pb2 x = tensor.Tensor((3, 5)) x.uniform(0, 1) # randomly generate the prediction activation y = tensor.from_numpy(np.array([0, 1, 3], dtype=np.int)) # set the truth f = loss.SoftmaxCrossEntropy() - l = f.forward(model_pb2.kTrain, x, y) # l is tensor with 3 loss values + l = f.forward(True, x, y) # l is tensor with 3 loss values g = f.backward() # g is a tensor containing all gradients of x w.r.t l ''' @@ -42,7 +41,6 @@ from proto import model_pb2 import tensor - class Loss(object): '''Base loss class. @@ -58,7 +56,7 @@ class Loss(object): '''Compute the loss values. Args: - flag (int): kTrain or kEval. If it is kTrain, then the backward + flag: kTrain/kEval or bool. If it is kTrain/True, then the backward function must be called before calling forward again. x (Tensor): the prediction Tensor y (Tensor): the ground truth Tensor, x.shape[0] must = y.shape[0] @@ -125,7 +123,7 @@ class SquaredError(Loss): It is implemented using Python Tensor operations. ''' def __init__(self): - super(SquareLoss, self).__init__() + super(SquaredError, self).__init__() self.err = None def forward(self, flag, x, y): http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/python/singa/metric.py ---------------------------------------------------------------------- diff --git a/python/singa/metric.py b/python/singa/metric.py index 3a5750d..da8213b 100644 --- a/python/singa/metric.py +++ b/python/singa/metric.py @@ -35,6 +35,7 @@ Example usage:: ''' + from .
import singa_wrap as singa import tensor http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/python/singa/net.py ---------------------------------------------------------------------- diff --git a/python/singa/net.py b/python/singa/net.py index 36c70f8..9d09740 100644 --- a/python/singa/net.py +++ b/python/singa/net.py @@ -17,8 +17,42 @@ """ Neural net class for constructing the nets using layers and providing access functions for net info, e.g., parameters. -""" +Example usages:: + + from singa import net as ffnet + from singa import metric + from singa import loss + from singa import layer + from singa import device + + # create net and add layers + net = ffnet.FeedForwardNet(loss.SoftmaxCrossEntropy(), metric.Accuracy()) + net.add(layer.Conv2D('conv1', 32, 5, 1, input_sample_shape=(3,32,32,))) + net.add(layer.Activation('relu1')) + net.add(layer.MaxPooling2D('pool1', 3, 2)) + net.add(layer.Flatten('flat')) + net.add(layer.Dense('dense', 10)) + + # init parameters + for p in net.param_values(): + if len(p.shape) == 0: + p.set_value(0) + else: + p.gaussian(0, 0.01) + + # move net onto gpu + dev = device.create_cuda_gpu() + net.to_device(dev) + + # training (skipped) + + # do prediction after training + x = tensor.Tensor((2, 3, 32, 32), dev) + x.uniform(-1, 1) + y = net.predict(x) + print tensor.to_numpy(y) +""" from .proto.model_pb2 import kTrain, kEval import tensor http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/python/singa/snapshot.py ---------------------------------------------------------------------- diff --git a/python/singa/snapshot.py b/python/singa/snapshot.py index c259850..bd8918e 100644 --- a/python/singa/snapshot.py +++ b/python/singa/snapshot.py @@ -18,6 +18,16 @@ ''' This script includes io::snapshot class and its methods. +Example usages:: + + from singa import snapshot + + sn1 = snapshot.Snapshot('param', False) + params = sn1.read() # read all params as a dictionary + + sn2 = snapshot.Snapshot('param_new', False) + for k, v in params.iteritems(): + sn2.write(k, v) ''' from . import singa_wrap as singa @@ -36,7 +46,7 @@ class Snapshot(object): buffer_size (int): Buffer size (in MB), default is 10 ''' self.snapshot = singa.Snapshot(f, mode, buffer_size) - + def write(self, param_name, param_val): '''Call Write method to write a parameter http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/python/singa/tensor.py ---------------------------------------------------------------------- diff --git a/python/singa/tensor.py b/python/singa/tensor.py index 57ce563..d1851d1 100644 --- a/python/singa/tensor.py +++ b/python/singa/tensor.py @@ -21,11 +21,11 @@ Example usage:: from singa import tensor from singa import device -# create a tensor with shape (2,3), default CppCPU device and float32 + # create a tensor with shape (2,3), default CppCPU device and float32 x = tensor.Tensor((2, 3)) x.set_value(0.4) -# create a tensor from a numpy array + # create a tensor from a numpy array npy = np.zeros((3, 3), dtype=np.float32) y = tensor.from_numpy(npy) @@ -40,13 +40,13 @@ Example usage:: r = tensor.relu(x) - r.to_host() # move the data back to host cpu - s = tensor.to_numpy(r) # tensor -> numpy array, r must be on cpu + s = tensor.to_numpy(r) # tensor -> numpy array There are two sets of tensor functions, Tensor member functions which would change the internal state of the Tensor instance. + Tensor module functions which accept Tensor instances as arguments and return Tensor instances.
@@ -558,28 +558,31 @@ def from_numpy(np_array): return ret -def to_numpy(t): - '''Convert the tensor into a numpy array. +def to_host(t): + '''Copy the data to a host tensor. + ''' + ret = t.clone() + ret.to_host() + return ret - Since numpy array is allocated on CPU devices, the input Tensor instance - must be on the default CppCPU device. + +def to_numpy(t): + '''Copy the tensor into a numpy array. Args: - t (Tensor), a Tensor on the default CppCPU device. + t (Tensor), a Tensor Returns: a numpy array ''' - assert (t.device.id() == -1) or (t.device is None), \ 'Please move the tensor onto the default host device' - - if t.dtype == core_pb2.kFloat32: - np_array = t.singa_tensor.GetFloatValue(int(t.size())) - elif t.dtype == core_pb2.kInt: - np_array = t.singa_tensor.GetIntValue(int(t.size())) + th = to_host(t) + if th.dtype == core_pb2.kFloat32: + np_array = th.singa_tensor.GetFloatValue(int(th.size())) + elif th.dtype == core_pb2.kInt: + np_array = th.singa_tensor.GetIntValue(int(th.size())) else: - print 'Not implemented yet for ', t.dtype - return np_array.reshape(t.shape) + print 'Not implemented yet for ', th.dtype + return np_array.reshape(th.shape) @@ -638,6 +641,17 @@ def sigmoid(t): return _call_singa_func(singa.Sigmoid, t.singa_tensor) +def sign(t): + ''' + Args: + t (Tensor): input Tensor + + Returns: + a new Tensor whose element y = sign(x) + ''' + return _call_singa_func(singa.Sign, t.singa_tensor) + + def sqrt(t): ''' Args: http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/src/core/tensor/tensor_math_cpp.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h index 5167fba..4f510ed 100644 --- a/src/core/tensor/tensor_math_cpp.h +++ b/src/core/tensor/tensor_math_cpp.h @@ -278,7 +278,7 @@ void Sign<float, lang::Cpp>(const size_t num, const Block *in, Block *out, float *outPtr = static_cast<float *>(out->mutable_data()); const float *inPtr = static_cast<const float *>(in->data()); for (size_t i = 0; i < num; i++) { - outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f; + outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0); } } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/src/core/tensor/tensor_math_opencl.cl ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.cl b/src/core/tensor/tensor_math_opencl.cl index 7b89970..d5bc62f 100644 --- a/src/core/tensor/tensor_math_opencl.cl +++ b/src/core/tensor/tensor_math_opencl.cl @@ -23,7 +23,7 @@ // Sum is basically reduction. // This reduction code is serial reduction modified from AMD's example.
// http://developer.amd.com/resources/documentation-articles/articles-whitepapers/opencl-optimization-case-study-simple-reductions/ -__kernel +__kernel void clkernel_fabs(const int num, __global const float* in, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -38,7 +38,7 @@ void clkernel_add_scalar(const int num, float x, __global const float* in, __glo } __kernel -void clkernel_add(const int num, __global const float* in1, __global const float* in2, +void clkernel_add(const int num, __global const float* in1, __global const float* in2, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -46,7 +46,7 @@ void clkernel_add(const int num, __global const float* in1, __global const float } __kernel -void clkernel_clamp(const int num, float low, float high, __global const float* in, +void clkernel_clamp(const int num, float low, float high, __global const float* in, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -62,7 +62,7 @@ void clkernel_divide_scalar_matx(const int num, __global const float* in1, const } __kernel -void clkernel_divide_scalar_xmat(const int num, const float x, __global const float* in1, +void clkernel_divide_scalar_xmat(const int num, const float x, __global const float* in1, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -70,7 +70,7 @@ void clkernel_divide_scalar_xmat(const int num, const float x, __global const fl } __kernel -void clkernel_divide(const int num, __global const float* in1, __global const float* in2, +void clkernel_divide(const int num, __global const float* in1, __global const float* in2, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -78,7 +78,7 @@ void clkernel_divide(const int num, __global const float* in1, __global const fl } __kernel -void clkernel_eltmult_scalar(const int num, const float x, __global const float* in, +void clkernel_eltmult_scalar(const int num, const float x, __global const float* in, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -86,7 +86,7 @@ void clkernel_eltmult_scalar(const int num, const float x, __global const float* } __kernel -void clkernel_eltmult(const int num, __global const float* in1, __global const float* in2, +void clkernel_eltmult(const int num, __global const float* in1, __global const float* in2, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -101,7 +101,7 @@ void clkernel_exp(const int num, __global const float* in, __global float* out) } __kernel -void clkernel_le(const int num, __global const float* in, const float x, +void clkernel_le(const int num, __global const float* in, const float x, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -116,7 +116,7 @@ void clkernel_log(const int num, __global const float* in, __global float* out) } __kernel -void clkernel_lt(const int num, __global const float* in, const float x, +void clkernel_lt(const int num, __global const float* in, const float x, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -124,7 +124,7 @@ void clkernel_lt(const int num, __global const float* in, const float x, } __kernel -void clkernel_ge(const int num, __global const float* in, const float x, +void clkernel_ge(const int num, __global const float* in, const float x, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -132,7 +132,7 @@ void clkernel_ge(const int num, __global const float* 
in, const float x, } __kernel -void clkernel_gt(const int num, __global const float* in, const float x, +void clkernel_gt(const int num, __global const float* in, const float x, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -140,7 +140,7 @@ void clkernel_gt(const int num, __global const float* in, const float x, } __kernel -void clkernel_pow_scalar(const int num, const float x, __global const float* in, +void clkernel_pow_scalar(const int num, const float x, __global const float* in, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -148,7 +148,7 @@ void clkernel_pow_scalar(const int num, const float x, __global const float* in, } __kernel -void clkernel_pow(const int num, __global const float* in1, __global const float* in2, +void clkernel_pow(const int num, __global const float* in1, __global const float* in2, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -180,7 +180,7 @@ __kernel void clkernel_sign(const int num, __global const float* in, __global float* out) { const int i = get_global_id(0); if (i >= num) return; - out[i] = sign(in[i]); + out[i] = (in[i] > 0) - (in[i] < 0); } __kernel @@ -193,7 +193,7 @@ void clkernel_sqrt(const int num, __global const float* in, __global float* out) // kernel for square is called pow(2). __kernel -void clkernel_subtract_scalar(const int num, __global const float* in, const float x, +void clkernel_subtract_scalar(const int num, __global const float* in, const float x, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -201,7 +201,7 @@ void clkernel_subtract_scalar(const int num, __global const float* in, const flo } __kernel -void clkernel_subtract(const int num, __global const float* in1, __global const float* in2, +void clkernel_subtract(const int num, __global const float* in1, __global const float* in2, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -210,8 +210,8 @@ void clkernel_subtract(const int num, __global const float* in1, __global const // reduce3 kernel from // https://github.com/sschaetz/nvidia-opencl-examples/blob/master/OpenCL/src/oclReduction/oclReduction_kernel.cl -__kernel -void clkernel_sum(const int num, __global const float* in, __global float* out, +__kernel +void clkernel_sum(const int num, __global const float* in, __global float* out, __local float* sdata) { const int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0); const int tid = get_local_id(0); @@ -253,7 +253,7 @@ void clkernel_tanh(const int num, __global const float* in, __global float* out) // ********************************************************* __kernel -void clkernel_amax(const int num, __global const float* in, __global int* ret, +void clkernel_amax(const int num, __global const float* in, __global int* ret, __local uint* sdata, __local size_t* temp) { const int gid = get_global_id(0); const int tid = get_local_id(0); @@ -272,7 +272,7 @@ void clkernel_amax(const int num, __global const float* in, __global int* ret, /* TODO: Fix line 284:20. 
__kernel -void clkernel_amin(const int num, __global const float* in, __global int* ret, +void clkernel_amin(const int num, __global const float* in, __global int* ret, __local float* sdata, __local size_t* temp) { const int gid = get_global_id(0); const int tid = get_local_id(0); @@ -294,7 +294,7 @@ void clkernel_amin(const int num, __global const float* in, __global int* ret, __kernel -void clkernel_asum(const int num, __global const float* in, __global float* out, +void clkernel_asum(const int num, __global const float* in, __global float* out, __local float* sdata) { const int tid = get_local_id(0); const int i = get_global_id(0); @@ -319,7 +319,7 @@ void clkernel_asum(const int num, __global const float* in, __global float* out, } __kernel -void clkernel_axpy(const int num, float alpha, __global const float* in, +void clkernel_axpy(const int num, float alpha, __global const float* in, __global float* out) { const int i = get_global_id(0); if (i >= num) return; @@ -362,13 +362,13 @@ void clkernel_scale(const int num, float x, __global float* out) { } __kernel -void clkernel_dot(const int num, __global const float* in1, __global const float* in2, +void clkernel_dot(const int num, __global const float* in1, __global const float* in2, __global float* out, __local float* scratch) { const int i = get_global_id(0); if (i >= num) return; int offset = i << 2; scratch[i] = in1[offset] * in2[offset]; - + } // First kernel from http://www.bealto.com/gpu-gemv_intro.html @@ -376,7 +376,7 @@ // fma(a, b, c) == (a * b) + c with infinite precision __kernel void clkernel_gemv(const int m, const int n, const float alpha, - __global const float* A, __global const float* v, + __global const float* A, __global const float* v, const float beta, __global float* out) { const int i = get_global_id(0); float sum = 0.0f; @@ -387,13 +387,13 @@ void clkernel_gemv(const int m, const int n, const float alpha, } // http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-dgmm -// X[j] = x[j*inc(x)] if inc(x) ≥ 0 // = x[(χ − 1)*|inc(x)| − j*|inc(x)|] if inc(x) < 0 // C = diag( X )*A __kernel void clkernel_dgmm_left(const int nrow, const int ncol, - __global const float* M, __global const float* v, + __global const float* M, __global const float* v, __global float* out) { const uint gidx = get_global_id(0); @@ -406,7 +406,7 @@ void clkernel_dgmm_left(const int nrow, const int ncol, // C = A*diag( X ) __kernel void clkernel_dgmm_right(const int nrow, const int ncol, - __global const float* M, __global const float* v, + __global const float* M, __global const float* v, __global float* out) { const uint gidx = get_global_id(0); @@ -420,7 +420,7 @@ void clkernel_dgmm_right(const int nrow, const int ncol, // C = α*A*B + β*C __kernel void clkernel_gemm(const uint nrowA, const uint ncolB, const uint ncolA, const float alpha, - __global const float* A, __global const float* B, const float beta, + __global const float* A, __global const float* B, const float beta, __global float* C, __local float* Asub, __local float* Bsub) { const uint lidx = get_local_id(0); @@ -428,10 +428,10 @@ void clkernel_gemm(const uint nrowA, const uint ncolB, const uint ncolA, const f const uint TS = get_local_size(0); // Tile size const uint gidx = TS * get_group_id(0) + lidx; // Row ID of C (0..M) const uint gidy = TS * get_group_id(1) + lidy; // Row ID of C (0..N) - + // Initialise the accumulation register float acc = 0.0f; - + // Loop
over all tiles const int numtiles = ncolA / TS; for (int t = 0; t < numtiles; t++) { @@ -439,23 +439,23 @@ void clkernel_gemm(const uint nrowA, const uint ncolB, const uint ncolA, const f const int tiledCol = TS * t + lidy; Asub[lidy * TS + lidx] = A[tiledCol * nrowA + gidx]; Bsub[lidy * TS + lidx] = B[gidy * ncolA + tiledRow]; - + barrier(CLK_LOCAL_MEM_FENCE); - + for(int k = 0; k < TS; k++) { acc += Asub[k * TS + lidx] * Bsub[lidy * TS + k] * alpha; } - + barrier(CLK_LOCAL_MEM_FENCE); } - + C[gidy * nrowA + gidx] = fma(beta, C[gidy * nrowA + gidx], acc); } __kernel -void clkernel_crossentropy(const uint batchsize, const uint dim, - __global const float* p, __global const int* t, +void clkernel_crossentropy(const uint batchsize, const uint dim, + __global const float* p, __global const int* t, __global float* loss) { const uint gidx = get_global_id(0); if (gidx >= batchsize) return; @@ -485,12 +485,12 @@ void clkernel_rowmax(const uint nrow, const uint ncol, __global const float* in, __global float* out) { const uint row_id = get_global_id(0); if (row_id >= nrow) return; - + float row_max_val = -FLT_MAX; for (uint i = 0; i < ncol; i++) { row_max_val = fmax(row_max_val, in[row_id * ncol + i]); } - + out[row_id] = row_max_val; } @@ -521,7 +521,7 @@ __kernel void clkernel_outerproduct(int m, const int n, __global const float* in1, __global const float* in2, __global float* out) { const int col = get_global_id(0); const int row = get_global_id(1); - + // TODO: This } @@ -541,7 +541,7 @@ __kernel void clkernel_sumrow(int nrow, int ncol, __global const float* in, __global float* out) { const int idx = get_global_id(0); if (idx >= nrow) return; - + float sum = 0.0f; for (int j = 0; j < ncol; j++) { sum += in[j + ncol * idx]; @@ -553,8 +553,8 @@ void clkernel_sumrow(int nrow, int ncol, __global const float* in, __global floa // Adapted from http://code.haskell.org/HsOpenCL/tests/bench/transpose.cl #define BLOCK_DIM 16 __kernel -void clkernel_transpose(uint nrow, uint ncol, - __global const float* in, __global float* out, +void clkernel_transpose(uint nrow, uint ncol, + __global const float* in, __global float* out, __local float* sdata) { uint gidx = get_global_id(0); uint gidy = get_global_id(1); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/src/core/tensor/tensor_math_opencl.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_opencl.h b/src/core/tensor/tensor_math_opencl.h index a209de4..bc876b3 100644 --- a/src/core/tensor/tensor_math_opencl.h +++ b/src/core/tensor/tensor_math_opencl.h @@ -50,10 +50,10 @@ template<> void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_fabs"); - + viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + v_out = v_in; enqueue(kernel((cl_int)num, v_in, v_out)); } @@ -62,11 +62,11 @@ void Abs<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Con template<> void Add<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); - + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + 
v_out = v_in + x_in; } @@ -86,7 +86,7 @@ void Clamp<float, lang::Opencl>(const size_t num, const float low, const float h const Block* in, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_clamp"); - + viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); @@ -97,7 +97,7 @@ void Clamp<float, lang::Opencl>(const size_t num, const float low, const float h template<> void Div<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); - + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); @@ -109,7 +109,7 @@ void Div<float, lang::Opencl>(const size_t num, const Block* in, const float x, template<> void Div<float, lang::Opencl>(const size_t num, const float x, const Block* in, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); - + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); @@ -131,7 +131,7 @@ void Div<float, lang::Opencl>(const size_t num, const Block* in1, const Block* i template<> void EltwiseMult<float, lang::Opencl>(const size_t num, const Block* in, const float x, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); - + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); @@ -154,7 +154,7 @@ template<> void Exp<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + v_out = viennacl::linalg::element_exp(v_in); } @@ -163,10 +163,10 @@ template<> void LE<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_le"); - + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); - + enqueue(kernel((cl_int)num, in_buf, x, out_buf)); } @@ -175,7 +175,7 @@ template<> void Log<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + v_out = viennacl::linalg::element_log(v_in); } @@ -184,10 +184,10 @@ template<> void LT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_lt"); - + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); - + enqueue(kernel((cl_int)num, in_buf, x, out_buf)); } @@ -196,10 +196,10 @@ template<> void GE<float, lang::Opencl>(const size_t num, const Block *in, 
const float x, Block *out, Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_ge"); - + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); - + enqueue(kernel((cl_int)num, in_buf, x, out_buf)); } @@ -208,10 +208,10 @@ template<> void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, Block *out, Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_gt"); - + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); - + enqueue(kernel((cl_int)num, in_buf, x, out_buf)); } @@ -219,7 +219,7 @@ void GT<float, lang::Opencl>(const size_t num, const Block *in, const float x, B template<> void Pow<float, lang::Opencl>(const size_t num, const Block* in, float x, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); - + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); @@ -242,10 +242,10 @@ template<> void ReLU<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_relu"); - + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); - + enqueue(kernel((cl_int)num, in_buf, out_buf)); } @@ -255,7 +255,7 @@ void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, Conte auto ocl_ctx = get_context(ctx->vcl_ctx_id); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + v_out = viennacl::scalar_vector<float>(num, x, ocl_ctx); } @@ -263,13 +263,13 @@ void Set<float, lang::Opencl>(const size_t num, const float x, Block* out, Conte template<> void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); - + const viennacl::vector<float> zero = viennacl::zero_vector<float>(num, ocl_ctx); const viennacl::vector<float> one = viennacl::scalar_vector<float>(num, 1.0f, ocl_ctx); - + viennacl::vector<float> v_in((const cl_mem)in->data(), num); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + v_out = viennacl::linalg::element_div(one, viennacl::linalg::element_exp(zero - v_in) + one); } @@ -277,11 +277,11 @@ void Sigmoid<float, lang::Opencl>(const size_t num, const Block* in, Block* out, template<> void Sign<float, lang::Opencl>(const size_t num, const Block* in, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); - auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_abs"); - + auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_sign"); + viennacl::vector<float> in_buf((const cl_mem)in->data(), num); viennacl::vector<float> out_buf(static_cast<cl_mem>(out->mutable_data()), num); - + enqueue(kernel(num, in_buf, out_buf)); } @@ -344,11 +344,11 @@ template<> void Bernoulli<float, lang::Opencl>(const size_t num, const float p, Block* out, Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("distribution.cl", 
"PRNG_threefry4x32_bernoulli"); - + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888}; - + enqueue(kernel(v_out, seed, 0.0f, 1.0f, p, rounds, cl_uint(num / 4))); } @@ -357,11 +357,11 @@ template<> void Gaussian<float, lang::Opencl>(const size_t num, const float mean, const float std, Block* out, Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_gaussian"); - + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888}; - + enqueue(kernel(v_out, seed, mean, std, rounds, cl_uint(num/4))); } @@ -370,11 +370,11 @@ template<> void Uniform<float, lang::Opencl>(const size_t num, const float low, const float high, Block* out, Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("distribution.cl", "PRNG_threefry4x32_uniform"); - + viennacl::ocl::packed_cl_uint seed = {0, 32, 42, 888}; - + viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); - + enqueue(kernel(v_out, seed, low, high, rounds, cl_uint(num/4))); } @@ -441,7 +441,7 @@ void Amin<float, lang::Opencl>(const size_t num, const Block* in, size_t* out, C delete temp; } - + template<> void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) { cl_int status = CL_SUCCESS; @@ -450,7 +450,7 @@ void Asum<float, lang::Opencl>(const size_t num, const Block* in, float* out, Co auto kernel = ctx->kernels->at(kname); cl::Buffer inbuf = *(static_cast<cl::Buffer*>(in->mutable_data())); - + size_t size = sizeof(float) * num; cl::Buffer outval(ctx->ocl_ctx, CL_MEM_WRITE_ONLY, size, nullptr, &status); OCL_CHECK(status, "Failed to create buffer!"); @@ -475,7 +475,7 @@ template<> void Axpy<float, lang::Opencl>(const size_t num, const float alpha, const Block* in, Block* out, Context* ctx) { viennacl::vector<float> inbuf((const cl_mem)in->data(), num); viennacl::vector<float> outbuf(static_cast<cl_mem>(out->mutable_data()), num); - + outbuf += alpha * inbuf; } @@ -483,7 +483,7 @@ void Axpy<float, lang::Opencl>(const size_t num, const float alpha, const Block* template<> void Nrm2<float, lang::Opencl>(const size_t num, const Block* in, float* out, Context* ctx) { viennacl::vector<float> inbuf((const cl_mem)in->data(), num); - + out[0] = viennacl::linalg::norm_2(inbuf); } @@ -491,7 +491,7 @@ void Nrm2<float, lang::Opencl>(const size_t num, const Block* in, float* out, Co template<> void Scale<float, lang::Opencl>(const size_t num, const float x, Block* out, Context* ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); - + viennacl::vector<float> x_in = viennacl::scalar_vector<float>(num, x, ocl_ctx); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), num); @@ -503,7 +503,7 @@ template<> void Dot<float, lang::Opencl>(const size_t num, const Block *in1, const Block *in2, float *out, Context *ctx) { viennacl::vector<float> in1_buf((const cl_mem)in1->data(), num); viennacl::vector<float> in2_buf((const cl_mem)in2->data(), num); - + out[0] = viennacl::linalg::inner_prod(in1_buf, in2_buf); } @@ -513,9 +513,9 @@ void GEMV<float, lang::Opencl>(bool trans, const size_t m, const size_t n, const const Block *A, const Block *v, const float beta, Block* out, Context* ctx) { viennacl::vector<float> v_buf((const cl_mem)v->data(), n); viennacl::vector<float> o_buf(static_cast<cl_mem>(out->mutable_data()), m); - + 
viennacl::matrix<float> A_buf; - + if (trans) { A_buf = viennacl::matrix<float>((const cl_mem)A->data(), n, m); A_buf = viennacl::trans(A_buf); @@ -537,9 +537,9 @@ void DGMM<float, lang::Opencl>(bool side_right, viennacl::matrix<float> M_buf((const cl_mem)M->data(), nrow, ncol); viennacl::vector<float> v_buf((const cl_mem)v->data(), nrow); viennacl::matrix<float> out_buf(static_cast<cl_mem>(out->mutable_data()), nrow, ncol); - + auto diag = viennacl::diag(v_buf); - + if (side_right) { out_buf = viennacl::linalg::prod(diag, M_buf); } else { @@ -556,21 +556,21 @@ void GEMM<float, lang::Opencl>(const bool transA, const bool transB, viennacl::matrix<float> A_buf, B_buf; viennacl::matrix<float> C_buf(static_cast<cl_mem>(C->mutable_data()), nrowA, ncolB); - + if (transA) { A_buf = viennacl::matrix<float>((const cl_mem)A->data(), ncolA, nrowA); A_buf = viennacl::trans(A_buf); } else { A_buf = viennacl::matrix<float>((const cl_mem)A->data(), nrowA, ncolA); } - + if (transB) { B_buf = viennacl::matrix<float>((const cl_mem)B->data(), ncolB, ncolA); B_buf = viennacl::trans(B_buf); } else { B_buf = viennacl::matrix<float>((const cl_mem)B->data(), ncolA, ncolB); } - + C_buf *= beta; C_buf += alpha * viennacl::linalg::prod(A_buf, B_buf); } @@ -582,11 +582,11 @@ void ComputeCrossEntropy<float, lang::Opencl>(const size_t batchsize, const size Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_crossentropy"); - + viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize); viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize); viennacl::vector<float> loss_buf(static_cast<cl_mem>(loss->mutable_data()), batchsize); - + enqueue(kernel((cl_uint)batchsize, (cl_uint)dim, p_buf, t_buf, loss_buf)); } @@ -597,11 +597,11 @@ void SoftmaxCrossEntropyBwd<float, lang::Opencl>(const size_t batchsize, const s Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_softmaxentropy"); - + viennacl::vector<float> p_buf((const cl_mem)p->data(), batchsize); viennacl::vector<float> t_buf((const cl_mem)t->data(), batchsize); viennacl::vector<float> grad_buf(static_cast<cl_mem>(grad->mutable_data()), batchsize); - + enqueue(kernel((cl_uint)batchsize, (cl_uint)dim, p_buf, t_buf, grad_buf)); } @@ -611,12 +611,12 @@ void RowMax<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block *in, Block *out, Context *ctx) { auto ocl_ctx = get_context(ctx->vcl_ctx_id); auto kernel = ocl_ctx.get_kernel("tensor_math_opencl.cl", "clkernel_rowmax"); - + // kernel.global_work_size(0, nrow); - + viennacl::matrix<float> in_buf((const cl_mem)in->data(), nrow, ncol); viennacl::vector<float> outbuf(static_cast<cl_mem>(out->mutable_data()), nrow); - + enqueue(kernel((cl_uint)nrow, (cl_uint)ncol, in_buf, outbuf)); } @@ -641,7 +641,7 @@ void Outer<float, lang::Opencl>(const size_t m, const size_t n, const Block* lhs viennacl::vector<float> lhs_in((const cl_mem)lhs->data(), m); viennacl::vector<float> rhs_in((const cl_mem)rhs->data(), n); viennacl::matrix<float> out_buf(static_cast<cl_mem>(out->mutable_data()), m, n); - + out_buf = viennacl::linalg::outer_prod(lhs_in, rhs_in); } @@ -650,7 +650,7 @@ template<> void SumColumns<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) { viennacl::matrix<float> m_in((const cl_mem)in->data(), nrow, ncol); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), nrow); - + 
v_out = viennacl::linalg::column_sum(m_in); } @@ -659,7 +659,7 @@ template<> void SumRows<float, lang::Opencl>(const size_t nrow, const size_t ncol, const Block* in, Block* out, Context* ctx) { viennacl::matrix<float> m_in((const cl_mem)in->data(), nrow, ncol); viennacl::vector<float> v_out(static_cast<cl_mem>(out->mutable_data()), ncol); - + v_out = viennacl::linalg::column_sum(m_in); } */ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3d407061/test/singa/test_tensor_math.cc ---------------------------------------------------------------------- diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc index c3a1039..116262c 100644 --- a/test/singa/test_tensor_math.cc +++ b/test/singa/test_tensor_math.cc @@ -105,7 +105,7 @@ TEST_F(TestTensorMath, MemberSign) { Tensor p = Sign(cc); const float *dptr1 = p.data<float>(); - EXPECT_EQ(0.0f, dptr1[0]); + EXPECT_EQ(-1.0f, dptr1[0]); EXPECT_EQ(0.0f, dptr1[1]); EXPECT_EQ(1.0f, dptr1[2]); }
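Example usage of the two pysinga functions added by this commit (a minimal sketch based on the diff above; it assumes SINGA was built with CUDA so that device.create_cuda_gpu() is available, otherwise the tensor can simply stay on the default host device)::

    from singa import tensor
    from singa import device

    dev = device.create_cuda_gpu()
    x = tensor.Tensor((2, 3), dev)
    x.uniform(-1, 1)

    s = tensor.sign(x)      # new module function: elementwise sign, y = sign(x) in {-1, 0, 1}
    h = tensor.to_host(s)   # new module function: clone s onto the default CppCPU device
    a = tensor.to_numpy(s)  # to_numpy now copies to the host internally, so s can stay on the GPU
    print a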
