http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_cudnn_softmax.cc
----------------------------------------------------------------------
diff --cc test/singa/test_cudnn_softmax.cc
index e11be87,53ecb2b..d715b33
--- a/test/singa/test_cudnn_softmax.cc
+++ b/test/singa/test_cudnn_softmax.cc
@@@ -33,75 -35,133 +35,129 @@@ TEST(CudnnSoftmax, Setup) 
  
    singa::LayerConf conf;
    singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
-   softmaxconf->set_axis(2);
- 
-   sft.Setup(conf);
-   sft.InitCudnn(1, singa::kFloat32);
-   EXPECT_EQ(2, sft.Axis());
+   softmaxconf->set_algorithm("fast");
+   sft.Setup(Shape{1}, conf);
+   EXPECT_EQ(CUDNN_SOFTMAX_FAST, sft.Algorithm());
  }
  
- TEST(CudnnSoftmax, Forward) {
-   const float x[] = {1.0f, 2.0f, 0.0f, -2.0f, -3.0f, -1.0};
+ TEST(CudnnSoftmax, Forward1D) {
+   const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
    size_t n = sizeof(x) / sizeof(float);
--  singa::CudaGPU cuda(0, 1);
-   singa::Tensor in(singa::Shape{n}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+   singa::Shape shape = {n};
 -  singa::Tensor in(shape, &cuda);
++  singa::Tensor in(shape, cuda);
    in.CopyDataFromHostPtr<float>(x, n);
  
-   int axis = 1;
    CudnnSoftmax sft;
    singa::LayerConf conf;
    singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
-   softmaxconf->set_axis(axis);
-   sft.Setup(conf);
-   sft.InitCudnn(n, singa::kFloat32);
- 
+   softmaxconf->set_algorithm("accurate");
+   sft.Setup(Shape{1}, conf);
    singa::Tensor out = sft.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
--  out.ToDevice(&host);
-   const float* yptr = out.data<const float*>();
++  out.ToHost();
+   const float* yptr = out.data<float>();
    EXPECT_EQ(n, out.Size());
  
    float* y = new float[n];
    float sigma = 0.f;
    for (size_t i = 0; i < n; i++) sigma += exp(x[i]);
    for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma;
-   EXPECT_FLOAT_EQ(y[0], yptr[0]);
-   EXPECT_FLOAT_EQ(y[4], yptr[4]);
-   EXPECT_FLOAT_EQ(y[5], yptr[5]);
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
  }
  
- TEST(CudnnSoftmax, Backward) {
-   const float x[] = {1.0f, 2.0f, 3.0f, -2.0f, -3.0f, -1.0};
+ TEST(CudnnSoftmax, Backward1D) {
+   const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
    size_t n = sizeof(x) / sizeof(float);
--  singa::CudaGPU cuda(0, 1);
-   singa::Tensor in(singa::Shape{n}, &cuda);
+   singa::Shape shape = {n};
 -  singa::Tensor in(shape, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(shape, cuda);
    in.CopyDataFromHostPtr<float>(x, n);
  
-   int axis = 1;
    CudnnSoftmax sft;
    singa::LayerConf conf;
    singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
-   softmaxconf->set_axis(axis);
-   sft.Setup(conf);
+   softmaxconf->set_algorithm("accurate");
+   sft.Setup(Shape{1}, conf);
+ 
    singa::Tensor out = sft.Forward(singa::kTrain, in);
--  singa::CppCPU host(0, 1);
--  out.ToDevice(&host);
-   const float* yptr = out.data<const float*>();
++  out.ToHost();
+   const float* yptr = out.data<float>();
  
-   const float grad[] = {2.0f, -3.0f, 1.0f, 3.0f, -1.0f, -2.0};
-   singa::Tensor out_diff(singa::Shape{n}, &cuda);
+   const float grad[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
 -  singa::Tensor out_diff(shape, &cuda);
++  singa::Tensor out_diff(shape, cuda);
    out_diff.CopyDataFromHostPtr<float>(grad, n);
    const auto ret = sft.Backward(singa::kTrain, out_diff);
    singa::Tensor in_diff = ret.first;
--  in_diff.ToDevice(&host);
-   const float* xptr = in_diff.data<const float*>();
++  in_diff.ToHost();
+   const float* xptr = in_diff.data<float>();
  
    float* dx = new float[n];
    float sigma = 0.f;
    for (size_t i = 0; i < n; i++) sigma += grad[i] * yptr[i];
    for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma) * yptr[i];
-   EXPECT_FLOAT_EQ(dx[0], xptr[0]);
-   EXPECT_FLOAT_EQ(dx[4], xptr[4]);
-   EXPECT_FLOAT_EQ(dx[5], xptr[5]);
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+ }
+ 
+ TEST(CudnnSoftmax, Forward2D) {
+   const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batch = 2, c = 3;
 -  singa::CudaGPU cuda(0, 1);
+   singa::Shape shape = {batch, c};
 -  singa::Tensor in(shape, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(shape, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   CudnnSoftmax sft;
+   singa::LayerConf conf;
+   singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+   softmaxconf->set_algorithm("accurate");
+   sft.Setup(Shape{c}, conf);
+ 
+   singa::Tensor out = sft.Forward(singa::kTrain, in);
 -  singa::CppCPU host(0, 1);
 -  out.ToDevice(&host);
++  out.ToHost();
+   const float* yptr = out.data<float>();
+   EXPECT_EQ(n, out.Size());
+ 
+   float* y = new float[n];
+   float* sigma = new float[batch];
+   for (size_t i = 0; i < batch; i++) sigma[i] = 0.f;
+   for (size_t i = 0; i < n; i++) sigma[i / c] += exp(x[i]);
+   for (size_t i = 0; i < n; i++) y[i] = exp(x[i]) / sigma[i / c];
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+ }
+ 
+ TEST(CudnnSoftmax, Backward2D) {
+   const float x[] = {1.f, 2.f, 3.f, -2.f, -3.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batch = 2, c = 3;
 -  singa::CudaGPU cuda(0, 1);
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
+   singa::Shape shape = {batch, c};
 -  singa::Tensor in(shape, &cuda);
++  singa::Tensor in(shape, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   CudnnSoftmax sft;
+   singa::LayerConf conf;
+   singa::SoftmaxConf* softmaxconf = conf.mutable_softmax_conf();
+   softmaxconf->set_algorithm("accurate");
+   sft.Setup(Shape{c}, conf);
+ 
+   singa::Tensor out = sft.Forward(singa::kTrain, in);
 -  singa::CppCPU host(0, 1);
 -  out.ToDevice(&host);
++  out.ToHost();
+   const float* yptr = out.data<float>();
+ 
+   const float grad[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
 -  singa::Tensor out_diff(shape, &cuda);
++  singa::Tensor out_diff(shape, cuda);
+   out_diff.CopyDataFromHostPtr<float>(grad, n);
+   const auto ret = sft.Backward(singa::kTrain, out_diff);
+   singa::Tensor in_diff = ret.first;
 -  in_diff.ToDevice(&host);
++  in_diff.ToHost();
+   const float* xptr = in_diff.data<float>();
+ 
+   float* dx = new float[n];
+   float* sigma = new float[batch];
+   for (size_t i = 0; i < batch; i++) sigma[i] = 0.f;
+   for (size_t i = 0; i < n; i++) sigma[i / c] += grad[i] * yptr[i];
+   for (size_t i = 0; i < n; i++) dx[i] = (grad[i] - sigma[i / c]) * yptr[i];
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
  }
  #endif  // USE_CUDNN
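
The Forward2D/Backward2D tests above verify the cuDNN results against a host-side reference: a per-row softmax and the gradient dx = (g - sum_j g_j*y_j) * y. A standalone sketch of that reference math (plain C++, no SINGA or cuDNN dependency; the array values mirror the test fixtures):

// Per-row softmax forward and its gradient, as computed on the host in the
// 2D tests above. Plain C++ only.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const size_t batch = 2, c = 3;
  const float x[] = {1.f, 2.f, 0.f, -2.f, -3.f, -1.f};
  const float g[] = {2.f, -3.f, 1.f, 3.f, -1.f, -2.f};
  std::vector<float> y(batch * c), dx(batch * c);

  for (size_t b = 0; b < batch; b++) {
    // forward: softmax over each row of length c
    float sigma = 0.f;
    for (size_t j = 0; j < c; j++) sigma += std::exp(x[b * c + j]);
    for (size_t j = 0; j < c; j++) y[b * c + j] = std::exp(x[b * c + j]) / sigma;
    // backward: dx_j = (g_j - sum_k g_k * y_k) * y_j
    float dot = 0.f;
    for (size_t j = 0; j < c; j++) dot += g[b * c + j] * y[b * c + j];
    for (size_t j = 0; j < c; j++)
      dx[b * c + j] = (g[b * c + j] - dot) * y[b * c + j];
  }
  for (size_t i = 0; i < batch * c; i++)
    std::printf("y[%zu]=%f dx[%zu]=%f\n", i, y[i], i, dx[i]);
  return 0;
}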

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_dense.cc
----------------------------------------------------------------------
diff --cc test/singa/test_dense.cc
index 7ed4d33,a5fd960..363fb6e
--- a/test/singa/test_dense.cc
+++ b/test/singa/test_dense.cc
@@@ -1,242 -1,245 +1,238 @@@
--/************************************************************
--*
--* Licensed to the Apache Software Foundation (ASF) under one
--* or more contributor license agreements.  See the NOTICE file
--* distributed with this work for additional information
--* regarding copyright ownership.  The ASF licenses this file
--* to you under the Apache License, Version 2.0 (the
--* "License"); you may not use this file except in compliance
--* with the License.  You may obtain a copy of the License at
--*
--*   http://www.apache.org/licenses/LICENSE-2.0
--*
--* Unless required by applicable law or agreed to in writing,
--* software distributed under the License is distributed on an
--* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
--* KIND, either express or implied.  See the License for the
--* specific language governing permissions and limitations
--* under the License.
--*
--*************************************************************/
--#include "../src/model/layer/dense.h"
--#include "gtest/gtest.h"
- #include "singa/singa_config.h"
 -#include "singa_config.h"
--
--using singa::Dense;
 -using singa::Shape;
--TEST(Dense, Setup) {
--  Dense dense;
--  EXPECT_EQ("Dense", dense.layer_type());
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  EXPECT_EQ(3u, dense.num_output());
--  EXPECT_EQ(2u, dense.num_input());
--}
--#ifdef USE_CBLAS
--TEST(Dense, ForwardCpp) {
--  Dense dense;
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  const size_t batchsize = 3, vdim = 2, hdim = 3;
--  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
--  singa::Tensor in(singa::Shape{batchsize, vdim});
--  in.CopyDataFromHostPtr(x, batchsize * vdim);
--
--  // set weight
--  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
--  singa::Tensor weight(singa::Shape{hdim, vdim});
--  weight.CopyDataFromHostPtr(we, hdim * vdim);
--
--  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
--  singa::Tensor bias(singa::Shape{hdim});
--  bias.CopyDataFromHostPtr(bia, hdim);
--
--  dense.set_weight(weight);
--  dense.set_bias(bias);
--
--  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
-   const float *outptr1 = out1.data<const float *>();
 -  singa::CppCPU host(0, 1);
 -  const float *outptr1 = out1.data<float>();
--  EXPECT_EQ(9u, out1.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 3; j++)
--      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
--                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
--                      outptr1[i * 3 + j]);
--}
--#endif  // USE_CBLAS
 -#ifdef USE_CUDA
--TEST(Dense, BackwardCpp) {
--  Dense dense;
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  const size_t batchsize = 3, vdim = 2, hdim = 3;
--  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
 -  singa::CudaGPU cuda(0, 1);
--  singa::Tensor in(singa::Shape{batchsize, vdim});
--  in.CopyDataFromHostPtr(x, batchsize * vdim);
--
--  // set weight
--  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
--  singa::Tensor weight(singa::Shape{hdim, vdim});
--  weight.CopyDataFromHostPtr(we, hdim * vdim);
--
--  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
--  singa::Tensor bias(singa::Shape{hdim});
--  bias.CopyDataFromHostPtr(bia, hdim);
--
--  dense.set_weight(weight);
--  dense.set_bias(bias);
--
--  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
--
--  // grad
--  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
--                                      2.0f, 3.0f, 3.0f, 3.0f};
--  singa::Tensor grad(singa::Shape{batchsize, hdim});
--  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
--
--  const auto ret = dense.Backward(singa::kTrain, grad);
 -  singa::CppCPU host(0, 1);
--  singa::Tensor in_grad = ret.first;
--  singa::Tensor dweight = ret.second.at(0);
--  singa::Tensor dbias = ret.second.at(1);
-   const float *dx = in_grad.data<const float *>();
 -  const float *dx = in_grad.data<float>();
--  EXPECT_EQ(6u, in_grad.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 2; j++)
--      EXPECT_FLOAT_EQ(
--          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
--           dy[i * 3 + 2] * we[2 * 2 + j]),
--          dx[i * 2 + j]);
-   const float *dweightx = dweight.data<const float *>();
 -  const float *dweightx = dweight.data<float>();
--  EXPECT_EQ(6u, dweight.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 2; j++)
--      EXPECT_FLOAT_EQ(
--          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
--           dy[2 * 3 + i] * x[2 * 2 + j]),
--          dweightx[i * 2 + j]);
-   const float *dbiasx = dbias.data<const float *>();
 -  const float *dbiasx = dbias.data<float>();
--  EXPECT_EQ(3u, dbias.Size());
--  for (int i = 0; i < 3; i++)
--    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
--}
 -#endif
--
--#ifdef USE_CUDA
--TEST(Dense, ForwardCuda) {
--  Dense dense;
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  const size_t batchsize = 3, vdim = 2, hdim = 3;
--  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-   auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
-   singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
--  in.CopyDataFromHostPtr(x, batchsize * vdim);
--
--  // set weight
--  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-   singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
 -  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
--  weight.CopyDataFromHostPtr(we, hdim * vdim);
--
--  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
-   singa::Tensor bias(singa::Shape{hdim}, cuda);
 -  singa::Tensor bias(singa::Shape{hdim}, &cuda);
--  bias.CopyDataFromHostPtr(bia, hdim);
--
--  dense.set_weight(weight);
--  dense.set_bias(bias);
--
--  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
-   out1.ToHost();
-   const float *outptr1 = out1.data<const float *>();
 -  singa::CppCPU host(0, 1);
 -  out1.ToDevice(&host);
 -  const float *outptr1 = out1.data<float>();
--  EXPECT_EQ(9u, out1.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 3; j++)
--      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
--                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
--                      outptr1[i * 3 + j]);
--}
--TEST(Dense, BackwardCuda) {
--  Dense dense;
--
--  singa::LayerConf conf;
--  singa::DenseConf *denseconf = conf.mutable_dense_conf();
-   denseconf->set_num_input(2);
--  denseconf->set_num_output(3);
--  denseconf->set_transpose(false);
-   dense.Setup(conf);
 -  dense.Setup(Shape{2}, conf);
--
--  const size_t batchsize = 3, vdim = 2, hdim = 3;
--  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
-   auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
-   singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{batchsize, vdim}, &cuda);
--  in.CopyDataFromHostPtr(x, batchsize * vdim);
--
--  // set weight
--  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
-   singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
 -  singa::Tensor weight(singa::Shape{hdim, vdim}, &cuda);
--  weight.CopyDataFromHostPtr(we, hdim * vdim);
--
--  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
-   singa::Tensor bias(singa::Shape{hdim}, cuda);
 -  singa::Tensor bias(singa::Shape{hdim}, &cuda);
--  bias.CopyDataFromHostPtr(bia, hdim);
--
--  dense.set_weight(weight);
--  dense.set_bias(bias);
--
--  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
--
--  // grad
--  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
--                                      2.0f, 3.0f, 3.0f, 3.0f};
-   singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
 -  singa::Tensor grad(singa::Shape{batchsize, hdim}, &cuda);
--  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
--
--  const auto ret = dense.Backward(singa::kTrain, grad);
 -  singa::CppCPU host(0, 1);
--  singa::Tensor in_grad = ret.first;
--  singa::Tensor dweight = ret.second.at(0);
--  singa::Tensor dbias = ret.second.at(1);
-   in_grad.ToHost();
-   const float *dx = in_grad.data<const float *>();
 -  in_grad.ToDevice(&host);
 -  const float *dx = in_grad.data<float>();
--  EXPECT_EQ(6u, in_grad.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 2; j++)
--      EXPECT_FLOAT_EQ(
--          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
--           dy[i * 3 + 2] * we[2 * 2 + j]),
--          dx[i * 2 + j]);
-   dweight.ToHost();
-   const float *dweightx = dweight.data<const float *>();
 -  dweight.ToDevice(&host);
 -  const float *dweightx = dweight.data<float>();
--  EXPECT_EQ(6u, dweight.Size());
--  for (int i = 0; i < 3; i++)
--    for (int j = 0; j < 2; j++)
--      EXPECT_FLOAT_EQ(
--          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
--           dy[2 * 3 + i] * x[2 * 2 + j]),
--          dweightx[i * 2 + j]);
-   dbias.ToHost();
-   const float *dbiasx = dbias.data<const float *>();
 -  dbias.ToDevice(&host);
 -  const float *dbiasx = dbias.data<float>();
--  EXPECT_EQ(3u, dbias.Size());
--  for (int i = 0; i < 3; i++)
--    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
--}
--#endif
++/************************************************************
++*
++* Licensed to the Apache Software Foundation (ASF) under one
++* or more contributor license agreements.  See the NOTICE file
++* distributed with this work for additional information
++* regarding copyright ownership.  The ASF licenses this file
++* to you under the Apache License, Version 2.0 (the
++* "License"); you may not use this file except in compliance
++* with the License.  You may obtain a copy of the License at
++*
++*   http://www.apache.org/licenses/LICENSE-2.0
++*
++* Unless required by applicable law or agreed to in writing,
++* software distributed under the License is distributed on an
++* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++* KIND, either express or implied.  See the License for the
++* specific language governing permissions and limitations
++* under the License.
++*
++*************************************************************/
++#include "../src/model/layer/dense.h"
++#include "gtest/gtest.h"
++#include "singa/singa_config.h"
++
++using singa::Dense;
++using singa::Shape;
++TEST(Dense, Setup) {
++  Dense dense;
++  EXPECT_EQ("Dense", dense.layer_type());
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  EXPECT_EQ(3u, dense.num_output());
++  EXPECT_EQ(2u, dense.num_input());
++}
++#ifdef USE_CBLAS
++TEST(Dense, ForwardCpp) {
++  Dense dense;
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  const size_t batchsize = 3, vdim = 2, hdim = 3;
++  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++  singa::Tensor in(singa::Shape{batchsize, vdim});
++  in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++  // set weight
++  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++  singa::Tensor weight(singa::Shape{hdim, vdim});
++  weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++  singa::Tensor bias(singa::Shape{hdim});
++  bias.CopyDataFromHostPtr(bia, hdim);
++
++  dense.set_weight(weight);
++  dense.set_bias(bias);
++
++  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++  const float *outptr1 = out1.data<float>();
++  EXPECT_EQ(9u, out1.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 3; j++)
++      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
++                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
++                      outptr1[i * 3 + j]);
++}
++TEST(Dense, BackwardCpp) {
++  Dense dense;
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  const size_t batchsize = 3, vdim = 2, hdim = 3;
++  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++  singa::Tensor in(singa::Shape{batchsize, vdim});
++  in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++  // set weight
++  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++  singa::Tensor weight(singa::Shape{hdim, vdim});
++  weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++  singa::Tensor bias(singa::Shape{hdim});
++  bias.CopyDataFromHostPtr(bia, hdim);
++
++  dense.set_weight(weight);
++  dense.set_bias(bias);
++
++  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++
++  // grad
++  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
++                                      2.0f, 3.0f, 3.0f, 3.0f};
++  singa::Tensor grad(singa::Shape{batchsize, hdim});
++  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
++
++  const auto ret = dense.Backward(singa::kTrain, grad);
++  singa::Tensor in_grad = ret.first;
++  singa::Tensor dweight = ret.second.at(0);
++  singa::Tensor dbias = ret.second.at(1);
++  const float *dx = in_grad.data<float>();
++  EXPECT_EQ(6u, in_grad.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 2; j++)
++      EXPECT_FLOAT_EQ(
++          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
++           dy[i * 3 + 2] * we[2 * 2 + j]),
++          dx[i * 2 + j]);
++  const float *dweightx = dweight.data<float>();
++  EXPECT_EQ(6u, dweight.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 2; j++)
++      EXPECT_FLOAT_EQ(
++          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
++           dy[2 * 3 + i] * x[2 * 2 + j]),
++          dweightx[i * 2 + j]);
++  const float *dbiasx = dbias.data<float>();
++  EXPECT_EQ(3u, dbias.Size());
++  for (int i = 0; i < 3; i++)
++    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
++}
++#endif  // USE_CBLAS
++
++#ifdef USE_CUDA
++TEST(Dense, ForwardCuda) {
++  Dense dense;
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  const size_t batchsize = 3, vdim = 2, hdim = 3;
++  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
++  in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++  // set weight
++  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++  singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
++  weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++  singa::Tensor bias(singa::Shape{hdim}, cuda);
++  bias.CopyDataFromHostPtr(bia, hdim);
++
++  dense.set_weight(weight);
++  dense.set_bias(bias);
++
++  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++  out1.ToHost();
++  const float *outptr1 = out1.data<float>();
++  EXPECT_EQ(9u, out1.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 3; j++)
++      EXPECT_FLOAT_EQ((x[i * 2 + 0] * we[j * 2 + 0] +
++                       x[i * 2 + 1] * we[j * 2 + 1] + bia[j]),
++                      outptr1[i * 3 + j]);
++}
++TEST(Dense, BackwardCuda) {
++  Dense dense;
++
++  singa::LayerConf conf;
++  singa::DenseConf *denseconf = conf.mutable_dense_conf();
++  denseconf->set_num_output(3);
++  denseconf->set_transpose(false);
++  dense.Setup(Shape{2}, conf);
++
++  const size_t batchsize = 3, vdim = 2, hdim = 3;
++  const float x[batchsize * vdim] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
++  auto cuda = std::make_shared<singa::CudaGPU>(0, 1);
++  singa::Tensor in(singa::Shape{batchsize, vdim}, cuda);
++  in.CopyDataFromHostPtr(x, batchsize * vdim);
++
++  // set weight
++  const float we[hdim * vdim] = {1.0f, 1.0f, 1.0f, 2.0f, 0.0f, 1.0f};
++  singa::Tensor weight(singa::Shape{hdim, vdim}, cuda);
++  weight.CopyDataFromHostPtr(we, hdim * vdim);
++
++  const float bia[hdim] = {1.0f, 1.0f, 1.0f};
++  singa::Tensor bias(singa::Shape{hdim}, cuda);
++  bias.CopyDataFromHostPtr(bia, hdim);
++
++  dense.set_weight(weight);
++  dense.set_bias(bias);
++
++  singa::Tensor out1 = dense.Forward(singa::kTrain, in);
++
++  // grad
++  const float dy[batchsize * hdim] = {1.0f, 1.0f, 1.0f, 2.0f, 2.0f,
++                                      2.0f, 3.0f, 3.0f, 3.0f};
++  singa::Tensor grad(singa::Shape{batchsize, hdim}, cuda);
++  grad.CopyDataFromHostPtr(dy, batchsize * hdim);
++
++  const auto ret = dense.Backward(singa::kTrain, grad);
++  singa::Tensor in_grad = ret.first;
++  singa::Tensor dweight = ret.second.at(0);
++  singa::Tensor dbias = ret.second.at(1);
++  in_grad.ToHost();
++  const float *dx = in_grad.data<float>();
++  EXPECT_EQ(6u, in_grad.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 2; j++)
++      EXPECT_FLOAT_EQ(
++          (dy[i * 3 + 0] * we[0 * 2 + j] + dy[i * 3 + 1] * we[1 * 2 + j] +
++           dy[i * 3 + 2] * we[2 * 2 + j]),
++          dx[i * 2 + j]);
++  dweight.ToHost();
++  const float *dweightx = dweight.data<float>();
++  EXPECT_EQ(6u, dweight.Size());
++  for (int i = 0; i < 3; i++)
++    for (int j = 0; j < 2; j++)
++      EXPECT_FLOAT_EQ(
++          (dy[0 * 3 + i] * x[0 * 2 + j] + dy[1 * 3 + i] * x[1 * 2 + j] +
++           dy[2 * 3 + i] * x[2 * 2 + j]),
++          dweightx[i * 2 + j]);
++  dbias.ToHost();
++  const float *dbiasx = dbias.data<float>();
++  EXPECT_EQ(3u, dbias.Size());
++  for (int i = 0; i < 3; i++)
++    EXPECT_FLOAT_EQ((dy[0 * 3 + i] + dy[1 * 3 + i] + dy[2 * 3 + i]), dbiasx[i]);
++}
++#endif
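
The Dense tests compute their expected values by hand: y = x * W^T + b for the forward pass, and dx = dy * W, dW = dy^T * x, db = column sums of dy for the backward pass. A host-only sketch of those loops with the same batchsize=3, vdim=2, hdim=3 fixtures (plain C++, no SINGA dependency):

// Reference math behind the Dense forward/backward expectations above.
#include <cstdio>

int main() {
  const int batch = 3, vdim = 2, hdim = 3;
  const float x[batch * vdim] = {1, 2, 3, 4, 5, 6};
  const float w[hdim * vdim] = {1, 1, 1, 2, 0, 1};   // row-major, hdim x vdim
  const float b[hdim] = {1, 1, 1};
  const float dy[batch * hdim] = {1, 1, 1, 2, 2, 2, 3, 3, 3};

  float y[batch * hdim], dx[batch * vdim] = {0};
  float dw[hdim * vdim] = {0}, db[hdim] = {0};

  // forward: y[i][j] = sum_k x[i][k] * w[j][k] + b[j]
  for (int i = 0; i < batch; i++)
    for (int j = 0; j < hdim; j++) {
      y[i * hdim + j] = b[j];
      for (int k = 0; k < vdim; k++)
        y[i * hdim + j] += x[i * vdim + k] * w[j * vdim + k];
    }
  // backward: dx = dy * W, dW = dy^T * x, db = column sums of dy
  for (int i = 0; i < batch; i++)
    for (int k = 0; k < vdim; k++)
      for (int j = 0; j < hdim; j++)
        dx[i * vdim + k] += dy[i * hdim + j] * w[j * vdim + k];
  for (int j = 0; j < hdim; j++)
    for (int k = 0; k < vdim; k++)
      for (int i = 0; i < batch; i++)
        dw[j * vdim + k] += dy[i * hdim + j] * x[i * vdim + k];
  for (int j = 0; j < hdim; j++)
    for (int i = 0; i < batch; i++) db[j] += dy[i * hdim + j];

  for (int i = 0; i < batch * hdim; i++) std::printf("y[%d]=%g\n", i, y[i]);
  return 0;
}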

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_flatten.cc
----------------------------------------------------------------------
diff --cc test/singa/test_flatten.cc
index 0000000,2a77272..25e00c4
mode 000000,100644..100644
--- a/test/singa/test_flatten.cc
+++ b/test/singa/test_flatten.cc
@@@ -1,0 -1,145 +1,143 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "../src/model/layer/flatten.h"
+ #include "gtest/gtest.h"
+ 
+ using singa::Flatten;
+ using singa::Shape;
+ TEST(Flatten, Setup) {
+   Flatten flt;
+   EXPECT_EQ("Flatten", flt.layer_type());
+ 
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(1);
+ 
+   flt.Setup(Shape{2}, conf);
+   EXPECT_EQ(1, flt.Axis());
+ }
+ 
+ TEST(Flatten, ForwardCPU) {
+   const float x[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                      1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   singa::Shape s = {2, 1, 3, 2};
+   singa::Tensor in(s);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   int axis = 3;
+   Flatten flt;
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(axis);
+   flt.Setup(Shape{1, 3, 2}, conf);
+ 
+   singa::Tensor out = flt.Forward(singa::kTrain, in);
+   EXPECT_EQ(n, out.Size());
+   EXPECT_EQ(6u, out.shape(0));
+   EXPECT_EQ(2u, out.shape(1));
+   const float *yptr = out.data<float>();
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);
+ }
+ 
+ TEST(Flatten, BackwardCPU) {
+   // directly use input as the output_grad for backward
+   // note that only the shape of input really matters
+   const float dy[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                       1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+   size_t n = sizeof(dy) / sizeof(float);
+   singa::Tensor in(singa::Shape{2, 1, 3, 2});
+   in.CopyDataFromHostPtr<float>(dy, n);
+ 
+   int axis = 2;
+   Flatten flt;
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(axis);
+   flt.Setup(Shape{1, 3, 2}, conf);
+ 
+   singa::Tensor temp = flt.Forward(singa::kTrain, in);
+   const auto out = flt.Backward(singa::kTrain, temp);
+   const float *xptr = out.first.data<float>();
+   EXPECT_EQ(n, out.first.Size());
+   EXPECT_EQ(2u, out.first.shape(0));
+   EXPECT_EQ(1u, out.first.shape(1));
+   EXPECT_EQ(3u, out.first.shape(2));
+   EXPECT_EQ(2u, out.first.shape(3));
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(Flatten, ForwardGPU) {
+   const float x[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                      1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{2, 1, 3, 2}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>();
++  singa::Tensor in(singa::Shape{2, 1, 3, 2}, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   int axis = 3;
+   Flatten flt;
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(axis);
+   flt.Setup(Shape{1, 3, 2}, conf);
+ 
+   singa::Tensor out = flt.Forward(singa::kTrain, in);
 -  singa::CppCPU host(0, 1);
 -  out.ToDevice(&host);
++  out.ToHost();
+   EXPECT_EQ(n, out.Size());
+   EXPECT_EQ(6u, out.shape(0));
+   EXPECT_EQ(2u, out.shape(1));
+   const float *yptr = out.data<float>();
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(x[i], yptr[i]);
+ }
+ 
+ TEST(Flatten, BackwardGPU) {
+   // directly use input as the output_grad for backward
+   // note that only the shape of input really matters
+   const float dy[] = {1.f,  2.f,   3.f, -2.f,  -3.f, -4.f,
+                       1.5f, -1.5f, 0.f, -0.5f, -2.f, -1.f};
+   size_t n = sizeof(dy) / sizeof(float);
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{2, 1, 3, 2}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>();
++  singa::Tensor in(singa::Shape{2, 1, 3, 2}, cuda);
+   in.CopyDataFromHostPtr<float>(dy, n);
+ 
+   int axis = 2;
+   Flatten flt;
+   singa::LayerConf conf;
+   singa::FlattenConf *flattenconf = conf.mutable_flatten_conf();
+   flattenconf->set_axis(axis);
+   flt.Setup(Shape{1, 3, 2}, conf);
+ 
+   singa::Tensor out = flt.Forward(singa::kTrain, in);
+   const auto ret = flt.Backward(singa::kTrain, out);
 -  singa::CppCPU host(0, 1);
+   singa::Tensor in_diff = ret.first;
 -  in_diff.ToDevice(&host);
++  in_diff.ToHost();
+   const float *xptr = in_diff.data<float>();
+   EXPECT_EQ(n, in_diff.Size());
+   EXPECT_EQ(2u, in_diff.shape(0));
+   EXPECT_EQ(1u, in_diff.shape(1));
+   EXPECT_EQ(3u, in_diff.shape(2));
+   EXPECT_EQ(2u, in_diff.shape(3));
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dy[i], xptr[i]);
+ }
+ #endif // USE_CUDA
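
The Flatten expectations imply a simple shape rule: dimensions before `axis` collapse into the first output dimension and the remaining dimensions into the second, so {2, 1, 3, 2} with axis=3 becomes {6, 2}. A small sketch of that rule, inferred from the test expectations rather than copied from the layer implementation:

// Shape rule implied by the Flatten tests above.
#include <cstddef>
#include <cstdio>
#include <vector>

std::vector<size_t> FlattenShape(const std::vector<size_t>& in, size_t axis) {
  size_t rows = 1, cols = 1;
  for (size_t i = 0; i < in.size(); i++)
    (i < axis ? rows : cols) *= in[i];  // dims [0, axis) -> rows, rest -> cols
  return {rows, cols};
}

int main() {
  auto s = FlattenShape({2, 1, 3, 2}, 3);
  std::printf("%zu x %zu\n", s[0], s[1]);  // prints 6 x 2
  return 0;
}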

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_initializer.cc
----------------------------------------------------------------------
diff --cc test/singa/test_initializer.cc
index 0000000,e99cd79..4631af2
mode 000000,100644..100644
--- a/test/singa/test_initializer.cc
+++ b/test/singa/test_initializer.cc
@@@ -1,0 -1,148 +1,148 @@@
+ /**
+  * Licensed to the Apache Software Foundation (ASF) under one
+  * or more contributor license agreements.  See the NOTICE file
+  * distributed with this work for additional information
+  * regarding copyright ownership.  The ASF licenses this file
+  * to you under the Apache License, Version 2.0 (the
+  * "License"); you may not use this file except in compliance
+  * with the License.  You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
+ 
+ #include "singa/model/initializer.h"
+ #include "gtest/gtest.h"
+ 
+ TEST(Initializer, Constant) {
+   singa::init::Constant x;
+   size_t n = 10;
+   singa::Tensor t(singa::Shape{n});
+   singa::FillerConf conf;
+   conf.set_value(3.1f);
+   x.Setup(conf);
+   x.Fill(&t);
+   const float* xPtr = t.data<float>();
+   for (size_t i = 0; i < n; i++)
+     EXPECT_FLOAT_EQ(xPtr[i], 3.1f);
+ }
+ 
+ 
+ TEST(Initializer, Gaussian) {
+   singa::init::Gaussian x;
+   size_t n = 1000;
+   singa::Tensor t(singa::Shape{n});
+   singa::FillerConf conf;
+   conf.set_mean(0.11f);
+   conf.set_std(0.01f);
+   x.Setup(conf);
+   x.Fill(&t);
+   const float* xPtr = t.data<float>();
+   float mean = 0.0f, std = 0.0f;
+   for (size_t i = 0; i < n; i++)
+     mean += xPtr[i];
+   mean /= n;
+   EXPECT_NEAR(mean, 0.11f, 1e-3);
+   for (size_t i = 0; i < n; i++)
+     std += (xPtr[i] - mean) * (xPtr[i] - mean);
+   std /= n;
+   std = sqrt(std);
+   EXPECT_NEAR(std, 0.01f, 1e-3);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(Initializer, ConstantCUDA) {
+   singa::init::Constant x;
 -  singa::CudaGPU dev;
++  auto dev = std::make_shared<singa::CudaGPU>();
+   size_t n = 10;
 -  singa::Tensor t(singa::Shape{n}, &dev);
++  singa::Tensor t(singa::Shape{n}, dev);
+   singa::FillerConf conf;
+   conf.set_value(3.1f);
+   x.Setup(conf);
+   x.Fill(&t);
+   t.ToHost();
+   const float* xPtr = t.data<float>();
+   for (size_t i = 0; i < n; i++)
+     EXPECT_FLOAT_EQ(xPtr[i], 3.1f);
+ 
+ 
+   singa::init::Constant y(-0.1f);
 -  singa::Tensor s(singa::Shape{n}, &dev);
++  singa::Tensor s(singa::Shape{n}, dev);
+   y.Fill(&s);
+   s.ToHost();
+   const float* sPtr = s.data<float>();
+   for (size_t i = 0; i < n; i++)
+     EXPECT_FLOAT_EQ(sPtr[i], -0.1f);
+ }
+ 
+ 
+ TEST(Initializer, GaussianCUDA) {
+   singa::init::Gaussian x;
 -  singa::CudaGPU dev;
++  auto dev = std::make_shared<singa::CudaGPU>();
+   size_t n = 1000;
 -  singa::Tensor t(singa::Shape{n}, &dev);
++  singa::Tensor t(singa::Shape{n}, dev);
+   singa::FillerConf conf;
+   conf.set_mean(0.11f);
+   conf.set_std(0.01f);
+   x.Setup(conf);
+   x.Fill(&t);
+   t.ToHost();
+   const float* tPtr = t.data<float>();
+   float mean = 0.0f, std = 0.0f;
+   for (size_t i = 0; i < n; i++)
+     mean += tPtr[i];
+   mean /= n;
+   EXPECT_NEAR(mean, 0.11f, 1e-2);
+   for (size_t i = 0; i < n; i++)
+     std += (tPtr[i] - mean) * (tPtr[i] - mean);
+   std /= n;
+   std = sqrt(std);
+   EXPECT_NEAR(std, 0.01f, 1e-2);
+ 
+ 
+   singa::init::Gaussian y(1.5f, 0.1f);
 -  singa::Tensor s(singa::Shape{n}, &dev);
++  singa::Tensor s(singa::Shape{n}, dev);
+   y.Fill(&s);
+   s.ToHost();
+   const float* sPtr = s.data<float>();
+   for (size_t i = 0; i < n; i++)
+     mean += sPtr[i];
+   mean /= n;
+   EXPECT_NEAR(mean, 1.5f, 0.1f);
+   for (size_t i = 0; i < n; i++)
+     std += (sPtr[i] - mean) * (sPtr[i] - mean);
+   std /= n;
+   std = sqrt(std);
+   EXPECT_NEAR(std, 0.1f, 0.1f);
+ }
+ 
+ TEST(Initializer, XavierCUDA) {
+   singa::init::Constant x;
 -  singa::CudaGPU dev;
++  auto dev = std::make_shared<singa::CudaGPU>();
+   size_t m = 30, n=40;
 -  singa::Tensor t(singa::Shape{m, n}, &dev);
++  singa::Tensor t(singa::Shape{m, n}, dev);
+   x.Fill(&t);
+   t.ToHost();
+   const float* xPtr = t.data<float>();
+   float mean = 0.0f;
+   float high = -100.0f, low = 100.0f;
+   for (size_t i = 0; i < n; i++) {
+     mean += xPtr[i];
+     if (high < xPtr[i])
+       high = xPtr[i];
+     if (low > xPtr[i])
+       low = xPtr[i];
+   }
+   mean /= m * n;
+   EXPECT_NEAR(mean, 0, 1e-2);
+   float scale = sqrt(6.0f / (m + n));
+   EXPECT_LT(high, scale);
+   EXPECT_GT(low, -scale);
+ }
+ 
+ #endif
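
The XavierCUDA test checks the Glorot-uniform property: samples drawn from [-s, s] with s = sqrt(6 / (fan_in + fan_out)) and a sample mean near zero. A host-only sketch of that fill and check; the RNG and seed here are illustrative, not SINGA's:

// Xavier (Glorot uniform) fill and the mean/range check used above.
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  const size_t m = 30, n = 40;
  const float scale = std::sqrt(6.0f / (m + n));
  std::mt19937 rng(42);  // illustrative seed, not SINGA's generator
  std::uniform_real_distribution<float> dist(-scale, scale);

  std::vector<float> t(m * n);
  float mean = 0.f, high = -100.f, low = 100.f;
  for (auto& v : t) {
    v = dist(rng);
    mean += v;
    if (v > high) high = v;
    if (v < low) low = v;
  }
  mean /= m * n;
  std::printf("mean=%g high=%g low=%g scale=%g\n", mean, high, low, scale);
  return 0;
}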

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_memory.cc
----------------------------------------------------------------------
diff --cc test/singa/test_memory.cc
index 90fc99a,0000000..b0df226
mode 100644,000000..100644
--- a/test/singa/test_memory.cc
+++ b/test/singa/test_memory.cc
@@@ -1,111 -1,0 +1,104 @@@
 +/************************************************************
 +*
 +* Licensed to the Apache Software Foundation (ASF) under one
 +* or more contributor license agreements.  See the NOTICE file
 +* distributed with this work for additional information
 +* regarding copyright ownership.  The ASF licenses this file
 +* to you under the Apache License, Version 2.0 (the
 +* "License"); you may not use this file except in compliance
 +* with the License.  You may obtain a copy of the License at
- * 
++*
 +*   http://www.apache.org/licenses/LICENSE-2.0
- * 
++*
 +* Unless required by applicable law or agreed to in writing,
 +* software distributed under the License is distributed on an
 +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 +* KIND, either express or implied.  See the License for the
 +* specific language governing permissions and limitations
 +* under the License.
 +*
 +*************************************************************/
 +
 +#include "gtest/gtest.h"
 +#include "singa/utils/logging.h"
 +#include "singa/core/memory.h"
 +#include "singa/singa_config.h"
++#include "singa/utils/timer.h"
 +#include <sys/time.h>
 +
 +#ifdef USE_CUDA
 +TEST(CnmemPool, PoolInit) {
-       singa::CnMemPool pool;
-       pool.InitPool();
++  singa::CnMemPool pool;
++  pool.InitPool();
 +}
 +
 +TEST(CnmemPool, PoolInitAll) {
-       singa::CnMemPool pool;
-       int nDevices;
-       cudaGetDeviceCount(&nDevices);
-       CHECK_GE(nDevices,1);
-       pool.InitPool(nDevices,1000000U,0);
++  singa::CnMemPool pool;
++  int nDevices;
++  cudaGetDeviceCount(&nDevices);
++  CHECK_GE(nDevices, 1);
++  pool.InitPool(nDevices, 32, 0);
 +}
 +
 +TEST(CnmemPool, UsePool) {
-       singa::CnMemPool pool;
-       pool.InitPool();
-       int numOfTests = 10;
-       int numOfWriteVsRead = 3;
-       int allocSize = 1000000U;
-       for(int i = 0; i < numOfTests; i++) {
-               int** memPtrs = new int*[numOfWriteVsRead];
-               for(int j = 0; j < numOfWriteVsRead; j++) {
-                       pool.Malloc((void**)(&memPtrs[j]), allocSize); 
-               }
-               pool.Free(memPtrs[0]);
-               delete[] memPtrs;
-       }
++  singa::CnMemPool pool;
++  pool.InitPool();
++  int numOfTests = 10;
++  int numOfWriteVsRead = 3;
++  int allocSize = 32;
++  for (int i = 0; i < numOfTests; i++) {
++    int** memPtrs = new int* [numOfWriteVsRead];
++    for (int j = 0; j < numOfWriteVsRead; j++) {
++      pool.Malloc((void**)(&memPtrs[j]), allocSize);
++    }
++    pool.Free(memPtrs[0]);
++    delete[] memPtrs;
++  }
 +}
 +
 +TEST(CudaMemPool, UsePool) {
-       singa::CudaMemPool pool;
-       int numOfTests = 10;
-       int numOfWriteVsRead = 3;
-       int allocSize = 1000000U;
-       for(int i = 0; i < numOfTests; i++) {
-               int** memPtrs = new int*[numOfWriteVsRead];
-               for(int j = 0; j < numOfWriteVsRead; j++) {
-                       pool.Malloc((void**)(&memPtrs[j]), allocSize); 
-               }
-               pool.Free(memPtrs[0]);
-               delete[] memPtrs;
-       }
++  singa::CudaMemPool pool;
++  int numOfTests = 10;
++  int numOfWriteVsRead = 3;
++  int allocSize = 32;
++  for (int i = 0; i < numOfTests; i++) {
++    int** memPtrs = new int* [numOfWriteVsRead];
++    for (int j = 0; j < numOfWriteVsRead; j++) {
++      pool.Malloc((void**)(&memPtrs[j]), allocSize);
++    }
++    pool.Free(memPtrs[0]);
++    delete[] memPtrs;
++  }
 +}
 +
 +TEST(MemPool, CompareCudaCnmem) {
-       singa::CudaMemPool cudaPool;
-       singa::CnMemPool cnPool;
-       cnPool.InitPool();
++  singa::CudaMemPool cudaPool;
++  singa::CnMemPool cnPool;
++  cnPool.InitPool();
++
++  int numOfTests = 5000;
++  int allocSize = 32;
 +
-       int numOfTests = 5000;
-       int allocSize = 1000000U;
-       struct timeval start,end;
-       double t1,t2;
++  singa::DeviceMemPool* pool = NULL;
++  pool = &cnPool;
 +
-       singa::DeviceMemPool* pool = NULL;
-       pool = &cnPool;
-       
-       gettimeofday(&start,NULL);
-       for(int i = 0; i < numOfTests; i++) {
-               int* memPtrs = NULL;
-               pool->Malloc((void**)&memPtrs, allocSize);      
-               pool->Free(memPtrs);
-       }
-       gettimeofday(&end,NULL);
-       
-       t1 = start.tv_sec * 1000 + start.tv_usec/1000;
-       t2 = end.tv_sec * 1000 + end.tv_usec/1000;
-       LOG(INFO) << "cnmem memory time: " << t2-t1 << " ms" << std::endl;
++  singa::Timer tick;
++  for (int i = 0; i < numOfTests; i++) {
++    int* memPtrs = NULL;
++    pool->Malloc((void**)&memPtrs, allocSize);
++    pool->Free(memPtrs);
++  }
++  tick.Tick();
++  int cn_time = tick.Elapsed();
 +
-       pool = &cudaPool;
-       gettimeofday(&start,NULL);
-       for(int i = 0; i < numOfTests; i++) {
-               int* memPtrs = NULL;
-               pool->Malloc((void**)&memPtrs, allocSize); 
-               pool->Free(memPtrs);
-       }
-       gettimeofday(&end,NULL);
-       
-       t1 = start.tv_sec * 1000 + start.tv_usec/1000;
-       t2 = end.tv_sec * 1000 + end.tv_usec/1000;
-       LOG(INFO) << "cuda memory time: " << t2-t1 << " ms" << std::endl;
++  pool = &cudaPool;
++  for (int i = 0; i < numOfTests; i++) {
++    int* memPtrs = NULL;
++    pool->Malloc((void**)&memPtrs, allocSize);
++    pool->Free(memPtrs);
++  }
++  tick.Tick();
++  int cuda_time = tick.Elapsed();
++  EXPECT_GE(cuda_time, cn_time);
 +}
- #endif // USE_CUDA
++#endif  // USE_CUDA
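
The rewritten MemPool.CompareCudaCnmem test times 5000 Malloc/Free round trips through each pool with singa::Timer and expects the cnmem pool to be no slower than raw CUDA allocation. A minimal std::chrono harness showing the same measure-two-allocators pattern; plain host operator new and malloc stand in for the GPU pools so the sketch runs without a device, and this is not the SINGA API:

// Timing two allocators back to back, analogous to the test above.
#include <chrono>
#include <cstdio>
#include <cstdlib>

template <typename Alloc, typename Free>
long long TimeLoop(int iters, size_t bytes, Alloc alloc, Free release) {
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < iters; i++) {
    void* p = alloc(bytes);
    release(p);
  }
  auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
}

int main() {
  const int iters = 5000;
  const size_t bytes = 32;
  auto t1 = TimeLoop(iters, bytes,
                     [](size_t n) { return ::operator new(n); },
                     [](void* p) { ::operator delete(p); });
  auto t2 = TimeLoop(iters, bytes,
                     [](size_t n) { return std::malloc(n); },
                     [](void* p) { std::free(p); });
  std::printf("operator new: %lld us, malloc: %lld us\n", t1, t2);
  return 0;
}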

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_mse.cc
----------------------------------------------------------------------
diff --cc test/singa/test_mse.cc
index d2c5125,928be9d..788652f
--- a/test/singa/test_mse.cc
+++ b/test/singa/test_mse.cc
@@@ -22,8 -22,9 +22,8 @@@
  #include "gtest/gtest.h"
  #include "singa/core/tensor.h"
  #include "singa/core/device.h"
- #include "../src/model/loss/mse.h"
- #include "singa/singa_config.h"
+ #include "singa/model/loss.h"
 -#include "singa_config.h"
+ 
  using singa::Tensor;
  class TestMSE : public ::testing::Test {
   protected:
@@@ -68,14 -69,14 +68,14 @@@ TEST_F(TestMSE, CppBackward) 
  #endif
  #ifdef USE_CUDA
  TEST_F(TestMSE, CudaForward) {
 -  singa::MSE mse;
 -  singa::CudaGPU dev;
 -  p.ToDevice(&dev);
 -  t.ToDevice(&dev);
 -  Tensor loss = mse.Forward(p, t);
 +  singa::MSE* mse = new singa::MSE();
 +  auto dev = std::make_shared<singa::CudaGPU>();
 +  p.ToDevice(dev);
 +  t.ToDevice(dev);
 +  Tensor loss = mse->Forward(p, t);
  
    loss.ToHost();
-   auto ldat = loss.data<const float*>();
+   auto ldat = loss.data<float>();
  
    for (size_t i = 0, k = 0; i < loss.Size(); i++) {
      float l = 0.f;
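
The CudaForward test copies the loss back to the host and compares each per-sample value against an accumulated squared difference. A sketch of such a per-sample reference; the prediction/target values and the 0.5 scaling here are assumptions for illustration, since the full loop is not shown in this hunk:

// Per-sample squared-error accumulation in the style of the loop above.
#include <cstdio>
#include <vector>

int main() {
  const size_t batch = 2, dim = 3;
  const float p[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};  // made-up predictions
  const float t[] = {0.0f, 0.2f, 0.5f, 0.4f, 0.1f, 0.6f};  // made-up targets

  std::vector<float> loss(batch, 0.f);
  for (size_t i = 0, k = 0; i < batch; i++) {
    float l = 0.f;
    for (size_t j = 0; j < dim; j++, k++) {
      float d = p[k] - t[k];
      l += d * d;
    }
    loss[i] = 0.5f * l;  // assumed scaling; follow SINGA's MSE definition in practice
  }
  for (size_t i = 0; i < batch; i++) std::printf("loss[%zu]=%g\n", i, loss[i]);
  return 0;
}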

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_nesterov.cc
----------------------------------------------------------------------
diff --cc test/singa/test_nesterov.cc
index 0000000,35b2b4d..73f69f4
mode 000000,100644..100644
--- a/test/singa/test_nesterov.cc
+++ b/test/singa/test_nesterov.cc
@@@ -1,0 -1,101 +1,101 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
 -#include "singa_config.h"
++#include "singa/singa_config.h"
+ 
+ TEST(Nesterov, ApplyCPU) {
+   singa::Nesterov nesterov;
+   float lr = 0.1f;
+   auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+   nesterov.SetMomentumGenerator(func);
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   nesterov.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   const float* newv1 = v1.data<float>();
+   float history[4], tmp[4];
+   for (int i = 0; i < 4; ++i) {
+     history[i] = g[i] * lr;
+     tmp[i] = history[i] * (1 + func(0));
+   }
+   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   nesterov.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i) {
+     tmp[i] = history[i];
+     history[i] = history[i] * func(1) + g[i] * lr;
+     tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+   }
+ 
+   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(Nesterov, ApplyCUDA) {
+   singa::Nesterov nesterov;
+   float lr = 0.1f;
+   auto func = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
+   nesterov.SetMomentumGenerator(func);
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
 -  singa::CudaGPU dev;
 -  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   nesterov.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   v1.ToHost();
+   const float* newv1 = v1.data<float>();
+   float history[4], tmp[4];
+   for (int i = 0; i < 4; ++i) {
+     history[i] = g[i] * lr;
+     tmp[i] = history[i] * (1 + func(0));
+   }
+   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv1[i], v[i] - tmp[i]);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   nesterov.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   v2.ToHost();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i) {
+     tmp[i] = history[i];
+     history[i] = history[i] * func(1) + g[i] * lr;
+     tmp[i] = history[i] * (1 + func(1)) - tmp[i] * func(1);
+   }
+ 
+   for (int i = 0; i < 4; ++i) EXPECT_FLOAT_EQ(newv2[i], newv1[i] - tmp[i]);
+ }
+ #endif
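
Both Nesterov tests recompute the update on the host in two steps: history = mu*history + lr*g, then the applied change is (1 + mu)*history - mu*history_prev, subtracted from the value. A standalone restatement of those steps with the same fixtures:

// Host-side Nesterov momentum update verified by the tests above.
#include <cstdio>

int main() {
  const float lr = 0.1f;
  auto mu = [](int step) { return step <= 5 ? 0.5f : 0.9f; };
  float v[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  const float g[4] = {0.01f, 0.02f, 0.03f, 0.04f};
  float history[4] = {0.f, 0.f, 0.f, 0.f};

  for (int step = 0; step < 2; step++) {
    for (int i = 0; i < 4; i++) {
      float prev = history[i];
      history[i] = history[i] * mu(step) + g[i] * lr;
      // Nesterov correction: look ahead by (1 + mu), pull back by mu * prev
      float update = history[i] * (1 + mu(step)) - prev * mu(step);
      v[i] -= update;
    }
  }
  for (int i = 0; i < 4; i++) std::printf("v[%d]=%g\n", i, v[i]);
  return 0;
}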

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_prelu.cc
----------------------------------------------------------------------
diff --cc test/singa/test_prelu.cc
index 0000000,fee7c5b..dbf5ca6
mode 000000,100644..100644
--- a/test/singa/test_prelu.cc
+++ b/test/singa/test_prelu.cc
@@@ -1,0 -1,247 +1,245 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "../src/model/layer/prelu.h"
+ #include "gtest/gtest.h"
 -#include "singa_config.h"
++#include "singa/singa_config.h"
+ 
+ using singa::PReLU;
+ using singa::Shape;
+ TEST(PReLU, Setup) {
+   PReLU prelu;
+   EXPECT_EQ("PReLU", prelu.layer_type());
+ 
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(true);
+   preluconf->set_format("NHWC");
+ 
+   prelu.Setup(Shape{4}, conf);
+   EXPECT_EQ(true, prelu.Channel_shared());
+   EXPECT_EQ("NHWC", prelu.Format());
+ }
+ 
+ TEST(PReLU, ForwardCPU) {
+   const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                      -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batchsize = 2, c = 3, h = 2, w = 1;
+   singa::Tensor in(singa::Shape{batchsize, h, w, c});
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   PReLU prelu;
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(false);
+   preluconf->set_format("NHWC");
+   prelu.Setup(Shape{h, w, c}, conf);
+ 
+   const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+   singa::Tensor a(singa::Shape{c});
+   a.CopyDataFromHostPtr<float>(neg_slope, c);
+   prelu.Set_a(a);
+ 
+   singa::Tensor out = prelu.Forward(singa::kTrain, in);
+   const float *yptr = out.data<float>();
+   EXPECT_EQ(n, out.Size());
+ 
+   float *y = new float[n];
+   size_t div_factor = prelu.Channel_shared() ? c : 1;
+   if (prelu.Format() == "NCHW") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+     }
+   } else if (prelu.Format() == "NHWC") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+     }
+   }
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+ }
+ 
+ TEST(PReLU, BackwardCPU) {
+   const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                      -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batchsize = 2, c = 3, h = 2, w = 1;
+   singa::Tensor in(singa::Shape{batchsize, c, h, w});
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   PReLU prelu;
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(false);
+   preluconf->set_format("NCHW");
+   prelu.Setup(Shape{c, h, w}, conf);
+ 
+   const float neg_slope[] = {0.25f, 0.5f, 0.75f};
+   singa::Tensor a(singa::Shape{c});
+   a.CopyDataFromHostPtr<float>(neg_slope, c);
+   prelu.Set_a(a);
+ 
+   singa::Tensor out = prelu.Forward(singa::kTrain, in);
+ 
+   const float grad[] = {1.f, 2.f,  -2.f, -1.f, -1.f, -3.f,
+                         2.f, -2.f, 1.f,  1.f,  -2.f, 0.f};
+   singa::Tensor out_diff(singa::Shape{batchsize, c, h, w});
+   out_diff.CopyDataFromHostPtr<float>(grad, n);
+   const auto ret = prelu.Backward(singa::kTrain, out_diff);
+   const float *xptr = ret.first.data<float>();
+   const float *aptr = ret.second.at(0).data<float>();
+   float *dx = new float[n];
+   size_t div_factor = prelu.Channel_shared() ? c : 1;
+   size_t params = prelu.Channel_shared() ? 1 : c;
+   float da[] = {0.f, 0.f, 0.f};
+   if (prelu.Format() == "NCHW") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       dx[i] = grad[i] *
+               (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+     }
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       da[pos] += grad[i] * std::min(x[i], 0.f);
+     }
+   } else if (prelu.Format() == "NHWC") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       dx[i] = grad[i] *
+               (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+     }
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       da[pos] += grad[i] * std::min(x[i], 0.f);
+     }
+   }
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+   for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(PReLU, ForwardGPU) {
+   const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                          -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batchsize = 2, c = 3, h = 2, w = 1;
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{batchsize, h, w, c}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>();
++  singa::Tensor in(singa::Shape{batchsize, h, w, c}, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   PReLU prelu;
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(false);
+   preluconf->set_format("NHWC");
+   prelu.Setup(Shape{h, w, c}, conf);
+ 
+   const float neg_slope[] = {0.25f, 0.5f, 0.75f};
 -  singa::Tensor a(singa::Shape{c}, &cuda);
++  singa::Tensor a(singa::Shape{c}, cuda);
+   a.CopyDataFromHostPtr<float>(neg_slope, c);
+   prelu.Set_a(a);
+ 
+   singa::Tensor out = prelu.Forward(singa::kTrain, in);
 -  singa::CppCPU host(0, 1);
 -  out.ToDevice(&host);
++  out.ToHost();
+   const float *yptr = out.data<float>();
+   EXPECT_EQ(n, out.Size());
+ 
+   float *y = new float[n];
+   size_t div_factor = prelu.Channel_shared() ? c : 1;
+   if (prelu.Format() == "NCHW") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+     }
+   } else if (prelu.Format() == "NHWC") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       y[i] = std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f);
+     }
+   }
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(y[i], yptr[i]);
+ }
+ 
+ TEST(PReLU, BackwardGPU) {
+   const float x[] = {1.f,  2.f, 3.f,  -2.f, -3.f, -1.f,
+                            -1.f, 2.f, -1.f, -2.f, -2.f, -1.f};
+   size_t n = sizeof(x) / sizeof(float);
+   size_t batchsize = 2, c = 3, h = 2, w = 1;
 -  singa::CudaGPU cuda(0, 1);
 -  singa::Tensor in(singa::Shape{batchsize, c, h, w}, &cuda);
++  auto cuda = std::make_shared<singa::CudaGPU>();
++  singa::Tensor in(singa::Shape{batchsize, c, h, w}, cuda);
+   in.CopyDataFromHostPtr<float>(x, n);
+ 
+   PReLU prelu;
+   singa::LayerConf conf;
+   singa::PReLUConf *preluconf = conf.mutable_prelu_conf();
+   preluconf->set_channel_shared(false);
+   preluconf->set_format("NCHW");
+   prelu.Setup(Shape{c, h, w}, conf);
+ 
+   const float neg_slope[] = {0.25f, 0.5f, 0.75f};
 -  singa::Tensor a(singa::Shape{c}, &cuda);
++  singa::Tensor a(singa::Shape{c}, cuda);
+   a.CopyDataFromHostPtr<float>(neg_slope, c);
+   prelu.Set_a(a);
+ 
+   singa::Tensor out = prelu.Forward(singa::kTrain, in);
+   const float grad[] = {1.f, 2.f,  -2.f, -1.f, -1.f, -3.f,
+                           2.f, -2.f, 1.f,  1.f,  -2.f, 0.f};
 -  singa::Tensor out_diff(singa::Shape{batchsize, c, h, w}, &cuda);
++  singa::Tensor out_diff(singa::Shape{batchsize, c, h, w}, cuda);
+   out_diff.CopyDataFromHostPtr<float>(grad, n);
+   const auto ret = prelu.Backward(singa::kTrain, out_diff);
+ 
+   singa::Tensor in_diff = ret.first;
 -  singa::CppCPU host(0, 1);
 -  in_diff.ToDevice(&host);
++  in_diff.ToHost();
+   const float *xptr = in_diff.data<float>();
+   singa::Tensor a_diff = ret.second.at(0);
 -  a_diff.ToDevice(&host);
++  a_diff.ToHost();
+   const float *aptr = a_diff.data<float>();
+   float *dx = new float[n];
+   size_t div_factor = prelu.Channel_shared() ? c : 1;
+   size_t params = prelu.Channel_shared() ? 1 : c;
+   float da[] = {0.f, 0.f, 0.f};
+   if (prelu.Format() == "NCHW") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       dx[i] = grad[i] *
+                 (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+     }
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i / (h * w) % c / div_factor;
+       da[pos] += grad[i] * std::min(x[i], 0.f);
+     }
+   } else if (prelu.Format() == "NHWC") {
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       dx[i] = grad[i] *
+         (std::max(x[i], 0.f) + neg_slope[pos] * std::min(x[i], 0.f));
+     }
+     for (size_t i = 0; i < n; i++) {
+       size_t pos = i % c / div_factor;
+       da[pos] += grad[i] * std::min(x[i], 0.f);
+     }
+   }
+   for (size_t i = 0; i < n; i++) EXPECT_FLOAT_EQ(dx[i], xptr[i]);
+   for (size_t i = 0; i < params; i++) EXPECT_FLOAT_EQ(da[i], aptr[i]);
+ }
+ #endif
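
For reference, the expectation loops in both PReLU tests implement the same element-wise rule, y = max(x, 0) + a[channel] * min(x, 0), with the channel index derived from the tensor layout. Below is a minimal host-side sketch of that reference computation, assuming channel_shared == false (one slope per channel); the helper name PReLUReference and its signature are illustrative only and not part of the SINGA API.

    // Host-side reference for the PReLU forward rule checked by the tests above.
    // PReLUReference is a hypothetical helper, not a SINGA function.
    #include <algorithm>
    #include <cstddef>
    #include <string>
    #include <vector>

    std::vector<float> PReLUReference(const std::vector<float> &x,
                                      const std::vector<float> &a,
                                      size_t c, size_t h, size_t w,
                                      const std::string &format) {
      std::vector<float> y(x.size());
      for (size_t i = 0; i < x.size(); i++) {
        // The channel index depends on the memory layout of the 4D input.
        size_t ch = (format == "NCHW") ? (i / (h * w)) % c : i % c;
        y[i] = std::max(x[i], 0.f) + a[ch] * std::min(x[i], 0.f);
      }
      return y;
    }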

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_rmsprop.cc
----------------------------------------------------------------------
diff --cc test/singa/test_rmsprop.cc
index 0000000,004a9b6..18de9c3
mode 000000,100644..100644
--- a/test/singa/test_rmsprop.cc
+++ b/test/singa/test_rmsprop.cc
@@@ -1,0 -1,106 +1,105 @@@
+ /************************************************************
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *************************************************************/
+ 
+ #include "gtest/gtest.h"
+ #include "singa/model/optimizer.h"
 -#include "singa_config.h"
+ #include <cmath>
+ 
+ TEST(RMSProp, ApplyCPU) {
+   singa::RMSProp rmsprop;
+   float lr = 0.1f;
+   float rho = 0.9;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::OptimizerConf conf;
+   conf.set_rho(rho);
+   conf.set_delta(1E-8);
+ 
+   singa::Tensor value(singa::Shape{4}), grad(singa::Shape{4});
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   rmsprop.Setup(conf);
+   rmsprop.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   const float* newv1 = v1.data<float>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv1[i], v[i] - g[i] * lr / sqrt(history[i] + (float)1E-8),
+                 1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   rmsprop.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i)
+     history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+ 
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv2[i], newv1[i] - lr * g[i] / sqrt(history[i] + (float)1E-8),
+                 1e-5);
+ }
+ 
+ #ifdef USE_CUDA
+ TEST(RMSProp, ApplyCUDA) {
+   singa::RMSProp rmsprop;
+   float lr = 0.1f;
+   float rho = 0.02;
+   const float v[4] = {0.1, 0.2, 0.3, 0.4};
+   const float g[4] = {0.01, 0.02, 0.03, 0.04};
+ 
+   singa::OptimizerConf conf;
+   conf.set_rho(rho);
+   conf.set_delta(1e-8);
+ 
 -  singa::CudaGPU dev;
 -  singa::Tensor value(singa::Shape{4}, &dev), grad(singa::Shape{4}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  singa::Tensor value(singa::Shape{4}, dev), grad(singa::Shape{4}, dev);
+   value.CopyDataFromHostPtr(v, 4);
+   grad.CopyDataFromHostPtr(g, 4);
+ 
+   rmsprop.Setup(conf);
+   rmsprop.Apply(0, lr, "xx", grad, &value);
+ 
+   singa::Tensor v1 = value.Clone();
+   v1.ToHost();
+   const float* newv1 = v1.data<float>();
+   float history[4];
+   for (int i = 0; i < 4; ++i) history[i] = g[i] * g[i] * (1 - rho);
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv1[i], v[i] - lr * g[i] / sqrt(history[i] + conf.delta()),
+                 1e-5);
+ 
+   grad.CopyDataFromHostPtr(g, 4);
+   rmsprop.Apply(1, lr, "xx", grad, &value);
+   singa::Tensor v2 = value.Clone();
+   v2.ToHost();
+   const float* newv2 = v2.data<float>();
+   for (int i = 0; i < 4; ++i)
+     history[i] = history[i] * rho + g[i] * g[i] * (1 - rho);
+ 
+   for (int i = 0; i < 4; ++i)
+     EXPECT_NEAR(newv2[i],
+                 newv1[i] - lr * g[i] / sqrt(history[i] + conf.delta()), 1e-5);
+ }
+ #endif
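
Both RMSProp tests verify the same update against a host-side recomputation: the running average of squared gradients is history = rho * history + (1 - rho) * g^2 (starting from zero), and the parameter step is value -= lr * g / sqrt(history + delta). A compact sketch of that reference step follows; the free function rmsprop_step is illustrative only, not part of the SINGA optimizer API.

    // Reference RMSProp step mirrored by the EXPECT_NEAR checks above.
    // rmsprop_step is a hypothetical helper for illustration.
    #include <cmath>

    void rmsprop_step(float *value, float *history, const float *grad, int n,
                      float lr, float rho, float delta) {
      for (int i = 0; i < n; i++) {
        history[i] = rho * history[i] + (1.f - rho) * grad[i] * grad[i];
        value[i] -= lr * grad[i] / std::sqrt(history[i] + delta);
      }
    }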

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_sgd.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_tensor.cc
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/dd08f413/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --cc test/singa/test_tensor_math.cc
index 0f998c0,a40a848..f8d0351
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@@ -253,12 -507,21 +507,21 @@@ TEST_F(TestTensorMath, SumColumnsCpp) 
  }
  #endif
  #ifdef USE_CUDA
+ TEST_F(TestTensorMath, L2Cuda) {
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3, 2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  Tensor t(Shape{3, 2}, dev);
+   t.CopyDataFromHostPtr(dat1, 6);
+   float l2 = t.L2();
+   float target = 0.0f;
+   for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
+   EXPECT_FLOAT_EQ(l2, sqrt(target));
+ }
  TEST_F(TestTensorMath, MultCuda) {
    const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2, 2}, &dev);
 +  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2, 2}, dev);
    t.CopyDataFromHostPtr(x, 4);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    d.CopyDataFromHostPtr(dat1, 6);
    Tensor C = Mult(d, t);
    C.ToHost();
@@@ -302,20 -565,18 +565,20 @@@
        EXPECT_FLOAT_EQ(oPtr[i * 4 + j], x[i]);
      }
    }
-       d.ToHost();
-       p.ToHost();
++  d.ToHost();
++  p.ToHost();
  }
  
  TEST_F(TestTensorMath, AddColumnCuda) {
    const float x[3] = {1.0f, 2.0f, 3.0f};
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    t.CopyDataFromHostPtr(x, 3);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    AddColumn(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[i]);
@@@ -323,17 -584,16 +586,16 @@@
    }
  }
  
- 
  TEST_F(TestTensorMath, SubColumnCuda) {
    const float x[3] = {1.0f, 2.0f, 3.0f};
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    t.CopyDataFromHostPtr(x, 3);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    SubColumn(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[i]);
@@@ -357,14 -617,14 +619,14 @@@ TEST_F(TestTensorMath, MultColumnCpp) 
  #ifdef USE_CUDA
  TEST_F(TestTensorMath, MultColumnCuda) {
    const float x[3] = {1.0f, 2.0f, 3.0f};
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    t.CopyDataFromHostPtr(x, 3);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    MultColumn(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[i]);
@@@ -373,14 -633,14 +635,14 @@@
  }
  TEST_F(TestTensorMath, DivColumnCuda) {
    const float x[3] = {1.0f, 2.0f, 3.0f};
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    t.CopyDataFromHostPtr(x, 3);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    DivColumn(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[i]);
@@@ -389,14 -649,14 +651,14 @@@
  }
  TEST_F(TestTensorMath, AddRowCuda) {
    const float x[2] = {1.1f, 2.1f};
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    t.CopyDataFromHostPtr(x, 2);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    AddRow(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] + x[j]);
@@@ -405,14 -665,14 +667,14 @@@
  }
  TEST_F(TestTensorMath, SubRowCuda) {
    const float x[2] = {1.1f, 2.1f};
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    t.CopyDataFromHostPtr(x, 2);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    SubRow(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] - x[j]);
@@@ -421,14 -681,14 +683,14 @@@
  }
  TEST_F(TestTensorMath, MultRowCuda) {
    const float x[2] = {1.1f, 2.1f};
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    t.CopyDataFromHostPtr(x, 2);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    MultRow(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] * x[j]);
@@@ -452,14 -712,14 +714,14 @@@ TEST_F(TestTensorMath, DivRowCpp) 
  #ifdef USE_CUDA
  TEST_F(TestTensorMath, DivRowCuda) {
    const float x[2] = {1.1f, 2.1f};
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    t.CopyDataFromHostPtr(x, 2);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    DivRow(t, &d);
    d.ToHost();
-   const float *xptr = d.data<const float *>();
+   const float *xptr = d.data<float>();
    for (int i = 0; i < 3; i++) {
      for (int j = 0; j < 2; j++) {
        EXPECT_FLOAT_EQ(xptr[i * 2 + j], dat1[i * 2 + j] / x[j]);
@@@ -467,13 -727,13 +729,13 @@@
    }
  }
  TEST_F(TestTensorMath, SumRowsCuda) {
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{2}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{2}, dev);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    SumRows(d, &t);
    t.ToHost();
-   const float *tptr = t.data<const float *>();
+   const float *tptr = t.data<float>();
    for (int i = 0; i < 2; i++) {
      float tmp = 0;
      for (int j = 0; j < 3; j++) {
@@@ -481,16 -741,15 +743,16 @@@
      }
      EXPECT_FLOAT_EQ(tptr[i], tmp);
    }
-       d.ToHost();
++  d.ToHost();
  }
  TEST_F(TestTensorMath, SumColumnCuda) {
-       auto dev = std::make_shared<singa::CudaGPU>();
 -  singa::CudaGPU dev;
 -  Tensor t(Shape{3}, &dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
 +  Tensor t(Shape{3}, dev);
    d.CopyDataFromHostPtr(dat1, 6);
 -  d.ToDevice(&dev);
 +  d.ToDevice(dev);
    SumColumns(d, &t);
    t.ToHost();
-   const float *tptr = t.data<const float *>();
+   const float *tptr = t.data<float>();
    for (int i = 0; i < 3; i++) {
      float tmp = 0;
      for (int j = 0; j < 2; j++) {
@@@ -498,6 -757,120 +760,121 @@@
      }
      EXPECT_FLOAT_EQ(tptr[i], tmp);
    }
-       d.ToHost();
++  d.ToHost();
  }
+ 
+ #endif
+ 
+ TEST_F(TestTensorMath, ConcatenateRowsCpp) {
+   d.CopyDataFromHostPtr<float>(dat1, 6);
+   e.CopyDataFromHostPtr<float>(dat2, 6);
+   const auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+   EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+   EXPECT_EQ(ret.shape(1), d.shape(1));
+   const float *retPtr = ret.data<float>();
+   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+ }
+ 
+ TEST_F(TestTensorMath, ConcatenateColumnsCpp) {
+   d.CopyDataFromHostPtr<float>(dat1, 6);
+   e.CopyDataFromHostPtr<float>(dat2, 6);
+   const auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+   EXPECT_EQ(ret.shape(0), d.shape(0));
+   EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+ 
+   const float *retPtr = ret.data<float>();
+   for (int i = 0; i < 3; i++) {
+     for (int j = 0; j < 2; j++)
+       EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+     for (int j = 0; j < 2; j++)
+       EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+   }
+ }
+ 
+ TEST_F(TestTensorMath, CopyRowsCpp) {
+   const auto ret = singa::CopyRows(e, 1, 2);
+   EXPECT_EQ(ret.shape(0), 1u);
+   EXPECT_EQ(ret.shape(1), e.shape(1));
+   const float *retPtr = ret.data<float>();
+   for (size_t i = 0; i < ret.Size(); i++)
+     EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+ }
+ 
+ TEST_F(TestTensorMath, CopyColumnsCpp) {
+   a.Reshape(Shape{2, 3});
+   const auto ret = singa::CopyColumns(a, 1, 3);
+   EXPECT_EQ(ret.shape(0), a.shape(0));
+   EXPECT_EQ(ret.shape(1), 2u);
+   const float *retPtr = ret.data<float>();
+   for (size_t i = 0; i < ret.shape(0); i++)
+     for (size_t j = 0; j < ret.shape(1); j++)
+       EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+                       dat1[i * a.shape(1) + j + 1]);
+ }
+ 
+ #ifdef USE_CUDA
+ 
+ TEST_F(TestTensorMath, ConcatenateRowsCuda) {
 -  singa::CudaGPU dev;
 -  d.ToDevice(&dev);
 -  e.ToDevice(&dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  d.ToDevice(dev);
++  e.ToDevice(dev);
+   d.CopyDataFromHostPtr<float>(dat1, 6);
+   e.CopyDataFromHostPtr<float>(dat2, 6);
+   auto ret = singa::ConcatenateRows(vector<Tensor>{d, e});
+   EXPECT_EQ(ret.shape(0), d.shape(0) + e.shape(0));
+   EXPECT_EQ(ret.shape(1), d.shape(1));
+   ret.ToHost();
+   const float *retPtr = ret.data<float>();
+   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i], dat1[i]);
+   for (int i = 0; i < 6; i++) EXPECT_FLOAT_EQ(retPtr[i + 6], dat2[i]);
+ }
+ 
+ TEST_F(TestTensorMath, ConcatenateColumnsCuda) {
 -  singa::CudaGPU dev;
 -  d.ToDevice(&dev);
 -  e.ToDevice(&dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  d.ToDevice(dev);
++  e.ToDevice(dev);
+   d.CopyDataFromHostPtr<float>(dat1, 6);
+   e.CopyDataFromHostPtr<float>(dat2, 6);
+   auto ret = singa::ConcatenateColumns(vector<Tensor>{d, e});
+   ret.ToHost();
+   EXPECT_EQ(ret.shape(0), d.shape(0));
+   EXPECT_EQ(ret.shape(1), d.shape(1) + e.shape(1));
+ 
+   const float *retPtr = ret.data<float>();
+   for (int i = 0; i < 3; i++) {
+     for (int j = 0; j < 2; j++)
+       EXPECT_FLOAT_EQ(retPtr[i * 4 + j], dat1[i * 2 + j]);
+     for (int j = 0; j < 2; j++)
+       EXPECT_FLOAT_EQ(retPtr[i * 4 + 2 + j], dat2[i * 2 + j]);
+   }
+ }
+ 
+ TEST_F(TestTensorMath, CopyRowsCuda) {
 -  singa::CudaGPU dev;
 -  e.ToDevice(&dev);
++  auto dev = std::make_shared<singa::CudaGPU>();
++  e.ToDevice(dev);
+   auto ret = singa::CopyRows(e, 1, 2);
+   ret.ToHost();
+   EXPECT_EQ(ret.shape(0), 1u);
+   EXPECT_EQ(ret.shape(1), e.shape(1));
+   const float *retPtr = ret.data<float>();
+   for (size_t i = 0; i < ret.Size(); i++)
+     EXPECT_FLOAT_EQ(retPtr[i], dat1[1 * 2 + i]);
+ }
+ 
+ TEST_F(TestTensorMath, CopyColumnsCuda) {
 -  singa::CudaGPU dev;
++  auto dev = std::make_shared<singa::CudaGPU>();
+   a.Reshape(Shape{2, 3});
 -  a.ToDevice(&dev);
++  a.ToDevice(dev);
+   auto ret = singa::CopyColumns(a, 1, 3);
+   EXPECT_EQ(ret.shape(0), a.shape(0));
+   EXPECT_EQ(ret.shape(1), 2u);
+   ret.ToHost();
+   const float *retPtr = ret.data<float>();
+   for (size_t i = 0; i < ret.shape(0); i++)
+     for (size_t j = 0; j < ret.shape(1); j++)
+       EXPECT_FLOAT_EQ(retPtr[i * ret.shape(1) + j],
+                       dat1[i * a.shape(1) + j + 1]);
+ }
+ 
  #endif
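
The recurring pattern across these test changes is the switch from stack-allocated devices passed by raw pointer to std::shared_ptr device handles, together with the simpler ToHost() and data<float>() accessors. A condensed illustration is below; it is not taken verbatim from any one test, the function name DeviceHandleExample is hypothetical, and it assumes the singa/core/tensor.h and singa/core/device.h headers (and USE_CUDA) that these tests rely on.

    // Illustrative condensation of the device-handle migration in this commit.
    #include <memory>
    #include "singa/core/device.h"
    #include "singa/core/tensor.h"

    void DeviceHandleExample() {
      const float x[4] = {1.f, 2.f, 3.f, 4.f};
      // Old style (removed): singa::CudaGPU cuda(0, 1); Tensor t(shape, &cuda);
      // with results copied back via singa::CppCPU host(0, 1); t.ToDevice(&host);
      auto dev = std::make_shared<singa::CudaGPU>();  // shared device handle
      singa::Tensor t(singa::Shape{2, 2}, dev);       // tensor allocated on the GPU
      t.CopyDataFromHostPtr<float>(x, 4);
      t.ToHost();                                     // replaces ToDevice(&host)
      const float *ptr = t.data<float>();             // replaces data<const float *>()
      (void)ptr;
    }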
