[4/4] incubator-singa git commit: SINGA-380) Fix bugs from Reshape

wangsh Wed, 11 Jul 2018 01:29:54 -0700

SINGA-380) Fix bugs from Reshape

Update the reshape API in C++ and Python.
The C++ Tensor member method Reshape changes the original tensor in place;
all other reshape functions return a new tensor (which shares memory with the
original tensor if possible).


APIs for transpose are updated in the same way.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/b30d7ea5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/b30d7ea5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/b30d7ea5

Branch: refs/heads/master
Commit: b30d7ea55cd58bb0858aa354833c1ba9a3242470
Parents: 58e6640
Author: Wang Wei <[email protected]>
Authored: Mon Jul 9 23:52:10 2018 +0800
Committer: wang wei <[email protected]>
Committed: Wed Jul 11 15:24:27 2018 +0800

----------------------------------------------------------------------
 examples/autograd/mnist_cnn.py        |  17 +-
 examples/cifar10/cnn-parallel.cc      |   8 +-
 examples/cifar10/vgg-parallel.cc      |   8 +-
 examples/imagenet/alexnet/alexnet.cc  |   2 +-
 examples/imagenet/alexnet/ilsvrc12.h  |  16 +-
 include/singa/core/tensor.h           | 162 ++++----
 python/singa/autograd.py              | 273 +++++++-------
 python/singa/tensor.py                | 109 +++---
 src/api/core_tensor.i                 |  19 +-
 src/core/tensor/tensor.cc             | 297 ++++-----------
 src/core/tensor/tensor_math.h         |   2 +-
 src/core/tensor/tensor_math_cuda.h    | 323 ++++------------
 src/io/image_transformer.cc           | 573 ++++++++++++++---------------
 src/model/layer/batchnorm.cc          |  15 +-
 src/model/layer/convolution.cc        |   8 +-
 src/model/layer/cudnn_batchnorm.cc    |   4 +-
 src/model/layer/dense.cc              |  14 +-
 src/model/layer/flatten.cc            |   3 +-
 src/model/layer/lrn.cc                |   9 +-
 src/model/layer/opencl_convolution.cc |  58 +--
 src/model/layer/rnn.cc                |   2 +-
 src/model/operation/convolution.cc    |  67 ++--
 src/model/updater/local_updater.cc    |   4 +-
 23 files changed, 849 insertions(+), 1144 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/autograd/mnist_cnn.py
----------------------------------------------------------------------
diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py
index 43a22ba..f78ccc8 100755
--- a/examples/autograd/mnist_cnn.py
+++ b/examples/autograd/mnist_cnn.py
@@ -84,7 +84,7 @@ if __name__ == '__main__':
         dev = device.get_default_device()
     else:
         print('Using GPU')
-        dev = device.create_cuda_gpu()
+        dev = device.create_cuda_gpu_on(1)
 
     train, test = load_data(args.file_path)
 
@@ -92,7 +92,7 @@ if __name__ == '__main__':
     num_classes = 10
     epochs = 1
 
-    sgd = optimizer.SGD(0.001)
+    sgd = optimizer.SGD(0.01)
 
     x_train = preprocess(train[0])
     y_train = to_categorical(train[1], num_classes)
@@ -111,7 +111,6 @@ if __name__ == '__main__':
 
 
     def forward(x, t):
-        
         y = conv1(x)
         y = autograd.relu(y)
         y = autograd.max_pool_2d(y)
@@ -124,11 +123,11 @@ if __name__ == '__main__':
         return loss, y
 
     autograd.training = True
-    for epoch in range(50):
+    for epoch in range(epochs):
         for i in range(batch_number):
             inputs = tensor.Tensor(device=dev, data=x_train[ i * 100:(1 + i) * 
100], stores_grad=False)
             targets = tensor.Tensor(device=dev, data=y_train[i * 100:(1 + i) * 
100], requires_grad=False, stores_grad=False)
-            
+
             loss, y = forward(inputs, targets)
 
             accuracy_rate = accuracy(tensor.to_numpy(y),
@@ -136,12 +135,6 @@ if __name__ == '__main__':
             if (i % 5 == 0):
                 print('accuracy is:', accuracy_rate, 'loss is:',
                       tensor.to_numpy(loss)[0])
-            
+
             for p, gp in autograd.backward(loss):
                 sgd.apply(epoch, gp, p, '')
-            
-            
-
-            
-            
-

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/cifar10/cnn-parallel.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/cnn-parallel.cc b/examples/cifar10/cnn-parallel.cc
index 8cc3352..4bee575 100644
--- a/examples/cifar10/cnn-parallel.cc
+++ b/examples/cifar10/cnn-parallel.cc
@@ -154,20 +154,20 @@ void Train(float lr, int num_epoch, string data_dir) {
     train_y = train.second;
 
     LOG(INFO) << "Slicing training data...";
-    train_x_1.Reshape(Shape{nsamples / 2, train.first.shape(1),
+    train_x_1 = Tensor(Shape{nsamples / 2, train.first.shape(1),
         train.first.shape(2), train.first.shape(3)});
     LOG(INFO) << "Copying first data slice...";
     CopyDataToFrom(&train_x_1, train_x, train_x.Size() / 2);
-    train_x_2.Reshape(Shape{nsamples / 2, train.first.shape(1),
+    train_x_2 = Tensor(Shape{nsamples / 2, train.first.shape(1),
         train.first.shape(2), train.first.shape(3)});
     LOG(INFO) << "Copying second data slice...";
     CopyDataToFrom(&train_x_2, train_x, train_x.Size() / 2, 0,
                    train_x.Size() / 2);
-    train_y_1.Reshape(Shape{nsamples / 2});
+    train_y_1 = Tensor(Shape{nsamples / 2});
     train_y_1.AsType(kInt);
     LOG(INFO) << "Copying first label slice...";
     CopyDataToFrom(&train_y_1, train_y, train_y.Size() / 2);
-    train_y_2.Reshape(Shape{nsamples / 2});
+    train_y_2 = Tensor(Shape{nsamples / 2});
     train_y_2.AsType(kInt);
     LOG(INFO) << "Copying second label slice...";
     CopyDataToFrom(&train_y_2, train_y, train_y.Size() / 2, 0,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/cifar10/vgg-parallel.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/vgg-parallel.cc b/examples/cifar10/vgg-parallel.cc
index 90e9fce..33c533b 100644
--- a/examples/cifar10/vgg-parallel.cc
+++ b/examples/cifar10/vgg-parallel.cc
@@ -223,20 +223,20 @@ void Train(float lr, int num_epoch, string data_dir) {
     train_y = train.second;
 
     LOG(INFO) << "Slicing training data...";
-    train_x_1.Reshape(Shape{nsamples / 2, train.first.shape(1),
+    train_x_1 = Tensor(Shape{nsamples / 2, train.first.shape(1),
         train.first.shape(2), train.first.shape(3)});
     LOG(INFO) << "Copying first data slice...";
     CopyDataToFrom(&train_x_1, train_x, train_x.Size() / 2);
-    train_x_2.Reshape(Shape{nsamples / 2, train.first.shape(1),
+    train_x_2 = Tensor(Shape{nsamples / 2, train.first.shape(1),
         train.first.shape(2), train.first.shape(3)});
     LOG(INFO) << "Copying second data slice...";
     CopyDataToFrom(&train_x_2, train_x, train_x.Size() / 2, 0,
                    train_x.Size() / 2);
-    train_y_1.Reshape(Shape{nsamples / 2});
+    train_y_1 = Tensor(Shape{nsamples / 2});
     train_y_1.AsType(kInt);
     LOG(INFO) << "Copying first label slice...";
     CopyDataToFrom(&train_y_1, train_y, train_y.Size() / 2);
-    train_y_2.Reshape(Shape{nsamples / 2});
+    train_y_2 = Tensor(Shape{nsamples / 2});
     train_y_2.AsType(kInt);
     LOG(INFO) << "Copying second label slice...";
     CopyDataToFrom(&train_y_2, train_y, train_y.Size() / 2, 0,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/imagenet/alexnet/alexnet.cc
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/alexnet.cc 
b/examples/imagenet/alexnet/alexnet.cc
index 4ac1130..2d8db2d 100644
--- a/examples/imagenet/alexnet/alexnet.cc
+++ b/examples/imagenet/alexnet/alexnet.cc
@@ -174,7 +174,7 @@ void TrainOneEpoch(FeedForwardNet &net, ILSVRC &data,
   size_t b = 0;
   size_t n_read;
   Timer timer, ttr;
-  Tensor prefetch_x, prefetch_y;
+  Tensor prefetch_x(Shape{batchsize, 3, kCropSize, kCropSize}), 
prefetch_y(Shape{batchsize}, kInt);
   string binfile = bin_folder + "/train1.bin";
   timer.Tick();
   data.LoadData(kTrain, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/imagenet/alexnet/ilsvrc12.h
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/ilsvrc12.h 
b/examples/imagenet/alexnet/ilsvrc12.h
index 74fffbb..05b3451 100644
--- a/examples/imagenet/alexnet/ilsvrc12.h
+++ b/examples/imagenet/alexnet/ilsvrc12.h
@@ -43,6 +43,12 @@
 using std::string;
 using namespace singa::io;
 namespace singa {
+
+ /// size for resizing
+const size_t kImageSize = 256;
+const size_t kImageNBytes = 3 * kImageSize * kImageSize;
+/// size for cropping
+const size_t kCropSize = 227;
 /// For reading ILSVRC2012 image data as tensors.
 class ILSVRC {
  public:
@@ -105,11 +111,7 @@ class ILSVRC {
   void WriteMean(Tensor &mean, string path);
 
  private:
-  /// size for resizing
-  const size_t kImageSize = 256;
-  const size_t kImageNBytes = 3 * kImageSize * kImageSize;
-  /// size for cropping
-  const size_t kCropSize = 227;
+ 
   Tensor mean;
   string last_read_file = "";
 
@@ -299,9 +301,7 @@ std::thread ILSVRC::AsyncLoadData(int flag, string file, 
size_t read_size,
 
 size_t ILSVRC::LoadData(int flag, string file, size_t read_size, Tensor *x,
                         Tensor *y, size_t *n_read, int nthreads) {
-  x->Reshape(Shape{read_size, 3, kCropSize, kCropSize});
-  y->AsType(kInt);
-  y->Reshape(Shape{read_size});
+  
   if (file != last_read_file) {
     if (reader != nullptr) {
       reader->Close();

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 5921762..a73821c 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -57,47 +57,38 @@ class Tensor {
  public:
   ~Tensor();
   Tensor();
-  explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
+
+  /// Constructor using default device.
   explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
 
-  Tensor(Shape &&shape,
-         std::shared_ptr<Device> dev,
-         DataType dtype = kFloat32);
+  /// Constructor with shape, device and data type
   Tensor(const Shape &shape,
          std::shared_ptr<Device> dev,
          DataType dtype = kFloat32);
 
-  /// Copy Tensor to share the internal data.  No deep copy.
+  /// Copy constructor.  No deep copy.
   Tensor(const Tensor &from);
-  /// Copy Tensor to share the internal data.  No deep copy.
-  /// For 2 tensors sharing same block but different strides.
-  Tensor(const Tensor &from, Shape &new_shape, vector<int> &new_strides);
-  /// Copy Tensor to share the internal data.  No deep copy.
+
+  /// Move constructor.  No deep copy.
   Tensor(Tensor &&from);
 
+  // --------------------------------------------------------------------------
+  // ---Following methods return info of the class without making any changes--
+  // --------------------------------------------------------------------------
+
   /// For functions in xx_math.cc to access the block.
   /// Users should not operate against Block directly.
   /// block_ is allocated in constructors.
   Block *block() const { return block_; }
-  void SetBlock(Block *block);
 
   std::shared_ptr<Device> device() const { return device_; }
 
-  /// return immutable Tensor values with given type.
+  /// Return immutable Tensor values with given type.
   template <typename SType>
   const SType *data() const {
     return static_cast<const SType *>(block()->data());
   }
 
-  /// used for swig code to convert Tensor into numpy array.
-  /// It gets data into 'value'
-  template <typename SType>
-  void GetValue(SType *value, const size_t num) {
-    CHECK(device_ == defaultDevice);
-    const SType* ptr = data<SType>();
-    for (size_t i = 0; i < num; i++) value[i] = ptr[i];
-  }
-
   /// data type, including kFloat16, kFloat32, kInt
   const DataType data_type() const { return data_type_; }
 
@@ -113,28 +104,55 @@ class Tensor {
   bool empty() const { return nDim() == 0; }
 
   /// Check if the tensor's last stride==1
-  bool transpose() const { return (strides_.back() != 1); }
+  bool transpose() const {
+    if (!strides_.empty()) {
+      auto last = strides_.front();
+      for (auto s : strides_) {
+        if (s > last)
+          return true;
+        last = s;
+      }
+    }
+    return false;
+  }
 
   const vector<int>& strides() const { return strides_; }
 
-  /// return true if the content of the tensor is initialized
+  /// Return true if the content of the tensor is initialized
   bool initailized() const {
     return block_ != nullptr && block_->initialized();
   }
 
-  /// return number of total elements
+  /// Return number of total elements
   size_t Size() const {
     if (block_ == nullptr) return 0u;
     CHECK_EQ(block_->size() % SizeOf(data_type_), 0u);
     return block_->size() / SizeOf(data_type_);
   }
 
-  /// return memory size (i.e., Bytes)
+  /// Return memory size (i.e., Bytes)
   size_t MemSize() const { return block_->size(); }
 
-  /// Reset the tensor shape, it may reallocate block, if MemSize() changes.
-  Tensor Reshape(const Shape &shape);
-  Tensor Reshape(Shape &&shape);
+  /// used for swig code to convert Tensor into numpy array.
+  /// It gets data into 'value'
+  template <typename SType>
+  void GetValue(SType *value, const size_t num) {
+    CHECK(device_ == defaultDevice);
+    const SType* ptr = data<SType>();
+    for (size_t i = 0; i < num; i++) value[i] = ptr[i];
+  }
+
+  /// Serialize data, shape and transpose to protobuf object.
+  void ToProto(singa::TensorProto *proto) const;
+
+  /// Return average L1 norm
+  float L1() const;
+
+  /// Return average L2 norm
+  float L2() const;
+  // --------------------------------------------------------------------------
+  // ---Following methods changes the internal members
+  // --------------------------------------------------------------------------
 
   /// Reset the shape, device, and data type as given tensor.
   /// If block size changes, then reallocate a new block.
@@ -155,6 +173,8 @@ class Tensor {
   template <typename SType>
   void SetValue(const SType x);
 
+  void SetShape(const Shape& shape);
+
   /// For init the tensor values, copy 'num' elements from 'src' to the 
internal
   /// memory with 'offset' (elements).
   template <typename SType>
@@ -165,46 +185,41 @@ class Tensor {
   /// Meta data would not be copied!
   void CopyData(const Tensor &other);
 
-  void RepeatData(vector<size_t> repeats, int axis, int total_repeats, const 
Tensor &other);
-
   /// Deserialize data, shape and transpose from protobuf object.
   void FromProto(const singa::TensorProto &proto);
 
-  /// Serialize data, shape and transpose to protobuf object.
-  void ToProto(singa::TensorProto *proto) const;
 
-  /// return an exactly the same Tensor with data been deep copied to the given
-  /// device. If 'device' is nullptr, then clone it one the current device.
-  Tensor Clone(std::shared_ptr<Device> device = nullptr) const;
+  /// TODO(wangwei) merge RepeatData into  Repeat?
+  void RepeatData(const vector<size_t>& repeats, int axis, int total_repeats,
+                  const Tensor &other);
 
-  Tensor Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device> 
device = nullptr) ;
+  // --------------------------------------------------------------------------
+  // ---Following methods returns a new Tensor without change original tensor
+  // --------------------------------------------------------------------------
 
-  // Tensor operations
-
-  /// Matrix transpose.  Valid only if shape.size() == 2.
-  /// No data copy, just set the transpose_ filed of the returned tensor.
-  Tensor T() const;
-
-  /// Reverse the shape vector
-  Tensor Transpose() const;
+  Tensor Repeat(const vector<size_t>& repeats, int axis,
+                std::shared_ptr<Device> device = nullptr);
 
-  /// Change the axes
-  Tensor Transpose(const vector<size_t> &axes) const;
+  /// return an exactly the same Tensor with data been deep copied to the given
+  /// device. If 'device' is nullptr, then clone it one the current device.
+  Tensor Clone(std::shared_ptr<Device> device = nullptr) const;
 
-  /// Copy the meta info with data block shared.
+  // --------------------------------------------------------------------------
+  // ---Following methods change the tensor and return itself
+  // --------------------------------------------------------------------------
+  /// Copy assignment
   Tensor &operator=(const Tensor &in);
 
-  /// Copy the meta info with data block shared.
+  /// Move assignment
   Tensor &operator=(Tensor &&in);
 
   Tensor &operator+=(const Tensor &in);
-  // void operator+=(Tensor&& in);
+
   Tensor &operator-=(const Tensor &in);
-  // void operator-=(Tensor&& in);
+
   Tensor &operator*=(const Tensor &in);
-  // void operator*=(Tensor&& in);
+
   Tensor &operator/=(const Tensor &in);
-  // void operator/=(Tensor&& in);
 
   // Scalar operations.
 
@@ -224,10 +239,19 @@ class Tensor {
   template <typename SType>
   Tensor &operator/=(const SType x);
 
-  /// Return average L1 norm
-  float L1() const;
-  /// Return average L2 norm
-  float L2() const;
+  /// change the shape (and stride); the block may be reallocated.
+  Tensor &Reshape(const Shape &shape);
+
+  /// Matrix transpose.  Valid only if shape.size() == 2.
+  Tensor& T();
+
+  /// Reverse the shape vector
+  Tensor& Transpose();
+
+  /// Change the axes
+  Tensor& Transpose(const vector<size_t> &axes);
+
+ protected:
 
   //generate strides automatically if stride field is not passed
   void generate_strides() {
@@ -259,10 +283,10 @@ class Tensor {
   vector<int> strides_ = {};
 }; //end of tensor class
 
+
 inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
   if (len == 0) len = shape.size();
-  if (len == 0)
-    return 0;
+  if (len == 0) return 0;
   CHECK_LE(len, shape.size());
   size_t v = 1;
   for (unsigned int i = start; i < len; i++) v *= shape[i];
@@ -275,24 +299,31 @@ inline void CheckDataTypeAndLang(const Tensor &in1, const 
Tensor &in2) {
   CHECK_EQ(in1.device()->lang(), in2.device()->lang());
 }
 
+
 template <typename FromType, typename ToType>
 ToType TypeCast(const FromType &x) {
   // TODO(wangwei) cast fp16; prevent some casts, e.g., float to char
   return static_cast<ToType>(x);
 }
 
+
+/// Reshape the given tensor and generate a new tensor,
+/// which shares the memory with in if possible
 Tensor Reshape(const Tensor &in, const Shape &s);
-Tensor Reshape(const Tensor &in, Shape &&s);
 
-// For tensors with sparse content, e.g., missing columns or rows.
-// class SparseTensor : public Tensor {};
+/// Reverse the shape vector
+Tensor Transpose(const Tensor& in);
+
+/// Change the axes
+Tensor Transpose(const Tensor& in, const vector<size_t> &axes);
 
 /// Copy 'num' elements of src to dst.
 /// The first 'src_offset' ('dst_offset') elements will be skipped.
 void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
                     const size_t dst_offset = 0, const size_t src_offset = 0);
 
-void RepeatDataToFrom(bool broadcast_flag, vector<size_t> repeats, int axis,
+
+void RepeatDataToFrom(bool broadcast_flag, const vector<size_t>& repeats, int 
axis,
                       Tensor *dst, const Tensor &in, const size_t num);
 
 // =============Element-wise operations====================================
@@ -411,6 +442,8 @@ void Div(const SType x, const Tensor &in, Tensor *out);
 
 template <typename SType = float>
 SType Sum(const Tensor &in);
+
+
 // ============Matrix (row/column) operations==================================
 /// Average elements in the Tensor, currently only support vector and matrix.
 /// if 'axis' is 0, average all rows into a single row
@@ -510,8 +543,8 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p);
 
 /// To be called by pysinga autograd operations;
 /// swig ignores the const qualifier 
http://www.swig.org/Doc3.0/SWIGPlus.html#SWIGPlus_const
-const Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t);
-const Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t);
+Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t);
+Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t);
 
 /// Return a tensor consisting of rows ([start, end)) from 'in'. It copies the
 /// values from 'in'. 'in' ia a 2D Tensor.
@@ -519,7 +552,8 @@ Tensor CopyRows(const Tensor &in, const size_t start, const 
size_t end);
 /// Alias of CopyRows
 Tensor SliceRows(const Tensor &in, const size_t start, const size_t end);
 /// Slice the input tensor along the give axis to generate a new tensor
-Tensor SliceOn(const Tensor &in, const size_t start, const size_t end, int 
axis);
+Tensor SliceOn(const Tensor &in, const size_t start, const size_t end,
+               int axis);
 /// Return a tensor consisting of columns ([start, end)) from 'in'. It copies
 /// the values from 'in'. 'in' is a  2D Tensor.
 Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/python/singa/autograd.py
----------------------------------------------------------------------
diff --git a/python/singa/autograd.py b/python/singa/autograd.py
index 63698c2..aa6b37a 100755
--- a/python/singa/autograd.py
+++ b/python/singa/autograd.py
@@ -33,6 +33,126 @@ CTensor = singa.Tensor
 training = False
 
 
+
+def infer_dependency(op):
+    '''
+    Infer the dependency of all operations with the
+    given op as the last operation.
+
+    Operation A is depending on B is A uses the output(s) of B.
+
+    Args:
+        op: an Operation instance, e.g. the loss operation.
+
+    Return:
+        a Counter instance with the operation as the key,
+        and the number of operations that are depending on it as the value
+    '''
+    # dependency = {}
+    dependency_count = Counter()
+    queue = deque([op])
+    while len(queue) > 0:
+        cur_op = queue.pop()
+        for src_op, _, _, _ in cur_op.src:
+            if src_op not in dependency_count and \
+                    (not isinstance(src_op, Dummy)):
+                # dependency[src_op] = [Counter() for _ in src_op.y_id2idx]
+                dependency_count[src_op] = 0
+                queue.append(src_op)
+            # y_idx = src_op.y_id2idx[x_id]
+            # dependency[src_op][y_idx][cur_op] += 1
+            dependency_count[src_op] += 1
+    return dependency_count
+
+
+def gradients(y, dy=None):
+    grads = {}  # mapping: x->dx if x.stores_grad
+    for p, dp in backward(y, dy):
+        gradients[p] = dp
+    return grads
+
+
+def backward(y, dy=None):
+    '''
+    Run the backward propagation starting at y.
+
+    Args:
+        y: a Tensor instance, usually the loss
+        dy: a number or a Tensor instance, for the gradient of the
+            objective/loss w.r.t y, usually 1.0
+
+    Return:
+        a dictionary storing the gradient tensors of all tensors
+        whose stores_grad is true (e.g. parameter tensors)
+    '''
+    dependency = infer_dependency(y.creator)
+    assert y.size() == 1, 'y must be a Tensor with a single value;'\
+        'size of y is % d' % y.size()
+
+    # by default the dy is a tensor with 1.0 for each sample;
+    if dy is None:
+        dy = float(1.0)
+    elif isinstance(dy, Tensor):
+        dy = dy.data
+    else:
+        dy = float(dy)
+
+    # ready is a queue of (operation, dy list)
+    ready = deque([(y.creator, (dy,))])
+    not_ready = {}  # mapping: op->[dy]
+
+    if y.stores_grad:
+        gradients[y] = dy
+
+    while len(ready) > 0:
+        op, dys = ready.pop()
+        if not op.requires_grad or isinstance(op, Dummy):
+            continue
+        # if not isinstance(op, tensor.Dummy):
+        dxs = op._do_backward(*dys)
+        # TODO src and dx must match
+        assert len(op.src) == len(dxs), \
+            'the number of src ops (=%d) and dx (=%d) not match' \
+            % (len(op.src), len(dxs))
+        for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
+            # prefix x is w.r.t op; prefix y is w.r.t src_op.
+            # x_id is the python id of one input arg of src_op, denoted as x.
+            # y_idx (below) is the index of x among the outputs of src_op.
+            # not_ready[src_op][y_idx] records the intermediate gradient
+            # of the y_idx'th output of src_op. 'intermediate gradient'
+            # indicates that if this output is used in multiple children
+            # operations, then we have to add the graident (dx) from all these
+            # children operations. When src_op is ready, it means that
+            # the gradient of all its outputs are available, i.e. all children
+            # operations have been backwarded.
+            # y is None if y.stores_grad is false; otherwise it is a Tensor
+            y_idx = src_op.y_id2idx[x_id]
+            if src_op not in not_ready:
+                # src_op may have mulitple outputs
+                not_ready[src_op] = [None for _ in src_op.y_id2idx]
+                not_ready[src_op][y_idx] = dx
+            else:
+                dxs = not_ready[src_op]
+                if dxs[y_idx] is None:
+                    dxs[y_idx] = dx
+                else:
+                    # add the gradient from another children operation that
+                    # uses y_idx'th output of src_op as input arg
+                    dxs[y_idx] += dx
+            if y_stores_grad:
+                # store the gradient for final return, e.g. if x is parameter
+                g = not_ready[src_op][y_idx]
+                tg = Tensor(device=g.device(), data=g)
+                yield (y, tg)
+            dependency[src_op] -= 1
+            if src_op.requires_grad is True:
+                if dependency[src_op] == 0:
+                    if not isinstance(src_op, Dummy):
+                        ready.append((src_op, not_ready[src_op]))
+                    del not_ready[src_op]
+        del op  # delete the operation to free all tensors from this op
+
+
 class Operation(object):
     '''
     An operation includes the forward and backward function of
@@ -194,8 +314,8 @@ class Matmul(Operation):
         Returns:
             a tuple for (dx, dw)
         '''
-        return singa.Mult(dy, self.input[1].T()), \
-            singa.Mult(self.input[0].T(), dy)
+        return singa.Mult(dy, singa.DefaultTranspose(self.input[1])), \
+            singa.Mult(singa.DefaultTranspose(self.input[0]), dy)
 
 
 def matmul(x, w):
@@ -268,12 +388,12 @@ class SoftMax(Operation):
             the result Tensor
         '''
         if self.axis == 1:
-            x = x.T()
+            x = singa.DefaultTranspose(x)
         self.output = singa.SoftMax(x)
         if self.axis == 0:
             return self.output
         elif self.axis == 1:
-            return self.output.T()
+            return singa.DefaultTranspose(self.output)
 
     def backward(self, dy):
         '''
@@ -286,7 +406,7 @@ class SoftMax(Operation):
         '''
         # calculations are made on numpy array
         if self.axis == 1:
-            dy = dy.T()
+            dy = singa.DefaultTranspose(dy)
         grad = ctensor2numpy(dy)
         output = ctensor2numpy(self.output)
         out_1 = np.einsum('ki,ki->ki', grad, output)
@@ -298,14 +418,14 @@ class SoftMax(Operation):
         if self.axis == 0:
             return dx
         elif self.axis == 1:
-            return dx.T()
+            return singa.DefaultTranspose(dx)
 
 
 def soft_max(x, axis=0):
     return SoftMax(axis)(x)[0]
 
 
-class NLL(Operation):
+class CrossEntropy(Operation):
     '''
     Calculte negative log likelihood loss for a batch of training data.
 
@@ -350,12 +470,11 @@ class NLL(Operation):
             pass  # TODO, broadcast elementwise multiply seems not support
 
 
-def nll(y, t):
-    return NLL()(y, t)[0]
+def cross_entropy(y, t):
+    return CrossEntropy()(y, t)[0]
 
 
 class SoftMaxCrossEntropy(Operation):
-
     def forward(self, x, t):
         self.p = singa.SoftMax(x)
         self.t = t
@@ -365,7 +484,8 @@ class SoftMaxCrossEntropy(Operation):
         return loss
 
     def backward(self, dy=1.0):
-        return singa.SoftmaxCrossEntropyBwd(self.p, self.t), None
+        dx = singa.SoftmaxCrossEntropyBwd(self.p, self.t)
+        return singa.DivFloat(dx, float(self.p.shape()[0])), None
 
 
 def softmax_cross_entropy(x, t):
@@ -448,11 +568,11 @@ class Flatten(Operation):
     def forward(self, x):
         # TODO Do flatten start from axis != 1
         self.shape = list(x.shape())
-        y = x.Reshape((x.shape()[0], x.Size() // x.shape()[0]))
+        y = singa.Reshape(x, (x.shape()[0], x.Size() // x.shape()[0]))
         return y
 
     def backward(self, dy):
-        dx = dy.Reshape(self.shape)
+        dx = singa.Reshape(dy, self.shape)
         return dx
 
 
@@ -466,11 +586,7 @@ class _Conv2D(Operation):
         self.handle = handle
 
     def forward(self, x, W, b):
-        #assert x.nDim() == 4, 'The dimensions of input should be 4D.'
-        #assert x.shape()[1] == self.in_channels, 'in_channels dismatched.'
-        #assert 
(xs[0].shape()[2]+2*self.padding[0]-self.kernel_size[0])%self.stride[0] == 0, 
'invalid padding.'
-        #assert 
(xs[0].shape()[3]+2*self.padding[1]-self.kernel_size[1])%self.stride[1] == 0, 
'invalid padding'
-        #assert 0 == 0, 'invalid padding'
+        assert x.nDim() == 4, 'The dimensions of input should be 4D.'
 
         if training:
             if self.handle.bias_term:
@@ -517,125 +633,6 @@ def conv2d(x, W, b, handle):
     return _Conv2D(handle)(x, W, b)[0]
 
 
-def infer_dependency(op):
-    '''
-    Infer the dependency of all operations with the
-    given op as the last operation.
-
-    Operation A is depending on B is A uses the output(s) of B.
-
-    Args:
-        op: an Operation instance, e.g. the loss operation.
-
-    Return:
-        a Counter instance with the operation as the key,
-        and the number of operations that are depending on it as the value
-    '''
-    # dependency = {}
-    dependency_count = Counter()
-    queue = deque([op])
-    while len(queue) > 0:
-        cur_op = queue.pop()
-        for src_op, _, _, _ in cur_op.src:
-            if src_op not in dependency_count and \
-                    (not isinstance(src_op, Dummy)):
-                # dependency[src_op] = [Counter() for _ in src_op.y_id2idx]
-                dependency_count[src_op] = 0
-                queue.append(src_op)
-            # y_idx = src_op.y_id2idx[x_id]
-            # dependency[src_op][y_idx][cur_op] += 1
-            dependency_count[src_op] += 1
-    return dependency_count
-
-
-def gradients(y, dy=None):
-    grads = {}  # mapping: x->dx if x.stores_grad
-    for p, dp in backward(y, dy):
-        gradients[p] = dp
-    return grads
-
-
-def backward(y, dy=None):
-    '''
-    Run the backward propagation starting at y.
-
-    Args:
-        y: a Tensor instance, usually the loss
-        dy: a number or a Tensor instance, for the gradient of the
-            objective/loss w.r.t y, usually 1.0
-
-    Return:
-        a dictionary storing the gradient tensors of all tensors
-        whose stores_grad is true (e.g. parameter tensors)
-    '''
-    dependency = infer_dependency(y.creator)
-    assert y.size() == 1, 'y must be a Tensor with a single value;'\
-        'size of y is % d' % y.size()
-
-    # by default the dy is a tensor with 1.0 for each sample;
-    if dy is None:
-        dy = float(1.0)
-    elif isinstance(dy, Tensor):
-        dy = dy.data
-    else:
-        dy = float(dy)
-
-    # ready is a queue of (operation, dy list)
-    ready = deque([(y.creator, (dy,))])
-    not_ready = {}  # mapping: op->[dy]
-
-    if y.stores_grad:
-        gradients[y] = dy
-
-    while len(ready) > 0:
-        op, dys = ready.pop()
-        if not op.requires_grad or isinstance(op, Dummy):
-            continue
-        # if not isinstance(op, tensor.Dummy):
-        dxs = op._do_backward(*dys)
-        # TODO src and dx must match
-        assert len(op.src) == len(dxs), \
-            'the number of src ops (=%d) and dx (=%d) not match' \
-            % (len(op.src), len(dxs))
-        for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs):
-            # prefix x is w.r.t op; prefix y is w.r.t src_op.
-            # x_id is the python id of one input arg of src_op, denoted as x.
-            # y_idx (below) is the index of x among the outputs of src_op.
-            # not_ready[src_op][y_idx] records the intermediate gradient
-            # of the y_idx'th output of src_op. 'intermediate gradient'
-            # indicates that if this output is used in multiple children
-            # operations, then we have to add the graident (dx) from all these
-            # children operations. When src_op is ready, it means that
-            # the gradient of all its outputs are available, i.e. all children
-            # operations have been backwarded.
-            # y is None if y.stores_grad is false; otherwise it is a Tensor
-            y_idx = src_op.y_id2idx[x_id]
-            if src_op not in not_ready:
-                # src_op may have mulitple outputs
-                not_ready[src_op] = [None for _ in src_op.y_id2idx]
-                not_ready[src_op][y_idx] = dx
-            else:
-                dxs = not_ready[src_op]
-                if dxs[y_idx] is None:
-                    dxs[y_idx] = dx
-                else:
-                    # add the gradient from another children operation that
-                    # uses y_idx'th output of src_op as input arg
-                    dxs[y_idx] += dx
-            if y_stores_grad:
-                # store the gradient for final return, e.g. if x is parameter
-                g = not_ready[src_op][y_idx]
-                tg = Tensor(device=g.device(), data=g)
-                yield (y, tg)
-            dependency[src_op] -= 1
-            if src_op.requires_grad is True:
-                if dependency[src_op] == 0:
-                    if not isinstance(src_op, Dummy):
-                        ready.append((src_op, not_ready[src_op]))
-                    del not_ready[src_op]
-        del op  # delete the operation to free all tensors from this op
-
-
 class Layer(object):
 
     def __init__(self):
@@ -651,8 +648,6 @@ class Layer(object):
 class Linear(Layer):
 
     def __init__(self, in_features, out_features, bias=True):
-        #self.in_features = in_features
-        #self.out_features = out_features
         w_shape = (in_features, out_features)
         b_shape = (1, out_features)
         self.bias = bias

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/python/singa/tensor.py
----------------------------------------------------------------------
diff --git a/python/singa/tensor.py b/python/singa/tensor.py
index 0860d9d..46a47b7 100644
--- a/python/singa/tensor.py
+++ b/python/singa/tensor.py
@@ -134,7 +134,7 @@ class Tensor(object):
         '''
         return self.data.transpose()
 
-    def transpose(self,axes = None):
+    def transpose(self, axes=None):
         '''
         To transpose the tensor
         '''
@@ -142,13 +142,13 @@ class Tensor(object):
         if axes == None:
             tshape = [self.shape[x] for x in range(len(t.shape))]
             t.shape = tuple(tshape)
-            t.data = self.data.Transpose()
+            t.data = singa.DefaultTranspose(self.data)
         else:
             if(len(axes) != len(self.shape)):
                 raise ValueError('dimensions do not match')
             tshape = [self.shape[x] for x in axes]
             t.shape = tuple(tshape)
-            t.data = self.data.Transpose(list(axes))
+            t.data = singa.Transpose(self.data, list(axes))
         return t
 
     def size(self):  # TODO(wangwei) compute size
@@ -166,17 +166,18 @@ class Tensor(object):
         return self.data.MemSize()
 
     def reshape(self, shape):
-        '''Change the tensor shape.
+        '''Return a new tensor with the given shape, and the original 
+        tensor is not changed.
 
         Args:
-            shape (list<int>): new shape, which should have the same volumn as
-                the original shape.
+            shape (list<int>): new shape, which should have the same 
+            volumn as the original shape.
         '''
         t = Tensor(self.shape, self.device, self.dtype)
         assert product(self.shape) == product(shape), \
             'product of shape should be equal'
         t.shape = shape
-        t.data = self.data.Reshape(list(shape))
+        t.data = singa.Reshape(self.data, shape)
         return t
 
     def reset_like(self, t):
@@ -283,38 +284,41 @@ class Tensor(object):
 
         Return:
             the tensor which has been repeated
-        
+
         '''
         t = Tensor()
         t_ndim = self.ndim()
         if isinstance(repeats, int) or isinstance(repeats, long):
             if repeats < 0:
-                raise ValueError("'repeats' should not be negative: {}".format(repeats))
+                raise ValueError(
+                    "'repeats' should not be negative: {}".format(repeats))
             if axis != None and axis < 0:
                 axis += t_ndim
             # broadcast = True
             if axis == None:
                 axis = 9999
-                t.shape = (product(self.shape)*repeats,)
-                Repeats = [repeats,]
+                t.shape = (product(self.shape) * repeats,)
+                Repeats = [repeats, ]
                 t.data = self.data.Repeat(Repeats, axis)
             elif axis >= 0:
                 t_shape = list(self.shape)
-                t_shape[axis] = self.shape[axis]*repeats
+                t_shape[axis] = self.shape[axis] * repeats
                 t.shape = tuple(t_shape)
-                Repeats = [repeats,]
+                Repeats = [repeats, ]
                 t.data = self.data.Repeat(Repeats, axis)
 
         elif isinstance(repeats, tuple) or isinstance(repeats, list):
             for rep in repeats:
                 if rep < 0:
-                    raise ValueError("'repeats' should be int or sequence: {}".format(repeats))
+                    raise ValueError(
+                        "'repeats' should be int or sequence: {}".format(repeats))
 
             if axis != None and axis < 0:
                 axis += t_ndim
             if axis == None:
                 axis = 9999
-                raise ValueError("when axis us None, 'repeats' should be int: {}".format(repeats))
+                raise ValueError(
+                    "when axis us None, 'repeats' should be int: {}".format(repeats))
             elif axis >= 0:
                 t_shape = list(self.shape)
                 t_shape[axis] = sum(repeats)
@@ -323,16 +327,15 @@ class Tensor(object):
         else:
             raise ValueError('repeats should be int or sequence')
 
-        return t     
+        return t
 
     def T(self):
-        ''' shallow copy, negate the transpose field.
+        ''' shallow copy.
 
         Returns:
-            a new Tensor which shares the underlying data memory (shallow copy)
-            but is marked as a transposed version of this tensor.
+            a new Tensor which shares the underlying data memory (shallow copy).
         '''
-        return _call_singa_func(self.data.T)
+        return _call_singa_func(singa.DefaultTranspose, self.data)
 
     def copy(self):
         '''shallow copy calls copy constructor of singa::Tensor
@@ -611,8 +614,9 @@ def sizeof(dtype):
     return singa.SizeOf(dtype)
 
 
-def reshape(t, s):
-    '''Reshape the input tensor with the given shape.
+def reshape(tensor, shape):
+    '''Reshape the input tensor with the given shape and 
+    the original tensor is not changed
 
     Args:
         t (Tensor): the tensor to be changed
@@ -624,12 +628,8 @@ def reshape(t, s):
     '''
     return _call_singa_func(singa.Reshape, t.data, s)
 
-def Reshape(t,s):
-
-    ret = t.reshape(s)
-    return ret
 
-def transpose(t,axes = None):
+def transpose(t, axes=None):
     '''
     Returns:
         the transposed tensor 
@@ -796,6 +796,7 @@ def tanh(t):
     '''
     return _call_singa_func(singa.Tanh, t.data)
 
+
 def sum(t, axis=None, out=None):
     '''Sum of tensor elements over given axis
 
@@ -827,24 +828,24 @@ def sum(t, axis=None, out=None):
         one.set_value(1.0)
         ret = tensordot(t, one, t_ndim)
 
-    if isinstance(axis,int):
+    if isinstance(axis, int):
         if axis < 0:
             axis += t_ndim
 
         axis_shape = t_shape[axis]
         axis_shape = int(axis_shape)
-        one = Tensor(shape = (axis_shape, ), device = t.device)
+        one = Tensor(shape=(axis_shape, ), device=t.device)
         one.set_value(1.0)
-        ret = tensordot(t, one, axes=([axis],[0]))
+        ret = tensordot(t, one, axes=([axis], [0]))
 
-    if isinstance(axis,tuple):
+    if isinstance(axis, tuple):
         l_axis = list(axis)
         axis_shape = [t_shape[x] for x in axis]
         axisshape = tuple(axis_shape)
         one = Tensor(axisshape, t.device)
         one.set_value(1.0)
         one_axis = [x for x in range(one.ndim())]
-        ret = tensordot(t, one, (l_axis,one_axis))
+        ret = tensordot(t, one, (l_axis, one_axis))
 
     if out is not None:
         if out.shape != ret.shape:
@@ -1181,10 +1182,10 @@ def einsum(ops, *args):
     if len(broadcast_a) == 0:
         broadcast_a = [1]
     if len(broadcast_b) == 0:
-        broadcast_b = [1]  
+        broadcast_b = [1]
     mult_A = repeat(A, product(broadcast_a))
     mult_A = mult_A.reshape(reshape_A)
-    mult_A = transpose(mult_A,transpose_A)
+    mult_A = transpose(mult_A, transpose_A)
     mult_B = repeat(B, product(broadcast_b))
     mult_B = mult_B.reshape(reshape_B)
     mult_B = transpose(mult_B, transpose_B)
@@ -1199,9 +1200,9 @@ def einsum(ops, *args):
     res = transpose(res, transpose_res)
 
     return res
-    
 
-def repeat (t, repeats, axis = None):
+
+def repeat(t, repeats, axis=None):
     '''Return the repeated tensor
     Args:
         t(tensor): the tensor to be repeated
@@ -1213,12 +1214,11 @@ def repeat (t, repeats, axis = None):
     Return:
         the tensor which has been repeated
     '''
-    ret = t.repeat(repeats,axis)
+    ret = t.repeat(repeats, axis)
     return ret
 
-        
-def tensordot (A,B,axes=2):
 
+def tensordot(A, B, axes=2):
     """Returns the tensor multiplication of two tensors along specified axes.
 
     This is equivalent to compute dot product along the specified axes which
@@ -1244,30 +1244,33 @@ def tensordot (A,B,axes=2):
     # when axes is an integer, axes_A and axes_B represent axes at the last of ''A'' and
     # the first of ''B''. For example, when axes is 1, we do the normal multiplication :
     # if A is in shape(3,2,4), B is in shape(4,2,5), it will return a matrix in shape(3,2,2,5)
-    #when axes is 2 and A,B are shape (3,2,4) and (2,4,5), it will return a matrix in shape(3,5)
+    # when axes is 2 and A,B are shape (3,2,4) and (2,4,5), it will return a
+    # matrix in shape(3,5)
 
     if type(axes) == int or type(axes) == long:
         axes_A = list(range(-axes, 0))
         axes_B = list(range(0, axes))
         axes_B = axes_B
     else:
-        axes_A,axes_B =axes
+        axes_A, axes_B = axes
     # when axes is a pair of sequences of integers.For example, A is in shape(3,2,4),
-    #B is in shape(4,2,5), we set axes as ([1,2],[1,0]), it will return a matrix in shape(3,5)
-    if isinstance(axes_A,list):
+    # B is in shape(4,2,5), we set axes as ([1,2],[1,0]), it will return a
+    # matrix in shape(3,5)
+    if isinstance(axes_A, list):
         na = len(axes_A)
         axes_A = list(axes_A)
     else:
         axes_A = [axes_A]
         na = 1
-    if isinstance(axes_B,list):
+    if isinstance(axes_B, list):
         nb = len(axes_B)
         axes_B = list(axes_B)
     else:
         axes_B = [axes_B]
         nb = 1
 
-    # a_shape and b_shape are the shape of tensor A and B, while nda and ndb are the dim of A and B
+    # a_shape and b_shape are the shape of tensor A and B, while nda and ndb
+    # are the dim of A and B
     a_shape = A.shape
     nda = A.ndim()
     b_shape = B.shape
@@ -1277,7 +1280,7 @@ def tensordot (A,B,axes=2):
     if na != nb:
         equal = False
     else:
-    # to make the shape match
+        # to make the shape match
         for k in range(na):
             if a_shape[axes_A[k]] != b_shape[axes_B[k]]:
                 equal = False
@@ -1291,18 +1294,19 @@ def tensordot (A,B,axes=2):
     '''start to do the calculation according to the axes'''
 
     notin = [k for k in range(nda) if k not in axes_A]
-    # nda is the dim of A, and axes_a is the axis for A, notin is the axis which is not in axes_A
+    # nda is the dim of A, and axes_a is the axis for A, notin is the axis
+    # which is not in axes_A
     newaxes_a = notin + axes_A
     N2 = 1
     for axis in axes_A:
         N2 *= a_shape[axis]
     N1 = 1
     for ax in notin:
-        N1 *=a_shape[ax]
+        N1 *= a_shape[ax]
     # newshape_a is the shape to do multiplication.For example, A is in shape(3,2,4),
-    #B is in shape(4,2,5), we set axes as ([1,2],[1,0]), then newshape_a should be (3,5)
-    #olda is the shape that will be shown in the result.
-    newshape_a = (N1,N2)
+    # B is in shape(4,2,5), we set axes as ([1,2],[1,0]), then newshape_a should be (3,5)
+    # olda is the shape that will be shown in the result.
+    newshape_a = (N1, N2)
     olda = [a_shape[axis] for axis in notin]
     notin = [k for k in range(ndb) if k not in axes_B]
     newaxes_b = axes_B + notin
@@ -1320,7 +1324,7 @@ def tensordot (A,B,axes=2):
     at = Reshape(A, newshape_a)
     bt = Reshape(B, newshape_b)
 
-    res = mult(at,bt)
+    res = mult(at, bt)
     if len(olda + oldb) == 0:
         olda = [1]
         oldb = [1]
@@ -1330,6 +1334,7 @@ def tensordot (A,B,axes=2):
 
     return res
 
+
 def div(lhs, rhs, ret=None):
     '''Elementi-wise division.
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/src/api/core_tensor.i
----------------------------------------------------------------------
diff --git a/src/api/core_tensor.i b/src/api/core_tensor.i
index cc72d21..9427b11 100644
--- a/src/api/core_tensor.i
+++ b/src/api/core_tensor.i
@@ -101,12 +101,11 @@ namespace singa{
     const std::vector<size_t> &shape() const;
     const size_t shape(size_t idx) const;
     bool transpose() const;
-    size_t nDim() const;
-    Tensor Transpose() const;
-    Tensor Transpose(const std::vector<size_t> &axes) const;
+    size_t nDim() const;    
+
     size_t Size() const;
     size_t MemSize() const;
-    Tensor Reshape(const std::vector<size_t> &shape);
+    
     void ResetLike(const Tensor &t);
     void AsType(DataType type);
     void ToDevice(std::shared_ptr<singa::Device> dev);
@@ -122,10 +121,10 @@ namespace singa{
 
     void CopyData(const Tensor &other);
     void RepeatData(std::vector<size_t> repeats, int axis, int total_repeats, const Tensor &src);
+    
     Tensor Clone() const;
     Tensor Repeat(std::vector<size_t> repeats, int axis);
-    Tensor T() const;
-
+    
 
 #if USE_JAVA
     %rename(iAdd) operator+=(const Tensor &t);
@@ -166,6 +165,10 @@ namespace singa{
                         Tensor *dst, const Tensor &src, const size_t num);
 
   Tensor Reshape(const Tensor &in, const std::vector<size_t> &s);
+  Tensor Transpose(const Tensor &in, const std::vector<size_t> &axes);
+
+  %rename(DefaultTranspose) Transpose(const Tensor &in);
+  Tensor Transpose(const Tensor &in);
 
   Tensor Abs(const Tensor &t);
   Tensor Exp(const Tensor &t);
@@ -326,6 +329,6 @@ namespace singa{
   Tensor SoftMax(const Tensor &in);
   void SoftMax(const Tensor &in, Tensor *out);
 
-  const Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t);
-  const Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t);
+  Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t);
+  Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t);
 }

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index e5e8017..1ac1b42 100755
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -21,6 +21,7 @@
 #include "./tensor_math_cuda.h"
 #include "./tensor_math_opencl.h"
 #include <utility>
+#include <algorithm>
 
 #define Noaxis 9999
 
@@ -45,13 +46,7 @@ Tensor::Tensor(const Shape &shape, DataType dtype)
     block_ = device_->NewBlock((int)size);
   generate_strides();
 }
-Tensor::Tensor(Shape &&shape, DataType dtype)
-  : data_type_(dtype), device_(defaultDevice), shape_(shape) {
-  size_t size = Product(shape_) * SizeOf(data_type_);
-  if (size)
-    block_ = device_->NewBlock((int)size);
-  generate_strides();
-}
+
 
 //non-strided constructors with device
 Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
@@ -62,56 +57,24 @@ Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
     block_ = device_->NewBlock((int)size);
   generate_strides();
 }
-Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
-  : data_type_(dtype), device_(device), shape_(shape) {
-  size_t size = Product(shape_) * SizeOf(data_type_);
-  if (size)
-    block_ = device_->NewBlock((int)size);
-  generate_strides();
-}
 
 
-Tensor::Tensor(const Tensor &in)
-  : //transpose_(in.transpose_),
-    data_type_(in.data_type_),
-    device_(in.device_),
-    block_(in.block()),
-    shape_(in.shape_),
-    strides_(in.strides_) {
+Tensor::Tensor(const Tensor &in) : data_type_(in.data_type_),
+  device_(in.device_),  block_(in.block()),  shape_(in.shape_),
+  strides_(in.strides_) {
   if (block_ != nullptr)
     block_->IncRefCount();
 }
 
-//strided constructor taking in a tensor, shape and strides
-Tensor::Tensor(const Tensor &in, Shape &new_shape, vector<int> &new_strides)
-  : //transpose_(in.transpose_),
-    data_type_(in.data_type_),
-    device_(in.device_),
-    block_(in.block()),
-    shape_(new_shape),
-    strides_(new_strides) {
-  if (block_ != nullptr)
-    block_->IncRefCount();
-}
 
-Tensor::Tensor(Tensor &&in)
-  : //transpose_(in.transpose_),
-    data_type_(in.data_type_),
-    device_(in.device_),
-    shape_(std::move(in.shape_)),
-    strides_(in.strides_) {
+Tensor::Tensor(Tensor &&in) : data_type_(in.data_type_),
+  device_(in.device_), shape_(std::move(in.shape_)),
+  strides_(std::move(in.strides_)) {
   block_ = in.block_;
   in.block_ = nullptr;
 }
 
 
-void Tensor::SetBlock(Block *block) {
-  LOG(WARNING) << "Pls avoid using this function, which may have side-effect.";
-  if (block_ != nullptr)
-    if (block_->DecRefCount()) device_->FreeBlock(block_);
-  block_ = block;
-}
-
 void Tensor::ResetLike(const Tensor &in) {
   if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
     if (block_ != nullptr && block_->DecRefCount() == 0)
@@ -124,41 +87,16 @@ void Tensor::ResetLike(const Tensor &in) {
   strides_ = in.strides_;
 }
 
-// if tensor is not transposed yet i.e strides == 1,
-// then we simply change the shape and generate new default strides
-// if tensor is already transposed i.e strides != 1,
-// it should be copied to a new tensor with newly generated default strides
-// TODO(wangwei) raise error if the shape not match
-
-// void Tensor::Reshape(const Shape &shape) {
-//   if (strides_.size() == 0)
-//     strides_.push_back(1);
-
-//   if (Product(shape_) != Product(shape)) {
-//     if (block_ != nullptr && block_->DecRefCount() == 0)
-//       device_->FreeBlock(block_);
-//     block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
-//   } else if (transpose()) {
-//     LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ;
-//   }
-//   shape_ = shape;
-//   generate_strides();
-// }
-
-// void Tensor::Reshape(Shape &&shape) {
-//   if (strides_.size() == 0)
-//     strides_.push_back(1);
-
-//   if (Product(shape_) != Product(shape)) {
-//     if (block_ != nullptr && block_->DecRefCount() == 0)
-//       device_->FreeBlock(block_);
-//     block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
-//   } else if (transpose()) {
-//     LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ;
-//   }
-//   shape_ = std::move(shape);
-//   generate_strides();
-// }
+void Tensor::SetShape(const Shape& shape) {
+  if (Product(shape_) != Product(shape)) {
+    if (block_ != nullptr && block_->DecRefCount() == 0)
+      device_->FreeBlock(block_);
+    block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
+  }
+  shape_ = shape;
+  generate_strides();
+}
+
 
 void Tensor::AsType(const DataType type) {
   if (data_type_ != type) {
@@ -217,7 +155,8 @@ void Tensor::CopyData(const Tensor &src) {
   }
 }
 
-void Tensor::RepeatData(vector<size_t> repeats, int axis, int total_repeats, const Tensor &src) {
+void Tensor::RepeatData(const vector<size_t>& repeats, int axis, int total_repeats,
+                        const Tensor &src) {
   if (repeats.size() == 1) {
     CHECK_EQ(Size(), src.Size()*total_repeats);
   } else {
@@ -336,7 +275,8 @@ void Tensor::ToProto(singa::TensorProto *proto) const {
   }
 }
 
-Tensor Tensor::Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device> device) {
+Tensor Tensor::Repeat(const vector<size_t>& repeats, int axis,
+                      std::shared_ptr<Device> device) {
   if (device == nullptr) device = device_;
   vector<size_t> tshape;
   int total_repeats = 0;
@@ -346,7 +286,7 @@ Tensor Tensor::Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device>
   } else {
     if (repeats.size() == 1) {
       total_repeats = repeats[0];
-      for (size_t i = 0; i < shape_.size(); i++) {
+      for (int i = 0; i < static_cast<int>(shape_.size()); i++) {
         if (i == axis) {
           tshape.push_back(shape_[i] * total_repeats);
         } else {
@@ -363,7 +303,7 @@ Tensor Tensor::Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device>
         }
         total_repeats += repeats[i];
       }
-      for (size_t i = 0; i < shape_.size(); i++) {
+      for (int i = 0; i < static_cast<int>(shape_.size()); i++) {
         if (i == axis) {
           tshape.push_back(total_repeats);
         } else {
@@ -387,68 +327,53 @@ Tensor Tensor::Clone(std::shared_ptr<Device> device) const {
   return t;
 }
 
-Tensor Tensor::T() const {
+Tensor& Tensor::T() {
   // this function only works for 2d tensors
   CHECK_EQ(shape_.size(), 2u);
-  Tensor t;
-  t.device_ = device_;
-  t.data_type_ = data_type_;
-  t.shape_.push_back(shape_[1]);
-  t.shape_.push_back(shape_[0]);
-  t.strides_.clear();
-  t.strides_.push_back(strides_[1]);
-  t.strides_.push_back(strides_[0]);
-  t.block_ = block_;
-  block_->IncRefCount();
-  return t;
+  Transpose();
+  return *this;
 }
 
 //normal transpose without axes
-Tensor Tensor::Transpose() const {
-  // if(shape_.size() != strides_.size())
-  //   generate_strides();
-
-  Tensor t;
-  t.device_ = device_;
-  t.data_type_ = data_type_;
-  t.strides_.clear();
-  for (size_t n = 0; n < shape_.size(); ++n) {
-    t.shape_.push_back(shape_[shape_.size() - n - 1]);
-    t.strides_.push_back(strides_[shape_.size() - n - 1]);
-  }
-  t.block_ = block_;
-  block_->IncRefCount();
-  return t;
+Tensor& Tensor::Transpose() {
+  std::reverse(shape_.begin(), shape_.end());
+  std::reverse(strides_.begin(), strides_.end());
+  return *this;
 }
 
 //transpose with axes
-// TODO(wangwei) the shape and axes should match
-Tensor Tensor::Transpose(const vector<size_t> &axes) const {
-  // if(axes.size() != shape_.size()){
-  //   std::cout << "Warning: Size of input axes doesn't match size of shape" << std::endl;
-  //   return void();
-  // }
-  // if(shape_.size() != strides_.size())
-  //   generate_strides();
+Tensor& Tensor::Transpose(const vector<size_t> &axes) {
+  CHECK_EQ(axes.size(), shape_.size()) <<
+                                       "Tranpose axes's length should be equal to shape";
 
-  Tensor t;
-  t.device_ = device_;
-  t.data_type_ = data_type_;
-  t.strides_.clear();
+  auto shape = shape_;
+  auto strides = strides_;
+  shape_.clear();
+  strides_.clear();
   for (size_t n = 0; n < axes.size(); ++n) {
-    t.shape_.push_back(shape_[axes[n]]);
-    t.strides_.push_back(strides_[axes[n]]);
+    shape_.push_back(shape[axes[n]]);
+    strides_.push_back(strides[axes[n]]);
   }
-  t.block_ = block_;
-  block_->IncRefCount();
-  return t;
+  return *this;
+}
+
+//normal transpose without axes
+Tensor Transpose(const Tensor& in) {
+  Tensor out(in);
+  out.Transpose();
+  return out;
+}
+
+//transpose with axes
+Tensor Transpose(const Tensor& in, const vector<size_t> &axes) {
+  Tensor out(in);
+  out.Transpose(axes);
+  return out;
 }
 
 Tensor &Tensor::operator=(const Tensor &in) {
-  // LOG(ERROR) << "= const &";
   if (block_ != nullptr && block_->DecRefCount() == 0)
     device_->FreeBlock(block_);
-  //transpose_ = in.transpose_;
   strides_ = in.strides_;
   data_type_ = in.data_type_;
   shape_ = in.shape_;
@@ -460,11 +385,9 @@ Tensor &Tensor::operator=(const Tensor &in) {
 }
 
 Tensor &Tensor::operator=(Tensor &&in) {
-  // LOG(ERROR) << "= &&";
   if (block_ != nullptr && block_->DecRefCount() == 0)
     device_->FreeBlock(block_);
-  //transpose_ = in.transpose_;
-  strides_ = std::move(in.strides_);
+    strides_ = std::move(in.strides_);
   data_type_ = in.data_type_;
   shape_ = std::move(in.shape_);
   device_ = in.device_;
@@ -473,17 +396,6 @@ Tensor &Tensor::operator=(Tensor &&in) {
   return *this;
 }
 
-// Tensor Reshape(const Tensor &in, const Shape &s) {
-//   // Tensor out(in);
-//   // out.Reshape(s);
-//   return out;
-// }
-
-// Tensor Reshape(const Tensor &in, Shape &&s) {
-//   // Tensor out(in);
-//   // out.Reshape(std::move(s));
-//   return out;
-// }
 
 #define GenUnaryTensorArgMemberFn(op, fn) \
   Tensor &Tensor::op(const Tensor &in) {  \
@@ -539,7 +451,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
   }
 }
 
-void RepeatDataToFrom(bool broadcast_flag, vector<size_t> repeats, int axis,
+void RepeatDataToFrom(bool broadcast_flag, const vector<size_t>& repeats, int axis,
                       Tensor *dst, const Tensor &src, const size_t num) {
   if (repeats.size() == 1) {
     broadcast_flag = true;
@@ -561,11 +473,11 @@ void RepeatDataToFrom(bool broadcast_flag, vector<size_t> repeats, int axis,
     axis_shape = 1;
     shape_outer = Product(src.shape());
   } else {
-    for (size_t i = 0; i < axis; i++) {
+    for (int i = 0; i < axis; i++) {
       shape_outer *= src.shape()[i];
     }
     axis_shape = src.shape()[axis];
-    for (size_t i = axis + 1; i < src.nDim(); i++) {
+    for (int i = axis + 1; i < static_cast<int>(src.nDim()); i++) {
       chunk *= src.shape()[i];
     }
   }
@@ -912,7 +824,7 @@ template <typename SType>
 void AddColumn(const SType alpha, const SType beta, const Tensor &v,
                Tensor *M) {
   if (M->transpose()) {
-    Tensor X = M->T();
+    Tensor X = Transpose(*M);
     AddRow(v, &X);
   } else {
     CHECK_EQ(M->nDim(), 2u);
@@ -935,7 +847,7 @@ void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
 template <typename SType>
 void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
   if (M->transpose()) {
-    Tensor X = M->T();
+    Tensor X = Transpose(*M);
     AddColumn(v, &X);
   } else {
     CHECK_EQ(M->nDim(), 2u);
@@ -980,7 +892,7 @@ Tensor ConcatOn(const vector<Tensor> &in, int axis) {
       tmp.push_back(Reshape(t, {t.shape(0), t.Size() / t.shape(0)}));
     }
     auto ret = ConcatenateRows(tmp);
-    ret = ret.Reshape(out_shape);
+    ret.Reshape(out_shape);
     return ret;
   } else {
     for (const auto& t : in) {
@@ -990,7 +902,7 @@ Tensor ConcatOn(const vector<Tensor> &in, int axis) {
       tmp.push_back(Reshape(t, {nrow, t.Size() / nrow}));
     }
     auto ret = ConcatenateColumns(tmp);
-    ret = ret.Reshape(out_shape);
+    ret.Reshape(out_shape);
     return ret;
   }
 }
@@ -1059,7 +971,8 @@ Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) {
 }
 
 
-Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, int axis) {
+Tensor SliceOn(const Tensor&in, const size_t start, const size_t end,
+               int axis) {
   Shape out_shape = in.shape();
   out_shape[axis] = end - start;
   if (axis == 0) {
@@ -1074,7 +987,7 @@ Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, int axis)
     auto suffix = in.Size() / nrow / in.shape(axis);
     auto ret = SliceColumns(Reshape(in, {nrow, in.Size() / nrow}),
                             start * suffix, end * suffix);
-    ret = ret.Reshape(out_shape);
+    ret.Reshape(out_shape);
     return ret;
   }
 }
@@ -1145,7 +1058,7 @@ void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); }
 
 void SumColumns(const Tensor &M, Tensor *v) {
   if (M.transpose()) {
-    Tensor X = M.T();
+    Tensor X = Transpose(M);
     SumRows(X, v);
   } else {
     CHECK_EQ(M.nDim(), 2u);
@@ -1160,7 +1073,7 @@ void SumColumns(const Tensor &M, Tensor *v) {
 }
 void SumRows(const Tensor &M, Tensor *v) {
   if (M.transpose()) {
-    Tensor X = M.T();
+    Tensor X = Transpose(M);
     SumColumns(X, v);
   } else {
     CHECK_EQ(M.nDim(), 2u);
@@ -1170,7 +1083,7 @@ void SumRows(const Tensor &M, Tensor *v) {
 
     Tensor one(Shape{nb_row}, M.device(), M.data_type());
     one.SetValue(1.0f);  // TODO(wangwei) cast type
-    Tensor X = M.T();
+    Tensor X = Transpose(M);
     Mult(X, one, v);
   }
 }
@@ -1268,13 +1181,13 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
 // ************************
 // Misc.
 // ************************
-const Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t) {
+Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t) {
   Tensor loss({p.shape(0)}, p.device(), p.data_type());
   ComputeCrossEntropy(p, t, &loss);
   return loss;
 }
 
-const Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t) {
+Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t) {
   auto g = p.Clone();
   SoftmaxCrossEntropyBwd(t, &g);
   return g;
@@ -1310,65 +1223,20 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
   });
 }
 
-Tensor Tensor::Reshape(const Shape &shape) {
-  if (strides_.size() == 0)
-    strides_.push_back(1);
 
-  // TODO(wangwei) remove this condition and report error if size changes.
-  if (Product(shape_) != Product(shape)) {
-    if (block_ != nullptr && block_->DecRefCount() == 0)
-      device_->FreeBlock(block_);
-    block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
-    shape_ = shape;
-    generate_strides();
-    return *this;
-
-  } else if (transpose()) {
-    Tensor t(shape_, device_, data_type_);
-    t.block_ = t.device()->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
+// if tensor is not transposed yet, we change the shape and generate new strides
+// if tensor is already transposed, we reallocate the memory and generate strides
+Tensor& Tensor::Reshape(const Shape &shape) {
+  if (transpose()) {
+    Tensor t(shape, device_, data_type_);
     singa::Transform(*this, &t);
-    t.shape_ = shape;
-    return t;
+    shape_ = shape;
+    std::swap(t.block_, block_);
   } else {
-    Tensor t;
-    t.shape_ = shape;
-    t.device_ = device_;
-    t.data_type_ = data_type_;
-    t.block_ = block_;  // be careful about the block inference (mem leaking)
-    t.block_->IncRefCount();
-    t.generate_strides();
-    return t;
-  }
-}
-
-Tensor Tensor::Reshape(Shape &&shape) {
-  if (strides_.size() == 0)
-    strides_.push_back(1);
-
-  if (Product(shape_) != Product(shape)) {
-    if (block_ != nullptr && block_->DecRefCount() == 0)
-      device_->FreeBlock(block_);
-    block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
-    shape_ = std::move(shape);
+    shape_ = shape;
     generate_strides();
-    return *this;
-
-  } else if (transpose()) {
-    Tensor t(shape_, device_, data_type_);
-    t.block_ = t.device()->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
-    singa::Transform(*this, &t);
-    t.shape_ = shape;
-    return t;
-  } else {
-    Tensor t;
-    t.shape_ = shape;
-    t.device_ = device_;
-    t.data_type_ = data_type_;
-    t.block_ = block_;  // be careful about the block inference (mem leaking)
-    t.block_->IncRefCount();
-    t.generate_strides();
-    return t;
   }
+  return *this;
 }
 
 Tensor Reshape(const Tensor &in, const Shape &s) {
@@ -1376,9 +1244,4 @@ Tensor Reshape(const Tensor &in, const Shape &s) {
   return out.Reshape(s);
 }
 
-Tensor Reshape(const Tensor &in, Shape &&s) {
-  Tensor out(in);
-  return out.Reshape(std::move(s));
-}
-
 }  // namespace singa

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index f438fc6..f5fbc84 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -253,7 +253,7 @@ void Tanh(const Tensor &in, Tensor *out, Context *ctx) {
 
 /// similar to cudnnTransformTensor
 /// copies the data from one tensor to another tensor with a different layout
-/// the tensors must have the same dimensions but not necessarily the same 
strides 
+/// the tensors must have the same dimensions but not necessarily the same 
strides
 template <typename DType, typename Lang>
 void Transform(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Transform Not Implemented";

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 2a43468..dfe5724 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -54,34 +54,23 @@ cudnn requires tensor dimensions to fulfill 1 requirement:
           Tensor B has shape (2,3,4), cudnn requires shape of {1,2,3,4} to be the input
 */
 vector<int> generate_shape_cuda(const Tensor& x) {
-  Shape shape_ = x.shape();
+  Shape shape = x.shape();
+  CHECK_LE(shape.size(), 5) << "Dimensions (shape) beyond 5 are currently not supported" ;
   vector<int> shape_arr;
-  if (shape_.size() <= 4) {
-    for (size_t n = 0; n < 4 - shape_.size(); ++n) {
+  if (shape.size() <= 4) {
+    for (int n = 0; n < 4 - shape.size(); ++n) {
       shape_arr.push_back(1);
     }
-    for (size_t n = 0; n < shape_.size(); ++n) {
-      shape_arr.push_back(shape_.at(n));
-    }
-    return shape_arr;
-  } else if (shape_.size() == 5) {
-    for (size_t n = 0; n < shape_.size(); ++n) {
-      shape_arr.push_back(shape_.at(n));
-    }
-    return shape_arr;
-  } else {
-    LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
   }
+  for(auto x: shape)
+    shape_arr.push_back(static_cast<int>(x));
   return shape_arr;
 }
 
 int generate_dim_cuda(const Tensor& x) {
+  CHECK_LE(x.nDim(), 5) << "Dimensions (shape) beyond 5 are currently not supported" ;
   if (x.shape().size() <= 4) {return 4;}
-  else if (x.shape().size() == 5) {return 5;}
-  else {
-    LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
-  }
-  return 0;
+  else {return 5;}
 }
 
 /*
@@ -94,29 +83,17 @@ int generate_dim_cuda(const Tensor& x) {
     and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
   */
 vector<int> generate_strides_cuda(const Tensor& x) {
-  Shape shape_ = x.shape();
-  vector<int> strides_ = x.strides();
+  Shape shape = x.shape();
+  auto& strides = x.strides();
   vector<int> strides_arr;
-  int product = 1;
-  for (size_t n = 0; n < (shape_.size()); ++n) {
-    product *= shape_[n];
-  }
-  if (shape_.size() <= 4) {
-    for (size_t n = 0; n < 4 - shape_.size(); ++n) {
+  int product = Product(shape);
+  if (shape.size() <= 4) {
+    for (int n = 0; n < 4 - shape.size(); ++n) {
       strides_arr.push_back(product);
     }
-    for (size_t n = 0; n < strides_.size(); ++n) {
-      strides_arr.push_back(strides_[n]);
-    }
-    return strides_arr;
-  } else if (shape_.size() == 5) {
-    for (size_t n = 0; n < strides_.size(); ++n) {
-      strides_arr.push_back(strides_[n]);
-    }
-    return strides_arr;
-  } else {
-    LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
   }
+  for(auto x : strides)
+    strides_arr.push_back(static_cast<int>(x));
   return strides_arr;
 }
 
@@ -241,6 +218,22 @@ void Sub<float, lang::Cuda>(const Tensor& in1,
   }
 }
 
+template <>
+void Transform<float, lang::Cuda>(const Tensor& in, Tensor* out,
+                             Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+  float alpha = 1.0;
+  float beta = 0.0;
+
+  check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
+                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
+                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
+                        ));
+
+}
+
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
 template <>
@@ -254,14 +247,7 @@ void Clamp<float, lang::Cuda>(const float low,
   if (in.strides() == out->strides()) {
     cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::clamp(num, low, high, outPtr, outPtr, ctx->stream);
   }
 }
@@ -280,36 +266,18 @@ void Div<float, lang::Cuda>(const Tensor& in1,
   if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
     cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
   } else { //else we check whether in1 or in2 or both are transposed
-    float alpha = 1.0;
-    float beta = 0.0;
-
     if (in1.transpose() && in2.transpose()) {
       Tensor t(in1.shape(), in1.device(), in1.data_type());
-      float* tPtr = static_cast<float*>(t.block()->mutable_data());
-
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1,
-                           (void*)(&beta), generate_tensor_nd_desc(t), tPtr
-                          ));
+      Transform<float, lang::Cuda>(in1, &t, ctx);
+      Transform<float, lang::Cuda>(in2, out, ctx);
 
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
+      float* tPtr = static_cast<float*>(t.block()->mutable_data());
       cuda::div(num, tPtr, outPtr, outPtr, ctx->stream);
-
     } else if (in1.transpose()) {
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
+      Transform<float, lang::Cuda>(in1, out, ctx);
       cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
-
     } else if (in2.transpose()) {
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
+      Transform<float, lang::Cuda>(in2, out, ctx);
       cuda::div(num, inPtr1, outPtr, outPtr, ctx->stream);
     }
   }
@@ -325,14 +293,7 @@ void Div<float, lang::Cuda>(const float x, const Tensor& in,
   if (in.strides() == out->strides()) {
     cuda::div(num, x, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::div(num, x, outPtr, outPtr, ctx->stream);
   }
 }
@@ -366,36 +327,17 @@ void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
   if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
     cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
   } else { //else we check whether in1 or in2 or both are transposed
-    float alpha = 1.0;
-    float beta = 0.0;
-
     if (in1.transpose() && in2.transpose()) {
       Tensor t(in1.shape(), in1.device(), in1.data_type());
+      Transform<float, lang::Cuda>(in1, &t, ctx);
+      Transform<float, lang::Cuda>(in2, out, ctx);
       float* tPtr = static_cast<float*>(t.block()->mutable_data());
-
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1,
-                           (void*)(&beta), generate_tensor_nd_desc(t), tPtr
-                          ));
-
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
       cuda::mult(num, tPtr, outPtr, outPtr, ctx->stream);
-
     } else if (in1.transpose()) {
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
+      Transform<float, lang::Cuda>(in1, out, ctx);
       cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
-
     } else if (in2.transpose()) {
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
+      Transform<float, lang::Cuda>(in2, out, ctx);
       cuda::mult(num, inPtr1, outPtr, outPtr, ctx->stream);
     }
   }
@@ -413,14 +355,7 @@ void Exp<float, lang::Cuda>(const Tensor& in, Tensor* out,
   if (in.strides() == out->strides()) {
     cuda::exp(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::exp(num, outPtr, outPtr, ctx->stream);
   }
 }
@@ -435,14 +370,7 @@ void GE<float, lang::Cuda>(const Tensor& in, const float x,
   if (in.strides() == out->strides()) {
     cuda::ge(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::ge(num, outPtr, x, outPtr, ctx->stream);
   }
 }
@@ -451,10 +379,7 @@ void GE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   const size_t num = in1.Size();
-  //cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 
@@ -469,14 +394,7 @@ void GT<float, lang::Cuda>(const Tensor& in, const float x,
   if (in.strides() == out->strides()) {
     cuda::gt(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::gt(num, outPtr, x, outPtr, ctx->stream);
   }
 }
@@ -485,10 +403,7 @@ void GT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   const size_t num = in1.Size();
-  //cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 
@@ -502,14 +417,7 @@ void LE<float, lang::Cuda>(const Tensor& in, const float x,
   if (in.strides() == out->strides()) {
     cuda::le(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::le(num, outPtr, x, outPtr, ctx->stream);
   }
 }
@@ -518,10 +426,7 @@ void LE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   const size_t num = in1.Size();
-  //cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::le(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 
@@ -536,14 +441,7 @@ void Log<float, lang::Cuda>(const Tensor& in, Tensor* out,
   if (in.strides() == out->strides()) {
     cuda::log(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::log(num, outPtr, outPtr, ctx->stream);
   }
 }
@@ -558,14 +456,7 @@ void LT<float, lang::Cuda>(const Tensor& in, const float x,
   if (in.strides() == out->strides()) {
     cuda::lt(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::lt(num, outPtr, x, outPtr, ctx->stream);
   }
 }
@@ -574,10 +465,7 @@ void LT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   const size_t num = in1.Size();
-  //cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::lt(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 
@@ -592,14 +480,7 @@ void Pow<float, lang::Cuda>(const Tensor& in, const float x,
   if (in.strides() == out->strides()) {
     cuda::pow(num, inPtr, x, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::pow(num, outPtr, x, outPtr, ctx->stream);
   }
 }
@@ -617,36 +498,17 @@ void Pow<float, lang::Cuda>(const Tensor& in1,
   if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) {
     cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
   } else { //else we check whether in1 or in2 or both are transposed
-    float alpha = 1.0;
-    float beta = 0.0;
-
     if (in1.transpose() && in2.transpose()) {
       Tensor t(in1.shape(), in1.device(), in1.data_type());
       float* tPtr = static_cast<float*>(t.block()->mutable_data());
-
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1,
-                           (void*)(&beta), generate_tensor_nd_desc(t), tPtr
-                          ));
-
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
+      Transform<float, lang::Cuda>(in1, &t, ctx);
+      Transform<float, lang::Cuda>(in2, out, ctx);
       cuda::pow(num, tPtr, outPtr, outPtr, ctx->stream);
-
     } else if (in1.transpose()) {
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
+      Transform<float, lang::Cuda>(in1, out, ctx);
       cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
-
     } else if (in2.transpose()) {
-      check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                           (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2,
-                           (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                          ));
+      Transform<float, lang::Cuda>(in2, out, ctx);
       cuda::pow(num, inPtr1, outPtr, outPtr, ctx->stream);
     }
   }
@@ -694,14 +556,7 @@ void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
   if (in.strides() == out->strides()) {
     cuda::relu(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::relu(num, outPtr, outPtr, ctx->stream);
   }
 }
@@ -749,14 +604,7 @@ void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
   if (in.strides() == out->strides()) {
     cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::sigmoid(num, outPtr, outPtr, ctx->stream);
   }
 }
@@ -772,14 +620,7 @@ void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out,
   if (in.strides() == out->strides()) {
     cuda::sign(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::sign(num, outPtr, outPtr, ctx->stream);
   }
 }
@@ -788,15 +629,14 @@ void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out,
 template <>
 void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
 #if CUDNN_MAJOR < 7
+  Transform<float, lang::Cuda>(in, out, ctx);
   size_t num = in.Size();
-  cuda::sqrt(num, inPtr, outPtr, ctx->stream);
-
+  cuda::sqrt(num, outPtr, outPtr, ctx->stream);
 #else
-
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float alpha1 = 1.0;
   float alpha2 = 0.0;
   float beta = 0.0;
@@ -820,14 +660,7 @@ void Square<float, lang::Cuda>(const Tensor& in, Tensor* out,
   if (in.strides() == out->strides()) {
     cuda::square(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::square(num, outPtr, outPtr, ctx->stream);
   }
 }
@@ -883,34 +716,11 @@ void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
   if (in.strides() == out->strides()) {
     cuda::tanh(num, inPtr, outPtr, ctx->stream);
   } else { //else we transform in to out to store first
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, out, ctx);
     cuda::tanh(num, outPtr, outPtr, ctx->stream);
   }
 }
 
-template <>
-void Transform<float, lang::Cuda>(const Tensor& in, Tensor* out,
-                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in.block()->data());
-  float* outPtr = static_cast<float*>(out->block()->mutable_data());
-
-  float alpha = 1.0;
-  float beta = 0.0;
-
-  check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(*out), outPtr
-                        ));
-
-}
-
 // ================Random functions===========================================
 /// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
 // Get the random generator from 'ctx'
@@ -1175,16 +985,7 @@ void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
 
   if (in.transpose()) {
     Tensor t(in.shape(), in.device(), in.data_type());
-    float* tPtr = static_cast<float*>(t.block()->mutable_data());
-
-    float alpha = 1.0;
-    float beta = 0.0;
-
-    check_cudnn(cudnnTransformTensor(ctx->cudnn_handle,
-                         (void*)(&alpha), generate_tensor_nd_desc(in), inPtr,
-                         (void*)(&beta), generate_tensor_nd_desc(t), tPtr
-                        ));
-
+    Transform<float, lang::Cuda>(in, &t, ctx);
     const float* tPtr_const = static_cast<const float*>(t.block()->data());
     cuda::RowMax(nrow, ncol, tPtr_const, outPtr, ctx->stream);
   } else {

[4/4] incubator-singa git commit: SINGA-380) Fix bugs from Reshape

Reply via email to