SINGA-380) Fix bugs from Reshape. Update the reshape API in C++ and Python. The C++ Tensor method Reshape changes the original tensor; all other reshape methods return a new tensor (which shares memory with the original tensor if possible).
APIs for transpose are updated in the same way. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/b30d7ea5 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/b30d7ea5 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/b30d7ea5 Branch: refs/heads/master Commit: b30d7ea55cd58bb0858aa354833c1ba9a3242470 Parents: 58e6640 Author: Wang Wei <[email protected]> Authored: Mon Jul 9 23:52:10 2018 +0800 Committer: wang wei <[email protected]> Committed: Wed Jul 11 15:24:27 2018 +0800 ---------------------------------------------------------------------- examples/autograd/mnist_cnn.py | 17 +- examples/cifar10/cnn-parallel.cc | 8 +- examples/cifar10/vgg-parallel.cc | 8 +- examples/imagenet/alexnet/alexnet.cc | 2 +- examples/imagenet/alexnet/ilsvrc12.h | 16 +- include/singa/core/tensor.h | 162 ++++---- python/singa/autograd.py | 273 +++++++------- python/singa/tensor.py | 109 +++--- src/api/core_tensor.i | 19 +- src/core/tensor/tensor.cc | 297 ++++----------- src/core/tensor/tensor_math.h | 2 +- src/core/tensor/tensor_math_cuda.h | 323 ++++------------ src/io/image_transformer.cc | 573 ++++++++++++++--------------- src/model/layer/batchnorm.cc | 15 +- src/model/layer/convolution.cc | 8 +- src/model/layer/cudnn_batchnorm.cc | 4 +- src/model/layer/dense.cc | 14 +- src/model/layer/flatten.cc | 3 +- src/model/layer/lrn.cc | 9 +- src/model/layer/opencl_convolution.cc | 58 +-- src/model/layer/rnn.cc | 2 +- src/model/operation/convolution.cc | 67 ++-- src/model/updater/local_updater.cc | 4 +- 23 files changed, 849 insertions(+), 1144 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/autograd/mnist_cnn.py ---------------------------------------------------------------------- diff --git a/examples/autograd/mnist_cnn.py b/examples/autograd/mnist_cnn.py index 
43a22ba..f78ccc8 100755 --- a/examples/autograd/mnist_cnn.py +++ b/examples/autograd/mnist_cnn.py @@ -84,7 +84,7 @@ if __name__ == '__main__': dev = device.get_default_device() else: print('Using GPU') - dev = device.create_cuda_gpu() + dev = device.create_cuda_gpu_on(1) train, test = load_data(args.file_path) @@ -92,7 +92,7 @@ if __name__ == '__main__': num_classes = 10 epochs = 1 - sgd = optimizer.SGD(0.001) + sgd = optimizer.SGD(0.01) x_train = preprocess(train[0]) y_train = to_categorical(train[1], num_classes) @@ -111,7 +111,6 @@ if __name__ == '__main__': def forward(x, t): - y = conv1(x) y = autograd.relu(y) y = autograd.max_pool_2d(y) @@ -124,11 +123,11 @@ if __name__ == '__main__': return loss, y autograd.training = True - for epoch in range(50): + for epoch in range(epochs): for i in range(batch_number): inputs = tensor.Tensor(device=dev, data=x_train[ i * 100:(1 + i) * 100], stores_grad=False) targets = tensor.Tensor(device=dev, data=y_train[i * 100:(1 + i) * 100], requires_grad=False, stores_grad=False) - + loss, y = forward(inputs, targets) accuracy_rate = accuracy(tensor.to_numpy(y), @@ -136,12 +135,6 @@ if __name__ == '__main__': if (i % 5 == 0): print('accuracy is:', accuracy_rate, 'loss is:', tensor.to_numpy(loss)[0]) - + for p, gp in autograd.backward(loss): sgd.apply(epoch, gp, p, '') - - - - - - http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/cifar10/cnn-parallel.cc ---------------------------------------------------------------------- diff --git a/examples/cifar10/cnn-parallel.cc b/examples/cifar10/cnn-parallel.cc index 8cc3352..4bee575 100644 --- a/examples/cifar10/cnn-parallel.cc +++ b/examples/cifar10/cnn-parallel.cc @@ -154,20 +154,20 @@ void Train(float lr, int num_epoch, string data_dir) { train_y = train.second; LOG(INFO) << "Slicing training data..."; - train_x_1.Reshape(Shape{nsamples / 2, train.first.shape(1), + train_x_1 = Tensor(Shape{nsamples / 2, train.first.shape(1), train.first.shape(2), 
train.first.shape(3)}); LOG(INFO) << "Copying first data slice..."; CopyDataToFrom(&train_x_1, train_x, train_x.Size() / 2); - train_x_2.Reshape(Shape{nsamples / 2, train.first.shape(1), + train_x_2 = Tensor(Shape{nsamples / 2, train.first.shape(1), train.first.shape(2), train.first.shape(3)}); LOG(INFO) << "Copying second data slice..."; CopyDataToFrom(&train_x_2, train_x, train_x.Size() / 2, 0, train_x.Size() / 2); - train_y_1.Reshape(Shape{nsamples / 2}); + train_y_1 = Tensor(Shape{nsamples / 2}); train_y_1.AsType(kInt); LOG(INFO) << "Copying first label slice..."; CopyDataToFrom(&train_y_1, train_y, train_y.Size() / 2); - train_y_2.Reshape(Shape{nsamples / 2}); + train_y_2 = Tensor(Shape{nsamples / 2}); train_y_2.AsType(kInt); LOG(INFO) << "Copying second label slice..."; CopyDataToFrom(&train_y_2, train_y, train_y.Size() / 2, 0, http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/cifar10/vgg-parallel.cc ---------------------------------------------------------------------- diff --git a/examples/cifar10/vgg-parallel.cc b/examples/cifar10/vgg-parallel.cc index 90e9fce..33c533b 100644 --- a/examples/cifar10/vgg-parallel.cc +++ b/examples/cifar10/vgg-parallel.cc @@ -223,20 +223,20 @@ void Train(float lr, int num_epoch, string data_dir) { train_y = train.second; LOG(INFO) << "Slicing training data..."; - train_x_1.Reshape(Shape{nsamples / 2, train.first.shape(1), + train_x_1 = Tensor(Shape{nsamples / 2, train.first.shape(1), train.first.shape(2), train.first.shape(3)}); LOG(INFO) << "Copying first data slice..."; CopyDataToFrom(&train_x_1, train_x, train_x.Size() / 2); - train_x_2.Reshape(Shape{nsamples / 2, train.first.shape(1), + train_x_2 = Tensor(Shape{nsamples / 2, train.first.shape(1), train.first.shape(2), train.first.shape(3)}); LOG(INFO) << "Copying second data slice..."; CopyDataToFrom(&train_x_2, train_x, train_x.Size() / 2, 0, train_x.Size() / 2); - train_y_1.Reshape(Shape{nsamples / 2}); + train_y_1 = Tensor(Shape{nsamples / 
2}); train_y_1.AsType(kInt); LOG(INFO) << "Copying first label slice..."; CopyDataToFrom(&train_y_1, train_y, train_y.Size() / 2); - train_y_2.Reshape(Shape{nsamples / 2}); + train_y_2 = Tensor(Shape{nsamples / 2}); train_y_2.AsType(kInt); LOG(INFO) << "Copying second label slice..."; CopyDataToFrom(&train_y_2, train_y, train_y.Size() / 2, 0, http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/imagenet/alexnet/alexnet.cc ---------------------------------------------------------------------- diff --git a/examples/imagenet/alexnet/alexnet.cc b/examples/imagenet/alexnet/alexnet.cc index 4ac1130..2d8db2d 100644 --- a/examples/imagenet/alexnet/alexnet.cc +++ b/examples/imagenet/alexnet/alexnet.cc @@ -174,7 +174,7 @@ void TrainOneEpoch(FeedForwardNet &net, ILSVRC &data, size_t b = 0; size_t n_read; Timer timer, ttr; - Tensor prefetch_x, prefetch_y; + Tensor prefetch_x(Shape{batchsize, 3, kCropSize, kCropSize}), prefetch_y(Shape{batchsize}, kInt); string binfile = bin_folder + "/train1.bin"; timer.Tick(); data.LoadData(kTrain, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read, http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/examples/imagenet/alexnet/ilsvrc12.h ---------------------------------------------------------------------- diff --git a/examples/imagenet/alexnet/ilsvrc12.h b/examples/imagenet/alexnet/ilsvrc12.h index 74fffbb..05b3451 100644 --- a/examples/imagenet/alexnet/ilsvrc12.h +++ b/examples/imagenet/alexnet/ilsvrc12.h @@ -43,6 +43,12 @@ using std::string; using namespace singa::io; namespace singa { + + /// size for resizing +const size_t kImageSize = 256; +const size_t kImageNBytes = 3 * kImageSize * kImageSize; +/// size for cropping +const size_t kCropSize = 227; /// For reading ILSVRC2012 image data as tensors. 
class ILSVRC { public: @@ -105,11 +111,7 @@ class ILSVRC { void WriteMean(Tensor &mean, string path); private: - /// size for resizing - const size_t kImageSize = 256; - const size_t kImageNBytes = 3 * kImageSize * kImageSize; - /// size for cropping - const size_t kCropSize = 227; + Tensor mean; string last_read_file = ""; @@ -299,9 +301,7 @@ std::thread ILSVRC::AsyncLoadData(int flag, string file, size_t read_size, size_t ILSVRC::LoadData(int flag, string file, size_t read_size, Tensor *x, Tensor *y, size_t *n_read, int nthreads) { - x->Reshape(Shape{read_size, 3, kCropSize, kCropSize}); - y->AsType(kInt); - y->Reshape(Shape{read_size}); + if (file != last_read_file) { if (reader != nullptr) { reader->Close(); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/include/singa/core/tensor.h ---------------------------------------------------------------------- diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h index 5921762..a73821c 100644 --- a/include/singa/core/tensor.h +++ b/include/singa/core/tensor.h @@ -57,47 +57,38 @@ class Tensor { public: ~Tensor(); Tensor(); - explicit Tensor(Shape &&shape, DataType dtype = kFloat32); + + /// Constructor using default device. explicit Tensor(const Shape &shape, DataType dtype = kFloat32); - Tensor(Shape &&shape, - std::shared_ptr<Device> dev, - DataType dtype = kFloat32); + /// Constructor with shape, device and data type Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32); - /// Copy Tensor to share the internal data. No deep copy. + /// Copy constructor. No deep copy. Tensor(const Tensor &from); - /// Copy Tensor to share the internal data. No deep copy. - /// For 2 tensors sharing same block but different strides. - Tensor(const Tensor &from, Shape &new_shape, vector<int> &new_strides); - /// Copy Tensor to share the internal data. No deep copy. + + /// Move constructor. No deep copy. 
Tensor(Tensor &&from); + // -------------------------------------------------------------------------- + // ---Following methods return info of the class without making any changes-- + // -------------------------------------------------------------------------- + /// For functions in xx_math.cc to access the block. /// Users should not operate against Block directly. /// block_ is allocated in constructors. Block *block() const { return block_; } - void SetBlock(Block *block); std::shared_ptr<Device> device() const { return device_; } - /// return immutable Tensor values with given type. + /// Return immutable Tensor values with given type. template <typename SType> const SType *data() const { return static_cast<const SType *>(block()->data()); } - /// used for swig code to convert Tensor into numpy array. - /// It gets data into 'value' - template <typename SType> - void GetValue(SType *value, const size_t num) { - CHECK(device_ == defaultDevice); - const SType* ptr = data<SType>(); - for (size_t i = 0; i < num; i++) value[i] = ptr[i]; - } - /// data type, including kFloat16, kFloat32, kInt const DataType data_type() const { return data_type_; } @@ -113,28 +104,55 @@ class Tensor { bool empty() const { return nDim() == 0; } /// Check if the tensor's last stride==1 - bool transpose() const { return (strides_.back() != 1); } + bool transpose() const { + if (!strides_.empty()) { + auto last = strides_.front(); + for (auto s : strides_) { + if (s > last) + return true; + last = s; + } + } + return false; + } const vector<int>& strides() const { return strides_; } - /// return true if the content of the tensor is initialized + /// Return true if the content of the tensor is initialized bool initailized() const { return block_ != nullptr && block_->initialized(); } - /// return number of total elements + /// Return number of total elements size_t Size() const { if (block_ == nullptr) return 0u; CHECK_EQ(block_->size() % SizeOf(data_type_), 0u); return block_->size() / 
SizeOf(data_type_); } - /// return memory size (i.e., Bytes) + /// Return memory size (i.e., Bytes) size_t MemSize() const { return block_->size(); } - /// Reset the tensor shape, it may reallocate block, if MemSize() changes. - Tensor Reshape(const Shape &shape); - Tensor Reshape(Shape &&shape); + /// used for swig code to convert Tensor into numpy array. + /// It gets data into 'value' + template <typename SType> + void GetValue(SType *value, const size_t num) { + CHECK(device_ == defaultDevice); + const SType* ptr = data<SType>(); + for (size_t i = 0; i < num; i++) value[i] = ptr[i]; + } + + /// Serialize data, shape and transpose to protobuf object. + void ToProto(singa::TensorProto *proto) const; + + /// Return average L1 norm + float L1() const; + + /// Return average L2 norm + float L2() const; + // -------------------------------------------------------------------------- + // ---Following methods changes the internal members + // -------------------------------------------------------------------------- /// Reset the shape, device, and data type as given tensor. /// If block size changes, then reallocate a new block. @@ -155,6 +173,8 @@ class Tensor { template <typename SType> void SetValue(const SType x); + void SetShape(const Shape& shape); + /// For init the tensor values, copy 'num' elements from 'src' to the internal /// memory with 'offset' (elements). template <typename SType> @@ -165,46 +185,41 @@ class Tensor { /// Meta data would not be copied! void CopyData(const Tensor &other); - void RepeatData(vector<size_t> repeats, int axis, int total_repeats, const Tensor &other); - /// Deserialize data, shape and transpose from protobuf object. void FromProto(const singa::TensorProto &proto); - /// Serialize data, shape and transpose to protobuf object. - void ToProto(singa::TensorProto *proto) const; - /// return an exactly the same Tensor with data been deep copied to the given - /// device. If 'device' is nullptr, then clone it one the current device. 
- Tensor Clone(std::shared_ptr<Device> device = nullptr) const; + /// TODO(wangwei) merge RepeatData into Repeat? + void RepeatData(const vector<size_t>& repeats, int axis, int total_repeats, + const Tensor &other); - Tensor Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device> device = nullptr) ; + // -------------------------------------------------------------------------- + // ---Following methods returns a new Tensor without change original tensor + // -------------------------------------------------------------------------- - // Tensor operations - - /// Matrix transpose. Valid only if shape.size() == 2. - /// No data copy, just set the transpose_ filed of the returned tensor. - Tensor T() const; - - /// Reverse the shape vector - Tensor Transpose() const; + Tensor Repeat(const vector<size_t>& repeats, int axis, + std::shared_ptr<Device> device = nullptr); - /// Change the axes - Tensor Transpose(const vector<size_t> &axes) const; + /// return an exactly the same Tensor with data been deep copied to the given + /// device. If 'device' is nullptr, then clone it one the current device. + Tensor Clone(std::shared_ptr<Device> device = nullptr) const; - /// Copy the meta info with data block shared. + // -------------------------------------------------------------------------- + // ---Following methods change the tensor and return itself + // -------------------------------------------------------------------------- + /// Copy assignment Tensor &operator=(const Tensor &in); - /// Copy the meta info with data block shared. + /// Move assignment Tensor &operator=(Tensor &&in); Tensor &operator+=(const Tensor &in); - // void operator+=(Tensor&& in); + Tensor &operator-=(const Tensor &in); - // void operator-=(Tensor&& in); + Tensor &operator*=(const Tensor &in); - // void operator*=(Tensor&& in); + Tensor &operator/=(const Tensor &in); - // void operator/=(Tensor&& in); // Scalar operations. 
@@ -224,10 +239,19 @@ class Tensor { template <typename SType> Tensor &operator/=(const SType x); - /// Return average L1 norm - float L1() const; - /// Return average L2 norm - float L2() const; + /// change the shape (and stride); the block may be reallocated. + Tensor &Reshape(const Shape &shape); + + /// Matrix transpose. Valid only if shape.size() == 2. + Tensor& T(); + + /// Reverse the shape vector + Tensor& Transpose(); + + /// Change the axes + Tensor& Transpose(const vector<size_t> &axes); + + protected: //generate strides automatically if stride field is not passed void generate_strides() { @@ -259,10 +283,10 @@ class Tensor { vector<int> strides_ = {}; }; //end of tensor class + inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) { if (len == 0) len = shape.size(); - if (len == 0) - return 0; + if (len == 0) return 0; CHECK_LE(len, shape.size()); size_t v = 1; for (unsigned int i = start; i < len; i++) v *= shape[i]; @@ -275,24 +299,31 @@ inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) { CHECK_EQ(in1.device()->lang(), in2.device()->lang()); } + template <typename FromType, typename ToType> ToType TypeCast(const FromType &x) { // TODO(wangwei) cast fp16; prevent some casts, e.g., float to char return static_cast<ToType>(x); } + +/// Reshape the given tensor and generate a new tensor, +/// which shares the memory with in if possible Tensor Reshape(const Tensor &in, const Shape &s); -Tensor Reshape(const Tensor &in, Shape &&s); -// For tensors with sparse content, e.g., missing columns or rows. -// class SparseTensor : public Tensor {}; +/// Reverse the shape vector +Tensor Transpose(const Tensor& in); + +/// Change the axes +Tensor Transpose(const Tensor& in, const vector<size_t> &axes); /// Copy 'num' elements of src to dst. /// The first 'src_offset' ('dst_offset') elements will be skipped. 
void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num, const size_t dst_offset = 0, const size_t src_offset = 0); -void RepeatDataToFrom(bool broadcast_flag, vector<size_t> repeats, int axis, + +void RepeatDataToFrom(bool broadcast_flag, const vector<size_t>& repeats, int axis, Tensor *dst, const Tensor &in, const size_t num); // =============Element-wise operations==================================== @@ -411,6 +442,8 @@ void Div(const SType x, const Tensor &in, Tensor *out); template <typename SType = float> SType Sum(const Tensor &in); + + // ============Matrix (row/column) operations================================== /// Average elements in the Tensor, currently only support vector and matrix. /// if 'axis' is 0, average all rows into a single row @@ -510,8 +543,8 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p); /// To be called by pysinga autograd operations; /// swig ignores the const qualifier http://www.swig.org/Doc3.0/SWIGPlus.html#SWIGPlus_const -const Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t); -const Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t); +Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t); +Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t); /// Return a tensor consisting of rows ([start, end)) from 'in'. It copies the /// values from 'in'. 'in' ia a 2D Tensor. @@ -519,7 +552,8 @@ Tensor CopyRows(const Tensor &in, const size_t start, const size_t end); /// Alias of CopyRows Tensor SliceRows(const Tensor &in, const size_t start, const size_t end); /// Slice the input tensor along the give axis to generate a new tensor -Tensor SliceOn(const Tensor &in, const size_t start, const size_t end, int axis); +Tensor SliceOn(const Tensor &in, const size_t start, const size_t end, + int axis); /// Return a tensor consisting of columns ([start, end)) from 'in'. It copies /// the values from 'in'. 'in' is a 2D Tensor. 
Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/python/singa/autograd.py ---------------------------------------------------------------------- diff --git a/python/singa/autograd.py b/python/singa/autograd.py index 63698c2..aa6b37a 100755 --- a/python/singa/autograd.py +++ b/python/singa/autograd.py @@ -33,6 +33,126 @@ CTensor = singa.Tensor training = False + +def infer_dependency(op): + ''' + Infer the dependency of all operations with the + given op as the last operation. + + Operation A is depending on B is A uses the output(s) of B. + + Args: + op: an Operation instance, e.g. the loss operation. + + Return: + a Counter instance with the operation as the key, + and the number of operations that are depending on it as the value + ''' + # dependency = {} + dependency_count = Counter() + queue = deque([op]) + while len(queue) > 0: + cur_op = queue.pop() + for src_op, _, _, _ in cur_op.src: + if src_op not in dependency_count and \ + (not isinstance(src_op, Dummy)): + # dependency[src_op] = [Counter() for _ in src_op.y_id2idx] + dependency_count[src_op] = 0 + queue.append(src_op) + # y_idx = src_op.y_id2idx[x_id] + # dependency[src_op][y_idx][cur_op] += 1 + dependency_count[src_op] += 1 + return dependency_count + + +def gradients(y, dy=None): + grads = {} # mapping: x->dx if x.stores_grad + for p, dp in backward(y, dy): + gradients[p] = dp + return grads + + +def backward(y, dy=None): + ''' + Run the backward propagation starting at y. + + Args: + y: a Tensor instance, usually the loss + dy: a number or a Tensor instance, for the gradient of the + objective/loss w.r.t y, usually 1.0 + + Return: + a dictionary storing the gradient tensors of all tensors + whose stores_grad is true (e.g. 
parameter tensors) + ''' + dependency = infer_dependency(y.creator) + assert y.size() == 1, 'y must be a Tensor with a single value;'\ + 'size of y is % d' % y.size() + + # by default the dy is a tensor with 1.0 for each sample; + if dy is None: + dy = float(1.0) + elif isinstance(dy, Tensor): + dy = dy.data + else: + dy = float(dy) + + # ready is a queue of (operation, dy list) + ready = deque([(y.creator, (dy,))]) + not_ready = {} # mapping: op->[dy] + + if y.stores_grad: + gradients[y] = dy + + while len(ready) > 0: + op, dys = ready.pop() + if not op.requires_grad or isinstance(op, Dummy): + continue + # if not isinstance(op, tensor.Dummy): + dxs = op._do_backward(*dys) + # TODO src and dx must match + assert len(op.src) == len(dxs), \ + 'the number of src ops (=%d) and dx (=%d) not match' \ + % (len(op.src), len(dxs)) + for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs): + # prefix x is w.r.t op; prefix y is w.r.t src_op. + # x_id is the python id of one input arg of src_op, denoted as x. + # y_idx (below) is the index of x among the outputs of src_op. + # not_ready[src_op][y_idx] records the intermediate gradient + # of the y_idx'th output of src_op. 'intermediate gradient' + # indicates that if this output is used in multiple children + # operations, then we have to add the graident (dx) from all these + # children operations. When src_op is ready, it means that + # the gradient of all its outputs are available, i.e. all children + # operations have been backwarded. 
+ # y is None if y.stores_grad is false; otherwise it is a Tensor + y_idx = src_op.y_id2idx[x_id] + if src_op not in not_ready: + # src_op may have mulitple outputs + not_ready[src_op] = [None for _ in src_op.y_id2idx] + not_ready[src_op][y_idx] = dx + else: + dxs = not_ready[src_op] + if dxs[y_idx] is None: + dxs[y_idx] = dx + else: + # add the gradient from another children operation that + # uses y_idx'th output of src_op as input arg + dxs[y_idx] += dx + if y_stores_grad: + # store the gradient for final return, e.g. if x is parameter + g = not_ready[src_op][y_idx] + tg = Tensor(device=g.device(), data=g) + yield (y, tg) + dependency[src_op] -= 1 + if src_op.requires_grad is True: + if dependency[src_op] == 0: + if not isinstance(src_op, Dummy): + ready.append((src_op, not_ready[src_op])) + del not_ready[src_op] + del op # delete the operation to free all tensors from this op + + class Operation(object): ''' An operation includes the forward and backward function of @@ -194,8 +314,8 @@ class Matmul(Operation): Returns: a tuple for (dx, dw) ''' - return singa.Mult(dy, self.input[1].T()), \ - singa.Mult(self.input[0].T(), dy) + return singa.Mult(dy, singa.DefaultTranspose(self.input[1])), \ + singa.Mult(singa.DefaultTranspose(self.input[0]), dy) def matmul(x, w): @@ -268,12 +388,12 @@ class SoftMax(Operation): the result Tensor ''' if self.axis == 1: - x = x.T() + x = singa.DefaultTranspose(x) self.output = singa.SoftMax(x) if self.axis == 0: return self.output elif self.axis == 1: - return self.output.T() + return singa.DefaultTranspose(self.output) def backward(self, dy): ''' @@ -286,7 +406,7 @@ class SoftMax(Operation): ''' # calculations are made on numpy array if self.axis == 1: - dy = dy.T() + dy = singa.DefaultTranspose(dy) grad = ctensor2numpy(dy) output = ctensor2numpy(self.output) out_1 = np.einsum('ki,ki->ki', grad, output) @@ -298,14 +418,14 @@ class SoftMax(Operation): if self.axis == 0: return dx elif self.axis == 1: - return dx.T() + return 
singa.DefaultTranspose(dx) def soft_max(x, axis=0): return SoftMax(axis)(x)[0] -class NLL(Operation): +class CrossEntropy(Operation): ''' Calculte negative log likelihood loss for a batch of training data. @@ -350,12 +470,11 @@ class NLL(Operation): pass # TODO, broadcast elementwise multiply seems not support -def nll(y, t): - return NLL()(y, t)[0] +def cross_entropy(y, t): + return CrossEntropy()(y, t)[0] class SoftMaxCrossEntropy(Operation): - def forward(self, x, t): self.p = singa.SoftMax(x) self.t = t @@ -365,7 +484,8 @@ class SoftMaxCrossEntropy(Operation): return loss def backward(self, dy=1.0): - return singa.SoftmaxCrossEntropyBwd(self.p, self.t), None + dx = singa.SoftmaxCrossEntropyBwd(self.p, self.t) + return singa.DivFloat(dx, float(self.p.shape()[0])), None def softmax_cross_entropy(x, t): @@ -448,11 +568,11 @@ class Flatten(Operation): def forward(self, x): # TODO Do flatten start from axis != 1 self.shape = list(x.shape()) - y = x.Reshape((x.shape()[0], x.Size() // x.shape()[0])) + y = singa.Reshape(x, (x.shape()[0], x.Size() // x.shape()[0])) return y def backward(self, dy): - dx = dy.Reshape(self.shape) + dx = singa.Reshape(dy, self.shape) return dx @@ -466,11 +586,7 @@ class _Conv2D(Operation): self.handle = handle def forward(self, x, W, b): - #assert x.nDim() == 4, 'The dimensions of input should be 4D.' - #assert x.shape()[1] == self.in_channels, 'in_channels dismatched.' - #assert (xs[0].shape()[2]+2*self.padding[0]-self.kernel_size[0])%self.stride[0] == 0, 'invalid padding.' - #assert (xs[0].shape()[3]+2*self.padding[1]-self.kernel_size[1])%self.stride[1] == 0, 'invalid padding' - #assert 0 == 0, 'invalid padding' + assert x.nDim() == 4, 'The dimensions of input should be 4D.' if training: if self.handle.bias_term: @@ -517,125 +633,6 @@ def conv2d(x, W, b, handle): return _Conv2D(handle)(x, W, b)[0] -def infer_dependency(op): - ''' - Infer the dependency of all operations with the - given op as the last operation. 
- - Operation A is depending on B is A uses the output(s) of B. - - Args: - op: an Operation instance, e.g. the loss operation. - - Return: - a Counter instance with the operation as the key, - and the number of operations that are depending on it as the value - ''' - # dependency = {} - dependency_count = Counter() - queue = deque([op]) - while len(queue) > 0: - cur_op = queue.pop() - for src_op, _, _, _ in cur_op.src: - if src_op not in dependency_count and \ - (not isinstance(src_op, Dummy)): - # dependency[src_op] = [Counter() for _ in src_op.y_id2idx] - dependency_count[src_op] = 0 - queue.append(src_op) - # y_idx = src_op.y_id2idx[x_id] - # dependency[src_op][y_idx][cur_op] += 1 - dependency_count[src_op] += 1 - return dependency_count - - -def gradients(y, dy=None): - grads = {} # mapping: x->dx if x.stores_grad - for p, dp in backward(y, dy): - gradients[p] = dp - return grads - - -def backward(y, dy=None): - ''' - Run the backward propagation starting at y. - - Args: - y: a Tensor instance, usually the loss - dy: a number or a Tensor instance, for the gradient of the - objective/loss w.r.t y, usually 1.0 - - Return: - a dictionary storing the gradient tensors of all tensors - whose stores_grad is true (e.g. 
parameter tensors) - ''' - dependency = infer_dependency(y.creator) - assert y.size() == 1, 'y must be a Tensor with a single value;'\ - 'size of y is % d' % y.size() - - # by default the dy is a tensor with 1.0 for each sample; - if dy is None: - dy = float(1.0) - elif isinstance(dy, Tensor): - dy = dy.data - else: - dy = float(dy) - - # ready is a queue of (operation, dy list) - ready = deque([(y.creator, (dy,))]) - not_ready = {} # mapping: op->[dy] - - if y.stores_grad: - gradients[y] = dy - - while len(ready) > 0: - op, dys = ready.pop() - if not op.requires_grad or isinstance(op, Dummy): - continue - # if not isinstance(op, tensor.Dummy): - dxs = op._do_backward(*dys) - # TODO src and dx must match - assert len(op.src) == len(dxs), \ - 'the number of src ops (=%d) and dx (=%d) not match' \ - % (len(op.src), len(dxs)) - for (src_op, x_id, y, y_stores_grad), dx in zip(op.src, dxs): - # prefix x is w.r.t op; prefix y is w.r.t src_op. - # x_id is the python id of one input arg of src_op, denoted as x. - # y_idx (below) is the index of x among the outputs of src_op. - # not_ready[src_op][y_idx] records the intermediate gradient - # of the y_idx'th output of src_op. 'intermediate gradient' - # indicates that if this output is used in multiple children - # operations, then we have to add the graident (dx) from all these - # children operations. When src_op is ready, it means that - # the gradient of all its outputs are available, i.e. all children - # operations have been backwarded. 
- # y is None if y.stores_grad is false; otherwise it is a Tensor - y_idx = src_op.y_id2idx[x_id] - if src_op not in not_ready: - # src_op may have mulitple outputs - not_ready[src_op] = [None for _ in src_op.y_id2idx] - not_ready[src_op][y_idx] = dx - else: - dxs = not_ready[src_op] - if dxs[y_idx] is None: - dxs[y_idx] = dx - else: - # add the gradient from another children operation that - # uses y_idx'th output of src_op as input arg - dxs[y_idx] += dx - if y_stores_grad: - # store the gradient for final return, e.g. if x is parameter - g = not_ready[src_op][y_idx] - tg = Tensor(device=g.device(), data=g) - yield (y, tg) - dependency[src_op] -= 1 - if src_op.requires_grad is True: - if dependency[src_op] == 0: - if not isinstance(src_op, Dummy): - ready.append((src_op, not_ready[src_op])) - del not_ready[src_op] - del op # delete the operation to free all tensors from this op - - class Layer(object): def __init__(self): @@ -651,8 +648,6 @@ class Layer(object): class Linear(Layer): def __init__(self, in_features, out_features, bias=True): - #self.in_features = in_features - #self.out_features = out_features w_shape = (in_features, out_features) b_shape = (1, out_features) self.bias = bias http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/python/singa/tensor.py ---------------------------------------------------------------------- diff --git a/python/singa/tensor.py b/python/singa/tensor.py index 0860d9d..46a47b7 100644 --- a/python/singa/tensor.py +++ b/python/singa/tensor.py @@ -134,7 +134,7 @@ class Tensor(object): ''' return self.data.transpose() - def transpose(self,axes = None): + def transpose(self, axes=None): ''' To transpose the tensor ''' @@ -142,13 +142,13 @@ class Tensor(object): if axes == None: tshape = [self.shape[x] for x in range(len(t.shape))] t.shape = tuple(tshape) - t.data = self.data.Transpose() + t.data = singa.DefaultTranspose(self.data) else: if(len(axes) != len(self.shape)): raise ValueError('dimensions do not match') 
tshape = [self.shape[x] for x in axes] t.shape = tuple(tshape) - t.data = self.data.Transpose(list(axes)) + t.data = singa.Transpose(self.data, list(axes)) return t def size(self): # TODO(wangwei) compute size @@ -166,17 +166,18 @@ class Tensor(object): return self.data.MemSize() def reshape(self, shape): - '''Change the tensor shape. + '''Return a new tensor with the given shape, and the original + tensor is not changed. Args: - shape (list<int>): new shape, which should have the same volumn as - the original shape. + shape (list<int>): new shape, which should have the same + volumn as the original shape. ''' t = Tensor(self.shape, self.device, self.dtype) assert product(self.shape) == product(shape), \ 'product of shape should be equal' t.shape = shape - t.data = self.data.Reshape(list(shape)) + t.data = singa.Reshape(self.data, shape) return t def reset_like(self, t): @@ -283,38 +284,41 @@ class Tensor(object): Return: the tensor which has been repeated - + ''' t = Tensor() t_ndim = self.ndim() if isinstance(repeats, int) or isinstance(repeats, long): if repeats < 0: - raise ValueError("'repeats' should not be negative: {}".format(repeats)) + raise ValueError( + "'repeats' should not be negative: {}".format(repeats)) if axis != None and axis < 0: axis += t_ndim # broadcast = True if axis == None: axis = 9999 - t.shape = (product(self.shape)*repeats,) - Repeats = [repeats,] + t.shape = (product(self.shape) * repeats,) + Repeats = [repeats, ] t.data = self.data.Repeat(Repeats, axis) elif axis >= 0: t_shape = list(self.shape) - t_shape[axis] = self.shape[axis]*repeats + t_shape[axis] = self.shape[axis] * repeats t.shape = tuple(t_shape) - Repeats = [repeats,] + Repeats = [repeats, ] t.data = self.data.Repeat(Repeats, axis) elif isinstance(repeats, tuple) or isinstance(repeats, list): for rep in repeats: if rep < 0: - raise ValueError("'repeats' should be int or sequence: {}".format(repeats)) + raise ValueError( + "'repeats' should be int or sequence: 
{}".format(repeats)) if axis != None and axis < 0: axis += t_ndim if axis == None: axis = 9999 - raise ValueError("when axis us None, 'repeats' should be int: {}".format(repeats)) + raise ValueError( + "when axis us None, 'repeats' should be int: {}".format(repeats)) elif axis >= 0: t_shape = list(self.shape) t_shape[axis] = sum(repeats) @@ -323,16 +327,15 @@ class Tensor(object): else: raise ValueError('repeats should be int or sequence') - return t + return t def T(self): - ''' shallow copy, negate the transpose field. + ''' shallow copy. Returns: - a new Tensor which shares the underlying data memory (shallow copy) - but is marked as a transposed version of this tensor. + a new Tensor which shares the underlying data memory (shallow copy). ''' - return _call_singa_func(self.data.T) + return _call_singa_func(singa.DefaultTranspose, self.data) def copy(self): '''shallow copy calls copy constructor of singa::Tensor @@ -611,8 +614,9 @@ def sizeof(dtype): return singa.SizeOf(dtype) -def reshape(t, s): - '''Reshape the input tensor with the given shape. 
+def reshape(tensor, shape): + '''Reshape the input tensor with the given shape and + the original tensor is not changed Args: t (Tensor): the tensor to be changed @@ -624,12 +628,8 @@ def reshape(t, s): ''' return _call_singa_func(singa.Reshape, t.data, s) -def Reshape(t,s): - - ret = t.reshape(s) - return ret -def transpose(t,axes = None): +def transpose(t, axes=None): ''' Returns: the transposed tensor @@ -796,6 +796,7 @@ def tanh(t): ''' return _call_singa_func(singa.Tanh, t.data) + def sum(t, axis=None, out=None): '''Sum of tensor elements over given axis @@ -827,24 +828,24 @@ def sum(t, axis=None, out=None): one.set_value(1.0) ret = tensordot(t, one, t_ndim) - if isinstance(axis,int): + if isinstance(axis, int): if axis < 0: axis += t_ndim axis_shape = t_shape[axis] axis_shape = int(axis_shape) - one = Tensor(shape = (axis_shape, ), device = t.device) + one = Tensor(shape=(axis_shape, ), device=t.device) one.set_value(1.0) - ret = tensordot(t, one, axes=([axis],[0])) + ret = tensordot(t, one, axes=([axis], [0])) - if isinstance(axis,tuple): + if isinstance(axis, tuple): l_axis = list(axis) axis_shape = [t_shape[x] for x in axis] axisshape = tuple(axis_shape) one = Tensor(axisshape, t.device) one.set_value(1.0) one_axis = [x for x in range(one.ndim())] - ret = tensordot(t, one, (l_axis,one_axis)) + ret = tensordot(t, one, (l_axis, one_axis)) if out is not None: if out.shape != ret.shape: @@ -1181,10 +1182,10 @@ def einsum(ops, *args): if len(broadcast_a) == 0: broadcast_a = [1] if len(broadcast_b) == 0: - broadcast_b = [1] + broadcast_b = [1] mult_A = repeat(A, product(broadcast_a)) mult_A = mult_A.reshape(reshape_A) - mult_A = transpose(mult_A,transpose_A) + mult_A = transpose(mult_A, transpose_A) mult_B = repeat(B, product(broadcast_b)) mult_B = mult_B.reshape(reshape_B) mult_B = transpose(mult_B, transpose_B) @@ -1199,9 +1200,9 @@ def einsum(ops, *args): res = transpose(res, transpose_res) return res - -def repeat (t, repeats, axis = None): + +def repeat(t, 
repeats, axis=None): '''Return the repeated tensor Args: t(tensor): the tensor to be repeated @@ -1213,12 +1214,11 @@ def repeat (t, repeats, axis = None): Return: the tensor which has been repeated ''' - ret = t.repeat(repeats,axis) + ret = t.repeat(repeats, axis) return ret - -def tensordot (A,B,axes=2): +def tensordot(A, B, axes=2): """Returns the tensor multiplication of two tensors along specified axes. This is equivalent to compute dot product along the specified axes which @@ -1244,30 +1244,33 @@ def tensordot (A,B,axes=2): # when axes is an integer, axes_A and axes_B represent axes at the last of ''A'' and # the first of ''B''. For example, when axes is 1, we do the normal multiplication : # if A is in shape(3,2,4), B is in shape(4,2,5), it will return a matrix in shape(3,2,2,5) - #when axes is 2 and A,B are shape (3,2,4) and (2,4,5), it will return a matrix in shape(3,5) + # when axes is 2 and A,B are shape (3,2,4) and (2,4,5), it will return a + # matrix in shape(3,5) if type(axes) == int or type(axes) == long: axes_A = list(range(-axes, 0)) axes_B = list(range(0, axes)) axes_B = axes_B else: - axes_A,axes_B =axes + axes_A, axes_B = axes # when axes is a pair of sequences of integers.For example, A is in shape(3,2,4), - #B is in shape(4,2,5), we set axes as ([1,2],[1,0]), it will return a matrix in shape(3,5) - if isinstance(axes_A,list): + # B is in shape(4,2,5), we set axes as ([1,2],[1,0]), it will return a + # matrix in shape(3,5) + if isinstance(axes_A, list): na = len(axes_A) axes_A = list(axes_A) else: axes_A = [axes_A] na = 1 - if isinstance(axes_B,list): + if isinstance(axes_B, list): nb = len(axes_B) axes_B = list(axes_B) else: axes_B = [axes_B] nb = 1 - # a_shape and b_shape are the shape of tensor A and B, while nda and ndb are the dim of A and B + # a_shape and b_shape are the shape of tensor A and B, while nda and ndb + # are the dim of A and B a_shape = A.shape nda = A.ndim() b_shape = B.shape @@ -1277,7 +1280,7 @@ def tensordot 
(A,B,axes=2): if na != nb: equal = False else: - # to make the shape match + # to make the shape match for k in range(na): if a_shape[axes_A[k]] != b_shape[axes_B[k]]: equal = False @@ -1291,18 +1294,19 @@ def tensordot (A,B,axes=2): '''start to do the calculation according to the axes''' notin = [k for k in range(nda) if k not in axes_A] - # nda is the dim of A, and axes_a is the axis for A, notin is the axis which is not in axes_A + # nda is the dim of A, and axes_a is the axis for A, notin is the axis + # which is not in axes_A newaxes_a = notin + axes_A N2 = 1 for axis in axes_A: N2 *= a_shape[axis] N1 = 1 for ax in notin: - N1 *=a_shape[ax] + N1 *= a_shape[ax] # newshape_a is the shape to do multiplication.For example, A is in shape(3,2,4), - #B is in shape(4,2,5), we set axes as ([1,2],[1,0]), then newshape_a should be (3,5) - #olda is the shape that will be shown in the result. - newshape_a = (N1,N2) + # B is in shape(4,2,5), we set axes as ([1,2],[1,0]), then newshape_a should be (3,5) + # olda is the shape that will be shown in the result. + newshape_a = (N1, N2) olda = [a_shape[axis] for axis in notin] notin = [k for k in range(ndb) if k not in axes_B] newaxes_b = axes_B + notin @@ -1320,7 +1324,7 @@ def tensordot (A,B,axes=2): at = Reshape(A, newshape_a) bt = Reshape(B, newshape_b) - res = mult(at,bt) + res = mult(at, bt) if len(olda + oldb) == 0: olda = [1] oldb = [1] @@ -1330,6 +1334,7 @@ def tensordot (A,B,axes=2): return res + def div(lhs, rhs, ret=None): '''Elementi-wise division. 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/src/api/core_tensor.i ---------------------------------------------------------------------- diff --git a/src/api/core_tensor.i b/src/api/core_tensor.i index cc72d21..9427b11 100644 --- a/src/api/core_tensor.i +++ b/src/api/core_tensor.i @@ -101,12 +101,11 @@ namespace singa{ const std::vector<size_t> &shape() const; const size_t shape(size_t idx) const; bool transpose() const; - size_t nDim() const; - Tensor Transpose() const; - Tensor Transpose(const std::vector<size_t> &axes) const; + size_t nDim() const; + size_t Size() const; size_t MemSize() const; - Tensor Reshape(const std::vector<size_t> &shape); + void ResetLike(const Tensor &t); void AsType(DataType type); void ToDevice(std::shared_ptr<singa::Device> dev); @@ -122,10 +121,10 @@ namespace singa{ void CopyData(const Tensor &other); void RepeatData(std::vector<size_t> repeats, int axis, int total_repeats, const Tensor &src); + Tensor Clone() const; Tensor Repeat(std::vector<size_t> repeats, int axis); - Tensor T() const; - + #if USE_JAVA %rename(iAdd) operator+=(const Tensor &t); @@ -166,6 +165,10 @@ namespace singa{ Tensor *dst, const Tensor &src, const size_t num); Tensor Reshape(const Tensor &in, const std::vector<size_t> &s); + Tensor Transpose(const Tensor &in, const std::vector<size_t> &axes); + + %rename(DefaultTranspose) Transpose(const Tensor &in); + Tensor Transpose(const Tensor &in); Tensor Abs(const Tensor &t); Tensor Exp(const Tensor &t); @@ -326,6 +329,6 @@ namespace singa{ Tensor SoftMax(const Tensor &in); void SoftMax(const Tensor &in, Tensor *out); - const Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t); - const Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t); + Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t); + Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t); } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/src/core/tensor/tensor.cc 
---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc index e5e8017..1ac1b42 100755 --- a/src/core/tensor/tensor.cc +++ b/src/core/tensor/tensor.cc @@ -21,6 +21,7 @@ #include "./tensor_math_cuda.h" #include "./tensor_math_opencl.h" #include <utility> +#include <algorithm> #define Noaxis 9999 @@ -45,13 +46,7 @@ Tensor::Tensor(const Shape &shape, DataType dtype) block_ = device_->NewBlock((int)size); generate_strides(); } -Tensor::Tensor(Shape &&shape, DataType dtype) - : data_type_(dtype), device_(defaultDevice), shape_(shape) { - size_t size = Product(shape_) * SizeOf(data_type_); - if (size) - block_ = device_->NewBlock((int)size); - generate_strides(); -} + //non-strided constructors with device Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, @@ -62,56 +57,24 @@ Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device, block_ = device_->NewBlock((int)size); generate_strides(); } -Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype) - : data_type_(dtype), device_(device), shape_(shape) { - size_t size = Product(shape_) * SizeOf(data_type_); - if (size) - block_ = device_->NewBlock((int)size); - generate_strides(); -} -Tensor::Tensor(const Tensor &in) - : //transpose_(in.transpose_), - data_type_(in.data_type_), - device_(in.device_), - block_(in.block()), - shape_(in.shape_), - strides_(in.strides_) { +Tensor::Tensor(const Tensor &in) : data_type_(in.data_type_), + device_(in.device_), block_(in.block()), shape_(in.shape_), + strides_(in.strides_) { if (block_ != nullptr) block_->IncRefCount(); } -//strided constructor taking in a tensor, shape and strides -Tensor::Tensor(const Tensor &in, Shape &new_shape, vector<int> &new_strides) - : //transpose_(in.transpose_), - data_type_(in.data_type_), - device_(in.device_), - block_(in.block()), - shape_(new_shape), - strides_(new_strides) { - if (block_ != nullptr) - 
block_->IncRefCount(); -} -Tensor::Tensor(Tensor &&in) - : //transpose_(in.transpose_), - data_type_(in.data_type_), - device_(in.device_), - shape_(std::move(in.shape_)), - strides_(in.strides_) { +Tensor::Tensor(Tensor &&in) : data_type_(in.data_type_), + device_(in.device_), shape_(std::move(in.shape_)), + strides_(std::move(in.strides_)) { block_ = in.block_; in.block_ = nullptr; } -void Tensor::SetBlock(Block *block) { - LOG(WARNING) << "Pls avoid using this function, which may have side-effect."; - if (block_ != nullptr) - if (block_->DecRefCount()) device_->FreeBlock(block_); - block_ = block; -} - void Tensor::ResetLike(const Tensor &in) { if (block_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) { if (block_ != nullptr && block_->DecRefCount() == 0) @@ -124,41 +87,16 @@ void Tensor::ResetLike(const Tensor &in) { strides_ = in.strides_; } -// if tensor is not transposed yet i.e strides == 1, -// then we simply change the shape and generate new default strides -// if tensor is already transposed i.e strides != 1, -// it should be copied to a new tensor with newly generated default strides -// TODO(wangwei) raise error if the shape not match - -// void Tensor::Reshape(const Shape &shape) { -// if (strides_.size() == 0) -// strides_.push_back(1); - -// if (Product(shape_) != Product(shape)) { -// if (block_ != nullptr && block_->DecRefCount() == 0) -// device_->FreeBlock(block_); -// block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_))); -// } else if (transpose()) { -// LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." 
; -// } -// shape_ = shape; -// generate_strides(); -// } - -// void Tensor::Reshape(Shape &&shape) { -// if (strides_.size() == 0) -// strides_.push_back(1); - -// if (Product(shape_) != Product(shape)) { -// if (block_ != nullptr && block_->DecRefCount() == 0) -// device_->FreeBlock(block_); -// block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_))); -// } else if (transpose()) { -// LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ; -// } -// shape_ = std::move(shape); -// generate_strides(); -// } +void Tensor::SetShape(const Shape& shape) { + if (Product(shape_) != Product(shape)) { + if (block_ != nullptr && block_->DecRefCount() == 0) + device_->FreeBlock(block_); + block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_))); + } + shape_ = shape; + generate_strides(); +} + void Tensor::AsType(const DataType type) { if (data_type_ != type) { @@ -217,7 +155,8 @@ void Tensor::CopyData(const Tensor &src) { } } -void Tensor::RepeatData(vector<size_t> repeats, int axis, int total_repeats, const Tensor &src) { +void Tensor::RepeatData(const vector<size_t>& repeats, int axis, int total_repeats, + const Tensor &src) { if (repeats.size() == 1) { CHECK_EQ(Size(), src.Size()*total_repeats); } else { @@ -336,7 +275,8 @@ void Tensor::ToProto(singa::TensorProto *proto) const { } } -Tensor Tensor::Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device> device) { +Tensor Tensor::Repeat(const vector<size_t>& repeats, int axis, + std::shared_ptr<Device> device) { if (device == nullptr) device = device_; vector<size_t> tshape; int total_repeats = 0; @@ -346,7 +286,7 @@ Tensor Tensor::Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device> } else { if (repeats.size() == 1) { total_repeats = repeats[0]; - for (size_t i = 0; i < shape_.size(); i++) { + for (int i = 0; i < static_cast<int>(shape_.size()); i++) { if (i == axis) { tshape.push_back(shape_[i] * total_repeats); } else { @@ -363,7 +303,7 
@@ Tensor Tensor::Repeat(vector<size_t> repeats, int axis, std::shared_ptr<Device> } total_repeats += repeats[i]; } - for (size_t i = 0; i < shape_.size(); i++) { + for (int i = 0; i < static_cast<int>(shape_.size()); i++) { if (i == axis) { tshape.push_back(total_repeats); } else { @@ -387,68 +327,53 @@ Tensor Tensor::Clone(std::shared_ptr<Device> device) const { return t; } -Tensor Tensor::T() const { +Tensor& Tensor::T() { // this function only works for 2d tensors CHECK_EQ(shape_.size(), 2u); - Tensor t; - t.device_ = device_; - t.data_type_ = data_type_; - t.shape_.push_back(shape_[1]); - t.shape_.push_back(shape_[0]); - t.strides_.clear(); - t.strides_.push_back(strides_[1]); - t.strides_.push_back(strides_[0]); - t.block_ = block_; - block_->IncRefCount(); - return t; + Transpose(); + return *this; } //normal transpose without axes -Tensor Tensor::Transpose() const { - // if(shape_.size() != strides_.size()) - // generate_strides(); - - Tensor t; - t.device_ = device_; - t.data_type_ = data_type_; - t.strides_.clear(); - for (size_t n = 0; n < shape_.size(); ++n) { - t.shape_.push_back(shape_[shape_.size() - n - 1]); - t.strides_.push_back(strides_[shape_.size() - n - 1]); - } - t.block_ = block_; - block_->IncRefCount(); - return t; +Tensor& Tensor::Transpose() { + std::reverse(shape_.begin(), shape_.end()); + std::reverse(strides_.begin(), strides_.end()); + return *this; } //transpose with axes -// TODO(wangwei) the shape and axes should match -Tensor Tensor::Transpose(const vector<size_t> &axes) const { - // if(axes.size() != shape_.size()){ - // std::cout << "Warning: Size of input axes doesn't match size of shape" << std::endl; - // return void(); - // } - // if(shape_.size() != strides_.size()) - // generate_strides(); +Tensor& Tensor::Transpose(const vector<size_t> &axes) { + CHECK_EQ(axes.size(), shape_.size()) << + "Tranpose axes's length should be equal to shape"; - Tensor t; - t.device_ = device_; - t.data_type_ = data_type_; - 
t.strides_.clear(); + auto shape = shape_; + auto strides = strides_; + shape_.clear(); + strides_.clear(); for (size_t n = 0; n < axes.size(); ++n) { - t.shape_.push_back(shape_[axes[n]]); - t.strides_.push_back(strides_[axes[n]]); + shape_.push_back(shape[axes[n]]); + strides_.push_back(strides[axes[n]]); } - t.block_ = block_; - block_->IncRefCount(); - return t; + return *this; +} + +//normal transpose without axes +Tensor Transpose(const Tensor& in) { + Tensor out(in); + out.Transpose(); + return out; +} + +//transpose with axes +Tensor Transpose(const Tensor& in, const vector<size_t> &axes) { + Tensor out(in); + out.Transpose(axes); + return out; } Tensor &Tensor::operator=(const Tensor &in) { - // LOG(ERROR) << "= const &"; if (block_ != nullptr && block_->DecRefCount() == 0) device_->FreeBlock(block_); - //transpose_ = in.transpose_; strides_ = in.strides_; data_type_ = in.data_type_; shape_ = in.shape_; @@ -460,11 +385,9 @@ Tensor &Tensor::operator=(const Tensor &in) { } Tensor &Tensor::operator=(Tensor &&in) { - // LOG(ERROR) << "= &&"; if (block_ != nullptr && block_->DecRefCount() == 0) device_->FreeBlock(block_); - //transpose_ = in.transpose_; - strides_ = std::move(in.strides_); + strides_ = std::move(in.strides_); data_type_ = in.data_type_; shape_ = std::move(in.shape_); device_ = in.device_; @@ -473,17 +396,6 @@ Tensor &Tensor::operator=(Tensor &&in) { return *this; } -// Tensor Reshape(const Tensor &in, const Shape &s) { -// // Tensor out(in); -// // out.Reshape(s); -// return out; -// } - -// Tensor Reshape(const Tensor &in, Shape &&s) { -// // Tensor out(in); -// // out.Reshape(std::move(s)); -// return out; -// } #define GenUnaryTensorArgMemberFn(op, fn) \ Tensor &Tensor::op(const Tensor &in) { \ @@ -539,7 +451,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num, } } -void RepeatDataToFrom(bool broadcast_flag, vector<size_t> repeats, int axis, +void RepeatDataToFrom(bool broadcast_flag, const vector<size_t>& repeats, int 
axis, Tensor *dst, const Tensor &src, const size_t num) { if (repeats.size() == 1) { broadcast_flag = true; @@ -561,11 +473,11 @@ void RepeatDataToFrom(bool broadcast_flag, vector<size_t> repeats, int axis, axis_shape = 1; shape_outer = Product(src.shape()); } else { - for (size_t i = 0; i < axis; i++) { + for (int i = 0; i < axis; i++) { shape_outer *= src.shape()[i]; } axis_shape = src.shape()[axis]; - for (size_t i = axis + 1; i < src.nDim(); i++) { + for (int i = axis + 1; i < static_cast<int>(src.nDim()); i++) { chunk *= src.shape()[i]; } } @@ -912,7 +824,7 @@ template <typename SType> void AddColumn(const SType alpha, const SType beta, const Tensor &v, Tensor *M) { if (M->transpose()) { - Tensor X = M->T(); + Tensor X = Transpose(*M); AddRow(v, &X); } else { CHECK_EQ(M->nDim(), 2u); @@ -935,7 +847,7 @@ void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); } template <typename SType> void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) { if (M->transpose()) { - Tensor X = M->T(); + Tensor X = Transpose(*M); AddColumn(v, &X); } else { CHECK_EQ(M->nDim(), 2u); @@ -980,7 +892,7 @@ Tensor ConcatOn(const vector<Tensor> &in, int axis) { tmp.push_back(Reshape(t, {t.shape(0), t.Size() / t.shape(0)})); } auto ret = ConcatenateRows(tmp); - ret = ret.Reshape(out_shape); + ret.Reshape(out_shape); return ret; } else { for (const auto& t : in) { @@ -990,7 +902,7 @@ Tensor ConcatOn(const vector<Tensor> &in, int axis) { tmp.push_back(Reshape(t, {nrow, t.Size() / nrow})); } auto ret = ConcatenateColumns(tmp); - ret = ret.Reshape(out_shape); + ret.Reshape(out_shape); return ret; } } @@ -1059,7 +971,8 @@ Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) { } -Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, int axis) { +Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, + int axis) { Shape out_shape = in.shape(); out_shape[axis] = end - start; if (axis == 0) { @@ -1074,7 +987,7 @@ Tensor 
SliceOn(const Tensor&in, const size_t start, const size_t end, int axis) auto suffix = in.Size() / nrow / in.shape(axis); auto ret = SliceColumns(Reshape(in, {nrow, in.Size() / nrow}), start * suffix, end * suffix); - ret = ret.Reshape(out_shape); + ret.Reshape(out_shape); return ret; } } @@ -1145,7 +1058,7 @@ void SubRow(const Tensor &v, Tensor *M) { AddRow(-1, 1, v, M); } void SumColumns(const Tensor &M, Tensor *v) { if (M.transpose()) { - Tensor X = M.T(); + Tensor X = Transpose(M); SumRows(X, v); } else { CHECK_EQ(M.nDim(), 2u); @@ -1160,7 +1073,7 @@ void SumColumns(const Tensor &M, Tensor *v) { } void SumRows(const Tensor &M, Tensor *v) { if (M.transpose()) { - Tensor X = M.T(); + Tensor X = Transpose(M); SumColumns(X, v); } else { CHECK_EQ(M.nDim(), 2u); @@ -1170,7 +1083,7 @@ void SumRows(const Tensor &M, Tensor *v) { Tensor one(Shape{nb_row}, M.device(), M.data_type()); one.SetValue(1.0f); // TODO(wangwei) cast type - Tensor X = M.T(); + Tensor X = Transpose(M); Mult(X, one, v); } } @@ -1268,13 +1181,13 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta, // ************************ // Misc. // ************************ -const Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t) { +Tensor CrossEntropyFwd(const Tensor& p, const Tensor& t) { Tensor loss({p.shape(0)}, p.device(), p.data_type()); ComputeCrossEntropy(p, t, &loss); return loss; } -const Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t) { +Tensor SoftmaxCrossEntropyBwd(const Tensor& p, const Tensor& t) { auto g = p.Clone(); SoftmaxCrossEntropyBwd(t, &g); return g; @@ -1310,65 +1223,20 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) { }); } -Tensor Tensor::Reshape(const Shape &shape) { - if (strides_.size() == 0) - strides_.push_back(1); - // TODO(wangwei) remove this condition and report error if size changes. 
- if (Product(shape_) != Product(shape)) { - if (block_ != nullptr && block_->DecRefCount() == 0) - device_->FreeBlock(block_); - block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_))); - shape_ = shape; - generate_strides(); - return *this; - - } else if (transpose()) { - Tensor t(shape_, device_, data_type_); - t.block_ = t.device()->NewBlock((int)(Product(shape) * SizeOf(data_type_))); +// if tensor is not transposed yet, we change the shape and generate new strides +// if tensor is already transposed, we reallocate the memory and generate strides +Tensor& Tensor::Reshape(const Shape &shape) { + if (transpose()) { + Tensor t(shape, device_, data_type_); singa::Transform(*this, &t); - t.shape_ = shape; - return t; + shape_ = shape; + std::swap(t.block_, block_); } else { - Tensor t; - t.shape_ = shape; - t.device_ = device_; - t.data_type_ = data_type_; - t.block_ = block_; // be careful about the block inference (mem leaking) - t.block_->IncRefCount(); - t.generate_strides(); - return t; - } -} - -Tensor Tensor::Reshape(Shape &&shape) { - if (strides_.size() == 0) - strides_.push_back(1); - - if (Product(shape_) != Product(shape)) { - if (block_ != nullptr && block_->DecRefCount() == 0) - device_->FreeBlock(block_); - block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_))); - shape_ = std::move(shape); + shape_ = shape; generate_strides(); - return *this; - - } else if (transpose()) { - Tensor t(shape_, device_, data_type_); - t.block_ = t.device()->NewBlock((int)(Product(shape) * SizeOf(data_type_))); - singa::Transform(*this, &t); - t.shape_ = shape; - return t; - } else { - Tensor t; - t.shape_ = shape; - t.device_ = device_; - t.data_type_ = data_type_; - t.block_ = block_; // be careful about the block inference (mem leaking) - t.block_->IncRefCount(); - t.generate_strides(); - return t; } + return *this; } Tensor Reshape(const Tensor &in, const Shape &s) { @@ -1376,9 +1244,4 @@ Tensor Reshape(const Tensor &in, const Shape &s) 
{ return out.Reshape(s); } -Tensor Reshape(const Tensor &in, Shape &&s) { - Tensor out(in); - return out.Reshape(std::move(s)); -} - } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/src/core/tensor/tensor_math.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h index f438fc6..f5fbc84 100644 --- a/src/core/tensor/tensor_math.h +++ b/src/core/tensor/tensor_math.h @@ -253,7 +253,7 @@ void Tanh(const Tensor &in, Tensor *out, Context *ctx) { /// similar to cudnnTransformTensor /// copies the data from one tensor to another tensor with a different layout -/// the tensors must have the same dimensions but not necessarily the same strides +/// the tensors must have the same dimensions but not necessarily the same strides template <typename DType, typename Lang> void Transform(const Tensor &in, Tensor *out, Context *ctx) { LOG(FATAL) << "Transform Not Implemented"; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b30d7ea5/src/core/tensor/tensor_math_cuda.h ---------------------------------------------------------------------- diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h index 2a43468..dfe5724 100644 --- a/src/core/tensor/tensor_math_cuda.h +++ b/src/core/tensor/tensor_math_cuda.h @@ -54,34 +54,23 @@ cudnn requires tensor dimensions to fulfill 1 requirement: Tensor B has shape (2,3,4), cudnn requires shape of {1,2,3,4} to be the input */ vector<int> generate_shape_cuda(const Tensor& x) { - Shape shape_ = x.shape(); + Shape shape = x.shape(); + CHECK_LE(shape.size(), 5) << "Dimensions (shape) beyond 5 are currently not supported" ; vector<int> shape_arr; - if (shape_.size() <= 4) { - for (size_t n = 0; n < 4 - shape_.size(); ++n) { + if (shape.size() <= 4) { + for (int n = 0; n < 4 - shape.size(); ++n) { shape_arr.push_back(1); } - for (size_t n = 0; n < shape_.size(); ++n) { - 
shape_arr.push_back(shape_.at(n)); - } - return shape_arr; - } else if (shape_.size() == 5) { - for (size_t n = 0; n < shape_.size(); ++n) { - shape_arr.push_back(shape_.at(n)); - } - return shape_arr; - } else { - LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ; } + for(auto x: shape) + shape_arr.push_back(static_cast<int>(x)); return shape_arr; } int generate_dim_cuda(const Tensor& x) { + CHECK_LE(x.nDim(), 5) << "Dimensions (shape) beyond 5 are currently not supported" ; if (x.shape().size() <= 4) {return 4;} - else if (x.shape().size() == 5) {return 5;} - else { - LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ; - } - return 0; + else {return 5;} } /* @@ -94,29 +83,17 @@ int generate_dim_cuda(const Tensor& x) { and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs */ vector<int> generate_strides_cuda(const Tensor& x) { - Shape shape_ = x.shape(); - vector<int> strides_ = x.strides(); + Shape shape = x.shape(); + auto& strides = x.strides(); vector<int> strides_arr; - int product = 1; - for (size_t n = 0; n < (shape_.size()); ++n) { - product *= shape_[n]; - } - if (shape_.size() <= 4) { - for (size_t n = 0; n < 4 - shape_.size(); ++n) { + int product = Product(shape); + if (shape.size() <= 4) { + for (int n = 0; n < 4 - shape.size(); ++n) { strides_arr.push_back(product); } - for (size_t n = 0; n < strides_.size(); ++n) { - strides_arr.push_back(strides_[n]); - } - return strides_arr; - } else if (shape_.size() == 5) { - for (size_t n = 0; n < strides_.size(); ++n) { - strides_arr.push_back(strides_[n]); - } - return strides_arr; - } else { - LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ; } + for(auto x : strides) + strides_arr.push_back(static_cast<int>(x)); return strides_arr; } @@ -241,6 +218,22 @@ void Sub<float, lang::Cuda>(const Tensor& in1, } } +template <> +void Transform<float, lang::Cuda>(const Tensor& in, Tensor* out, + Context* ctx) { + const float* inPtr = 
static_cast<const float*>(in.block()->data()); + float* outPtr = static_cast<float*>(out->block()->mutable_data()); + + float alpha = 1.0; + float beta = 0.0; + + check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, + (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, + (void*)(&beta), generate_tensor_nd_desc(*out), outPtr + )); + +} + /// Element-wise operation, clamp every element into [low, high] /// if x>high, then x=high; if x<low, then x=low. template <> @@ -254,14 +247,7 @@ void Clamp<float, lang::Cuda>(const float low, if (in.strides() == out->strides()) { cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::clamp(num, low, high, outPtr, outPtr, ctx->stream); } } @@ -280,36 +266,18 @@ void Div<float, lang::Cuda>(const Tensor& in1, if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) { cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream); } else { //else we check whether in1 or in2 or both are transposed - float alpha = 1.0; - float beta = 0.0; - if (in1.transpose() && in2.transpose()) { Tensor t(in1.shape(), in1.device(), in1.data_type()); - float* tPtr = static_cast<float*>(t.block()->mutable_data()); - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1, - (void*)(&beta), generate_tensor_nd_desc(t), tPtr - )); + Transform<float, lang::Cuda>(in1, &t, ctx); + Transform<float, lang::Cuda>(in2, out, ctx); - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); + float* tPtr = static_cast<float*>(t.block()->mutable_data()); cuda::div(num, tPtr, 
outPtr, outPtr, ctx->stream); - } else if (in1.transpose()) { - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); + Transform<float, lang::Cuda>(in1, out, ctx); cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream); - } else if (in2.transpose()) { - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); + Transform<float, lang::Cuda>(in2, out, ctx); cuda::div(num, inPtr1, outPtr, outPtr, ctx->stream); } } @@ -325,14 +293,7 @@ void Div<float, lang::Cuda>(const float x, const Tensor& in, if (in.strides() == out->strides()) { cuda::div(num, x, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::div(num, x, outPtr, outPtr, ctx->stream); } } @@ -366,36 +327,17 @@ void EltwiseMult<float, lang::Cuda>(const Tensor& in1, if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) { cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream); } else { //else we check whether in1 or in2 or both are transposed - float alpha = 1.0; - float beta = 0.0; - if (in1.transpose() && in2.transpose()) { Tensor t(in1.shape(), in1.device(), in1.data_type()); + Transform<float, lang::Cuda>(in1, &t, ctx); + Transform<float, lang::Cuda>(in2, out, ctx); float* tPtr = static_cast<float*>(t.block()->mutable_data()); - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1, - (void*)(&beta), generate_tensor_nd_desc(t), tPtr - )); - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), 
generate_tensor_nd_desc(in2), inPtr2, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); cuda::mult(num, tPtr, outPtr, outPtr, ctx->stream); - } else if (in1.transpose()) { - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); + Transform<float, lang::Cuda>(in1, out, ctx); cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream); - } else if (in2.transpose()) { - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); + Transform<float, lang::Cuda>(in2, out, ctx); cuda::mult(num, inPtr1, outPtr, outPtr, ctx->stream); } } @@ -413,14 +355,7 @@ void Exp<float, lang::Cuda>(const Tensor& in, Tensor* out, if (in.strides() == out->strides()) { cuda::exp(num, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::exp(num, outPtr, outPtr, ctx->stream); } } @@ -435,14 +370,7 @@ void GE<float, lang::Cuda>(const Tensor& in, const float x, if (in.strides() == out->strides()) { cuda::ge(num, inPtr, x, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::ge(num, outPtr, x, outPtr, ctx->stream); } } @@ -451,10 +379,7 @@ void GE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2, Tensor* out, Context* ctx) { Sub<float, lang::Cuda>(in1, in2, out, ctx); float* outPtr 
= static_cast<float*>(out->block()->mutable_data()); - // const float* inPtr1 = static_cast<const float*>(in1.block()->data()); - // const float* inPtr2 = static_cast<const float*>(in2.block()->data()); const size_t num = in1.Size(); - //cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream); cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream); } @@ -469,14 +394,7 @@ void GT<float, lang::Cuda>(const Tensor& in, const float x, if (in.strides() == out->strides()) { cuda::gt(num, inPtr, x, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::gt(num, outPtr, x, outPtr, ctx->stream); } } @@ -485,10 +403,7 @@ void GT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2, Tensor* out, Context* ctx) { Sub<float, lang::Cuda>(in1, in2, out, ctx); float* outPtr = static_cast<float*>(out->block()->mutable_data()); - // const float* inPtr1 = static_cast<const float*>(in1.block()->data()); - // const float* inPtr2 = static_cast<const float*>(in2.block()->data()); const size_t num = in1.Size(); - //cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream); cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream); } @@ -502,14 +417,7 @@ void LE<float, lang::Cuda>(const Tensor& in, const float x, if (in.strides() == out->strides()) { cuda::le(num, inPtr, x, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::le(num, outPtr, x, outPtr, ctx->stream); } } @@ -518,10 +426,7 @@ void LE<float, lang::Cuda>(const Tensor& in1, const Tensor& 
in2, Tensor* out, Context* ctx) { Sub<float, lang::Cuda>(in1, in2, out, ctx); float* outPtr = static_cast<float*>(out->block()->mutable_data()); - // const float* inPtr1 = static_cast<const float*>(in1.block()->data()); - // const float* inPtr2 = static_cast<const float*>(in2.block()->data()); const size_t num = in1.Size(); - //cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream); cuda::le(num, outPtr, 0.0, outPtr, ctx->stream); } @@ -536,14 +441,7 @@ void Log<float, lang::Cuda>(const Tensor& in, Tensor* out, if (in.strides() == out->strides()) { cuda::log(num, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::log(num, outPtr, outPtr, ctx->stream); } } @@ -558,14 +456,7 @@ void LT<float, lang::Cuda>(const Tensor& in, const float x, if (in.strides() == out->strides()) { cuda::lt(num, inPtr, x, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::lt(num, outPtr, x, outPtr, ctx->stream); } } @@ -574,10 +465,7 @@ void LT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2, Tensor* out, Context* ctx) { Sub<float, lang::Cuda>(in1, in2, out, ctx); float* outPtr = static_cast<float*>(out->block()->mutable_data()); - // const float* inPtr1 = static_cast<const float*>(in1.block()->data()); - // const float* inPtr2 = static_cast<const float*>(in2.block()->data()); const size_t num = in1.Size(); - //cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream); cuda::lt(num, outPtr, 0.0, outPtr, 
ctx->stream); } @@ -592,14 +480,7 @@ void Pow<float, lang::Cuda>(const Tensor& in, const float x, if (in.strides() == out->strides()) { cuda::pow(num, inPtr, x, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::pow(num, outPtr, x, outPtr, ctx->stream); } } @@ -617,36 +498,17 @@ void Pow<float, lang::Cuda>(const Tensor& in1, if (!in1.transpose() && !in2.transpose() && (in1.strides() == in2.strides())) { cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream); } else { //else we check whether in1 or in2 or both are transposed - float alpha = 1.0; - float beta = 0.0; - if (in1.transpose() && in2.transpose()) { Tensor t(in1.shape(), in1.device(), in1.data_type()); float* tPtr = static_cast<float*>(t.block()->mutable_data()); - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1, - (void*)(&beta), generate_tensor_nd_desc(t), tPtr - )); - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); + Transform<float, lang::Cuda>(in1, &t, ctx); + Transform<float, lang::Cuda>(in2, out, ctx); cuda::pow(num, tPtr, outPtr, outPtr, ctx->stream); - } else if (in1.transpose()) { - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in1), inPtr1, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); + Transform<float, lang::Cuda>(in1, out, ctx); cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream); - } else if (in2.transpose()) { - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in2), inPtr2, - (void*)(&beta), 
generate_tensor_nd_desc(*out), outPtr - )); + Transform<float, lang::Cuda>(in2, out, ctx); cuda::pow(num, inPtr1, outPtr, outPtr, ctx->stream); } } @@ -694,14 +556,7 @@ void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out, if (in.strides() == out->strides()) { cuda::relu(num, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::relu(num, outPtr, outPtr, ctx->stream); } } @@ -749,14 +604,7 @@ void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out, if (in.strides() == out->strides()) { cuda::sigmoid(num, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::sigmoid(num, outPtr, outPtr, ctx->stream); } } @@ -772,14 +620,7 @@ void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out, if (in.strides() == out->strides()) { cuda::sign(num, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::sign(num, outPtr, outPtr, ctx->stream); } } @@ -788,15 +629,14 @@ void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out, template <> void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out, Context* ctx) { - const float* inPtr = static_cast<const float*>(in.block()->data()); float* 
outPtr = static_cast<float*>(out->block()->mutable_data()); #if CUDNN_MAJOR < 7 + Transform<float, lang::Cuda>(in, out, ctx); size_t num = in.Size(); - cuda::sqrt(num, inPtr, outPtr, ctx->stream); - + cuda::sqrt(num, outPtr, outPtr, ctx->stream); #else - + const float* inPtr = static_cast<const float*>(in.block()->data()); float alpha1 = 1.0; float alpha2 = 0.0; float beta = 0.0; @@ -820,14 +660,7 @@ void Square<float, lang::Cuda>(const Tensor& in, Tensor* out, if (in.strides() == out->strides()) { cuda::square(num, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::square(num, outPtr, outPtr, ctx->stream); } } @@ -883,34 +716,11 @@ void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out, if (in.strides() == out->strides()) { cuda::tanh(num, inPtr, outPtr, ctx->stream); } else { //else we transform in to out to store first - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - + Transform<float, lang::Cuda>(in, out, ctx); cuda::tanh(num, outPtr, outPtr, ctx->stream); } } -template <> -void Transform<float, lang::Cuda>(const Tensor& in, Tensor* out, - Context* ctx) { - const float* inPtr = static_cast<const float*>(in.block()->data()); - float* outPtr = static_cast<float*>(out->block()->mutable_data()); - - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(*out), outPtr - )); - -} - // ================Random functions=========================================== /// Each 
element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1 // Get the random generator from 'ctx' @@ -1175,16 +985,7 @@ void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out, if (in.transpose()) { Tensor t(in.shape(), in.device(), in.data_type()); - float* tPtr = static_cast<float*>(t.block()->mutable_data()); - - float alpha = 1.0; - float beta = 0.0; - - check_cudnn(cudnnTransformTensor(ctx->cudnn_handle, - (void*)(&alpha), generate_tensor_nd_desc(in), inPtr, - (void*)(&beta), generate_tensor_nd_desc(t), tPtr - )); - + Transform<float, lang::Cuda>(in, &t, ctx); const float* tPtr_const = static_cast<const float*>(t.block()->data()); cuda::RowMax(nrow, ncol, tPtr_const, outPtr, ctx->stream); } else {
