SINGA-120 - Implemented GRU and BPTT; changed back to r * (h x U) for the new-memory computation; loss down to about 2.8 per char/unit.
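
The subject line refers to the form of the GRU new-memory term: the reset gate is applied to the projection of the previous hidden state, c = tanh(W_c*x + r * (U_c*h_prev) + b_c), rather than projecting the already-reset state U_c*(r*h_prev). The layer output is then h_t = z*h_prev + (1-z)*c, matching the forward pass in gru.cc below. The NumPy sketch that follows is illustrative only (names and shapes are placeholders, not the SINGA kernels):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_step(x, h_prev, W_z, U_z, b_z, W_r, U_r, b_r, W_c, U_c, b_c):
        # x: (batch, vdim), h_prev: (batch, hdim); W_*: (hdim, vdim), U_*: (hdim, hdim)
        z = sigmoid(x @ W_z.T + h_prev @ U_z.T + b_z)         # update gate
        r = sigmoid(x @ W_r.T + h_prev @ U_r.T + b_r)         # reset gate
        c = np.tanh(x @ W_c.T + r * (h_prev @ U_c.T) + b_c)   # new memory: r * (h x U)
        return z * h_prev + (1.0 - z) * c                     # next hidden state
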
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a2f4e468
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a2f4e468
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a2f4e468

Branch: refs/heads/master
Commit: a2f4e4680bb7b5dc5077a064a757698e30cc5e13
Parents: 6a4c996
Author: Wei Wang <[email protected]>
Authored: Wed Jan 6 00:35:41 2016 +0800
Committer: Wei Wang <[email protected]>
Committed: Wed Jan 6 02:03:50 2016 +0800

----------------------------------------------------------------------
 examples/char-rnn/data.py                   |   8 +
 examples/char-rnn/job.conf                  | 250 +++++++++++++++++++
 examples/char-rnn/sample.conf               | 212 ++++++++++++++++
 include/singa/neuralnet/neuron_layer.h      |  15 +-
 include/singa/utils/math_blob.h             |   2 +
 src/neuralnet/connection_layer/rnn_dummy.cc |  67 +++++
 src/neuralnet/input_layer/char_rnn.cc       |   6 +-
 src/neuralnet/input_layer/onehot.cc         |  40 +++
 src/neuralnet/neuralnet.cc                  |  73 +++---
 src/neuralnet/neuron_layer/dummy.cc         |   1 -
 src/neuralnet/neuron_layer/embedding.cc     |   4 +-
 src/neuralnet/neuron_layer/gru.cc           | 298 +++++++++++------
 src/neuralnet/output_layer/char_rnn.cc      |  51 ++++
 src/utils/updater.cc                        |   6 +
 src/worker.cc                               |   3 +-
 15 files changed, 832 insertions(+), 204 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/examples/char-rnn/data.py
----------------------------------------------------------------------
diff --git a/examples/char-rnn/data.py b/examples/char-rnn/data.py
new file mode 100644
index 0000000..4b1c28c
--- /dev/null
+++ b/examples/char-rnn/data.py
@@ -0,0 +1,8 @@
+# pls get linux_input.txt from http://cs.stanford.edu/people/karpathy/char-rnn/
+data = open('linux_input.txt', 'r').read() # should be simple plain text file
+chars = list(set(data))
+data_size, vocab_size = len(data), len(chars)
+print 'data has %d characters, %d unique.'
% (data_size, vocab_size) +with open('vocab.txt', 'w') as fd: + fd.write("".join(chars)) + fd.flush() http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/examples/char-rnn/job.conf ---------------------------------------------------------------------- diff --git a/examples/char-rnn/job.conf b/examples/char-rnn/job.conf new file mode 100644 index 0000000..2e1c761 --- /dev/null +++ b/examples/char-rnn/job.conf @@ -0,0 +1,250 @@ +name:"char-rnn" +train_steps: 100000 +disp_freq: 100 +#debug: true +gpu: 0 +train_one_batch { + alg: kBPTT +} + +updater { + type: kRMSProp + rmsprop_conf { + rho: 0.95 + } + learning_rate { + type: kStep + base_lr: 0.002 + step_conf { + gamma: 0.97 + change_freq: 2000 + } + } + clip_low: -5 + clip_high: 5 +} + +neuralnet { + unroll_len: 50 + layer { + name: "data" + type: kCharRNN + unroll_len: 1 + char_rnn_conf { + path: "examples/char-rnn/linux_input.txt" + vocab_path:"examples/char-rnn/vocab.txt" + batchsize: 50 + unroll_len: 50 + } + } + layer { + name: "onehot" + type: kOneHot + srclayers: "data" + unroll_conn_type: kUnrollOneToAll + } + + layer { + name: "label" + type: kRNNLabel + srclayers: "data" + unroll_conn_type: kUnrollOneToAll + } + + layer { + name: "gru1" + type: kGRU + srclayers: "onehot" + gru_conf { + dim_hidden: 512 + } + param { + name: "z_hx" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "r_hx" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "c_hx" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "z_hh" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "r_hh" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "c_hh" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "z_b" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "r_b" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "c_b" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + + } +# layer { +# name: "gru2" +# type: kGRU +# srclayers: "gru1" +# gru_conf { +# dim_hidden: 512 +# } +# param { +# name: "z_hx2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# param { +# name: "r_hx2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# param { +# name: "c_hx2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# param { +# name: "z_hh2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# param { +# name: "r_hh2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# param { +# name: "c_hh2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# param { +# name: "z_b2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# param { +# name: "r_b2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# param { +# name: "c_b2" +# init { +# type: kUniform +# low: -0.08 +# high: 0.08 +# } +# } +# } +# + layer { + name: "ip1" + type: kInnerProduct + srclayers: "gru1" + innerproduct_conf { + num_output: 101 + } + param { + name: "w" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "b" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + } + layer { + name: "loss" + type: kSoftmaxLoss + srclayers: "ip1" + srclayers: "label" + } +} + +cluster { + workspace: "examples/char-rnn/" +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/examples/char-rnn/sample.conf 
---------------------------------------------------------------------- diff --git a/examples/char-rnn/sample.conf b/examples/char-rnn/sample.conf new file mode 100644 index 0000000..b15ef9e --- /dev/null +++ b/examples/char-rnn/sample.conf @@ -0,0 +1,212 @@ +name:"char-rnn" +test_steps: 100 +#debug: true +gpu: 0 +checkpoint_path: "examples/char-rnn/checkpoint/step2000-worker0" +train_one_batch { + alg: kBPTT +} + +neuralnet { + layer { + name: "data" + type: kRNNDummy + rnn_dummy_conf { + shape: 1 + integer: true + low: 0 + high: 101 + dynamic_srclayer: "argsort" + } + } + layer { + name: "onehot" + type: kOneHot + srclayers: "data" + } + + layer { + name: "gru1" + type: kGRU + srclayers: "onehot" + gru_conf { + dim_hidden: 512 + } + param { + name: "z_hx" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "r_hx" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "c_hx" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "z_hh" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "r_hh" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "c_hh" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "z_b" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "r_b" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "c_b" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + } + layer { + name: "gru2" + type: kGRU + srclayers: "gru1" + gru_conf { + dim_hidden: 512 + } + param { + name: "z_hx2" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "r_hx2" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "c_hx2" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "z_hh2" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "r_hh2" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "c_hh2" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + } + + + layer { + name: "ip1" + type: kInnerProduct + srclayers: "gru2" + innerproduct_conf { + num_output: 101 + } + param { + name: "w" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + param { + name: "b" + init { + type: kUniform + low: -0.08 + high: 0.08 + } + } + } + layer { + name: "softmax" + type: kSoftmax + srclayers: "ip1" + } + layer { + name: "argsort" + type: kArgSort + srclayers: "softmax" + } + layer { + name: "sampling" + type: kCharRNNOutput + srclayers: "argsort" + char_rnn_conf { + vocab_path: "examples/char-rnn/vocab.txt" + } + } +} + +cluster { + workspace: "examples/char-rnn/" +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/include/singa/neuralnet/neuron_layer.h ---------------------------------------------------------------------- diff --git a/include/singa/neuralnet/neuron_layer.h b/include/singa/neuralnet/neuron_layer.h index e1a63a2..c612aed 100644 --- a/include/singa/neuralnet/neuron_layer.h +++ b/include/singa/neuralnet/neuron_layer.h @@ -187,16 +187,15 @@ class GRULayer : public NeuronLayer { } const std::vector<Param*> GetParams() const override { + std::vector<Param*> params{weight_z_hx_, weight_r_hx_,weight_c_hx_, + weight_z_hh_, weight_r_hh_, weight_c_hh_}; + if (bias_z_ != nullptr && bias_r_ != nullptr && bias_c_ != nullptr) { - std::vector<Param*> params{weight_z_hx_, weight_r_hx_,weight_c_hx_, - weight_z_hh_, weight_r_hh_, weight_c_hh_, - 
bias_z_, bias_r_, bias_c_}; - return params; - } else { - std::vector<Param*> params{weight_z_hx_, weight_r_hx_,weight_c_hx_, - weight_z_hh_, weight_r_hh_, weight_c_hh_}; - return params; + params.push_back(bias_z_); + params.push_back(bias_r_); + params.push_back(bias_c_); } + return params; } private: http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/include/singa/utils/math_blob.h ---------------------------------------------------------------------- diff --git a/include/singa/utils/math_blob.h b/include/singa/utils/math_blob.h index bdaf914..e151c37 100644 --- a/include/singa/utils/math_blob.h +++ b/include/singa/utils/math_blob.h @@ -267,6 +267,8 @@ void Map(const Blob<Dtype> & A, Blob<Dtype> * B) { } else { #ifdef USE_GPU gpu_e_f<Op>(A.count(), A.gpu_data(), B->mutable_gpu_data()); +#else + LOG(ERROR) << "Not implemented"; #endif // USE_GPU } } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/neuralnet/connection_layer/rnn_dummy.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/connection_layer/rnn_dummy.cc b/src/neuralnet/connection_layer/rnn_dummy.cc new file mode 100644 index 0000000..865066f --- /dev/null +++ b/src/neuralnet/connection_layer/rnn_dummy.cc @@ -0,0 +1,67 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
+* +*************************************************************/ + +#include "singa/neuralnet/connection_layer.h" +#include "singa/utils/math_blob.h" + +namespace singa { + +void RNNDummyLayer::Setup(const LayerProto& conf, + const vector<Layer*>& srclayers) { + Layer::Setup(conf, srclayers); + dynamic_src_ = AddPrefixSuffix(unroll_index(), partition_id(), + conf.rnn_dummy_conf().dynamic_srclayer()); + LOG(ERROR) << dynamic_src_; + vector<int> shape; + for (int s : conf.rnn_dummy_conf().shape()) + shape.push_back(s); + integer_ = conf.rnn_dummy_conf().integer(); + low_ = conf.rnn_dummy_conf().low(); + high_ = conf.rnn_dummy_conf().high(); + // if no src layer, then it will genereate data by itself based on shape + // and random range + if (srclayers.size() == 0) { + CHECK(shape.size()); + CHECK_NE(low_, high_); + data_.Reshape(shape); + srclayer_ = nullptr; + } else { + srclayer_ = srclayers.at(0); + data_.ReshapeLike(srclayer_->data(this)); + data_.ShareData(srclayer_->mutable_data(this), false); + } +} + +void RNNDummyLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { + if (srclayers.size() == 0) { + SampleUniform(low_, high_, &data_); + if (integer_) { + for (int i = 0; i < data_.count(); i ++) { + data_.mutable_cpu_data()[i] = floor(data_.cpu_data()[i]); + } + } + } else if (srclayer_ != srclayers.at(0)) { + srclayer_ = srclayers.at(0); + data_.ShareData(srclayer_->mutable_data(this), false); + } +} +} // namespace singa + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/neuralnet/input_layer/char_rnn.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/input_layer/char_rnn.cc b/src/neuralnet/input_layer/char_rnn.cc index cc13b1b..8a56711 100644 --- a/src/neuralnet/input_layer/char_rnn.cc +++ b/src/neuralnet/input_layer/char_rnn.cc @@ -69,6 +69,7 @@ void CharRNNInputLayer::ComputeFeature(int flag, // decide the start pos of each instance in one mini-batch int max_offset = buf_.length() / batchsize_; CHECK_GT(max_offset, unroll_len_); + LOG(ERROR) << "Max iteration per epoch = " << max_offset / unroll_len_; for (int i = 0; i < batchsize_; i ++) { start_.push_back(i * max_offset); } @@ -77,7 +78,7 @@ void CharRNNInputLayer::ComputeFeature(int flag, for (int l = 0; l < unroll_len_ + 1; l++) { float* ptr = datavec_[l]->mutable_cpu_data(); for (int i = 0; i < batchsize_; i++) { - ptr[i] = static_cast<float>(char2index_.at(buf_[start_[i] + l])); + ptr[i] = static_cast<float>(char2index_.at(buf_[start_[i] + offset_ + l])); } } offset_ += unroll_len_; @@ -87,9 +88,6 @@ void CharRNNInputLayer::ComputeFeature(int flag, // std::shuffle(start_.begin(), start_.end(), g); offset_ = 0; // return -1; - } else { - // return 0; } } - } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/neuralnet/input_layer/onehot.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/input_layer/onehot.cc b/src/neuralnet/input_layer/onehot.cc new file mode 100644 index 0000000..056656a --- /dev/null +++ b/src/neuralnet/input_layer/onehot.cc @@ -0,0 +1,40 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. 
The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ +#include "singa/neuralnet/input_layer.h" + +namespace singa { +void OneHotLayer::Setup(const LayerProto& conf, + const vector<Layer*>& srclayers) { + InputLayer::Setup(conf, srclayers); + batchsize_ = srclayers.at(0)->data(unroll_index()).shape(0); + dim_ = 101 ; // proto.onehot_conf().vocab_size(); + data_.Reshape(batchsize_, dim_); +} + +void OneHotLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { + float* ptr = data_.mutable_cpu_data(); + memset(ptr, 0, sizeof(float) * data_.count()); + const float* idx = srclayers[0]->data(unroll_index()).cpu_data(); + for (int i = 0; i < batchsize_; i++) { + ptr[i * dim_ + static_cast<int>(idx[i])] = 1; + } +} +} // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/neuralnet/neuralnet.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc index 49978a1..b045e06 100644 --- a/src/neuralnet/neuralnet.cc +++ b/src/neuralnet/neuralnet.cc @@ -19,13 +19,12 @@ * *************************************************************/ -#include "singa/neuralnet/neuralnet.h" +#include "singa/neuralnet/neuralnet.h" +#include <unordered_map> #include <algorithm> #include <queue> #include "singa/utils/singleton.h" -#include <unordered_map> -using namespace std; namespace singa { @@ -60,7 +59,7 @@ const NetProto NetConfPreprocess(const NetProto& conf) { } NeuralNet* NeuralNet::Create(const NetProto& net_conf, Phase phase, - int npartitions) { + int npartitions) { const NetProto& full_net_conf = NetConfPreprocess(net_conf); NetProto conf = full_net_conf; conf.clear_layer(); @@ -99,21 +98,21 @@ NeuralNet* NeuralNet::Create(const NetProto& net_conf, Phase phase, layer_conf->set_partition_dim(net_conf.partition_dim()); } // LOG(INFO) << "Before unrolling: \n" << conf.DebugString(); - conf = Unrolling (conf); + conf = Unrolling(conf); // Copy shared parameters for sharing param conf std::vector<ParamProto*> shares; std::unordered_map<string, ParamProto*> name2param; - for (int index = 0; index < conf.layer_size();index ++) { - LayerProto* layer = conf.mutable_layer(index); - for (int i = 0; i < layer->param_size(); i++) { - ParamProto* param = layer->mutable_param(i); + for (int index = 0; index < conf.layer_size(); index++) { + LayerProto* layer = conf.mutable_layer(index); + for (int i = 0; i < layer->param_size(); i++) { + ParamProto* param = layer->mutable_param(i); CHECK(name2param.find(param->name()) == name2param.end()) << "Repeated param = " << param->name(); - name2param[param->name()] = param; - if (param->has_share_from() && param->share_from() != "") - shares.push_back(param); - } + name2param[param->name()] = param; + if (param->has_share_from() && param->share_from() != "") + shares.push_back(param); + } } for (auto param : shares) { const std::string from = param->share_from(); @@ -135,32 
+134,30 @@ const NetProto NeuralNet::Unrolling(const NetProto& net_conf) { NetProto conf; std::vector<std::vector<int>> layer_groups; - std::unordered_map<string,int> org_layer_names; + std::unordered_map<string, int> org_layer_names; for (int index = 0; index < net_conf.layer_size(); index ++) { const LayerProto& org_layer = net_conf.layer(index); - org_layer_names[org_layer.name()] = index; // layer_name -> index + org_layer_names[org_layer.name()] = index; // layer_name -> index std::vector<int> layer_group; - for (int i = 0; i < org_layer.unroll_len(); i ++) { // unroll + for (int i = 0; i < org_layer.unroll_len(); i ++) { // unroll LayerProto* unroll_layer = conf.add_layer(); - unroll_layer->CopyFrom(org_layer); // create a new layer conf - // if (org_layer.unroll_len() > 1) { - // update layer names - std::stringstream sstm; - sstm << i << '#' << unroll_layer->name(); - unroll_layer->set_name(sstm.str()); - unroll_layer->set_unroll_index(i); - // update layer parameter sharing - for (int j = 0; j < unroll_layer->param_size(); j ++) { - ParamProto* param = unroll_layer->mutable_param(j); - if (i > 0) { - param->set_share_from("0#" + param->name()); - } - std::stringstream sstm1; - sstm1 << i << '#' << param->name(); - param->set_name(sstm1.str()); + unroll_layer->CopyFrom(org_layer); // create a new layer conf + // update layer names + std::stringstream sstm; + sstm << i << '#' << unroll_layer->name(); + unroll_layer->set_name(sstm.str()); + unroll_layer->set_unroll_index(i); + // update layer parameter sharing + for (int j = 0; j < unroll_layer->param_size(); j ++) { + ParamProto* param = unroll_layer->mutable_param(j); + if (i > 0) { + param->set_share_from("0#" + param->name()); } - // } + std::stringstream sstm1; + sstm1 << i << '#' << param->name(); + param->set_name(sstm1.str()); + } // clear unrolling related fields unroll_layer->clear_unroll_len(); unroll_layer->clear_unroll_conn_type(); @@ -176,7 +173,7 @@ const NetProto NeuralNet::Unrolling(const NetProto& net_conf) { for (int index = 0; index < net_conf.layer_size(); index ++) { const LayerProto& org_layer = net_conf.layer(index); if (org_layer.srclayers_size() == 0) - continue; // no src layer + continue; // no src layer for (int i = 0; i < org_layer.srclayers_size(); i ++) { const string& org_layer_src = org_layer.srclayers(i); singa::UnrollConnType unroll_conn_type = kUnrollOneToOne; @@ -197,7 +194,7 @@ const NetProto NeuralNet::Unrolling(const NetProto& net_conf) { unroll_layer->add_srclayers(conf.layer(unroll_layer_src).name()); } } else if (unroll_conn_type == kUnrollOneToOne) { - if (j < shift) continue; // no need to connect with the src + if (j < shift) continue; // no need to connect with the src int unroll_layer_src = unroll_layer_srcs[j - shift]; unroll_layer->add_srclayers(conf.layer(unroll_layer_src).name()); } else if (unroll_conn_type == kUnrollFirstToLast) { @@ -209,16 +206,14 @@ const NetProto NeuralNet::Unrolling(const NetProto& net_conf) { } } - //TODO(fanju): add LSTM when it is ready - if (org_layer.type() == kGRU) { // connect GRU layers + // TODO(fanju): add LSTM when it is ready + if (org_layer.type() == kGRU) { // connect GRU layers for (unsigned int j = 1; j < layer_groups[index].size(); j ++) { LayerProto* unroll_layer = conf.mutable_layer(layer_groups[index][j]); string srcname = conf.layer(layer_groups[index][j-1]).name(); unroll_layer->add_srclayers(srcname); - // LOG(ERROR) << "connect " << unroll_layer->name() << " from " << srcname; } } - } return conf; } 
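
Note on the Unrolling() change above: every layer with unroll_len > 1 is copied once per unrolled step; copy i is renamed "i#<layer>", its params are renamed "i#<param>", and copies with i > 0 share their weights from the step-0 copy via share_from. A rough Python sketch of just this naming/sharing scheme (hypothetical helper, not SINGA code):

    def unroll_layer(layer_name, param_names, unroll_len):
        # Mirrors the renaming in NeuralNet::Unrolling: step i becomes "i#<name>",
        # and params of steps i > 0 point back to the step-0 copy.
        copies = []
        for i in range(unroll_len):
            params = [{"name": "%d#%s" % (i, p),
                       "share_from": "0#%s" % p if i > 0 else ""}
                      for p in param_names]
            copies.append({"name": "%d#%s" % (i, layer_name),
                           "unroll_index": i,
                           "params": params})
        return copies

    # unroll_layer("gru1", ["z_hx"], 3) -> "0#gru1", "1#gru1", "2#gru1", where
    # "1#z_hx" and "2#z_hx" share from "0#z_hx".
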
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/neuralnet/neuron_layer/dummy.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuron_layer/dummy.cc b/src/neuralnet/neuron_layer/dummy.cc index 9ccb179..936bb5e 100644 --- a/src/neuralnet/neuron_layer/dummy.cc +++ b/src/neuralnet/neuron_layer/dummy.cc @@ -45,7 +45,6 @@ void DummyLayer::Setup(const LayerProto& proto, if (proto.dummy_conf().output()) { // use as output layer output_ = true; } - } void DummyLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/neuralnet/neuron_layer/embedding.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuron_layer/embedding.cc b/src/neuralnet/neuron_layer/embedding.cc index 00e9139..c980c54 100644 --- a/src/neuralnet/neuron_layer/embedding.cc +++ b/src/neuralnet/neuron_layer/embedding.cc @@ -65,8 +65,8 @@ void EmbeddingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { } } -void EmbeddingLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) -{ +void EmbeddingLayer::ComputeGradient(int flag, + const vector<Layer*>& srclayers) { const float* word_idx = srclayers.at(0)->data(unroll_index()).cpu_data(); auto context = Singleton<Context>::Instance(); if ((flag & kAggGrad) == 0) http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/neuralnet/neuron_layer/gru.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/neuron_layer/gru.cc b/src/neuralnet/neuron_layer/gru.cc index cf7425b..da2f9c5 100644 --- a/src/neuralnet/neuron_layer/gru.cc +++ b/src/neuralnet/neuron_layer/gru.cc @@ -19,14 +19,12 @@ * *************************************************************/ -#include "singa/neuralnet/neuron_layer.h" - #include <glog/logging.h> +#include "singa/neuralnet/neuron_layer.h" #include "singa/utils/singleton.h" #include "singa/utils/math_blob.h" #include "singa/utils/singa_op.h" -#include <iostream> using namespace std; namespace singa { @@ -57,10 +55,10 @@ void GRULayer::Setup(const LayerProto& conf, CHECK_LE(srclayers.size(), 2); const auto& src = srclayers[0]->data(this); - batchsize_ = src.shape()[0]; // size of batch - vdim_ = src.count() / (batchsize_); // dimension of input + batchsize_ = src.shape()[0]; // size of batch + vdim_ = src.count() / (batchsize_); // dimension of input - hdim_ = layer_conf_.gru_conf().dim_hidden(); // dimension of hidden state + hdim_ = layer_conf_.gru_conf().dim_hidden(); // dimension of hidden state data_.Reshape(vector<int>{batchsize_, hdim_}); grad_.ReshapeLike(data_); @@ -77,9 +75,9 @@ void GRULayer::Setup(const LayerProto& conf, weight_c_hh_ = Param::Create(conf.param(5)); if (conf.param_size() > 6) { - bias_z_ = Param::Create(conf.param(6)); - bias_r_ = Param::Create(conf.param(7)); - bias_c_ = Param::Create(conf.param(8)); + bias_z_ = Param::Create(conf.param(6)); + bias_r_ = Param::Create(conf.param(7)); + bias_c_ = Param::Create(conf.param(8)); } weight_z_hx_->Setup(vector<int>{hdim_, vdim_}); @@ -91,168 +89,170 @@ void GRULayer::Setup(const LayerProto& conf, weight_c_hh_->Setup(vector<int>{hdim_, hdim_}); if (conf.param_size() > 6) { - bias_z_->Setup(vector<int>{hdim_}); - bias_r_->Setup(vector<int>{hdim_}); - bias_c_->Setup(vector<int>{hdim_}); + bias_z_->Setup(vector<int>{hdim_}); + bias_r_->Setup(vector<int>{hdim_}); + bias_c_->Setup(vector<int>{hdim_}); } update_gate 
= new Blob<float>(batchsize_, hdim_); reset_gate = new Blob<float>(batchsize_, hdim_); - // reset gate x context - reset_context = new Blob<float>(batchsize_, hdim_); new_memory = new Blob<float>(batchsize_, hdim_); } void GRULayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { - CHECK_LE(srclayers.size(), 2); - - // Do transpose - Blob<float> *w_z_hx_t = Transpose (weight_z_hx_->data()); - Blob<float> *w_z_hh_t = Transpose (weight_z_hh_->data()); - Blob<float> *w_r_hx_t = Transpose (weight_r_hx_->data()); - Blob<float> *w_r_hh_t = Transpose (weight_r_hh_->data()); - Blob<float> *w_c_hx_t = Transpose (weight_c_hx_->data()); - Blob<float> *w_c_hh_t = Transpose (weight_c_hh_->data()); - - // Prepare the data input and the context - const auto& src = srclayers[0]->data(this); - const Blob<float> *context; - if (srclayers.size() == 1) { // only have data input - context = new Blob<float>(batchsize_, hdim_); - } else { // have data input & context - context = &srclayers[1]->data(this); - } - - // Compute the update gate - GEMM(1.0f, 0.0f, src,*w_z_hx_t,update_gate); - if (bias_z_ != nullptr) - MVAddRow(1.0f,1.0f,bias_z_->data(),update_gate); - GEMM(1.0f, 1.0f, *context, *w_z_hh_t, update_gate); - Map<op::Sigmoid<float>,float>(*update_gate, update_gate); - - // Compute the reset gate - GEMM(1.0f, 0.0f, src, *w_r_hx_t, reset_gate); - if (bias_r_ != nullptr) - MVAddRow(1.0f,1.0f, bias_r_->data(),reset_gate); - GEMM(1.0f, 1.0f, *context, *w_r_hh_t, reset_gate); - Map<op::Sigmoid<float>,float>(*reset_gate, reset_gate); - - // Compute the new memory - Mult<float>(*reset_gate, *context, reset_context); - GEMM(1.0f, 0.0f, *reset_context, *w_c_hh_t, new_memory); - GEMM(1.0f, 1.0f, src, *w_c_hx_t, new_memory); - if (bias_c_ != nullptr) - MVAddRow(1.0f, 1.0f, bias_c_->data(), new_memory); - Map<op::Tanh<float>,float>(*new_memory, new_memory); - - Sub(*new_memory, *context, &data_); + CHECK_LE(srclayers.size(), 2); + + // Do transpose + Blob<float> *w_z_hx_t = Transpose(weight_z_hx_->data()); + Blob<float> *w_z_hh_t = Transpose(weight_z_hh_->data()); + Blob<float> *w_r_hx_t = Transpose(weight_r_hx_->data()); + Blob<float> *w_r_hh_t = Transpose(weight_r_hh_->data()); + Blob<float> *w_c_hx_t = Transpose(weight_c_hx_->data()); + Blob<float> *w_c_hh_t = Transpose(weight_c_hh_->data()); + + // Prepare the data input and the context + const auto& src = srclayers[0]->data(this); + const Blob<float> *context; + if (srclayers.size() == 1) { // only have data input + context = new Blob<float>(batchsize_, hdim_); + } else { // have data input & context + context = &srclayers[1]->data(this); + } + + // Compute the update gate + GEMM(1.0f, 0.0f, src, *w_z_hx_t, update_gate); + if (bias_z_ != nullptr) + MVAddRow(1.0f, 1.0f, bias_z_->data(), update_gate); + GEMM(1.0f, 1.0f, *context, *w_z_hh_t, update_gate); + Map<op::Sigmoid<float>, float>(*update_gate, update_gate); + + // Compute the reset gate + GEMM(1.0f, 0.0f, src, *w_r_hx_t, reset_gate); + if (bias_r_ != nullptr) + MVAddRow(1.0f, 1.0f, bias_r_->data(), reset_gate); + GEMM(1.0f, 1.0f, *context, *w_r_hh_t, reset_gate); + Map<op::Sigmoid<float>, float>(*reset_gate, reset_gate); + + // Compute the new memory + GEMM(1.0f, 0.0f, src, *w_c_hx_t, new_memory); + if (bias_c_ != nullptr) + MVAddRow(1.0f, 1.0f, bias_c_->data(), new_memory); + Mult<float>(*reset_gate, *new_memory, new_memory); + GEMM(1.0f, 1.0f, *context, *w_c_hh_t, new_memory); + Map<op::Tanh<float>, float>(*new_memory, new_memory); + + + Sub(*context, *new_memory, &data_); Mult(data_, 
*update_gate, &data_); - AXPY(1.0f, *context, &data_); + Add(data_, *new_memory, &data_); - // delete the pointers - if (srclayers.size() == 1) + // delete the pointers + if (srclayers.size() == 1) delete context; - delete w_z_hx_t; - delete w_z_hh_t; - delete w_r_hx_t; - delete w_r_hh_t; - delete w_c_hx_t; - delete w_c_hh_t; + delete w_z_hx_t; + delete w_z_hh_t; + delete w_r_hx_t; + delete w_r_hh_t; + delete w_c_hx_t; + delete w_c_hh_t; } void GRULayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) { - CHECK_LE(srclayers.size(), 2); - // agg grad from two dst layers + CHECK_LE(srclayers.size(), 2); + // agg grad from two dst layers, gradvec_[0] is grad_ AXPY(1.0f, *gradvec_[1], &grad_); - float beta = 1.0f; // agg param gradients - - Layer* ilayer = srclayers[0]; // input layer - Layer* clayer = nullptr; // context layer - // Prepare the data input and the context - const Blob<float>& src = ilayer->data(this); - const Blob<float> *context; - if (srclayers.size() == 1) { // only have data input - context = new Blob<float>(batchsize_, hdim_); - } else { // have data input & context + float beta = 1.0f; // agg param gradients + + Layer* ilayer = srclayers[0]; // input layer + Layer* clayer = nullptr; // context layer + // Prepare the data input and the context + const Blob<float>& src = ilayer->data(this); + const Blob<float> *context; + if (srclayers.size() == 1) { // only have data input + context = new Blob<float>(batchsize_, hdim_); + } else { // have data input & context clayer = srclayers[1]; - context = &(clayer->data(this)); - } - - // Prepare gradient of output neurons - Blob<float> *grad_t = Transpose (grad_); - - // Compute intermediate gradients which are used for other computations - Blob<float> dugatedz (batchsize_, hdim_); - Map<singa::op::SigmoidGrad<float>, float>(*update_gate, &dugatedz); - Blob<float> drgatedr (batchsize_, hdim_); - Map<singa::op::SigmoidGrad<float>, float>(*reset_gate, &drgatedr); - Blob<float> dnewmdc (batchsize_, hdim_); - Map<singa::op::TanhGrad<float>, float>(*new_memory, &dnewmdc); - - Blob<float> dLdz (batchsize_, hdim_); - Sub<float>(*new_memory, *context, &dLdz); - Mult<float>(dLdz, grad_, &dLdz); - Mult<float>(dLdz, dugatedz, &dLdz); - - Blob<float> dLdc (batchsize_,hdim_); - Mult(grad_, *update_gate, &dLdc); - Mult(dLdc, dnewmdc, &dLdc); - - Blob<float> reset_dLdc (batchsize_,hdim_); - GEMM(1.0f, 0.0f, dLdc, weight_c_hh_->data(), &reset_dLdc); - - Blob<float> dLdr (batchsize_, hdim_); - Mult(reset_dLdc, *context, &dLdr); - Mult(dLdr, drgatedr, &dLdr); - - // Compute gradients for parameters of update gate - Blob<float> *dLdz_t = Transpose(dLdz); - GEMM(1.0f, beta, *dLdz_t, src, weight_z_hx_->mutable_grad()); - GEMM(1.0f, beta, *dLdz_t, *context, weight_z_hh_->mutable_grad()); - if (bias_z_ != nullptr) - MVSumRow<float>(1.0f, beta, dLdz, bias_z_->mutable_grad()); - delete dLdz_t; - - // Compute gradients for parameters of reset gate - Blob<float> *dLdr_t = Transpose(dLdr); - GEMM(1.0f, beta, *dLdr_t, src, weight_r_hx_->mutable_grad()); - GEMM(1.0f, beta, *dLdr_t, *context, weight_r_hh_->mutable_grad()); - if (bias_r_ != nullptr) - MVSumRow(1.0f, beta, dLdr, bias_r_->mutable_grad()); - delete dLdr_t; - - // Compute gradients for parameters of new memory - Blob<float> *dLdc_t = Transpose(dLdc); - GEMM(1.0f, beta, *dLdc_t, src, weight_c_hx_->mutable_grad()); - GEMM(1.0f, beta, *dLdc_t, *reset_context, weight_c_hh_->mutable_grad()); - if (bias_c_ != nullptr) - MVSumRow(1.0f, beta, dLdc, bias_c_->mutable_grad()); - delete dLdc_t; - - // 
Compute gradients for data input layer - if (srclayers[0]->mutable_grad(this) != nullptr) { - GEMM(1.0f,0.0f, dLdc, weight_c_hx_->data(), ilayer->mutable_grad(this)); - GEMM(1.0f,1.0f, dLdz, weight_z_hx_->data(), ilayer->mutable_grad(this)); - GEMM(1.0f,1.0f, dLdr, weight_r_hx_->data(), ilayer->mutable_grad(this)); - } - - if (clayer != nullptr && clayer->mutable_grad(this) != nullptr) { - // Compute gradients for context layer - Mult(reset_dLdc, *reset_gate, clayer->mutable_grad(this)); - GEMM(1.0f, 1.0f, dLdr, weight_r_hh_->data(), clayer->mutable_grad(this)); - GEMM(1.0f, 1.0f, dLdz, weight_z_hh_->data(), clayer->mutable_grad(this)); - AXPY(-1.0f, *update_gate, clayer->mutable_grad(this)); + context = &(clayer->data(this)); + } + + // Compute intermediate gradients which are used for other computations + Blob<float> dugatedz(batchsize_, hdim_); + Map<singa::op::SigmoidGrad<float>, float>(*update_gate, &dugatedz); + Blob<float> drgatedr(batchsize_, hdim_); + Map<singa::op::SigmoidGrad<float>, float>(*reset_gate, &drgatedr); + Blob<float> dnewmdc(batchsize_, hdim_); + Map<singa::op::TanhGrad<float>, float>(*new_memory, &dnewmdc); + + Blob<float> dLdz(batchsize_, hdim_); + Sub<float>(*context, *new_memory, &dLdz); + Mult<float>(dLdz, grad_, &dLdz); + Mult<float>(dLdz, dugatedz, &dLdz); + + Blob<float> dLdc(batchsize_, hdim_); + Blob<float> z1(batchsize_, hdim_); + z1.SetValue(1.0f); + AXPY<float>(-1.0f, *update_gate, &z1); + Mult(grad_, z1, &dLdc); + Mult(dLdc, dnewmdc, &dLdc); + + Blob<float> reset_dLdc(batchsize_, hdim_); + Mult(dLdc, *reset_gate, &reset_dLdc); + + Blob<float> dLdr(batchsize_, hdim_); + Blob<float> cprev(batchsize_, hdim_); + GEMM(1.0f, 0.0f, *context, weight_c_hh_->data().T(), &cprev); + Mult(dLdc, cprev, &dLdr); + Mult(dLdr, drgatedr, &dLdr); + + // Compute gradients for parameters of update gate + Blob<float> *dLdz_t = Transpose(dLdz); + GEMM(1.0f, beta, *dLdz_t, src, weight_z_hx_->mutable_grad()); + GEMM(1.0f, beta, *dLdz_t, *context, weight_z_hh_->mutable_grad()); + if (bias_z_ != nullptr) + MVSumRow<float>(1.0f, beta, dLdz, bias_z_->mutable_grad()); + delete dLdz_t; + + // Compute gradients for parameters of reset gate + Blob<float> *dLdr_t = Transpose(dLdr); + GEMM(1.0f, beta, *dLdr_t, src, weight_r_hx_->mutable_grad()); + GEMM(1.0f, beta, *dLdr_t, *context, weight_r_hh_->mutable_grad()); + if (bias_r_ != nullptr) + MVSumRow(1.0f, beta, dLdr, bias_r_->mutable_grad()); + delete dLdr_t; + + // Compute gradients for parameters of new memory + Blob<float> *dLdc_t = Transpose(dLdc); + GEMM(1.0f, beta, *dLdc_t, src, weight_c_hx_->mutable_grad()); + if (bias_c_ != nullptr) + MVSumRow(1.0f, beta, dLdc, bias_c_->mutable_grad()); + delete dLdc_t; + + Blob<float> *reset_dLdc_t = Transpose(reset_dLdc); + GEMM(1.0f, beta, *reset_dLdc_t, *context, weight_c_hh_->mutable_grad()); + delete reset_dLdc_t; + + // Compute gradients for data input layer + if (srclayers[0]->mutable_grad(this) != nullptr) { + GEMM(1.0f, 0.0f, dLdc, weight_c_hx_->data(), ilayer->mutable_grad(this)); + GEMM(1.0f, 1.0f, dLdz, weight_z_hx_->data(), ilayer->mutable_grad(this)); + GEMM(1.0f, 1.0f, dLdr, weight_r_hx_->data(), ilayer->mutable_grad(this)); + } + + if (clayer != nullptr && clayer->mutable_grad(this) != nullptr) { + // Compute gradients for context layer + GEMM(1.0f, 0.0f, reset_dLdc, weight_c_hh_->data(), + clayer->mutable_grad(this)); + GEMM(1.0f, 1.0f, dLdr, weight_r_hh_->data(), clayer->mutable_grad(this)); + GEMM(1.0f, 1.0f, dLdz, weight_z_hh_->data(), clayer->mutable_grad(this)); + 
Add(clayer->grad(this), *update_gate, clayer->mutable_grad(this)); // LOG(ERROR) << "grad to prev gru " << Asum(clayer->grad(this)); - } + } - if (srclayers.size() == 1) + if (srclayers.size() == 1) delete context; - else - context = NULL; - delete grad_t; } } // namespace singa http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/neuralnet/output_layer/char_rnn.cc ---------------------------------------------------------------------- diff --git a/src/neuralnet/output_layer/char_rnn.cc b/src/neuralnet/output_layer/char_rnn.cc new file mode 100644 index 0000000..c3f1733 --- /dev/null +++ b/src/neuralnet/output_layer/char_rnn.cc @@ -0,0 +1,51 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +#include <algorithm> +#include <iostream> +#include <fstream> +#include "singa/neuralnet/output_layer.h" + +namespace singa { + +void CharRNNOutputLayer::Setup(const LayerProto& proto, + const vector<Layer*>& srclayers) { + CHECK_EQ(srclayers.size(), 1); + OutputLayer::Setup(proto, srclayers); + std::ifstream fin; + const string path = proto.char_rnn_conf().vocab_path(); + fin.open(path); + CHECK(fin.is_open()) << "Can't open vocab_path = " << path; + std::stringstream stream; + stream << fin.rdbuf(); + vocab_ = stream.str(); + fin.close(); +} + +void CharRNNOutputLayer::ComputeFeature(int flag, + const vector<Layer*>& srclayers) { + const float* dptr = srclayers[0]->data(this).cpu_data(); + for (int i = 0; i < srclayers[0]->data(this).shape(0); i++) { + std::cout<<vocab_[static_cast<int>(dptr[i])]; + } +} + +} // namespace singa; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/utils/updater.cc ---------------------------------------------------------------------- diff --git a/src/utils/updater.cc b/src/utils/updater.cc index 200670a..1b3e26c 100644 --- a/src/utils/updater.cc +++ b/src/utils/updater.cc @@ -140,6 +140,9 @@ void SGDUpdater::Update(int step, Param* param, float grad_scale) { /***********************Nesterov******************************/ void NesterovUpdater::Update(int step, Param* param, float grad_scale) { + if (clip_high_ > clip_low_) + Clip(clip_low_, clip_high_, param); + Shape<1> s = Shape1(param->size()); Tensor<cpu, 1> data(param->mutable_cpu_data(), s); Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s); @@ -181,6 +184,9 @@ void RMSPropUpdater::Init(const UpdaterProto& proto) { } void RMSPropUpdater::Update(int step, Param* param, float grad_scale) { + if (clip_high_ > clip_low_) + Clip(clip_low_, clip_high_, param); + Shape<1> s=Shape1(param->size()); Tensor<cpu, 1> data(param->mutable_cpu_data(), s); Tensor<cpu, 1> grad(param->mutable_cpu_grad(), s); 
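
Note on the updater change above: both NesterovUpdater and RMSPropUpdater now clip each parameter gradient element-wise before the update whenever clip_high > clip_low; job.conf enables this with clip_low: -5 and clip_high: 5. A minimal NumPy sketch of an RMSProp step with that clipping (the epsilon and the exact update form here are assumptions for illustration, not copied from updater.cc):

    import numpy as np

    def rmsprop_step(param, grad, hist, lr=0.002, rho=0.95, eps=1e-8,
                     clip_low=-5.0, clip_high=5.0):
        if clip_high > clip_low:                       # clipping added by this commit
            grad = np.clip(grad, clip_low, clip_high)
        hist *= rho
        hist += (1.0 - rho) * grad * grad              # running mean of squared grads
        param -= lr * grad / (np.sqrt(hist) + eps)
        return param, hist
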
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a2f4e468/src/worker.cc ---------------------------------------------------------------------- diff --git a/src/worker.cc b/src/worker.cc index abe74e7..2afa8b0 100644 --- a/src/worker.cc +++ b/src/worker.cc @@ -433,7 +433,8 @@ void BPTTWorker::Backward(int step, NeuralNet* net) { for (auto it = layers.rbegin(); it != layers.rend(); it++) { Layer* layer = *it; if (layer->partition_id() == id_) { - layer->ComputeGradient(kTrain | kBackward | kAggGrad, net->srclayers(layer)); + layer->ComputeGradient(kTrain | kBackward | kAggGrad, + net->srclayers(layer)); // LOG(ERROR) << layer->name() << " backward"; if (job_conf_.debug() && DisplayNow(step) && grp_id_ == 0) label[layer->name()] = layer->ToString(true, kTrain | kBackward);
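
The worker.cc hunk above is from BPTTWorker::Backward, which walks the unrolled layers in reverse and passes kAggGrad so every unrolled copy accumulates its parameter gradients into the shared step-0 parameters. Schematically, one training window looks like the sketch below (purely illustrative; it reuses the gru_step helper sketched near the top of this message):

    def bptt_window(xs, h0, params):
        # Forward through one unrolled window (unroll_len steps, 50 in job.conf),
        # keeping every hidden state so the backward pass can revisit them in
        # reverse order and sum the per-step parameter gradients.
        hs = [h0]
        for x in xs:
            hs.append(gru_step(x, hs[-1], *params))
        return hs  # hs[1:] feed the shared ip1 + softmax loss at each step
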
