Repository: incubator-singa
Updated Branches:
  refs/heads/master 538736c4a -> 7a61a687c
SINGA-47 Fix a bug in data layers that leads to out-of-memory when group size is too large

The bug is fixed by closing the data source (e.g., lmdb or datashard) after
reading a sample record in the Setup function. The data source would cache
memory, which eats up all the memory if there are many data layers.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/7a61a687
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/7a61a687
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/7a61a687

Branch: refs/heads/master
Commit: 7a61a687c2ceb4fc7e05c2d3bbd9817e8ba59e3f
Parents: 538736c
Author: Wei Wang <[email protected]>
Authored: Wed Aug 12 17:28:50 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Wed Aug 12 17:32:50 2015 +0800

----------------------------------------------------------------------
 include/neuralnet/layer.h          |  4 +++-
 include/neuralnet/optional_layer.h |  2 ++
 src/neuralnet/layer.cc             | 14 +++++++++++--
 src/neuralnet/neuralnet.cc         |  3 +++
 src/neuralnet/optional_layer.cc    | 36 +++++++++++++++++++++------------
 5 files changed, 43 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/include/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h
index 05db916..118da56 100644
--- a/include/neuralnet/layer.h
+++ b/include/neuralnet/layer.h
@@ -335,10 +335,12 @@ class ShardDataLayer: public DataLayer{
  public:
   using Layer::ComputeFeature;
+  ~ShardDataLayer();
   void Setup(const LayerProto& proto, int npartitions) override;
   void ComputeFeature(Phase phase, Metric *perf) override;
+
  private:
-  shared_ptr<DataShard> shard_;
+  DataShard* shard_;
 };
 
 /**


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/include/neuralnet/optional_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/optional_layer.h b/include/neuralnet/optional_layer.h
index 2cbcdb8..f6b60d3 100644
--- a/include/neuralnet/optional_layer.h
+++ b/include/neuralnet/optional_layer.h
@@ -9,6 +9,8 @@ class LMDBDataLayer: public DataLayer{
  public:
   using Layer::ComputeFeature;
+  ~LMDBDataLayer();
+  void OpenLMDB(const std::string& path);
   void Setup(const LayerProto& proto, int npartitions) override;
   void ComputeFeature(Phase phase, Metric *perf) override;
   void ConvertCaffeDatumToRecord(const CaffeDatum& datum,


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index 1fa92fb..314bb14 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -666,6 +666,9 @@ void RGBImageLayer::Setup(const LayerProto& proto, int npartitions) {
 
 /***************Implementation for ShardDataLayer**************************/
 void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){
+  if (shard_ == nullptr)
+    shard_ = new DataShard(layer_proto_.sharddata_conf().path(),
+                           DataShard::kRead);
   if(random_skip_){
     int nskip = rand() % random_skip_;
     LOG(INFO)<<"Random Skip "<<nskip<<" records, there are "<<shard_->Count()
@@ -687,10 +690,11 @@ void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){
 
 void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
   Layer::Setup(proto, npartitions);
-  shard_= std::make_shared<DataShard>(proto.sharddata_conf().path(),
-      DataShard::kRead);
+  shard_= new DataShard(proto.sharddata_conf().path(), DataShard::kRead);
   string key;
   shard_->Next(&key, &sample_);
+  delete shard_;
+  shard_ = nullptr;
   batchsize_=proto.sharddata_conf().batchsize();
   if(partition_dim() == 0)
     batchsize_ /= npartitions;
@@ -698,6 +702,12 @@ void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
   records_.resize(batchsize_);
   random_skip_=proto.sharddata_conf().random_skip();
 }
+
+ShardDataLayer::~ShardDataLayer() {
+  if (shard_ != nullptr)
+    delete shard_;
+  shard_ = nullptr;
+}
 /*******************Implementation of TanLayer***************************/
 void TanhLayer::Setup(const LayerProto& proto, int npartitions){
   Layer::Setup(proto, npartitions);


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 10ddcf1..83bd4fd 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -3,6 +3,9 @@
 
 #include "neuralnet/neuralnet.h"
 #include "utils/singleton.h"
+#ifdef USE_OPTIONAL_LAYER
+#include "neuralnet/optional_layer.h"
+#endif
 
 namespace singa {
 // macros to shorten the code


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/optional_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/optional_layer.cc b/src/neuralnet/optional_layer.cc
index ba85807..06f413f 100644
--- a/src/neuralnet/optional_layer.cc
+++ b/src/neuralnet/optional_layer.cc
@@ -3,7 +3,9 @@
 namespace singa {
 
 /*********************LMDBDataLayer**********************************/
-void LMDBDataLayer::ComputeFeature(Phase phase, Metric* perf){
+void LMDBDataLayer::ComputeFeature(Phase phase, Metric* perf) {
+  if (mdb_cursor_ == nullptr)
+    OpenLMDB(layer_proto_.lmdbdata_conf().path());
   if(random_skip_){
     int nskip = rand() % random_skip_;
     int n=0;
@@ -63,29 +65,31 @@ void LMDBDataLayer::ConvertCaffeDatumToRecord(const CaffeDatum& datum,
   }
 }
 
-void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
+void LMDBDataLayer::OpenLMDB(const std::string& path) {
   CHECK_EQ(mdb_env_create(&mdb_env_), MDB_SUCCESS) << "mdb_env_create failed";
   CHECK_EQ(mdb_env_set_mapsize(mdb_env_, 1099511627776), MDB_SUCCESS); // 1TB
-  CHECK_EQ(mdb_env_open(mdb_env_,
-           proto.lmdbdata_conf().path().c_str(),
-           MDB_RDONLY, 0664), MDB_SUCCESS) << "cannot open lmdb "
-    << proto.lmdbdata_conf().path();
+  CHECK_EQ(mdb_env_open(mdb_env_, path.c_str(),
+           MDB_RDONLY, 0664), MDB_SUCCESS) << "cannot open lmdb " << path;
   CHECK_EQ(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_), MDB_SUCCESS)
     << "mdb_txn_begin failed";
   CHECK_EQ(mdb_open(mdb_txn_, NULL, 0, &mdb_dbi_), MDB_SUCCESS)
     << "mdb_open failed";
   CHECK_EQ(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_), MDB_SUCCESS)
    << "mdb_cursor_open failed";
-  LOG(INFO) << "Opening lmdb " << proto.lmdbdata_conf().path();
+  LOG(INFO) << "Opening lmdb " << path;
   CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_FIRST),
            MDB_SUCCESS) << "mdb_cursor_get failed";
+}
+
+void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  OpenLMDB(proto.lmdbdata_conf().path());
+  CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT),
+           MDB_SUCCESS);
+  mdb_cursor_close(mdb_cursor_);
+  mdb_txn_abort(mdb_txn_);
+  mdb_cursor_ = nullptr;
-  if (mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT)
-      != MDB_SUCCESS) {
-    CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_,
-             MDB_FIRST), MDB_SUCCESS);
-  }
   CaffeDatum datum;
   datum.ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size);
   SingleLabelImageRecord* record=sample_.mutable_image();
@@ -98,6 +102,12 @@ void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
   random_skip_=proto.lmdbdata_conf().random_skip();
 }
 
+LMDBDataLayer::~LMDBDataLayer() {
+  mdb_cursor_close(mdb_cursor_);
+  mdb_txn_abort(mdb_txn_);
+  mdb_cursor_ = nullptr;
+}
+
 } /* singa */
 #endif
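
Both layers apply the same lifecycle fix: Setup only needs a single sample record (e.g., to derive shapes), so the data source is opened, read once, and closed immediately to release its cache; it is then reopened lazily on the first ComputeFeature call. The following is a minimal standalone sketch of that pattern, not SINGA's actual API: DataSource and DataLayerSketch are hypothetical stand-ins for lmdb/DataShard and the data layers, and the sketch uses std::unique_ptr where the patch itself manages a raw DataShard* with explicit delete.

    // Minimal sketch of the open-read-close pattern behind the fix.
    #include <memory>
    #include <string>

    // Hypothetical stand-in for a data source such as lmdb or DataShard:
    // while open, it holds a large read cache that is only released when
    // the object is destroyed.
    class DataSource {
     public:
      explicit DataSource(const std::string& path) : path_(path) {
        // open files / mmap / allocate the read cache here
      }
      ~DataSource() {
        // close handles and release the cache here
      }
      bool Next(std::string* key, std::string* record) {
        *key = "key";
        *record = "record";
        return true;
      }

     private:
      std::string path_;
    };

    // Hypothetical layer showing the fixed lifecycle.
    class DataLayerSketch {
     public:
      // Setup reads one sample record, then closes the source right away;
      // keeping a cache-heavy source open per layer replica (large group
      // size) is what exhausted memory before the fix.
      void Setup(const std::string& path) {
        path_ = path;
        DataSource source(path_);     // open
        std::string key;
        source.Next(&key, &sample_);  // read a single sample record
      }                               // source destroyed here: cache freed

      // The source is reopened lazily on the first real read.
      void ComputeFeature() {
        if (source_ == nullptr)
          source_ = std::make_unique<DataSource>(path_);
        std::string key, record;
        source_->Next(&key, &record);
      }

     private:
      std::string path_;
      std::string sample_;
      std::unique_ptr<DataSource> source_;  // null until first ComputeFeature
    };

    int main() {
      DataLayerSketch layer;
      layer.Setup("/tmp/train_shard");  // opens and closes the source
      layer.ComputeFeature();           // reopens on demand, then keeps it
      return 0;
    }

This way a job with many data layer replicas holds at most one open source per worker that is actually reading, instead of one per constructed layer.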
