Repository: incubator-singa
Updated Branches:
  refs/heads/master 538736c4a -> 7a61a687c
SINGA-47 Fix a bug in data layers that leads to out-of-memory when group size is too large

The bug is fixed by closing the data source (e.g., lmdb or datashard) after
reading a sample record in the Setup function. The data source would cache
memory, which eats up all the memory if there are many data layers.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/7a61a687
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/7a61a687
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/7a61a687

Branch: refs/heads/master
Commit: 7a61a687c2ceb4fc7e05c2d3bbd9817e8ba59e3f
Parents: 538736c
Author: Wei Wang <[email protected]>
Authored: Wed Aug 12 17:28:50 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Wed Aug 12 17:32:50 2015 +0800

----------------------------------------------------------------------
 include/neuralnet/layer.h          |  4 +++-
 include/neuralnet/optional_layer.h |  2 ++
 src/neuralnet/layer.cc             | 14 +++++++++++--
 src/neuralnet/neuralnet.cc         |  3 +++
 src/neuralnet/optional_layer.cc    | 36 +++++++++++++++++++++------------
 5 files changed, 43 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/include/neuralnet/layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h
index 05db916..118da56 100644
--- a/include/neuralnet/layer.h
+++ b/include/neuralnet/layer.h
@@ -335,10 +335,12 @@ class ShardDataLayer: public DataLayer{
  public:
   using Layer::ComputeFeature;
+  ~ShardDataLayer();
   void Setup(const LayerProto& proto, int npartitions) override;
   void ComputeFeature(Phase phase, Metric *perf) override;
+
  private:
-  shared_ptr<DataShard> shard_;
+  DataShard* shard_;
 };
 
 /**


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/include/neuralnet/optional_layer.h
----------------------------------------------------------------------
diff --git a/include/neuralnet/optional_layer.h b/include/neuralnet/optional_layer.h
index 2cbcdb8..f6b60d3 100644
--- a/include/neuralnet/optional_layer.h
+++ b/include/neuralnet/optional_layer.h
@@ -9,6 +9,8 @@ class LMDBDataLayer: public DataLayer{
  public:
   using Layer::ComputeFeature;
+  ~LMDBDataLayer();
+  void OpenLMDB(const std::string& path);
   void Setup(const LayerProto& proto, int npartitions) override;
   void ComputeFeature(Phase phase, Metric *perf) override;
   void ConvertCaffeDatumToRecord(const CaffeDatum& datum,


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/layer.cc b/src/neuralnet/layer.cc
index 1fa92fb..314bb14 100644
--- a/src/neuralnet/layer.cc
+++ b/src/neuralnet/layer.cc
@@ -666,6 +666,9 @@ void RGBImageLayer::Setup(const LayerProto& proto, int npartitions) {
 
 /***************Implementation for ShardDataLayer**************************/
 void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){
+  if (shard_ == nullptr)
+    shard_ = new DataShard(layer_proto_.sharddata_conf().path(),
+                           DataShard::kRead);
   if(random_skip_){
     int nskip = rand() % random_skip_;
     LOG(INFO)<<"Random Skip "<<nskip<<" records, there are "<<shard_->Count()
@@ -687,10 +690,11 @@ void ShardDataLayer::ComputeFeature(Phase phase, Metric* perf){
 
 void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
   Layer::Setup(proto, npartitions);
-  shard_= std::make_shared<DataShard>(proto.sharddata_conf().path(),
-      DataShard::kRead);
+  shard_= new DataShard(proto.sharddata_conf().path(), DataShard::kRead);
   string key;
   shard_->Next(&key, &sample_);
+  delete shard_;
+  shard_ = nullptr;
   batchsize_=proto.sharddata_conf().batchsize();
   if(partition_dim() == 0)
     batchsize_ /= npartitions;
@@ -698,6 +702,12 @@ void ShardDataLayer::Setup(const LayerProto& proto, int npartitions) {
   records_.resize(batchsize_);
   random_skip_=proto.sharddata_conf().random_skip();
 }
+
+ShardDataLayer::~ShardDataLayer() {
+  if (shard_ != nullptr)
+    delete shard_;
+  shard_ = nullptr;
+}
 /*******************Implementation of TanLayer***************************/
 void TanhLayer::Setup(const LayerProto& proto, int npartitions){
   Layer::Setup(proto, npartitions);


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/neuralnet.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/neuralnet.cc b/src/neuralnet/neuralnet.cc
index 10ddcf1..83bd4fd 100644
--- a/src/neuralnet/neuralnet.cc
+++ b/src/neuralnet/neuralnet.cc
@@ -3,6 +3,9 @@
 
 #include "neuralnet/neuralnet.h"
 #include "utils/singleton.h"
+#ifdef USE_OPTIONAL_LAYER
+#include "neuralnet/optional_layer.h"
+#endif
 
 namespace singa {
 // macros to shorten the code


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/7a61a687/src/neuralnet/optional_layer.cc
----------------------------------------------------------------------
diff --git a/src/neuralnet/optional_layer.cc b/src/neuralnet/optional_layer.cc
index ba85807..06f413f 100644
--- a/src/neuralnet/optional_layer.cc
+++ b/src/neuralnet/optional_layer.cc
@@ -3,7 +3,9 @@
 namespace singa {
 
 /*********************LMDBDataLayer**********************************/
-void LMDBDataLayer::ComputeFeature(Phase phase, Metric* perf){
+void LMDBDataLayer::ComputeFeature(Phase phase, Metric* perf) {
+  if (mdb_cursor_ == nullptr)
+    OpenLMDB(layer_proto_.lmdbdata_conf().path());
   if(random_skip_){
     int nskip = rand() % random_skip_;
     int n=0;
@@ -63,29 +65,31 @@ void LMDBDataLayer::ConvertCaffeDatumToRecord(const CaffeDatum& datum,
   }
 }
 
-void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
-  Layer::Setup(proto, npartitions);
+void LMDBDataLayer::OpenLMDB(const std::string& path) {
   CHECK_EQ(mdb_env_create(&mdb_env_), MDB_SUCCESS) << "mdb_env_create failed";
   CHECK_EQ(mdb_env_set_mapsize(mdb_env_, 1099511627776), MDB_SUCCESS); // 1TB
-  CHECK_EQ(mdb_env_open(mdb_env_,
-           proto.lmdbdata_conf().path().c_str(),
-           MDB_RDONLY, 0664), MDB_SUCCESS) << "cannot open lmdb "
-    << proto.lmdbdata_conf().path();
+  CHECK_EQ(mdb_env_open(mdb_env_, path.c_str(),
+           MDB_RDONLY, 0664), MDB_SUCCESS) << "cannot open lmdb " << path;
   CHECK_EQ(mdb_txn_begin(mdb_env_, NULL, MDB_RDONLY, &mdb_txn_), MDB_SUCCESS)
     << "mdb_txn_begin failed";
   CHECK_EQ(mdb_open(mdb_txn_, NULL, 0, &mdb_dbi_), MDB_SUCCESS)
     << "mdb_open failed";
   CHECK_EQ(mdb_cursor_open(mdb_txn_, mdb_dbi_, &mdb_cursor_), MDB_SUCCESS)
    << "mdb_cursor_open failed";
-  LOG(INFO) << "Opening lmdb " << proto.lmdbdata_conf().path();
+  LOG(INFO) << "Opening lmdb " << path;
   CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_FIRST),
            MDB_SUCCESS) << "mdb_cursor_get failed";
+}
+
+void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  OpenLMDB(proto.lmdbdata_conf().path());
+  CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT),
+           MDB_SUCCESS);
+  mdb_cursor_close(mdb_cursor_);
+  mdb_txn_abort(mdb_txn_);
+  mdb_cursor_ = nullptr;
-  if (mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_, MDB_NEXT)
-      != MDB_SUCCESS) {
-    CHECK_EQ(mdb_cursor_get(mdb_cursor_, &mdb_key_, &mdb_value_,
-             MDB_FIRST), MDB_SUCCESS);
-  }
   CaffeDatum datum;
   datum.ParseFromArray(mdb_value_.mv_data, mdb_value_.mv_size);
   SingleLabelImageRecord* record=sample_.mutable_image();
@@ -98,6 +102,12 @@ void LMDBDataLayer::Setup(const LayerProto& proto, int npartitions) {
   random_skip_=proto.lmdbdata_conf().random_skip();
 }
 
+LMDBDataLayer::~LMDBDataLayer() {
+  mdb_cursor_close(mdb_cursor_);
+  mdb_txn_abort(mdb_txn_);
+  mdb_cursor_ = nullptr;
+}
+
 } /* singa */
 #endif
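
Both layers apply the same lifecycle fix: Setup only needs a single sample record (e.g., to derive shapes), so the data source is opened, read once, and closed immediately to release its cache; it is then reopened lazily on the first ComputeFeature call. The following is a minimal standalone sketch of that pattern, not SINGA's actual API: DataSource and DataLayerSketch are hypothetical stand-ins for lmdb/DataShard and the data layers, and the sketch uses std::unique_ptr where the patch itself manages a raw DataShard* with explicit delete.

    // Minimal sketch of the open-read-close pattern behind the fix.
    #include <memory>
    #include <string>

    // Hypothetical stand-in for a data source such as lmdb or DataShard:
    // while open, it holds a large read cache that is only released when
    // the object is destroyed.
    class DataSource {
     public:
      explicit DataSource(const std::string& path) : path_(path) {
        // open files / mmap / allocate the read cache here
      }
      ~DataSource() {
        // close handles and release the cache here
      }
      bool Next(std::string* key, std::string* record) {
        *key = "key";
        *record = "record";
        return true;
      }

     private:
      std::string path_;
    };

    // Hypothetical layer showing the fixed lifecycle.
    class DataLayerSketch {
     public:
      // Setup reads one sample record, then closes the source right away;
      // keeping a cache-heavy source open per layer replica (large group
      // size) is what exhausted memory before the fix.
      void Setup(const std::string& path) {
        path_ = path;
        DataSource source(path_);     // open
        std::string key;
        source.Next(&key, &sample_);  // read a single sample record
      }                               // source destroyed here: cache freed

      // The source is reopened lazily on the first real read.
      void ComputeFeature() {
        if (source_ == nullptr)
          source_ = std::make_unique<DataSource>(path_);
        std::string key, record;
        source_->Next(&key, &record);
      }

     private:
      std::string path_;
      std::string sample_;
      std::unique_ptr<DataSource> source_;  // null until first ComputeFeature
    };

    int main() {
      DataLayerSketch layer;
      layer.Setup("/tmp/train_shard");  // opens and closes the source
      layer.ComputeFeature();           // reopens on demand, then keeps it
      return 0;
    }

This way a job with many data layer replicas holds at most one open source per worker that is actually reading, instead of one per constructed layer.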
