SINGA-82 Refactor input layers using data store abstraction * Add StoreLayer to read data from Store, e.g., KVFile, TextFile (will add support for HDFS later). * Implement subclasses of StoreLayer to parse different format tuples, e.g., SingleLabelImageRecord or CSV line. * Update examples to use the new input layers. * Add unit tests. * Add a function for Layer class, which returns a vector<AuxType> for auxiliary data (e.g., label).
TODO 1. make AuxType a template argument of Layer class, and extend data() to return a vector of Blob for multiple dense features. 2. separate layer classes into different files to make the structure of the source folder clear. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5f010caa Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5f010caa Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5f010caa Branch: refs/heads/master Commit: 5f010caabd7c09cd9fabee666d93a36377639270 Parents: d99b24c Author: Wei Wang <[email protected]> Authored: Tue Oct 6 01:10:40 2015 +0800 Committer: wang sheng <[email protected]> Committed: Wed Oct 7 15:19:59 2015 +0800 ---------------------------------------------------------------------- Makefile.am | 15 +- examples/cifar10/Makefile.example | 11 +- examples/cifar10/create_data.cc | 135 +++++++++ examples/cifar10/create_shard.cc | 131 --------- examples/cifar10/job.conf | 42 ++- examples/mnist/Makefile.example | 10 +- examples/mnist/conv.conf | 42 ++- examples/mnist/create_data.cc | 119 ++++++++ examples/mnist/create_shard.cc | 120 -------- examples/mnist/job.conf | 46 ++- examples/rbm/autoencoder.conf | 41 ++- examples/rbm/rbm1.conf | 52 ++-- examples/rbm/rbm2.conf | 52 ++-- examples/rbm/rbm3.conf | 51 ++-- examples/rbm/rbm4.conf | 53 ++-- examples/rnnlm/Makefile.example | 6 +- examples/rnnlm/create_data.cc | 472 +++++++++++++++++++++++++++++++ examples/rnnlm/create_shard.cc | 471 ------------------------------ examples/rnnlm/job.conf | 13 +- examples/rnnlm/main.cc | 1 - examples/rnnlm/rnnlm.cc | 91 +++--- examples/rnnlm/rnnlm.h | 4 +- examples/rnnlm/rnnlm.proto | 5 +- include/io/kvfile_store.h | 1 + include/io/store.h | 24 +- include/io/textfile_store.h | 1 + include/neuralnet/input_layer.h | 121 ++++++++ include/neuralnet/layer.h | 9 + include/singa.h | 1 + include/utils/tokenizer.h | 59 ++++ src/driver.cc | 5 + 
src/io/store.cc | 7 +- src/io/textfile_store.cc | 1 + src/neuralnet/input_layer.cc | 218 +++++++++++++- src/neuralnet/loss_layer.cc | 4 +- src/proto/job.proto | 26 +- src/test/test_csv_record_layer.cc | 92 ++++++ src/test/test_proto_record_layer.cc | 122 ++++++++ src/utils/image_transform.cc | 56 ++++ 39 files changed, 1711 insertions(+), 1019 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/Makefile.am ---------------------------------------------------------------------- diff --git a/Makefile.am b/Makefile.am index f8e765d..a1496bd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -35,6 +35,7 @@ SINGA_SRCS := src/driver.cc \ src/utils/updater.cc \ src/utils/data_shard.cc \ src/utils/blob.cc \ + src/utils/image_transform.cc \ src/server.cc \ src/worker.cc \ src/stub.cc \ @@ -64,6 +65,8 @@ SINGA_HDRS := include/singa.h \ include/utils/blob.h \ include/utils/updater.h \ include/utils/tinydir.h \ + include/utils/tokenizer.h \ + include/utils/image_transform.h \ include/server.h \ include/worker.h \ include/stub.h \ @@ -84,10 +87,10 @@ SINGA_HDRS := include/singa.h \ include/mshadow/tensor_random.h \ include/comm/msg.h \ include/comm/socket.h - src/io/store.h \ - src/io/kvfile.h \ - src/io/kvfile_store.h \ - src/io/textfile_store.h + include/io/store.h \ + include/io/kvfile.h \ + include/io/kvfile_store.h \ + include/io/textfile_store.h GTEST_SRCS := include/gtest/gtest-all.cc GTEST_HRDS := include/gtest/gtest.h @@ -98,7 +101,9 @@ TEST_SRCS := include/gtest/gtest_main.cc \ src/test/test_neuralnet.cc \ src/test/test_paramslicer.cc \ src/test/test_shard.cc \ - src/test/test_store.cc + src/test/test_store.cc \ + src/test/test_proto_record_layer.cc \ + src/test/test_csv_record_layer.cc #EXTRA_PROGRAMS = $(PROGS) EXTRA_PROGRAMS = singatest http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/cifar10/Makefile.example 
---------------------------------------------------------------------- diff --git a/examples/cifar10/Makefile.example b/examples/cifar10/Makefile.example index 9e65a58..dd65d7d 100644 --- a/examples/cifar10/Makefile.example +++ b/examples/cifar10/Makefile.example @@ -29,12 +29,7 @@ cifar-10-binary-bin: tar xf cifar-10-binary.tar.gz create: - $(CXX) create_shard.cc -std=c++11 -lsinga -lprotobuf -lglog \ + $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \ -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \ - -Wl,-rpath=../../.libs/ -o create_shard.bin - mkdir cifar10_train_shard - mkdir cifar10_test_shard - ./create_shard.bin cifar-10-batches-bin . - - - + -Wl,-rpath=../../.libs/ -o create_data.bin + ./create_data.bin cifar-10-batches-bin . http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/cifar10/create_data.cc ---------------------------------------------------------------------- diff --git a/examples/cifar10/create_data.cc b/examples/cifar10/create_data.cc new file mode 100644 index 0000000..5fddd1d --- /dev/null +++ b/examples/cifar10/create_data.cc @@ -0,0 +1,135 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. 
See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +// +// This code creates training and test DataShard for CIFAR dataset. +// It is adapted from the convert_cifar_data from Caffe +// +// Usage: +// create_shard.bin input_folder output_folder +// +// The CIFAR dataset could be downloaded at +// http://www.cs.toronto.edu/~kriz/cifar.html +// + +#include <glog/logging.h> +#include <fstream> +#include <string> +#include <cstdint> +#include <iostream> + +#include "./singa.h" + +using std::string; + +const int kCIFARSize = 32; +const int kCIFARImageNBytes = 3072; +const int kCIFARBatchSize = 10000; +const int kCIFARTrainBatches = 5; + +void read_image(std::ifstream* file, int* label, char* buffer) { + char label_char; + file->read(&label_char, 1); + *label = label_char; + file->read(buffer, kCIFARImageNBytes); + return; +} + +void create_data(const string& input_folder, const string& output_folder) { + int label; + char str_buffer[kCIFARImageNBytes]; + string rec_buf; + + singa::SingleLabelImageRecord image;; + image.add_shape(3); + image.add_shape(kCIFARSize); + image.add_shape(kCIFARSize); + + singa::SingleLabelImageRecord mean; + mean.CopyFrom(image); + for (int i = 0; i < kCIFARImageNBytes; i++) + mean.add_data(0.f); + + auto store = singa::io::CreateStore("kvfile"); + CHECK(store->Open(output_folder + "/train_data.bin", singa::io::kCreate)); + LOG(INFO) << "Preparing training data"; + int count = 0; + for (int fileid = 0; fileid < kCIFARTrainBatches; ++fileid) { + LOG(INFO) << "Training Batch " << fileid + 1; + snprintf(str_buffer, kCIFARImageNBytes, "/data_batch_%d.bin", fileid + 1); + std::ifstream data_file((input_folder + str_buffer).c_str(), + std::ios::in | std::ios::binary); + CHECK(data_file.is_open()) << "Unable to open train file #" << fileid + 1; + for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) { + read_image(&data_file, 
&label, str_buffer); + image.set_label(label); + image.set_pixel(str_buffer, kCIFARImageNBytes); + image.SerializeToString(&rec_buf); + int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", count); + CHECK(store->Write(string(str_buffer, length), rec_buf)); + + const string& pixels = image.pixel(); + for (int i = 0; i < kCIFARImageNBytes; i++) + mean.set_data(i, mean.data(i) + static_cast<uint8_t>(pixels[i])); + count += 1; + } + } + store->Flush(); + store->Close(); + + LOG(INFO) << "Create image mean"; + store->Open(output_folder + "/image_mean.bin", singa::io::kCreate); + for (int i = 0; i < kCIFARImageNBytes; i++) + mean.set_data(i, mean.data(i) / count); + mean.SerializeToString(&rec_buf); + store->Write("mean", rec_buf); + store->Flush(); + store->Close(); + + LOG(INFO) << "Create test data"; + store->Open(output_folder + "/test_data.bin", singa::io::kCreate); + std::ifstream data_file((input_folder + "/test_batch.bin").c_str(), + std::ios::in | std::ios::binary); + CHECK(data_file.is_open()) << "Unable to open test file."; + for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) { + read_image(&data_file, &label, str_buffer); + image.set_label(label); + image.set_pixel(str_buffer, kCIFARImageNBytes); + image.SerializeToString(&rec_buf); + int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", itemid); + CHECK(store->Write(string(str_buffer, length), rec_buf)); + } + store->Flush(); + store->Close(); +} + +int main(int argc, char** argv) { + if (argc != 3) { + std::cout <<"Create train and test DataShard for Cifar dataset.\n" + << "Usage:\n" + << " create_data.bin input_folder output_folder\n" + << "Where the input folder should contain the binary batch files.\n"; + } else { + google::InitGoogleLogging(argv[0]); + create_data(string(argv[1]), string(argv[2])); + } + return 0; +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/cifar10/create_shard.cc 
---------------------------------------------------------------------- diff --git a/examples/cifar10/create_shard.cc b/examples/cifar10/create_shard.cc deleted file mode 100644 index 0a00639..0000000 --- a/examples/cifar10/create_shard.cc +++ /dev/null @@ -1,131 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -// -// This code creates training and test DataShard for CIFAR dataset. 
-// It is adapted from the convert_cifar_data from Caffe -// -// Usage: -// create_shard.bin input_folder output_folder -// -// The CIFAR dataset could be downloaded at -// http://www.cs.toronto.edu/~kriz/cifar.html -// - -#include <fstream> -#include <string> - -#include <glog/logging.h> -#include <cstdint> -#include <iostream> - -#include "singa.h" - -using std::string; - -using singa::DataShard; -using singa::WriteProtoToBinaryFile; - -const int kCIFARSize = 32; -const int kCIFARImageNBytes = 3072; -const int kCIFARBatchSize = 10000; -const int kCIFARTrainBatches = 5; - -void read_image(std::ifstream* file, int* label, char* buffer) { - char label_char; - file->read(&label_char, 1); - *label = label_char; - file->read(buffer, kCIFARImageNBytes); - return; -} - -void create_shard(const string& input_folder, const string& output_folder) { - int label; - // Data buffer - char str_buffer[kCIFARImageNBytes]; - singa::Record record; - singa::SingleLabelImageRecord* image=record.mutable_image();; - image->add_shape(3); - image->add_shape(kCIFARSize); - image->add_shape(kCIFARSize); - - singa::SingleLabelImageRecord mean; - mean.CopyFrom(*image); - for(int i=0;i<kCIFARImageNBytes;i++) - mean.add_data(0.); - - DataShard train_shard(output_folder+"/cifar10_train_shard",DataShard::kCreate); - LOG(INFO) << "Writing Training data"; - int count=0; - for (int fileid = 0; fileid < kCIFARTrainBatches; ++fileid) { - // Open files - LOG(INFO) << "Training Batch " << fileid + 1; - snprintf(str_buffer, kCIFARImageNBytes, "/data_batch_%d.bin", fileid + 1); - std::ifstream data_file((input_folder + str_buffer).c_str(), - std::ios::in | std::ios::binary); - CHECK(data_file) << "Unable to open train file #" << fileid + 1; - for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) { - read_image(&data_file, &label, str_buffer); - image->set_label(label); - image->set_pixel(str_buffer, kCIFARImageNBytes); - int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", - fileid * 
kCIFARBatchSize + itemid); - CHECK(train_shard.Insert(string(str_buffer, length), record)); - - const string& pixels=image->pixel(); - for(int i=0;i<kCIFARImageNBytes;i++) - mean.set_data(i, mean.data(i)+static_cast<uint8_t>(pixels[i])); - count+=1; - } - } - train_shard.Flush(); - for(int i=0;i<kCIFARImageNBytes;i++) - mean.set_data(i, mean.data(i)/count); - WriteProtoToBinaryFile(mean, (output_folder+"/image_mean.bin").c_str()); - - LOG(INFO) << "Writing Testing data"; - DataShard test_shard(output_folder+"/cifar10_test_shard",DataShard::kCreate); - // Open files - std::ifstream data_file((input_folder + "/test_batch.bin").c_str(), - std::ios::in | std::ios::binary); - CHECK(data_file) << "Unable to open test file."; - for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) { - read_image(&data_file, &label, str_buffer); - image->set_label(label); - image->set_pixel(str_buffer, kCIFARImageNBytes); - int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", itemid); - CHECK(test_shard.Insert(string(str_buffer, length), record)); - } - test_shard.Flush(); -} - -int main(int argc, char** argv) { - if (argc != 3) { - std::cout<<"Create train and test DataShard for Cifar dataset.\n" - <<"Usage:\n" - <<" create_shard.bin input_folder output_folder\n" - <<"Where the input folder should contain the binary batch files.\n"; - } else { - google::InitGoogleLogging(argv[0]); - create_shard(string(argv[1]), string(argv[2])); - } - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/cifar10/job.conf ---------------------------------------------------------------------- diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf index 343d969..57f4b36 100644 --- a/examples/cifar10/job.conf +++ b/examples/cifar10/job.conf @@ -1,7 +1,7 @@ name: "cifar10-convnet" train_steps: 1000 test_steps: 100 -test_freq:300 +test_freq: 300 disp_freq: 30 train_one_batch { alg: kBP @@ -24,41 +24,38 @@ updater{ neuralnet { layer{ name: "data" - 
type: kShardData - sharddata_conf { - path: "examples/cifar10/cifar10_train_shard" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/cifar10/train_data.bin" + mean_file: "examples/cifar10/image_mean.bin" batchsize: 64 random_skip: 5000 + shape: 3 + shape: 32 + shape: 32 } exclude: kTest } layer{ name: "data" - type: kShardData - sharddata_conf { - path: "examples/cifar10/cifar10_test_shard" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/cifar10/test_data.bin" + mean_file: "examples/cifar10/image_mean.bin" batchsize: 100 + shape: 3 + shape: 32 + shape: 32 } exclude: kTrain } - layer{ - name:"rgb" - type: kRGBImage - srclayers: "data" - rgbimage_conf { - meanfile: "examples/cifar10/image_mean.bin" - } - } - layer{ - name: "label" - type: kLabel - srclayers: "data" - } layer { name: "conv1" type: kCConvolution - srclayers: "rgb" + srclayers: "data" convolution_conf { num_filters: 32 kernel: 5 @@ -223,7 +220,6 @@ neuralnet { } } } - layer{ name: "loss" type: kSoftmaxLoss @@ -231,7 +227,7 @@ neuralnet { topk:1 } srclayers:"ip1" - srclayers: "label" + srclayers: "data" } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/Makefile.example ---------------------------------------------------------------------- diff --git a/examples/mnist/Makefile.example b/examples/mnist/Makefile.example index 4df4edd..733633d 100644 --- a/examples/mnist/Makefile.example +++ b/examples/mnist/Makefile.example @@ -34,10 +34,8 @@ mnist: gunzip t10k-images-idx3-ubyte.gz && gunzip t10k-labels-idx1-ubyte.gz create: - $(CXX) create_shard.cc -std=c++11 -lsinga -lprotobuf -lglog -I../../include \ + $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -I../../include \ -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \ - -o create_shard.bin - mkdir mnist_train_shard - mkdir mnist_test_shard - ./create_shard.bin train-images-idx3-ubyte train-labels-idx1-ubyte 
mnist_train_shard - ./create_shard.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte mnist_test_shard + -o create_data.bin + ./create_data.bin train-images-idx3-ubyte train-labels-idx1-ubyte train_data.bin + ./create_data.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte test_data.bin http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/conv.conf ---------------------------------------------------------------------- diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf index 7f7a158..ba631c1 100644 --- a/examples/mnist/conv.conf +++ b/examples/mnist/conv.conf @@ -22,43 +22,39 @@ updater { neuralnet { layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" batchsize: 64 + std_value: 255 + random_skip: 5000 + shape: 1 + shape: 28 + shape: 28 } exclude: kTest } layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 batchsize: 100 + shape: 1 + shape: 28 + shape: 28 } exclude: kTrain } - layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a:255 - norm_b:0 - } - } - - layer{ - name: "label" - type: kLabel - srclayers: "data" - } layer { name: "conv1" type: kCConvolution - srclayers: "mnist" + srclayers: "data" convolution_conf { num_filters: 20 kernel: 5 @@ -181,7 +177,7 @@ neuralnet { topk:1 } srclayers:"ip2" - srclayers:"label" + srclayers:"data" } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/create_data.cc ---------------------------------------------------------------------- diff --git a/examples/mnist/create_data.cc b/examples/mnist/create_data.cc new file mode 100644 index 0000000..aad1f56 --- /dev/null +++ b/examples/mnist/create_data.cc @@ -0,0 +1,119 
@@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +// +// This code creates DataShard for MNIST dataset. +// It is adapted from the convert_mnist_data from Caffe +// +// Usage: +// create_shard.bin input_image_file input_label_file output_folder +// The MNIST dataset could be downloaded at +// http://yann.lecun.com/exdb/mnist/ + +#include <glog/logging.h> +#include <cstdint> +#include <iostream> + +#include <fstream> +#include <string> + +#include "io/store.h" +#include "utils/common.h" +#include "proto/common.pb.h" + +using std::string; + +uint32_t swap_endian(uint32_t val) { + val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); + return (val << 16) | (val >> 16); +} + +void create_data(const char* image_filename, const char* label_filename, + const char* output) { + // Open files + std::ifstream image_file(image_filename, std::ios::in | std::ios::binary); + std::ifstream label_file(label_filename, std::ios::in | std::ios::binary); + CHECK(image_file) << "Unable to open file " << image_filename; + CHECK(label_file) << "Unable to open file " << label_filename; + // Read the magic and the meta 
data + uint32_t magic; + uint32_t num_items; + uint32_t num_labels; + uint32_t rows; + uint32_t cols; + + image_file.read(reinterpret_cast<char*>(&magic), 4); + magic = swap_endian(magic); + CHECK_EQ(magic, 2051) << "Incorrect image file magic."; + label_file.read(reinterpret_cast<char*>(&magic), 4); + magic = swap_endian(magic); + CHECK_EQ(magic, 2049) << "Incorrect label file magic."; + image_file.read(reinterpret_cast<char*>(&num_items), 4); + num_items = swap_endian(num_items); + label_file.read(reinterpret_cast<char*>(&num_labels), 4); + num_labels = swap_endian(num_labels); + CHECK_EQ(num_items, num_labels); + image_file.read(reinterpret_cast<char*>(&rows), 4); + rows = swap_endian(rows); + image_file.read(reinterpret_cast<char*>(&cols), 4); + cols = swap_endian(cols); + + auto store = singa::io::OpenStore("kvfile", output, singa::io::kCreate); + char label; + char* pixels = new char[rows * cols]; + int count = 0; + const int kMaxKeyLength = 10; + char key[kMaxKeyLength]; + string value; + + singa::SingleLabelImageRecord image; + image.add_shape(rows); + image.add_shape(cols); + LOG(INFO) << "A total of " << num_items << " items."; + LOG(INFO) << "Rows: " << rows << " Cols: " << cols; + for (int item_id = 0; item_id < num_items; ++item_id) { + image_file.read(pixels, rows * cols); + label_file.read(&label, 1); + image.set_pixel(pixels, rows*cols); + image.set_label(label); + snprintf(key, kMaxKeyLength, "%08d", item_id); + image.SerializeToString(&value); + store->Write(string(key), value); + } + delete pixels; + store->Flush(); + delete store; +} + +int main(int argc, char** argv) { + if (argc != 4) { + std::cout<<"This program create a DataShard for a MNIST dataset\n" + "Usage:\n" + " create_shard.bin input_image_file input_label_file output_db_file\n" + "The MNIST dataset could be downloaded at\n" + " http://yann.lecun.com/exdb/mnist/\n" + "You should gunzip them after downloading."; + } else { + google::InitGoogleLogging(argv[0]); + create_data(argv[1], 
argv[2], argv[3]); + } + return 0; +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/create_shard.cc ---------------------------------------------------------------------- diff --git a/examples/mnist/create_shard.cc b/examples/mnist/create_shard.cc deleted file mode 100644 index 3d7bd97..0000000 --- a/examples/mnist/create_shard.cc +++ /dev/null @@ -1,120 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -// -// This code creates DataShard for MNIST dataset. 
-// It is adapted from the convert_mnist_data from Caffe -// -// Usage: -// create_shard.bin input_image_file input_label_file output_folder -// The MNIST dataset could be downloaded at -// http://yann.lecun.com/exdb/mnist/ - -#include <glog/logging.h> -#include <cstdint> -#include <iostream> - -#include <fstream> -#include <string> - -#include "utils/data_shard.h" -#include "utils/common.h" -#include "proto/common.pb.h" - -using singa::DataShard; -using singa::WriteProtoToBinaryFile; -using std::string; - -uint32_t swap_endian(uint32_t val) { - val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); - return (val << 16) | (val >> 16); -} - -void create_shard(const char* image_filename, const char* label_filename, - const char* output) { - // Open files - std::ifstream image_file(image_filename, std::ios::in | std::ios::binary); - std::ifstream label_file(label_filename, std::ios::in | std::ios::binary); - CHECK(image_file) << "Unable to open file " << image_filename; - CHECK(label_file) << "Unable to open file " << label_filename; - // Read the magic and the meta data - uint32_t magic; - uint32_t num_items; - uint32_t num_labels; - uint32_t rows; - uint32_t cols; - - image_file.read(reinterpret_cast<char*>(&magic), 4); - magic = swap_endian(magic); - CHECK_EQ(magic, 2051) << "Incorrect image file magic."; - label_file.read(reinterpret_cast<char*>(&magic), 4); - magic = swap_endian(magic); - CHECK_EQ(magic, 2049) << "Incorrect label file magic."; - image_file.read(reinterpret_cast<char*>(&num_items), 4); - num_items = swap_endian(num_items); - label_file.read(reinterpret_cast<char*>(&num_labels), 4); - num_labels = swap_endian(num_labels); - CHECK_EQ(num_items, num_labels); - image_file.read(reinterpret_cast<char*>(&rows), 4); - rows = swap_endian(rows); - image_file.read(reinterpret_cast<char*>(&cols), 4); - cols = swap_endian(cols); - - DataShard shard(output, DataShard::kCreate); - char label; - char* pixels = new char[rows * cols]; - int count = 0; - const 
int kMaxKeyLength = 10; - char key[kMaxKeyLength]; - string value; - - singa::Record record; - singa::SingleLabelImageRecord* image=record.mutable_image(); - image->add_shape(rows); - image->add_shape(cols); - LOG(INFO) << "A total of " << num_items << " items."; - LOG(INFO) << "Rows: " << rows << " Cols: " << cols; - for (int item_id = 0; item_id < num_items; ++item_id) { - image_file.read(pixels, rows * cols); - label_file.read(&label, 1); - image->set_pixel(pixels, rows*cols); - image->set_label(label); - snprintf(key, kMaxKeyLength, "%08d", item_id); - shard.Insert(string(key), record); - } - delete pixels; - shard.Flush(); -} - -int main(int argc, char** argv) { - if (argc != 4) { - std::cout<<"This program create a DataShard for a MNIST dataset\n" - "Usage:\n" - " create_shard.bin input_image_file input_label_file output_db_file\n" - "The MNIST dataset could be downloaded at\n" - " http://yann.lecun.com/exdb/mnist/\n" - "You should gunzip them after downloading."; - } else { - google::InitGoogleLogging(argv[0]); - create_shard(argv[1], argv[2], argv[3]); - } - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/job.conf ---------------------------------------------------------------------- diff --git a/examples/mnist/job.conf b/examples/mnist/job.conf index 6d02561..bfbf17d 100644 --- a/examples/mnist/job.conf +++ b/examples/mnist/job.conf @@ -21,45 +21,37 @@ updater{ neuralnet { layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 1000 + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + random_skip: 5000 + batchsize: 64 + shape: 784 + std_value: 127.5 + mean_value: 127.5 } exclude: kTest } layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 1000 + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + 
batchsize: 100 + shape: 784 + std_value: 127.5 + mean_value: 127.5 } exclude: kTrain } layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 127.5 - norm_b: 1 - } - } - - - layer{ - name: "label" - type: kLabel - srclayers: "data" - } - - layer{ name: "fc1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 2500 } @@ -239,7 +231,7 @@ neuralnet { topk:1 } srclayers:"fc6" - srclayers:"label" + srclayers:"data" } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/autoencoder.conf ---------------------------------------------------------------------- diff --git a/examples/rbm/autoencoder.conf b/examples/rbm/autoencoder.conf index c818c6e..5799adf 100644 --- a/examples/rbm/autoencoder.conf +++ b/examples/rbm/autoencoder.conf @@ -21,44 +21,35 @@ updater{ neuralnet { layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 1000 + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 } exclude: kTest } layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 1000 + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 } exclude: kTrain } - layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 - } - } - - layer{ - name: "label" - type: kLabel - srclayers: "data" - } layer{ name: "Inner1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 1000 } @@ -228,7 +219,7 @@ neuralnet { name: "loss" type:kEuclideanLoss srclayers:"Sigmoid8" - srclayers:"mnist" + srclayers:"data" } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/rbm1.conf 
---------------------------------------------------------------------- diff --git a/examples/rbm/rbm1.conf b/examples/rbm/rbm1.conf index d185766..1c23d47 100644 --- a/examples/rbm/rbm1.conf +++ b/examples/rbm/rbm1.conf @@ -17,42 +17,36 @@ updater{ } neuralnet { -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 100 - } - exclude: kTest -} - - -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 + } + exclude: kTest } - exclude: kTrain -} - -layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 + } + exclude: kTrain } -} layer{ name: "RBMVis" type: kRBMVis - srclayers:"mnist" + srclayers:"data" srclayers:"RBMHid" rbm_conf{ hdim: 1000 http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/rbm2.conf ---------------------------------------------------------------------- diff --git a/examples/rbm/rbm2.conf b/examples/rbm/rbm2.conf index 52dc698..2f51208 100644 --- a/examples/rbm/rbm2.conf +++ b/examples/rbm/rbm2.conf @@ -18,42 +18,36 @@ updater{ } neuralnet { -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 100 - } - exclude: kTest -} - - -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 + } + exclude: kTest } - exclude: kTrain -} - 
-layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 + } + exclude: kTrain } -} layer{ name: "Inner1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 1000 } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/rbm3.conf ---------------------------------------------------------------------- diff --git a/examples/rbm/rbm3.conf b/examples/rbm/rbm3.conf index 354fb3b..5df9ae3 100644 --- a/examples/rbm/rbm3.conf +++ b/examples/rbm/rbm3.conf @@ -20,42 +20,37 @@ updater{ neuralnet { -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 + } + exclude: kTest } - exclude: kTest -} - -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 + } + exclude: kTrain } - exclude: kTrain -} layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 - } -} - -layer{ name: "Inner1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 1000 } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/rbm4.conf ---------------------------------------------------------------------- diff --git a/examples/rbm/rbm4.conf b/examples/rbm/rbm4.conf index ebf39fa..a34a75c 100644 --- a/examples/rbm/rbm4.conf +++ b/examples/rbm/rbm4.conf @@ -18,42 +18,37 @@ 
updater{ } neuralnet { -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 + } + exclude: kTest } - exclude: kTest -} - -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 + } + exclude: kTrain } - exclude: kTrain -} -layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 - } -} - -layer{ + layer{ name: "Inner1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 1000 } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/Makefile.example ---------------------------------------------------------------------- diff --git a/examples/rnnlm/Makefile.example b/examples/rnnlm/Makefile.example index 48efd17..5c5ef97 100644 --- a/examples/rnnlm/Makefile.example +++ b/examples/rnnlm/Makefile.example @@ -38,11 +38,11 @@ download: create: protoc --proto_path=../../src/proto --proto_path=. --cpp_out=. 
rnnlm.proto - $(CXX) create_shard.cc rnnlm.pb.cc -std=c++11 -lsinga -lprotobuf -lzookeeper_mt -lglog -I../../include -I../../include/proto \ + $(CXX) create_data.cc rnnlm.pb.cc -std=c++11 -lsinga -lprotobuf -lzookeeper_mt -lglog -I../../include -I../../include/proto \ -L../../.libs/ -L/usr/local/lib -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \ - -o create_shard.bin + -o create_data.bin for d in $(dirshards); do mkdir -p $${d}; done - ./create_shard.bin -train $(dirname)/train -test $(dirname)/test -valid $(dirname)/valid -class_size $(numclass) + ./create_data.bin -train $(dirname)/train -test $(dirname)/test -valid $(dirname)/valid -class_size $(numclass) rnnlm: http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/create_data.cc ---------------------------------------------------------------------- diff --git a/examples/rnnlm/create_data.cc b/examples/rnnlm/create_data.cc new file mode 100644 index 0000000..d63a8df --- /dev/null +++ b/examples/rnnlm/create_data.cc @@ -0,0 +1,472 @@ +/* + * This file include code from rnnlmlib-0.4 whose licence is as follows: +Copyright (c) 2010-2012 Tomas Mikolov +Copyright (c) 2013 Cantab Research Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. 
+ + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +// +// This code creates DataShard for RNNLM dataset. 
+// The RNNLM dataset could be downloaded at +// http://www.rnnlm.org/ +// +// Usage: +// create_shard.bin -train [train_file] -valid [valid_file] +// -test [test_file] -class_size [# of classes] + +#include <cstring> +#include <cstdlib> +#include <cstdio> +#include <cmath> +#include <algorithm> +#include <fstream> + +#include "io/store.h" +#include "utils/common.h" +#include "proto/common.pb.h" +#include "./rnnlm.pb.h" + +#define MAX_STRING 100 +#define BUFFER_LEN 32 +#define NL_STRING "</s>" + +using std::string; +using std::max; +using std::min; + +struct vocab_word { + int cn; + char word[MAX_STRING]; + int class_index; +}; + +struct vocab_word *vocab; +int vocab_max_size; +int vocab_size; +int *vocab_hash; +int vocab_hash_size; +int debug_mode; +int old_classes; +int *class_start; +int *class_end; +int class_size; + +char train_file[MAX_STRING]; +char valid_file[MAX_STRING]; +char test_file[MAX_STRING]; + +int valid_mode; +int test_mode; + +unsigned int getWordHash(char *word) { + unsigned int hash, a; + + hash = 0; + for (a = 0; a < strlen(word); a++) hash = hash * 237 + word[a]; + hash = hash % vocab_hash_size; + + return hash; +} + +int searchVocab(char *word) { + int a; + unsigned int hash; + + hash = getWordHash(word); + + if (vocab_hash[hash] == -1) return -1; + if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; + + for (a = 0; a < vocab_size; a++) { // search in vocabulary + if (!strcmp(word, vocab[a].word)) { + vocab_hash[hash] = a; + return a; + } + } + + return -1; // return OOV if not found +} + +int addWordToVocab(char *word) { + unsigned int hash; + + snprintf(vocab[vocab_size].word, strlen(word)+1, "%s", word); + vocab[vocab_size].cn = 0; + vocab_size++; + + if (vocab_size + 2 >= vocab_max_size) { // reallocate memory if needed + vocab_max_size += 100; + vocab = (struct vocab_word *) realloc( + vocab, + vocab_max_size * sizeof(struct vocab_word)); + } + + hash = getWordHash(word); + vocab_hash[hash] = vocab_size - 1; + + 
return vocab_size - 1; +} + +void readWord(char *word, FILE *fin) { + int a = 0, ch; + + while (!feof(fin)) { + ch = fgetc(fin); + + if (ch == 13) continue; + + if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { + if (a > 0) { + if (ch == '\n') ungetc(ch, fin); + break; + } + + if (ch == '\n') { + snprintf(word, strlen(NL_STRING) + 1, + "%s", const_cast<char *>(NL_STRING)); + return; + } else { + continue; + } + } + + word[a] = static_cast<char>(ch); + a++; + + if (a >= MAX_STRING) { + // printf("Too long word found!\n"); //truncate too long words + a--; + } + } + word[a] = 0; +} + +void sortVocab() { + int a, b, max; + vocab_word swap; + + for (a = 1; a < vocab_size; a++) { + max = a; + for (b = a + 1; b < vocab_size; b++) + if (vocab[max].cn < vocab[b].cn) max = b; + + swap = vocab[max]; + vocab[max] = vocab[a]; + vocab[a] = swap; + } +} + +int learnVocabFromTrainFile() { + char word[MAX_STRING]; + FILE *fin; + int a, i, train_wcn; + + for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; + + fin = fopen(train_file, "rb"); + + vocab_size = 0; + + addWordToVocab(const_cast<char *>(NL_STRING)); + + train_wcn = 0; + while (1) { + readWord(word, fin); + if (feof(fin)) break; + + train_wcn++; + + i = searchVocab(word); + if (i == -1) { + a = addWordToVocab(word); + vocab[a].cn = 1; + } else { + vocab[i].cn++; + } + } + + sortVocab(); + + if (debug_mode > 0) { + printf("Vocab size: %d\n", vocab_size); + printf("Words in train file: %d\n", train_wcn); + } + + fclose(fin); + return 0; +} + +int splitClasses() { + double df, dd; + int i, a, b; + + df = 0; + dd = 0; + a = 0; + b = 0; + + class_start = reinterpret_cast<int *>(calloc(class_size, sizeof(int))); + memset(class_start, 0x7f, sizeof(int) * class_size); + class_end = reinterpret_cast<int *>(calloc(class_size, sizeof(int))); + memset(class_end, 0, sizeof(int) * class_size); + + if (old_classes) { // old classes + for (i = 0; i < vocab_size; i++) + b += vocab[i].cn; + for (i = 0; i < vocab_size; i++) { + df += 
vocab[i].cn / static_cast<double>(b); + if (df > 1) df = 1; + if (df > (a + 1) / static_cast<double>(class_size)) { + vocab[i].class_index = a; + if (a < class_size - 1) a++; + } else { + vocab[i].class_index = a; + } + } + } else { // new classes + for (i = 0; i < vocab_size; i++) + b += vocab[i].cn; + for (i = 0; i < vocab_size; i++) + dd += sqrt(vocab[i].cn / static_cast<double>(b)); + for (i = 0; i < vocab_size; i++) { + df += sqrt(vocab[i].cn / static_cast<double>(b)) / dd; + if (df > 1) df = 1; + if (df > (a + 1) / static_cast<double>(class_size)) { + vocab[i].class_index = a; + if (a < class_size - 1) a++; + } else { + vocab[i].class_index = a; + } + } + } + + // after dividing classes, update class start and class end information + for (i = 0; i < vocab_size; i++) { + a = vocab[i].class_index; + class_start[a] = min(i, class_start[a]); + class_end[a] = max(i + 1, class_end[a]); + } + return 0; +} + +int init_class() { + // debug_mode = 1; + debug_mode = 0; + vocab_max_size = 100; // largest length value for each word + vocab_size = 0; + vocab = (struct vocab_word *) calloc(vocab_max_size, + sizeof(struct vocab_word)); + vocab_hash_size = 100000000; + vocab_hash = reinterpret_cast<int *>(calloc(vocab_hash_size, sizeof(int))); + old_classes = 1; + + // read vocab + learnVocabFromTrainFile(); + + // split classes + splitClasses(); + + return 0; +} + +int create_data(const char *input_file, const char *output) { + auto* store = singa::io::OpenStore("kvfile", output, singa::io::kCreate); + WordRecord wordRecord; + + FILE *fin; + int a, i; + fin = fopen(input_file, "rb"); + + int wcnt = 0; + char key[BUFFER_LEN]; + char wordstr[MAX_STRING]; + string value; + while (1) { + readWord(wordstr, fin); + if (feof(fin)) break; + i = searchVocab(wordstr); + if (i == -1) { + if (debug_mode) printf("unknown word [%s] detected!", wordstr); + } else { + wordRecord.set_word(string(wordstr)); + wordRecord.set_word_index(i); + int class_idx = vocab[i].class_index; + 
wordRecord.set_class_index(class_idx); + wordRecord.set_class_start(class_start[class_idx]); + wordRecord.set_class_end(class_end[class_idx]); + int length = snprintf(key, BUFFER_LEN, "%05d", wcnt++); + wordRecord.SerializeToString(&value); + store->Write(string(key, length), value); + } + } + + fclose(fin); + store->Flush(); + delete store; + return 0; +} + +int argPos(char *str, int argc, char **argv) { + int a; + + for (a = 1; a < argc; a++) + if (!strcmp(str, argv[a])) + return a; + + return -1; +} + +int main(int argc, char **argv) { + int i; + FILE *f; + + // set debug mode + i = argPos(const_cast<char *>("-debug"), argc, argv); + if (i > 0) { + debug_mode = 1; + if (debug_mode > 0) + printf("debug mode: %d\n", debug_mode); + } + + // search for train file + i = argPos(const_cast<char *>("-train"), argc, argv); + if (i > 0) { + if (i + 1 == argc) { + printf("ERROR: training data file not specified!\n"); + return 0; + } + + snprintf(train_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]); + + if (debug_mode > 0) + printf("train file: %s\n", train_file); + + f = fopen(train_file, "rb"); + if (f == NULL) { + printf("ERROR: training data file not found!\n"); + return 0; + } + fclose(f); + } else { + printf("ERROR: training data must be set.\n"); + } + + // search for valid file + i = argPos(const_cast<char *>("-valid"), argc, argv); + if (i > 0) { + if (i + 1 == argc) { + printf("ERROR: validating data file not specified!\n"); + return 0; + } + + snprintf(valid_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]); + + if (debug_mode > 0) + printf("valid file: %s\n", valid_file); + + f = fopen(valid_file, "rb"); + if (f == NULL) { + printf("ERROR: validating data file not found!\n"); + return 0; + } + fclose(f); + valid_mode = 1; + } + + // search for test file + i = argPos(const_cast<char *>("-test"), argc, argv); + if (i > 0) { + if (i + 1 == argc) { + printf("ERROR: testing data file not specified!\n"); + return 0; + } + + snprintf(test_file, strlen(argv[i + 1])+1, 
"%s", argv[i + 1]); + + if (debug_mode > 0) + printf("test file: %s\n", test_file); + + f = fopen(test_file, "rb"); + if (f == NULL) { + printf("ERROR: testing data file not found!\n"); + return 0; + } + fclose(f); + test_mode = 1; + } + + // search for class size + i = argPos(const_cast<char *>("-class_size"), argc, argv); + if (i > 0) { + if (i + 1 == argc) { + printf("ERROR: class size not specified!\n"); + return 0; + } + + class_size = atoi(argv[i + 1]); + + if (debug_mode > 0) + printf("class size: %d\n", class_size); + } + if (class_size <= 0) { + printf("ERROR: no or invalid class size received!\n"); + return 0; + } + + init_class(); + + create_data(train_file, "train_data.bin"); + if (valid_mode) create_data(valid_file, "valid_data.bin"); + if (test_mode) create_data(test_file, "test_data.bin"); + + return 0; +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/create_shard.cc ---------------------------------------------------------------------- diff --git a/examples/rnnlm/create_shard.cc b/examples/rnnlm/create_shard.cc deleted file mode 100644 index 536ce1f..0000000 --- a/examples/rnnlm/create_shard.cc +++ /dev/null @@ -1,471 +0,0 @@ -/* - * This file include code from rnnlmlib-0.4 whose licence is as follows: -Copyright (c) 2010-2012 Tomas Mikolov -Copyright (c) 2013 Cantab Research Ltd -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. 
Neither name of copyright holders nor the names of its contributors -may be used to endorse or promote products derived from this software -without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -// -// This code creates DataShard for RNNLM dataset. 
-// The RNNLM dataset could be downloaded at -// http://www.rnnlm.org/ -// -// Usage: -// create_shard.bin -train [train_file] -valid [valid_file] -// -test [test_file] -class_size [# of classes] - -#include <cstring> -#include <cstdlib> -#include <cstdio> -#include <cmath> -#include <algorithm> -#include <fstream> - -#include "utils/data_shard.h" -#include "utils/common.h" -#include "proto/common.pb.h" -#include "./rnnlm.pb.h" - -#define MAX_STRING 100 -#define BUFFER_LEN 32 -#define NL_STRING "</s>" - -using std::string; -using std::max; -using std::min; -using singa::DataShard; - -struct vocab_word { - int cn; - char word[MAX_STRING]; - int class_index; -}; - -struct vocab_word *vocab; -int vocab_max_size; -int vocab_size; -int *vocab_hash; -int vocab_hash_size; -int debug_mode; -int old_classes; -int *class_start; -int *class_end; -int class_size; - -char train_file[MAX_STRING]; -char valid_file[MAX_STRING]; -char test_file[MAX_STRING]; - -int valid_mode; -int test_mode; - -unsigned int getWordHash(char *word) { - unsigned int hash, a; - - hash = 0; - for (a = 0; a < strlen(word); a++) hash = hash * 237 + word[a]; - hash = hash % vocab_hash_size; - - return hash; -} - -int searchVocab(char *word) { - int a; - unsigned int hash; - - hash = getWordHash(word); - - if (vocab_hash[hash] == -1) return -1; - if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; - - for (a = 0; a < vocab_size; a++) { // search in vocabulary - if (!strcmp(word, vocab[a].word)) { - vocab_hash[hash] = a; - return a; - } - } - - return -1; // return OOV if not found -} - -int addWordToVocab(char *word) { - unsigned int hash; - - snprintf(vocab[vocab_size].word, strlen(word)+1, "%s", word); - vocab[vocab_size].cn = 0; - vocab_size++; - - if (vocab_size + 2 >= vocab_max_size) { // reallocate memory if needed - vocab_max_size += 100; - vocab = (struct vocab_word *) realloc( - vocab, - vocab_max_size * sizeof(struct vocab_word)); - } - - hash = getWordHash(word); - 
vocab_hash[hash] = vocab_size - 1; - - return vocab_size - 1; -} - -void readWord(char *word, FILE *fin) { - int a = 0, ch; - - while (!feof(fin)) { - ch = fgetc(fin); - - if (ch == 13) continue; - - if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { - if (a > 0) { - if (ch == '\n') ungetc(ch, fin); - break; - } - - if (ch == '\n') { - snprintf(word, strlen(NL_STRING) + 1, - "%s", const_cast<char *>(NL_STRING)); - return; - } else { - continue; - } - } - - word[a] = static_cast<char>(ch); - a++; - - if (a >= MAX_STRING) { - // printf("Too long word found!\n"); //truncate too long words - a--; - } - } - word[a] = 0; -} - -void sortVocab() { - int a, b, max; - vocab_word swap; - - for (a = 1; a < vocab_size; a++) { - max = a; - for (b = a + 1; b < vocab_size; b++) - if (vocab[max].cn < vocab[b].cn) max = b; - - swap = vocab[max]; - vocab[max] = vocab[a]; - vocab[a] = swap; - } -} - -int learnVocabFromTrainFile() { - char word[MAX_STRING]; - FILE *fin; - int a, i, train_wcn; - - for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; - - fin = fopen(train_file, "rb"); - - vocab_size = 0; - - addWordToVocab(const_cast<char *>(NL_STRING)); - - train_wcn = 0; - while (1) { - readWord(word, fin); - if (feof(fin)) break; - - train_wcn++; - - i = searchVocab(word); - if (i == -1) { - a = addWordToVocab(word); - vocab[a].cn = 1; - } else { - vocab[i].cn++; - } - } - - sortVocab(); - - if (debug_mode > 0) { - printf("Vocab size: %d\n", vocab_size); - printf("Words in train file: %d\n", train_wcn); - } - - fclose(fin); - return 0; -} - -int splitClasses() { - double df, dd; - int i, a, b; - - df = 0; - dd = 0; - a = 0; - b = 0; - - class_start = reinterpret_cast<int *>(calloc(class_size, sizeof(int))); - memset(class_start, 0x7f, sizeof(int) * class_size); - class_end = reinterpret_cast<int *>(calloc(class_size, sizeof(int))); - memset(class_end, 0, sizeof(int) * class_size); - - if (old_classes) { // old classes - for (i = 0; i < vocab_size; i++) - b += vocab[i].cn; - for 
(i = 0; i < vocab_size; i++) { - df += vocab[i].cn / static_cast<double>(b); - if (df > 1) df = 1; - if (df > (a + 1) / static_cast<double>(class_size)) { - vocab[i].class_index = a; - if (a < class_size - 1) a++; - } else { - vocab[i].class_index = a; - } - } - } else { // new classes - for (i = 0; i < vocab_size; i++) - b += vocab[i].cn; - for (i = 0; i < vocab_size; i++) - dd += sqrt(vocab[i].cn / static_cast<double>(b)); - for (i = 0; i < vocab_size; i++) { - df += sqrt(vocab[i].cn / static_cast<double>(b)) / dd; - if (df > 1) df = 1; - if (df > (a + 1) / static_cast<double>(class_size)) { - vocab[i].class_index = a; - if (a < class_size - 1) a++; - } else { - vocab[i].class_index = a; - } - } - } - - // after dividing classes, update class start and class end information - for (i = 0; i < vocab_size; i++) { - a = vocab[i].class_index; - class_start[a] = min(i, class_start[a]); - class_end[a] = max(i + 1, class_end[a]); - } - return 0; -} - -int init_class() { - // debug_mode = 1; - debug_mode = 0; - vocab_max_size = 100; // largest length value for each word - vocab_size = 0; - vocab = (struct vocab_word *) calloc(vocab_max_size, - sizeof(struct vocab_word)); - vocab_hash_size = 100000000; - vocab_hash = reinterpret_cast<int *>(calloc(vocab_hash_size, sizeof(int))); - old_classes = 1; - - // read vocab - learnVocabFromTrainFile(); - - // split classes - splitClasses(); - - return 0; -} - -int create_shard(const char *input_file, const char *output) { - DataShard dataShard(output, DataShard::kCreate); - singa::Record record; - auto* wordRecord = record.MutableExtension(word); - - FILE *fin; - int a, i; - fin = fopen(input_file, "rb"); - - int wcnt = 0; - char key[BUFFER_LEN]; - char wordstr[MAX_STRING]; - while (1) { - readWord(wordstr, fin); - if (feof(fin)) break; - i = searchVocab(wordstr); - if (i == -1) { - if (debug_mode) printf("unknown word [%s] detected!", wordstr); - } else { - wordRecord->set_word(string(wordstr)); - wordRecord->set_word_index(i); - 
int class_idx = vocab[i].class_index; - wordRecord->set_class_index(class_idx); - wordRecord->set_class_start(class_start[class_idx]); - wordRecord->set_class_end(class_end[class_idx]); - int length = snprintf(key, BUFFER_LEN, "%05d", wcnt++); - dataShard.Insert(string(key, length), record); - } - } - - dataShard.Flush(); - fclose(fin); - return 0; -} - -int argPos(char *str, int argc, char **argv) { - int a; - - for (a = 1; a < argc; a++) - if (!strcmp(str, argv[a])) - return a; - - return -1; -} - -int main(int argc, char **argv) { - int i; - FILE *f; - - // set debug mode - i = argPos(const_cast<char *>("-debug"), argc, argv); - if (i > 0) { - debug_mode = 1; - if (debug_mode > 0) - printf("debug mode: %d\n", debug_mode); - } - - // search for train file - i = argPos(const_cast<char *>("-train"), argc, argv); - if (i > 0) { - if (i + 1 == argc) { - printf("ERROR: training data file not specified!\n"); - return 0; - } - - snprintf(train_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]); - - if (debug_mode > 0) - printf("train file: %s\n", train_file); - - f = fopen(train_file, "rb"); - if (f == NULL) { - printf("ERROR: training data file not found!\n"); - return 0; - } - fclose(f); - } else { - printf("ERROR: training data must be set.\n"); - } - - // search for valid file - i = argPos(const_cast<char *>("-valid"), argc, argv); - if (i > 0) { - if (i + 1 == argc) { - printf("ERROR: validating data file not specified!\n"); - return 0; - } - - snprintf(valid_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]); - - if (debug_mode > 0) - printf("valid file: %s\n", valid_file); - - f = fopen(valid_file, "rb"); - if (f == NULL) { - printf("ERROR: validating data file not found!\n"); - return 0; - } - fclose(f); - valid_mode = 1; - } - - // search for test file - i = argPos(const_cast<char *>("-test"), argc, argv); - if (i > 0) { - if (i + 1 == argc) { - printf("ERROR: testing data file not specified!\n"); - return 0; - } - - snprintf(test_file, strlen(argv[i + 1])+1, "%s", 
argv[i + 1]); - - if (debug_mode > 0) - printf("test file: %s\n", test_file); - - f = fopen(test_file, "rb"); - if (f == NULL) { - printf("ERROR: testing data file not found!\n"); - return 0; - } - fclose(f); - test_mode = 1; - } - - // search for class size - i = argPos(const_cast<char *>("-class_size"), argc, argv); - if (i > 0) { - if (i + 1 == argc) { - printf("ERROR: class size not specified!\n"); - return 0; - } - - class_size = atoi(argv[i + 1]); - - if (debug_mode > 0) - printf("class size: %d\n", class_size); - } - if (class_size <= 0) { - printf("ERROR: no or invalid class size received!\n"); - return 0; - } - - init_class(); - - create_shard(train_file, "train_shard"); - if (valid_mode) create_shard(valid_file, "valid_shard"); - if (test_mode) create_shard(test_file, "test_shard"); - - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/job.conf ---------------------------------------------------------------------- diff --git a/examples/rnnlm/job.conf b/examples/rnnlm/job.conf index 021692f..a1f803d 100644 --- a/examples/rnnlm/job.conf +++ b/examples/rnnlm/job.conf @@ -33,7 +33,8 @@ layer { name: "data" user_type: "kData" [data_conf] { - path: "examples/rnnlm/train_shard" + backend: "kvfile" + path: "examples/rnnlm/train_data.bin" max_window: 10 } exclude: kVal @@ -43,19 +44,13 @@ layer { name: "data" user_type: "kData" [data_conf] { - path: "examples/rnnlm/valid_shard" + path: "examples/rnnlm/valid_data.bin" max_window: 10 } exclude: kTrain } layer{ - name:"label" - user_type: "kLabel" - srclayers: "data" -} - -layer{ name: "embedding" user_type: "kEmbedding" srclayers: "data" @@ -90,7 +85,7 @@ layer{ name: "loss" user_type: "kLoss" srclayers:"hidden" - srclayers:"label" + srclayers:"data" [loss_conf] { nclass:100 vocab_size: 3720 http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/main.cc ---------------------------------------------------------------------- diff --git 
a/examples/rnnlm/main.cc b/examples/rnnlm/main.cc index ea1dcdd..ceb8eb7 100644 --- a/examples/rnnlm/main.cc +++ b/examples/rnnlm/main.cc @@ -36,7 +36,6 @@ int main(int argc, char **argv) { driver.RegisterLayer<rnnlm::HiddenLayer, std::string>("kHidden"); driver.RegisterLayer<rnnlm::LossLayer, std::string>("kLoss"); driver.RegisterLayer<rnnlm::DataLayer, std::string>("kData"); - driver.RegisterLayer<rnnlm::LabelLayer, std::string>("kLabel"); singa::JobProto jobConf = driver.job_conf(); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/rnnlm.cc ---------------------------------------------------------------------- diff --git a/examples/rnnlm/rnnlm.cc b/examples/rnnlm/rnnlm.cc index c086972..a6b65f7 100644 --- a/examples/rnnlm/rnnlm.cc +++ b/examples/rnnlm/rnnlm.cc @@ -52,42 +52,62 @@ inline Tensor<cpu, 1> RTensor1(Blob<float>* blob) { /*******DataLayer**************/ DataLayer::~DataLayer() { - if (shard_ != nullptr) - delete shard_; - shard_ = nullptr; + if (store_ != nullptr) + delete store_; } void DataLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) { RNNLayer::Setup(conf, srclayers); - shard_ = new singa::DataShard( - conf.GetExtension(data_conf).path(), - singa::DataShard::kRead); string key; max_window_ = conf.GetExtension(data_conf).max_window(); - records_.resize(max_window_ + 1); // resize to # of records in data layer + data_.Reshape(vector<int>{max_window_ + 1, 4}); window_ = 0; - shard_->Next(&key, &records_[window_]); +} + +void SetInst(int k, WordRecord& word, Blob<float>* to) { + float* dptr = to->mutable_cpu_data() + k * 4; + dptr[0] = static_cast<float>(word.word_index()); + dptr[1] = static_cast<float>(word.class_index()); + dptr[2] = static_cast<float>(word.class_start()); + dptr[3] = static_cast<float>(word.class_end()); +} + +void ShiftInst(int from, int to, Blob<float>* data) { + const float* f = data->cpu_data() + from * 4; + float* t = data->mutable_cpu_data() + to * 4; + // hard code 
the feature dim to be 4; + t[0] = f[0]; t[1] = f[1]; t[2] = f[2]; t[3] = f[3]; } void DataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { - CHECK(records_.size() <= shard_->Count()); - records_[0] = records_[window_]; + string key, value; + WordRecord word; + if (store_ == nullptr) { + store_ = singa::io::OpenStore( + layer_conf_.GetExtension(data_conf).backend(), + layer_conf_.GetExtension(data_conf).path(), + singa::io::kRead); + store_->Read(&key, &value); + word.ParseFromString(value); + SetInst(0, word, &data_); + } + ShiftInst(window_, 0, &data_); window_ = max_window_; for (int i = 1; i <= max_window_; i++) { - string key; - if (shard_->Next(&key, &records_[i])) { - if (records_[i].GetExtension(word).word_index() == 0) { - window_ = i; - break; - } - } else { - shard_->SeekToFirst(); - CHECK(shard_->Next(&key, &records_[i])); + if (!store_->Read(&key, &value)) { + store_->SeekToFirst(); + CHECK(store_->Read(&key, &value)); + } + word.ParseFromString(value); + SetInst(i, word, &data_); + if (word.word_index() == 0) { + window_ = i; + break; } } } -/*******LabelLayer**************/ +/*******LabelLayer************** void LabelLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) { RNNLayer::Setup(conf, srclayers); @@ -108,6 +128,7 @@ void LabelLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { label[4 * i + 3] = wordrecord.class_index(); } } +*/ /*******EmbeddingLayer**************/ EmbeddingLayer::~EmbeddingLayer() { @@ -118,7 +139,7 @@ void EmbeddingLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) { RNNLayer::Setup(conf, srclayers); CHECK_EQ(srclayers.size(), 1); - int max_window = dynamic_cast<DataLayer*>(srclayers[0])->max_window(); + int max_window = srclayers[0]->data(this).shape()[0]; word_dim_ = conf.GetExtension(embedding_conf).word_dim(); data_.Reshape(vector<int>{max_window, word_dim_}); grad_.ReshapeLike(data_); @@ -130,12 +151,12 @@ void EmbeddingLayer::Setup(const LayerProto& 
conf, void EmbeddingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { auto datalayer = dynamic_cast<DataLayer*>(srclayers[0]); window_ = datalayer->window(); - auto records = datalayer->records(); auto words = RTensor2(&data_); auto embed = RTensor2(embed_->mutable_data()); + const float* idxptr = datalayer->data(this).cpu_data(); for (int t = 0; t < window_; t++) { - int idx = static_cast<int>(records[t].GetExtension(word).word_index()); + int idx = static_cast<int>(idxptr[t * 4]); CHECK_GE(idx, 0); CHECK_LT(idx, vocab_size_); Copy(words[t], embed[idx]); @@ -147,10 +168,10 @@ void EmbeddingLayer::ComputeGradient(int flag, auto grad = RTensor2(&grad_); auto gembed = RTensor2(embed_->mutable_grad()); auto datalayer = dynamic_cast<DataLayer*>(srclayers[0]); - auto records = datalayer->records(); gembed = 0; + const float* idxptr = datalayer->data(this).cpu_data(); for (int t = 0; t < window_; t++) { - int idx = static_cast<int>(records[t].GetExtension(word).word_index()); + int idx = static_cast<int>(idxptr[t * 4]); Copy(gembed[idx], grad[t]); } } @@ -241,8 +262,9 @@ void LossLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { float loss = 0.f, ppl = 0.f; for (int t = 0; t < window_; t++) { - int start = static_cast<int>(label[t * 4 + 0]); - int end = static_cast<int>(label[t * 4 + 1]); + // label is the next word + int start = static_cast<int>(label[(t + 1) * 4 + 2]); + int end = static_cast<int>(label[(t + 1) * 4 + 3]); auto wordWeight = word_weight.Slice(start, end); CHECK_GT(end, start); @@ -254,8 +276,8 @@ void LossLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { pclass[t] = dot(src[t], class_weight.T()); Softmax(pclass[t], pclass[t]); - int wid = static_cast<int>(label[t * 4 + 2]); - int cid = static_cast<int>(label[t * 4 + 3]); + int wid = static_cast<int>(label[(t + 1) * 4 + 0]); + int cid = static_cast<int>(label[(t + 1) * 4 + 1]); CHECK_GT(end, wid); CHECK_GE(wid, start); loss_ += -log(std::max(pword[wid - 
start] * pclass[t][cid], FLT_MIN)); @@ -276,10 +298,10 @@ void LossLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) { gclass_weight = 0; gword_weight = 0; for (int t = 0; t < window_; t++) { - int start = static_cast<int>(label[t * 4 + 0]); - int end = static_cast<int>(label[t * 4 + 1]); - int wid = static_cast<int>(label[t * 4 + 2]); - int cid = static_cast<int>(label[t * 4 + 3]); + int start = static_cast<int>(label[(t + 1) * 4 + 2]); + int end = static_cast<int>(label[(t + 1) * 4 + 3]); + int wid = static_cast<int>(label[(t + 1) * 4 + 0]); + int cid = static_cast<int>(label[(t + 1) * 4 + 1]); auto pword = RTensor1(&pword_[t]); CHECK_GT(end, wid); CHECK_GE(wid, start); @@ -304,6 +326,9 @@ void LossLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) { const std::string LossLayer::ToString(bool debug, int flag) { float loss = loss_ / num_; float ppl = exp10(- ppl_ / num_); + loss_ = 0; + num_ = 0; + ppl_ = 0; return "loss = " + std::to_string(loss) + ", ppl = " + std::to_string(ppl); } } // end of namespace rnnlm http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/rnnlm.h ---------------------------------------------------------------------- diff --git a/examples/rnnlm/rnnlm.h b/examples/rnnlm/rnnlm.h index 8ad7a68..cb3198e 100644 --- a/examples/rnnlm/rnnlm.h +++ b/examples/rnnlm/rnnlm.h @@ -69,20 +69,20 @@ class DataLayer : public RNNLayer, public singa::DataLayer { private: int max_window_; - singa::DataShard* shard_; + singa::io::Store* store_ = nullptr; }; /** * LabelLayer that read records_[1] to records_[window_] from DataLayer to * offer label information - */ class LabelLayer : public RNNLayer { public: void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override; void ComputeFeature(int flag, const vector<Layer*>& srclayers) override; void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {} }; + */ /** 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/rnnlm.proto ---------------------------------------------------------------------- diff --git a/examples/rnnlm/rnnlm.proto b/examples/rnnlm/rnnlm.proto index 8feb3f9..8cfec86 100644 --- a/examples/rnnlm/rnnlm.proto +++ b/examples/rnnlm/rnnlm.proto @@ -35,6 +35,7 @@ message LossProto { message DataProto { required string path = 1; optional int32 max_window = 2; + optional string backend = 3 [default = "kvfile"]; } extend singa.LayerProto { @@ -50,7 +51,3 @@ message WordRecord { optional int32 class_start = 4; optional int32 class_end = 5; } - -extend singa.Record { - optional WordRecord word = 101; -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/io/kvfile_store.h ---------------------------------------------------------------------- diff --git a/include/io/kvfile_store.h b/include/io/kvfile_store.h index bda7409..c3fd868 100644 --- a/include/io/kvfile_store.h +++ b/include/io/kvfile_store.h @@ -36,6 +36,7 @@ namespace singa { namespace io { */ class KVFileStore : public Store { public: + ~KVFileStore() { Close();} bool Open(const std::string& source, Mode mode) override; void Close() override; bool Read(std::string* key, std::string* value) override; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/io/store.h ---------------------------------------------------------------------- diff --git a/include/io/store.h b/include/io/store.h index 8665af0..f3cc282 100644 --- a/include/io/store.h +++ b/include/io/store.h @@ -38,6 +38,10 @@ enum Mode { kCreate, kRead, kAppend }; class Store { public: Store() { } + /** + * In case that users forget to call Close() to release resources, e.g., + * memory, you can release them here. + */ virtual ~Store() { } /** * @param[in] source path to the storage, could be a file path, folder path @@ -46,6 +50,9 @@ class Store { * @return true if open successfully, otherwise false. 
*/ virtual bool Open(const std::string& source, Mode mode) = 0; + /** + * Release resources. + */ virtual void Close() = 0; /** * Read a tuple. @@ -73,7 +80,22 @@ class Store { virtual void Flush() {} }; -Store* CreateStore(const std::string& store); +/** + * Create a Store object. + * + * @param[in] backend identifier for a specific backend. Two backends are + * inluced currently, i.e., "kvfile", "textfile" + * @return a pointer to the newly created Store. + */ +Store* CreateStore(const string& backend); +/** + * Create and open a Store object. + * + * @param[in] backend, @see CreateStore(). + * @param[in] path + * @param[in] mode kRead or kCreate or kAppend + */ +Store* OpenStore(const string& backend, const string& path, Mode mode); } // namespace io } /* singa */ #endif // SINGA_IO_STORE_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/io/textfile_store.h ---------------------------------------------------------------------- diff --git a/include/io/textfile_store.h b/include/io/textfile_store.h index 4c020e9..788dc20 100644 --- a/include/io/textfile_store.h +++ b/include/io/textfile_store.h @@ -32,6 +32,7 @@ namespace singa { namespace io { */ class TextFileStore : public Store { public: + ~TextFileStore() { Close(); } bool Open(const std::string& source, Mode mode) override; void Close() override; bool Read(std::string* key, std::string* value) override; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/neuralnet/input_layer.h ---------------------------------------------------------------------- diff --git a/include/neuralnet/input_layer.h b/include/neuralnet/input_layer.h index b5f2dd4..4dfded0 100644 --- a/include/neuralnet/input_layer.h +++ b/include/neuralnet/input_layer.h @@ -26,6 +26,7 @@ #include <vector> #include "neuralnet/layer.h" #include "utils/data_shard.h" +#include "io/store.h" /** * \file this file includes the declarations of input layers that inherit the * base InputLayer to load 
input features. @@ -40,6 +41,126 @@ * ParserLayer. */ namespace singa { +using std::string; +using std::vector; + +/************************Start of new input layers***************************/ +/** + * Base class for loading data from Store. + */ +class StoreInputLayer : virtual public InputLayer { + public: + ~StoreInputLayer(); + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + void ComputeFeature(int flag, const vector<Layer*>& srclayers) override; + + ConnectionType dst_layer_connection() const override { return kOneToMany; } + + protected: + /** + * Parsing the (key, val) tuple to get feature (and label). + * Subclasses must implment this function. + * @param[in] k parse this tuple as the k-th instance of one mini-batch. + * @param[in] flag used to guide the parsing, e.g., kDeploy phase should not + * parse labels from the tuple. + * @param[in] key + * @param[in] val + */ + virtual bool Parse(int k, int flag, const string& key, const string& val) = 0; + + protected: + int batchsize_; + io::Store* store_ = nullptr; +}; + +/** + * Base layer for parsing a key-value tuple as a feature vector with fixed + * length. The feature shape is indicated by users in the configuration. + * Each tuple may has a label. + */ +class SingleLabelRecordLayer : public StoreInputLayer { + public: + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + void ComputeFeature(int flag, const vector<Layer*>& srclayers) override; + + protected: + /** + * Load a single record (tuple), e.g., the mean or standard variance vector. 
+ */ + virtual void LoadRecord(const string& backend, const string& path, + Blob<float>* to) = 0; + + protected: + /** + * Feature standardization by processing each feature dimension via + * @f$ y = (x - mu)/ std @f$ + * <a href= "http://ufldl.stanford.edu/wiki/index.php/Data_Preprocessing"> + * UFLDL</a> + */ + Blob<float> mean_, std_; +}; + +/** + * Specific layer that parses the value string loaded by Store into a + * SingleLabelImageRecord. + */ +class ProtoRecordLayer : public SingleLabelRecordLayer { + public: + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + + protected: + /** + * Parse key as instance ID and val into SingleLabelImageRecord. + * @copydetails StoreInputLayer::Parse() + */ + bool Parse(int k, int flag, const string& key, const string& val) override; + void LoadRecord(const string& backend, + const string& path, + Blob<float>* to) override; + + private: + // TODO(wangwei) decode the image + bool encoded_; +}; + +/** + * Specific layer that parses the value string loaded by Store as a line from + * a CSV file. + * + * It assumes the first column is the label except that has_label_ is configured + * to false. Or the data is used in deploy mode. + */ +class CSVRecordLayer : public SingleLabelRecordLayer { + public: + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + + protected: + bool Parse(int k, int flag, const string& key, const string& val) override; + void LoadRecord(const string& backend, + const string& path, + Blob<float>* to) override; + + private: + std::string sep_; + bool has_label_; +}; + +/** + * Do preprocessing for images, including cropping, mirroring, resizing. 
+ */ +class ImagePreprocessLayer : public InputLayer { + public: + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + void ComputeFeature(int flag, const vector<Layer*>& srclayers); + + private: + bool mirror_ = false; + int cropsize_ = 0; + int resize_ = 0; + float scale_ = 1; +}; + +/************************End of new input layers***************************/ /** * Base layer for reading ::Record from local Shard, HDFS, lmdb, etc. */ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/neuralnet/layer.h ---------------------------------------------------------------------- diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h index bf83163..5ed0c7e 100644 --- a/include/neuralnet/layer.h +++ b/include/neuralnet/layer.h @@ -34,6 +34,8 @@ namespace singa { using std::vector; +// TODO(wangwei) make AuxType a template argument for Layer. +using AuxType = int; /** * Base layer class. * @@ -186,6 +188,12 @@ class Layer { return &data_; } /** + * @return auxiliary data, e.g., image label. + */ + virtual const vector<AuxType>& aux_data(const Layer* from = nullptr) const { + return aux_data_; + } + /** * @see data(). * @return the const ref of the Blob for the gradient of this layer, mainly * used in BP algorithm. 
@@ -205,6 +213,7 @@ class Layer { protected: LayerProto layer_conf_; Blob<float> data_, grad_; + vector<AuxType> aux_data_; }; /** http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/singa.h ---------------------------------------------------------------------- diff --git a/include/singa.h b/include/singa.h index 6c801ab..63acb0a 100644 --- a/include/singa.h +++ b/include/singa.h @@ -31,6 +31,7 @@ #include "utils/param.h" #include "utils/singleton.h" #include "utils/factory.h" +#include "io/store.h" #include "./driver.h" #endif // SINGA_SINGA_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/utils/tokenizer.h ---------------------------------------------------------------------- diff --git a/include/utils/tokenizer.h b/include/utils/tokenizer.h new file mode 100644 index 0000000..fc6ba8a --- /dev/null +++ b/include/utils/tokenizer.h @@ -0,0 +1,59 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
+* +*************************************************************/ +#ifndef SINGA_UTILS_TOKENIER_H_ +#define SINGA_UTILS_TOKENIER_H_ +#include <string> +#include <glog/logging.h> +namespace singa { +/** + * Tokenize a string. + * + * example: + * Tokenizer t("assa,asf;wes", ",;"); + * string x; + * t >> x; // x is assa + * t >> x; // x is asf + * t >> x; // x is wes + * cout << (t >> x); // print 0. + */ +class Tokenizer { + public: + Tokenizer(const std::string& str, const std::string& sep): start_(0), + sep_(sep), buf_(str) {} + Tokenizer & operator>>(std::string& out) { + CHECK_LT(start_, buf_.length()); + int start = start_; + auto pos = buf_.find_first_of(sep_, start); + if (pos == std::string::npos) + pos = buf_.length(); + start_ = pos + 1; + out = buf_.substr(start, pos); + return *this; + } + + bool Valid() { return start_ < buf_.length(); } + private: + unsigned start_; + std::string sep_; + const std::string& buf_; +}; +} /* singa */ +#endif // SINGA_UTILS_TOKENIER_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/src/driver.cc ---------------------------------------------------------------------- diff --git a/src/driver.cc b/src/driver.cc index 28669fa..0d3bbfc 100644 --- a/src/driver.cc +++ b/src/driver.cc @@ -54,6 +54,11 @@ void Driver::Init(int argc, char **argv) { ReadProtoFromTextFile(argv[arg_pos+1], &job_conf_); // register layers + + RegisterLayer<ProtoRecordLayer, int>(kProtoRecord); + RegisterLayer<CSVRecordLayer, int>(kCSVRecord); + RegisterLayer<ImagePreprocessLayer, int>(kImagePreprocess); + RegisterLayer<BridgeDstLayer, int>(kBridgeDst); RegisterLayer<BridgeSrcLayer, int>(kBridgeSrc); RegisterLayer<ConvolutionLayer, int>(kConvolution); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/src/io/store.cc ---------------------------------------------------------------------- diff --git a/src/io/store.cc b/src/io/store.cc index 6412628..8d3bf13 100644 --- a/src/io/store.cc +++ b/src/io/store.cc @@ -19,7 
+19,6 @@ * *************************************************************/ - #include "io/store.h" #include "io/kvfile_store.h" #include "io/textfile_store.h" @@ -52,6 +51,12 @@ Store* CreateStore(const std::string& backend) { #endif return store; } + +Store* OpenStore(const string& backend, const string& path, Mode mode) { + auto store = CreateStore(backend); + store->Open(path, mode); + return store; +} } /* io */ } /* singa */ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/src/io/textfile_store.cc ---------------------------------------------------------------------- diff --git a/src/io/textfile_store.cc b/src/io/textfile_store.cc index 74ec9a4..77694a0 100644 --- a/src/io/textfile_store.cc +++ b/src/io/textfile_store.cc @@ -40,6 +40,7 @@ void TextFileStore::Close() { fs_->close(); } delete fs_; + fs_ = nullptr; } }
