SINGA-97  Add HDFS Store

Change create_data.cc to support "make create_hdfs", which creates and uploads
data to HDFS. The default HDFS directory is "hdfs://node0:9000/examples/cifar10";
it can be customized with "make create_hdfs HDFS_DIR=xxx".


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/aada3658
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/aada3658
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/aada3658

Branch: refs/heads/master
Commit: aada36581c9ec8965976ef5aa91311ae5f52ad70
Parents: 4cfe813
Author: Anh Dinh <[email protected]>
Authored: Fri Jan 1 18:46:52 2016 +0800
Committer: WANG Sheng <[email protected]>
Committed: Sat Jan 2 19:58:14 2016 +0800

----------------------------------------------------------------------
 examples/cifar10/Makefile.example | 14 ++++++++++++++
 examples/cifar10/create_data.cc   | 25 +++++++++++++------------
 examples/mnist/Makefile.example   | 14 ++++++++++++++
 examples/mnist/create_data.cc     |  6 +++++-
 4 files changed, 46 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aada3658/examples/cifar10/Makefile.example
----------------------------------------------------------------------
diff --git a/examples/cifar10/Makefile.example b/examples/cifar10/Makefile.example
index dd65d7d..16dc052 100644
--- a/examples/cifar10/Makefile.example
+++ b/examples/cifar10/Makefile.example
@@ -22,14 +22,28 @@ libs :=singa glog protobuf
 
 .PHONY: all download create
 
+HDFS_DIR := hdfs://node0:9000/examples/cifar10
+
 download: cifar-10-binary-bin
 
 cifar-10-binary-bin:
        wget http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
        tar xf cifar-10-binary.tar.gz
 
+compile:
+       $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -lhdfs3 \
+               -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \
+               -Wl,-rpath=../../.libs/  -o create_data.bin
+
 create:
        $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \
               -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \
                -Wl,-rpath=../../.libs/  -o create_data.bin
        ./create_data.bin cifar-10-batches-bin .
+
+create_hdfs:
+       $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \
+               -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \
+               -Wl,-rpath=../../.libs/  -o create_data.bin
+       ./create_data.bin cifar-10-batches-bin $(HDFS_DIR) 
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aada3658/examples/cifar10/create_data.cc
----------------------------------------------------------------------
diff --git a/examples/cifar10/create_data.cc b/examples/cifar10/create_data.cc
index 5873c0e..37c58f6 100644
--- a/examples/cifar10/create_data.cc
+++ b/examples/cifar10/create_data.cc
@@ -19,16 +19,16 @@
 *
 *************************************************************/
 
-//
-// This code creates training and test DataShard for CIFAR dataset.
-// It is adapted from the convert_cifar_data from Caffe
-//
-// Usage:
-//    create_shard.bin input_folder output_folder
-//
-// The CIFAR dataset could be downloaded at
-//    http://www.cs.toronto.edu/~kriz/cifar.html
-//
+
+/**
+ * Create training and test DataShards for the CIFAR dataset.
+ * Adapted from convert_cifar_data in Caffe.
+ *    create_shard.bin <input_folder> <output_folder>
+ *
+ * The store backend (KVFile, HDFS or other) is read from the JobConf, via the
+ * store_conf object of the first layer.
+ * To load into HDFS, specify e.g. "hdfs://namenode/examples" as the output folder.
+ */
 
 #include <glog/logging.h>
 #include <fstream>
@@ -38,6 +38,7 @@
 
 #include "singa/io/store.h"
 #include "singa/proto/common.pb.h"
+#include "singa/utils/common.h"
 
 using std::string;
 
@@ -58,7 +59,6 @@ void create_data(const string& input_folder, const string& output_folder) {
   int label;
   char str_buffer[kCIFARImageNBytes];
   string rec_buf;
-
   singa::RecordProto image;
   image.add_shape(3);
   image.add_shape(kCIFARSize);
@@ -69,7 +69,8 @@ void create_data(const string& input_folder, const string& output_folder) {
   for (int i = 0; i < kCIFARImageNBytes; i++)
     mean.add_data(0.f);
 
-  auto store = singa::io::CreateStore("kvfile");
+  string store_backend = (output_folder.find("hdfs") != string::npos) ? "hdfsfile" : "kvfile";
+  auto store = singa::io::CreateStore(store_backend);
   CHECK(store->Open(output_folder + "/train_data.bin", singa::io::kCreate));
   LOG(INFO) << "Preparing training data";
   int count = 0;
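
A note on the store-selection idiom in the hunk above: the backend is inferred
from the output path with a plain substring search, so any path containing
"hdfs" is routed to the "hdfsfile" store and everything else falls back to
"kvfile". A minimal standalone sketch of just that check (the PickBackend
helper name is illustrative, not part of the commit):

    #include <iostream>
    #include <string>

    // Choose the store backend from the output path, mirroring the check in
    // create_data.cc: std::string::find returns std::string::npos when the
    // substring is absent.
    std::string PickBackend(const std::string& output_folder) {
      return output_folder.find("hdfs") != std::string::npos ? "hdfsfile"
                                                             : "kvfile";
    }

    int main() {
      std::cout << PickBackend("hdfs://node0:9000/examples/cifar10") << "\n";  // hdfsfile
      std::cout << PickBackend(".") << "\n";                                   // kvfile
      return 0;
    }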

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aada3658/examples/mnist/Makefile.example
----------------------------------------------------------------------
diff --git a/examples/mnist/Makefile.example b/examples/mnist/Makefile.example
index 733633d..48d2fd8 100644
--- a/examples/mnist/Makefile.example
+++ b/examples/mnist/Makefile.example
@@ -23,6 +23,8 @@ libs :=singa glog protobuf
 
 .PHONY: all download create
 
+HDFS_DIR := hdfs://node0:9000/examples/mnist
+
 download: mnist
 
 mnist:
@@ -33,9 +35,21 @@ mnist:
        gunzip train-images-idx3-ubyte.gz && gunzip train-labels-idx1-ubyte.gz
        gunzip t10k-images-idx3-ubyte.gz && gunzip t10k-labels-idx1-ubyte.gz
 
+compile:
+       $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -lhdfs3 -I../../include \
+               -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \
+               -o create_data.bin
+
 create:
        $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -I../../include \
                -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \
                -o create_data.bin
        ./create_data.bin train-images-idx3-ubyte train-labels-idx1-ubyte train_data.bin
        ./create_data.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte test_data.bin
+
+create_hdfs:
+       $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \
+               -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \
+               -Wl,-rpath=../../.libs/  -o create_data.bin
+       ./create_data.bin train-images-idx3-ubyte train-labels-idx1-ubyte $(HDFS_DIR)/train_data.bin
+       ./create_data.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte $(HDFS_DIR)/test_data.bin
+

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aada3658/examples/mnist/create_data.cc
----------------------------------------------------------------------
diff --git a/examples/mnist/create_data.cc b/examples/mnist/create_data.cc
index 5e51e97..59da860 100644
--- a/examples/mnist/create_data.cc
+++ b/examples/mnist/create_data.cc
@@ -46,6 +46,8 @@ uint32_t swap_endian(uint32_t val) {
     return (val << 16) | (val >> 16);
 }
 
+// output is the full path, unlike create_data in CIFAR, which only
+// specifies the output directory
 void create_data(const char* image_filename, const char* label_filename,
         const char* output) {
   // Open files
@@ -76,7 +78,9 @@ void create_data(const char* image_filename, const char* label_filename,
   image_file.read(reinterpret_cast<char*>(&cols), 4);
   cols = swap_endian(cols);
 
-  auto store = singa::io::OpenStore("kvfile", output, singa::io::kCreate);
+  // pick the store backend from the output path: HDFS destinations use "hdfsfile"
+  string store_backend = (string(output).find("hdfs") != string::npos) ? "hdfsfile" : "kvfile";
+  auto store = singa::io::OpenStore(store_backend, output, singa::io::kCreate);
   char label;
   char* pixels = new char[rows * cols];
   int count = 0;
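
Background for the swap_endian calls in the MNIST hunk: the IDX files store
their header fields (magic number, item count, rows, cols) as big-endian
32-bit integers, so each 4-byte read is byte-swapped on little-endian hosts.
A standalone sketch of that header parsing, assuming the downloaded
train-images-idx3-ubyte file is present (only the read/swap lines appear in
the commit; the full swap_endian body and the surrounding main are
illustrative):

    #include <cstdint>
    #include <fstream>
    #include <iostream>

    // Full 32-bit byte swap; the hunk above shows only its final half-word swap.
    uint32_t swap_endian(uint32_t val) {
      val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0x00FF00FF);
      return (val << 16) | (val >> 16);
    }

    int main() {
      std::ifstream image_file("train-images-idx3-ubyte", std::ios::binary);
      if (!image_file) return 1;
      uint32_t magic = 0, num_items = 0, rows = 0, cols = 0;
      image_file.read(reinterpret_cast<char*>(&magic), 4);
      image_file.read(reinterpret_cast<char*>(&num_items), 4);
      image_file.read(reinterpret_cast<char*>(&rows), 4);
      image_file.read(reinterpret_cast<char*>(&cols), 4);
      // Convert from the big-endian on-disk layout to host byte order.
      std::cout << swap_endian(num_items) << " images of "
                << swap_endian(rows) << " x " << swap_endian(cols) << "\n";
      return 0;
    }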
