SINGA-97 Add HDFS Store: change create_data.cc to support "make create_hdfs", which creates the data and uploads it to HDFS. The default HDFS directory is "hdfs://node0:9000/examples/cifar10"; it can be customized with "make create_hdfs HDFS_DIR=xxx".
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/aada3658 Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/aada3658 Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/aada3658 Branch: refs/heads/master Commit: aada36581c9ec8965976ef5aa91311ae5f52ad70 Parents: 4cfe813 Author: Anh Dinh <[email protected]> Authored: Fri Jan 1 18:46:52 2016 +0800 Committer: WANG Sheng <[email protected]> Committed: Sat Jan 2 19:58:14 2016 +0800 ---------------------------------------------------------------------- examples/cifar10/Makefile.example | 14 ++++++++++++++ examples/cifar10/create_data.cc | 25 +++++++++++++------------ examples/mnist/Makefile.example | 14 ++++++++++++++ examples/mnist/create_data.cc | 6 +++++- 4 files changed, 46 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aada3658/examples/cifar10/Makefile.example ---------------------------------------------------------------------- diff --git a/examples/cifar10/Makefile.example b/examples/cifar10/Makefile.example index dd65d7d..16dc052 100644 --- a/examples/cifar10/Makefile.example +++ b/examples/cifar10/Makefile.example @@ -22,14 +22,28 @@ libs :=singa glog protobuf .PHONY: all download create +HDFS_DIR := hdfs://node0:9000/examples/cifar10 + download: cifar-10-binary-bin cifar-10-binary-bin: wget http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz tar xf cifar-10-binary.tar.gz +compile: + $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -lhdfs3 \ + -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \ + -Wl,-rpath=../../.libs/ -o create_data.bin + create: $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \ -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \ -Wl,-rpath=../../.libs/ -o create_data.bin 
./create_data.bin cifar-10-batches-bin . + +create_hdfs: + $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \ + -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \ + -Wl,-rpath=../../.libs/ -o create_data.bin + ./create_data.bin cifar-10-batches-bin $(HDFS_DIR) + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aada3658/examples/cifar10/create_data.cc ---------------------------------------------------------------------- diff --git a/examples/cifar10/create_data.cc b/examples/cifar10/create_data.cc index 5873c0e..37c58f6 100644 --- a/examples/cifar10/create_data.cc +++ b/examples/cifar10/create_data.cc @@ -19,16 +19,16 @@ * *************************************************************/ -// -// This code creates training and test DataShard for CIFAR dataset. -// It is adapted from the convert_cifar_data from Caffe -// -// Usage: -// create_shard.bin input_folder output_folder -// -// The CIFAR dataset could be downloaded at -// http://www.cs.toronto.edu/~kriz/cifar.html -// + +/** + * Create training and test DataShard for CIFAR dataset. + * It is adapted from convert_cifar_data from Caffe. + * create_shard.bin <input> <output_folder> + * + * Read from JobConf object the option to use KVfile, HDFS or other (1st layer + * store_conf object). 
+ * To load to HDFS, specify "hdfs://namenode/examples" as the output folder + */ #include <glog/logging.h> #include <fstream> @@ -38,6 +38,7 @@ #include "singa/io/store.h" #include "singa/proto/common.pb.h" +#include "singa/utils/common.h" using std::string; @@ -58,7 +59,6 @@ void create_data(const string& input_folder, const string& output_folder) { int label; char str_buffer[kCIFARImageNBytes]; string rec_buf; - singa::RecordProto image; image.add_shape(3); image.add_shape(kCIFARSize); @@ -69,7 +69,8 @@ void create_data(const string& input_folder, const string& output_folder) { for (int i = 0; i < kCIFARImageNBytes; i++) mean.add_data(0.f); - auto store = singa::io::CreateStore("kvfile"); + string store_backend = (output_folder.find("hdfs")!=-1) ? "hdfsfile" : "kvfile"; + auto store = singa::io::CreateStore(store_backend); CHECK(store->Open(output_folder + "/train_data.bin", singa::io::kCreate)); LOG(INFO) << "Preparing training data"; int count = 0; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aada3658/examples/mnist/Makefile.example ---------------------------------------------------------------------- diff --git a/examples/mnist/Makefile.example b/examples/mnist/Makefile.example index 733633d..48d2fd8 100644 --- a/examples/mnist/Makefile.example +++ b/examples/mnist/Makefile.example @@ -23,6 +23,8 @@ libs :=singa glog protobuf .PHONY: all download create +HDFS_DIR := hdfs://node0:9000/examples/cifar10 + download: mnist mnist: @@ -33,9 +35,21 @@ mnist: gunzip train-images-idx3-ubyte.gz && gunzip train-labels-idx1-ubyte.gz gunzip t10k-images-idx3-ubyte.gz && gunzip t10k-labels-idx1-ubyte.gz +compile: + $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -lhdfs3 -I../../include \ + -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \ + -o create_data.bin + create: $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -I../../include \ -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs 
-Wl,-rpath=../../.libs/ \ -o create_data.bin ./create_data.bin train-images-idx3-ubyte train-labels-idx1-ubyte train_data.bin ./create_data.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte test_data.bin + +create_hdfs: + $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \ + -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \ + -Wl,-rpath=../../.libs/ -o create_data.bin + ./create_data.bin cifar-10-batches-bin $(HDFS_DIR) + http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/aada3658/examples/mnist/create_data.cc ---------------------------------------------------------------------- diff --git a/examples/mnist/create_data.cc b/examples/mnist/create_data.cc index 5e51e97..59da860 100644 --- a/examples/mnist/create_data.cc +++ b/examples/mnist/create_data.cc @@ -46,6 +46,8 @@ uint32_t swap_endian(uint32_t val) { return (val << 16) | (val >> 16); } +// output is the full path, unlike create_data in CIFAR with only +// specifies the directory void create_data(const char* image_filename, const char* label_filename, const char* output) { // Open files @@ -76,7 +78,9 @@ void create_data(const char* image_filename, const char* label_filename, image_file.read(reinterpret_cast<char*>(&cols), 4); cols = swap_endian(cols); - auto store = singa::io::OpenStore("kvfile", output, singa::io::kCreate); + // read backend from the job.conf + string store_backend = (output_folder.find("hdfs")!=-1) ? "hdfsfile" : "kvfile"; + auto store = singa::io::CreateStore(store_backend); char label; char* pixels = new char[rows * cols]; int count = 0;
