SINGA-82 Refactor input layers using data store abstraction * Add StoreLayer to read data from Store, e.g., KVFile, TextFile (will add support for HDFS later). * Implement subclasses of StoreLayer to parse different format tuples, e.g., SingleLabelImageRecord or CSV line. * Update examples to use the new input layers. * Add unit tests. * Add a function for Layer class, which returns a vector<AuxType> for auxiliary data (e.g., label).
TODO 1. make AuxType a template argument of Layer class, and extend data() to return a vector of Blob for multiple dense features. 2. separate layer classes into different files to make the structure of the source folder clear. Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/5f010caa Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/5f010caa Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/5f010caa Branch: refs/heads/master Commit: 5f010caabd7c09cd9fabee666d93a36377639270 Parents: d99b24c Author: Wei Wang <[email protected]> Authored: Tue Oct 6 01:10:40 2015 +0800 Committer: wang sheng <[email protected]> Committed: Wed Oct 7 15:19:59 2015 +0800 ---------------------------------------------------------------------- Makefile.am | 15 +- examples/cifar10/Makefile.example | 11 +- examples/cifar10/create_data.cc | 135 +++++++++ examples/cifar10/create_shard.cc | 131 --------- examples/cifar10/job.conf | 42 ++- examples/mnist/Makefile.example | 10 +- examples/mnist/conv.conf | 42 ++- examples/mnist/create_data.cc | 119 ++++++++ examples/mnist/create_shard.cc | 120 -------- examples/mnist/job.conf | 46 ++- examples/rbm/autoencoder.conf | 41 ++- examples/rbm/rbm1.conf | 52 ++-- examples/rbm/rbm2.conf | 52 ++-- examples/rbm/rbm3.conf | 51 ++-- examples/rbm/rbm4.conf | 53 ++-- examples/rnnlm/Makefile.example | 6 +- examples/rnnlm/create_data.cc | 472 +++++++++++++++++++++++++++++++ examples/rnnlm/create_shard.cc | 471 ------------------------------ examples/rnnlm/job.conf | 13 +- examples/rnnlm/main.cc | 1 - examples/rnnlm/rnnlm.cc | 91 +++--- examples/rnnlm/rnnlm.h | 4 +- examples/rnnlm/rnnlm.proto | 5 +- include/io/kvfile_store.h | 1 + include/io/store.h | 24 +- include/io/textfile_store.h | 1 + include/neuralnet/input_layer.h | 121 ++++++++ include/neuralnet/layer.h | 9 + include/singa.h | 1 + include/utils/tokenizer.h | 59 ++++ src/driver.cc | 5 + 
src/io/store.cc | 7 +- src/io/textfile_store.cc | 1 + src/neuralnet/input_layer.cc | 218 +++++++++++++- src/neuralnet/loss_layer.cc | 4 +- src/proto/job.proto | 26 +- src/test/test_csv_record_layer.cc | 92 ++++++ src/test/test_proto_record_layer.cc | 122 ++++++++ src/utils/image_transform.cc | 56 ++++ 39 files changed, 1711 insertions(+), 1019 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/Makefile.am ---------------------------------------------------------------------- diff --git a/Makefile.am b/Makefile.am index f8e765d..a1496bd 100644 --- a/Makefile.am +++ b/Makefile.am @@ -35,6 +35,7 @@ SINGA_SRCS := src/driver.cc \ src/utils/updater.cc \ src/utils/data_shard.cc \ src/utils/blob.cc \ + src/utils/image_transform.cc \ src/server.cc \ src/worker.cc \ src/stub.cc \ @@ -64,6 +65,8 @@ SINGA_HDRS := include/singa.h \ include/utils/blob.h \ include/utils/updater.h \ include/utils/tinydir.h \ + include/utils/tokenizer.h \ + include/utils/image_transform.h \ include/server.h \ include/worker.h \ include/stub.h \ @@ -84,10 +87,10 @@ SINGA_HDRS := include/singa.h \ include/mshadow/tensor_random.h \ include/comm/msg.h \ include/comm/socket.h - src/io/store.h \ - src/io/kvfile.h \ - src/io/kvfile_store.h \ - src/io/textfile_store.h + include/io/store.h \ + include/io/kvfile.h \ + include/io/kvfile_store.h \ + include/io/textfile_store.h GTEST_SRCS := include/gtest/gtest-all.cc GTEST_HRDS := include/gtest/gtest.h @@ -98,7 +101,9 @@ TEST_SRCS := include/gtest/gtest_main.cc \ src/test/test_neuralnet.cc \ src/test/test_paramslicer.cc \ src/test/test_shard.cc \ - src/test/test_store.cc + src/test/test_store.cc \ + src/test/test_proto_record_layer.cc \ + src/test/test_csv_record_layer.cc #EXTRA_PROGRAMS = $(PROGS) EXTRA_PROGRAMS = singatest http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/cifar10/Makefile.example 
---------------------------------------------------------------------- diff --git a/examples/cifar10/Makefile.example b/examples/cifar10/Makefile.example index 9e65a58..dd65d7d 100644 --- a/examples/cifar10/Makefile.example +++ b/examples/cifar10/Makefile.example @@ -29,12 +29,7 @@ cifar-10-binary-bin: tar xf cifar-10-binary.tar.gz create: - $(CXX) create_shard.cc -std=c++11 -lsinga -lprotobuf -lglog \ + $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog \ -I../../include -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs \ - -Wl,-rpath=../../.libs/ -o create_shard.bin - mkdir cifar10_train_shard - mkdir cifar10_test_shard - ./create_shard.bin cifar-10-batches-bin . - - - + -Wl,-rpath=../../.libs/ -o create_data.bin + ./create_data.bin cifar-10-batches-bin . http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/cifar10/create_data.cc ---------------------------------------------------------------------- diff --git a/examples/cifar10/create_data.cc b/examples/cifar10/create_data.cc new file mode 100644 index 0000000..5fddd1d --- /dev/null +++ b/examples/cifar10/create_data.cc @@ -0,0 +1,135 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. 
See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +// +// This code creates training and test DataShard for CIFAR dataset. +// It is adapted from the convert_cifar_data from Caffe +// +// Usage: +// create_shard.bin input_folder output_folder +// +// The CIFAR dataset could be downloaded at +// http://www.cs.toronto.edu/~kriz/cifar.html +// + +#include <glog/logging.h> +#include <fstream> +#include <string> +#include <cstdint> +#include <iostream> + +#include "./singa.h" + +using std::string; + +const int kCIFARSize = 32; +const int kCIFARImageNBytes = 3072; +const int kCIFARBatchSize = 10000; +const int kCIFARTrainBatches = 5; + +void read_image(std::ifstream* file, int* label, char* buffer) { + char label_char; + file->read(&label_char, 1); + *label = label_char; + file->read(buffer, kCIFARImageNBytes); + return; +} + +void create_data(const string& input_folder, const string& output_folder) { + int label; + char str_buffer[kCIFARImageNBytes]; + string rec_buf; + + singa::SingleLabelImageRecord image;; + image.add_shape(3); + image.add_shape(kCIFARSize); + image.add_shape(kCIFARSize); + + singa::SingleLabelImageRecord mean; + mean.CopyFrom(image); + for (int i = 0; i < kCIFARImageNBytes; i++) + mean.add_data(0.f); + + auto store = singa::io::CreateStore("kvfile"); + CHECK(store->Open(output_folder + "/train_data.bin", singa::io::kCreate)); + LOG(INFO) << "Preparing training data"; + int count = 0; + for (int fileid = 0; fileid < kCIFARTrainBatches; ++fileid) { + LOG(INFO) << "Training Batch " << fileid + 1; + snprintf(str_buffer, kCIFARImageNBytes, "/data_batch_%d.bin", fileid + 1); + std::ifstream data_file((input_folder + str_buffer).c_str(), + std::ios::in | std::ios::binary); + CHECK(data_file.is_open()) << "Unable to open train file #" << fileid + 1; + for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) { + read_image(&data_file, 
&label, str_buffer); + image.set_label(label); + image.set_pixel(str_buffer, kCIFARImageNBytes); + image.SerializeToString(&rec_buf); + int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", count); + CHECK(store->Write(string(str_buffer, length), rec_buf)); + + const string& pixels = image.pixel(); + for (int i = 0; i < kCIFARImageNBytes; i++) + mean.set_data(i, mean.data(i) + static_cast<uint8_t>(pixels[i])); + count += 1; + } + } + store->Flush(); + store->Close(); + + LOG(INFO) << "Create image mean"; + store->Open(output_folder + "/image_mean.bin", singa::io::kCreate); + for (int i = 0; i < kCIFARImageNBytes; i++) + mean.set_data(i, mean.data(i) / count); + mean.SerializeToString(&rec_buf); + store->Write("mean", rec_buf); + store->Flush(); + store->Close(); + + LOG(INFO) << "Create test data"; + store->Open(output_folder + "/test_data.bin", singa::io::kCreate); + std::ifstream data_file((input_folder + "/test_batch.bin").c_str(), + std::ios::in | std::ios::binary); + CHECK(data_file.is_open()) << "Unable to open test file."; + for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) { + read_image(&data_file, &label, str_buffer); + image.set_label(label); + image.set_pixel(str_buffer, kCIFARImageNBytes); + image.SerializeToString(&rec_buf); + int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", itemid); + CHECK(store->Write(string(str_buffer, length), rec_buf)); + } + store->Flush(); + store->Close(); +} + +int main(int argc, char** argv) { + if (argc != 3) { + std::cout <<"Create train and test DataShard for Cifar dataset.\n" + << "Usage:\n" + << " create_data.bin input_folder output_folder\n" + << "Where the input folder should contain the binary batch files.\n"; + } else { + google::InitGoogleLogging(argv[0]); + create_data(string(argv[1]), string(argv[2])); + } + return 0; +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/cifar10/create_shard.cc 
---------------------------------------------------------------------- diff --git a/examples/cifar10/create_shard.cc b/examples/cifar10/create_shard.cc deleted file mode 100644 index 0a00639..0000000 --- a/examples/cifar10/create_shard.cc +++ /dev/null @@ -1,131 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -// -// This code creates training and test DataShard for CIFAR dataset. 
-// It is adapted from the convert_cifar_data from Caffe -// -// Usage: -// create_shard.bin input_folder output_folder -// -// The CIFAR dataset could be downloaded at -// http://www.cs.toronto.edu/~kriz/cifar.html -// - -#include <fstream> -#include <string> - -#include <glog/logging.h> -#include <cstdint> -#include <iostream> - -#include "singa.h" - -using std::string; - -using singa::DataShard; -using singa::WriteProtoToBinaryFile; - -const int kCIFARSize = 32; -const int kCIFARImageNBytes = 3072; -const int kCIFARBatchSize = 10000; -const int kCIFARTrainBatches = 5; - -void read_image(std::ifstream* file, int* label, char* buffer) { - char label_char; - file->read(&label_char, 1); - *label = label_char; - file->read(buffer, kCIFARImageNBytes); - return; -} - -void create_shard(const string& input_folder, const string& output_folder) { - int label; - // Data buffer - char str_buffer[kCIFARImageNBytes]; - singa::Record record; - singa::SingleLabelImageRecord* image=record.mutable_image();; - image->add_shape(3); - image->add_shape(kCIFARSize); - image->add_shape(kCIFARSize); - - singa::SingleLabelImageRecord mean; - mean.CopyFrom(*image); - for(int i=0;i<kCIFARImageNBytes;i++) - mean.add_data(0.); - - DataShard train_shard(output_folder+"/cifar10_train_shard",DataShard::kCreate); - LOG(INFO) << "Writing Training data"; - int count=0; - for (int fileid = 0; fileid < kCIFARTrainBatches; ++fileid) { - // Open files - LOG(INFO) << "Training Batch " << fileid + 1; - snprintf(str_buffer, kCIFARImageNBytes, "/data_batch_%d.bin", fileid + 1); - std::ifstream data_file((input_folder + str_buffer).c_str(), - std::ios::in | std::ios::binary); - CHECK(data_file) << "Unable to open train file #" << fileid + 1; - for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) { - read_image(&data_file, &label, str_buffer); - image->set_label(label); - image->set_pixel(str_buffer, kCIFARImageNBytes); - int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", - fileid * 
kCIFARBatchSize + itemid); - CHECK(train_shard.Insert(string(str_buffer, length), record)); - - const string& pixels=image->pixel(); - for(int i=0;i<kCIFARImageNBytes;i++) - mean.set_data(i, mean.data(i)+static_cast<uint8_t>(pixels[i])); - count+=1; - } - } - train_shard.Flush(); - for(int i=0;i<kCIFARImageNBytes;i++) - mean.set_data(i, mean.data(i)/count); - WriteProtoToBinaryFile(mean, (output_folder+"/image_mean.bin").c_str()); - - LOG(INFO) << "Writing Testing data"; - DataShard test_shard(output_folder+"/cifar10_test_shard",DataShard::kCreate); - // Open files - std::ifstream data_file((input_folder + "/test_batch.bin").c_str(), - std::ios::in | std::ios::binary); - CHECK(data_file) << "Unable to open test file."; - for (int itemid = 0; itemid < kCIFARBatchSize; ++itemid) { - read_image(&data_file, &label, str_buffer); - image->set_label(label); - image->set_pixel(str_buffer, kCIFARImageNBytes); - int length = snprintf(str_buffer, kCIFARImageNBytes, "%05d", itemid); - CHECK(test_shard.Insert(string(str_buffer, length), record)); - } - test_shard.Flush(); -} - -int main(int argc, char** argv) { - if (argc != 3) { - std::cout<<"Create train and test DataShard for Cifar dataset.\n" - <<"Usage:\n" - <<" create_shard.bin input_folder output_folder\n" - <<"Where the input folder should contain the binary batch files.\n"; - } else { - google::InitGoogleLogging(argv[0]); - create_shard(string(argv[1]), string(argv[2])); - } - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/cifar10/job.conf ---------------------------------------------------------------------- diff --git a/examples/cifar10/job.conf b/examples/cifar10/job.conf index 343d969..57f4b36 100644 --- a/examples/cifar10/job.conf +++ b/examples/cifar10/job.conf @@ -1,7 +1,7 @@ name: "cifar10-convnet" train_steps: 1000 test_steps: 100 -test_freq:300 +test_freq: 300 disp_freq: 30 train_one_batch { alg: kBP @@ -24,41 +24,38 @@ updater{ neuralnet { layer{ name: "data" - 
type: kShardData - sharddata_conf { - path: "examples/cifar10/cifar10_train_shard" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/cifar10/train_data.bin" + mean_file: "examples/cifar10/image_mean.bin" batchsize: 64 random_skip: 5000 + shape: 3 + shape: 32 + shape: 32 } exclude: kTest } layer{ name: "data" - type: kShardData - sharddata_conf { - path: "examples/cifar10/cifar10_test_shard" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/cifar10/test_data.bin" + mean_file: "examples/cifar10/image_mean.bin" batchsize: 100 + shape: 3 + shape: 32 + shape: 32 } exclude: kTrain } - layer{ - name:"rgb" - type: kRGBImage - srclayers: "data" - rgbimage_conf { - meanfile: "examples/cifar10/image_mean.bin" - } - } - layer{ - name: "label" - type: kLabel - srclayers: "data" - } layer { name: "conv1" type: kCConvolution - srclayers: "rgb" + srclayers: "data" convolution_conf { num_filters: 32 kernel: 5 @@ -223,7 +220,6 @@ neuralnet { } } } - layer{ name: "loss" type: kSoftmaxLoss @@ -231,7 +227,7 @@ neuralnet { topk:1 } srclayers:"ip1" - srclayers: "label" + srclayers: "data" } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/Makefile.example ---------------------------------------------------------------------- diff --git a/examples/mnist/Makefile.example b/examples/mnist/Makefile.example index 4df4edd..733633d 100644 --- a/examples/mnist/Makefile.example +++ b/examples/mnist/Makefile.example @@ -34,10 +34,8 @@ mnist: gunzip t10k-images-idx3-ubyte.gz && gunzip t10k-labels-idx1-ubyte.gz create: - $(CXX) create_shard.cc -std=c++11 -lsinga -lprotobuf -lglog -I../../include \ + $(CXX) create_data.cc -std=c++11 -lsinga -lprotobuf -lglog -I../../include \ -L../../.libs/ -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \ - -o create_shard.bin - mkdir mnist_train_shard - mkdir mnist_test_shard - ./create_shard.bin train-images-idx3-ubyte train-labels-idx1-ubyte 
mnist_train_shard - ./create_shard.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte mnist_test_shard + -o create_data.bin + ./create_data.bin train-images-idx3-ubyte train-labels-idx1-ubyte train_data.bin + ./create_data.bin t10k-images-idx3-ubyte t10k-labels-idx1-ubyte test_data.bin http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/conv.conf ---------------------------------------------------------------------- diff --git a/examples/mnist/conv.conf b/examples/mnist/conv.conf index 7f7a158..ba631c1 100644 --- a/examples/mnist/conv.conf +++ b/examples/mnist/conv.conf @@ -22,43 +22,39 @@ updater { neuralnet { layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" batchsize: 64 + std_value: 255 + random_skip: 5000 + shape: 1 + shape: 28 + shape: 28 } exclude: kTest } layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 batchsize: 100 + shape: 1 + shape: 28 + shape: 28 } exclude: kTrain } - layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a:255 - norm_b:0 - } - } - - layer{ - name: "label" - type: kLabel - srclayers: "data" - } layer { name: "conv1" type: kCConvolution - srclayers: "mnist" + srclayers: "data" convolution_conf { num_filters: 20 kernel: 5 @@ -181,7 +177,7 @@ neuralnet { topk:1 } srclayers:"ip2" - srclayers:"label" + srclayers:"data" } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/create_data.cc ---------------------------------------------------------------------- diff --git a/examples/mnist/create_data.cc b/examples/mnist/create_data.cc new file mode 100644 index 0000000..aad1f56 --- /dev/null +++ b/examples/mnist/create_data.cc @@ -0,0 +1,119 
@@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +// +// This code creates DataShard for MNIST dataset. +// It is adapted from the convert_mnist_data from Caffe +// +// Usage: +// create_shard.bin input_image_file input_label_file output_folder +// The MNIST dataset could be downloaded at +// http://yann.lecun.com/exdb/mnist/ + +#include <glog/logging.h> +#include <cstdint> +#include <iostream> + +#include <fstream> +#include <string> + +#include "io/store.h" +#include "utils/common.h" +#include "proto/common.pb.h" + +using std::string; + +uint32_t swap_endian(uint32_t val) { + val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); + return (val << 16) | (val >> 16); +} + +void create_data(const char* image_filename, const char* label_filename, + const char* output) { + // Open files + std::ifstream image_file(image_filename, std::ios::in | std::ios::binary); + std::ifstream label_file(label_filename, std::ios::in | std::ios::binary); + CHECK(image_file) << "Unable to open file " << image_filename; + CHECK(label_file) << "Unable to open file " << label_filename; + // Read the magic and the meta 
data + uint32_t magic; + uint32_t num_items; + uint32_t num_labels; + uint32_t rows; + uint32_t cols; + + image_file.read(reinterpret_cast<char*>(&magic), 4); + magic = swap_endian(magic); + CHECK_EQ(magic, 2051) << "Incorrect image file magic."; + label_file.read(reinterpret_cast<char*>(&magic), 4); + magic = swap_endian(magic); + CHECK_EQ(magic, 2049) << "Incorrect label file magic."; + image_file.read(reinterpret_cast<char*>(&num_items), 4); + num_items = swap_endian(num_items); + label_file.read(reinterpret_cast<char*>(&num_labels), 4); + num_labels = swap_endian(num_labels); + CHECK_EQ(num_items, num_labels); + image_file.read(reinterpret_cast<char*>(&rows), 4); + rows = swap_endian(rows); + image_file.read(reinterpret_cast<char*>(&cols), 4); + cols = swap_endian(cols); + + auto store = singa::io::OpenStore("kvfile", output, singa::io::kCreate); + char label; + char* pixels = new char[rows * cols]; + int count = 0; + const int kMaxKeyLength = 10; + char key[kMaxKeyLength]; + string value; + + singa::SingleLabelImageRecord image; + image.add_shape(rows); + image.add_shape(cols); + LOG(INFO) << "A total of " << num_items << " items."; + LOG(INFO) << "Rows: " << rows << " Cols: " << cols; + for (int item_id = 0; item_id < num_items; ++item_id) { + image_file.read(pixels, rows * cols); + label_file.read(&label, 1); + image.set_pixel(pixels, rows*cols); + image.set_label(label); + snprintf(key, kMaxKeyLength, "%08d", item_id); + image.SerializeToString(&value); + store->Write(string(key), value); + } + delete pixels; + store->Flush(); + delete store; +} + +int main(int argc, char** argv) { + if (argc != 4) { + std::cout<<"This program create a DataShard for a MNIST dataset\n" + "Usage:\n" + " create_shard.bin input_image_file input_label_file output_db_file\n" + "The MNIST dataset could be downloaded at\n" + " http://yann.lecun.com/exdb/mnist/\n" + "You should gunzip them after downloading."; + } else { + google::InitGoogleLogging(argv[0]); + create_data(argv[1], 
argv[2], argv[3]); + } + return 0; +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/create_shard.cc ---------------------------------------------------------------------- diff --git a/examples/mnist/create_shard.cc b/examples/mnist/create_shard.cc deleted file mode 100644 index 3d7bd97..0000000 --- a/examples/mnist/create_shard.cc +++ /dev/null @@ -1,120 +0,0 @@ -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -// -// This code creates DataShard for MNIST dataset. 
-// It is adapted from the convert_mnist_data from Caffe -// -// Usage: -// create_shard.bin input_image_file input_label_file output_folder -// The MNIST dataset could be downloaded at -// http://yann.lecun.com/exdb/mnist/ - -#include <glog/logging.h> -#include <cstdint> -#include <iostream> - -#include <fstream> -#include <string> - -#include "utils/data_shard.h" -#include "utils/common.h" -#include "proto/common.pb.h" - -using singa::DataShard; -using singa::WriteProtoToBinaryFile; -using std::string; - -uint32_t swap_endian(uint32_t val) { - val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF); - return (val << 16) | (val >> 16); -} - -void create_shard(const char* image_filename, const char* label_filename, - const char* output) { - // Open files - std::ifstream image_file(image_filename, std::ios::in | std::ios::binary); - std::ifstream label_file(label_filename, std::ios::in | std::ios::binary); - CHECK(image_file) << "Unable to open file " << image_filename; - CHECK(label_file) << "Unable to open file " << label_filename; - // Read the magic and the meta data - uint32_t magic; - uint32_t num_items; - uint32_t num_labels; - uint32_t rows; - uint32_t cols; - - image_file.read(reinterpret_cast<char*>(&magic), 4); - magic = swap_endian(magic); - CHECK_EQ(magic, 2051) << "Incorrect image file magic."; - label_file.read(reinterpret_cast<char*>(&magic), 4); - magic = swap_endian(magic); - CHECK_EQ(magic, 2049) << "Incorrect label file magic."; - image_file.read(reinterpret_cast<char*>(&num_items), 4); - num_items = swap_endian(num_items); - label_file.read(reinterpret_cast<char*>(&num_labels), 4); - num_labels = swap_endian(num_labels); - CHECK_EQ(num_items, num_labels); - image_file.read(reinterpret_cast<char*>(&rows), 4); - rows = swap_endian(rows); - image_file.read(reinterpret_cast<char*>(&cols), 4); - cols = swap_endian(cols); - - DataShard shard(output, DataShard::kCreate); - char label; - char* pixels = new char[rows * cols]; - int count = 0; - const 
int kMaxKeyLength = 10; - char key[kMaxKeyLength]; - string value; - - singa::Record record; - singa::SingleLabelImageRecord* image=record.mutable_image(); - image->add_shape(rows); - image->add_shape(cols); - LOG(INFO) << "A total of " << num_items << " items."; - LOG(INFO) << "Rows: " << rows << " Cols: " << cols; - for (int item_id = 0; item_id < num_items; ++item_id) { - image_file.read(pixels, rows * cols); - label_file.read(&label, 1); - image->set_pixel(pixels, rows*cols); - image->set_label(label); - snprintf(key, kMaxKeyLength, "%08d", item_id); - shard.Insert(string(key), record); - } - delete pixels; - shard.Flush(); -} - -int main(int argc, char** argv) { - if (argc != 4) { - std::cout<<"This program create a DataShard for a MNIST dataset\n" - "Usage:\n" - " create_shard.bin input_image_file input_label_file output_db_file\n" - "The MNIST dataset could be downloaded at\n" - " http://yann.lecun.com/exdb/mnist/\n" - "You should gunzip them after downloading."; - } else { - google::InitGoogleLogging(argv[0]); - create_shard(argv[1], argv[2], argv[3]); - } - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/mnist/job.conf ---------------------------------------------------------------------- diff --git a/examples/mnist/job.conf b/examples/mnist/job.conf index 6d02561..bfbf17d 100644 --- a/examples/mnist/job.conf +++ b/examples/mnist/job.conf @@ -21,45 +21,37 @@ updater{ neuralnet { layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 1000 + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + random_skip: 5000 + batchsize: 64 + shape: 784 + std_value: 127.5 + mean_value: 127.5 } exclude: kTest } layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 1000 + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + 
batchsize: 100 + shape: 784 + std_value: 127.5 + mean_value: 127.5 } exclude: kTrain } layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 127.5 - norm_b: 1 - } - } - - - layer{ - name: "label" - type: kLabel - srclayers: "data" - } - - layer{ name: "fc1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 2500 } @@ -239,7 +231,7 @@ neuralnet { topk:1 } srclayers:"fc6" - srclayers:"label" + srclayers:"data" } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/autoencoder.conf ---------------------------------------------------------------------- diff --git a/examples/rbm/autoencoder.conf b/examples/rbm/autoencoder.conf index c818c6e..5799adf 100644 --- a/examples/rbm/autoencoder.conf +++ b/examples/rbm/autoencoder.conf @@ -21,44 +21,35 @@ updater{ neuralnet { layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 1000 + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 } exclude: kTest } layer { name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 1000 + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 } exclude: kTrain } - layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 - } - } - - layer{ - name: "label" - type: kLabel - srclayers: "data" - } layer{ name: "Inner1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 1000 } @@ -228,7 +219,7 @@ neuralnet { name: "loss" type:kEuclideanLoss srclayers:"Sigmoid8" - srclayers:"mnist" + srclayers:"data" } } cluster { http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/rbm1.conf 
---------------------------------------------------------------------- diff --git a/examples/rbm/rbm1.conf b/examples/rbm/rbm1.conf index d185766..1c23d47 100644 --- a/examples/rbm/rbm1.conf +++ b/examples/rbm/rbm1.conf @@ -17,42 +17,36 @@ updater{ } neuralnet { -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 100 - } - exclude: kTest -} - - -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 + } + exclude: kTest } - exclude: kTrain -} - -layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 + } + exclude: kTrain } -} layer{ name: "RBMVis" type: kRBMVis - srclayers:"mnist" + srclayers:"data" srclayers:"RBMHid" rbm_conf{ hdim: 1000 http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/rbm2.conf ---------------------------------------------------------------------- diff --git a/examples/rbm/rbm2.conf b/examples/rbm/rbm2.conf index 52dc698..2f51208 100644 --- a/examples/rbm/rbm2.conf +++ b/examples/rbm/rbm2.conf @@ -18,42 +18,36 @@ updater{ } neuralnet { -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 100 - } - exclude: kTest -} - - -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 + } + exclude: kTest } - exclude: kTrain -} - 
-layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 + } + exclude: kTrain } -} layer{ name: "Inner1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 1000 } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/rbm3.conf ---------------------------------------------------------------------- diff --git a/examples/rbm/rbm3.conf b/examples/rbm/rbm3.conf index 354fb3b..5df9ae3 100644 --- a/examples/rbm/rbm3.conf +++ b/examples/rbm/rbm3.conf @@ -20,42 +20,37 @@ updater{ neuralnet { -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 + } + exclude: kTest } - exclude: kTest -} - -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 + } + exclude: kTrain } - exclude: kTrain -} layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 - } -} - -layer{ name: "Inner1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 1000 } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rbm/rbm4.conf ---------------------------------------------------------------------- diff --git a/examples/rbm/rbm4.conf b/examples/rbm/rbm4.conf index ebf39fa..a34a75c 100644 --- a/examples/rbm/rbm4.conf +++ b/examples/rbm/rbm4.conf @@ -18,42 +18,37 @@ 
updater{ } neuralnet { -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_train_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/train_data.bin" + batchsize: 100 + std_value: 255 + shape: 784 + } + exclude: kTest } - exclude: kTest -} - -layer { - name: "data" - type: kShardData - sharddata_conf { - path: "examples/mnist/mnist_test_shard" - batchsize: 100 + layer { + name: "data" + type: kProtoRecord + store_conf { + backend: "kvfile" + path: "examples/mnist/test_data.bin" + std_value: 255 + batchsize: 100 + shape: 784 + } + exclude: kTrain } - exclude: kTrain -} -layer{ - name:"mnist" - type: kMnist - srclayers: "data" - mnist_conf { - norm_a: 255 - norm_b: 0 - } -} - -layer{ + layer{ name: "Inner1" type: kInnerProduct - srclayers:"mnist" + srclayers:"data" innerproduct_conf{ num_output: 1000 } http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/Makefile.example ---------------------------------------------------------------------- diff --git a/examples/rnnlm/Makefile.example b/examples/rnnlm/Makefile.example index 48efd17..5c5ef97 100644 --- a/examples/rnnlm/Makefile.example +++ b/examples/rnnlm/Makefile.example @@ -38,11 +38,11 @@ download: create: protoc --proto_path=../../src/proto --proto_path=. --cpp_out=. 
rnnlm.proto - $(CXX) create_shard.cc rnnlm.pb.cc -std=c++11 -lsinga -lprotobuf -lzookeeper_mt -lglog -I../../include -I../../include/proto \ + $(CXX) create_data.cc rnnlm.pb.cc -std=c++11 -lsinga -lprotobuf -lzookeeper_mt -lglog -I../../include -I../../include/proto \ -L../../.libs/ -L/usr/local/lib -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \ - -o create_shard.bin + -o create_data.bin for d in $(dirshards); do mkdir -p $${d}; done - ./create_shard.bin -train $(dirname)/train -test $(dirname)/test -valid $(dirname)/valid -class_size $(numclass) + ./create_data.bin -train $(dirname)/train -test $(dirname)/test -valid $(dirname)/valid -class_size $(numclass) rnnlm: http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/create_data.cc ---------------------------------------------------------------------- diff --git a/examples/rnnlm/create_data.cc b/examples/rnnlm/create_data.cc new file mode 100644 index 0000000..d63a8df --- /dev/null +++ b/examples/rnnlm/create_data.cc @@ -0,0 +1,472 @@ +/* + * This file include code from rnnlmlib-0.4 whose licence is as follows: +Copyright (c) 2010-2012 Tomas Mikolov +Copyright (c) 2013 Cantab Research Ltd +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither name of copyright holders nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. 
+ + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +* +*************************************************************/ + +// +// This code creates DataShard for RNNLM dataset. 
+// The RNNLM dataset could be downloaded at +// http://www.rnnlm.org/ +// +// Usage: +// create_shard.bin -train [train_file] -valid [valid_file] +// -test [test_file] -class_size [# of classes] + +#include <cstring> +#include <cstdlib> +#include <cstdio> +#include <cmath> +#include <algorithm> +#include <fstream> + +#include "io/store.h" +#include "utils/common.h" +#include "proto/common.pb.h" +#include "./rnnlm.pb.h" + +#define MAX_STRING 100 +#define BUFFER_LEN 32 +#define NL_STRING "</s>" + +using std::string; +using std::max; +using std::min; + +struct vocab_word { + int cn; + char word[MAX_STRING]; + int class_index; +}; + +struct vocab_word *vocab; +int vocab_max_size; +int vocab_size; +int *vocab_hash; +int vocab_hash_size; +int debug_mode; +int old_classes; +int *class_start; +int *class_end; +int class_size; + +char train_file[MAX_STRING]; +char valid_file[MAX_STRING]; +char test_file[MAX_STRING]; + +int valid_mode; +int test_mode; + +unsigned int getWordHash(char *word) { + unsigned int hash, a; + + hash = 0; + for (a = 0; a < strlen(word); a++) hash = hash * 237 + word[a]; + hash = hash % vocab_hash_size; + + return hash; +} + +int searchVocab(char *word) { + int a; + unsigned int hash; + + hash = getWordHash(word); + + if (vocab_hash[hash] == -1) return -1; + if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; + + for (a = 0; a < vocab_size; a++) { // search in vocabulary + if (!strcmp(word, vocab[a].word)) { + vocab_hash[hash] = a; + return a; + } + } + + return -1; // return OOV if not found +} + +int addWordToVocab(char *word) { + unsigned int hash; + + snprintf(vocab[vocab_size].word, strlen(word)+1, "%s", word); + vocab[vocab_size].cn = 0; + vocab_size++; + + if (vocab_size + 2 >= vocab_max_size) { // reallocate memory if needed + vocab_max_size += 100; + vocab = (struct vocab_word *) realloc( + vocab, + vocab_max_size * sizeof(struct vocab_word)); + } + + hash = getWordHash(word); + vocab_hash[hash] = vocab_size - 1; + + 
return vocab_size - 1; +} + +void readWord(char *word, FILE *fin) { + int a = 0, ch; + + while (!feof(fin)) { + ch = fgetc(fin); + + if (ch == 13) continue; + + if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { + if (a > 0) { + if (ch == '\n') ungetc(ch, fin); + break; + } + + if (ch == '\n') { + snprintf(word, strlen(NL_STRING) + 1, + "%s", const_cast<char *>(NL_STRING)); + return; + } else { + continue; + } + } + + word[a] = static_cast<char>(ch); + a++; + + if (a >= MAX_STRING) { + // printf("Too long word found!\n"); //truncate too long words + a--; + } + } + word[a] = 0; +} + +void sortVocab() { + int a, b, max; + vocab_word swap; + + for (a = 1; a < vocab_size; a++) { + max = a; + for (b = a + 1; b < vocab_size; b++) + if (vocab[max].cn < vocab[b].cn) max = b; + + swap = vocab[max]; + vocab[max] = vocab[a]; + vocab[a] = swap; + } +} + +int learnVocabFromTrainFile() { + char word[MAX_STRING]; + FILE *fin; + int a, i, train_wcn; + + for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; + + fin = fopen(train_file, "rb"); + + vocab_size = 0; + + addWordToVocab(const_cast<char *>(NL_STRING)); + + train_wcn = 0; + while (1) { + readWord(word, fin); + if (feof(fin)) break; + + train_wcn++; + + i = searchVocab(word); + if (i == -1) { + a = addWordToVocab(word); + vocab[a].cn = 1; + } else { + vocab[i].cn++; + } + } + + sortVocab(); + + if (debug_mode > 0) { + printf("Vocab size: %d\n", vocab_size); + printf("Words in train file: %d\n", train_wcn); + } + + fclose(fin); + return 0; +} + +int splitClasses() { + double df, dd; + int i, a, b; + + df = 0; + dd = 0; + a = 0; + b = 0; + + class_start = reinterpret_cast<int *>(calloc(class_size, sizeof(int))); + memset(class_start, 0x7f, sizeof(int) * class_size); + class_end = reinterpret_cast<int *>(calloc(class_size, sizeof(int))); + memset(class_end, 0, sizeof(int) * class_size); + + if (old_classes) { // old classes + for (i = 0; i < vocab_size; i++) + b += vocab[i].cn; + for (i = 0; i < vocab_size; i++) { + df += 
vocab[i].cn / static_cast<double>(b); + if (df > 1) df = 1; + if (df > (a + 1) / static_cast<double>(class_size)) { + vocab[i].class_index = a; + if (a < class_size - 1) a++; + } else { + vocab[i].class_index = a; + } + } + } else { // new classes + for (i = 0; i < vocab_size; i++) + b += vocab[i].cn; + for (i = 0; i < vocab_size; i++) + dd += sqrt(vocab[i].cn / static_cast<double>(b)); + for (i = 0; i < vocab_size; i++) { + df += sqrt(vocab[i].cn / static_cast<double>(b)) / dd; + if (df > 1) df = 1; + if (df > (a + 1) / static_cast<double>(class_size)) { + vocab[i].class_index = a; + if (a < class_size - 1) a++; + } else { + vocab[i].class_index = a; + } + } + } + + // after dividing classes, update class start and class end information + for (i = 0; i < vocab_size; i++) { + a = vocab[i].class_index; + class_start[a] = min(i, class_start[a]); + class_end[a] = max(i + 1, class_end[a]); + } + return 0; +} + +int init_class() { + // debug_mode = 1; + debug_mode = 0; + vocab_max_size = 100; // largest length value for each word + vocab_size = 0; + vocab = (struct vocab_word *) calloc(vocab_max_size, + sizeof(struct vocab_word)); + vocab_hash_size = 100000000; + vocab_hash = reinterpret_cast<int *>(calloc(vocab_hash_size, sizeof(int))); + old_classes = 1; + + // read vocab + learnVocabFromTrainFile(); + + // split classes + splitClasses(); + + return 0; +} + +int create_data(const char *input_file, const char *output) { + auto* store = singa::io::OpenStore("kvfile", output, singa::io::kCreate); + WordRecord wordRecord; + + FILE *fin; + int a, i; + fin = fopen(input_file, "rb"); + + int wcnt = 0; + char key[BUFFER_LEN]; + char wordstr[MAX_STRING]; + string value; + while (1) { + readWord(wordstr, fin); + if (feof(fin)) break; + i = searchVocab(wordstr); + if (i == -1) { + if (debug_mode) printf("unknown word [%s] detected!", wordstr); + } else { + wordRecord.set_word(string(wordstr)); + wordRecord.set_word_index(i); + int class_idx = vocab[i].class_index; + 
wordRecord.set_class_index(class_idx); + wordRecord.set_class_start(class_start[class_idx]); + wordRecord.set_class_end(class_end[class_idx]); + int length = snprintf(key, BUFFER_LEN, "%05d", wcnt++); + wordRecord.SerializeToString(&value); + store->Write(string(key, length), value); + } + } + + fclose(fin); + store->Flush(); + delete store; + return 0; +} + +int argPos(char *str, int argc, char **argv) { + int a; + + for (a = 1; a < argc; a++) + if (!strcmp(str, argv[a])) + return a; + + return -1; +} + +int main(int argc, char **argv) { + int i; + FILE *f; + + // set debug mode + i = argPos(const_cast<char *>("-debug"), argc, argv); + if (i > 0) { + debug_mode = 1; + if (debug_mode > 0) + printf("debug mode: %d\n", debug_mode); + } + + // search for train file + i = argPos(const_cast<char *>("-train"), argc, argv); + if (i > 0) { + if (i + 1 == argc) { + printf("ERROR: training data file not specified!\n"); + return 0; + } + + snprintf(train_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]); + + if (debug_mode > 0) + printf("train file: %s\n", train_file); + + f = fopen(train_file, "rb"); + if (f == NULL) { + printf("ERROR: training data file not found!\n"); + return 0; + } + fclose(f); + } else { + printf("ERROR: training data must be set.\n"); + } + + // search for valid file + i = argPos(const_cast<char *>("-valid"), argc, argv); + if (i > 0) { + if (i + 1 == argc) { + printf("ERROR: validating data file not specified!\n"); + return 0; + } + + snprintf(valid_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]); + + if (debug_mode > 0) + printf("valid file: %s\n", valid_file); + + f = fopen(valid_file, "rb"); + if (f == NULL) { + printf("ERROR: validating data file not found!\n"); + return 0; + } + fclose(f); + valid_mode = 1; + } + + // search for test file + i = argPos(const_cast<char *>("-test"), argc, argv); + if (i > 0) { + if (i + 1 == argc) { + printf("ERROR: testing data file not specified!\n"); + return 0; + } + + snprintf(test_file, strlen(argv[i + 1])+1, 
"%s", argv[i + 1]); + + if (debug_mode > 0) + printf("test file: %s\n", test_file); + + f = fopen(test_file, "rb"); + if (f == NULL) { + printf("ERROR: testing data file not found!\n"); + return 0; + } + fclose(f); + test_mode = 1; + } + + // search for class size + i = argPos(const_cast<char *>("-class_size"), argc, argv); + if (i > 0) { + if (i + 1 == argc) { + printf("ERROR: class size not specified!\n"); + return 0; + } + + class_size = atoi(argv[i + 1]); + + if (debug_mode > 0) + printf("class size: %d\n", class_size); + } + if (class_size <= 0) { + printf("ERROR: no or invalid class size received!\n"); + return 0; + } + + init_class(); + + create_data(train_file, "train_data.bin"); + if (valid_mode) create_data(valid_file, "valid_data.bin"); + if (test_mode) create_data(test_file, "test_data.bin"); + + return 0; +} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/create_shard.cc ---------------------------------------------------------------------- diff --git a/examples/rnnlm/create_shard.cc b/examples/rnnlm/create_shard.cc deleted file mode 100644 index 536ce1f..0000000 --- a/examples/rnnlm/create_shard.cc +++ /dev/null @@ -1,471 +0,0 @@ -/* - * This file include code from rnnlmlib-0.4 whose licence is as follows: -Copyright (c) 2010-2012 Tomas Mikolov -Copyright (c) 2013 Cantab Research Ltd -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. 
Neither name of copyright holders nor the names of its contributors -may be used to endorse or promote products derived from this software -without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -/************************************************************ -* -* Licensed to the Apache Software Foundation (ASF) under one -* or more contributor license agreements. See the NOTICE file -* distributed with this work for additional information -* regarding copyright ownership. The ASF licenses this file -* to you under the Apache License, Version 2.0 (the -* "License"); you may not use this file except in compliance -* with the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an -* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -* KIND, either express or implied. See the License for the -* specific language governing permissions and limitations -* under the License. -* -*************************************************************/ - -// -// This code creates DataShard for RNNLM dataset. 
-// The RNNLM dataset could be downloaded at -// http://www.rnnlm.org/ -// -// Usage: -// create_shard.bin -train [train_file] -valid [valid_file] -// -test [test_file] -class_size [# of classes] - -#include <cstring> -#include <cstdlib> -#include <cstdio> -#include <cmath> -#include <algorithm> -#include <fstream> - -#include "utils/data_shard.h" -#include "utils/common.h" -#include "proto/common.pb.h" -#include "./rnnlm.pb.h" - -#define MAX_STRING 100 -#define BUFFER_LEN 32 -#define NL_STRING "</s>" - -using std::string; -using std::max; -using std::min; -using singa::DataShard; - -struct vocab_word { - int cn; - char word[MAX_STRING]; - int class_index; -}; - -struct vocab_word *vocab; -int vocab_max_size; -int vocab_size; -int *vocab_hash; -int vocab_hash_size; -int debug_mode; -int old_classes; -int *class_start; -int *class_end; -int class_size; - -char train_file[MAX_STRING]; -char valid_file[MAX_STRING]; -char test_file[MAX_STRING]; - -int valid_mode; -int test_mode; - -unsigned int getWordHash(char *word) { - unsigned int hash, a; - - hash = 0; - for (a = 0; a < strlen(word); a++) hash = hash * 237 + word[a]; - hash = hash % vocab_hash_size; - - return hash; -} - -int searchVocab(char *word) { - int a; - unsigned int hash; - - hash = getWordHash(word); - - if (vocab_hash[hash] == -1) return -1; - if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; - - for (a = 0; a < vocab_size; a++) { // search in vocabulary - if (!strcmp(word, vocab[a].word)) { - vocab_hash[hash] = a; - return a; - } - } - - return -1; // return OOV if not found -} - -int addWordToVocab(char *word) { - unsigned int hash; - - snprintf(vocab[vocab_size].word, strlen(word)+1, "%s", word); - vocab[vocab_size].cn = 0; - vocab_size++; - - if (vocab_size + 2 >= vocab_max_size) { // reallocate memory if needed - vocab_max_size += 100; - vocab = (struct vocab_word *) realloc( - vocab, - vocab_max_size * sizeof(struct vocab_word)); - } - - hash = getWordHash(word); - 
vocab_hash[hash] = vocab_size - 1; - - return vocab_size - 1; -} - -void readWord(char *word, FILE *fin) { - int a = 0, ch; - - while (!feof(fin)) { - ch = fgetc(fin); - - if (ch == 13) continue; - - if ((ch == ' ') || (ch == '\t') || (ch == '\n')) { - if (a > 0) { - if (ch == '\n') ungetc(ch, fin); - break; - } - - if (ch == '\n') { - snprintf(word, strlen(NL_STRING) + 1, - "%s", const_cast<char *>(NL_STRING)); - return; - } else { - continue; - } - } - - word[a] = static_cast<char>(ch); - a++; - - if (a >= MAX_STRING) { - // printf("Too long word found!\n"); //truncate too long words - a--; - } - } - word[a] = 0; -} - -void sortVocab() { - int a, b, max; - vocab_word swap; - - for (a = 1; a < vocab_size; a++) { - max = a; - for (b = a + 1; b < vocab_size; b++) - if (vocab[max].cn < vocab[b].cn) max = b; - - swap = vocab[max]; - vocab[max] = vocab[a]; - vocab[a] = swap; - } -} - -int learnVocabFromTrainFile() { - char word[MAX_STRING]; - FILE *fin; - int a, i, train_wcn; - - for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; - - fin = fopen(train_file, "rb"); - - vocab_size = 0; - - addWordToVocab(const_cast<char *>(NL_STRING)); - - train_wcn = 0; - while (1) { - readWord(word, fin); - if (feof(fin)) break; - - train_wcn++; - - i = searchVocab(word); - if (i == -1) { - a = addWordToVocab(word); - vocab[a].cn = 1; - } else { - vocab[i].cn++; - } - } - - sortVocab(); - - if (debug_mode > 0) { - printf("Vocab size: %d\n", vocab_size); - printf("Words in train file: %d\n", train_wcn); - } - - fclose(fin); - return 0; -} - -int splitClasses() { - double df, dd; - int i, a, b; - - df = 0; - dd = 0; - a = 0; - b = 0; - - class_start = reinterpret_cast<int *>(calloc(class_size, sizeof(int))); - memset(class_start, 0x7f, sizeof(int) * class_size); - class_end = reinterpret_cast<int *>(calloc(class_size, sizeof(int))); - memset(class_end, 0, sizeof(int) * class_size); - - if (old_classes) { // old classes - for (i = 0; i < vocab_size; i++) - b += vocab[i].cn; - for 
(i = 0; i < vocab_size; i++) { - df += vocab[i].cn / static_cast<double>(b); - if (df > 1) df = 1; - if (df > (a + 1) / static_cast<double>(class_size)) { - vocab[i].class_index = a; - if (a < class_size - 1) a++; - } else { - vocab[i].class_index = a; - } - } - } else { // new classes - for (i = 0; i < vocab_size; i++) - b += vocab[i].cn; - for (i = 0; i < vocab_size; i++) - dd += sqrt(vocab[i].cn / static_cast<double>(b)); - for (i = 0; i < vocab_size; i++) { - df += sqrt(vocab[i].cn / static_cast<double>(b)) / dd; - if (df > 1) df = 1; - if (df > (a + 1) / static_cast<double>(class_size)) { - vocab[i].class_index = a; - if (a < class_size - 1) a++; - } else { - vocab[i].class_index = a; - } - } - } - - // after dividing classes, update class start and class end information - for (i = 0; i < vocab_size; i++) { - a = vocab[i].class_index; - class_start[a] = min(i, class_start[a]); - class_end[a] = max(i + 1, class_end[a]); - } - return 0; -} - -int init_class() { - // debug_mode = 1; - debug_mode = 0; - vocab_max_size = 100; // largest length value for each word - vocab_size = 0; - vocab = (struct vocab_word *) calloc(vocab_max_size, - sizeof(struct vocab_word)); - vocab_hash_size = 100000000; - vocab_hash = reinterpret_cast<int *>(calloc(vocab_hash_size, sizeof(int))); - old_classes = 1; - - // read vocab - learnVocabFromTrainFile(); - - // split classes - splitClasses(); - - return 0; -} - -int create_shard(const char *input_file, const char *output) { - DataShard dataShard(output, DataShard::kCreate); - singa::Record record; - auto* wordRecord = record.MutableExtension(word); - - FILE *fin; - int a, i; - fin = fopen(input_file, "rb"); - - int wcnt = 0; - char key[BUFFER_LEN]; - char wordstr[MAX_STRING]; - while (1) { - readWord(wordstr, fin); - if (feof(fin)) break; - i = searchVocab(wordstr); - if (i == -1) { - if (debug_mode) printf("unknown word [%s] detected!", wordstr); - } else { - wordRecord->set_word(string(wordstr)); - wordRecord->set_word_index(i); - 
int class_idx = vocab[i].class_index; - wordRecord->set_class_index(class_idx); - wordRecord->set_class_start(class_start[class_idx]); - wordRecord->set_class_end(class_end[class_idx]); - int length = snprintf(key, BUFFER_LEN, "%05d", wcnt++); - dataShard.Insert(string(key, length), record); - } - } - - dataShard.Flush(); - fclose(fin); - return 0; -} - -int argPos(char *str, int argc, char **argv) { - int a; - - for (a = 1; a < argc; a++) - if (!strcmp(str, argv[a])) - return a; - - return -1; -} - -int main(int argc, char **argv) { - int i; - FILE *f; - - // set debug mode - i = argPos(const_cast<char *>("-debug"), argc, argv); - if (i > 0) { - debug_mode = 1; - if (debug_mode > 0) - printf("debug mode: %d\n", debug_mode); - } - - // search for train file - i = argPos(const_cast<char *>("-train"), argc, argv); - if (i > 0) { - if (i + 1 == argc) { - printf("ERROR: training data file not specified!\n"); - return 0; - } - - snprintf(train_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]); - - if (debug_mode > 0) - printf("train file: %s\n", train_file); - - f = fopen(train_file, "rb"); - if (f == NULL) { - printf("ERROR: training data file not found!\n"); - return 0; - } - fclose(f); - } else { - printf("ERROR: training data must be set.\n"); - } - - // search for valid file - i = argPos(const_cast<char *>("-valid"), argc, argv); - if (i > 0) { - if (i + 1 == argc) { - printf("ERROR: validating data file not specified!\n"); - return 0; - } - - snprintf(valid_file, strlen(argv[i + 1])+1, "%s", argv[i + 1]); - - if (debug_mode > 0) - printf("valid file: %s\n", valid_file); - - f = fopen(valid_file, "rb"); - if (f == NULL) { - printf("ERROR: validating data file not found!\n"); - return 0; - } - fclose(f); - valid_mode = 1; - } - - // search for test file - i = argPos(const_cast<char *>("-test"), argc, argv); - if (i > 0) { - if (i + 1 == argc) { - printf("ERROR: testing data file not specified!\n"); - return 0; - } - - snprintf(test_file, strlen(argv[i + 1])+1, "%s", 
argv[i + 1]); - - if (debug_mode > 0) - printf("test file: %s\n", test_file); - - f = fopen(test_file, "rb"); - if (f == NULL) { - printf("ERROR: testing data file not found!\n"); - return 0; - } - fclose(f); - test_mode = 1; - } - - // search for class size - i = argPos(const_cast<char *>("-class_size"), argc, argv); - if (i > 0) { - if (i + 1 == argc) { - printf("ERROR: class size not specified!\n"); - return 0; - } - - class_size = atoi(argv[i + 1]); - - if (debug_mode > 0) - printf("class size: %d\n", class_size); - } - if (class_size <= 0) { - printf("ERROR: no or invalid class size received!\n"); - return 0; - } - - init_class(); - - create_shard(train_file, "train_shard"); - if (valid_mode) create_shard(valid_file, "valid_shard"); - if (test_mode) create_shard(test_file, "test_shard"); - - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/job.conf ---------------------------------------------------------------------- diff --git a/examples/rnnlm/job.conf b/examples/rnnlm/job.conf index 021692f..a1f803d 100644 --- a/examples/rnnlm/job.conf +++ b/examples/rnnlm/job.conf @@ -33,7 +33,8 @@ layer { name: "data" user_type: "kData" [data_conf] { - path: "examples/rnnlm/train_shard" + backend: "kvfile" + path: "examples/rnnlm/train_data.bin" max_window: 10 } exclude: kVal @@ -43,19 +44,13 @@ layer { name: "data" user_type: "kData" [data_conf] { - path: "examples/rnnlm/valid_shard" + path: "examples/rnnlm/valid_data.bin" max_window: 10 } exclude: kTrain } layer{ - name:"label" - user_type: "kLabel" - srclayers: "data" -} - -layer{ name: "embedding" user_type: "kEmbedding" srclayers: "data" @@ -90,7 +85,7 @@ layer{ name: "loss" user_type: "kLoss" srclayers:"hidden" - srclayers:"label" + srclayers:"data" [loss_conf] { nclass:100 vocab_size: 3720 http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/main.cc ---------------------------------------------------------------------- diff --git 
a/examples/rnnlm/main.cc b/examples/rnnlm/main.cc index ea1dcdd..ceb8eb7 100644 --- a/examples/rnnlm/main.cc +++ b/examples/rnnlm/main.cc @@ -36,7 +36,6 @@ int main(int argc, char **argv) { driver.RegisterLayer<rnnlm::HiddenLayer, std::string>("kHidden"); driver.RegisterLayer<rnnlm::LossLayer, std::string>("kLoss"); driver.RegisterLayer<rnnlm::DataLayer, std::string>("kData"); - driver.RegisterLayer<rnnlm::LabelLayer, std::string>("kLabel"); singa::JobProto jobConf = driver.job_conf(); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/rnnlm.cc ---------------------------------------------------------------------- diff --git a/examples/rnnlm/rnnlm.cc b/examples/rnnlm/rnnlm.cc index c086972..a6b65f7 100644 --- a/examples/rnnlm/rnnlm.cc +++ b/examples/rnnlm/rnnlm.cc @@ -52,42 +52,62 @@ inline Tensor<cpu, 1> RTensor1(Blob<float>* blob) { /*******DataLayer**************/ DataLayer::~DataLayer() { - if (shard_ != nullptr) - delete shard_; - shard_ = nullptr; + if (store_ != nullptr) + delete store_; } void DataLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) { RNNLayer::Setup(conf, srclayers); - shard_ = new singa::DataShard( - conf.GetExtension(data_conf).path(), - singa::DataShard::kRead); string key; max_window_ = conf.GetExtension(data_conf).max_window(); - records_.resize(max_window_ + 1); // resize to # of records in data layer + data_.Reshape(vector<int>{max_window_ + 1, 4}); window_ = 0; - shard_->Next(&key, &records_[window_]); +} + +void SetInst(int k, WordRecord& word, Blob<float>* to) { + float* dptr = to->mutable_cpu_data() + k * 4; + dptr[0] = static_cast<float>(word.word_index()); + dptr[1] = static_cast<float>(word.class_index()); + dptr[2] = static_cast<float>(word.class_start()); + dptr[3] = static_cast<float>(word.class_end()); +} + +void ShiftInst(int from, int to, Blob<float>* data) { + const float* f = data->cpu_data() + from * 4; + float* t = data->mutable_cpu_data() + to * 4; + // hard code 
the feature dim to be 4; + t[0] = f[0]; t[1] = f[1]; t[2] = f[2]; t[3] = f[3]; } void DataLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { - CHECK(records_.size() <= shard_->Count()); - records_[0] = records_[window_]; + string key, value; + WordRecord word; + if (store_ == nullptr) { + store_ = singa::io::OpenStore( + layer_conf_.GetExtension(data_conf).backend(), + layer_conf_.GetExtension(data_conf).path(), + singa::io::kRead); + store_->Read(&key, &value); + word.ParseFromString(value); + SetInst(0, word, &data_); + } + ShiftInst(window_, 0, &data_); window_ = max_window_; for (int i = 1; i <= max_window_; i++) { - string key; - if (shard_->Next(&key, &records_[i])) { - if (records_[i].GetExtension(word).word_index() == 0) { - window_ = i; - break; - } - } else { - shard_->SeekToFirst(); - CHECK(shard_->Next(&key, &records_[i])); + if (!store_->Read(&key, &value)) { + store_->SeekToFirst(); + CHECK(store_->Read(&key, &value)); + } + word.ParseFromString(value); + SetInst(i, word, &data_); + if (word.word_index() == 0) { + window_ = i; + break; } } } -/*******LabelLayer**************/ +/*******LabelLayer************** void LabelLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) { RNNLayer::Setup(conf, srclayers); @@ -108,6 +128,7 @@ void LabelLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { label[4 * i + 3] = wordrecord.class_index(); } } +*/ /*******EmbeddingLayer**************/ EmbeddingLayer::~EmbeddingLayer() { @@ -118,7 +139,7 @@ void EmbeddingLayer::Setup(const LayerProto& conf, const vector<Layer*>& srclayers) { RNNLayer::Setup(conf, srclayers); CHECK_EQ(srclayers.size(), 1); - int max_window = dynamic_cast<DataLayer*>(srclayers[0])->max_window(); + int max_window = srclayers[0]->data(this).shape()[0]; word_dim_ = conf.GetExtension(embedding_conf).word_dim(); data_.Reshape(vector<int>{max_window, word_dim_}); grad_.ReshapeLike(data_); @@ -130,12 +151,12 @@ void EmbeddingLayer::Setup(const LayerProto& 
conf, void EmbeddingLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { auto datalayer = dynamic_cast<DataLayer*>(srclayers[0]); window_ = datalayer->window(); - auto records = datalayer->records(); auto words = RTensor2(&data_); auto embed = RTensor2(embed_->mutable_data()); + const float* idxptr = datalayer->data(this).cpu_data(); for (int t = 0; t < window_; t++) { - int idx = static_cast<int>(records[t].GetExtension(word).word_index()); + int idx = static_cast<int>(idxptr[t * 4]); CHECK_GE(idx, 0); CHECK_LT(idx, vocab_size_); Copy(words[t], embed[idx]); @@ -147,10 +168,10 @@ void EmbeddingLayer::ComputeGradient(int flag, auto grad = RTensor2(&grad_); auto gembed = RTensor2(embed_->mutable_grad()); auto datalayer = dynamic_cast<DataLayer*>(srclayers[0]); - auto records = datalayer->records(); gembed = 0; + const float* idxptr = datalayer->data(this).cpu_data(); for (int t = 0; t < window_; t++) { - int idx = static_cast<int>(records[t].GetExtension(word).word_index()); + int idx = static_cast<int>(idxptr[t * 4]); Copy(gembed[idx], grad[t]); } } @@ -241,8 +262,9 @@ void LossLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { float loss = 0.f, ppl = 0.f; for (int t = 0; t < window_; t++) { - int start = static_cast<int>(label[t * 4 + 0]); - int end = static_cast<int>(label[t * 4 + 1]); + // label is the next word + int start = static_cast<int>(label[(t + 1) * 4 + 2]); + int end = static_cast<int>(label[(t + 1) * 4 + 3]); auto wordWeight = word_weight.Slice(start, end); CHECK_GT(end, start); @@ -254,8 +276,8 @@ void LossLayer::ComputeFeature(int flag, const vector<Layer*>& srclayers) { pclass[t] = dot(src[t], class_weight.T()); Softmax(pclass[t], pclass[t]); - int wid = static_cast<int>(label[t * 4 + 2]); - int cid = static_cast<int>(label[t * 4 + 3]); + int wid = static_cast<int>(label[(t + 1) * 4 + 0]); + int cid = static_cast<int>(label[(t + 1) * 4 + 1]); CHECK_GT(end, wid); CHECK_GE(wid, start); loss_ += -log(std::max(pword[wid - 
start] * pclass[t][cid], FLT_MIN)); @@ -276,10 +298,10 @@ void LossLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) { gclass_weight = 0; gword_weight = 0; for (int t = 0; t < window_; t++) { - int start = static_cast<int>(label[t * 4 + 0]); - int end = static_cast<int>(label[t * 4 + 1]); - int wid = static_cast<int>(label[t * 4 + 2]); - int cid = static_cast<int>(label[t * 4 + 3]); + int start = static_cast<int>(label[(t + 1) * 4 + 2]); + int end = static_cast<int>(label[(t + 1) * 4 + 3]); + int wid = static_cast<int>(label[(t + 1) * 4 + 0]); + int cid = static_cast<int>(label[(t + 1) * 4 + 1]); auto pword = RTensor1(&pword_[t]); CHECK_GT(end, wid); CHECK_GE(wid, start); @@ -304,6 +326,9 @@ void LossLayer::ComputeGradient(int flag, const vector<Layer*>& srclayers) { const std::string LossLayer::ToString(bool debug, int flag) { float loss = loss_ / num_; float ppl = exp10(- ppl_ / num_); + loss_ = 0; + num_ = 0; + ppl_ = 0; return "loss = " + std::to_string(loss) + ", ppl = " + std::to_string(ppl); } } // end of namespace rnnlm http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/rnnlm.h ---------------------------------------------------------------------- diff --git a/examples/rnnlm/rnnlm.h b/examples/rnnlm/rnnlm.h index 8ad7a68..cb3198e 100644 --- a/examples/rnnlm/rnnlm.h +++ b/examples/rnnlm/rnnlm.h @@ -69,20 +69,20 @@ class DataLayer : public RNNLayer, public singa::DataLayer { private: int max_window_; - singa::DataShard* shard_; + singa::io::Store* store_ = nullptr; }; /** * LabelLayer that read records_[1] to records_[window_] from DataLayer to * offer label information - */ class LabelLayer : public RNNLayer { public: void Setup(const LayerProto& conf, const vector<Layer*>& srclayers) override; void ComputeFeature(int flag, const vector<Layer*>& srclayers) override; void ComputeGradient(int flag, const vector<Layer*>& srclayers) override {} }; + */ /** 
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/examples/rnnlm/rnnlm.proto ---------------------------------------------------------------------- diff --git a/examples/rnnlm/rnnlm.proto b/examples/rnnlm/rnnlm.proto index 8feb3f9..8cfec86 100644 --- a/examples/rnnlm/rnnlm.proto +++ b/examples/rnnlm/rnnlm.proto @@ -35,6 +35,7 @@ message LossProto { message DataProto { required string path = 1; optional int32 max_window = 2; + optional string backend = 3 [default = "kvfile"]; } extend singa.LayerProto { @@ -50,7 +51,3 @@ message WordRecord { optional int32 class_start = 4; optional int32 class_end = 5; } - -extend singa.Record { - optional WordRecord word = 101; -} http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/io/kvfile_store.h ---------------------------------------------------------------------- diff --git a/include/io/kvfile_store.h b/include/io/kvfile_store.h index bda7409..c3fd868 100644 --- a/include/io/kvfile_store.h +++ b/include/io/kvfile_store.h @@ -36,6 +36,7 @@ namespace singa { namespace io { */ class KVFileStore : public Store { public: + ~KVFileStore() { Close();} bool Open(const std::string& source, Mode mode) override; void Close() override; bool Read(std::string* key, std::string* value) override; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/io/store.h ---------------------------------------------------------------------- diff --git a/include/io/store.h b/include/io/store.h index 8665af0..f3cc282 100644 --- a/include/io/store.h +++ b/include/io/store.h @@ -38,6 +38,10 @@ enum Mode { kCreate, kRead, kAppend }; class Store { public: Store() { } + /** + * In case that users forget to call Close() to release resources, e.g., + * memory, you can release them here. + */ virtual ~Store() { } /** * @param[in] source path to the storage, could be a file path, folder path @@ -46,6 +50,9 @@ class Store { * @return true if open successfully, otherwise false. 
*/ virtual bool Open(const std::string& source, Mode mode) = 0; + /** + * Release resources. + */ virtual void Close() = 0; /** * Read a tuple. @@ -73,7 +80,22 @@ class Store { virtual void Flush() {} }; -Store* CreateStore(const std::string& store); +/** + * Create a Store object. + * + * @param[in] backend identifier for a specific backend. Two backends are + * inluced currently, i.e., "kvfile", "textfile" + * @return a pointer to the newly created Store. + */ +Store* CreateStore(const string& backend); +/** + * Create and open a Store object. + * + * @param[in] backend, @see CreateStore(). + * @param[in] path + * @param[in] mode kRead or kCreate or kAppend + */ +Store* OpenStore(const string& backend, const string& path, Mode mode); } // namespace io } /* singa */ #endif // SINGA_IO_STORE_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/io/textfile_store.h ---------------------------------------------------------------------- diff --git a/include/io/textfile_store.h b/include/io/textfile_store.h index 4c020e9..788dc20 100644 --- a/include/io/textfile_store.h +++ b/include/io/textfile_store.h @@ -32,6 +32,7 @@ namespace singa { namespace io { */ class TextFileStore : public Store { public: + ~TextFileStore() { Close(); } bool Open(const std::string& source, Mode mode) override; void Close() override; bool Read(std::string* key, std::string* value) override; http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/neuralnet/input_layer.h ---------------------------------------------------------------------- diff --git a/include/neuralnet/input_layer.h b/include/neuralnet/input_layer.h index b5f2dd4..4dfded0 100644 --- a/include/neuralnet/input_layer.h +++ b/include/neuralnet/input_layer.h @@ -26,6 +26,7 @@ #include <vector> #include "neuralnet/layer.h" #include "utils/data_shard.h" +#include "io/store.h" /** * \file this file includes the declarations of input layers that inherit the * base InputLayer to load 
input features. @@ -40,6 +41,126 @@ * ParserLayer. */ namespace singa { +using std::string; +using std::vector; + +/************************Start of new input layers***************************/ +/** + * Base class for loading data from Store. + */ +class StoreInputLayer : virtual public InputLayer { + public: + ~StoreInputLayer(); + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + void ComputeFeature(int flag, const vector<Layer*>& srclayers) override; + + ConnectionType dst_layer_connection() const override { return kOneToMany; } + + protected: + /** + * Parsing the (key, val) tuple to get feature (and label). + * Subclasses must implment this function. + * @param[in] k parse this tuple as the k-th instance of one mini-batch. + * @param[in] flag used to guide the parsing, e.g., kDeploy phase should not + * parse labels from the tuple. + * @param[in] key + * @param[in] val + */ + virtual bool Parse(int k, int flag, const string& key, const string& val) = 0; + + protected: + int batchsize_; + io::Store* store_ = nullptr; +}; + +/** + * Base layer for parsing a key-value tuple as a feature vector with fixed + * length. The feature shape is indicated by users in the configuration. + * Each tuple may has a label. + */ +class SingleLabelRecordLayer : public StoreInputLayer { + public: + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + void ComputeFeature(int flag, const vector<Layer*>& srclayers) override; + + protected: + /** + * Load a single record (tuple), e.g., the mean or standard variance vector. 
+ */ + virtual void LoadRecord(const string& backend, const string& path, + Blob<float>* to) = 0; + + protected: + /** + * Feature standardization by processing each feature dimension via + * @f$ y = (x - mu)/ std @f$ + * <a href= "http://ufldl.stanford.edu/wiki/index.php/Data_Preprocessing"> + * UFLDL</a> + */ + Blob<float> mean_, std_; +}; + +/** + * Specific layer that parses the value string loaded by Store into a + * SingleLabelImageRecord. + */ +class ProtoRecordLayer : public SingleLabelRecordLayer { + public: + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + + protected: + /** + * Parse key as instance ID and val into SingleLabelImageRecord. + * @copydetails StoreInputLayer::Parse() + */ + bool Parse(int k, int flag, const string& key, const string& val) override; + void LoadRecord(const string& backend, + const string& path, + Blob<float>* to) override; + + private: + // TODO(wangwei) decode the image + bool encoded_; +}; + +/** + * Specific layer that parses the value string loaded by Store as a line from + * a CSV file. + * + * It assumes the first column is the label except that has_label_ is configured + * to false. Or the data is used in deploy mode. + */ +class CSVRecordLayer : public SingleLabelRecordLayer { + public: + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + + protected: + bool Parse(int k, int flag, const string& key, const string& val) override; + void LoadRecord(const string& backend, + const string& path, + Blob<float>* to) override; + + private: + std::string sep_; + bool has_label_; +}; + +/** + * Do preprocessing for images, including cropping, mirroring, resizing. 
+ */ +class ImagePreprocessLayer : public InputLayer { + public: + void Setup(const LayerProto& proto, const vector<Layer*>& srclayers) override; + void ComputeFeature(int flag, const vector<Layer*>& srclayers); + + private: + bool mirror_ = false; + int cropsize_ = 0; + int resize_ = 0; + float scale_ = 1; +}; + +/************************End of new input layers***************************/ /** * Base layer for reading ::Record from local Shard, HDFS, lmdb, etc. */ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/neuralnet/layer.h ---------------------------------------------------------------------- diff --git a/include/neuralnet/layer.h b/include/neuralnet/layer.h index bf83163..5ed0c7e 100644 --- a/include/neuralnet/layer.h +++ b/include/neuralnet/layer.h @@ -34,6 +34,8 @@ namespace singa { using std::vector; +// TODO(wangwei) make AuxType a template argument for Layer. +using AuxType = int; /** * Base layer class. * @@ -186,6 +188,12 @@ class Layer { return &data_; } /** + * @return auxiliary data, e.g., image label. + */ + virtual const vector<AuxType>& aux_data(const Layer* from = nullptr) const { + return aux_data_; + } + /** * @see data(). * @return the const ref of the Blob for the gradient of this layer, mainly * used in BP algorithm. 
@@ -205,6 +213,7 @@ class Layer { protected: LayerProto layer_conf_; Blob<float> data_, grad_; + vector<AuxType> aux_data_; }; /** http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/singa.h ---------------------------------------------------------------------- diff --git a/include/singa.h b/include/singa.h index 6c801ab..63acb0a 100644 --- a/include/singa.h +++ b/include/singa.h @@ -31,6 +31,7 @@ #include "utils/param.h" #include "utils/singleton.h" #include "utils/factory.h" +#include "io/store.h" #include "./driver.h" #endif // SINGA_SINGA_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/include/utils/tokenizer.h ---------------------------------------------------------------------- diff --git a/include/utils/tokenizer.h b/include/utils/tokenizer.h new file mode 100644 index 0000000..fc6ba8a --- /dev/null +++ b/include/utils/tokenizer.h @@ -0,0 +1,59 @@ +/************************************************************ +* +* Licensed to the Apache Software Foundation (ASF) under one +* or more contributor license agreements. See the NOTICE file +* distributed with this work for additional information +* regarding copyright ownership. The ASF licenses this file +* to you under the Apache License, Version 2.0 (the +* "License"); you may not use this file except in compliance +* with the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. 
+* +*************************************************************/ +#ifndef SINGA_UTILS_TOKENIER_H_ +#define SINGA_UTILS_TOKENIER_H_ +#include <string> +#include <glog/logging.h> +namespace singa { +/** + * Tokenize a string. + * + * example: + * Tokenizer t("assa,asf;wes", ",;"); + * string x; + * t >> x; // x is assa + * t >> x; // x is asf + * t >> x; // x is wes + * cout << (t >> x); // print 0. + */ +class Tokenizer { + public: + Tokenizer(const std::string& str, const std::string& sep): start_(0), + sep_(sep), buf_(str) {} + Tokenizer & operator>>(std::string& out) { + CHECK_LT(start_, buf_.length()); + int start = start_; + auto pos = buf_.find_first_of(sep_, start); + if (pos == std::string::npos) + pos = buf_.length(); + start_ = pos + 1; + out = buf_.substr(start, pos); + return *this; + } + + bool Valid() { return start_ < buf_.length(); } + private: + unsigned start_; + std::string sep_; + const std::string& buf_; +}; +} /* singa */ +#endif // SINGA_UTILS_TOKENIER_H_ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/src/driver.cc ---------------------------------------------------------------------- diff --git a/src/driver.cc b/src/driver.cc index 28669fa..0d3bbfc 100644 --- a/src/driver.cc +++ b/src/driver.cc @@ -54,6 +54,11 @@ void Driver::Init(int argc, char **argv) { ReadProtoFromTextFile(argv[arg_pos+1], &job_conf_); // register layers + + RegisterLayer<ProtoRecordLayer, int>(kProtoRecord); + RegisterLayer<CSVRecordLayer, int>(kCSVRecord); + RegisterLayer<ImagePreprocessLayer, int>(kImagePreprocess); + RegisterLayer<BridgeDstLayer, int>(kBridgeDst); RegisterLayer<BridgeSrcLayer, int>(kBridgeSrc); RegisterLayer<ConvolutionLayer, int>(kConvolution); http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/src/io/store.cc ---------------------------------------------------------------------- diff --git a/src/io/store.cc b/src/io/store.cc index 6412628..8d3bf13 100644 --- a/src/io/store.cc +++ b/src/io/store.cc @@ -19,7 
+19,6 @@ * *************************************************************/ - #include "io/store.h" #include "io/kvfile_store.h" #include "io/textfile_store.h" @@ -52,6 +51,12 @@ Store* CreateStore(const std::string& backend) { #endif return store; } + +Store* OpenStore(const string& backend, const string& path, Mode mode) { + auto store = CreateStore(backend); + store->Open(path, mode); + return store; +} } /* io */ } /* singa */ http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/5f010caa/src/io/textfile_store.cc ---------------------------------------------------------------------- diff --git a/src/io/textfile_store.cc b/src/io/textfile_store.cc index 74ec9a4..77694a0 100644 --- a/src/io/textfile_store.cc +++ b/src/io/textfile_store.cc @@ -40,6 +40,7 @@ void TextFileStore::Close() { fs_->close(); } delete fs_; + fs_ = nullptr; } }
