Repository: incubator-singa
Updated Branches:
  refs/heads/master 179144211 -> 79a241c8b
SINGA-10 Add Support for Recurrent Neural Networks (RNN)

Add a user-defined record type for words (word string, word id, class id, class start position, class end position);
Implement RnnDataLayer, WordLayer and RnnLabelLayer;
Implement create_shard.cc for the sample dataset of rnnlmlib.

Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/ad86f720
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/ad86f720
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/ad86f720

Branch: refs/heads/master
Commit: ad86f720faccb4e460b3e31bd0866ab0f45c7451
Parents: 13b1c08
Author: kaiping <[email protected]>
Authored: Sun Sep 13 20:07:11 2015 +0800
Committer: Wei Wang <[email protected]>
Committed: Fri Sep 18 16:46:40 2015 +0800

----------------------------------------------------------------------
 examples/rnnlm/Makefile.example |  26 +++
 examples/rnnlm/create_shard.cc  | 400 +++++++++++++++++++++++++++++++++++
 examples/rnnlm/main.cc          |   3 +
 examples/rnnlm/rnnlm.cc         |  77 +++++++
 examples/rnnlm/rnnlm.h          |  47 ++++
 examples/rnnlm/rnnlm.proto      |  15 ++
 6 files changed, 568 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ad86f720/examples/rnnlm/Makefile.example
----------------------------------------------------------------------
diff --git a/examples/rnnlm/Makefile.example b/examples/rnnlm/Makefile.example
index 5eeca78..b4505cf 100644
--- a/examples/rnnlm/Makefile.example
+++ b/examples/rnnlm/Makefile.example
@@ -1,5 +1,31 @@
 MSHADOW_FLAGS :=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
+libs :=singa glog protobuf
+filename = rnnlm-0.4b.tgz
+# note: the download link for rnnlm-0.4b may change
+filelink = https://f25ea9ccb7d3346ce6891573d543960492b92c30.googledrive.com/host/0ByxdPXuxLPS5RFM5dVNvWVhTd0U
+dirname = $(patsubst %.tgz,%, $(filename))
+numclass = 100
+dirshards = train_shard valid_shard test_shard
+
+
+.PHONY: all download create
+
+download: rnnlm
+
+rnnlm:
+	wget $(filelink)/$(filename)
+	tar zxf $(filename)
+	rm $(filename)
+
+create:
+	$(CXX) create_shard.cc -std=c++11 -lsinga -lprotobuf -lzookeeper_mt -lglog -I../../include \
+		-L../../.libs/ -L/usr/local/lib -Wl,-unresolved-symbols=ignore-in-shared-libs -Wl,-rpath=../../.libs/ \
+		-o create_shard.bin
+	for d in $(dirshards); do mkdir -p $${d}; done
+	./create_shard.bin -train $(dirname)/train -class_size $(numclass) -test $(dirname)/test
+
+
 all:
 	protoc --proto_path=../../src/proto --proto_path=. --cpp_out=. rnnlm.proto
 	$(CXX) main.cc rnnlm.cc rnnlm.pb.cc $(MSHADOW_FLAGS) -std=c++11 -lsinga -lglog -lprotobuf -lopenblas -I../../include\


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ad86f720/examples/rnnlm/create_shard.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/create_shard.cc b/examples/rnnlm/create_shard.cc
new file mode 100644
index 0000000..dd56a84
--- /dev/null
+++ b/examples/rnnlm/create_shard.cc
@@ -0,0 +1,400 @@
+//
+// This code creates a DataShard for the RNNLM dataset.
+// It is adapted from the convert_mnist_data tool in Caffe.
+// The RNNLM dataset can be downloaded at
+// http://www.rnnlm.org/
+//
+// Usage:
+//    create_shard.bin -train train_file -class_size <int> [-debug] [-valid valid_file] [-test test_file]
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+
+#include "utils/data_shard.h"
+#include "utils/common.h"
+#include "proto/common.pb.h"
+#include "singa.h"
+#include "rnnlm.pb.h"
+
+#define MAX_STRING 100
+
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>
+#include <cmath>
+#include <algorithm>
+#include <fstream>
+
+using namespace std;
+using singa::DataShard;
+
+struct vocab_word {
+  int cn;                  // word count in the training file
+  char word[MAX_STRING];
+  int class_index;
+};
+
+struct vocab_word *vocab;
+int vocab_max_size;
+int vocab_size;
+int *vocab_hash;
+int vocab_hash_size;
+int debug_mode;
+int old_classes;
+int *class_start;
+int *class_end;
+int class_size;
+
+char train_file[MAX_STRING];
+char valid_file[MAX_STRING];
+char test_file[MAX_STRING];
+
+int valid_mode;
+int test_mode;
+
+unsigned int getWordHash(char *word) {
+  unsigned int hash, a;
+
+  hash = 0;
+  for (a = 0; a < strlen(word); a++) hash = hash * 237 + word[a];
+  hash = hash % vocab_hash_size;
+
+  return hash;
+}
+
+int searchVocab(char *word) {
+  int a;
+  unsigned int hash;
+
+  hash = getWordHash(word);
+
+  if (vocab_hash[hash] == -1) return -1;
+  if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
+
+  for (a = 0; a < vocab_size; a++) {  // linear search in the vocabulary
+    if (!strcmp(word, vocab[a].word)) {
+      vocab_hash[hash] = a;
+      return a;
+    }
+  }
+
+  return -1;  // return -1 if the word is out of vocabulary (OOV)
+}
+
+int addWordToVocab(char *word) {
+  unsigned int hash;
+
+  strcpy(vocab[vocab_size].word, word);
+  vocab[vocab_size].cn = 0;
+  vocab_size++;
+
+  if (vocab_size + 2 >= vocab_max_size) {  // reallocate memory if needed
+    vocab_max_size += 100;
+    vocab = (struct vocab_word *) realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
+  }
+
+  hash = getWordHash(word);
+  vocab_hash[hash] = vocab_size - 1;
+
+  return vocab_size - 1;
+}
+
+void readWord(char *word, FILE *fin) {
+  int a = 0, ch;
+
+  while (!feof(fin)) {
+    ch = fgetc(fin);
+
+    if (ch == 13) continue;  // skip carriage returns
+
+    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
+      if (a > 0) {
+        if (ch == '\n') ungetc(ch, fin);
+        break;
+      }
+
+      if (ch == '\n') {
+        strcpy(word, (char *) "</s>");  // map end of line to the </s> token
+        return;
+      }
+      else continue;
+    }
+
+    word[a] = char(ch);
+    a++;
+
+    if (a >= MAX_STRING) {
+      // printf("Too long word found!\n");  // truncate too long words
+      a--;
+    }
+  }
+  word[a] = 0;
+}
+
+void sortVocab() {
+  int a, b, max;
+  vocab_word swap;
+
+  // selection sort by word count, descending; vocab[0] (</s>) stays in place
+  for (a = 1; a < vocab_size; a++) {
+    max = a;
+    for (b = a + 1; b < vocab_size; b++) if (vocab[max].cn < vocab[b].cn) max = b;
+
+    swap = vocab[max];
+    vocab[max] = vocab[a];
+    vocab[a] = swap;
+  }
+}
+
+int learnVocabFromTrainFile() {
+  char word[MAX_STRING];
+  FILE *fin;
+  int a, i, train_wcn;
+
+  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
+
+  fin = fopen(train_file, "rb");
+
+  vocab_size = 0;
+
+  addWordToVocab((char *) "</s>");
+
+  train_wcn = 0;
+  while (1) {
+    readWord(word, fin);
+    if (feof(fin)) break;
+
+    train_wcn++;
+
+    i = searchVocab(word);
+    if (i == -1) {
+      a = addWordToVocab(word);
+      vocab[a].cn = 1;
+    } else vocab[i].cn++;
+  }
+
+  sortVocab();
+
+  if (debug_mode > 0) {
+    printf("Vocab size: %d\n", vocab_size);
+    printf("Words in train file: %d\n", train_wcn);
+  }
+
+  // train_words = train_wcn;
+
+  fclose(fin);
+  return 0;
+}
+
+int splitClasses() {
+  double df, dd;
+  int i, a, b;
+
+  df = 0;
+  dd = 0;
+  a = 0;
+  b = 0;
+
+  class_start = (int *) calloc(class_size, sizeof(int));
+  memset(class_start, 0x7f, sizeof(int) * class_size);  // large sentinel so min() below works
+  class_end = (int *) calloc(class_size, sizeof(int));
+  memset(class_end, 0, sizeof(int) * class_size);
+
+  if (old_classes) {    // old classes
+    for (i = 0; i < vocab_size; i++) b += vocab[i].cn;
+    for (i = 0; i < vocab_size; i++) {
+      df += vocab[i].cn / (double) b;
+      if (df > 1) df = 1;
+      if (df > (a + 1) / (double) class_size) {
+        vocab[i].class_index = a;
+        if (a < class_size - 1) a++;
+      } else {
+        vocab[i].class_index = a;
+      }
+    }
+  } else {    // new classes
+    for (i = 0; i < vocab_size; i++) b += vocab[i].cn;
+    for (i = 0; i < vocab_size; i++) dd += sqrt(vocab[i].cn / (double) b);
+    for (i = 0; i < vocab_size; i++) {
+      df += sqrt(vocab[i].cn / (double) b) / dd;
+      if (df > 1) df = 1;
+      if (df > (a + 1) / (double) class_size) {
+        vocab[i].class_index = a;
+        if (a < class_size - 1) a++;
+      } else {
+        vocab[i].class_index = a;
+      }
+    }
+  }
+
+  // after dividing classes, update class start and class end information
+  for (i = 0; i < vocab_size; i++) {
+    a = vocab[i].class_index;
+    class_start[a] = min(i, class_start[a]);
+    class_end[a] = max(i + 1, class_end[a]);
+  }
+  return 0;
+}
+
+int init_class() {
+  // debug_mode = 1;
+  debug_mode = 0;
+  vocab_max_size = 100;  // initial capacity of the vocab array, grown on demand
+  vocab_size = 0;
+  vocab = (struct vocab_word *) calloc(vocab_max_size, sizeof(struct vocab_word));
+  vocab_hash_size = 100000000;
+  vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
+  old_classes = 1;
+
+  // read vocab
+  learnVocabFromTrainFile();
+
+  // split classes
+  splitClasses();
+
+  return 0;
+}
+
+int create_shard(char *input_file, char *output_file) {
+  DataShard dataShard(output_file, DataShard::kCreate);
+  singa::WordRecord wordRecord;
+
+  char word[MAX_STRING];
+  FILE *fin;
+  int a, i;
+  fin = fopen(input_file, "rb");
+  while (1) {
+    readWord(word, fin);
+    if (feof(fin)) break;
+    i = searchVocab(word);
+    if (i == -1) {
+      if (debug_mode) printf("unknown word [%s] detected!\n", word);
+    } else {
+      wordRecord.set_word(string(word));
+      wordRecord.set_word_index(i);
+      int class_idx = vocab[i].class_index;
+      wordRecord.set_class_index(class_idx);
+      wordRecord.set_class_start(class_start[class_idx]);
+      wordRecord.set_class_end(class_end[class_idx]);
+      dataShard.Insert(word, wordRecord);
+    }
+  }
+
+  dataShard.Flush();
+  fclose(fin);
+  return 0;
+}
+
+int argPos(char *str, int argc, char **argv) {
+  int a;
+
+  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) return a;
+
+  return -1;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  FILE *f;
+
+  // set debug mode
+  i = argPos((char *) "-debug", argc, argv);
+  if (i > 0) {
+    debug_mode = 1;
+    if (debug_mode > 0)
+      printf("debug mode: %d\n", debug_mode);
+  }
+
+  // search for train file
+  i = argPos((char *) "-train", argc, argv);
+  if (i > 0) {
+    if (i + 1 == argc) {
+      printf("ERROR: training data file not specified!\n");
+      return 0;
+    }
+
+    strcpy(train_file, argv[i + 1]);
+
+    if (debug_mode > 0)
+      printf("train file: %s\n", train_file);
+
+    f = fopen(train_file, "rb");
+    if (f == NULL) {
+      printf("ERROR: training data file not found!\n");
+      return 0;
+    }
+    fclose(f);
+  } else {
+    printf("ERROR: training data must be set.\n");
+  }
+
+  // search for valid file
+  i = argPos((char *) "-valid", argc, argv);
+  if (i > 0) {
+    if (i + 1 == argc) {
+      printf("ERROR: validation data file not specified!\n");
+      return 0;
+    }
+
+    strcpy(valid_file, argv[i + 1]);
+
+    if (debug_mode > 0)
+      printf("valid file: %s\n", valid_file);
+
+    f = fopen(valid_file, "rb");
+    if (f == NULL) {
+      printf("ERROR: validation data file not found!\n");
+      return 0;
+    }
+    fclose(f);
+    valid_mode = 1;
+  }
+
+  // search for test file
+  i = argPos((char *) "-test", argc, argv);
+  if (i > 0) {
+    if (i + 1 == argc) {
+      printf("ERROR: test data file not specified!\n");
+      return 0;
+    }
+
+    strcpy(test_file, argv[i + 1]);
+
+    if (debug_mode > 0)
+      printf("test file: %s\n", test_file);
+
+    f = fopen(test_file, "rb");
+    if (f == NULL) {
+      printf("ERROR: test data file not found!\n");
+      return 0;
+    }
+    fclose(f);
+    test_mode = 1;
+  }
+
+  // search for class size
+  i = argPos((char *) "-class_size", argc, argv);
+  if (i > 0) {
+    if (i + 1 == argc) {
+      printf("ERROR: class size not specified!\n");
+      return 0;
+    }
+
+    class_size = atoi(argv[i + 1]);
+
+    if (debug_mode > 0)
+      printf("class size: %d\n", class_size);
+  }
+  if (class_size <= 0) {
+    printf("ERROR: no or invalid class size received!\n");
+    return 0;
+  }
+
+  init_class();
+
+  create_shard(train_file, "train_shard");
+  if (valid_mode) create_shard(valid_file, "valid_shard");
+  if (test_mode) create_shard(test_file, "test_shard");
+
+  return 0;
+}
\ No newline at end of file


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ad86f720/examples/rnnlm/main.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/main.cc b/examples/rnnlm/main.cc
index 690c158..3cb59f1 100644
--- a/examples/rnnlm/main.cc
+++ b/examples/rnnlm/main.cc
@@ -15,6 +15,9 @@ int main(int argc, char **argv) {
   driver.RegisterLayer<singa::EmbeddingLayer, std::string>("kEmbedding");
   driver.RegisterLayer<singa::HiddenLayer, std::string>("kHidden");
   driver.RegisterLayer<singa::OutputLayer, std::string>("kOutput");
+  driver.RegisterLayer<singa::RnnDataLayer, std::string>("kRnnData");
+  driver.RegisterLayer<singa::WordLayer, std::string>("kWord");
+  driver.RegisterLayer<singa::RnnLabelLayer, std::string>("kRnnLabel");
 
   singa::JobProto jobConf = driver.job_conf();


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ad86f720/examples/rnnlm/rnnlm.cc
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.cc b/examples/rnnlm/rnnlm.cc
index ddb0f63..180300f 100644
--- a/examples/rnnlm/rnnlm.cc
+++ b/examples/rnnlm/rnnlm.cc
@@ -25,6 +25,83 @@ inline Tensor<cpu, 1> RTensor1(Blob<float>* blob) {
   return tensor;
 }
 
+
+/*******InputLayer**************/
+RnnDataLayer::~RnnDataLayer() {
+  if (shard_ != nullptr)
+    delete shard_;
+  shard_ = nullptr;
+}
+
+void RnnDataLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  shard_ = new DataShard(proto.GetExtension(input_conf).path(), DataShard::kRead);
+  string key;
+  max_window_ = proto.GetExtension(input_conf).max_window();
+  records_.resize(max_window_ + 1);  // # of records in the data layer is max_window_ + 1
+  window_ = 0;
+  shard_->Next(&key, &records_[window_]);
+}
+
+void RnnDataLayer::ComputeFeature(int flag, Metric *perf) {
+  CHECK(records_.size() <= shard_->Count());
+  records_[0] = records_[window_];  // the last record of the previous window becomes the first
+  window_ = max_window_;
+  singa::WordRecord wr;
+  for (int i = 1; i <= max_window_; i++) {
+    string key;
+    if (shard_->Next(&key, &records_[i])) {
+      wr = records_[i];
+      if (wr.word_index() == 0) {  // word index 0 is the </s> sentence boundary
+        window_ = i;
+        break;
+      }
+    } else {
+      shard_->SeekToFirst();  // wrap around to the beginning of the shard
+      CHECK(shard_->Next(&key, &records_[i]));
+    }
+  }
+}
+
+/*******WordLayer**************/
+void WordLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(), 1);
+  int max_window = static_cast<RnnDataLayer*>(srclayers_[0])->max_window();
+  data_.Reshape(vector<int>{max_window});
+}
+
+void WordLayer::ComputeFeature(int flag, Metric *perf) {
+  auto records = static_cast<RnnDataLayer*>(srclayers_[0])->records();
+  float *word = data_.mutable_cpu_data();
+  window_ = static_cast<RNNLayer*>(srclayers_[0])->window();
+  for (int i = 0; i < window_; i++) {
+    word[i] = records[i].word_index();
+  }
+}
+
+
+/*******LabelLayer**************/
+void RnnLabelLayer::Setup(const LayerProto& proto, int npartitions) {
+  Layer::Setup(proto, npartitions);
+  CHECK_EQ(srclayers_.size(), 1);
+  int max_window = static_cast<RnnDataLayer*>(srclayers_[0])->max_window();
+  data_.Reshape(vector<int>{max_window, 4});
+}
+
+void RnnLabelLayer::ComputeFeature(int flag, Metric *perf) {
+  auto records = static_cast<RnnDataLayer*>(srclayers_[0])->records();
+  float *label = data_.mutable_cpu_data();
+  window_ = static_cast<RNNLayer*>(srclayers_[0])->window();
+  // the label row for position i describes the next word, records[i + 1]
+  for (int i = 0; i < window_; i++) {
+    label[4 * i + 0] = records[i + 1].class_start();
+    label[4 * i + 1] = records[i + 1].class_end();
+    label[4 * i + 2] = records[i + 1].word_index();
+    label[4 * i + 3] = records[i + 1].class_index();
+  }
+}
+
 /*******EmbeddingLayer**************/
 EmbeddingLayer::~EmbeddingLayer() {
   delete embed_;


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ad86f720/examples/rnnlm/rnnlm.h
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.h b/examples/rnnlm/rnnlm.h
index 14d947c..e9b7c55 100644
--- a/examples/rnnlm/rnnlm.h
+++ b/examples/rnnlm/rnnlm.h
@@ -1,4 +1,5 @@
 #include "singa.h"
+#include "rnnlm.pb.h"
 
 namespace singa {
 /**
@@ -23,6 +24,52 @@ class RNNLayer : public NeuronLayer {
 };
 
 /**
+ * Input layer that reads WordRecords from the data shard
+ */
+class RnnDataLayer : public RNNLayer {
+ public:
+  ~RnnDataLayer();
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(int flag, Metric *perf) override;
+  void ComputeGradient(int flag, Metric* perf) override {}
+  int max_window() const {
+    return max_window_;
+  }
+
+  const std::vector<singa::WordRecord>& records() const {
+    return records_;
+  }
+
+ private:
+  int max_window_;
+  DataShard* shard_;
+  std::vector<singa::WordRecord> records_;
+};
+
+
+/**
+ * WordLayer that reads records_[0] to records_[window_ - 1] from the RnnDataLayer to provide the input words for computation
+ */
+class WordLayer : public RNNLayer {
+ public:
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(int flag, Metric *perf) override;
+  void ComputeGradient(int flag, Metric* perf) override {}
+};
+
+
+/**
+ * LabelLayer that reads records_[1] to records_[window_] from the RnnDataLayer to provide the label information
+ */
+class RnnLabelLayer : public RNNLayer {
+ public:
+  void Setup(const LayerProto& proto, int npartitions) override;
+  void ComputeFeature(int flag, Metric *perf) override;
+  void ComputeGradient(int flag, Metric* perf) override {}
+};
+
+
+/**
 * Word embedding layer that get one row from the embedding matrix for each
 * word based on the word index
 */


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/ad86f720/examples/rnnlm/rnnlm.proto
----------------------------------------------------------------------
diff --git a/examples/rnnlm/rnnlm.proto b/examples/rnnlm/rnnlm.proto
index 35b6bc2..65c34ec 100644
--- a/examples/rnnlm/rnnlm.proto
+++ b/examples/rnnlm/rnnlm.proto
@@ -1,5 +1,6 @@
 package singa;
 import "job.proto";
+import "common.proto";
 
 
 message EmbeddingProto {
@@ -12,7 +13,21 @@
   optional int32 vocab_size = 2;
 }
 
+message InputProto {
+  required string path = 1;
+  optional int32 max_window = 2;
+}
+
 extend LayerProto {
   optional EmbeddingProto embedding_conf = 101;
   optional OutputProto output_conf = 102;
+  optional InputProto input_conf = 103;
 }
+
+message WordRecord {
+  optional string word = 1;
+  optional int32 word_index = 2;
+  optional int32 class_index = 3;
+  optional int32 class_start = 4;
+  optional int32 class_end = 5;
+}
\ No newline at end of file
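
For reference, a shard produced by create_shard.bin can be inspected with a few lines of C++. This is a sketch, not part of the commit; it assumes the same include paths and link flags as the `create` target above, and uses only the DataShard and WordRecord APIs that appear in this diff (read_shard.cc is a hypothetical file name):

// read_shard.cc -- hypothetical checker for the shards created above;
// compile like create_shard.cc, after `make create` has produced train_shard.
#include <cstdio>
#include <string>

#include "utils/data_shard.h"
#include "rnnlm.pb.h"

int main() {
  singa::DataShard shard("train_shard", singa::DataShard::kRead);
  std::string key;
  singa::WordRecord rec;
  // print the first few records: word, word id, class id and class range
  for (int i = 0; i < 5 && shard.Next(&key, &rec); ++i)
    printf("%-12s word=%-6d class=%-3d range=[%d, %d)\n",
           rec.word().c_str(), rec.word_index(), rec.class_index(),
           rec.class_start(), rec.class_end());
  return 0;
}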

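The contract between the three layers is that WordLayer feeds records_[0 .. window_ - 1] while RnnLabelLayer is built from records_[1 .. window_], so position i is trained to predict the word at position i + 1. A toy illustration of that alignment (hypothetical word ids, not from the dataset; id 0 is the </s> token that create_shard.cc adds first):

#include <cstdio>
#include <vector>

int main() {
  // hypothetical window "</s> the cat sat"; the trailing 0 (</s>) ends the
  // sentence, so RnnDataLayer would set window_ = 4 for this batch
  std::vector<int> ids = {0, 12, 7, 31, 0};
  int window = 4;
  for (int i = 0; i < window; ++i)
    printf("input word[%d] = %2d  ->  target word = %2d\n",
           i, ids[i], ids[i + 1]);
  return 0;
}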