This is an automated email from the ASF dual-hosted git repository.

zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
     new 806b41b  Update large word language model example (#11405)
806b41b is described below

commit 806b41bfed33d496a35f0af00997774b662990f5
Author:     Haibin Lin <linhaibin.e...@gmail.com>
AuthorDate: Fri Jun 29 19:43:57 2018 -0700

    Update large word language model example (#11405)

    * add cython sampler

    * remove unused files

    * use eval batch size = 1

    * update read me

    * update read me

    * update license
---
 example/rnn/large_word_lm/LogUniformGenerator.cc | 52 ++++++++++++++++++++++
 example/rnn/large_word_lm/LogUniformGenerator.h  | 45 +++++++++++++++++++
 example/rnn/large_word_lm/Makefile               | 25 +++++++++++
 example/rnn/large_word_lm/custom_module.py       |  3 +-
 example/rnn/large_word_lm/log_uniform.pyx        | 38 ++++++++++++++++
 example/rnn/large_word_lm/model.py               | 21 ++++-----
 example/rnn/large_word_lm/readme.md              | 16 +++----
 example/rnn/large_word_lm/run_utils.py           | 11 +++--
 example/rnn/large_word_lm/sampler.py             | 55 ++++++++++++++++++++++++
 example/rnn/large_word_lm/setup.py               | 28 ++++++++++++
 example/rnn/large_word_lm/train.py               | 32 +++++++++-----
 11 files changed, 292 insertions(+), 34 deletions(-)

diff --git a/example/rnn/large_word_lm/LogUniformGenerator.cc b/example/rnn/large_word_lm/LogUniformGenerator.cc
new file mode 100644
index 0000000..ae40659
--- /dev/null
+++ b/example/rnn/large_word_lm/LogUniformGenerator.cc
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file LogUniformGenerator.cc
+ * \brief log uniform distribution generator
+*/
+
+#include <unordered_set>
+#include <unordered_map>
+#include <cmath>
+#include <stddef.h>
+#include <iostream>
+
+#include "LogUniformGenerator.h"
+
+LogUniformGenerator::LogUniformGenerator(const int range_max)
+  : range_max_(range_max), log_range_max_(log(range_max)),
+    generator_(), distribution_(0.0, 1.0) {}
+
+std::unordered_set<long> LogUniformGenerator::draw(const size_t size, int* num_tries) {
+  std::unordered_set<long> result;
+  int tries = 0;
+  while (result.size() != size) {
+    tries += 1;
+    double x = distribution_(generator_);
+    long value = lround(exp(x * log_range_max_)) - 1;
+    // sampling without replacement
+    if (result.find(value) == result.end()) {
+      result.emplace(value);
+    }
+  }
+  *num_tries = tries;
+  return result;
+}
diff --git a/example/rnn/large_word_lm/LogUniformGenerator.h b/example/rnn/large_word_lm/LogUniformGenerator.h
new file mode 100644
index 0000000..b6c4f93
--- /dev/null
+++ b/example/rnn/large_word_lm/LogUniformGenerator.h
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file LogUniformGenerator.h
+ * \brief log uniform distribution generator
+*/
+
+#ifndef _LOG_UNIFORM_GENERATOR_H
+#define _LOG_UNIFORM_GENERATOR_H
+
+#include <unordered_set>
+#include <utility>
+#include <random>
+
+class LogUniformGenerator {
+private:
+  const int range_max_;
+  const double log_range_max_;
+  std::default_random_engine generator_;
+  std::uniform_real_distribution<double> distribution_;
+public:
+  LogUniformGenerator(const int);
+  std::unordered_set<long> draw(const size_t, int*);
+};
+
+#endif // _LOG_UNIFORM_GENERATOR_H
+
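For intuition: `draw` maps a uniform variate x through round(exp(x * log(range_max))) - 1, so smaller (more frequent) word ids are proposed more often, with P(c) = log((c+2)/(c+1)) / log(range_max + 1); duplicates are rejected and counted in `num_tries` so expected counts can be corrected later in `sampler.py`. A minimal NumPy sketch of the same scheme (illustrative only, not part of the commit):

```
# Illustrative NumPy re-statement of LogUniformGenerator::draw; names are
# for exposition only, not part of the commit.
import numpy as np

def draw_log_uniform(range_max, size, rng=np.random):
    """Rejection-sample `size` unique ids; P(c) = log((c+2)/(c+1)) / log(range_max + 1)."""
    result, num_tries = set(), 0
    while len(result) != size:
        num_tries += 1
        x = rng.uniform()                                     # x ~ U(0, 1)
        value = int(round(np.exp(x * np.log(range_max)))) - 1
        result.add(value)                                     # without replacement
    return result, num_tries
```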
diff --git a/example/rnn/large_word_lm/Makefile b/example/rnn/large_word_lm/Makefile
new file mode 100644
index 0000000..116f7bb
--- /dev/null
+++ b/example/rnn/large_word_lm/Makefile
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+all: clean
+	python setup.py build_ext --inplace
+
+clean:
+	rm -rf build
+	rm -rf __pycache__
+	rm -rf log_uniform.cpp
+	rm -rf log_uniform.*.so
diff --git a/example/rnn/large_word_lm/custom_module.py b/example/rnn/large_word_lm/custom_module.py
index 05d0fb7..a117427 100644
--- a/example/rnn/large_word_lm/custom_module.py
+++ b/example/rnn/large_word_lm/custom_module.py
@@ -60,7 +60,7 @@ class CustomModule(Module):
                                          priority=-param_idx)
 
     @staticmethod
-    def load(prefix, epoch, load_optimizer_states=False, **kwargs):
+    def load(prefix, epoch, load_optimizer_states=False, symbol=None, **kwargs):
         """Creates a model from previously saved checkpoint.
 
         Parameters
@@ -90,6 +90,7 @@ class CustomModule(Module):
             Default ``None``, indicating no network parameters are fixed.
         """
         sym, args, auxs = load_checkpoint(prefix, epoch)
+        sym = sym if symbol is None else symbol
         mod = CustomModule(symbol=sym, **kwargs)
         mod._arg_params = args
         mod._aux_params = auxs
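The new `symbol` argument lets a caller reuse the checkpointed parameters with a rebuilt symbol (for example, one unrolled for a different batch size) instead of the serialized one. A hypothetical call, mirroring how train.py below uses it (`load_model` and `train_state_names` are names from train.py):

```
# Hypothetical usage of CustomModule.load's new `symbol` override.
eval_sym = load_model.train()            # rebuilt graph, e.g. with batch size 1
mod = CustomModule.load('./checkpoint/ckp', epoch, symbol=eval_sym,
                        context=mx.cpu(), state_names=train_state_names,
                        data_names=['data', 'mask'], label_names=['label'])
```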
""" sym, args, auxs = load_checkpoint(prefix, epoch) + sym = sym if symbol is None else symbol mod = CustomModule(symbol=sym, **kwargs) mod._arg_params = args mod._aux_params = auxs diff --git a/example/rnn/large_word_lm/log_uniform.pyx b/example/rnn/large_word_lm/log_uniform.pyx new file mode 100644 index 0000000..641835a --- /dev/null +++ b/example/rnn/large_word_lm/log_uniform.pyx @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from libcpp.unordered_set cimport unordered_set +import cython + +cdef extern from "LogUniformGenerator.h": + cdef cppclass LogUniformGenerator: + LogUniformGenerator(int) except + + unordered_set[long] draw(int, int*) except + + +cdef class LogUniformSampler: + cdef LogUniformGenerator* c_sampler + + def __cinit__(self, N): + self.c_sampler = new LogUniformGenerator(N) + + def __dealloc__(self): + del self.c_sampler + + def sample_unique(self, size): + cdef int num_tries + samples = list(self.c_sampler.draw(size, &num_tries)) + return samples, num_tries diff --git a/example/rnn/large_word_lm/model.py b/example/rnn/large_word_lm/model.py index 3d3c83b..0e9abda 100644 --- a/example/rnn/large_word_lm/model.py +++ b/example/rnn/large_word_lm/model.py @@ -58,7 +58,7 @@ def rnn(bptt, vocab_size, num_embed, nhid, num_layers, dropout, num_proj, batch_ init_h = S.var(prefix + 'init_h', shape=(batch_size, num_proj), init=mx.init.Zero()) init_c = S.var(prefix + 'init_c', shape=(batch_size, nhid), init=mx.init.Zero()) state_names += [prefix + 'init_h', prefix + 'init_c'] - lstmp = mx.gluon.contrib.rnn.LSTMPCell(nhid, num_proj) + lstmp = mx.gluon.contrib.rnn.LSTMPCell(nhid, num_proj, prefix=prefix) outputs, next_states = lstmp.unroll(bptt, outputs, begin_state=[init_h, init_c], \ layout='NTC', merge_outputs=True) outputs = S.Dropout(outputs, p=dropout) @@ -127,7 +127,7 @@ def sampled_softmax(num_classes, num_samples, in_dim, inputs, weight, bias, new_targets = S.zeros_like(label) return logits, new_targets -def generate_samples(label, num_splits, num_samples, num_classes): +def generate_samples(label, num_splits, sampler): """ Split labels into `num_splits` and generate candidates based on log-uniform distribution. 
""" @@ -139,29 +139,30 @@ def generate_samples(label, num_splits, num_samples, num_classes): samples = [] for label_split in label_splits: label_split_2d = label_split.reshape((-1,1)) - sampled_value = mx.nd.contrib.rand_zipfian(label_split_2d, num_samples, num_classes) + sampled_value = sampler.draw(label_split_2d) sampled_classes, exp_cnt_true, exp_cnt_sampled = sampled_value samples.append(sampled_classes.astype(np.float32)) - prob_targets.append(exp_cnt_true.astype(np.float32)) + prob_targets.append(exp_cnt_true.astype(np.float32).reshape((-1,1))) prob_samples.append(exp_cnt_sampled.astype(np.float32)) return samples, prob_samples, prob_targets class Model(): """ LSTMP with Importance Sampling """ - def __init__(self, args, ntokens, rescale_loss): - out = rnn(args.bptt, ntokens, args.emsize, args.nhid, args.nlayers, - args.dropout, args.num_proj, args.batch_size) + def __init__(self, ntokens, rescale_loss, bptt, emsize, + nhid, nlayers, dropout, num_proj, batch_size, k): + out = rnn(bptt, ntokens, emsize, nhid, nlayers, + dropout, num_proj, batch_size) rnn_out, self.last_states, self.lstm_args, self.state_names = out # decoder weight and bias decoder_w = S.var("decoder_weight", stype='row_sparse') decoder_b = S.var("decoder_bias", shape=(ntokens, 1), stype='row_sparse') # sampled softmax for training - sample = S.var('sample', shape=(args.k,)) - prob_sample = S.var("prob_sample", shape=(args.k,)) + sample = S.var('sample', shape=(k,)) + prob_sample = S.var("prob_sample", shape=(k,)) prob_target = S.var("prob_target") self.sample_names = ['sample', 'prob_sample', 'prob_target'] - logits, new_targets = sampled_softmax(ntokens, args.k, args.num_proj, + logits, new_targets = sampled_softmax(ntokens, k, num_proj, rnn_out, decoder_w, decoder_b, [sample, prob_sample, prob_target]) self.train_loss = cross_entropy_loss(logits, new_targets, rescale_loss=rescale_loss) diff --git a/example/rnn/large_word_lm/readme.md b/example/rnn/large_word_lm/readme.md index d74ffbd..465aaa1 100644 --- a/example/rnn/large_word_lm/readme.md +++ b/example/rnn/large_word_lm/readme.md @@ -3,17 +3,18 @@ This example implements the baseline model in [Exploring the Limits of Language Modeling](https://arxiv.org/abs/1602.02410) on the [Google 1-Billion Word](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) (GBW) dataset. -This example reaches **41.97 perplexity** after 5 training epochs on a 1-layer, 2048-unit, 512-projection LSTM Language Model. -The result is slightly better than the one reported in the paper(43.7 perplexity). +This example reaches 48.0 test perplexity after 6 training epochs on a 1-layer, 2048-unit, 512-projection LSTM Language Model. +It reaches 44.2 test perplexity after 35 epochs of training. + The main differences with the original implementation include: * Synchronized gradient updates instead of asynchronized updates -* Noise candidates are sampled with replacement -Each epoch for training takes around 80 minutes on a p3.8xlarge instance, which comes with 4 Volta V100 GPUs. +Each epoch for training (excluding time for evaluation on test set) takes around 80 minutes on a p3.8xlarge instance, which comes with 4 Volta V100 GPUs. -# Setup - Original Data Format -1. Download 1-Billion Word Dataset - [Link](http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz) +# Setup dataset and build sampler +1. 
diff --git a/example/rnn/large_word_lm/run_utils.py b/example/rnn/large_word_lm/run_utils.py
index 7650530e..bd1412d 100644
--- a/example/rnn/large_word_lm/run_utils.py
+++ b/example/rnn/large_word_lm/run_utils.py
@@ -53,7 +53,7 @@ def get_parser():
                         help='report interval')
     parser.add_argument('--seed', type=int, default=1,
                         help='random seed')
-    parser.add_argument('--checkpoint-dir', type=str, default='./checkpoint/cp',
+    parser.add_argument('--checkpoint-dir', type=str, default='./checkpoint',
                         help='dir for checkpoint')
     parser.add_argument('--lr', type=float, default=0.1,
                         help='initial learning rate')
@@ -68,18 +68,21 @@ def evaluate(mod, data_iter, epoch, log_interval):
     start = time.time()
     total_L = 0.0
     nbatch = 0
+    density = 0
     mod.set_states(value=0)
     for batch in data_iter:
         mod.forward(batch, is_train=False)
         outputs = mod.get_outputs(merge_multi_context=False)
         states = outputs[:-1]
-        total_L += outputs[-1][0].asscalar()
+        total_L += outputs[-1][0]
         mod.set_states(states=states)
         nbatch += 1
+        # don't include padding data in the test perplexity
+        density += batch.data[1].mean()
         if (nbatch + 1) % log_interval == 0:
-            logging.info("Eval batch %d loss : %.7f" % (nbatch, total_L / nbatch))
+            logging.info("Eval batch %d loss : %.7f" % (nbatch, (total_L / density).asscalar()))
     data_iter.reset()
-    loss = total_L / nbatch
+    loss = (total_L / density).asscalar()
     ppl = math.exp(loss) if loss < 100 else 1e37
     end = time.time()
     logging.info('Iter[%d]\t\t CE loss %.7f, ppl %.7f. Eval duration = %.2f seconds ' % \
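The evaluation change normalizes the accumulated loss by mask density rather than by batch count, so padded positions no longer dilute the test perplexity. A sketch of the arithmetic with made-up numbers (illustrative, not from the commit):

```
# Why divide by `density` instead of `nbatch`: the mask (batch.data[1]) is 1
# for real tokens and 0 for padding, so its mean is the fraction of real
# tokens in the batch.
losses = [3.2, 3.5]                    # per-batch losses from outputs[-1][0]
mask_means = [1.0, 0.5]                # second batch is half padding
loss = sum(losses) / sum(mask_means)   # 6.7 / 1.5, not 6.7 / 2
```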
diff --git a/example/rnn/large_word_lm/sampler.py b/example/rnn/large_word_lm/sampler.py
new file mode 100644
index 0000000..047e516
--- /dev/null
+++ b/example/rnn/large_word_lm/sampler.py
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import math
+import os
+import numpy as np
+import mxnet as mx
+import log_uniform
+from mxnet import ndarray
+
+class LogUniformSampler():
+    def __init__(self, range_max, num_sampled):
+        self.range_max = range_max
+        self.num_sampled = num_sampled
+        self.sampler = log_uniform.LogUniformSampler(range_max)
+
+    def _prob_helper(self, num_tries, num_sampled, prob):
+        if num_tries == num_sampled:
+            return prob * num_sampled
+        return (num_tries * (-prob).log1p()).expm1() * -1
+
+    def draw(self, true_classes):
+        """Draw samples from log uniform distribution and returns sampled candidates,
+        expected count for true classes and sampled classes."""
+        range_max = self.range_max
+        num_sampled = self.num_sampled
+        ctx = true_classes.context
+        log_range = math.log(range_max + 1)
+        num_tries = 0
+        true_classes = true_classes.reshape((-1,))
+        sampled_classes, num_tries = self.sampler.sample_unique(num_sampled)
+
+        true_cls = true_classes.as_in_context(ctx).astype('float64')
+        prob_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range
+        count_true = self._prob_helper(num_tries, num_sampled, prob_true)
+
+        sampled_classes = ndarray.array(sampled_classes, ctx=ctx, dtype='int64')
+        sampled_cls_fp64 = sampled_classes.astype('float64')
+        prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range
+        count_sampled = self._prob_helper(num_tries, num_sampled, prob_sampled)
+        return [sampled_classes, count_true, count_sampled]
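`_prob_helper` converts a per-draw probability p into an expected count over `num_tries` rejection draws: 1 - (1 - p)^num_tries, evaluated stably via log1p/expm1; when no draw was rejected (num_tries == num_sampled) it degenerates to p * num_sampled. A quick numerical check of that closed form (assumes only NumPy):

```
# Sanity check of _prob_helper's closed form (illustrative).
import numpy as np
p = np.float64(0.01)
num_tries = 300
stable = -np.expm1(num_tries * np.log1p(-p))   # -(expm1(n * log1p(-p)))
naive = 1.0 - (1.0 - p) ** num_tries
assert np.isclose(stable, naive)
```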
diff --git a/example/rnn/large_word_lm/setup.py b/example/rnn/large_word_lm/setup.py
new file mode 100644
index 0000000..09c4fb0
--- /dev/null
+++ b/example/rnn/large_word_lm/setup.py
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from distutils.core import setup, Extension
+from Cython.Build import cythonize
+import numpy
+
+extension_name = "log_uniform"
+sources = ["log_uniform.pyx", "LogUniformGenerator.cc"]
+setup(ext_modules = cythonize(Extension(extension_name,
+                                        sources=sources,
+                                        language="c++",
+                                        extra_compile_args=["-std=c++11"],
+                                        include_dirs=[numpy.get_include()])))
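After `make` (or `python setup.py build_ext --inplace`), the compiled extension is importable directly from the example directory. A hypothetical usage; the vocabulary size below is illustrative:

```
# Illustrative use of the compiled Cython extension.
import log_uniform
s = log_uniform.LogUniformSampler(793471)        # e.g. a GBW-sized vocabulary
samples, num_tries = s.sample_unique(8192)       # 8192 unique candidate ids
```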
diff --git a/example/rnn/large_word_lm/train.py b/example/rnn/large_word_lm/train.py
index a1b4e31..a815914 100644
--- a/example/rnn/large_word_lm/train.py
+++ b/example/rnn/large_word_lm/train.py
@@ -23,6 +23,7 @@ from data import MultiSentenceIter, Vocabulary
 from model import *
 from custom_module import CustomModule
 import os, math, logging, sys
+from sampler import LogUniformSampler
 
 if __name__ == '__main__':
     # parser
@@ -48,9 +49,11 @@ if __name__ == '__main__':
     train_data = mx.io.PrefetchingIter(MultiSentenceIter(args.data, vocab,
                                        args.batch_size * ngpus, args.bptt))
 
     # model
-    model = Model(args, ntokens, rescale_loss)
+    model = Model(ntokens, rescale_loss, args.bptt, args.emsize, args.nhid,
+                  args.nlayers, args.dropout, args.num_proj, args.batch_size, args.k)
     train_loss_and_states = model.train()
     eval_loss_and_states = model.eval()
+    sampler = LogUniformSampler(ntokens, args.k)
 
     # training module
     data_names, label_names = ['data', 'mask'], ['label']
@@ -83,7 +86,7 @@ if __name__ == '__main__':
         module.set_states(value=0)
         state_cache = module.get_states(merge_multi_context=False)[:-num_sample_names]
         next_batch = train_data.next()
-        next_sampled_values = generate_samples(next_batch.label[0], ngpus, args.k, ntokens)
+        next_sampled_values = generate_samples(next_batch.label[0], ngpus, sampler)
         stop_iter = False
         while not stop_iter:
             batch = next_batch
@@ -102,8 +105,7 @@ if __name__ == '__main__':
             try:
                 # prefetch the next batch of data and samples
                 next_batch = train_data.next()
-                next_sampled_values = generate_samples(next_batch.label[0], ngpus,
-                                                       args.k, ntokens)
+                next_sampled_values = generate_samples(next_batch.label[0], ngpus, sampler)
             except StopIteration:
                 stop_iter = True
             # cache LSTMP states of the current batch
@@ -132,21 +134,29 @@ if __name__ == '__main__':
             nbatch += 1
 
         # run evaluation with full softmax on cpu
-        module.save_checkpoint(args.checkpoint_dir, epoch, save_optimizer_states=False)
-        cpu_train_mod = CustomModule.load(args.checkpoint_dir, epoch, context=mx.cpu(),
-                                          state_names=train_state_names,
-                                          data_names=data_names, label_names=label_names)
+        if not os.path.exists(args.checkpoint_dir):
+            os.mkdir(args.checkpoint_dir)
+        ckp = os.path.join(args.checkpoint_dir, 'ckp')
+        module.save_checkpoint(ckp, epoch, save_optimizer_states=False)
+
+        # use batch_size = 1 for testing
+        eval_batch_size = 1
+        load_model = Model(ntokens, rescale_loss, args.bptt, args.emsize, args.nhid,
+                           args.nlayers, args.dropout, args.num_proj, eval_batch_size, args.k)
+        cpu_train_mod = CustomModule.load(ckp, epoch, context=mx.cpu(),
+                                          state_names=train_state_names, data_names=data_names,
+                                          label_names=label_names, symbol=load_model.train())
         # eval data iter
         eval_data = mx.io.PrefetchingIter(MultiSentenceIter(args.test, vocab,
-                                          args.batch_size, args.bptt))
+                                          eval_batch_size, args.bptt))
         cpu_train_mod.bind(data_shapes=eval_data.provide_data, label_shapes=eval_data.provide_label)
         # eval module
-        eval_module = CustomModule(symbol=eval_loss_and_states, context=mx.cpu(), data_names=data_names,
+        eval_module = CustomModule(symbol=load_model.eval(), context=mx.cpu(), data_names=data_names,
                                    label_names=label_names, state_names=eval_state_names)
         # use `shared_module` to share parameters with the training module
         eval_module.bind(data_shapes=eval_data.provide_data, label_shapes=eval_data.provide_label,
                          shared_module=cpu_train_mod, for_training=False)
-        val_L = run_utils.evaluate(eval_module, eval_data, epoch, 20)
+        val_L = run_utils.evaluate(eval_module, eval_data, epoch, 1000)
         train_data.reset()
 
     logging.info("Training completed. ")
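The training loop in train.py overlaps candidate generation for the next batch with computation on the current one. The shape of that pattern, condensed for exposition (names as in train.py; the forward/backward body is elided):

```
# Condensed sketch of train.py's prefetch loop (not a literal excerpt).
next_batch = train_data.next()
next_sampled_values = generate_samples(next_batch.label[0], ngpus, sampler)
stop_iter = False
while not stop_iter:
    batch, sampled_values = next_batch, next_sampled_values
    try:
        # prefetch the next batch of data and samples
        next_batch = train_data.next()
        next_sampled_values = generate_samples(next_batch.label[0], ngpus, sampler)
    except StopIteration:
        stop_iter = True
    # ... forward/backward on `batch` with `sampled_values` ...
```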