http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.2 ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.2 b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.2 deleted file mode 100644 index 58d28a0..0000000 Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.2 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.3 ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.3 b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.3 deleted file mode 100644 index 1a63afe..0000000 Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.3 and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.kenlm_intermediate ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.kenlm_intermediate b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.kenlm_intermediate deleted file mode 100644 index fe82667..0000000 --- a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.kenlm_intermediate +++ /dev/null @@ -1,3 +0,0 @@ -KenLM intermediate binary file -Counts 6 7 6 -Payload pb http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.vocab ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.vocab b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.vocab deleted file mode 100644 index 763b2af..0000000 Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.vocab and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/tune_instance_test.cc b/ext/kenlm/lm/interpolate/tune_instance_test.cc deleted file mode 100644 index a0db59c..0000000 --- a/ext/kenlm/lm/interpolate/tune_instance_test.cc +++ /dev/null @@ -1,93 +0,0 @@ -#include "lm/interpolate/tune_instance.hh" - -#include "util/file_stream.hh" -#include "util/file.hh" -#include "util/string_piece.hh" - -#define BOOST_TEST_MODULE InstanceTest -#include <boost/test/unit_test.hpp> - -#include <iostream> - -#include <vector> - -namespace lm { namespace interpolate { namespace { - -Matrix::Index FindRow(const std::vector<WordIndex> &words, WordIndex word) { - std::vector<WordIndex>::const_iterator it = std::find(words.begin(), words.end(), word); - BOOST_REQUIRE(it != words.end()); - return it - words.begin(); -} - -BOOST_AUTO_TEST_CASE(Toy) { - util::scoped_fd test_input(util::MakeTemp("temporary")); - { - util::FileStream(test_input.get()) << "c\n"; - } - - StringPiece dir("tune_instance_data/"); - if (boost::unit_test::framework::master_test_suite().argc == 2) { - StringPiece zero_file(boost::unit_test::framework::master_test_suite().argv[1]); - BOOST_REQUIRE(zero_file.size() > strlen("toy0.1")); - BOOST_REQUIRE_EQUAL("toy0.1", StringPiece(zero_file.data() + zero_file.size() - 6, 6)); - dir = StringPiece(zero_file.data(), zero_file.size() - 6); - } - - std::vector<StringPiece> model_names; - std::string full0 = std::string(dir.data(), dir.size()) + "toy0"; - std::string full1 = std::string(dir.data(), dir.size()) + "toy1"; - model_names.push_back(full0); - model_names.push_back(full1); - - util::FixedArray<Instance> instances; - Matrix ln_unigrams; - // Returns vocab id of <s> - BOOST_CHECK_EQUAL(1, LoadInstances(test_input.release(), model_names, instances, ln_unigrams)); - // <unk> - BOOST_CHECK_CLOSE(-0.90309 * M_LN10, ln_unigrams(0, 0), 0.001); - BOOST_CHECK_CLOSE(-1 * M_LN10, ln_unigrams(0, 1), 0.001); - // <s> - BOOST_CHECK_GT(-98.0, ln_unigrams(1, 0)); - BOOST_CHECK_GT(-98.0, ln_unigrams(1, 1)); - // a - BOOST_CHECK_CLOSE(-0.46943438 * M_LN10, ln_unigrams(2, 0), 0.001); - BOOST_CHECK_CLOSE(-0.6146491 * M_LN10, ln_unigrams(2, 1), 0.001); - // </s> - BOOST_CHECK_CLOSE(-0.5720968 * M_LN10, ln_unigrams(3, 0), 0.001); - BOOST_CHECK_CLOSE(-0.6146491 * M_LN10, ln_unigrams(3, 1), 0.001); - // c - BOOST_CHECK_CLOSE(-0.90309 * M_LN10, ln_unigrams(4, 0), 0.001); // <unk> - BOOST_CHECK_CLOSE(-0.7659168 * M_LN10, ln_unigrams(4, 1), 0.001); - // too lazy to do b. - - // Two instances: - // <s> predicts c - // <s> c predicts </s> - BOOST_REQUIRE_EQUAL(2, instances.size()); - BOOST_CHECK_CLOSE(-0.30103 * M_LN10, instances[0].ln_backoff(0), 0.001); - BOOST_CHECK_CLOSE(-0.30103 * M_LN10, instances[0].ln_backoff(1), 0.001); - - // Backoffs of <s> c - BOOST_CHECK_CLOSE(0.0, instances[1].ln_backoff(0), 0.001); - BOOST_CHECK_CLOSE((-0.30103 - 0.30103) * M_LN10, instances[1].ln_backoff(1), 0.001); - - // Three extensions: a, b, c - BOOST_REQUIRE_EQUAL(3, instances[0].ln_extensions.rows()); - BOOST_REQUIRE_EQUAL(3, instances[0].extension_words.size()); - - // <s> a - BOOST_CHECK_CLOSE(-0.37712017 * M_LN10, instances[0].ln_extensions(FindRow(instances[0].extension_words, 2), 0), 0.001); - // <s> c - BOOST_CHECK_CLOSE((-0.90309 + -0.30103) * M_LN10, instances[0].ln_extensions(FindRow(instances[0].extension_words, 4), 0), 0.001); - BOOST_CHECK_CLOSE(-0.4740302 * M_LN10, instances[0].ln_extensions(FindRow(instances[0].extension_words, 4), 1), 0.001); - - // <s> c </s> - BOOST_CHECK_CLOSE(-0.09113217 * M_LN10, instances[1].ln_extensions(FindRow(instances[1].extension_words, 3), 1), 0.001); - - // p_0(c | <s>) = p_0(c)b_0(<s>) = 10^(-0.90309 + -0.30103) - BOOST_CHECK_CLOSE((-0.90309 + -0.30103) * M_LN10, instances[0].ln_correct(0), 0.001); - // p_1(c | <s>) = 10^-0.4740302 - BOOST_CHECK_CLOSE(-0.4740302 * M_LN10, instances[0].ln_correct(1), 0.001); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/tune_main.cc b/ext/kenlm/lm/interpolate/tune_main.cc deleted file mode 100644 index 8296af1..0000000 --- a/ext/kenlm/lm/interpolate/tune_main.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "lm/interpolate/tune_derivatives.hh" -#include "lm/interpolate/tune_instance.hh" -#include "util/file.hh" - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-local-typedefs" -#include <Eigen/Dense> -#pragma GCC diagnostic pop -#include <boost/program_options.hpp> - -#include <cmath> -#include <iostream> - -namespace lm { namespace interpolate { -void TuneWeights(int tune_file, const std::vector<StringPiece> &model_names, Vector &weights) { - util::FixedArray<Instance> instances; - Matrix ln_unigrams; - WordIndex bos = LoadInstances(tune_file, model_names, instances, ln_unigrams); - ComputeDerivative derive(instances, ln_unigrams, bos); - weights = Vector::Constant(model_names.size(), 1.0 / model_names.size()); - Vector gradient; - Matrix hessian; - for (std::size_t iteration = 0; iteration < 10 /*TODO fancy stopping criteria */; ++iteration) { - std::cerr << "Iteration " << iteration << ": weights ="; - for (Vector::Index i = 0; i < weights.rows(); ++i) { - std::cerr << ' ' << weights(i); - } - std::cerr << std::endl; - std::cerr << "Perplexity = " << - derive.Iteration(weights, gradient, hessian) - << std::endl; - // TODO: 1.0 step size was too big and it kept getting unstable. More math. - weights -= 0.7 * hessian.inverse() * gradient; - } -} -}} // namespaces - -int main(int argc, char *argv[]) { - Eigen::initParallel(); - namespace po = boost::program_options; - // TODO help - po::options_description options("Tuning options"); - std::string tuning_file; - std::vector<std::string> input_models; - options.add_options() - ("tuning,t", po::value<std::string>(&tuning_file)->required(), "File to tune on. This should be a text file with one sentence per line.") - ("model,m", po::value<std::vector<std::string> >(&input_models)->multitoken()->required(), "Models to interpolate"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - po::notify(vm); - - std::vector<StringPiece> model_names; - for (std::vector<std::string>::const_iterator i = input_models.begin(); i != input_models.end(); ++i) { - model_names.push_back(*i); - } - lm::interpolate::Vector weights; - lm::interpolate::TuneWeights(util::OpenReadOrThrow(tuning_file.c_str()), model_names, weights); - std::cout << weights.transpose() << std::endl; -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_matrix.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/tune_matrix.hh b/ext/kenlm/lm/interpolate/tune_matrix.hh deleted file mode 100644 index 7f1a0c9..0000000 --- a/ext/kenlm/lm/interpolate/tune_matrix.hh +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef LM_INTERPOLATE_TUNE_MATRIX_H -#define LM_INTERPOLATE_TUNE_MATRIX_H - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-local-typedefs" -#include <Eigen/Core> -#pragma GCC diagnostic pop - -namespace lm { namespace interpolate { - -typedef Eigen::MatrixXd Matrix; -typedef Eigen::VectorXd Vector; - -typedef Matrix::Scalar Accum; - -}} // namespaces -#endif // LM_INTERPOLATE_TUNE_MATRIX_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/universal_vocab.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/universal_vocab.cc b/ext/kenlm/lm/interpolate/universal_vocab.cc deleted file mode 100644 index 5cdf41e..0000000 --- a/ext/kenlm/lm/interpolate/universal_vocab.cc +++ /dev/null @@ -1,13 +0,0 @@ -#include "lm/interpolate/universal_vocab.hh" - -namespace lm { -namespace interpolate { - -UniversalVocab::UniversalVocab(const std::vector<WordIndex>& model_vocab_sizes) { - model_index_map_.resize(model_vocab_sizes.size()); - for (size_t i = 0; i < model_vocab_sizes.size(); ++i) { - model_index_map_[i].resize(model_vocab_sizes[i]); - } -} - -}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/universal_vocab.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/universal_vocab.hh b/ext/kenlm/lm/interpolate/universal_vocab.hh deleted file mode 100644 index c720298..0000000 --- a/ext/kenlm/lm/interpolate/universal_vocab.hh +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef LM_INTERPOLATE_UNIVERSAL_VOCAB_H -#define LM_INTERPOLATE_UNIVERSAL_VOCAB_H - -#include "lm/word_index.hh" - -#include <vector> -#include <cstddef> - -namespace lm { -namespace interpolate { - -class UniversalVocab { -public: - explicit UniversalVocab(const std::vector<WordIndex>& model_vocab_sizes); - - // GetUniversalIndex takes the model number and index for the specific - // model and returns the universal model number - WordIndex GetUniversalIdx(std::size_t model_num, WordIndex model_word_index) const { - return model_index_map_[model_num][model_word_index]; - } - - const WordIndex *Mapping(std::size_t model) const { - return &*model_index_map_[model].begin(); - } - - void InsertUniversalIdx(std::size_t model_num, WordIndex word_index, - WordIndex universal_word_index) { - model_index_map_[model_num][word_index] = universal_word_index; - } - -private: - std::vector<std::vector<WordIndex> > model_index_map_; -}; - -} // namespace interpolate -} // namespace lm - -#endif // LM_INTERPOLATE_UNIVERSAL_VOCAB_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/kenlm_benchmark_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/kenlm_benchmark_main.cc b/ext/kenlm/lm/kenlm_benchmark_main.cc deleted file mode 100644 index c9ee165..0000000 --- a/ext/kenlm/lm/kenlm_benchmark_main.cc +++ /dev/null @@ -1,142 +0,0 @@ -#include "lm/model.hh" -#include "util/file_stream.hh" -#include "util/file.hh" -#include "util/file_piece.hh" -#include "util/usage.hh" - -#include <stdint.h> - -namespace { - -template <class Model, class Width> void ConvertToBytes(const Model &model, int fd_in) { - util::FilePiece in(fd_in); - util::FileStream out(1); - Width width; - StringPiece word; - const Width end_sentence = (Width)model.GetVocabulary().EndSentence(); - while (true) { - while (in.ReadWordSameLine(word)) { - width = (Width)model.GetVocabulary().Index(word); - out.write(&width, sizeof(Width)); - } - if (!in.ReadLineOrEOF(word)) break; - out.write(&end_sentence, sizeof(Width)); - } -} - -template <class Model, class Width> void QueryFromBytes(const Model &model, int fd_in) { - lm::ngram::State state[3]; - const lm::ngram::State *const begin_state = &model.BeginSentenceState(); - const lm::ngram::State *next_state = begin_state; - Width kEOS = model.GetVocabulary().EndSentence(); - Width buf[4096]; - - uint64_t completed = 0; - double loaded = util::CPUTime(); - - std::cout << "CPU_to_load: " << loaded << std::endl; - - // Numerical precision: batch sums. - double total = 0.0; - while (std::size_t got = util::ReadOrEOF(fd_in, buf, sizeof(buf))) { - float sum = 0.0; - UTIL_THROW_IF2(got % sizeof(Width), "File size not a multiple of vocab id size " << sizeof(Width)); - got /= sizeof(Width); - completed += got; - // Do even stuff first. - const Width *even_end = buf + (got & ~1); - // Alternating states - const Width *i; - for (i = buf; i != even_end;) { - sum += model.FullScore(*next_state, *i, state[1]).prob; - next_state = (*i++ == kEOS) ? begin_state : &state[1]; - sum += model.FullScore(*next_state, *i, state[0]).prob; - next_state = (*i++ == kEOS) ? begin_state : &state[0]; - } - // Odd corner case. - if (got & 1) { - sum += model.FullScore(*next_state, *i, state[2]).prob; - next_state = (*i++ == kEOS) ? begin_state : &state[2]; - } - total += sum; - } - double after = util::CPUTime(); - std::cerr << "Probability sum is " << total << std::endl; - std::cout << "Queries: " << completed << std::endl; - std::cout << "CPU_excluding_load: " << (after - loaded) << "\nCPU_per_query: " << ((after - loaded) / static_cast<double>(completed)) << std::endl; - std::cout << "RSSMax: " << util::RSSMax() << std::endl; -} - -template <class Model, class Width> void DispatchFunction(const Model &model, bool query) { - if (query) { - QueryFromBytes<Model, Width>(model, 0); - } else { - ConvertToBytes<Model, Width>(model, 0); - } -} - -template <class Model> void DispatchWidth(const char *file, bool query) { - lm::ngram::Config config; - config.load_method = util::READ; - std::cerr << "Using load_method = READ." << std::endl; - Model model(file, config); - lm::WordIndex bound = model.GetVocabulary().Bound(); - if (bound <= 256) { - DispatchFunction<Model, uint8_t>(model, query); - } else if (bound <= 65536) { - DispatchFunction<Model, uint16_t>(model, query); - } else if (bound <= (1ULL << 32)) { - DispatchFunction<Model, uint32_t>(model, query); - } else { - DispatchFunction<Model, uint64_t>(model, query); - } -} - -void Dispatch(const char *file, bool query) { - using namespace lm::ngram; - lm::ngram::ModelType model_type; - if (lm::ngram::RecognizeBinary(file, model_type)) { - switch(model_type) { - case PROBING: - DispatchWidth<lm::ngram::ProbingModel>(file, query); - break; - case REST_PROBING: - DispatchWidth<lm::ngram::RestProbingModel>(file, query); - break; - case TRIE: - DispatchWidth<lm::ngram::TrieModel>(file, query); - break; - case QUANT_TRIE: - DispatchWidth<lm::ngram::QuantTrieModel>(file, query); - break; - case ARRAY_TRIE: - DispatchWidth<lm::ngram::ArrayTrieModel>(file, query); - break; - case QUANT_ARRAY_TRIE: - DispatchWidth<lm::ngram::QuantArrayTrieModel>(file, query); - break; - default: - UTIL_THROW(util::Exception, "Unrecognized kenlm model type " << model_type); - } - } else { - UTIL_THROW(util::Exception, "Binarize before running benchmarks."); - } -} - -} // namespace - -int main(int argc, char *argv[]) { - if (argc != 3 || (strcmp(argv[1], "vocab") && strcmp(argv[1], "query"))) { - std::cerr - << "Benchmark program for KenLM. Intended usage:\n" - << "#Convert text to vocabulary ids offline. These ids are tied to a model.\n" - << argv[0] << " vocab $model <$text >$text.vocab\n" - << "#Ensure files are in RAM.\n" - << "cat $text.vocab $model >/dev/null\n" - << "#Timed query against the model.\n" - << argv[0] << " query $model <$text.vocab\n"; - return 1; - } - Dispatch(argv[2], !strcmp(argv[1], "query")); - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/left.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/left.hh b/ext/kenlm/lm/left.hh deleted file mode 100644 index 4d49686..0000000 --- a/ext/kenlm/lm/left.hh +++ /dev/null @@ -1,216 +0,0 @@ -/* Efficient left and right language model state for sentence fragments. - * Intended usage: - * Store ChartState with every chart entry. - * To do a rule application: - * 1. Make a ChartState object for your new entry. - * 2. Construct RuleScore. - * 3. Going from left to right, call Terminal or NonTerminal. - * For terminals, just pass the vocab id. - * For non-terminals, pass that non-terminal's ChartState. - * If your decoder expects scores inclusive of subtree scores (i.e. you - * label entries with the highest-scoring path), pass the non-terminal's - * score as prob. - * If your decoder expects relative scores and will walk the chart later, - * pass prob = 0.0. - * In other words, the only effect of prob is that it gets added to the - * returned log probability. - * 4. Call Finish. It returns the log probability. - * - * There's a couple more details: - * Do not pass <s> to Terminal as it is formally not a word in the sentence, - * only context. Instead, call BeginSentence. If called, it should be the - * first call after RuleScore is constructed (since <s> is always the - * leftmost). - * - * If the leftmost RHS is a non-terminal, it's faster to call BeginNonTerminal. - * - * Hashing and sorting comparison operators are provided. All state objects - * are POD. If you intend to use memcmp on raw state objects, you must call - * ZeroRemaining first, as the value of array entries beyond length is - * otherwise undefined. - * - * Usage is of course not limited to chart decoding. Anything that generates - * sentence fragments missing left context could benefit. For example, a - * phrase-based decoder could pre-score phrases, storing ChartState with each - * phrase, even if hypotheses are generated left-to-right. - */ - -#ifndef LM_LEFT_H -#define LM_LEFT_H - -#include "lm/max_order.hh" -#include "lm/state.hh" -#include "lm/return.hh" - -#include "util/murmur_hash.hh" - -#include <algorithm> - -namespace lm { -namespace ngram { - -template <class M> class RuleScore { - public: - explicit RuleScore(const M &model, ChartState &out) : model_(model), out_(&out), left_done_(false), prob_(0.0) { - out.left.length = 0; - out.right.length = 0; - } - - void BeginSentence() { - out_->right = model_.BeginSentenceState(); - // out_->left is empty. - left_done_ = true; - } - - void Terminal(WordIndex word) { - State copy(out_->right); - FullScoreReturn ret(model_.FullScore(copy, word, out_->right)); - if (left_done_) { prob_ += ret.prob; return; } - if (ret.independent_left) { - prob_ += ret.prob; - left_done_ = true; - return; - } - out_->left.pointers[out_->left.length++] = ret.extend_left; - prob_ += ret.rest; - if (out_->right.length != copy.length + 1) - left_done_ = true; - } - - // Faster version of NonTerminal for the case where the rule begins with a non-terminal. - void BeginNonTerminal(const ChartState &in, float prob = 0.0) { - prob_ = prob; - *out_ = in; - left_done_ = in.left.full; - } - - void NonTerminal(const ChartState &in, float prob = 0.0) { - prob_ += prob; - - if (!in.left.length) { - if (in.left.full) { - for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i; - left_done_ = true; - out_->right = in.right; - } - return; - } - - if (!out_->right.length) { - out_->right = in.right; - if (left_done_) { - prob_ += model_.UnRest(in.left.pointers, in.left.pointers + in.left.length, 1); - return; - } - if (out_->left.length) { - left_done_ = true; - } else { - out_->left = in.left; - left_done_ = in.left.full; - } - return; - } - - float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1]; - float *back = backoffs, *back2 = backoffs2; - unsigned char next_use = out_->right.length; - - // First word - if (ExtendLeft(in, next_use, 1, out_->right.backoff, back)) return; - - // Words after the first, so extending a bigram to begin with - for (unsigned char extend_length = 2; extend_length <= in.left.length; ++extend_length) { - if (ExtendLeft(in, next_use, extend_length, back, back2)) return; - std::swap(back, back2); - } - - if (in.left.full) { - for (const float *i = back; i != back + next_use; ++i) prob_ += *i; - left_done_ = true; - out_->right = in.right; - return; - } - - // Right state was minimized, so it's already independent of the new words to the left. - if (in.right.length < in.left.length) { - out_->right = in.right; - return; - } - - // Shift exisiting words down. - for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) { - *(i + in.right.length) = *i; - } - // Add words from in.right. - std::copy(in.right.words, in.right.words + in.right.length, out_->right.words); - // Assemble backoff composed on the existing state's backoff followed by the new state's backoff. - std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff); - std::copy(back, back + next_use, out_->right.backoff + in.right.length); - out_->right.length = in.right.length + next_use; - } - - float Finish() { - // A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram. - out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1); - return prob_; - } - - void Reset() { - prob_ = 0.0; - left_done_ = false; - out_->left.length = 0; - out_->right.length = 0; - } - void Reset(ChartState &replacement) { - out_ = &replacement; - Reset(); - } - - private: - bool ExtendLeft(const ChartState &in, unsigned char &next_use, unsigned char extend_length, const float *back_in, float *back_out) { - ProcessRet(model_.ExtendLeft( - out_->right.words, out_->right.words + next_use, // Words to extend into - back_in, // Backoffs to use - in.left.pointers[extend_length - 1], extend_length, // Words to be extended - back_out, // Backoffs for the next score - next_use)); // Length of n-gram to use in next scoring. - if (next_use != out_->right.length) { - left_done_ = true; - if (!next_use) { - // Early exit. - out_->right = in.right; - prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1); - return true; - } - } - // Continue scoring. - return false; - } - - void ProcessRet(const FullScoreReturn &ret) { - if (left_done_) { - prob_ += ret.prob; - return; - } - if (ret.independent_left) { - prob_ += ret.prob; - left_done_ = true; - return; - } - out_->left.pointers[out_->left.length++] = ret.extend_left; - prob_ += ret.rest; - } - - const M &model_; - - ChartState *out_; - - bool left_done_; - - float prob_; -}; - -} // namespace ngram -} // namespace lm - -#endif // LM_LEFT_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/left_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/left_test.cc b/ext/kenlm/lm/left_test.cc deleted file mode 100644 index fdb6416..0000000 --- a/ext/kenlm/lm/left_test.cc +++ /dev/null @@ -1,397 +0,0 @@ -#include "lm/left.hh" -#include "lm/model.hh" - -#include "util/tokenize_piece.hh" - -#include <vector> - -#define BOOST_TEST_MODULE LeftTest -#include <boost/test/unit_test.hpp> -#include <boost/test/floating_point_comparison.hpp> - -namespace lm { -namespace ngram { -namespace { - -#define Term(word) score.Terminal(m.GetVocabulary().Index(word)); -#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value); - -// Apparently some Boost versions use templates and are pretty strict about types matching. -#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol)); - -template <class M> void Short(const M &m) { - ChartState base; - { - RuleScore<M> score(m, base); - Term("more"); - Term("loin"); - SLOPPY_CHECK_CLOSE(-1.206319 - 0.3561665, score.Finish(), 0.001); - } - BOOST_CHECK(base.left.full); - BOOST_CHECK_EQUAL(2, base.left.length); - BOOST_CHECK_EQUAL(1, base.right.length); - VCheck("loin", base.right.words[0]); - - ChartState more_left; - { - RuleScore<M> score(m, more_left); - Term("little"); - score.NonTerminal(base, -1.206319 - 0.3561665); - // p(little more loin | null context) - SLOPPY_CHECK_CLOSE(-1.56538, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(3, more_left.left.length); - BOOST_CHECK_EQUAL(1, more_left.right.length); - VCheck("loin", more_left.right.words[0]); - BOOST_CHECK(more_left.left.full); - - ChartState shorter; - { - RuleScore<M> score(m, shorter); - Term("to"); - score.NonTerminal(base, -1.206319 - 0.3561665); - SLOPPY_CHECK_CLOSE(-0.30103 - 1.687872 - 1.206319 - 0.3561665, score.Finish(), 0.01); - } - BOOST_CHECK_EQUAL(1, shorter.left.length); - BOOST_CHECK_EQUAL(1, shorter.right.length); - VCheck("loin", shorter.right.words[0]); - BOOST_CHECK(shorter.left.full); -} - -template <class M> void Charge(const M &m) { - ChartState base; - { - RuleScore<M> score(m, base); - Term("on"); - Term("more"); - SLOPPY_CHECK_CLOSE(-1.509559 -0.4771212 -1.206319, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(1, base.left.length); - BOOST_CHECK_EQUAL(1, base.right.length); - VCheck("more", base.right.words[0]); - BOOST_CHECK(base.left.full); - - ChartState extend; - { - RuleScore<M> score(m, extend); - Term("looking"); - score.NonTerminal(base, -1.509559 -0.4771212 -1.206319); - SLOPPY_CHECK_CLOSE(-3.91039, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(2, extend.left.length); - BOOST_CHECK_EQUAL(1, extend.right.length); - VCheck("more", extend.right.words[0]); - BOOST_CHECK(extend.left.full); - - ChartState tobos; - { - RuleScore<M> score(m, tobos); - score.BeginSentence(); - score.NonTerminal(extend, -3.91039); - SLOPPY_CHECK_CLOSE(-3.471169, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(0, tobos.left.length); - BOOST_CHECK_EQUAL(1, tobos.right.length); -} - -template <class M> float LeftToRight(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) { - float ret = 0.0; - State right = begin_sentence ? m.BeginSentenceState() : m.NullContextState(); - for (std::vector<WordIndex>::const_iterator i = words.begin(); i != words.end(); ++i) { - State copy(right); - ret += m.Score(copy, *i, right); - } - return ret; -} - -template <class M> float RightToLeft(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) { - float ret = 0.0; - ChartState state; - state.left.length = 0; - state.right.length = 0; - state.left.full = false; - for (std::vector<WordIndex>::const_reverse_iterator i = words.rbegin(); i != words.rend(); ++i) { - ChartState copy(state); - RuleScore<M> score(m, state); - score.Terminal(*i); - score.NonTerminal(copy, ret); - ret = score.Finish(); - } - if (begin_sentence) { - ChartState copy(state); - RuleScore<M> score(m, state); - score.BeginSentence(); - score.NonTerminal(copy, ret); - ret = score.Finish(); - } - return ret; -} - -template <class M> float TreeMiddle(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) { - std::vector<std::pair<ChartState, float> > states(words.size()); - for (unsigned int i = 0; i < words.size(); ++i) { - RuleScore<M> score(m, states[i].first); - score.Terminal(words[i]); - states[i].second = score.Finish(); - } - while (states.size() > 1) { - std::vector<std::pair<ChartState, float> > upper((states.size() + 1) / 2); - for (unsigned int i = 0; i < states.size() / 2; ++i) { - RuleScore<M> score(m, upper[i].first); - score.NonTerminal(states[i*2].first, states[i*2].second); - score.NonTerminal(states[i*2+1].first, states[i*2+1].second); - upper[i].second = score.Finish(); - } - if (states.size() % 2) { - upper.back() = states.back(); - } - std::swap(states, upper); - } - - if (states.empty()) return 0.0; - - if (begin_sentence) { - ChartState ignored; - RuleScore<M> score(m, ignored); - score.BeginSentence(); - score.NonTerminal(states.front().first, states.front().second); - return score.Finish(); - } else { - return states.front().second; - } - -} - -template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vector<WordIndex> &out) { - out.clear(); - for (util::TokenIter<util::SingleCharacter, true> i(str, ' '); i; ++i) { - out.push_back(m.GetVocabulary().Index(*i)); - } -} - -#define TEXT_TEST(str) \ - LookupVocab(m, str, words); \ - expect = LeftToRight(m, words, rest); \ - SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \ - SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \ - -// Build sentences, or parts thereof, from right to left. -template <class M> void GrowBig(const M &m, bool rest = false) { - std::vector<WordIndex> words; - float expect; - TEXT_TEST("in biarritz watching considering looking . on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>"); - TEXT_TEST("on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>"); - TEXT_TEST("on a little more loin also would consider higher to look good"); - TEXT_TEST("more loin also would consider higher to look good"); - TEXT_TEST("more loin also would consider higher to look"); - TEXT_TEST("also would consider higher to look"); - TEXT_TEST("also would consider higher"); - TEXT_TEST("would consider higher to look"); - TEXT_TEST("consider higher to look"); - TEXT_TEST("consider higher to"); - TEXT_TEST("consider higher"); -} - -template <class M> void GrowSmall(const M &m, bool rest = false) { - std::vector<WordIndex> words; - float expect; - TEXT_TEST("in biarritz watching considering looking . </s>"); - TEXT_TEST("in biarritz watching considering looking ."); - TEXT_TEST("in biarritz"); -} - -template <class M> void AlsoWouldConsiderHigher(const M &m) { - ChartState also; - { - RuleScore<M> score(m, also); - score.Terminal(m.GetVocabulary().Index("also")); - SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); - } - ChartState would; - { - RuleScore<M> score(m, would); - score.Terminal(m.GetVocabulary().Index("would")); - SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); - } - ChartState combine_also_would; - { - RuleScore<M> score(m, combine_also_would); - score.NonTerminal(also, -1.687872); - score.NonTerminal(would, -1.687872); - SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(2, combine_also_would.right.length); - - ChartState also_would; - { - RuleScore<M> score(m, also_would); - score.Terminal(m.GetVocabulary().Index("also")); - score.Terminal(m.GetVocabulary().Index("would")); - SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(2, also_would.right.length); - - ChartState consider; - { - RuleScore<M> score(m, consider); - score.Terminal(m.GetVocabulary().Index("consider")); - SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(1, consider.left.length); - BOOST_CHECK_EQUAL(1, consider.right.length); - BOOST_CHECK(!consider.left.full); - - ChartState higher; - float higher_score; - { - RuleScore<M> score(m, higher); - score.Terminal(m.GetVocabulary().Index("higher")); - higher_score = score.Finish(); - } - SLOPPY_CHECK_CLOSE(-1.509559, higher_score, 0.001); - BOOST_CHECK_EQUAL(1, higher.left.length); - BOOST_CHECK_EQUAL(1, higher.right.length); - BOOST_CHECK(!higher.left.full); - VCheck("higher", higher.right.words[0]); - SLOPPY_CHECK_CLOSE(-0.30103, higher.right.backoff[0], 0.001); - - ChartState consider_higher; - { - RuleScore<M> score(m, consider_higher); - score.NonTerminal(consider, -1.687872); - score.NonTerminal(higher, higher_score); - SLOPPY_CHECK_CLOSE(-1.509559 - 1.687872 - 0.30103, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(2, consider_higher.left.length); - BOOST_CHECK(!consider_higher.left.full); - - ChartState full; - { - RuleScore<M> score(m, full); - score.NonTerminal(combine_also_would, -1.687872 - 2.0); - score.NonTerminal(consider_higher, -1.509559 - 1.687872 - 0.30103); - SLOPPY_CHECK_CLOSE(-10.6879, score.Finish(), 0.001); - } - BOOST_CHECK_EQUAL(4, full.right.length); -} - -#define CHECK_SCORE(str, val) \ -{ \ - float got = val; \ - std::vector<WordIndex> indices; \ - LookupVocab(m, str, indices); \ - SLOPPY_CHECK_CLOSE(LeftToRight(m, indices), got, 0.001); \ -} - -template <class M> void FullGrow(const M &m) { - std::vector<WordIndex> words; - LookupVocab(m, "in biarritz watching considering looking . </s>", words); - - ChartState lexical[7]; - float lexical_scores[7]; - for (unsigned int i = 0; i < 7; ++i) { - RuleScore<M> score(m, lexical[i]); - score.Terminal(words[i]); - lexical_scores[i] = score.Finish(); - } - CHECK_SCORE("in", lexical_scores[0]); - CHECK_SCORE("biarritz", lexical_scores[1]); - CHECK_SCORE("watching", lexical_scores[2]); - CHECK_SCORE("</s>", lexical_scores[6]); - - ChartState l1[4]; - float l1_scores[4]; - { - RuleScore<M> score(m, l1[0]); - score.NonTerminal(lexical[0], lexical_scores[0]); - score.NonTerminal(lexical[1], lexical_scores[1]); - CHECK_SCORE("in biarritz", l1_scores[0] = score.Finish()); - } - { - RuleScore<M> score(m, l1[1]); - score.NonTerminal(lexical[2], lexical_scores[2]); - score.NonTerminal(lexical[3], lexical_scores[3]); - CHECK_SCORE("watching considering", l1_scores[1] = score.Finish()); - } - { - RuleScore<M> score(m, l1[2]); - score.NonTerminal(lexical[4], lexical_scores[4]); - score.NonTerminal(lexical[5], lexical_scores[5]); - CHECK_SCORE("looking .", l1_scores[2] = score.Finish()); - } - BOOST_CHECK_EQUAL(l1[2].left.length, 1); - l1[3] = lexical[6]; - l1_scores[3] = lexical_scores[6]; - - ChartState l2[2]; - float l2_scores[2]; - { - RuleScore<M> score(m, l2[0]); - score.NonTerminal(l1[0], l1_scores[0]); - score.NonTerminal(l1[1], l1_scores[1]); - CHECK_SCORE("in biarritz watching considering", l2_scores[0] = score.Finish()); - } - { - RuleScore<M> score(m, l2[1]); - score.NonTerminal(l1[2], l1_scores[2]); - score.NonTerminal(l1[3], l1_scores[3]); - CHECK_SCORE("looking . </s>", l2_scores[1] = score.Finish()); - } - BOOST_CHECK_EQUAL(l2[1].left.length, 1); - BOOST_CHECK(l2[1].left.full); - - ChartState top; - { - RuleScore<M> score(m, top); - score.NonTerminal(l2[0], l2_scores[0]); - score.NonTerminal(l2[1], l2_scores[1]); - CHECK_SCORE("in biarritz watching considering looking . </s>", score.Finish()); - } -} - -const char *FileLocation() { - if (boost::unit_test::framework::master_test_suite().argc < 2) { - return "test.arpa"; - } - return boost::unit_test::framework::master_test_suite().argv[1]; -} - -template <class M> void Everything() { - Config config; - config.messages = NULL; - M m(FileLocation(), config); - - Short(m); - Charge(m); - GrowBig(m); - AlsoWouldConsiderHigher(m); - GrowSmall(m); - FullGrow(m); -} - -BOOST_AUTO_TEST_CASE(ProbingAll) { - Everything<Model>(); -} -BOOST_AUTO_TEST_CASE(TrieAll) { - Everything<TrieModel>(); -} -BOOST_AUTO_TEST_CASE(QuantTrieAll) { - Everything<QuantTrieModel>(); -} -BOOST_AUTO_TEST_CASE(ArrayQuantTrieAll) { - Everything<QuantArrayTrieModel>(); -} -BOOST_AUTO_TEST_CASE(ArrayTrieAll) { - Everything<ArrayTrieModel>(); -} - -BOOST_AUTO_TEST_CASE(RestProbing) { - Config config; - config.messages = NULL; - RestProbingModel m(FileLocation(), config); - GrowBig(m, true); -} - -} // namespace -} // namespace ngram -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/lm_exception.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/lm_exception.cc b/ext/kenlm/lm/lm_exception.cc deleted file mode 100644 index 58d468f..0000000 --- a/ext/kenlm/lm/lm_exception.cc +++ /dev/null @@ -1,23 +0,0 @@ -#include "lm/lm_exception.hh" - -#include <cerrno> -#include <cstdio> - -namespace lm { - -ConfigException::ConfigException() throw() {} -ConfigException::~ConfigException() throw() {} - -LoadException::LoadException() throw() {} -LoadException::~LoadException() throw() {} - -FormatLoadException::FormatLoadException() throw() {} -FormatLoadException::~FormatLoadException() throw() {} - -VocabLoadException::VocabLoadException() throw() {} -VocabLoadException::~VocabLoadException() throw() {} - -SpecialWordMissingException::SpecialWordMissingException() throw() {} -SpecialWordMissingException::~SpecialWordMissingException() throw() {} - -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/lm_exception.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/lm_exception.hh b/ext/kenlm/lm/lm_exception.hh deleted file mode 100644 index 85a5738..0000000 --- a/ext/kenlm/lm/lm_exception.hh +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef LM_LM_EXCEPTION_H -#define LM_LM_EXCEPTION_H - -// Named to avoid conflict with util/exception.hh. - -#include "util/exception.hh" -#include "util/string_piece.hh" - -#include <exception> -#include <string> - -namespace lm { - -typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction; - -class ConfigException : public util::Exception { - public: - ConfigException() throw(); - ~ConfigException() throw(); -}; - -class LoadException : public util::Exception { - public: - virtual ~LoadException() throw(); - - protected: - LoadException() throw(); -}; - -class FormatLoadException : public LoadException { - public: - FormatLoadException() throw(); - ~FormatLoadException() throw(); -}; - -class VocabLoadException : public LoadException { - public: - virtual ~VocabLoadException() throw(); - VocabLoadException() throw(); -}; - -class SpecialWordMissingException : public VocabLoadException { - public: - explicit SpecialWordMissingException() throw(); - ~SpecialWordMissingException() throw(); -}; - -} // namespace lm - -#endif // LM_LM_EXCEPTION http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/max_order.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/max_order.hh b/ext/kenlm/lm/max_order.hh deleted file mode 100644 index 0ad1379..0000000 --- a/ext/kenlm/lm/max_order.hh +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef LM_MAX_ORDER_H -#define LM_MAX_ORDER_H -/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM. - * If not, this is the default maximum order. - * Having this limit means that State can be - * (kMaxOrder - 1) * sizeof(float) bytes instead of - * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead - */ -#ifndef KENLM_ORDER_MESSAGE -#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh." -#endif - -#endif // LM_MAX_ORDER_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/model.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/model.cc b/ext/kenlm/lm/model.cc deleted file mode 100644 index a5a16bf..0000000 --- a/ext/kenlm/lm/model.cc +++ /dev/null @@ -1,349 +0,0 @@ -#include "lm/model.hh" - -#include "lm/blank.hh" -#include "lm/lm_exception.hh" -#include "lm/search_hashed.hh" -#include "lm/search_trie.hh" -#include "lm/read_arpa.hh" -#include "util/have.hh" -#include "util/murmur_hash.hh" - -#include <algorithm> -#include <functional> -#include <numeric> -#include <cmath> -#include <limits> - -namespace lm { -namespace ngram { -namespace detail { - -template <class Search, class VocabularyT> const ModelType GenericModel<Search, VocabularyT>::kModelType = Search::kModelType; - -template <class Search, class VocabularyT> uint64_t GenericModel<Search, VocabularyT>::Size(const std::vector<uint64_t> &counts, const Config &config) { - return VocabularyT::Size(counts[0], config) + Search::Size(counts, config); -} - -template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::SetupMemory(void *base, const std::vector<uint64_t> &counts, const Config &config) { - size_t goal_size = util::CheckOverflow(Size(counts, config)); - uint8_t *start = static_cast<uint8_t*>(base); - size_t allocated = VocabularyT::Size(counts[0], config); - vocab_.SetupMemory(start, allocated, counts[0], config); - start += allocated; - start = search_.SetupMemory(start, counts, config); - if (static_cast<std::size_t>(start - static_cast<uint8_t*>(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast<uint8_t*>(base)) << " but Size says they should take " << goal_size); -} - -namespace { -void ComplainAboutARPA(const Config &config, ModelType model_type) { - if (config.write_mmap || !config.messages) return; - if (config.arpa_complain == Config::ALL) { - *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; - } else if (config.arpa_complain == Config::EXPENSIVE && - (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) { - *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." << std::endl; - } -} - -void CheckCounts(const std::vector<uint64_t> &counts) { - UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE); - if (sizeof(uint64_t) > sizeof(std::size_t)) { - for (std::vector<uint64_t>::const_iterator i = counts.begin(); i != counts.end(); ++i) { - UTIL_THROW_IF(*i > static_cast<uint64_t>(std::numeric_limits<size_t>::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines."); - } - } -} - -} // namespace - -template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &init_config) : backing_(init_config) { - util::scoped_fd fd(util::OpenReadOrThrow(file)); - if (IsBinaryFormat(fd.get())) { - Parameters parameters; - int fd_shallow = fd.release(); - backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters); - CheckCounts(parameters.counts); - - Config new_config(init_config); - new_config.probing_multiplier = parameters.fixed.probing_multiplier; - Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config); - UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary."); - - SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config); - vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset()); - } else { - ComplainAboutARPA(init_config, kModelType); - InitializeFromARPA(fd.release(), file, init_config); - } - - // g++ prints warnings unless these are fully initialized. - State begin_sentence = State(); - begin_sentence.length = 1; - begin_sentence.words[0] = vocab_.BeginSentence(); - typename Search::Node ignored_node; - bool ignored_independent_left; - uint64_t ignored_extend_left; - begin_sentence.backoff[0] = search_.LookupUnigram(begin_sentence.words[0], ignored_node, ignored_independent_left, ignored_extend_left).Backoff(); - State null_context = State(); - null_context.length = 0; - P::Init(begin_sentence, null_context, vocab_, search_.Order()); -} - -template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(int fd, const char *file, const Config &config) { - // Backing file is the ARPA. - util::FilePiece f(fd, file, config.ProgressMessages()); - try { - std::vector<uint64_t> counts; - // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. - ReadARPACounts(f, counts); - CheckCounts(counts); - if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); - if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0"); - - std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config)); - // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs. - vocab_.SetupMemory(backing_.SetupJustVocab(vocab_size, counts.size()), vocab_size, counts[0], config); - - if (config.write_mmap && config.include_vocab) { - WriteWordsWrapper wrap(config.enumerate_vocab); - vocab_.ConfigureEnumerate(&wrap, counts[0]); - search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); - void *vocab_rebase, *search_rebase; - backing_.WriteVocabWords(wrap.Buffer(), vocab_rebase, search_rebase); - // Due to writing at the end of file, mmap may have relocated data. So remap. - vocab_.Relocate(vocab_rebase); - search_.SetupMemory(reinterpret_cast<uint8_t*>(search_rebase), counts, config); - } else { - vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]); - search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); - } - - if (!vocab_.SawUnk()) { - assert(config.unknown_missing != THROW_UP); - // Default probabilities for unknown. - search_.UnknownUnigram().backoff = 0.0; - search_.UnknownUnigram().prob = config.unknown_missing_logprob; - } - backing_.FinishFile(config, kModelType, kVersion, counts); - } catch (util::Exception &e) { - e << " Byte: " << f.Offset(); - throw; - } -} - -template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { - FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state); - for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) { - ret.prob += *i; - } - return ret; -} - -template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const { - context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); - FullScoreReturn ret = ScoreExceptBackoff(context_rbegin, context_rend, new_word, out_state); - - // Add the backoff weights for n-grams of order start to (context_rend - context_rbegin). - unsigned char start = ret.ngram_length; - if (context_rend - context_rbegin < static_cast<std::ptrdiff_t>(start)) return ret; - - bool independent_left; - uint64_t extend_left; - typename Search::Node node; - if (start <= 1) { - ret.prob += search_.LookupUnigram(*context_rbegin, node, independent_left, extend_left).Backoff(); - start = 2; - } else if (!search_.FastMakeNode(context_rbegin, context_rbegin + start - 1, node)) { - return ret; - } - // i is the order of the backoff we're looking for. - unsigned char order_minus_2 = start - 2; - for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i, ++order_minus_2) { - typename Search::MiddlePointer p(search_.LookupMiddle(order_minus_2, *i, node, independent_left, extend_left)); - if (!p.Found()) break; - ret.prob += p.Backoff(); - } - return ret; -} - -template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const { - // Generate a state from context. - context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); - if (context_rend == context_rbegin) { - out_state.length = 0; - return; - } - typename Search::Node node; - bool independent_left; - uint64_t extend_left; - out_state.backoff[0] = search_.LookupUnigram(*context_rbegin, node, independent_left, extend_left).Backoff(); - out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0; - float *backoff_out = out_state.backoff + 1; - unsigned char order_minus_2 = 0; - for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++order_minus_2) { - typename Search::MiddlePointer p(search_.LookupMiddle(order_minus_2, *i, node, independent_left, extend_left)); - if (!p.Found()) { - std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words); - return; - } - *backoff_out = p.Backoff(); - if (HasExtension(*backoff_out)) out_state.length = i - context_rbegin + 1; - } - std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words); -} - -template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ExtendLeft( - const WordIndex *add_rbegin, const WordIndex *add_rend, - const float *backoff_in, - uint64_t extend_pointer, - unsigned char extend_length, - float *backoff_out, - unsigned char &next_use) const { - FullScoreReturn ret; - typename Search::Node node; - if (extend_length == 1) { - typename Search::UnigramPointer ptr(search_.LookupUnigram(static_cast<WordIndex>(extend_pointer), node, ret.independent_left, ret.extend_left)); - ret.rest = ptr.Rest(); - ret.prob = ptr.Prob(); - assert(!ret.independent_left); - } else { - typename Search::MiddlePointer ptr(search_.Unpack(extend_pointer, extend_length, node)); - ret.rest = ptr.Rest(); - ret.prob = ptr.Prob(); - ret.extend_left = extend_pointer; - // If this function is called, then it does depend on left words. - ret.independent_left = false; - } - float subtract_me = ret.rest; - ret.ngram_length = extend_length; - next_use = extend_length; - ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret); - next_use -= extend_length; - // Charge backoffs. - for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b; - ret.prob -= subtract_me; - ret.rest -= subtract_me; - return ret; -} - -namespace { -// Do a paraonoid copy of history, assuming new_word has already been copied -// (hence the -1). out_state.length could be zero so I avoided using -// std::copy. -void CopyRemainingHistory(const WordIndex *from, State &out_state) { - WordIndex *out = out_state.words + 1; - const WordIndex *in_end = from + static_cast<ptrdiff_t>(out_state.length) - 1; - for (const WordIndex *in = from; in < in_end; ++in, ++out) *out = *in; -} -} // namespace - -/* Ugly optimized function. Produce a score excluding backoff. - * The search goes in increasing order of ngram length. - * Context goes backward, so context_begin is the word immediately preceeding - * new_word. - */ -template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff( - const WordIndex *const context_rbegin, - const WordIndex *const context_rend, - const WordIndex new_word, - State &out_state) const { - assert(new_word < vocab_.Bound()); - FullScoreReturn ret; - // ret.ngram_length contains the last known non-blank ngram length. - ret.ngram_length = 1; - - typename Search::Node node; - typename Search::UnigramPointer uni(search_.LookupUnigram(new_word, node, ret.independent_left, ret.extend_left)); - out_state.backoff[0] = uni.Backoff(); - ret.prob = uni.Prob(); - ret.rest = uni.Rest(); - - // This is the length of the context that should be used for continuation to the right. - out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0; - // We'll write the word anyway since it will probably be used and does no harm being there. - out_state.words[0] = new_word; - if (context_rbegin == context_rend) return ret; - - ResumeScore(context_rbegin, context_rend, 0, node, out_state.backoff + 1, out_state.length, ret); - CopyRemainingHistory(context_rbegin, out_state); - return ret; -} - -template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::ResumeScore(const WordIndex *hist_iter, const WordIndex *const context_rend, unsigned char order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const { - for (; ; ++order_minus_2, ++hist_iter, ++backoff_out) { - if (hist_iter == context_rend) return; - if (ret.independent_left) return; - if (order_minus_2 == P::Order() - 2) break; - - typename Search::MiddlePointer pointer(search_.LookupMiddle(order_minus_2, *hist_iter, node, ret.independent_left, ret.extend_left)); - if (!pointer.Found()) return; - *backoff_out = pointer.Backoff(); - ret.prob = pointer.Prob(); - ret.rest = pointer.Rest(); - ret.ngram_length = order_minus_2 + 2; - if (HasExtension(*backoff_out)) { - next_use = ret.ngram_length; - } - } - ret.independent_left = true; - typename Search::LongestPointer longest(search_.LookupLongest(*hist_iter, node)); - if (longest.Found()) { - ret.prob = longest.Prob(); - ret.rest = ret.prob; - // There is no blank in longest_. - ret.ngram_length = P::Order(); - } -} - -template <class Search, class VocabularyT> float GenericModel<Search, VocabularyT>::InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const { - float ret; - typename Search::Node node; - if (first_length == 1) { - if (pointers_begin >= pointers_end) return 0.0; - bool independent_left; - uint64_t extend_left; - typename Search::UnigramPointer ptr(search_.LookupUnigram(static_cast<WordIndex>(*pointers_begin), node, independent_left, extend_left)); - ret = ptr.Prob() - ptr.Rest(); - ++first_length; - ++pointers_begin; - } else { - ret = 0.0; - } - for (const uint64_t *i = pointers_begin; i < pointers_end; ++i, ++first_length) { - typename Search::MiddlePointer ptr(search_.Unpack(*i, first_length, node)); - ret += ptr.Prob() - ptr.Rest(); - } - return ret; -} - -template class GenericModel<HashedSearch<BackoffValue>, ProbingVocabulary>; -template class GenericModel<HashedSearch<RestValue>, ProbingVocabulary>; -template class GenericModel<trie::TrieSearch<DontQuantize, trie::DontBhiksha>, SortedVocabulary>; -template class GenericModel<trie::TrieSearch<DontQuantize, trie::ArrayBhiksha>, SortedVocabulary>; -template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::DontBhiksha>, SortedVocabulary>; -template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::ArrayBhiksha>, SortedVocabulary>; - -} // namespace detail - -base::Model *LoadVirtual(const char *file_name, const Config &config, ModelType model_type) { - RecognizeBinary(file_name, model_type); - switch (model_type) { - case PROBING: - return new ProbingModel(file_name, config); - case REST_PROBING: - return new RestProbingModel(file_name, config); - case TRIE: - return new TrieModel(file_name, config); - case QUANT_TRIE: - return new QuantTrieModel(file_name, config); - case ARRAY_TRIE: - return new ArrayTrieModel(file_name, config); - case QUANT_ARRAY_TRIE: - return new QuantArrayTrieModel(file_name, config); - default: - UTIL_THROW(FormatLoadException, "Confused by model type " << model_type); - } -} - -} // namespace ngram -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/model.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/model.hh b/ext/kenlm/lm/model.hh deleted file mode 100644 index b2bbe39..0000000 --- a/ext/kenlm/lm/model.hh +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef LM_MODEL_H -#define LM_MODEL_H - -#include "lm/bhiksha.hh" -#include "lm/binary_format.hh" -#include "lm/config.hh" -#include "lm/facade.hh" -#include "lm/quantize.hh" -#include "lm/search_hashed.hh" -#include "lm/search_trie.hh" -#include "lm/state.hh" -#include "lm/value.hh" -#include "lm/vocab.hh" -#include "lm/weights.hh" - -#include "util/murmur_hash.hh" - -#include <algorithm> -#include <vector> -#include <cstring> - -namespace util { class FilePiece; } - -namespace lm { -namespace ngram { -namespace detail { - -// Should return the same results as SRI. -// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts. -template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> { - private: - typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P; - public: - // This is the model type returned by RecognizeBinary. - static const ModelType kModelType; - - static const unsigned int kVersion = Search::kVersion; - - /* Get the size of memory that will be mapped given ngram counts. This - * does not include small non-mapped control structures, such as this class - * itself. - */ - static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config()); - - /* Load the model from a file. It may be an ARPA or binary file. Binary - * files must have the format expected by this class or you'll get an - * exception. So TrieModel can only load ARPA or binary created by - * TrieModel. To classify binary files, call RecognizeBinary in - * lm/binary_format.hh. - */ - explicit GenericModel(const char *file, const Config &config = Config()); - - /* Score p(new_word | in_state) and incorporate new_word into out_state. - * Note that in_state and out_state must be different references: - * &in_state != &out_state. - */ - FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const; - - /* Slower call without in_state. Try to remember state, but sometimes it - * would cost too much memory or your decoder isn't setup properly. - * To use this function, make an array of WordIndex containing the context - * vocabulary ids in reverse order. Then, pass the bounds of the array: - * [context_rbegin, context_rend). The new_word is not part of the context - * array unless you intend to repeat words. - */ - FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; - - /* Get the state for a context. Don't use this if you can avoid it. Use - * BeginSentenceState or NullContextState and extend from those. If - * you're only going to use this state to call FullScore once, use - * FullScoreForgotState. - * To use this function, make an array of WordIndex containing the context - * vocabulary ids in reverse order. Then, pass the bounds of the array: - * [context_rbegin, context_rend). - */ - void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const; - - /* More efficient version of FullScore where a partial n-gram has already - * been scored. - * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE. - */ - FullScoreReturn ExtendLeft( - // Additional context in reverse order. This will update add_rend to - const WordIndex *add_rbegin, const WordIndex *add_rend, - // Backoff weights to use. - const float *backoff_in, - // extend_left returned by a previous query. - uint64_t extend_pointer, - // Length of n-gram that the pointer corresponds to. - unsigned char extend_length, - // Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)] - float *backoff_out, - // Amount of additional content that should be considered by the next call. - unsigned char &next_use) const; - - /* Return probabilities minus rest costs for an array of pointers. The - * first length should be the length of the n-gram to which pointers_begin - * points. - */ - float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const { - // Compiler should optimize this if away. - return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0; - } - - private: - FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const; - - // Score bigrams and above. Do not include backoff. - void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const; - - // Appears after Size in the cc file. - void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config); - - void InitializeFromARPA(int fd, const char *file, const Config &config); - - float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const; - - BinaryFormat backing_; - - VocabularyT vocab_; - - Search search_; -}; - -} // namespace detail - -// Instead of typedef, inherit. This allows the Model etc to be forward declared. -// Oh the joys of C and C++. -#define LM_COMMA() , -#define LM_NAME_MODEL(name, from)\ -class name : public from {\ - public:\ - name(const char *file, const Config &config = Config()) : from(file, config) {}\ -}; - -LM_NAME_MODEL(ProbingModel, detail::GenericModel<detail::HashedSearch<BackoffValue> LM_COMMA() ProbingVocabulary>); -LM_NAME_MODEL(RestProbingModel, detail::GenericModel<detail::HashedSearch<RestValue> LM_COMMA() ProbingVocabulary>); -LM_NAME_MODEL(TrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>); -LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>); -LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>); -LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>); - -// Default implementation. No real reason for it to be the default. -typedef ::lm::ngram::ProbingVocabulary Vocabulary; -typedef ProbingModel Model; - -/* Autorecognize the file type, load, and return the virtual base class. Don't - * use the virtual base class if you can avoid it. Instead, use the above - * classes as template arguments to your own virtual feature function.*/ -base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING); - -} // namespace ngram -} // namespace lm - -#endif // LM_MODEL_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/model_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/model_test.cc b/ext/kenlm/lm/model_test.cc deleted file mode 100644 index d408d6f..0000000 --- a/ext/kenlm/lm/model_test.cc +++ /dev/null @@ -1,448 +0,0 @@ -#include "lm/model.hh" - -#include <cstdlib> -#include <cstring> - -#define BOOST_TEST_MODULE ModelTest -#include <boost/test/unit_test.hpp> -#include <boost/test/floating_point_comparison.hpp> - -// Apparently some Boost versions use templates and are pretty strict about types matching. -#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol)); - -namespace lm { -namespace ngram { - -std::ostream &operator<<(std::ostream &o, const State &state) { - o << "State length " << static_cast<unsigned int>(state.length) << ':'; - for (const WordIndex *i = state.words; i < state.words + state.length; ++i) { - o << ' ' << *i; - } - return o; -} - -namespace { - -// Stupid bjam reverses the command line arguments randomly. -const char *TestLocation() { - if (boost::unit_test::framework::master_test_suite().argc < 3) { - return "test.arpa"; - } - char **argv = boost::unit_test::framework::master_test_suite().argv; - return argv[strstr(argv[1], "nounk") ? 2 : 1]; -} -const char *TestNoUnkLocation() { - if (boost::unit_test::framework::master_test_suite().argc < 3) { - return "test_nounk.arpa"; - } - char **argv = boost::unit_test::framework::master_test_suite().argv; - return argv[strstr(argv[1], "nounk") ? 1 : 2]; -} - -template <class Model> State GetState(const Model &model, const char *word, const State &in) { - WordIndex context[in.length + 1]; - context[0] = model.GetVocabulary().Index(word); - std::copy(in.words, in.words + in.length, context + 1); - State ret; - model.GetState(context, context + in.length + 1, ret); - return ret; -} - -#define StartTest(word, ngram, score, indep_left) \ - ret = model.FullScore( \ - state, \ - model.GetVocabulary().Index(word), \ - out);\ - SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ - BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); \ - BOOST_CHECK_GE(std::min<unsigned char>(ngram, 5 - 1), out.length); \ - BOOST_CHECK_EQUAL(indep_left, ret.independent_left); \ - BOOST_CHECK_EQUAL(out, GetState(model, word, state)); - -#define AppendTest(word, ngram, score, indep_left) \ - StartTest(word, ngram, score, indep_left) \ - state = out; - -template <class M> void Starters(const M &model) { - FullScoreReturn ret; - Model::State state(model.BeginSentenceState()); - Model::State out; - - StartTest("looking", 2, -0.4846522, true); - - // , probability plus <s> backoff - StartTest(",", 1, -1.383514 + -0.4149733, true); - // <unk> probability plus <s> backoff - StartTest("this_is_not_found", 1, -1.995635 + -0.4149733, true); -} - -template <class M> void Continuation(const M &model) { - FullScoreReturn ret; - Model::State state(model.BeginSentenceState()); - Model::State out; - - AppendTest("looking", 2, -0.484652, true); - AppendTest("on", 3, -0.348837, true); - AppendTest("a", 4, -0.0155266, true); - AppendTest("little", 5, -0.00306122, true); - State preserve = state; - AppendTest("the", 1, -4.04005, true); - AppendTest("biarritz", 1, -1.9889, true); - AppendTest("not_found", 1, -2.29666, true); - AppendTest("more", 1, -1.20632 - 20.0, true); - AppendTest(".", 2, -0.51363, true); - AppendTest("</s>", 3, -0.0191651, true); - BOOST_CHECK_EQUAL(0, state.length); - - state = preserve; - AppendTest("more", 5, -0.00181395, true); - BOOST_CHECK_EQUAL(4, state.length); - AppendTest("loin", 5, -0.0432557, true); - BOOST_CHECK_EQUAL(1, state.length); -} - -template <class M> void Blanks(const M &model) { - FullScoreReturn ret; - State state(model.NullContextState()); - State out; - AppendTest("also", 1, -1.687872, false); - AppendTest("would", 2, -2, true); - AppendTest("consider", 3, -3, true); - State preserve = state; - AppendTest("higher", 4, -4, true); - AppendTest("looking", 5, -5, true); - BOOST_CHECK_EQUAL(1, state.length); - - state = preserve; - // also would consider not_found - AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true); - - state = model.NullContextState(); - // higher looking is a blank. - AppendTest("higher", 1, -1.509559, false); - AppendTest("looking", 2, -1.285941 - 0.30103, false); - - State higher_looking = state; - - BOOST_CHECK_EQUAL(1, state.length); - AppendTest("not_found", 1, -1.995635 - 0.4771212, true); - - state = higher_looking; - // higher looking consider - AppendTest("consider", 1, -1.687872 - 0.4771212, true); - - state = model.NullContextState(); - AppendTest("would", 1, -1.687872, false); - BOOST_CHECK_EQUAL(1, state.length); - AppendTest("consider", 2, -1.687872 -0.30103, false); - BOOST_CHECK_EQUAL(2, state.length); - AppendTest("higher", 3, -1.509559 - 0.30103, false); - BOOST_CHECK_EQUAL(3, state.length); - AppendTest("looking", 4, -1.285941 - 0.30103, false); -} - -template <class M> void Unknowns(const M &model) { - FullScoreReturn ret; - State state(model.NullContextState()); - State out; - - AppendTest("not_found", 1, -1.995635, false); - State preserve = state; - AppendTest("not_found2", 2, -15.0, true); - AppendTest("not_found3", 2, -15.0 - 2.0, true); - - state = preserve; - AppendTest("however", 2, -4, true); - AppendTest("not_found3", 3, -6, true); -} - -template <class M> void MinimalState(const M &model) { - FullScoreReturn ret; - State state(model.NullContextState()); - State out; - - AppendTest("baz", 1, -6.535897, true); - BOOST_CHECK_EQUAL(0, state.length); - state = model.NullContextState(); - AppendTest("foo", 1, -3.141592, true); - BOOST_CHECK_EQUAL(1, state.length); - AppendTest("bar", 2, -6.0, true); - // Has to include the backoff weight. - BOOST_CHECK_EQUAL(1, state.length); - AppendTest("bar", 1, -2.718281 + 3.0, true); - BOOST_CHECK_EQUAL(1, state.length); - - state = model.NullContextState(); - AppendTest("to", 1, -1.687872, false); - AppendTest("look", 2, -0.2922095, true); - BOOST_CHECK_EQUAL(2, state.length); - AppendTest("a", 3, -7, true); -} - -template <class M> void ExtendLeftTest(const M &model) { - State right; - FullScoreReturn little(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("little"), right)); - const float kLittleProb = -1.285941; - SLOPPY_CHECK_CLOSE(kLittleProb, little.prob, 0.001); - unsigned char next_use; - float backoff_out[4]; - - FullScoreReturn extend_none(model.ExtendLeft(NULL, NULL, NULL, little.extend_left, 1, NULL, next_use)); - BOOST_CHECK_EQUAL(0, next_use); - BOOST_CHECK_EQUAL(little.extend_left, extend_none.extend_left); - SLOPPY_CHECK_CLOSE(little.prob - little.rest, extend_none.prob, 0.001); - BOOST_CHECK_EQUAL(1, extend_none.ngram_length); - - const WordIndex a = model.GetVocabulary().Index("a"); - float backoff_in = 3.14; - // a little - FullScoreReturn extend_a(model.ExtendLeft(&a, &a + 1, &backoff_in, little.extend_left, 1, backoff_out, next_use)); - BOOST_CHECK_EQUAL(1, next_use); - SLOPPY_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001); - SLOPPY_CHECK_CLOSE(-0.09132547 - little.rest, extend_a.prob, 0.001); - BOOST_CHECK_EQUAL(2, extend_a.ngram_length); - BOOST_CHECK(!extend_a.independent_left); - - const WordIndex on = model.GetVocabulary().Index("on"); - FullScoreReturn extend_on(model.ExtendLeft(&on, &on + 1, &backoff_in, extend_a.extend_left, 2, backoff_out, next_use)); - BOOST_CHECK_EQUAL(1, next_use); - SLOPPY_CHECK_CLOSE(-0.4771212, backoff_out[0], 0.001); - SLOPPY_CHECK_CLOSE(-0.0283603 - (extend_a.rest + little.rest), extend_on.prob, 0.001); - BOOST_CHECK_EQUAL(3, extend_on.ngram_length); - BOOST_CHECK(!extend_on.independent_left); - - const WordIndex both[2] = {a, on}; - float backoff_in_arr[4]; - FullScoreReturn extend_both(model.ExtendLeft(both, both + 2, backoff_in_arr, little.extend_left, 1, backoff_out, next_use)); - BOOST_CHECK_EQUAL(2, next_use); - SLOPPY_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001); - SLOPPY_CHECK_CLOSE(-0.4771212, backoff_out[1], 0.001); - SLOPPY_CHECK_CLOSE(-0.0283603 - little.rest, extend_both.prob, 0.001); - BOOST_CHECK_EQUAL(3, extend_both.ngram_length); - BOOST_CHECK(!extend_both.independent_left); - BOOST_CHECK_EQUAL(extend_on.extend_left, extend_both.extend_left); -} - -#define StatelessTest(word, provide, ngram, score) \ - ret = model.FullScoreForgotState(indices + num_words - word, indices + num_words - word + provide, indices[num_words - word - 1], state); \ - SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ - BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); \ - model.GetState(indices + num_words - word, indices + num_words - word + provide, before); \ - ret = model.FullScore(before, indices[num_words - word - 1], out); \ - BOOST_CHECK(state == out); \ - SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ - BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); - -template <class M> void Stateless(const M &model) { - const char *words[] = {"<s>", "looking", "on", "a", "little", "the", "biarritz", "not_found", "more", ".", "</s>"}; - const size_t num_words = sizeof(words) / sizeof(const char*); - // Silience "array subscript is above array bounds" when extracting end pointer. - WordIndex indices[num_words + 1]; - for (unsigned int i = 0; i < num_words; ++i) { - indices[num_words - 1 - i] = model.GetVocabulary().Index(words[i]); - } - FullScoreReturn ret; - State state, out, before; - - ret = model.FullScoreForgotState(indices + num_words - 1, indices + num_words, indices[num_words - 2], state); - SLOPPY_CHECK_CLOSE(-0.484652, ret.prob, 0.001); - StatelessTest(1, 1, 2, -0.484652); - - // looking - StatelessTest(1, 2, 2, -0.484652); - // on - AppendTest("on", 3, -0.348837, true); - StatelessTest(2, 3, 3, -0.348837); - StatelessTest(2, 2, 3, -0.348837); - StatelessTest(2, 1, 2, -0.4638903); - // a - StatelessTest(3, 4, 4, -0.0155266); - // little - AppendTest("little", 5, -0.00306122, true); - StatelessTest(4, 5, 5, -0.00306122); - // the - AppendTest("the", 1, -4.04005, true); - StatelessTest(5, 5, 1, -4.04005); - // No context of the. - StatelessTest(5, 0, 1, -1.687872); - // biarritz - StatelessTest(6, 1, 1, -1.9889); - // not found - StatelessTest(7, 1, 1, -2.29666); - StatelessTest(7, 0, 1, -1.995635); - - WordIndex unk[1]; - unk[0] = 0; - model.GetState(unk, unk + 1, state); - BOOST_CHECK_EQUAL(1, state.length); - BOOST_CHECK_EQUAL(static_cast<WordIndex>(0), state.words[0]); -} - -template <class M> void NoUnkCheck(const M &model) { - WordIndex unk_index = 0; - State state; - - FullScoreReturn ret = model.FullScoreForgotState(&unk_index, &unk_index + 1, unk_index, state); - SLOPPY_CHECK_CLOSE(-100.0, ret.prob, 0.001); -} - -template <class M> void Everything(const M &m) { - Starters(m); - Continuation(m); - Blanks(m); - Unknowns(m); - MinimalState(m); - ExtendLeftTest(m); - Stateless(m); -} - -class ExpectEnumerateVocab : public EnumerateVocab { - public: - ExpectEnumerateVocab() {} - - void Add(WordIndex index, const StringPiece &str) { - BOOST_CHECK_EQUAL(seen.size(), index); - seen.push_back(std::string(str.data(), str.length())); - } - - void Check(const base::Vocabulary &vocab) { - BOOST_CHECK_EQUAL(37ULL, seen.size()); - BOOST_REQUIRE(!seen.empty()); - BOOST_CHECK_EQUAL("<unk>", seen[0]); - for (WordIndex i = 0; i < seen.size(); ++i) { - BOOST_CHECK_EQUAL(i, vocab.Index(seen[i])); - } - } - - void Clear() { - seen.clear(); - } - - std::vector<std::string> seen; -}; - -template <class ModelT> void LoadingTest() { - Config config; - config.arpa_complain = Config::NONE; - config.messages = NULL; - config.probing_multiplier = 2.0; - { - ExpectEnumerateVocab enumerate; - config.enumerate_vocab = &enumerate; - ModelT m(TestLocation(), config); - enumerate.Check(m.GetVocabulary()); - BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); - Everything(m); - } - { - ExpectEnumerateVocab enumerate; - config.enumerate_vocab = &enumerate; - ModelT m(TestNoUnkLocation(), config); - enumerate.Check(m.GetVocabulary()); - BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); - NoUnkCheck(m); - } -} - -BOOST_AUTO_TEST_CASE(probing) { - LoadingTest<Model>(); -} -BOOST_AUTO_TEST_CASE(trie) { - LoadingTest<TrieModel>(); -} -BOOST_AUTO_TEST_CASE(quant_trie) { - LoadingTest<QuantTrieModel>(); -} -BOOST_AUTO_TEST_CASE(bhiksha_trie) { - LoadingTest<ArrayTrieModel>(); -} -BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) { - LoadingTest<QuantArrayTrieModel>(); -} - -template <class ModelT> void BinaryTest(Config::WriteMethod write_method) { - Config config; - config.write_mmap = "test.binary"; - config.messages = NULL; - config.write_method = write_method; - ExpectEnumerateVocab enumerate; - config.enumerate_vocab = &enumerate; - - { - ModelT copy_model(TestLocation(), config); - enumerate.Check(copy_model.GetVocabulary()); - enumerate.Clear(); - Everything(copy_model); - } - - config.write_mmap = NULL; - - ModelType type; - BOOST_REQUIRE(RecognizeBinary("test.binary", type)); - BOOST_CHECK_EQUAL(ModelT::kModelType, type); - - { - ModelT binary("test.binary", config); - enumerate.Check(binary.GetVocabulary()); - Everything(binary); - } - unlink("test.binary"); - - // Now test without <unk>. - config.write_mmap = "test_nounk.binary"; - config.messages = NULL; - enumerate.Clear(); - { - ModelT copy_model(TestNoUnkLocation(), config); - enumerate.Check(copy_model.GetVocabulary()); - enumerate.Clear(); - NoUnkCheck(copy_model); - } - config.write_mmap = NULL; - { - ModelT binary(TestNoUnkLocation(), config); - enumerate.Check(binary.GetVocabulary()); - NoUnkCheck(binary); - } - unlink("test_nounk.binary"); -} - -template <class ModelT> void BinaryTest() { - BinaryTest<ModelT>(Config::WRITE_MMAP); - BinaryTest<ModelT>(Config::WRITE_AFTER); -} - -BOOST_AUTO_TEST_CASE(write_and_read_probing) { - BinaryTest<ProbingModel>(); -} -BOOST_AUTO_TEST_CASE(write_and_read_rest_probing) { - BinaryTest<RestProbingModel>(); -} -BOOST_AUTO_TEST_CASE(write_and_read_trie) { - BinaryTest<TrieModel>(); -} -BOOST_AUTO_TEST_CASE(write_and_read_quant_trie) { - BinaryTest<QuantTrieModel>(); -} -BOOST_AUTO_TEST_CASE(write_and_read_array_trie) { - BinaryTest<ArrayTrieModel>(); -} -BOOST_AUTO_TEST_CASE(write_and_read_quant_array_trie) { - BinaryTest<QuantArrayTrieModel>(); -} - -BOOST_AUTO_TEST_CASE(rest_max) { - Config config; - config.arpa_complain = Config::NONE; - config.messages = NULL; - - RestProbingModel model(TestLocation(), config); - State state, out; - FullScoreReturn ret(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("."), state)); - SLOPPY_CHECK_CLOSE(-0.2705918, ret.rest, 0.001); - SLOPPY_CHECK_CLOSE(-0.01916512, model.FullScore(state, model.GetVocabulary().EndSentence(), out).rest, 0.001); -} - -} // namespace -} // namespace ngram -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/model_type.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/model_type.hh b/ext/kenlm/lm/model_type.hh deleted file mode 100644 index dcdc6ac..0000000 --- a/ext/kenlm/lm/model_type.hh +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef LM_MODEL_TYPE_H -#define LM_MODEL_TYPE_H - -namespace lm { -namespace ngram { - -/* Not the best numbering system, but it grew this way for historical reasons - * and I want to preserve existing binary files. */ -typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType; - -// Historical names. -const ModelType HASH_PROBING = PROBING; -const ModelType TRIE_SORTED = TRIE; -const ModelType QUANT_TRIE_SORTED = QUANT_TRIE; -const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE; -const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE; - -const static ModelType kQuantAdd = static_cast<ModelType>(QUANT_TRIE - TRIE); -const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE - TRIE); - -} // namespace ngram -} // namespace lm -#endif // LM_MODEL_TYPE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/neural/Jamfile ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/neural/Jamfile b/ext/kenlm/lm/neural/Jamfile deleted file mode 100644 index 14cd8e3..0000000 --- a/ext/kenlm/lm/neural/Jamfile +++ /dev/null @@ -1,6 +0,0 @@ -with-eigen = [ option.get "with-eigen" ] ; -if ! $(with-eigen) && ! [ test_flags "" : "#include <Eigen/Dense>\nint main() {}" ] { - with-eigen = "/usr/include/eigen3" ; -} -with-eigen = <include>$(with-eigen) ; -fakelib neural : ..//kenlm wordvecs.cc : $(with-eigen) : : <cxxflags>-fopenmp <linkflags>-fopenmp $(with-eigen) ; http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/neural/wordvecs.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/neural/wordvecs.hh b/ext/kenlm/lm/neural/wordvecs.hh deleted file mode 100644 index 921a2b2..0000000 --- a/ext/kenlm/lm/neural/wordvecs.hh +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef LM_NEURAL_WORDVECS_H -#define LM_NEURAL_WORDVECS_H - -#include "util/scoped.hh" -#include "lm/vocab.hh" - -#include <Eigen/Dense> - -namespace util { class FilePiece; } - -namespace lm { -namespace neural { - -class WordVecs { - public: - // Columns of the matrix are word vectors. The column index is the word. - typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> Storage; - - /* The file should begin with a line stating the number of word vectors and - * the length of the vectors. Then it's followed by lines containing a - * word followed by floating-point values. - */ - explicit WordVecs(util::FilePiece &in); - - const Storage &Vectors() const { return vecs_; } - - WordIndex Index(StringPiece str) const { return vocab_.Index(str); } - - private: - util::scoped_malloc vocab_backing_; - ngram::ProbingVocabulary vocab_; - - Storage vecs_; -}; - -}} // namespaces - -#endif // LM_NEURAL_WORDVECS_H
