http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/pipeline.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/pipeline.cc b/ext/kenlm/lm/interpolate/pipeline.cc deleted file mode 100644 index 47b8288..0000000 --- a/ext/kenlm/lm/interpolate/pipeline.cc +++ /dev/null @@ -1,159 +0,0 @@ -#include "lm/interpolate/pipeline.hh" - -#include "lm/common/compare.hh" -#include "lm/common/print.hh" -#include "lm/common/renumber.hh" -#include "lm/vocab.hh" -#include "lm/interpolate/backoff_reunification.hh" -#include "lm/interpolate/interpolate_info.hh" -#include "lm/interpolate/merge_probabilities.hh" -#include "lm/interpolate/merge_vocab.hh" -#include "lm/interpolate/normalize.hh" -#include "lm/interpolate/universal_vocab.hh" -#include "util/stream/chain.hh" -#include "util/stream/count_records.hh" -#include "util/stream/io.hh" -#include "util/stream/multi_stream.hh" -#include "util/stream/sort.hh" -#include "util/fixed_array.hh" - -namespace lm { namespace interpolate { namespace { - -/* Put the original input files on chains and renumber them */ -void SetupInputs(std::size_t buffer_size, const UniversalVocab &vocab, util::FixedArray<ModelBuffer> &models, bool exclude_highest, util::FixedArray<util::stream::Chains> &chains, util::FixedArray<util::stream::ChainPositions> &positions) { - chains.clear(); - positions.clear(); - // TODO: much better memory sizing heuristics e.g. not making the chain larger than it will use. - util::stream::ChainConfig config(0, 2, buffer_size); - for (std::size_t i = 0; i < models.size(); ++i) { - chains.push_back(models[i].Order() - exclude_highest); - for (std::size_t j = 0; j < models[i].Order() - exclude_highest; ++j) { - config.entry_size = sizeof(WordIndex) * (j + 1) + sizeof(float) * 2; // TODO do not include wasteful backoff for highest. - chains.back().push_back(config); - } - models[i].Source(chains.back()); - for (std::size_t j = 0; j < models[i].Order() - exclude_highest; ++j) { - chains[i][j] >> Renumber(vocab.Mapping(i), j + 1); - } - } - for (std::size_t i = 0; i < chains.size(); ++i) { - positions.push_back(chains[i]); - } -} - -template <class SortOrder> void ApplySort(const util::stream::SortConfig &config, util::stream::Chains &chains) { - util::stream::Sorts<SortOrder> sorts(chains.size()); - for (std::size_t i = 0; i < chains.size(); ++i) { - sorts.push_back(chains[i], config, SortOrder(i + 1)); - } - chains.Wait(true); - // TODO memory management - for (std::size_t i = 0; i < sorts.size(); ++i) { - sorts[i].Merge(sorts[i].DefaultLazy()); - } - for (std::size_t i = 0; i < sorts.size(); ++i) { - sorts[i].Output(chains[i], sorts[i].DefaultLazy()); - } -}; - -} // namespace - -void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file) { - // Setup InterpolateInfo and UniversalVocab. 
- InterpolateInfo info; - info.lambdas = config.lambdas; - std::vector<WordIndex> vocab_sizes; - - util::scoped_fd vocab_null(util::MakeTemp(config.sort.temp_prefix)); - std::size_t max_order = 0; - util::FixedArray<util::scoped_fd> vocab_files(models.size()); - for (ModelBuffer *i = models.begin(); i != models.end(); ++i) { - info.orders.push_back(i->Order()); - vocab_sizes.push_back(i->Counts()[0]); - vocab_files.push_back(util::DupOrThrow(i->VocabFile())); - max_order = std::max(max_order, i->Order()); - } - UniversalVocab vocab(vocab_sizes); - { - ngram::ImmediateWriteWordsWrapper writer(NULL, vocab_null.get(), 0); - MergeVocab(vocab_files, vocab, writer); - } - vocab_files.clear(); - - std::cerr << "Merging probabilities." << std::endl; - // Pass 1: merge probabilities - util::FixedArray<util::stream::Chains> input_chains(models.size()); - util::FixedArray<util::stream::ChainPositions> models_by_order(models.size()); - SetupInputs(config.BufferSize(), vocab, models, false, input_chains, models_by_order); - - util::stream::Chains merged_probs(max_order); - for (std::size_t i = 0; i < max_order; ++i) { - merged_probs.push_back(util::stream::ChainConfig(PartialProbGamma::TotalSize(info, i + 1), 2, config.BufferSize())); // TODO: not buffer_size - } - MergeProbabilities(info, models_by_order, merged_probs); - std::vector<uint64_t> counts(max_order); - for (std::size_t i = 0; i < max_order; ++i) { - merged_probs[i] >> util::stream::CountRecords(&counts[i]); - } - - // Pass 2: normalize. - ApplySort<ContextOrder>(config.sort, merged_probs); - std::cerr << "Normalizing" << std::endl; - SetupInputs(config.BufferSize(), vocab, models, true, input_chains, models_by_order); - util::stream::Chains probabilities(max_order), backoffs(max_order - 1); - std::size_t block_count = 2; - for (std::size_t i = 0; i < max_order; ++i) { - // Careful accounting to ensure RewindableStream can fit the entire vocabulary. - block_count = std::max<std::size_t>(block_count, 2); - // This much needs to fit in RewindableStream. - std::size_t fit = NGram<float>::TotalSize(i + 1) * counts[0]; - // fit / (block_count - 1) rounded up - std::size_t min_block = (fit + block_count - 2) / (block_count - 1); - std::size_t specify = std::max(config.BufferSize(), min_block * block_count); - probabilities.push_back(util::stream::ChainConfig(NGram<float>::TotalSize(i + 1), block_count, specify)); - } - for (std::size_t i = 0; i < max_order - 1; ++i) { - backoffs.push_back(util::stream::ChainConfig(sizeof(float), 2, config.BufferSize())); - } - Normalize(info, models_by_order, merged_probs, probabilities, backoffs); - - util::FixedArray<util::stream::FileBuffer> backoff_buffers(backoffs.size()); - for (std::size_t i = 0; i < max_order - 1; ++i) { - backoff_buffers.push_back(util::MakeTemp(config.sort.temp_prefix)); - backoffs[i] >> backoff_buffers.back().Sink(); - } - - // Pass 3: backoffs in the right place. - ApplySort<SuffixOrder>(config.sort, probabilities); - // TODO destroy universal vocab to save RAM. - // TODO these should be freed before merge sort happens in the above function. 
- backoffs.Wait(true); - merged_probs.Wait(true); - std::cerr << "Reunifying backoffs" << std::endl; - - util::stream::ChainPositions prob_pos(max_order - 1); - util::stream::Chains combined(max_order - 1); - for (std::size_t i = 0; i < max_order - 1; ++i) { - backoffs[i] >> backoff_buffers[i].Source(true); - prob_pos.push_back(probabilities[i].Add()); - combined.push_back(util::stream::ChainConfig(NGram<ProbBackoff>::TotalSize(i + 1), 2, config.BufferSize())); - } - util::stream::ChainPositions backoff_pos(backoffs); - - ReunifyBackoff(prob_pos, backoff_pos, combined); - - util::stream::ChainPositions output_pos(max_order); - for (std::size_t i = 0; i < max_order - 1; ++i) { - output_pos.push_back(combined[i].Add()); - } - output_pos.push_back(probabilities.back().Add()); - - probabilities >> util::stream::kRecycle; - backoffs >> util::stream::kRecycle; - combined >> util::stream::kRecycle; - - // TODO genericize to ModelBuffer etc. - PrintARPA(vocab_null.get(), write_file, counts).Run(output_pos); -} - -}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/pipeline.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/pipeline.hh b/ext/kenlm/lm/interpolate/pipeline.hh deleted file mode 100644 index b200248..0000000 --- a/ext/kenlm/lm/interpolate/pipeline.hh +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef LM_INTERPOLATE_PIPELINE_H -#define LM_INTERPOLATE_PIPELINE_H - -#include "lm/common/model_buffer.hh" -#include "util/fixed_array.hh" -#include "util/stream/config.hh" - -#include <cstddef> -#include <string> - -namespace lm { namespace interpolate { - -struct Config { - std::vector<float> lambdas; - util::stream::SortConfig sort; - std::size_t BufferSize() const { return sort.buffer_size; } -}; - -void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file); - -}} // namespaces -#endif // LM_INTERPOLATE_PIPELINE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/split_worker.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/split_worker.cc b/ext/kenlm/lm/interpolate/split_worker.cc deleted file mode 100644 index e777bf0..0000000 --- a/ext/kenlm/lm/interpolate/split_worker.cc +++ /dev/null @@ -1,40 +0,0 @@ -#include "lm/interpolate/split_worker.hh" -#include "lm/common/ngram.hh" - -namespace lm { -namespace interpolate { - -SplitWorker::SplitWorker(std::size_t order, util::stream::Chain &backoff_chain, - util::stream::Chain &sort_chain) - : order_(order) { - backoff_chain >> backoff_input_; - sort_chain >> sort_input_; -} - -void SplitWorker::Run(const util::stream::ChainPosition &position) { - // input: ngram record (id, prob, and backoff) - // output: a float to the backoff_input stream - // an ngram id and a float to the sort_input stream - for (util::stream::Stream stream(position); stream; ++stream) { - NGram<ProbBackoff> ngram(stream.Get(), order_); - - // write id and prob to the sort stream - float prob = ngram.Value().prob; - lm::WordIndex *out = reinterpret_cast<lm::WordIndex *>(sort_input_.Get()); - for (const lm::WordIndex *it = ngram.begin(); it != ngram.end(); ++it) { - *out++ = *it; - } - *reinterpret_cast<float *>(out) = prob; - ++sort_input_; - - // write backoff to the backoff output stream - float boff = ngram.Value().backoff; - *reinterpret_cast<float *>(backoff_input_.Get()) = boff; - ++backoff_input_; - } - sort_input_.Poison(); - backoff_input_.Poison(); -} - -} -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/split_worker.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/split_worker.hh b/ext/kenlm/lm/interpolate/split_worker.hh deleted file mode 100644 index 15fae68..0000000 --- a/ext/kenlm/lm/interpolate/split_worker.hh +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef KENLM_INTERPOLATE_SPLIT_WORKER_H_ 
-#define KENLM_INTERPOLATE_SPLIT_WORKER_H_ - -#include "util/stream/chain.hh" -#include "util/stream/stream.hh" - -namespace lm { -namespace interpolate { - -class SplitWorker { - public: - /** - * Constructs a split worker for a particular order. It writes the - * split-off backoff values to the backoff chain and the ngram id and - * probability to the sort chain for each ngram in the input. - */ - SplitWorker(std::size_t order, util::stream::Chain &backoff_chain, - util::stream::Chain &sort_chain); - - /** - * The callback invoked to handle the input from the ngram intermediate - * files. - */ - void Run(const util::stream::ChainPosition& position); - - private: - /** - * The ngram order we are reading/writing for. - */ - std::size_t order_; - - /** - * The stream to write to for the backoff values. - */ - util::stream::Stream backoff_input_; - - /** - * The stream to write to for the ngram id + probability values. - */ - util::stream::Stream sort_input_; -}; -} -} -#endif http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/streaming_example_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/streaming_example_main.cc b/ext/kenlm/lm/interpolate/streaming_example_main.cc deleted file mode 100644 index 1f543cb..0000000 --- a/ext/kenlm/lm/interpolate/streaming_example_main.cc +++ /dev/null @@ -1,195 +0,0 @@ -#include "lm/common/compare.hh" -#include "lm/common/model_buffer.hh" -#include "lm/common/ngram.hh" -#include "util/stream/chain.hh" -#include "util/stream/multi_stream.hh" -#include "util/stream/sort.hh" -#include "lm/interpolate/split_worker.hh" - -#include <boost/program_options.hpp> -#include <boost/version.hpp> - -#if defined(_WIN32) || defined(_WIN64) - -// Windows doesn't define <unistd.h> -// -// So we define what we need here instead: -// -#define STDIN_FILENO = 0 -#define STDOUT_FILENO = 1 -#else // Huzzah for POSIX! 
-#include <unistd.h> -#endif - -/* - * This is a simple example program that takes in intermediate - * suffix-sorted ngram files and outputs two sets of files: one for backoff - * probability values (raw numbers, in suffix order) and one for - * probability values (ngram id and probability, in *context* order) - */ -int main(int argc, char *argv[]) { - using namespace lm::interpolate; - - const std::size_t ONE_GB = 1 << 30; - const std::size_t SIXTY_FOUR_MB = 1 << 26; - const std::size_t NUMBER_OF_BLOCKS = 2; - - std::string FILE_NAME = "ngrams"; - std::string CONTEXT_SORTED_FILENAME = "csorted-ngrams"; - std::string BACKOFF_FILENAME = "backoffs"; - std::string TMP_DIR = "/tmp/"; - - try { - namespace po = boost::program_options; - po::options_description options("canhazinterp Pass-3 options"); - - options.add_options() - ("help,h", po::bool_switch(), "Show this help message") - ("ngrams,n", po::value<std::string>(&FILE_NAME), "ngrams file") - ("csortngrams,c", po::value<std::string>(&CONTEXT_SORTED_FILENAME), "context sorted ngrams file") - ("backoffs,b", po::value<std::string>(&BACKOFF_FILENAME), "backoffs file") - ("tmpdir,t", po::value<std::string>(&TMP_DIR), "tmp dir"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - - // Display help - if(vm["help"].as<bool>()) { - std::cerr << "Usage: " << options << std::endl; - return 1; - } - } - catch(const std::exception &e) { - - std::cerr << e.what() << std::endl; - return 1; - - } - - // The basic strategy here is to have three chains: - // - The first reads the ngram order inputs using ModelBuffer. Those are - // then stripped of their backoff values and fed into the third chain; - // the backoff values *themselves* are written to the second chain. - // - // - The second chain takes the backoff values and writes them out to a - // file (one for each order). - // - // - The third chain takes just the probability values and ngrams and - // writes them out, sorted in context-order, to a file (one for each - // order). - - // This will be used to read in the binary intermediate files. There is - // one file per order (e.g. ngrams.1, ngrams.2, ...) 
- lm::ModelBuffer buffer(FILE_NAME); - - // Create a separate chains for each ngram order for: - // - Input from the intermediate files - // - Output to the backoff file - // - Output to the (context-sorted) probability file - util::stream::Chains ngram_inputs(buffer.Order()); - util::stream::Chains backoff_chains(buffer.Order()); - util::stream::Chains prob_chains(buffer.Order()); - for (std::size_t i = 0; i < buffer.Order(); ++i) { - ngram_inputs.push_back(util::stream::ChainConfig( - lm::NGram<lm::ProbBackoff>::TotalSize(i + 1), NUMBER_OF_BLOCKS, ONE_GB)); - - backoff_chains.push_back( - util::stream::ChainConfig(sizeof(float), NUMBER_OF_BLOCKS, ONE_GB)); - - prob_chains.push_back(util::stream::ChainConfig( - sizeof(lm::WordIndex) * (i + 1) + sizeof(float), NUMBER_OF_BLOCKS, - ONE_GB)); - } - - // This sets the input for each of the ngram order chains to the - // appropriate file - buffer.Source(ngram_inputs); - - util::FixedArray<util::scoped_ptr<SplitWorker> > workers(buffer.Order()); - for (std::size_t i = 0; i < buffer.Order(); ++i) { - // Attach a SplitWorker to each of the ngram input chains, writing to the - // corresponding order's backoff and probability chains - workers.push_back( - new SplitWorker(i + 1, backoff_chains[i], prob_chains[i])); - ngram_inputs[i] >> boost::ref(*workers.back()); - } - - util::stream::SortConfig sort_cfg; - sort_cfg.temp_prefix = TMP_DIR; - sort_cfg.buffer_size = SIXTY_FOUR_MB; - sort_cfg.total_memory = ONE_GB; - - // This will parallel merge sort the individual order files, putting - // them in context-order instead of suffix-order. - // - // Two new threads will be running, each owned by the chains[i] object. - // - The first executes BlockSorter.Run() to sort the n-gram entries - // - The second executes WriteAndRecycle.Run() to write each sorted - // block to disk as a temporary file - util::stream::Sorts<lm::ContextOrder> sorts(buffer.Order()); - for (std::size_t i = 0; i < prob_chains.size(); ++i) { - sorts.push_back(prob_chains[i], sort_cfg, lm::ContextOrder(i + 1)); - } - - // Set the sort output to be on the same chain - for (std::size_t i = 0; i < prob_chains.size(); ++i) { - // The following call to Chain::Wait() - // joins the threads owned by chains[i]. - // - // As such the following call won't return - // until all threads owned by chains[i] have completed. - // - // The following call also resets chain[i] - // so that it can be reused - // (including free'ing the memory previously used by the chain) - prob_chains[i].Wait(); - - // In an ideal world (without memory restrictions) - // we could merge all of the previously sorted blocks - // by reading them all completely into memory - // and then running merge sort over them. - // - // In the real world, we have memory restrictions; - // depending on how many blocks we have, - // and how much memory we can use to read from each block - // (sort_config.buffer_size) - // it may be the case that we have insufficient memory - // to read sort_config.buffer_size of data from each block from disk. - // - // If this occurs, then it will be necessary to perform one or more rounds - // of merge sort on disk; - // doing so will reduce the number of blocks that we will eventually - // need to read from - // when performing the final round of merge sort in memory. - // - // So, the following call determines whether it is necessary - // to perform one or more rounds of merge sort on disk; - // if such on-disk merge sorting is required, such sorting is performed. 
- // - // Finally, the following method launches a thread that calls - // OwningMergingReader.Run() - // to perform the final round of merge sort in memory. - // - // Merge sort could have be invoked directly - // so that merge sort memory doesn't coexist with Chain memory. - sorts[i].Output(prob_chains[i]); - } - - // Create another model buffer for our output on e.g. csorted-ngrams.1, - // csorted-ngrams.2, ... - lm::ModelBuffer output_buf(CONTEXT_SORTED_FILENAME, true, false); - output_buf.Sink(prob_chains, buffer.Counts()); - - // Create a third model buffer for our backoff output on e.g. backoff.1, - // backoff.2, ... - lm::ModelBuffer boff_buf(BACKOFF_FILENAME, true, false); - boff_buf.Sink(backoff_chains, buffer.Counts()); - - // Joins all threads that chains owns, - // and does a for loop over each chain object in chains, - // calling chain.Wait() on each such chain object - ngram_inputs.Wait(true); - backoff_chains.Wait(true); - prob_chains.Wait(true); - - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/toy_data/toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/toy_data/toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm b/ext/kenlm/lm/interpolate/toy_data/toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm deleted file mode 100644 index f4bac3d..0000000 --- a/ext/kenlm/lm/interpolate/toy_data/toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm +++ /dev/null @@ -1,23 +0,0 @@ - -\data\ -ngram 1=5 -ngram 2=8 - -\1-grams: --0.5850267 </s> --99 <s> -7.004176 --0.7447274 <unk> --0.4685211 a -7.272811 --0.6575773 b -99 - -\2-grams: --0.2839967 <s> a --0.3187588 <s> b --0.79588 a </s> --0.5058454 a a --0.2773661 a b --0.7447275 b </s> --0.1135093 b a --0.9586073 b b - -\end\ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/toy_data/toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/toy_data/toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm b/ext/kenlm/lm/interpolate/toy_data/toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm deleted file mode 100644 index 9874ba9..0000000 --- a/ext/kenlm/lm/interpolate/toy_data/toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm +++ /dev/null @@ -1,23 +0,0 @@ - -\data\ -ngram 1=5 -ngram 2=8 - -\1-grams: --0.6190513 </s> --99 <s> -6.885247 --99 <unk> --0.3394633 a 0 --0.5200813 b 0 - -\2-grams: --0.2807584 <s> a --0.3222944 <s> b --0.70763 a </s> --0.6780174 a a --0.2261675 a b --0.7991036 b </s> --0.1237299 b a --1.050158 b b - -\end\ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/toy_data/toy1.lm ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/toy_data/toy1.lm 
b/ext/kenlm/lm/interpolate/toy_data/toy1.lm deleted file mode 100644 index fba4263..0000000 --- a/ext/kenlm/lm/interpolate/toy_data/toy1.lm +++ /dev/null @@ -1,23 +0,0 @@ - -\data\ -ngram 1=5 -ngram 2=8 - -\1-grams: --1.3010299957 </s> --99 <s> -7.446389 --99 <unk> --0.6020599913 a 0 --0.6020599913 b 0 - -\2-grams: --0.15490196 <s> a --0.5228787453 <s> b --1 a </s> --1.5228787453 a a --0.0604807474 a b --0.5228787453 b </s> --0.3010299957 b a --0.6989700043 b b - -\end\ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/toy_data/toy2.lm ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/toy_data/toy2.lm b/ext/kenlm/lm/interpolate/toy_data/toy2.lm deleted file mode 100644 index 67c2ba1..0000000 --- a/ext/kenlm/lm/interpolate/toy_data/toy2.lm +++ /dev/null @@ -1,23 +0,0 @@ - -\data\ -ngram 1=5 -ngram 2=8 - -\1-grams: --0.3979400087 </s> --99 <s> -7.446389 --99 <unk> --0.3979400087 a 0 --0.6989700043 b 0 - -\2-grams: --0.3979400087 <s> a --0.2218487496 <s> b --0.6989700043 a </s> --0.3010299957 a a --0.5228787453 a b --1 b </s> --0.0222763947 b a --1.3010299957 b b - -\end\ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/train_params_grant_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/train_params_grant_main.cc b/ext/kenlm/lm/interpolate/train_params_grant_main.cc deleted file mode 100644 index a106cac..0000000 --- a/ext/kenlm/lm/interpolate/train_params_grant_main.cc +++ /dev/null @@ -1,561 +0,0 @@ -#include "lm/ngram_query.hh" -#include "lm/model.hh" -#include "lm/word_index.hh" -#include "lm/interpolate/enumerate_global_vocab.hh" - -#include <string> -#include <vector> -#include <iostream> -#include <fstream> -#include <map> -#include <iomanip> - -#include <boost/program_options.hpp> -#include <boost/version.hpp> -#include <boost/foreach.hpp> - -#include "util/fixed_array.hh" - -#include <Eigen/Eigen> -#include <Eigen/Dense> - -// typedef Eigen::MatrixXf FMatrix; -// typedef Eigen::VectorXf FVector; -typedef Eigen::MatrixXd DMatrix; -typedef Eigen::VectorXd DVector; - -bool HAS_BIAS = true; - -using namespace lm::ngram; -using namespace lm; - -inline double logProb(Model *model, const std::vector<std::string> &ctx, - WordIndex word_idx) { - // Horribly inefficient - const Vocabulary &vocab = model->GetVocabulary(); - - State nextState; // throwaway - - WordIndex context_idx[ctx.size()]; - - // reverse context - for (std::size_t i = 0; i < ctx.size(); i++) { - context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]); - } - - FullScoreReturn score = model->FullScoreForgotState( - context_idx, &(context_idx[ctx.size() - 1]), word_idx, nextState); - - double ret = score.prob; - // std::cerr << "w: " << word << " p: " << ret << std::endl; - return ret; -} - -inline double logProb(Model *model, double unkprob, - const std::vector<std::string> &ctx, - const std::string &word) { - // Horribly inefficient - const Vocabulary &vocab = model->GetVocabulary(); - - WordIndex word_idx = vocab.Index(word); - if (word_idx == lm::kUNK) 
return unkprob; - - return logProb(model, ctx, word_idx); -} - -void set_features(const std::vector<std::string> &ctx, const std::string &word, - const std::vector<Model *> &models, - const std::vector<double> &unkprobs, DVector &v) { - if (HAS_BIAS) { - v(0) = 1; - for (std::size_t i = 0; i < models.size(); ++i) - v(i + 1) = logProb(models[i], unkprobs[i], ctx, word); - } else { - for (std::size_t i = 0; i < models.size(); ++i) - v(i) = logProb(models[i], unkprobs[i], ctx, word); - } -} - -void train_params(const std::vector<std::vector<std::string> > &corpus, - const std::vector<std::string> &vocab, - const std::vector<Model *> &models) { - using namespace std; - - // A safeguarded Newton's method to find optimum parameters. - // Reverts to steepest-descent linesearch if Newton step does not improve - // objective. - // - // Two Boolean variables below are used to "AllowExtrapolation" and - // "AllowNegativeParams". - - bool AllowExtrapolation = true; // if true, params need not sum to one - bool AllowNegativeParams = true; // if true, params can be negative - const int ITERATIONS = 20; - double minstepsize = 1.0e-9; // convergence criterion - int context_size = 5; // (context_size+1)-grams considered in perplexity - double stepdecreasefactor = 0.1; // if step unsuccessful - double initstepsize = 1.0; // Initial step size - std::size_t linesinstartercorpus = 12; // The first few lines are tuned - // first, to find basin of attraction - // for Newton - // bias + #models - const std::size_t nlambdas = models.size() + (HAS_BIAS ? 1 : 0); - - // initialize to sum to 1 - DVector params = DVector::Constant(nlambdas, 1.0 / nlambdas); - DMatrix N = DMatrix::Constant( - nlambdas, nlambdas - 1, - -1.0 / sqrt((nlambdas - 1) * (nlambdas - 1) + nlambdas - 1.0)); - for (unsigned i = 0; i < nlambdas - 1; ++i) - N(i, i) = N(i, i) * (1.0 - nlambdas); - // N is nullspace matrix, each column sums to zero - - cerr << setprecision(16) << "++ Parameter training ++" << endl; - if (AllowExtrapolation) - cerr << " Allowing extrapolation (sharpening and flattening of individual " - "LM distributions)" << endl; - else - cerr << " Interpolating only (not sharpening or flattening individual LM " - "distributions)" << endl; - if (AllowNegativeParams) - cerr << " Allowing negative parameters\n" - << " (more general but slow and rarely useful\n" - << " -LM with negative weight has probability rankings reversed and " - "is weighted more highly than all LMs with positive weights)" - << endl; - else - cerr << "Not allowing negative parameters (mild assumption, and faster)" - << endl; - cerr << " Maximum number of iterations: " << ITERATIONS << endl; - cerr << " Minimum step size: " << minstepsize << endl; - cerr << " Perplexity computed with " << context_size + 1 << "-grams" << endl; - - if ((!AllowExtrapolation) && (nlambdas == 1)) { - // Nothing to optimize. One parameter, and it sums to one. - cerr << "Training complete. 
Best weights:" << endl; - cerr << setprecision(16) << 1.0 << endl; - return; - } - - // Smart initialization of full tuning by tuning on smaller set first - vector<std::size_t> linestotune; - if (linesinstartercorpus < corpus.size()) - linestotune.push_back(linesinstartercorpus); - linestotune.push_back(corpus.size()); - - for (std::size_t setiter = 0; setiter < linestotune.size(); ++setiter) { - cerr << " Now tuning the first " << linestotune[setiter] << " lines" - << endl; - - vector<DVector> paramhistory; - double bestppl = 0.0; // best recorded ppl - DVector bestgrad = DVector::Zero(nlambdas); // corresp. gradient, - // feasible direction - DVector bestparams = DVector::Zero(nlambdas); // corresp. weights - double maxbestgradstep = 0.0; // max feasible step in grad. direction - double stepsize = initstepsize; // Initial step size - - for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations - cerr << "ITERATION " << iter + 1 << " (of max " << ITERATIONS - << "), step size " << stepsize << " (of min " << minstepsize - << "), weights: " << endl; - cerr << params << endl; - - paramhistory.push_back(params); - // Hard-coded to be 6-gram perplexity - vector<string> context(context_size, "<s>"); - double ppl = 0.0; - DVector grad = DVector::Zero(nlambdas); - DMatrix H = DMatrix::Zero(nlambdas, nlambdas); - cerr << "o"; - - std::vector<double> unkprobs(models.size()); - // for each sentence in tuning corpus - for (std::size_t ci = 0; ci < linestotune[setiter]; ++ci) { - const vector<string> &sentence = corpus[ci]; - // pad our beginning context - std::fill(context.begin(), context.end(), "<s>"); - - // for each word in sentence - for (std::size_t t = 0; t < sentence.size(); ++t) { - // fill in unk probabilities for this context, to avoid having to - // look them up redundantly later - for (std::size_t mi = 0; mi < models.size(); ++mi) { - unkprobs[mi] = logProb(models[mi], context, lm::kUNK); - } - - DVector feats = DVector::Zero(nlambdas); - // probs for actual n-gram - set_features(context, sentence[t], models, unkprobs, feats); - - double z = 0.0; - double maxlogprob = 0.0; // Allows us to avoid overflow with - // negative params - DVector expectfeats = DVector::Zero(nlambdas); - DMatrix expectfeatmatrix = DMatrix::Zero(nlambdas, nlambdas); - // Logically, this should be in the loop's scope - DVector iterfeats = DVector::Zero(nlambdas); - - // probs over possible n-grams, for normalization - for (std::size_t i = 0; i < vocab.size(); ++i) { - set_features(context, vocab[i], models, unkprobs, iterfeats); - double logprob = params.dot(iterfeats); - if (i == 0) { - // maxlogprob=logprob;// more precise, less underflow - maxlogprob = 0.0; // reduces number of updates - } else if (logprob > maxlogprob) { - // Adjust all old values to new scaling - double adjust = exp(maxlogprob - logprob); - z *= adjust; - expectfeats *= adjust; - expectfeatmatrix *= adjust; - maxlogprob = logprob; - } - double us = exp(params.dot(iterfeats) - maxlogprob); // measure - - z += us; - expectfeats += us * iterfeats; - expectfeatmatrix += us * (iterfeats * iterfeats.transpose()); - } - expectfeats /= z; // Expectation - expectfeatmatrix /= z; // Expectation - - // Add sentence[t] to the end of the context - context[0] = sentence[t]; - std::rotate(context.begin(), context.begin() + 1, context.end()); - - // Perplexity (actually log(perplexity)) - ppl += params.dot(feats) - log(z); - // Gradient - grad += feats - expectfeats; - // Hessian - H += -expectfeatmatrix + expectfeats * expectfeats.transpose(); - } - cerr 
<< "."; - } - ppl *= -1.0 / corpus.size(); - // The gradient and Hessian coefficients cancel out, so don't really need - // to do this, but it's fast. - grad *= -1.0 / corpus.size(); - H *= -1.0 / corpus.size(); - cerr << " log(PPL)=" << ppl << " PPL=" << exp(ppl) << endl; - - // Use results to determine next params to evaluate - if ((ppl < bestppl) || (iter == 0)) { - // Found a new best - bestppl = ppl; - bestparams = params; - double beststepsize = stepsize; - if (iter > 0) - cerr << " New best point found, step size " << beststepsize << endl; - else - cerr << " New best point found" << endl; - - bestgrad = grad; - DVector deltaparams = DVector::Zero(nlambdas); - - bool reverttograd = false; - - { - double gradnorm = 0.0; - double solvenorm = 0.0; - double errnorm = 0.0; - // Find Newton step - if (AllowExtrapolation) { - deltaparams = -H.colPivHouseholderQr().solve(grad); - Eigen::SelfAdjointEigenSolver<DMatrix> eigensolver(H); - cerr << "Eigenvalues (negative values should be negligible):\n" - << eigensolver.eigenvalues() << endl; - gradnorm = grad.norm(); - solvenorm = (H * deltaparams).norm(); - errnorm = (grad + H * deltaparams).norm(); - } else { - // Project gradient to interpolation space - bestgrad = N * N.transpose() * bestgrad; - - // need to work in nullspace to maintain unit sum - DMatrix Hnull = DMatrix::Zero(nlambdas - 1, nlambdas - 1); - - // Looks like we don't need the three lines below -- we can do it - // in-line (if we don't want eigenvalues) - Hnull = N.transpose() * H * N; - Eigen::SelfAdjointEigenSolver<DMatrix> eigensolver(Hnull); - cerr << "Eigenvalues (best if all positive):\n" - << eigensolver.eigenvalues() << endl; - deltaparams = - -N * Hnull.fullPivHouseholderQr().solve(N.transpose() * grad); - gradnorm = (N.transpose() * grad).norm(); - solvenorm = (Hnull * deltaparams).norm(); - errnorm = (N.transpose() * grad + Hnull * deltaparams).norm(); - } - // eventually, params = bestparams + deltaparams; - cerr << " Error norm " << errnorm << ", gradient norm " << gradnorm - << ", solution norm " << solvenorm << endl; - // Check for numerical errors. Don't trust Newton step if they are too - // big. - if (errnorm < 1e-12 * std::max(1.0, std::min(gradnorm, solvenorm))) { - stepsize = 0.0; - for (std::size_t i = 0; i < nlambdas; i++) - stepsize += deltaparams(i) * deltaparams(i); - stepsize = sqrt(stepsize); // holds length of Newton step - cerr << "Newton step, length " << stepsize << ": " << endl; - cerr << deltaparams << endl; - - // Don't let the Newton step get much bigger than last successful - // step (likely would have to shrink later, anyway) - if (stepsize > 2.0 * beststepsize) { - stepsize = 1.5 * beststepsize; - reverttograd = true; - cerr << "Reverting to gradient, because Newton step is too large." - << endl; - } - } else { - stepsize = 1.5 * beststepsize; - reverttograd = true; - cerr << "Reverting to gradient, because Newton step computation " - "unsuccessful." << endl; - } - // Make the gradient unit norm, in feasible search direction. - if (!AllowNegativeParams) { - // Project gradient to be a feasible search direction - vector<bool> active(nlambdas, false); - std::size_t numactive = 0; - for (std::size_t i = 0; i < nlambdas; i++) { - // Project gradient to inactive constraints - if ((bestparams(i) == 0) && (bestgrad(i) > 0)) { - active[i] = true; - bestgrad(i) = 0.0; // Do this now, in case extrapolation - // allowed. 
- ++numactive; - } - } - if (numactive > 0) { - if (!AllowExtrapolation) { - // Project gradient, for activity concerns - DMatrix tmpN = DMatrix::Constant( - nlambdas, nlambdas - 1, - -1.0 / sqrt((nlambdas - numactive - 1) * - (nlambdas - numactive - 1) + - nlambdas - numactive - 1.0)); - - for (std::size_t i = 0; i < nlambdas - 1; ++i) - tmpN(i, i) = tmpN(i, i) * (1.0 - (nlambdas - numactive)); - - for (std::size_t i = 0; i < nlambdas; ++i) { - if (active[i]) { - for (std::size_t j = 0; j < nlambdas - 1; ++i) { - tmpN(i, j) = 0; - } - } - } - - // projected gradient onto unit sum and active set constraints - bestgrad = -tmpN * tmpN.transpose() * bestgrad; - } - } - } - } - double norm = 0.0; - for (std::size_t i = 0; i < nlambdas; i++) - norm += bestgrad(i) * bestgrad(i); - if (norm != 0) { - bestgrad /= sqrt(norm); - } else { - cerr << " Gradient is zero. Exiting."; - break; - } - cerr << "Gradient, unit length: " << endl; - cerr << bestgrad << endl; - - // Find max step in gradient direction that remains feasible. - if (!AllowNegativeParams) { - double limitfraction = 0.5; // Not 1: If Newton step is bad, probably - // will need to reduce later anyway - for (std::size_t i = 0; i < nlambdas; i++) { - if (bestparams(i) - maxbestgradstep * bestgrad(i) < 0) { - double tmplimitfraction = - bestparams(i) / (bestgrad(i) * maxbestgradstep); - if (tmplimitfraction < limitfraction) - limitfraction = tmplimitfraction; - } - } - maxbestgradstep = stepsize * limitfraction; - cerr << " Max grad step: " << maxbestgradstep << endl; - } else { - maxbestgradstep = stepsize; - } - - if (!reverttograd) { - if (!AllowNegativeParams) { - for (std::size_t i = 0; i < nlambdas; i++) { - if (bestparams(i) + deltaparams(i) < 0) { - // Can't do Newton step. Revert to descent. - reverttograd = true; - } - } - } - if (reverttograd) { - cerr << "Reverting to gradient, since Newton step infeasible:" - << endl; - } - } - - if (reverttograd) { - stepsize = maxbestgradstep; - deltaparams = -bestgrad * stepsize; - } - - params = bestparams + deltaparams; - cerr << "Change in weights from best, step size " << stepsize << ": " - << endl; - cerr << deltaparams << endl; - } else { - // Last attempt failed at being better, so move in gradient direction - // with reduced step. - // stepsize reduction factor is empirical - stepsize = std::min(stepdecreasefactor * stepsize, maxbestgradstep); - cerr << "Taking smaller step: " << stepsize << endl; - params = bestparams - bestgrad * stepsize; - } - // Clean the parameters up. 
- double sumparams = 0.0; - for (std::size_t i = 0; i < nlambdas; i++) { - if (!AllowNegativeParams) { - if (params(i) < 1e-12) { - // snap to zero, for active set and duplicate weights - params(i) = 0; - } - } - sumparams += params(i); - } - if (!AllowExtrapolation) params /= sumparams; - - bool duplicateentry = false; - for (std::size_t i = 0; i < paramhistory.size(); ++i) { - if (params == paramhistory[i]) duplicateentry = true; - } - while ((duplicateentry) && (stepsize >= minstepsize)) { - cerr << "Duplicate weight found: " << endl; - cerr << params << endl; - stepsize *= 0.5; // Step in this direction is duplicate, so try again - // with smaller step - params = bestparams - stepsize * bestgrad; - - sumparams = 0.0; - for (std::size_t i = 0; i < nlambdas; i++) { - if (!AllowNegativeParams) { - if (params(i) < 1e-12) params(i) = 0; - } - sumparams += params(i); - } - if (!AllowExtrapolation) params /= sumparams; - - duplicateentry = false; - for (std::size_t i = 0; i < paramhistory.size(); ++i) { - if (params == paramhistory[i]) duplicateentry = true; - } - } - if (stepsize < minstepsize) break; // No need to make another step - } - - params = bestparams; // So that next setiter is correct - cerr << "Training complete. Best weights:" << endl; - cerr << params << endl; - } -} - -int main(int argc, char **argv) { - std::string tuning_data; - std::vector<std::string> lms; - - try { - namespace po = boost::program_options; - po::options_description options("train-params"); - - options.add_options()("help,h", po::bool_switch(), - "Show this help message")( - "no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature")( - "tuning_data,t", po::value<std::string>(&tuning_data), - "File to tune perplexity on")( - "model,m", po::value<std::vector<std::string> >(&lms), - "Language models in KenLM format to interpolate"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - - // Display help - if (argc == 1 || vm["help"].as<bool>()) { - std::cerr << options << std::endl; - return 1; - } - if (vm["no_bias_term"].as<bool>()) HAS_BIAS = false; - lms = vm["model"].as<std::vector<std::string> >(); - tuning_data = vm["tuning_data"].as<std::string>(); - } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - return 1; - } - if (lms.size() < 2) { - std::cerr - << "Please specify at least two language model files with -m LM.KLM\n"; - return 1; - } - if (tuning_data.empty()) { - std::cerr << "Please specify tuning set with -t FILE.TXT\n"; - return 1; - } - - // Growable vocab here - // GrowableVocab gvoc(100000); //dummy default - - // no comment - std::map<std::string, int *> vmap; - - // stuff it into the - EnumerateGlobalVocab *globalVocabBuilder = - new EnumerateGlobalVocab(&vmap, lms.size()); - - Config cfg; - cfg.enumerate_vocab = (EnumerateVocab *)globalVocabBuilder; - - // load models - std::vector<Model *> models; - for (std::size_t i = 0; i < lms.size(); i++) { - std::cerr << "Loading LM file: " << lms[i] << std::endl; - - // haaaack - globalVocabBuilder->SetCurModel(i); // yes this is dumb - - Model *this_model = new Model(lms[i].c_str(), cfg); - models.push_back(this_model); - } - - // assemble vocabulary vector - std::vector<std::string> vocab; - std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl; - - for (std::map<std::string, int *>::iterator iter = vmap.begin(); - iter != vmap.end(); ++iter) { - vocab.push_back(iter->first); - } - std::cerr << "Vocab vector has size: " << vocab.size() << std::endl; - - // 
load context sorted ngrams into vector of vectors - std::vector<std::vector<std::string> > corpus; - - std::cerr << "Loading context-sorted ngrams: " << tuning_data << std::endl; - std::ifstream infile(tuning_data.c_str()); - - for (std::string line; std::getline(infile, line);) { - std::vector<std::string> words; - std::stringstream stream(line); - std::string word; - - while (stream >> word) - words.push_back(word); - corpus.push_back(words); - } - - train_params(corpus, vocab, models); - - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/train_params_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/train_params_main.cc b/ext/kenlm/lm/interpolate/train_params_main.cc deleted file mode 100644 index 39233e8..0000000 --- a/ext/kenlm/lm/interpolate/train_params_main.cc +++ /dev/null @@ -1,349 +0,0 @@ -#include "lm/ngram_query.hh" -#include "lm/model.hh" -#include "lm/word_index.hh" -#include "lm/interpolate/enumerate_global_vocab.hh" - - -#include <string> -#include <vector> -#include <iostream> -#include <fstream> -#include <map> - -#include <boost/program_options.hpp> -#include <boost/version.hpp> -#include <boost/foreach.hpp> - -#include "util/fixed_array.hh" - -#include <Eigen/Eigen> - -typedef Eigen::MatrixXf FMatrix; -typedef Eigen::VectorXf FVector; - -bool HAS_BIAS = true; - -using namespace lm::ngram; -using namespace lm; - -inline float logProb(Model * model, const std::vector<std::string>& ctx, const std::string& word) { - - // Horribly inefficient - const Vocabulary &vocab = model->GetVocabulary(); - - State nextState; //throwaway - - WordIndex word_idx = vocab.Index(word); - WordIndex context_idx[ctx.size()]; - - //reverse context - for(unsigned int i = 0; i < ctx.size(); i++) { - context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]); - } - - FullScoreReturn score = model->FullScoreForgotState(context_idx, &(context_idx[ctx.size() -1]), word_idx, nextState); - - float ret = score.prob; - //std::cerr << "w: " << word << " p: " << ret << std::endl; - return ret; -} - -void set_features(const std::vector<std::string>& ctx, - const std::string& word, - const std::vector<Model *>& models, - FVector& v) { - - //std::cerr << "setting feats for " << word << std::endl; - - if (HAS_BIAS) { - v(0) = 1; - for (unsigned i=0; i < models.size(); ++i) - v(i + 1) = logProb(models[i], ctx, word); - } else { - for (unsigned i=0; i < models.size(); ++i) - v(i) = logProb(models[i], ctx, word); - } -} - -void translate_input( - const std::vector<std::vector<std::string> >& corpus, - const std::vector<std::string>& gvocab, - const std::vector<Model *>& models, - std::vector<std::vector<std::vector<WordIndex> > >&translated_corpus, - std::vector<std::vector<WordIndex> >&translated_vocab - ) { - translated_corpus.resize(models.size()); - translated_vocab.resize(models.size()); - for (unsigned mn=0; mn < models.size(); ++mn) { // models - - const Vocabulary &vocab = models[mn]->GetVocabulary(); - - for (unsigned i = 0; i < gvocab.size(); ++i) { - translated_vocab[mn].push_back(vocab.Index(gvocab[i])); - } - - translated_corpus[mn].resize(corpus.size()); - for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus - const std::vector<std::string>& sentence = corpus[ci]; - for (int 
t = sentence.size() -1; t >= 0; --t) { // words in sentence - translated_corpus[mn][ci].push_back(vocab.Index(sentence[t])); - } - for (int i=0; i<5; ++i) { - translated_corpus[mn][ci].push_back(vocab.Index("<s>")); - } - } - } -} - - -void train_params_fast( - const std::vector<std::vector<std::string> >& corpus, - const std::vector<std::string>& vocab, - const std::vector<Model *>& models) { - using namespace std; - - // model / sentence / words in sentence in reverse order with <s> padding - std::vector<std::vector<std::vector<WordIndex> > > t_corpus; - std::vector<std::vector<WordIndex> > t_vocab; - translate_input(corpus, vocab, models, t_corpus, t_vocab); - - - - const int ITERATIONS = 10; - const int nlambdas = models.size() + (HAS_BIAS ? 1 : 0); // bias + #models - FVector params = FVector::Zero(nlambdas); - vector<FVector> feats(vocab.size(), params); - vector<float> us(vocab.size(), 0); - vector<float> ps(vocab.size(), 0); - FVector grad = FVector::Zero(nlambdas); - FMatrix H = FMatrix::Zero(nlambdas, nlambdas); - FVector ef = FVector::Zero(nlambdas); - for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations - grad.setZero(); - H.setZero(); - double loss = 0; - unsigned numchars = 0; - for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus - const vector<string>& sentence = corpus[ci]; - double z = 0; - for (int t = sentence.size() -1 ; t >=0; --t) { // words in sentence - ++numchars; - int ref_word = 0; - for (unsigned i = 0; i < vocab.size(); ++i) { // vocab - // set_features(context, vocab[i], models, feats[i]); - for (unsigned j=0; j < models.size(); ++j) { - // NOTE: reference ---- WordIndex word_idx = t_corpus[j][ci][t]; - WordIndex word_idx = t_vocab[j][i]; - State nextState; //throwaway - FullScoreReturn score = models[j]->FullScoreForgotState(&(t_corpus[j][ci][t]), &(t_corpus[j][ci][t+5]), word_idx, nextState); - feats[i](j) = score.prob; - // feats[i](j) = logProb(models[j], ctx, word); - } - - us[i] = params.dot(feats[i]); - z += exp(double(us[i])); - } - //std::cerr << "there..." << std::endl; - const float logz = log(z); - - // expected feature values - ef.setZero(); - for (unsigned i = 0; i < vocab.size(); ++i) { - ps[i] = expf(us[i] - logz); - ef += ps[i] * feats[i]; - } - loss -= log(ps[ref_word]); - const FVector& reffeats = feats[ref_word]; - grad += ef - reffeats; - - // Hessian - for (unsigned i = 0; i < vocab.size(); ++i) - H.noalias() += ps[i] * feats[i] * feats[i].transpose() - - ps[i] * feats[i] * ef.transpose(); - - // this should just be the state for each model - } - cerr << "."; - } - cerr << "ITERATION " << (iter + 1) << ": PPL=" << exp(loss / numchars) << endl; - params = H.colPivHouseholderQr().solve(grad); - cerr << params << endl; - } -} - - - - -//const util::FixedArray<Model *>& models) -void train_params( - const std::vector<std::vector<std::string> >& corpus, - const std::vector<std::string>& vocab, - const std::vector<Model *>& models) { - using namespace std; - - vector<string> context(5, "<s>"); - const int ITERATIONS = 10; - const int nlambdas = models.size() + (HAS_BIAS ? 
1 : 0); // bias + #models - FVector params = FVector::Zero(nlambdas); - vector<FVector> feats(vocab.size(), params); - vector<float> us(vocab.size(), 0); - vector<float> ps(vocab.size(), 0); - FVector grad = FVector::Zero(nlambdas); - FMatrix H = FMatrix::Zero(nlambdas, nlambdas); - FVector ef = FVector::Zero(nlambdas); - for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations - grad.setZero(); - H.setZero(); - double loss = 0; - unsigned numchars = 0; - for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus - const vector<string>& sentence = corpus[ci]; - std::fill(context.begin(), context.end(), "<s>"); - for (unsigned t = 0; t < sentence.size(); ++t) { // words in sentence - ++numchars; - const string& ref_word_string = sentence[t]; - int ref_word = 0; // TODO - double z = 0; - //std::cerr << "here..." << std::endl; - for (unsigned i = 0; i < vocab.size(); ++i) { // vocab - set_features(context, vocab[i], models, feats[i]); - us[i] = params.dot(feats[i]); - z += exp(double(us[i])); - } - //std::cerr << "there..." << std::endl; - context.push_back(ref_word_string); - const float logz = log(z); - - // expected feature values - ef.setZero(); - for (unsigned i = 0; i < vocab.size(); ++i) { - ps[i] = expf(us[i] - logz); - ef += ps[i] * feats[i]; - } - loss -= log(ps[ref_word]); - const FVector& reffeats = feats[ref_word]; - grad += ef - reffeats; - - // Hessian - for (unsigned i = 0; i < vocab.size(); ++i) - H.noalias() += ps[i] * feats[i] * feats[i].transpose() - - ps[i] * feats[i] * ef.transpose(); - - // this should just be the state for each model - } - cerr << "."; - } - cerr << "ITERATION " << (iter + 1) << ": PPL=" << exp(loss / numchars) << endl; - params = H.colPivHouseholderQr().solve(grad); - cerr << params << endl; - } -} - -int main(int argc, char** argv) { - - std::string tuning_data; - std::vector<std::string> lms; - - try { - namespace po = boost::program_options; - po::options_description options("train-params"); - - options.add_options() - ("help,h", po::bool_switch(), "Show this help message") - ("no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature") - ("tuning_data,t", po::value<std::string>(&tuning_data), "File to tune perplexity on") - ("model,m", po::value<std::vector<std::string> >(&lms), "Language models in KenLM format to interpolate"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - - // Display help - if(argc == 1 || vm["help"].as<bool>()) { - std::cerr << options << std::endl; - return 1; - } - if (vm["no_bias_term"].as<bool>()) - HAS_BIAS = false; - lms = vm["model"].as<std::vector<std::string> >(); - tuning_data = vm["tuning_data"].as<std::string>(); - } - catch(const std::exception &e) { - - std::cerr << e.what() << std::endl; - return 1; - - } - if (lms.size() < 2) { - std::cerr << "Please specify at least two language model files with -m LM.KLM\n"; - return 1; - } - if (tuning_data.empty()) { - std::cerr << "Please specify tuning set with -t FILE.TXT\n"; - return 1; - } - - //Growable vocab here - //GrowableVocab gvoc(100000); //dummy default - - std::map<std::string, int*> vmap; - util::FixedArray<WordIndex> vm(2); - - //stuff it into the - EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(&vmap, lms.size()); - // EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(vm); - - Config cfg; - cfg.enumerate_vocab = (EnumerateVocab *) globalVocabBuilder; - - //load models - //util::FixedArray<Model *> models(lms.size()); - std::vector<Model 
*> models; - for(int i=0; i < lms.size(); i++) { - std::cerr << "Loading LM file: " << lms[i] << std::endl; - - //haaaack - globalVocabBuilder->SetCurModel(i); //yes this is dumb - - //models[i] = new Model(lms[i].c_str()); - Model * this_model = new Model(lms[i].c_str(), cfg); - models.push_back( this_model ); - - } - - //assemble vocabulary vector - std::vector<std::string> vocab; - std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl; - - std::pair<StringPiece,int *> me; - - for(std::map<std::string, int*>::iterator iter = vmap.begin(); iter != vmap.end(); ++iter) { - vocab.push_back(iter->first); - } - std::cerr << "Vocab vector has size: " << vocab.size() << std::endl; - - //load context sorted ngrams into vector of vectors - std::vector<std::vector<std::string> > corpus; - - std::cerr << "Loading context-sorted ngrams: " << tuning_data << std::endl; - std::ifstream infile(tuning_data); - - for(std::string line; std::getline(infile, line); ) { - - std::vector<std::string> words; { - - std::stringstream stream(line); - std::string word; - - while(stream >> word) { - words.push_back(word); - } - } - corpus.push_back(words); - } - - train_params_fast(corpus, vocab, models); - - return 0; -} - - - http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_derivatives.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/tune_derivatives.cc b/ext/kenlm/lm/interpolate/tune_derivatives.cc deleted file mode 100644 index c20e637..0000000 --- a/ext/kenlm/lm/interpolate/tune_derivatives.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include "lm/interpolate/tune_derivatives.hh" - -namespace lm { namespace interpolate { - -ComputeDerivative::ComputeDerivative(const util::FixedArray<Instance> &instances, const Matrix &ln_unigrams, WordIndex bos) - : instances_(instances), ln_unigrams_(ln_unigrams), bos_(bos) { - neg_correct_summed_ = Vector::Zero(ln_unigrams.cols()); - for (const Instance *i = instances.begin(); i != instances.end(); ++i) { - neg_correct_summed_ -= i->ln_correct; - } -} - -Accum ComputeDerivative::Iteration(const Vector &weights, Vector &gradient, Matrix &hessian) { - gradient = neg_correct_summed_; - hessian = Matrix::Zero(weights.rows(), weights.rows()); - - // TODO: loop instead to force low-memory evaluation - // Compute p_I(x). - Vector interp_uni((ln_unigrams_ * weights).array().exp()); - // Even -inf doesn't work for <s> because weights can be negative. Manually set it to zero. 
- interp_uni(bos_) = 0.0; - Accum Z_epsilon = interp_uni.sum(); - interp_uni /= Z_epsilon; - // unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x) - Vector unigram_cross(ln_unigrams_.transpose() * interp_uni); - - Accum sum_B_I = 0.0; - Accum sum_ln_Z_context = 0.0; - - Vector weighted_extensions; - Matrix convolve; - Vector full_cross; - - for (const Instance *n = instances_.begin(); n != instances_.end(); ++n) { - Accum ln_weighted_backoffs = n->ln_backoff.dot(weights); - Accum weighted_backoffs = exp(ln_weighted_backoffs); - - // Compute \sum_{x: model does not backoff to unigram} p_I(x) - Accum sum_x_p_I = 0.0; - for (std::vector<WordIndex>::const_iterator x = n->extension_words.begin(); x != n->extension_words.end(); ++x) { - sum_x_p_I += interp_uni(*x); - } - weighted_extensions = (n->ln_extensions * weights).array().exp(); - Accum Z_context = Z_epsilon * weighted_backoffs * (1.0 - sum_x_p_I) + weighted_extensions.sum(); - sum_ln_Z_context += log(Z_context); - - Accum B_I = Z_epsilon / Z_context * weighted_backoffs; - sum_B_I += B_I; - - // This is the gradient term for this instance except for -log p_i(w_n | w_1^{n-1}) which was accounted for as part of neg_correct_sum_. - // full_cross(i) is \sum_{all x} p_I(x | context) log p_i(x | context) - full_cross = - // Uncorrected term - B_I * (n->ln_backoff + unigram_cross) - // Correction term: add correct values - + n->ln_extensions.transpose() * weighted_extensions / Z_context - // Subtract values that should not have been charged. - - sum_x_p_I * B_I * n->ln_backoff; - for (std::vector<WordIndex>::const_iterator x = n->extension_words.begin(); x != n->extension_words.end(); ++x) { - full_cross.noalias() -= interp_uni(*x) * B_I * ln_unigrams_.row(*x); - } - - gradient += full_cross; - - convolve = unigram_cross * n->ln_backoff.transpose(); - // There's one missing term here, which is independent of context and done at the end. - hessian.noalias() += - // First term of Hessian, assuming all models back off to unigram. - B_I * (convolve + convolve.transpose() + n->ln_backoff * n->ln_backoff.transpose()) - // Second term of Hessian, with correct full probabilities. - - full_cross * full_cross.transpose(); - - // Adjust the first term of the Hessian to account for extension - for (std::size_t x = 0; x < n->extension_words.size(); ++x) { - WordIndex universal_x = n->extension_words[x]; - hessian.noalias() += - // Replacement terms. - weighted_extensions(x) / Z_context * n->ln_extensions.row(x).transpose() * n->ln_extensions.row(x) - // Presumed unigrams. TODO: individual terms with backoffs pulled out? Maybe faster? 
-        - interp_uni(universal_x) * B_I * (ln_unigrams_.row(universal_x).transpose() + n->ln_backoff) * (ln_unigrams_.row(universal_x) + n->ln_backoff.transpose());
-    }
-  }
-
-  for (Matrix::Index x = 0; x < interp_uni.rows(); ++x) {
-    // \sum_{contexts} B_I(context) \sum_x p_I(x) log p_i(x) log p_j(x)
-    hessian.noalias() += sum_B_I * interp_uni(x) * ln_unigrams_.row(x).transpose() * ln_unigrams_.row(x);
-  }
-  return exp((neg_correct_summed_.dot(weights) + sum_ln_Z_context) / static_cast<double>(instances_.size()));
-}
-
-}} // namespaces

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_derivatives.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_derivatives.hh b/ext/kenlm/lm/interpolate/tune_derivatives.hh
deleted file mode 100644
index 40c058e..0000000
--- a/ext/kenlm/lm/interpolate/tune_derivatives.hh
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef LM_INTERPOLATE_TUNE_DERIVATIVES_H
-#define LM_INTERPOLATE_TUNE_DERIVATIVES_H
-
-#include "lm/interpolate/tune_instance.hh"
-
-#include <Eigen/Core>
-#include <cmath>
-
-namespace lm { namespace interpolate {
-
-class ComputeDerivative {
-  public:
-    explicit ComputeDerivative(const util::FixedArray<Instance> &instances, const Matrix &ln_unigrams, WordIndex bos);
-
-    Accum Iteration(const Vector &weights, Vector &gradient, Matrix &hessian);
-
-  private:
-    const util::FixedArray<Instance> &instances_;
-    const Matrix &ln_unigrams_;
-
-    const WordIndex bos_;
-
-    // neg_correct_summed_(i) = -\sum_n ln p_i(w_n | w_1^{n-1})
-    Vector neg_correct_summed_;
-};
-
-}} // namespaces
-
-#endif // LM_INTERPOLATE_TUNE_DERIVATIVES_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_derivatives_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_derivatives_test.cc b/ext/kenlm/lm/interpolate/tune_derivatives_test.cc
deleted file mode 100644
index 75c0d12..0000000
--- a/ext/kenlm/lm/interpolate/tune_derivatives_test.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "lm/interpolate/tune_derivatives.hh"
-
-#include "lm/interpolate/tune_instance.hh"
-
-#define BOOST_TEST_MODULE DerivativeTest
-#include <boost/test/unit_test.hpp>
-
-namespace lm { namespace interpolate { namespace {
-
-BOOST_AUTO_TEST_CASE(Small) {
-  // Three vocabulary words plus <s>, two models.
-  Matrix unigrams(4, 2);
-  unigrams <<
-    0.1, 0.6,
-    0.4, 0.3,
-    0.5, 0.1,
-    // <s>
-    1.0, 1.0;
-  unigrams = unigrams.array().log();
-
-  // One instance
-  util::FixedArray<Instance> instances(1);
-  instances.push_back(2);
-  Instance &instance = instances.back();
-
-  instance.ln_backoff << 0.2, 0.4;
-  instance.ln_backoff = instance.ln_backoff.array().log();
-
-  // Sparse cases: model 0 word 2 and model 1 word 1.
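-  // Every other (model, word) pair backs off: its probability in context is
-  // the model's unigram probability times the context's backoff weight.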
-
-  // Assuming that model 1 only matches word 1, this is p_1(1 | context)
-  Accum model_1_word_1 = 1.0 - .6 * .4 - .1 * .4;
-
-  // We'll suppose correct has WordIndex 1, which backs off in model 0, and matches in model 1
-  instance.ln_correct << (0.4 * 0.2), model_1_word_1;
-  instance.ln_correct = instance.ln_correct.array().log();
-
-  Accum model_0_word_2 = 1.0 - .1 * .2 - .4 * .2;
-
-  instance.extension_words.push_back(1);
-  instance.extension_words.push_back(2);
-  instance.ln_extensions.resize(2, 2);
-  instance.ln_extensions <<
-    (0.4 * 0.2), model_1_word_1,
-    model_0_word_2, 0.1 * 0.4;
-  instance.ln_extensions = instance.ln_extensions.array().log();
-
-  ComputeDerivative compute(instances, unigrams, 3);
-  Vector weights(2);
-  weights << 0.9, 1.2;
-
-  Vector gradient(2);
-  Matrix hessian(2, 2);
-  compute.Iteration(weights, gradient, hessian);
-
-  // p_I(x | context)
-  Vector p_I(3);
-  p_I <<
-    pow(0.1 * 0.2, 0.9) * pow(0.6 * 0.4, 1.2),
-    pow(0.4 * 0.2, 0.9) * pow(model_1_word_1, 1.2),
-    pow(model_0_word_2, 0.9) * pow(0.1 * 0.4, 1.2);
-  p_I /= p_I.sum();
-
-  Vector expected_gradient = -instance.ln_correct;
-  expected_gradient(0) += p_I(0) * log(0.1 * 0.2);
-  expected_gradient(0) += p_I(1) * log(0.4 * 0.2);
-  expected_gradient(0) += p_I(2) * log(model_0_word_2);
-  BOOST_CHECK_CLOSE(expected_gradient(0), gradient(0), 0.01);
-
-  expected_gradient(1) += p_I(0) * log(0.6 * 0.4);
-  expected_gradient(1) += p_I(1) * log(model_1_word_1);
-  expected_gradient(1) += p_I(2) * log(0.1 * 0.4);
-  BOOST_CHECK_CLOSE(expected_gradient(1), gradient(1), 0.01);
-
-  Matrix expected_hessian(2, 2);
-  expected_hessian(1, 0) =
-    // First term
-    p_I(0) * log(0.1 * 0.2) * log(0.6 * 0.4) +
-    p_I(1) * log(0.4 * 0.2) * log(model_1_word_1) +
-    p_I(2) * log(model_0_word_2) * log(0.1 * 0.4);
-  expected_hessian(1, 0) -=
-    (p_I(0) * log(0.1 * 0.2) + p_I(1) * log(0.4 * 0.2) + p_I(2) * log(model_0_word_2)) *
-    (p_I(0) * log(0.6 * 0.4) + p_I(1) * log(model_1_word_1) + p_I(2) * log(0.1 * 0.4));
-  expected_hessian(0, 1) = expected_hessian(1, 0);
-  BOOST_CHECK_CLOSE(expected_hessian(1, 0), hessian(1, 0), 0.01);
-  BOOST_CHECK_CLOSE(expected_hessian(0, 1), hessian(0, 1), 0.01);
-}
-
-}}} // namespaces

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance.cc b/ext/kenlm/lm/interpolate/tune_instance.cc
deleted file mode 100644
index f1c9924..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance.cc
+++ /dev/null
@@ -1,354 +0,0 @@
-#include "lm/interpolate/tune_instance.hh"
-
-#include "lm/common/model_buffer.hh"
-#include "lm/common/ngram_stream.hh"
-#include "lm/common/renumber.hh"
-#include "lm/enumerate_vocab.hh"
-#include "lm/interpolate/merge_vocab.hh"
-#include "lm/interpolate/universal_vocab.hh"
-#include "lm/lm_exception.hh"
-#include "util/file_piece.hh"
-#include "util/murmur_hash.hh"
-#include "util/stream/chain.hh"
-#include "util/tokenize_piece.hh"
-
-#include <boost/unordered_map.hpp>
-
-#include <cmath>
-#include <limits>
-#include <vector>
-
-namespace lm { namespace interpolate {
-
-// An extension without backoff weights applied yet.
-#pragma pack(push)
-#pragma pack(1)
-struct InitialExtension {
-  Extension ext;
-  // Order from which it came.
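-  // Kept through sorting so the right accumulated backoff can be applied to ext.ln_prob later.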
-  uint8_t order;
-};
-#pragma pack(pop)
-
-// Intended use:
-// For each model:
-//   stream through orders jointly in suffix order:
-//     Call MatchedBackoff for full matches.
-//     Call Exit when the context matches.
-//   Call FinishModel with the unigram probability of the correct word; get the full
-//   probability in return.
-// Use Backoffs to adjust records that were written to the stream.
-class InstanceMatch {
-  public:
-    InstanceMatch(ModelIndex models, uint8_t max_order, const WordIndex correct)
-      : seen_(std::numeric_limits<WordIndex>::max()),
-        backoffs_(Matrix::Zero(models, max_order)),
-        correct_(correct), correct_from_(1), correct_ln_prob_(std::numeric_limits<float>::quiet_NaN()) {}
-
-    void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) {
-      backoffs_(model, order - 1) = ln_backoff;
-    }
-
-    // We only want the highest-order matches, which are the first to be exited for a given word.
-    void Exit(const InitialExtension &from, util::stream::Stream &out) {
-      if (from.ext.word == seen_) return;
-      seen_ = from.ext.word;
-      *static_cast<InitialExtension*>(out.Get()) = from;
-      ++out;
-      if (UTIL_UNLIKELY(correct_ == from.ext.word)) {
-        correct_from_ = from.order;
-        correct_ln_prob_ = from.ext.ln_prob;
-      }
-    }
-
-    WordIndex Correct() const { return correct_; }
-
-    // Call this after each model has been passed through.  Returns the model's
-    // full log probability of the correct word, with backoff applied.
-    float FinishModel(ModelIndex model, float correct_ln_unigram) {
-      seen_ = std::numeric_limits<WordIndex>::max();
-      // Turn backoffs into multiplied values (added in log space).
-      // So backoffs_(model, order - 1) is the penalty for matching order.
-      float accum = 0.0;
-      for (int order = backoffs_.cols() - 1; order >= 0; --order) {
-        accum += backoffs_(model, order);
-        backoffs_(model, order) = accum;
-      }
-      if (correct_from_ == 1) {
-        correct_ln_prob_ = correct_ln_unigram;
-      }
-      if (correct_from_ - 1 < backoffs_.cols()) {
-        correct_ln_prob_ += backoffs_(model, correct_from_ - 1);
-      }
-      correct_from_ = 1;
-      return correct_ln_prob_;
-    }
-
-    const Matrix &Backoffs() const {
-      return backoffs_;
-    }
-
-  private:
-    // What's the last word we've seen?  Used to act only on exiting the longest match.
-    WordIndex seen_;
-
-    Matrix backoffs_;
-
-    const WordIndex correct_;
-
-    // These only apply to the most recent model.
-    uint8_t correct_from_;
-
-    float correct_ln_prob_;
-};
-
-namespace {
-
-// Forward information to multiple instances of a context.
-class DispatchContext {
-  public:
-    void Register(InstanceMatch &context) {
-      registered_.push_back(&context);
-    }
-
-    void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) {
-      for (std::vector<InstanceMatch*>::iterator i = registered_.begin(); i != registered_.end(); ++i)
-        (*i)->MatchedBackoff(model, order, ln_backoff);
-    }
-
-    void Exit(const InitialExtension &from, util::stream::Stream &out) {
-      for (std::vector<InstanceMatch*>::iterator i = registered_.begin(); i != registered_.end(); ++i) {
-        (*i)->Exit(from, out);
-      }
-    }
-
-  private:
-    std::vector<InstanceMatch*> registered_;
-};
-
-// Map from n-gram hash to contexts in the tuning data.
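-// Keys are MurmurHashNative over the context's WordIndex array in the merged
-// vocabulary, so every suffix of a tuning context gets its own dispatch entry.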
-typedef boost::unordered_map<uint64_t, DispatchContext> ContextMap;
-
-class ApplyBackoffs {
-  public:
-    explicit ApplyBackoffs(const InstanceMatch *backoffs) : backoffs_(backoffs) {}
-
-    void Run(const util::stream::ChainPosition &position) {
-      for (util::stream::Stream stream(position); stream; ++stream) {
-        InitialExtension &ini = *reinterpret_cast<InitialExtension*>(stream.Get());
-        // Add the accumulated backoff for the order this extension matched at,
-        // mirroring the correct-word case in InstanceMatch::FinishModel.
-        ini.ext.ln_prob += backoffs_[ini.ext.instance].Backoffs()(ini.ext.model, ini.order - 1);
-      }
-    }
-
-  private:
-    const InstanceMatch *backoffs_;
-};
-
-void Instances::ReadExtensions(util::stream::Chain &on) {
-  if (extensions_first_.get()) {
-    // Lazy sort and save a sorted copy to disk.  TODO: cut down on record size by stripping out order information?
-    extensions_first_->Output(on);
-    extensions_first_.reset();
-    // TODO: apply backoff data!!!!
-
-    extensions_subsequent_.reset(new util::stream::FileBuffer(util::MakeTemp(sorting_config_.temp_prefix)));
-    on >> extensions_subsequent_->Sink();
-  } else {
-    on >> extensions_subsequent_->Source();
-  }
-}
-
-class UnigramLoader {
-  public:
-    UnigramLoader(ContextMap &contexts_for_backoffs, Matrix &ln_probs, std::size_t model_number)
-      : map_(contexts_for_backoffs),
-        prob_(ln_probs.col(model_number)),
-        model_(model_number) {}
-
-    void Run(const util::stream::ChainPosition &position) {
-      // TODO: handle the case of a unigram model?
-      NGramStream<ProbBackoff> input(position);
-      assert(input);
-      Accum unk = input->Value().prob * M_LN10;
-      WordIndex previous = 0;
-      for (; input; ++input) {
-        WordIndex word = *input->begin();
-        prob_.segment(previous, word - previous) = Vector::Constant(word - previous, unk);
-        prob_(word) = input->Value().prob * M_LN10;
-        ContextMap::iterator i = map_.find(util::MurmurHashNative(input->begin(), sizeof(WordIndex)));
-        if (i != map_.end()) {
-          i->second.MatchedBackoff(model_, 1, input->Value().backoff * M_LN10);
-        }
-        previous = word + 1;
-      }
-      prob_.segment(previous, prob_.rows() - previous) = Vector::Constant(prob_.rows() - previous, unk);
-    }
-
-  private:
-    ContextMap &map_;
-    Matrix::ColXpr prob_;
-    std::size_t model_;
-};
-
-class MiddleLoader {
-  public:
-    MiddleLoader(ContextMap &map, std::size_t model)
-      : map_(map), model_(model) {}
-
-    void Run(const util::stream::ChainPosition &position) {
-      NGramStream<ProbBackoff> input(position);
-      const std::size_t full_size = (uint8_t*)input->end() - (uint8_t*)input->begin();
-      const std::size_t context_size = full_size - sizeof(WordIndex);
-      ContextMap::iterator i;
-      for (; input; ++input) {
-        i = map_.find(util::MurmurHashNative(input->begin(), full_size));
-        if (i != map_.end()) {
-          i->second.MatchedBackoff(model_, input->Order(), input->Value().backoff * M_LN10);
-        }
-        i = map_.find(util::MurmurHashNative(input->begin(), context_size));
-        if (i != map_.end()) {
-          i->second.MatchedContext(input->Order(), *(input->end() - 1), input->Value().prob * M_LN10);
-        }
-      }
-    }
-
-  private:
-    ContextMap &map_;
-    std::size_t model_;
-};
-
-class HighestLoader {
-  public:
-    HighestLoader(ContextMap &map, uint8_t order)
-      : map_(map), order_(order) {}
-
-    void Run(const util::stream::ChainPosition &position) {
-      ContextMap::iterator i;
-      const std::size_t context_size = sizeof(WordIndex) * (order_ - 1);
-      for (ProxyStream<NGram<float> > input(position, NGram<float>(NULL, order_)); input; ++input) {
-        i = map_.find(util::MurmurHashNative(input->begin(), context_size));
-        if (i != map_.end()) {
-          i->second.MatchedContext(order_, *(input->end() - 1), input->Value() * M_LN10);
-        }
-      }
-    }
-
-  private:
-    ContextMap &map_;
-    const uint8_t order_;
-};
-
-class IdentifyTuning : public EnumerateVocab {
-  public:
-    IdentifyTuning(int
-        tuning_file, std::vector<WordIndex> &out) : indices_(out) {
-      indices_.clear();
-      StringPiece line;
-      std::size_t counter = 0;
-      std::vector<std::size_t> &eos = words_[util::MurmurHashNative("</s>", 4)];
-      for (util::FilePiece f(tuning_file); f.ReadLineOrEOF(line);) {
-        for (util::TokenIter<util::BoolCharacter, true> word(line, util::kSpaces); word; ++word) {
-          UTIL_THROW_IF(*word == "<s>" || *word == "</s>", FormatLoadException, "Illegal word in tuning data: " << *word);
-          words_[util::MurmurHashNative(word->data(), word->size())].push_back(counter++);
-        }
-        eos.push_back(counter++);
-      }
-      // Also get <s>.
-      indices_.resize(counter + 1);
-      words_[util::MurmurHashNative("<s>", 3)].push_back(indices_.size() - 1);
-    }
-
-    void Add(WordIndex id, const StringPiece &str) {
-      boost::unordered_map<uint64_t, std::vector<std::size_t> >::iterator i = words_.find(util::MurmurHashNative(str.data(), str.size()));
-      if (i != words_.end()) {
-        for (std::vector<std::size_t>::iterator j = i->second.begin(); j != i->second.end(); ++j) {
-          indices_[*j] = id;
-        }
-      }
-    }
-
-    WordIndex FinishGetBOS() {
-      WordIndex ret = indices_.back();
-      indices_.pop_back();
-      return ret;
-    }
-
-  private:
-    std::vector<WordIndex> &indices_;
-
-    boost::unordered_map<uint64_t, std::vector<std::size_t> > words_;
-};
-
-} // namespace
-
-Instance::Instance(std::size_t num_models) : ln_backoff(num_models), ln_correct(num_models), ln_extensions(0, num_models) {}
-
-WordIndex LoadInstances(int tuning_file, const std::vector<StringPiece> &model_names, util::FixedArray<Instance> &instances, Matrix &ln_unigrams) {
-  util::FixedArray<ModelBuffer> models(model_names.size());
-  std::vector<WordIndex> vocab_sizes;
-  vocab_sizes.reserve(model_names.size());
-  util::FixedArray<util::scoped_fd> vocab_files(model_names.size());
-  std::size_t max_order = 0;
-  for (std::vector<StringPiece>::const_iterator i = model_names.begin(); i != model_names.end(); ++i) {
-    models.push_back(*i);
-    vocab_sizes.push_back(models.back().Counts()[0]);
-    vocab_files.push_back(models.back().StealVocabFile());
-    max_order = std::max(max_order, models.back().Order());
-  }
-  UniversalVocab vocab(vocab_sizes);
-  std::vector<WordIndex> tuning_words;
-  WordIndex bos;
-  WordIndex combined_vocab_size;
-  {
-    IdentifyTuning identify(tuning_file, tuning_words);
-    combined_vocab_size = MergeVocab(vocab_files, vocab, identify);
-    bos = identify.FinishGetBOS();
-  }
-
-  instances.Init(tuning_words.size());
-  util::FixedArray<InstanceMatch> builders(tuning_words.size());
-  std::vector<WordIndex> context;
-  context.push_back(bos);
-
-  // Populate the map from contexts to instance builders.
-  ContextMap cmap;
-  const WordIndex eos = tuning_words.back();
-  for (std::size_t i = 0; i < tuning_words.size(); ++i) {
-    instances.push_back(model_names.size());
-    builders.push_back(model_names.size(), max_order, tuning_words[i]);
-    for (std::size_t j = 0; j < context.size(); ++j) {
-      cmap[util::MurmurHashNative(&context[j], sizeof(WordIndex) * (context.size() - j))].Register(builders.back());
-    }
-    // Prepare for the next word.
-    if (tuning_words[i] == eos) {
-      context.clear();
-      context.push_back(bos);
-    } else {
-      if (context.size() == max_order) {
-        context.erase(context.begin());
-      }
-      context.push_back(tuning_words[i]);
-    }
-  }
-
-  ln_unigrams.resize(combined_vocab_size, models.size());
-
-  // Scan through the input files.  Sadly not parallel due to an underlying hash table.
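-  // For each model, stream every order through a chain: renumber n-grams into
-  // the merged vocabulary, then route unigrams, middle orders, and the highest
-  // order to loaders that fill ln_unigrams and report backoff/context matches
-  // to the registered instance builders via cmap.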
-  for (std::size_t m = 0; m < models.size(); ++m) {
-    for (std::size_t order = 1; order <= models[m].Order(); ++order) {
-      util::stream::Chain chain(util::stream::ChainConfig(sizeof(ProbBackoff) + order * sizeof(WordIndex), 2, 64 * 1048576));
-      models[m].Source(order - 1, chain);
-      chain >> Renumber(vocab.Mapping(m), order);
-      if (order == 1) {
-        chain >> UnigramLoader(cmap, ln_unigrams, m);
-      } else if (order < models[m].Order()) {
-        chain >> MiddleLoader(cmap, m);
-      } else {
-        chain >> HighestLoader(cmap, order);
-      }
-    }
-    for (std::size_t instance = 0; instance < tuning_words.size(); ++instance) {
-      builders[instance].Dump(m, ln_unigrams, instances[instance]);
-    }
-    ln_unigrams(bos, m) = -99;  // Does not matter as long as it does not produce NaNs, since tune_derivatives sets this to zero.
-  }
-  return bos;
-}
-
-}} // namespaces

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance.hh b/ext/kenlm/lm/interpolate/tune_instance.hh
deleted file mode 100644
index c11eec9..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance.hh
+++ /dev/null
@@ -1,86 +0,0 @@
-#ifndef LM_INTERPOLATE_TUNE_INSTANCE_H
-#define LM_INTERPOLATE_TUNE_INSTANCE_H
-
-#include "lm/interpolate/tune_matrix.hh"
-#include "lm/word_index.hh"
-#include "util/scoped.hh"
-#include "util/stream/config.hh"
-#include "util/string_piece.hh"
-
-#include <boost/optional.hpp>
-
-#include <vector>
-
-namespace util { namespace stream {
-template <class S, class T> class Sort;
-class Chain;
-class FileBuffer;
-}} // namespaces
-
-namespace lm { namespace interpolate {
-
-typedef uint32_t InstanceIndex;
-typedef uint32_t ModelIndex;
-
-struct Extension {
-  // Which tuning instance does this belong to?
-  InstanceIndex instance;
-  WordIndex word;
-  ModelIndex model;
-  // ln p_{model} (word | context(instance))
-  float ln_prob;
-
-  bool operator<(const Extension &other) const {
-    if (instance != other.instance)
-      return instance < other.instance;
-    if (word != other.word)
-      return word < other.word;
-    if (model != other.model)
-      return model < other.model;
-    return false;
-  }
-};
-
-class Instances {
-  public:
-    // ln_backoffs_(instance, model) is the backoff all the way to unigrams.
-    typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> BackoffMatrix;
-
-    Instances(int tune_file, const std::vector<StringPiece> &model_names);
-
-    BackoffMatrix::ConstRowXpr Backoffs(InstanceIndex instance) const {
-      return ln_backoffs_.row(instance);
-    }
-
-    const Vector &CorrectGradientTerm() const { return neg_ln_correct_sum_; }
-
-    const Matrix &LNUnigrams() const { return ln_unigrams_; }
-
-    void ReadExtensions(util::stream::Chain &to);
-
-  private:
-    BackoffMatrix ln_backoffs_;
-
-    // neg_ln_correct_sum_(model) = -\sum_{instances} ln p_{model}(correct(instance) | context(instance)).
-    // This appears as a term in the gradient.
-    Vector neg_ln_correct_sum_;
-
-    // ln_unigrams_(word, model) = ln p_{model}(word)
-    Matrix ln_unigrams_;
-
-    struct ExtensionCompare {
-      bool operator()(const void *f, const void *s) const {
-        return *reinterpret_cast<const Extension*>(f) < *reinterpret_cast<const Extension*>(s);
-      }
-    };
-
-    // This is the source of data for the first iteration.
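-    // After ReadExtensions streams it out once, the sorted records are kept on
-    // disk in extensions_subsequent_ below and this pointer is released.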
-    util::scoped_ptr<util::stream::Sort<ExtensionCompare> > extensions_first_;
-
-    // Source of data for subsequent iterations.  This contains already-sorted data.
-    util::scoped_ptr<util::stream::FileBuffer> extensions_subsequent_;
-
-    const util::stream::SortConfig sorting_config_;
-};
-
-}} // namespaces
-
-#endif // LM_INTERPOLATE_TUNE_INSTANCE_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/generate.sh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/generate.sh b/ext/kenlm/lm/interpolate/tune_instance_data/generate.sh
deleted file mode 100755
index d725572..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance_data/generate.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-../../../bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy0 --arpa toy0.arpa <<EOF
-a a b a
-b a a b
-EOF
-../../../bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy1 --arpa toy1.arpa <<EOF
-a a b b b b b b b
-c
-EOF

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.1
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.1 b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.1
deleted file mode 100644
index 1b66c51..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.1 and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.2
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.2 b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.2
deleted file mode 100644
index d735b1c..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.2 and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.3
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.3 b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.3
deleted file mode 100644
index 2d97aa3..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.3 and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.kenlm_intermediate
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.kenlm_intermediate b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.kenlm_intermediate
deleted file mode 100644
index 8513475..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.kenlm_intermediate
+++ /dev/null
@@ -1,3 +0,0 @@
-KenLM intermediate binary file
-Counts 5 7 7
-Payload pb

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.vocab
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.vocab b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.vocab
deleted file mode 100644
index 520c0f9..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.vocab and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.1
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.1 b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.1
deleted file mode 100644
index a50cec6..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.1 and /dev/null differ
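These toy models exist to exercise the tuning loop deleted above: ComputeDerivative::Iteration fills a gradient and Hessian at the current weights, and the caller takes a Newton step. The following minimal, self-contained C++/Eigen sketch shows only that outer-loop shape; it is illustrative rather than code from this commit, and the quadratic objective (matrix A, vector b, starting weights) is a stand-in for the real per-instance computation.

#include <Eigen/Dense>
#include <iostream>

int main() {
  // Two interpolation weights, one per model in the toy setup.
  Eigen::VectorXd weights = Eigen::VectorXd::Constant(2, 0.5);
  // Stand-in quadratic objective so the sketch runs on its own: its gradient
  // A * weights - b and constant Hessian A play the roles that
  // ComputeDerivative::Iteration fills in from real tuning instances.
  Eigen::MatrixXd A(2, 2);
  A << 2.0, 0.3,
       0.3, 1.5;
  Eigen::VectorXd b(2);
  b << 0.9, 1.2;
  for (int iteration = 0; iteration < 5; ++iteration) {
    Eigen::VectorXd gradient = A * weights - b;  // compute.Iteration(...) in the real code
    Eigen::MatrixXd hessian = A;
    // Newton step: solve hessian * step = gradient; LDLT suits the symmetric Hessian.
    weights -= hessian.ldlt().solve(gradient);
    std::cout << "iteration " << iteration << ": " << weights.transpose() << std::endl;
  }
  return 0;
}

On this stand-in the step converges immediately because the objective is exactly quadratic; in the real tuner the Hessian changes between iterations, so several steps are needed.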
