http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/search_trie.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/search_trie.cc b/ext/kenlm/lm/search_trie.cc deleted file mode 100644 index a63985a..0000000 --- a/ext/kenlm/lm/search_trie.cc +++ /dev/null @@ -1,600 +0,0 @@ -/* This is where the trie is built. It's on-disk. */ -#include "lm/search_trie.hh" - -#include "lm/bhiksha.hh" -#include "lm/binary_format.hh" -#include "lm/blank.hh" -#include "lm/lm_exception.hh" -#include "lm/max_order.hh" -#include "lm/quantize.hh" -#include "lm/trie.hh" -#include "lm/trie_sort.hh" -#include "lm/vocab.hh" -#include "lm/weights.hh" -#include "lm/word_index.hh" -#include "util/ersatz_progress.hh" -#include "util/mmap.hh" -#include "util/proxy_iterator.hh" -#include "util/scoped.hh" -#include "util/sized_iterator.hh" - -#include <algorithm> -#include <cstring> -#include <cstdio> -#include <cstdlib> -#include <queue> -#include <limits> -#include <numeric> -#include <vector> - -#if defined(_WIN32) || defined(_WIN64) -#include <windows.h> -#endif - -namespace lm { -namespace ngram { -namespace trie { -namespace { - -void ReadOrThrow(FILE *from, void *data, size_t size) { - UTIL_THROW_IF(1 != std::fread(data, size, 1, from), util::ErrnoException, "Short read"); -} - -int Compare(unsigned char order, const void *first_void, const void *second_void) { - const WordIndex *first = reinterpret_cast<const WordIndex*>(first_void), *second = reinterpret_cast<const WordIndex*>(second_void); - const WordIndex *end = first + order; - for (; first != end; ++first, ++second) { - if (*first < *second) return -1; - if (*first > *second) return 1; - } - return 0; -} - -struct ProbPointer { - unsigned char array; - uint64_t index; -}; - -// Array of n-grams and float indices. -class BackoffMessages { - public: - void Init(std::size_t entry_size) { - current_ = NULL; - allocated_ = NULL; - entry_size_ = entry_size; - } - - void Add(const WordIndex *to, ProbPointer index) { - while (current_ + entry_size_ > allocated_) { - std::size_t allocated_size = allocated_ - (uint8_t*)backing_.get(); - Resize(std::max<std::size_t>(allocated_size * 2, entry_size_)); - } - memcpy(current_, to, entry_size_ - sizeof(ProbPointer)); - *reinterpret_cast<ProbPointer*>(current_ + entry_size_ - sizeof(ProbPointer)) = index; - current_ += entry_size_; - } - - void Apply(float *const *const base, FILE *unigrams) { - FinishedAdding(); - if (current_ == allocated_) return; - rewind(unigrams); - ProbBackoff weights; - WordIndex unigram = 0; - ReadOrThrow(unigrams, &weights, sizeof(weights)); - for (; current_ != allocated_; current_ += entry_size_) { - const WordIndex &cur_word = *reinterpret_cast<const WordIndex*>(current_); - for (; unigram < cur_word; ++unigram) { - ReadOrThrow(unigrams, &weights, sizeof(weights)); - } - if (!HasExtension(weights.backoff)) { - weights.backoff = kExtensionBackoff; - UTIL_THROW_IF(fseek(unigrams, -sizeof(weights), SEEK_CUR), util::ErrnoException, "Seeking backwards to denote unigram extension failed."); - util::WriteOrThrow(unigrams, &weights, sizeof(weights)); - } - const ProbPointer &write_to = *reinterpret_cast<const ProbPointer*>(current_ + sizeof(WordIndex)); - base[write_to.array][write_to.index] += weights.backoff; - } - backing_.reset(); - } - - void Apply(float *const *const base, RecordReader &reader) { - FinishedAdding(); - if (current_ == allocated_) return; - // We'll also use the same buffer to record messages to blanks that they extend. - WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_); - const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex); - for (reader.Rewind(); reader && (current_ != allocated_); ) { - switch (Compare(order, reader.Data(), current_)) { - case -1: - ++reader; - break; - case 1: - // Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends. - for (const WordIndex *w = reinterpret_cast<const WordIndex *>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, ++extend_out) *extend_out = *w; - current_ += entry_size_; - break; - case 0: - float &backoff = reinterpret_cast<ProbBackoff*>((uint8_t*)reader.Data() + order * sizeof(WordIndex))->backoff; - if (!HasExtension(backoff)) { - backoff = kExtensionBackoff; - reader.Overwrite(&backoff, sizeof(float)); - } else { - const ProbPointer &write_to = *reinterpret_cast<const ProbPointer*>(current_ + entry_size_ - sizeof(ProbPointer)); - base[write_to.array][write_to.index] += backoff; - } - current_ += entry_size_; - break; - } - } - // Now this is a list of blanks that extend right. - entry_size_ = sizeof(WordIndex) * order; - Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get())); - current_ = (uint8_t*)backing_.get(); - } - - // Call after Apply - bool Extends(unsigned char order, const WordIndex *words) { - if (current_ == allocated_) return false; - assert(order * sizeof(WordIndex) == entry_size_); - while (true) { - switch(Compare(order, words, current_)) { - case 1: - current_ += entry_size_; - if (current_ == allocated_) return false; - break; - case -1: - return false; - case 0: - return true; - } - } - } - - private: - void FinishedAdding() { - Resize(current_ - (uint8_t*)backing_.get()); - // Sort requests in same order as files. - std::sort( - util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)), - util::SizedIterator(util::SizedProxy(current_, entry_size_)), - util::SizedCompare<EntryCompare>(EntryCompare((entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex)))); - current_ = (uint8_t*)backing_.get(); - } - - void Resize(std::size_t to) { - std::size_t current = current_ - (uint8_t*)backing_.get(); - backing_.call_realloc(to); - current_ = (uint8_t*)backing_.get() + current; - allocated_ = (uint8_t*)backing_.get() + to; - } - - util::scoped_malloc backing_; - - uint8_t *current_, *allocated_; - - std::size_t entry_size_; -}; - -const float kBadProb = std::numeric_limits<float>::infinity(); - -class SRISucks { - public: - SRISucks() { - for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i) - i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1)); - } - - void Send(unsigned char begin, unsigned char order, const WordIndex *to, float prob_basis) { - assert(prob_basis != kBadProb); - ProbPointer pointer; - pointer.array = order - 1; - pointer.index = values_[order - 1].size(); - for (unsigned char i = begin; i < order; ++i) { - messages_[i - 1].Add(to, pointer); - } - values_[order - 1].push_back(prob_basis); - } - - void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) { - for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) { - it_[i] = values_[i].empty() ? NULL : &*values_[i].begin(); - } - messages_[0].Apply(it_, unigram_file); - BackoffMessages *messages = messages_ + 1; - const RecordReader *end = reader + total_order - 2 /* exclude unigrams and longest order */; - for (; reader != end; ++messages, ++reader) { - messages->Apply(it_, *reader); - } - } - - ProbBackoff GetBlank(unsigned char total_order, unsigned char order, const WordIndex *indices) { - assert(order > 1); - ProbBackoff ret; - ret.prob = *(it_[order - 1]++); - ret.backoff = ((order != total_order - 1) && messages_[order - 1].Extends(order, indices)) ? kExtensionBackoff : kNoExtensionBackoff; - return ret; - } - - const std::vector<float> &Values(unsigned char order) const { - return values_[order - 1]; - } - - private: - // This used to be one array. Then I needed to separate it by order for quantization to work. - std::vector<float> values_[KENLM_MAX_ORDER - 1]; - BackoffMessages messages_[KENLM_MAX_ORDER - 1]; - - float *it_[KENLM_MAX_ORDER - 1]; -}; - -class FindBlanks { - public: - FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages) - : counts_(order), unigrams_(unigrams), sri_(messages) {} - - float UnigramProb(WordIndex index) const { - return unigrams_[index].prob; - } - - void Unigram(WordIndex /*index*/) { - ++counts_[0]; - } - - void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char lower, float prob_basis) { - sri_.Send(lower, order, indices + 1, prob_basis); - ++counts_[order - 1]; - } - - void Middle(const unsigned char order, const void * /*data*/) { - ++counts_[order - 1]; - } - - void Longest(const void * /*data*/) { - ++counts_.back(); - } - - const std::vector<uint64_t> &Counts() const { - return counts_; - } - - private: - std::vector<uint64_t> counts_; - - const ProbBackoff *unigrams_; - - SRISucks &sri_; -}; - -// Phase to actually write n-grams to the trie. -template <class Quant, class Bhiksha> class WriteEntries { - public: - WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) : - contexts_(contexts), - quant_(quant), - unigrams_(unigrams), - middle_(middle), - longest_(longest), - bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : static_cast<BitPacked&>(*middle_)), - order_(order), - sri_(sri) {} - - float UnigramProb(WordIndex index) const { return unigrams_[index].weights.prob; } - - void Unigram(WordIndex word) { - unigrams_[word].next = bigram_pack_.InsertIndex(); - } - - void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char /*lower*/, float /*prob_base*/) { - ProbBackoff weights = sri_.GetBlank(order_, order, indices); - typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 2].Insert(indices[order - 1])).Write(weights.prob, weights.backoff); - } - - void Middle(const unsigned char order, const void *data) { - RecordReader &context = contexts_[order - 1]; - const WordIndex *words = reinterpret_cast<const WordIndex*>(data); - ProbBackoff weights = *reinterpret_cast<const ProbBackoff*>(words + order); - if (context && !memcmp(data, context.Data(), sizeof(WordIndex) * order)) { - SetExtension(weights.backoff); - ++context; - } - typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 2].Insert(words[order - 1])).Write(weights.prob, weights.backoff); - } - - void Longest(const void *data) { - const WordIndex *words = reinterpret_cast<const WordIndex*>(data); - typename Quant::LongestPointer(quant_, longest_.Insert(words[order_ - 1])).Write(reinterpret_cast<const Prob*>(words + order_)->prob); - } - - private: - RecordReader *contexts_; - const Quant &quant_; - UnigramValue *const unigrams_; - BitPackedMiddle<Bhiksha> *const middle_; - BitPackedLongest &longest_; - BitPacked &bigram_pack_; - const unsigned char order_; - SRISucks &sri_; -}; - -struct Gram { - Gram(const WordIndex *in_begin, unsigned char order) : begin(in_begin), end(in_begin + order) {} - - const WordIndex *begin, *end; - - // For queue, this is the direction we want. - bool operator<(const Gram &other) const { - return std::lexicographical_compare(other.begin, other.end, begin, end); - } -}; - -template <class Doing> class BlankManager { - public: - BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) { - for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb; - } - - void Visit(const WordIndex *to, unsigned char length, float prob) { - basis_[length - 1] = prob; - unsigned char overlap = std::min<unsigned char>(length - 1, been_length_); - const WordIndex *cur; - WordIndex *pre; - for (cur = to, pre = been_; cur != to + overlap; ++cur, ++pre) { - if (*pre != *cur) break; - } - if (cur == to + length - 1) { - *pre = *cur; - been_length_ = length; - return; - } - // There are blanks to insert starting with order blank. - unsigned char blank = cur - to + 1; - UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); - const float *lower_basis; - for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {} - unsigned char based_on = lower_basis - basis_ + 1; - for (; cur != to + length - 1; ++blank, ++cur, ++pre) { - assert(*lower_basis != kBadProb); - doing_.MiddleBlank(blank, to, based_on, *lower_basis); - *pre = *cur; - // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. - basis_[blank - 1] = kBadProb; - } - *pre = *cur; - been_length_ = length; - } - - private: - const unsigned char total_order_; - - WordIndex been_[KENLM_MAX_ORDER]; - unsigned char been_length_; - - float basis_[KENLM_MAX_ORDER]; - - Doing &doing_; -}; - -template <class Doing> void RecursiveInsert(const unsigned char total_order, const WordIndex unigram_count, RecordReader *input, std::ostream *progress_out, const char *message, Doing &doing) { - util::ErsatzProgress progress(unigram_count + 1, progress_out, message); - WordIndex unigram = 0; - std::priority_queue<Gram> grams; - if (unigram_count) grams.push(Gram(&unigram, 1)); - for (unsigned char i = 2; i <= total_order; ++i) { - if (input[i-2]) grams.push(Gram(reinterpret_cast<const WordIndex*>(input[i-2].Data()), i)); - } - - BlankManager<Doing> blank(total_order, doing); - - while (!grams.empty()) { - Gram top = grams.top(); - grams.pop(); - unsigned char order = top.end - top.begin; - if (order == 1) { - blank.Visit(&unigram, 1, doing.UnigramProb(unigram)); - doing.Unigram(unigram); - progress.Set(unigram); - if (++unigram < unigram_count) grams.push(top); - } else { - if (order == total_order) { - blank.Visit(top.begin, order, reinterpret_cast<const Prob*>(top.end)->prob); - doing.Longest(top.begin); - } else { - blank.Visit(top.begin, order, reinterpret_cast<const ProbBackoff*>(top.end)->prob); - doing.Middle(order, top.begin); - } - RecordReader &reader = input[order - 2]; - if (++reader) grams.push(top); - } - } -} - -void SanityCheckCounts(const std::vector<uint64_t> &initial, const std::vector<uint64_t> &fixed) { - if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]); - if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back()); - for (unsigned char i = 0; i < initial.size(); ++i) { - if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected. This shouldn't happen"); - } -} - -template <class Quant> void TrainQuantizer(uint8_t order, uint64_t count, const std::vector<float> &additional, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) { - std::vector<float> probs(additional), backoffs; - probs.reserve(count + additional.size()); - backoffs.reserve(count); - for (reader.Rewind(); reader; ++reader) { - const ProbBackoff &weights = *reinterpret_cast<const ProbBackoff*>(reinterpret_cast<const uint8_t*>(reader.Data()) + sizeof(WordIndex) * order); - probs.push_back(weights.prob); - if (weights.backoff != 0.0) backoffs.push_back(weights.backoff); - ++progress; - } - quant.Train(order, probs, backoffs); -} - -template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) { - std::vector<float> probs, backoffs; - probs.reserve(count); - for (reader.Rewind(); reader; ++reader) { - const Prob &weights = *reinterpret_cast<const Prob*>(reinterpret_cast<const uint8_t*>(reader.Data()) + sizeof(WordIndex) * order); - probs.push_back(weights.prob); - ++progress; - } - quant.TrainProb(order, probs); -} - -void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) { - // Fill unigram probabilities. - try { - rewind(file); - for (WordIndex i = 0; i < unigram_count; ++i) { - ReadOrThrow(file, &unigrams[i].weights, sizeof(ProbBackoff)); - if (contexts && *reinterpret_cast<const WordIndex*>(contexts.Data()) == i) { - SetExtension(unigrams[i].weights.backoff); - ++contexts; - } - } - } catch (util::Exception &e) { - e << " while re-reading unigram probabilities"; - throw; - } -} - -} // namespace - -template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing) { - RecordReader inputs[KENLM_MAX_ORDER - 1]; - RecordReader contexts[KENLM_MAX_ORDER - 1]; - - for (unsigned char i = 2; i <= counts.size(); ++i) { - inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff))); - contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex)); - } - - SRISucks sri; - std::vector<uint64_t> fixed_counts; - util::scoped_FILE unigram_file; - util::scoped_fd unigram_fd(files.StealUnigram()); - { - util::scoped_memory unigrams; - MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); - FindBlanks finder(counts.size(), reinterpret_cast<const ProbBackoff*>(unigrams.get()), sri); - RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder); - fixed_counts = finder.Counts(); - } - unigram_file.reset(util::FDOpenOrThrow(unigram_fd)); - for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) { - if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading"); - } - SanityCheckCounts(counts, fixed_counts); - counts = fixed_counts; - - sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); - - void *vocab_relocate; - void *search_base = backing.GrowForSearch(TrieSearch<Quant, Bhiksha>::Size(fixed_counts, config), vocab.UnkCountChangePadding(), vocab_relocate); - vocab.Relocate(vocab_relocate); - out.SetupMemory(reinterpret_cast<uint8_t*>(search_base), fixed_counts, config); - - for (unsigned char i = 2; i <= counts.size(); ++i) { - inputs[i-2].Rewind(); - } - if (Quant::kTrain) { - util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), - config.ProgressMessages(), "Quantizing"); - for (unsigned char i = 2; i < counts.size(); ++i) { - TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant); - } - TrainProbQuantizer(counts.size(), counts.back(), inputs[counts.size() - 2], progress, quant); - quant.FinishedLoading(config); - } - - UnigramValue *unigrams = out.unigram_.Raw(); - PopulateUnigramWeights(unigram_file.get(), counts[0], contexts[0], unigrams); - unigram_file.reset(); - - for (unsigned char i = 2; i <= counts.size(); ++i) { - inputs[i-2].Rewind(); - } - // Fill entries except unigram probabilities. - { - WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri); - RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer); - // Write the last unigram entry, which is the end pointer for the bigrams. - writer.Unigram(counts[0]); - } - - // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation. - for (unsigned char order = 2; order <= counts.size(); ++order) { - const RecordReader &context = contexts[order - 2]; - if (context) { - FormatLoadException e; - e << "A " << static_cast<unsigned int>(order) << "-gram has context"; - const WordIndex *ctx = reinterpret_cast<const WordIndex*>(context.Data()); - for (const WordIndex *i = ctx; i != ctx + order - 1; ++i) { - e << ' ' << *i; - } - e << " so this context must appear in the model as a " << static_cast<unsigned int>(order - 1) << "-gram but it does not"; - throw e; - } - } - - /* Set ending offsets so the last entry will be sized properly */ - // Last entry for unigrams was already set. - if (out.middle_begin_ != out.middle_end_) { - for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) { - i->FinishedLoading((i+1)->InsertIndex(), config); - } - (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config); - } -} - -template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) { - quant_.SetupMemory(start, counts.size(), config); - start += Quant::Size(counts.size(), config); - unigram_.Init(start); - start += Unigram::Size(counts[0]); - FreeMiddles(); - middle_begin_ = static_cast<Middle*>(malloc(sizeof(Middle) * (counts.size() - 2))); - middle_end_ = middle_begin_ + (counts.size() - 2); - std::vector<uint8_t*> middle_starts(counts.size() - 2); - for (unsigned char i = 2; i < counts.size(); ++i) { - middle_starts[i-2] = start; - start += Middle::Size(Quant::MiddleBits(config), counts[i-1], counts[0], counts[i], config); - } - // Crazy backwards thing so we initialize using pointers to ones that have already been initialized - for (unsigned char i = counts.size() - 1; i >= 2; --i) { - // use "placement new" syntax to initalize Middle in an already-allocated memory location - new (middle_begin_ + i - 2) Middle( - middle_starts[i-2], - quant_.MiddleBits(config), - counts[i-1], - counts[0], - counts[i], - (i == counts.size() - 1) ? static_cast<const BitPacked&>(longest_) : static_cast<const BitPacked &>(middle_begin_[i-1]), - config); - } - longest_.Init(start, quant_.LongestBits(config), counts[0]); - return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); -} - -template <class Quant, class Bhiksha> void TrieSearch<Quant, Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing) { - std::string temporary_prefix; - if (!config.temporary_directory_prefix.empty()) { - temporary_prefix = config.temporary_directory_prefix; - } else if (config.write_mmap) { - temporary_prefix = config.write_mmap; - } else { - temporary_prefix = file; - } - // At least 1MB sorting memory. - SortedFiles sorted(config, f, counts, std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab); - - BuildTrie(sorted, counts, config, *this, quant_, vocab, backing); -} - -template class TrieSearch<DontQuantize, DontBhiksha>; -template class TrieSearch<DontQuantize, ArrayBhiksha>; -template class TrieSearch<SeparatelyQuantize, DontBhiksha>; -template class TrieSearch<SeparatelyQuantize, ArrayBhiksha>; - -} // namespace trie -} // namespace ngram -} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/search_trie.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/search_trie.hh b/ext/kenlm/lm/search_trie.hh deleted file mode 100644 index 1adba6e..0000000 --- a/ext/kenlm/lm/search_trie.hh +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef LM_SEARCH_TRIE_H -#define LM_SEARCH_TRIE_H - -#include "lm/config.hh" -#include "lm/model_type.hh" -#include "lm/return.hh" -#include "lm/trie.hh" -#include "lm/weights.hh" - -#include "util/file.hh" -#include "util/file_piece.hh" - -#include <vector> -#include <cstdlib> -#include <cassert> - -namespace lm { -namespace ngram { -class BinaryFormat; -class SortedVocabulary; -namespace trie { - -template <class Quant, class Bhiksha> class TrieSearch; -class SortedFiles; -template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing); - -template <class Quant, class Bhiksha> class TrieSearch { - public: - typedef NodeRange Node; - - typedef ::lm::ngram::trie::UnigramPointer UnigramPointer; - typedef typename Quant::MiddlePointer MiddlePointer; - typedef typename Quant::LongestPointer LongestPointer; - - static const bool kDifferentRest = false; - - static const ModelType kModelType = static_cast<ModelType>(TRIE_SORTED + Quant::kModelTypeAdd + Bhiksha::kModelTypeAdd); - - static const unsigned int kVersion = 1; - - static void UpdateConfigFromBinary(const BinaryFormat &file, const std::vector<uint64_t> &counts, uint64_t offset, Config &config) { - Quant::UpdateConfigFromBinary(file, offset, config); - // Currently the unigram pointers are not compresssed, so there will only be a header for order > 2. - if (counts.size() > 2) - Bhiksha::UpdateConfigFromBinary(file, offset + Quant::Size(counts.size(), config) + Unigram::Size(counts[0]), config); - } - - static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) { - uint64_t ret = Quant::Size(counts.size(), config) + Unigram::Size(counts[0]); - for (unsigned char i = 1; i < counts.size() - 1; ++i) { - ret += Middle::Size(Quant::MiddleBits(config), counts[i], counts[0], counts[i+1], config); - } - return ret + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); - } - - TrieSearch() : middle_begin_(NULL), middle_end_(NULL) {} - - ~TrieSearch() { FreeMiddles(); } - - uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config); - - void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, BinaryFormat &backing); - - unsigned char Order() const { - return middle_end_ - middle_begin_ + 2; - } - - ProbBackoff &UnknownUnigram() { return unigram_.Unknown(); } - - UnigramPointer LookupUnigram(WordIndex word, Node &next, bool &independent_left, uint64_t &extend_left) const { - extend_left = static_cast<uint64_t>(word); - UnigramPointer ret(unigram_.Find(word, next)); - independent_left = (next.begin == next.end); - return ret; - } - - MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const { - return MiddlePointer(quant_, extend_length - 2, middle_begin_[extend_length - 2].ReadEntry(extend_pointer, node)); - } - - MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_left) const { - util::BitAddress address(middle_begin_[order_minus_2].Find(word, node, extend_left)); - independent_left = (address.base == NULL) || (node.begin == node.end); - return MiddlePointer(quant_, order_minus_2, address); - } - - LongestPointer LookupLongest(WordIndex word, const Node &node) const { - return LongestPointer(quant_, longest_.Find(word, node)); - } - - bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const { - assert(begin != end); - bool independent_left; - uint64_t ignored; - LookupUnigram(*begin, node, independent_left, ignored); - for (const WordIndex *i = begin + 1; i < end; ++i) { - if (independent_left || !LookupMiddle(i - begin - 1, *i, node, independent_left, ignored).Found()) return false; - } - return true; - } - - private: - friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing); - - // Middles are managed manually so we can delay construction and they don't have to be copyable. - void FreeMiddles() { - for (const Middle *i = middle_begin_; i != middle_end_; ++i) { - i->~Middle(); - } - std::free(middle_begin_); - } - - typedef trie::BitPackedMiddle<Bhiksha> Middle; - - typedef trie::BitPackedLongest Longest; - Longest longest_; - - Middle *middle_begin_, *middle_end_; - Quant quant_; - - typedef ::lm::ngram::trie::Unigram Unigram; - Unigram unigram_; -}; - -} // namespace trie -} // namespace ngram -} // namespace lm - -#endif // LM_SEARCH_TRIE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/sizes.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/sizes.cc b/ext/kenlm/lm/sizes.cc deleted file mode 100644 index dd831c5..0000000 --- a/ext/kenlm/lm/sizes.cc +++ /dev/null @@ -1,63 +0,0 @@ -#include "lm/sizes.hh" -#include "lm/model.hh" -#include "util/file_piece.hh" - -#include <vector> -#include <iomanip> - -namespace lm { -namespace ngram { - -void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &config) { - uint64_t sizes[6]; - sizes[0] = ProbingModel::Size(counts, config); - sizes[1] = RestProbingModel::Size(counts, config); - sizes[2] = TrieModel::Size(counts, config); - sizes[3] = QuantTrieModel::Size(counts, config); - sizes[4] = ArrayTrieModel::Size(counts, config); - sizes[5] = QuantArrayTrieModel::Size(counts, config); - uint64_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); - uint64_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); - uint64_t divide; - char prefix; - if (min_length < (1 << 10) * 10) { - prefix = ' '; - divide = 1; - } else if (min_length < (1 << 20) * 10) { - prefix = 'k'; - divide = 1 << 10; - } else if (min_length < (1ULL << 30) * 10) { - prefix = 'M'; - divide = 1 << 20; - } else { - prefix = 'G'; - divide = 1 << 30; - } - long int length = std::max<long int>(2, static_cast<long int>(ceil(log10((double) max_length / divide)))); - std::cerr << "Memory estimate for binary LM:\ntype "; - - // right align bytes. - for (long int i = 0; i < length - 2; ++i) std::cerr << ' '; - - std::cerr << prefix << "B\n" - "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" - "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n" - "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n" - "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" - "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" - "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n"; -} - -void ShowSizes(const std::vector<uint64_t> &counts) { - lm::ngram::Config config; - ShowSizes(counts, config); -} - -void ShowSizes(const char *file, const lm::ngram::Config &config) { - std::vector<uint64_t> counts; - util::FilePiece f(file); - lm::ReadARPACounts(f, counts); - ShowSizes(counts, config); -} - -}} //namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/sizes.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/sizes.hh b/ext/kenlm/lm/sizes.hh deleted file mode 100644 index eb7e99d..0000000 --- a/ext/kenlm/lm/sizes.hh +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef LM_SIZES_H -#define LM_SIZES_H - -#include <vector> - -#include <stdint.h> - -namespace lm { namespace ngram { - -struct Config; - -void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &config); -void ShowSizes(const std::vector<uint64_t> &counts); -void ShowSizes(const char *file, const lm::ngram::Config &config); - -}} // namespaces -#endif // LM_SIZES_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/state.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/state.hh b/ext/kenlm/lm/state.hh deleted file mode 100644 index 2195dee..0000000 --- a/ext/kenlm/lm/state.hh +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef LM_STATE_H -#define LM_STATE_H - -#include "lm/max_order.hh" -#include "lm/word_index.hh" -#include "util/murmur_hash.hh" - -#include <cstring> - -namespace lm { -namespace ngram { - -// This is a POD but if you want memcmp to return the same as operator==, call -// ZeroRemaining first. -class State { - public: - bool operator==(const State &other) const { - if (length != other.length) return false; - return !memcmp(words, other.words, length * sizeof(WordIndex)); - } - - // Three way comparison function. - int Compare(const State &other) const { - if (length != other.length) return length < other.length ? -1 : 1; - return memcmp(words, other.words, length * sizeof(WordIndex)); - } - - bool operator<(const State &other) const { - if (length != other.length) return length < other.length; - return memcmp(words, other.words, length * sizeof(WordIndex)) < 0; - } - - // Call this before using raw memcmp. - void ZeroRemaining() { - for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) { - words[i] = 0; - backoff[i] = 0.0; - } - } - - unsigned char Length() const { return length; } - - // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD. - // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit. - WordIndex words[KENLM_MAX_ORDER - 1]; - float backoff[KENLM_MAX_ORDER - 1]; - unsigned char length; -}; - -typedef State Right; - -inline uint64_t hash_value(const State &state, uint64_t seed = 0) { - return util::MurmurHashNative(state.words, sizeof(WordIndex) * state.length, seed); -} - -struct Left { - bool operator==(const Left &other) const { - return - length == other.length && - (!length || (pointers[length - 1] == other.pointers[length - 1] && full == other.full)); - } - - int Compare(const Left &other) const { - if (length < other.length) return -1; - if (length > other.length) return 1; - if (length == 0) return 0; // Must be full. - if (pointers[length - 1] > other.pointers[length - 1]) return 1; - if (pointers[length - 1] < other.pointers[length - 1]) return -1; - return (int)full - (int)other.full; - } - - bool operator<(const Left &other) const { - return Compare(other) == -1; - } - - void ZeroRemaining() { - for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i) - *i = 0; - } - - uint64_t pointers[KENLM_MAX_ORDER - 1]; - unsigned char length; - bool full; -}; - -inline uint64_t hash_value(const Left &left) { - unsigned char add[2]; - add[0] = left.length; - add[1] = left.full; - return util::MurmurHashNative(add, 2, left.length ? left.pointers[left.length - 1] : 0); -} - -struct ChartState { - bool operator==(const ChartState &other) const { - return (right == other.right) && (left == other.left); - } - - int Compare(const ChartState &other) const { - int lres = left.Compare(other.left); - if (lres) return lres; - return right.Compare(other.right); - } - - bool operator<(const ChartState &other) const { - return Compare(other) < 0; - } - - void ZeroRemaining() { - left.ZeroRemaining(); - right.ZeroRemaining(); - } - - Left left; - State right; -}; - -inline uint64_t hash_value(const ChartState &state) { - return hash_value(state.right, hash_value(state.left)); -} - - -} // namespace ngram -} // namespace lm - -#endif // LM_STATE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/test.arpa ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/test.arpa b/ext/kenlm/lm/test.arpa deleted file mode 100644 index c4d2e6d..0000000 --- a/ext/kenlm/lm/test.arpa +++ /dev/null @@ -1,124 +0,0 @@ - -\data\ -ngram 1=37 -ngram 2=47 -ngram 3=11 -ngram 4=6 -ngram 5=4 - -\1-grams: --1.383514 , -0.30103 --1.139057 . -0.845098 --1.029493 </s> --99 <s> -0.4149733 --1.995635 <unk> -20 --1.285941 a -0.69897 --1.687872 also -0.30103 --1.687872 beyond -0.30103 --1.687872 biarritz -0.30103 --1.687872 call -0.30103 --1.687872 concerns -0.30103 --1.687872 consider -0.30103 --1.687872 considering -0.30103 --1.687872 for -0.30103 --1.509559 higher -0.30103 --1.687872 however -0.30103 --1.687872 i -0.30103 --1.687872 immediate -0.30103 --1.687872 in -0.30103 --1.687872 is -0.30103 --1.285941 little -0.69897 --1.383514 loin -0.30103 --1.687872 look -0.30103 --1.285941 looking -0.4771212 --1.206319 more -0.544068 --1.509559 on -0.4771212 --1.509559 screening -0.4771212 --1.687872 small -0.30103 --1.687872 the -0.30103 --1.687872 to -0.30103 --1.687872 watch -0.30103 --1.687872 watching -0.30103 --1.687872 what -0.30103 --1.687872 would -0.30103 --3.141592 foo --2.718281 bar 3.0 --6.535897 baz -0.0 - -\2-grams: --0.6925742 , . --0.7522095 , however --0.7522095 , is --0.0602359 . </s> --0.4846522 <s> looking -0.4771214 --1.051485 <s> screening --1.07153 <s> the --1.07153 <s> watching --1.07153 <s> what --0.09132547 a little -0.69897 --0.2922095 also call --0.2922095 beyond immediate --0.2705918 biarritz . --0.2922095 call for --0.2922095 concerns in --0.2922095 consider watch --0.2922095 considering consider --0.2834328 for , --0.5511513 higher more --0.5845945 higher small --0.2834328 however , --0.2922095 i would --0.2922095 immediate concerns --0.2922095 in biarritz --0.2922095 is to --0.09021038 little more -0.1998621 --0.7273645 loin , --0.6925742 loin . --0.6708385 loin </s> --0.2922095 look beyond --0.4638903 looking higher --0.4638903 looking on -0.4771212 --0.5136299 more . -0.4771212 --0.3561665 more loin --0.1649931 on a -0.4771213 --0.1649931 screening a -0.4771213 --0.2705918 small . --0.287799 the screening --0.2922095 to look --0.2622373 watch </s> --0.2922095 watching considering --0.2922095 what i --0.2922095 would also --2 also would -6 --15 <unk> <unk> -2 --4 <unk> however -1 --6 foo bar - -\3-grams: --0.01916512 more . </s> --0.0283603 on a little -0.4771212 --0.0283603 screening a little -0.4771212 --0.01660496 a little more -0.09409451 --0.3488368 <s> looking higher --0.3488368 <s> looking on -0.4771212 --0.1892331 little more loin --0.04835128 looking on a -0.4771212 --3 also would consider -7 --6 <unk> however <unk> -12 --7 to look a - -\4-grams: --0.009249173 looking on a little -0.4771212 --0.005464747 on a little more -0.4771212 --0.005464747 screening a little more --0.1453306 a little more loin --0.01552657 <s> looking on a -0.4771212 --4 also would consider higher -8 - -\5-grams: --0.003061223 <s> looking on a little --0.001813953 looking on a little more --0.0432557 on a little more loin --5 also would consider higher looking - -\end\ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/test_nounk.arpa ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/test_nounk.arpa b/ext/kenlm/lm/test_nounk.arpa deleted file mode 100644 index e38fc85..0000000 --- a/ext/kenlm/lm/test_nounk.arpa +++ /dev/null @@ -1,120 +0,0 @@ - -\data\ -ngram 1=36 -ngram 2=45 -ngram 3=10 -ngram 4=6 -ngram 5=4 - -\1-grams: --1.383514 , -0.30103 --1.139057 . -0.845098 --1.029493 </s> --99 <s> -0.4149733 --1.285941 a -0.69897 --1.687872 also -0.30103 --1.687872 beyond -0.30103 --1.687872 biarritz -0.30103 --1.687872 call -0.30103 --1.687872 concerns -0.30103 --1.687872 consider -0.30103 --1.687872 considering -0.30103 --1.687872 for -0.30103 --1.509559 higher -0.30103 --1.687872 however -0.30103 --1.687872 i -0.30103 --1.687872 immediate -0.30103 --1.687872 in -0.30103 --1.687872 is -0.30103 --1.285941 little -0.69897 --1.383514 loin -0.30103 --1.687872 look -0.30103 --1.285941 looking -0.4771212 --1.206319 more -0.544068 --1.509559 on -0.4771212 --1.509559 screening -0.4771212 --1.687872 small -0.30103 --1.687872 the -0.30103 --1.687872 to -0.30103 --1.687872 watch -0.30103 --1.687872 watching -0.30103 --1.687872 what -0.30103 --1.687872 would -0.30103 --3.141592 foo --2.718281 bar 3.0 --6.535897 baz -0.0 - -\2-grams: --0.6925742 , . --0.7522095 , however --0.7522095 , is --0.0602359 . </s> --0.4846522 <s> looking -0.4771214 --1.051485 <s> screening --1.07153 <s> the --1.07153 <s> watching --1.07153 <s> what --0.09132547 a little -0.69897 --0.2922095 also call --0.2922095 beyond immediate --0.2705918 biarritz . --0.2922095 call for --0.2922095 concerns in --0.2922095 consider watch --0.2922095 considering consider --0.2834328 for , --0.5511513 higher more --0.5845945 higher small --0.2834328 however , --0.2922095 i would --0.2922095 immediate concerns --0.2922095 in biarritz --0.2922095 is to --0.09021038 little more -0.1998621 --0.7273645 loin , --0.6925742 loin . --0.6708385 loin </s> --0.2922095 look beyond --0.4638903 looking higher --0.4638903 looking on -0.4771212 --0.5136299 more . -0.4771212 --0.3561665 more loin --0.1649931 on a -0.4771213 --0.1649931 screening a -0.4771213 --0.2705918 small . --0.287799 the screening --0.2922095 to look --0.2622373 watch </s> --0.2922095 watching considering --0.2922095 what i --0.2922095 would also --2 also would -6 --6 foo bar - -\3-grams: --0.01916512 more . </s> --0.0283603 on a little -0.4771212 --0.0283603 screening a little -0.4771212 --0.01660496 a little more -0.09409451 --0.3488368 <s> looking higher --0.3488368 <s> looking on -0.4771212 --0.1892331 little more loin --0.04835128 looking on a -0.4771212 --3 also would consider -7 --7 to look a - -\4-grams: --0.009249173 looking on a little -0.4771212 --0.005464747 on a little more -0.4771212 --0.005464747 screening a little more --0.1453306 a little more loin --0.01552657 <s> looking on a -0.4771212 --4 also would consider higher -8 - -\5-grams: --0.003061223 <s> looking on a little --0.001813953 looking on a little more --0.0432557 on a little more loin --5 also would consider higher looking - -\end\ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/trie.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/trie.cc b/ext/kenlm/lm/trie.cc deleted file mode 100644 index 72ad544..0000000 --- a/ext/kenlm/lm/trie.cc +++ /dev/null @@ -1,131 +0,0 @@ -#include "lm/trie.hh" - -#include "lm/bhiksha.hh" -#include "util/bit_packing.hh" -#include "util/exception.hh" -#include "util/sorted_uniform.hh" - -#include <cassert> - -namespace lm { -namespace ngram { -namespace trie { -namespace { - -class KeyAccessor { - public: - KeyAccessor(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits) - : base_(reinterpret_cast<const uint8_t*>(base)), key_mask_(key_mask), key_bits_(key_bits), total_bits_(total_bits) {} - - typedef uint64_t Key; - - Key operator()(uint64_t index) const { - return util::ReadInt57(base_, index * static_cast<uint64_t>(total_bits_), key_bits_, key_mask_); - } - - private: - const uint8_t *const base_; - const WordIndex key_mask_; - const uint8_t key_bits_, total_bits_; -}; - -bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits, uint64_t begin_index, uint64_t end_index, const uint64_t max_vocab, const uint64_t key, uint64_t &at_index) { - KeyAccessor accessor(base, key_mask, key_bits, total_bits); - if (!util::BoundedSortedUniformFind<uint64_t, KeyAccessor, util::PivotSelect<sizeof(WordIndex)>::T>(accessor, begin_index - 1, (uint64_t)0, end_index, max_vocab, key, at_index)) return false; - return true; -} -} // namespace - -uint64_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) { - uint8_t total_bits = util::RequiredBits(max_vocab) + remaining_bits; - // Extra entry for next pointer at the end. - // +7 then / 8 to round up bits and convert to bytes - // +sizeof(uint64_t) so that ReadInt57 etc don't go segfault. - // Note that this waste is O(order), not O(number of ngrams). - return ((1 + entries) * total_bits + 7) / 8 + sizeof(uint64_t); -} - -void BitPacked::BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits) { - util::BitPackingSanity(); - word_bits_ = util::RequiredBits(max_vocab); - word_mask_ = (1ULL << word_bits_) - 1ULL; - if (word_bits_ > 57) UTIL_THROW(util::Exception, "Sorry, word indices more than " << (1ULL << 57) << " are not implemented. Edit util/bit_packing.hh and fix the bit packing functions."); - total_bits_ = word_bits_ + remaining_bits; - - base_ = static_cast<uint8_t*>(base); - insert_index_ = 0; - max_vocab_ = max_vocab; -} - -template <class Bhiksha> uint64_t BitPackedMiddle<Bhiksha>::Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_ptr, const Config &config) { - return Bhiksha::Size(entries + 1, max_ptr, config) + BaseSize(entries, max_vocab, quant_bits + Bhiksha::InlineBits(entries + 1, max_ptr, config)); -} - -template <class Bhiksha> BitPackedMiddle<Bhiksha>::BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config) : - BitPacked(), - quant_bits_(quant_bits), - // If the offset of the method changes, also change TrieSearch::UpdateConfigFromBinary. - bhiksha_(base, entries + 1, max_next, config), - next_source_(&next_source) { - if (entries + 1 >= (1ULL << 57) || (max_next >= (1ULL << 57))) UTIL_THROW(util::Exception, "Sorry, this does not support more than " << (1ULL << 57) << " n-grams of a particular order. Edit util/bit_packing.hh and fix the bit packing functions."); - BaseInit(reinterpret_cast<uint8_t*>(base) + Bhiksha::Size(entries + 1, max_next, config), max_vocab, quant_bits_ + bhiksha_.InlineBits()); -} - -template <class Bhiksha> util::BitAddress BitPackedMiddle<Bhiksha>::Insert(WordIndex word) { - assert(word <= word_mask_); - uint64_t at_pointer = insert_index_ * total_bits_; - - util::WriteInt57(base_, at_pointer, word_bits_, word); - at_pointer += word_bits_; - util::BitAddress ret(base_, at_pointer); - at_pointer += quant_bits_; - uint64_t next = next_source_->InsertIndex(); - bhiksha_.WriteNext(base_, at_pointer, insert_index_, next); - ++insert_index_; - return ret; -} - -template <class Bhiksha> util::BitAddress BitPackedMiddle<Bhiksha>::Find(WordIndex word, NodeRange &range, uint64_t &pointer) const { - uint64_t at_pointer; - if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) { - return util::BitAddress(NULL, 0); - } - pointer = at_pointer; - at_pointer *= total_bits_; - at_pointer += word_bits_; - bhiksha_.ReadNext(base_, at_pointer + quant_bits_, pointer, total_bits_, range); - - return util::BitAddress(base_, at_pointer); -} - -template <class Bhiksha> void BitPackedMiddle<Bhiksha>::FinishedLoading(uint64_t next_end, const Config &config) { - // Write at insert_index. . . - uint64_t last_next_write = insert_index_ * total_bits_ + - // at the offset where the next pointers are stored. - (total_bits_ - bhiksha_.InlineBits()); - bhiksha_.WriteNext(base_, last_next_write, insert_index_, next_end); - bhiksha_.FinishedLoading(config); -} - -util::BitAddress BitPackedLongest::Insert(WordIndex index) { - assert(index <= word_mask_); - uint64_t at_pointer = insert_index_ * total_bits_; - util::WriteInt57(base_, at_pointer, word_bits_, index); - at_pointer += word_bits_; - ++insert_index_; - return util::BitAddress(base_, at_pointer); -} - -util::BitAddress BitPackedLongest::Find(WordIndex word, const NodeRange &range) const { - uint64_t at_pointer; - if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) return util::BitAddress(NULL, 0); - at_pointer = at_pointer * total_bits_ + word_bits_; - return util::BitAddress(base_, at_pointer); -} - -template class BitPackedMiddle<DontBhiksha>; -template class BitPackedMiddle<ArrayBhiksha>; - -} // namespace trie -} // namespace ngram -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/trie.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/trie.hh b/ext/kenlm/lm/trie.hh deleted file mode 100644 index b7f0458..0000000 --- a/ext/kenlm/lm/trie.hh +++ /dev/null @@ -1,146 +0,0 @@ -#ifndef LM_TRIE_H -#define LM_TRIE_H - -#include "lm/weights.hh" -#include "lm/word_index.hh" -#include "util/bit_packing.hh" - -#include <cstddef> - -#include <stdint.h> - -namespace lm { -namespace ngram { -struct Config; -namespace trie { - -struct NodeRange { - uint64_t begin, end; -}; - -// TODO: if the number of unigrams is a concern, also bit pack these records. -struct UnigramValue { - ProbBackoff weights; - uint64_t next; - uint64_t Next() const { return next; } -}; - -class UnigramPointer { - public: - explicit UnigramPointer(const ProbBackoff &to) : to_(&to) {} - - UnigramPointer() : to_(NULL) {} - - bool Found() const { return to_ != NULL; } - - float Prob() const { return to_->prob; } - float Backoff() const { return to_->backoff; } - float Rest() const { return Prob(); } - - private: - const ProbBackoff *to_; -}; - -class Unigram { - public: - Unigram() {} - - void Init(void *start) { - unigram_ = static_cast<UnigramValue*>(start); - } - - static uint64_t Size(uint64_t count) { - // +1 in case unknown doesn't appear. +1 for the final next. - return (count + 2) * sizeof(UnigramValue); - } - - const ProbBackoff &Lookup(WordIndex index) const { return unigram_[index].weights; } - - ProbBackoff &Unknown() { return unigram_[0].weights; } - - UnigramValue *Raw() { - return unigram_; - } - - UnigramPointer Find(WordIndex word, NodeRange &next) const { - UnigramValue *val = unigram_ + word; - next.begin = val->next; - next.end = (val+1)->next; - return UnigramPointer(val->weights); - } - - private: - UnigramValue *unigram_; -}; - -class BitPacked { - public: - BitPacked() {} - - uint64_t InsertIndex() const { - return insert_index_; - } - - protected: - static uint64_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits); - - void BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits); - - uint8_t word_bits_; - uint8_t total_bits_; - uint64_t word_mask_; - - uint8_t *base_; - - uint64_t insert_index_, max_vocab_; -}; - -template <class Bhiksha> class BitPackedMiddle : public BitPacked { - public: - static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config); - - // next_source need not be initialized. - BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config); - - util::BitAddress Insert(WordIndex word); - - void FinishedLoading(uint64_t next_end, const Config &config); - - util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) const; - - util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) { - uint64_t addr = pointer * total_bits_; - addr += word_bits_; - bhiksha_.ReadNext(base_, addr + quant_bits_, pointer, total_bits_, range); - return util::BitAddress(base_, addr); - } - - private: - uint8_t quant_bits_; - Bhiksha bhiksha_; - - const BitPacked *next_source_; -}; - -class BitPackedLongest : public BitPacked { - public: - static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab) { - return BaseSize(entries, max_vocab, quant_bits); - } - - BitPackedLongest() {} - - void Init(void *base, uint8_t quant_bits, uint64_t max_vocab) { - BaseInit(base, max_vocab, quant_bits); - } - - util::BitAddress Insert(WordIndex word); - - util::BitAddress Find(WordIndex word, const NodeRange &node) const; -}; - -} // namespace trie -} // namespace ngram -} // namespace lm - -#endif // LM_TRIE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/trie_sort.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/trie_sort.cc b/ext/kenlm/lm/trie_sort.cc deleted file mode 100644 index 33a2f96..0000000 --- a/ext/kenlm/lm/trie_sort.cc +++ /dev/null @@ -1,304 +0,0 @@ -#include "lm/trie_sort.hh" - -#include "lm/config.hh" -#include "lm/lm_exception.hh" -#include "lm/read_arpa.hh" -#include "lm/vocab.hh" -#include "lm/weights.hh" -#include "lm/word_index.hh" -#include "util/file_piece.hh" -#include "util/mmap.hh" -#include "util/proxy_iterator.hh" -#include "util/sized_iterator.hh" - -#include <algorithm> -#include <cstring> -#include <cstdio> -#include <cstdlib> -#include <deque> -#include <iterator> -#include <limits> -#include <vector> - -namespace lm { -namespace ngram { -namespace trie { -namespace { - -typedef util::SizedIterator NGramIter; - -// Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams. -class PartialViewProxy { - public: - PartialViewProxy() : attention_size_(0), inner_() {} - - PartialViewProxy(void *ptr, std::size_t block_size, std::size_t attention_size) : attention_size_(attention_size), inner_(ptr, block_size) {} - - operator std::string() const { - return std::string(reinterpret_cast<const char*>(inner_.Data()), attention_size_); - } - - PartialViewProxy &operator=(const PartialViewProxy &from) { - memcpy(inner_.Data(), from.inner_.Data(), attention_size_); - return *this; - } - - PartialViewProxy &operator=(const std::string &from) { - memcpy(inner_.Data(), from.data(), attention_size_); - return *this; - } - - const void *Data() const { return inner_.Data(); } - void *Data() { return inner_.Data(); } - - friend void swap(PartialViewProxy first, PartialViewProxy second) { - std::swap_ranges(reinterpret_cast<char*>(first.Data()), reinterpret_cast<char*>(first.Data()) + first.attention_size_, reinterpret_cast<char*>(second.Data())); - } - - private: - friend class util::ProxyIterator<PartialViewProxy>; - - typedef std::string value_type; - - const std::size_t attention_size_; - - typedef util::SizedInnerIterator InnerIterator; - InnerIterator &Inner() { return inner_; } - const InnerIterator &Inner() const { return inner_; } - InnerIterator inner_; -}; - -typedef util::ProxyIterator<PartialViewProxy> PartialIter; - -FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &temp_prefix) { - util::scoped_fd file(util::MakeTemp(temp_prefix)); - util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); - return util::FDOpenOrThrow(file); -} - -FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) { - const size_t context_size = sizeof(WordIndex) * (order - 1); - // Sort just the contexts using the same memory. - PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size)); - PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size)); - -#if defined(_WIN32) || defined(_WIN64) - std::stable_sort -#else - std::sort -#endif - (context_begin, context_end, util::SizedCompare<EntryCompare, PartialViewProxy>(EntryCompare(order - 1))); - - util::scoped_FILE out(util::FMakeTemp(temp_prefix)); - - // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator. - if (context_begin == context_end) return out.release(); - PartialIter i(context_begin); - util::WriteOrThrow(out.get(), i->Data(), context_size); - const void *previous = i->Data(); - ++i; - for (; i != context_end; ++i) { - if (memcmp(previous, i->Data(), context_size)) { - util::WriteOrThrow(out.get(), i->Data(), context_size); - previous = i->Data(); - } - } - return out.release(); -} - -struct ThrowCombine { - void operator()(std::size_t entry_size, unsigned char order, const void *first, const void *second, FILE * /*out*/) const { - const WordIndex *base = reinterpret_cast<const WordIndex*>(first); - FormatLoadException e; - e << "Duplicate n-gram detected with vocab ids"; - for (const WordIndex *i = base; i != base + order; ++i) { - e << ' ' << *i; - } - throw e; - } -}; - -// Useful for context files that just contain records with no value. -struct FirstCombine { - void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const { - util::WriteOrThrow(out, first, entry_size); - } -}; - -template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const std::string &temp_prefix, std::size_t weights_size, unsigned char order, const Combine &combine) { - std::size_t entry_size = sizeof(WordIndex) * order + weights_size; - RecordReader first, second; - first.Init(first_file, entry_size); - second.Init(second_file, entry_size); - util::scoped_FILE out_file(util::FMakeTemp(temp_prefix)); - EntryCompare less(order); - while (first && second) { - if (less(first.Data(), second.Data())) { - util::WriteOrThrow(out_file.get(), first.Data(), entry_size); - ++first; - } else if (less(second.Data(), first.Data())) { - util::WriteOrThrow(out_file.get(), second.Data(), entry_size); - ++second; - } else { - combine(entry_size, order, first.Data(), second.Data(), out_file.get()); - ++first; ++second; - } - } - for (RecordReader &remains = (first ? first : second); remains; ++remains) { - util::WriteOrThrow(out_file.get(), remains.Data(), entry_size); - } - return out_file.release(); -} - -} // namespace - -void RecordReader::Init(FILE *file, std::size_t entry_size) { - entry_size_ = entry_size; - data_.reset(malloc(entry_size)); - UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer"); - file_ = file; - if (file) { - rewind(file); - remains_ = true; - ++*this; - } else { - remains_ = false; - } -} - -void RecordReader::Overwrite(const void *start, std::size_t amount) { - long internal = (uint8_t*)start - (uint8_t*)data_.get(); - UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); - util::WriteOrThrow(file_, start, amount); - long forward = entry_size_ - internal - amount; -#if !defined(_WIN32) && !defined(_WIN64) - if (forward) -#endif - UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision"); -} - -void RecordReader::Rewind() { - if (file_) { - rewind(file_); - remains_ = true; - ++*this; - } else { - remains_ = false; - } -} - -SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { - PositiveProbWarn warn(config.positive_log_probability); - unigram_.reset(util::MakeTemp(file_prefix)); - { - // In case <unk> appears. - size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff); - util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out); - Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn); - CheckSpecials(config, vocab); - if (!vocab.SawUnk()) ++counts[0]; - } - - // Only use as much buffer as we need. - size_t buffer_use = 0; - for (unsigned int order = 2; order < counts.size(); ++order) { - buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1])); - } - buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back())); - buffer = std::min<size_t>(buffer, buffer_use); - - util::scoped_malloc mem; - mem.reset(malloc(buffer)); - if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer); - - for (unsigned char order = 2; order <= counts.size(); ++order) { - ConvertToSorted(f, vocab, counts, file_prefix, order, warn, mem.get(), buffer); - } - ReadEnd(f); -} - -namespace { -class Closer { - public: - explicit Closer(std::deque<FILE*> &files) : files_(files) {} - - ~Closer() { - for (std::deque<FILE*>::iterator i = files_.begin(); i != files_.end(); ++i) { - util::scoped_FILE deleter(*i); - } - } - - void PopFront() { - util::scoped_FILE deleter(files_.front()); - files_.pop_front(); - } - private: - std::deque<FILE*> &files_; -}; -} // namespace - -void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { - ReadNGramHeader(f, order); - const size_t count = counts[order - 1]; - // Size of weights. Does it include backoff? - const size_t words_size = sizeof(WordIndex) * order; - const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float)); - const size_t entry_size = words_size + weights_size; - const size_t batch_size = std::min(count, mem_size / entry_size); - uint8_t *const begin = reinterpret_cast<uint8_t*>(mem); - - std::deque<FILE*> files, contexts; - Closer files_closer(files), contexts_closer(contexts); - - for (std::size_t batch = 0, done = 0; done < count; ++batch) { - uint8_t *out = begin; - uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size; - if (order == counts.size()) { - for (; out != out_end; out += entry_size) { - std::reverse_iterator<WordIndex*> it(reinterpret_cast<WordIndex*>(out) + order); - ReadNGram(f, order, vocab, it, *reinterpret_cast<Prob*>(out + words_size), warn); - } - } else { - for (; out != out_end; out += entry_size) { - std::reverse_iterator<WordIndex*> it(reinterpret_cast<WordIndex*>(out) + order); - ReadNGram(f, order, vocab, it, *reinterpret_cast<ProbBackoff*>(out + words_size), warn); - } - } - // Sort full records by full n-gram. - util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size); - // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies. -#if defined(_WIN32) || defined(_WIN64) - std::stable_sort -#else - std::sort -#endif - (NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare<EntryCompare>(EntryCompare(order))); - files.push_back(DiskFlush(begin, out_end, file_prefix)); - contexts.push_back(WriteContextFile(begin, out_end, file_prefix, entry_size, order)); - - done += (out_end - begin) / entry_size; - } - - // All individual files created. Merge them. - - while (files.size() > 1) { - files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine())); - files_closer.PopFront(); - files_closer.PopFront(); - contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 0, order - 1, FirstCombine())); - contexts_closer.PopFront(); - contexts_closer.PopFront(); - } - - if (!files.empty()) { - // Steal from closers. - full_[order - 2].reset(files.front()); - files.pop_front(); - context_[order - 2].reset(contexts.front()); - contexts.pop_front(); - } -} - -} // namespace trie -} // namespace ngram -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/trie_sort.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/trie_sort.hh b/ext/kenlm/lm/trie_sort.hh deleted file mode 100644 index 594efee..0000000 --- a/ext/kenlm/lm/trie_sort.hh +++ /dev/null @@ -1,114 +0,0 @@ -// Step of trie builder: create sorted files. - -#ifndef LM_TRIE_SORT_H -#define LM_TRIE_SORT_H - -#include "lm/max_order.hh" -#include "lm/word_index.hh" - -#include "util/file.hh" -#include "util/scoped.hh" - -#include <cstddef> -#include <functional> -#include <string> -#include <vector> - -#include <stdint.h> - -namespace util { -class FilePiece; -} // namespace util - -namespace lm { -class PositiveProbWarn; -namespace ngram { -class SortedVocabulary; -struct Config; - -namespace trie { - -class EntryCompare : public std::binary_function<const void*, const void*, bool> { - public: - explicit EntryCompare(unsigned char order) : order_(order) {} - - bool operator()(const void *first_void, const void *second_void) const { - const WordIndex *first = static_cast<const WordIndex*>(first_void); - const WordIndex *second = static_cast<const WordIndex*>(second_void); - const WordIndex *end = first + order_; - for (; first != end; ++first, ++second) { - if (*first < *second) return true; - if (*first > *second) return false; - } - return false; - } - private: - unsigned char order_; -}; - -class RecordReader { - public: - RecordReader() : remains_(true) {} - - void Init(FILE *file, std::size_t entry_size); - - void *Data() { return data_.get(); } - const void *Data() const { return data_.get(); } - - RecordReader &operator++() { - std::size_t ret = fread(data_.get(), entry_size_, 1, file_); - if (!ret) { - UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file"); - remains_ = false; - } - return *this; - } - - operator bool() const { return remains_; } - - void Rewind(); - - std::size_t EntrySize() const { return entry_size_; } - - void Overwrite(const void *start, std::size_t amount); - - private: - FILE *file_; - - util::scoped_malloc data_; - - bool remains_; - - std::size_t entry_size_; -}; - -class SortedFiles { - public: - // Build from ARPA - SortedFiles(const Config &config, util::FilePiece &f, std::vector<uint64_t> &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); - - int StealUnigram() { - return unigram_.release(); - } - - FILE *Full(unsigned char order) { - return full_[order - 2].get(); - } - - FILE *Context(unsigned char of_order) { - return context_[of_order - 2].get(); - } - - private: - void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); - - util::scoped_fd unigram_; - - util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1]; -}; - -} // namespace trie -} // namespace ngram -} // namespace lm - -#endif // LM_TRIE_SORT_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/value.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/value.hh b/ext/kenlm/lm/value.hh deleted file mode 100644 index d2425cc..0000000 --- a/ext/kenlm/lm/value.hh +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef LM_VALUE_H -#define LM_VALUE_H - -#include "lm/config.hh" -#include "lm/model_type.hh" -#include "lm/value_build.hh" -#include "lm/weights.hh" -#include "util/bit_packing.hh" - -#include <stdint.h> - -namespace lm { -namespace ngram { - -// Template proxy for probing unigrams and middle. -template <class Weights> class GenericProbingProxy { - public: - explicit GenericProbingProxy(const Weights &to) : to_(&to) {} - - GenericProbingProxy() : to_(0) {} - - bool Found() const { return to_ != 0; } - - float Prob() const { - util::FloatEnc enc; - enc.f = to_->prob; - enc.i |= util::kSignBit; - return enc.f; - } - - float Backoff() const { return to_->backoff; } - - bool IndependentLeft() const { - util::FloatEnc enc; - enc.f = to_->prob; - return enc.i & util::kSignBit; - } - - protected: - const Weights *to_; -}; - -// Basic proxy for trie unigrams. -template <class Weights> class GenericTrieUnigramProxy { - public: - explicit GenericTrieUnigramProxy(const Weights &to) : to_(&to) {} - - GenericTrieUnigramProxy() : to_(0) {} - - bool Found() const { return to_ != 0; } - float Prob() const { return to_->prob; } - float Backoff() const { return to_->backoff; } - float Rest() const { return Prob(); } - - protected: - const Weights *to_; -}; - -struct BackoffValue { - typedef ProbBackoff Weights; - static const ModelType kProbingModelType = PROBING; - - class ProbingProxy : public GenericProbingProxy<Weights> { - public: - explicit ProbingProxy(const Weights &to) : GenericProbingProxy<Weights>(to) {} - ProbingProxy() {} - float Rest() const { return Prob(); } - }; - - class TrieUnigramProxy : public GenericTrieUnigramProxy<Weights> { - public: - explicit TrieUnigramProxy(const Weights &to) : GenericTrieUnigramProxy<Weights>(to) {} - TrieUnigramProxy() {} - float Rest() const { return Prob(); } - }; - - struct ProbingEntry { - typedef uint64_t Key; - typedef Weights Value; - uint64_t key; - ProbBackoff value; - uint64_t GetKey() const { return key; } - }; - - struct TrieUnigramValue { - Weights weights; - uint64_t next; - uint64_t Next() const { return next; } - }; - - const static bool kDifferentRest = false; - - template <class Model, class C> void Callback(const Config &, unsigned int, typename Model::Vocabulary &, C &callback) { - NoRestBuild build; - callback(build); - } -}; - -struct RestValue { - typedef RestWeights Weights; - static const ModelType kProbingModelType = REST_PROBING; - - class ProbingProxy : public GenericProbingProxy<RestWeights> { - public: - explicit ProbingProxy(const Weights &to) : GenericProbingProxy<RestWeights>(to) {} - ProbingProxy() {} - float Rest() const { return to_->rest; } - }; - - class TrieUnigramProxy : public GenericTrieUnigramProxy<Weights> { - public: - explicit TrieUnigramProxy(const Weights &to) : GenericTrieUnigramProxy<Weights>(to) {} - TrieUnigramProxy() {} - float Rest() const { return to_->rest; } - }; - -// gcc 4.1 doesn't properly back dependent types :-(. -#pragma pack(push) -#pragma pack(4) - struct ProbingEntry { - typedef uint64_t Key; - typedef Weights Value; - Key key; - Value value; - Key GetKey() const { return key; } - }; - - struct TrieUnigramValue { - Weights weights; - uint64_t next; - uint64_t Next() const { return next; } - }; -#pragma pack(pop) - - const static bool kDifferentRest = true; - - template <class Model, class C> void Callback(const Config &config, unsigned int order, typename Model::Vocabulary &vocab, C &callback) { - switch (config.rest_function) { - case Config::REST_MAX: - { - MaxRestBuild build; - callback(build); - } - break; - case Config::REST_LOWER: - { - LowerRestBuild<Model> build(config, order, vocab); - callback(build); - } - break; - } - } -}; - -} // namespace ngram -} // namespace lm - -#endif // LM_VALUE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/value_build.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/value_build.cc b/ext/kenlm/lm/value_build.cc deleted file mode 100644 index ac623a6..0000000 --- a/ext/kenlm/lm/value_build.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "lm/value_build.hh" - -#include "lm/model.hh" -#include "lm/read_arpa.hh" - -namespace lm { -namespace ngram { - -template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab) { - UTIL_THROW_IF(config.rest_lower_files.size() != order - 1, ConfigException, "This model has order " << order << " so there should be " << (order - 1) << " lower-order models for rest cost purposes."); - Config for_lower = config; - for_lower.write_mmap = NULL; - for_lower.rest_lower_files.clear(); - - // Unigram models aren't supported, so this is a custom loader. - // TODO: optimize the unigram loading? - { - util::FilePiece uni(config.rest_lower_files[0].c_str()); - std::vector<uint64_t> number; - ReadARPACounts(uni, number); - UTIL_THROW_IF(number.size() != 1, FormatLoadException, "Expected the unigram model to have order 1, not " << number.size()); - ReadNGramHeader(uni, 1); - unigrams_.resize(number[0]); - unigrams_[0] = config.unknown_missing_logprob; - PositiveProbWarn warn; - for (uint64_t i = 0; i < number[0]; ++i) { - WordIndex w; - Prob entry; - ReadNGram(uni, 1, vocab, &w, entry, warn); - unigrams_[w] = entry.prob; - } - } - - try { - for (unsigned int i = 2; i < order; ++i) { - models_.push_back(new Model(config.rest_lower_files[i - 1].c_str(), for_lower)); - UTIL_THROW_IF(models_.back()->Order() != i, FormatLoadException, "Lower order file " << config.rest_lower_files[i-1] << " should have order " << i); - } - } catch (...) { - for (typename std::vector<const Model*>::const_iterator i = models_.begin(); i != models_.end(); ++i) { - delete *i; - } - models_.clear(); - throw; - } - - // TODO: force/check same vocab. -} - -template <class Model> LowerRestBuild<Model>::~LowerRestBuild() { - for (typename std::vector<const Model*>::const_iterator i = models_.begin(); i != models_.end(); ++i) { - delete *i; - } -} - -template class LowerRestBuild<ProbingModel>; - -} // namespace ngram -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/value_build.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/value_build.hh b/ext/kenlm/lm/value_build.hh deleted file mode 100644 index 49989ab..0000000 --- a/ext/kenlm/lm/value_build.hh +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef LM_VALUE_BUILD_H -#define LM_VALUE_BUILD_H - -#include "lm/weights.hh" -#include "lm/word_index.hh" -#include "util/bit_packing.hh" - -#include <vector> - -namespace lm { -namespace ngram { - -struct Config; -struct BackoffValue; -struct RestValue; - -class NoRestBuild { - public: - typedef BackoffValue Value; - - NoRestBuild() {} - - void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} - void SetRest(const WordIndex *, unsigned int, const ProbBackoff &) const {} - - template <class Second> bool MarkExtends(ProbBackoff &weights, const Second &) const { - util::UnsetSign(weights.prob); - return false; - } - - // Probing doesn't need to go back to unigram. - const static bool kMarkEvenLower = false; -}; - -class MaxRestBuild { - public: - typedef RestValue Value; - - MaxRestBuild() {} - - void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} - void SetRest(const WordIndex *, unsigned int, RestWeights &weights) const { - weights.rest = weights.prob; - util::SetSign(weights.rest); - } - - bool MarkExtends(RestWeights &weights, const RestWeights &to) const { - util::UnsetSign(weights.prob); - if (weights.rest >= to.rest) return false; - weights.rest = to.rest; - return true; - } - bool MarkExtends(RestWeights &weights, const Prob &to) const { - util::UnsetSign(weights.prob); - if (weights.rest >= to.prob) return false; - weights.rest = to.prob; - return true; - } - - // Probing does need to go back to unigram. - const static bool kMarkEvenLower = true; -}; - -template <class Model> class LowerRestBuild { - public: - typedef RestValue Value; - - LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab); - - ~LowerRestBuild(); - - void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} - void SetRest(const WordIndex *vocab_ids, unsigned int n, RestWeights &weights) const { - typename Model::State ignored; - if (n == 1) { - weights.rest = unigrams_[*vocab_ids]; - } else { - weights.rest = models_[n-2]->FullScoreForgotState(vocab_ids + 1, vocab_ids + n, *vocab_ids, ignored).prob; - } - } - - template <class Second> bool MarkExtends(RestWeights &weights, const Second &) const { - util::UnsetSign(weights.prob); - return false; - } - - const static bool kMarkEvenLower = false; - - std::vector<float> unigrams_; - - std::vector<const Model*> models_; -}; - -} // namespace ngram -} // namespace lm - -#endif // LM_VALUE_BUILD_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/virtual_interface.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/virtual_interface.cc b/ext/kenlm/lm/virtual_interface.cc deleted file mode 100644 index 17a74c3..0000000 --- a/ext/kenlm/lm/virtual_interface.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include "lm/virtual_interface.hh" - -#include "lm/lm_exception.hh" - -namespace lm { -namespace base { - -Vocabulary::~Vocabulary() {} - -void Vocabulary::SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { - begin_sentence_ = begin_sentence; - end_sentence_ = end_sentence; - not_found_ = not_found; -} - -Model::~Model() {} - -} // namespace base -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/virtual_interface.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/virtual_interface.hh b/ext/kenlm/lm/virtual_interface.hh deleted file mode 100644 index ea491fb..0000000 --- a/ext/kenlm/lm/virtual_interface.hh +++ /dev/null @@ -1,160 +0,0 @@ -#ifndef LM_VIRTUAL_INTERFACE_H -#define LM_VIRTUAL_INTERFACE_H - -#include "lm/return.hh" -#include "lm/word_index.hh" -#include "util/string_piece.hh" - -#include <string> -#include <cstring> - -namespace lm { -namespace base { - -template <class T, class U, class V> class ModelFacade; - -/* Vocabulary interface. Call Index(string) and get a word index for use in - * calling Model. It provides faster convenience functions for <s>, </s>, and - * <unk> although you can also find these using Index. - * - * Some models do not load the mapping from index to string. If you need this, - * check if the model Vocabulary class implements such a function and access it - * directly. - * - * The Vocabulary object is always owned by the Model and can be retrieved from - * the Model using BaseVocabulary() for this abstract interface or - * GetVocabulary() for the actual implementation (in which case you'll need the - * actual implementation of the Model too). - */ -class Vocabulary { - public: - virtual ~Vocabulary(); - - WordIndex BeginSentence() const { return begin_sentence_; } - WordIndex EndSentence() const { return end_sentence_; } - WordIndex NotFound() const { return not_found_; } - - /* Most implementations allow StringPiece lookups and need only override - * Index(StringPiece). SRI requires null termination and overrides all - * three methods. - */ - virtual WordIndex Index(const StringPiece &str) const = 0; - virtual WordIndex Index(const std::string &str) const { - return Index(StringPiece(str)); - } - virtual WordIndex Index(const char *str) const { - return Index(StringPiece(str)); - } - - protected: - // Call SetSpecial afterward. - Vocabulary() {} - - Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { - SetSpecial(begin_sentence, end_sentence, not_found); - } - - void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found); - - WordIndex begin_sentence_, end_sentence_, not_found_; - - private: - // Disable copy constructors. They're private and undefined. - // Ersatz boost::noncopyable. - Vocabulary(const Vocabulary &); - Vocabulary &operator=(const Vocabulary &); -}; - -/* There are two ways to access a Model. - * - * - * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh). - * - * Every Model implements the scoring function: - * float Score( - * const Model::State &in_state, - * const WordIndex new_word, - * Model::State &out_state) const; - * - * It can also return the length of n-gram matched by the model: - * FullScoreReturn FullScore( - * const Model::State &in_state, - * const WordIndex new_word, - * Model::State &out_state) const; - * - * - * There are also accessor functions: - * const State &BeginSentenceState() const; - * const State &NullContextState() const; - * const Vocabulary &GetVocabulary() const; - * unsigned int Order() const; - * - * NB: In case you're wondering why the model implementation looks like it's - * missing these methods, see facade.hh. - * - * This is the fastest way to use a model and presents a normal State class to - * be included in a hypothesis state structure. - * - * - * OPTION 2: Use the virtual interface below. - * - * The virtual interface allow you to decide which Model to use at runtime - * without templatizing everything on the Model type. However, each Model has - * its own State class, so a single State cannot be efficiently provided (it - * would require using the maximum memory of any Model's State or memory - * allocation with each lookup). This means you become responsible for - * allocating memory with size StateSize() and passing it to the Score or - * FullScore functions provided here. - * - * For example, cdec has a std::string containing the entire state of a - * hypothesis. It can reserve StateSize bytes in this string for the model - * state. - * - * All the State objects are POD, so it's ok to use raw memory for storing - * State. - * in_state and out_state must not have the same address. - */ -class Model { - public: - virtual ~Model(); - - size_t StateSize() const { return state_size_; } - const void *BeginSentenceMemory() const { return begin_sentence_memory_; } - void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); } - const void *NullContextMemory() const { return null_context_memory_; } - void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); } - - // Requires in_state != out_state - virtual float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; - - // Requires in_state != out_state - virtual FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; - - // Prefer to use FullScore. The context words should be provided in reverse order. - virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const = 0; - - unsigned char Order() const { return order_; } - - const Vocabulary &BaseVocabulary() const { return *base_vocab_; } - - private: - template <class T, class U, class V> friend class ModelFacade; - explicit Model(size_t state_size) : state_size_(state_size) {} - - const size_t state_size_; - const void *begin_sentence_memory_, *null_context_memory_; - - const Vocabulary *base_vocab_; - - unsigned char order_; - - // Disable copy constructors. They're private and undefined. - // Ersatz boost::noncopyable. - Model(const Model &); - Model &operator=(const Model &); -}; - -} // mamespace base -} // namespace lm - -#endif // LM_VIRTUAL_INTERFACE_H
