[08/51] [partial] incubator-joshua git commit: Converted KenLM into a submodule

mjpost Tue, 19 Apr 2016 12:35:22 -0700

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/search_trie.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/search_trie.cc b/ext/kenlm/lm/search_trie.cc
deleted file mode 100644
index a63985a..0000000
--- a/ext/kenlm/lm/search_trie.cc
+++ /dev/null
@@ -1,600 +0,0 @@
-/* This is where the trie is built.  It's on-disk.  */
-#include "lm/search_trie.hh"
-
-#include "lm/bhiksha.hh"
-#include "lm/binary_format.hh"
-#include "lm/blank.hh"
-#include "lm/lm_exception.hh"
-#include "lm/max_order.hh"
-#include "lm/quantize.hh"
-#include "lm/trie.hh"
-#include "lm/trie_sort.hh"
-#include "lm/vocab.hh"
-#include "lm/weights.hh"
-#include "lm/word_index.hh"
-#include "util/ersatz_progress.hh"
-#include "util/mmap.hh"
-#include "util/proxy_iterator.hh"
-#include "util/scoped.hh"
-#include "util/sized_iterator.hh"
-
-#include <algorithm>
-#include <cstring>
-#include <cstdio>
-#include <cstdlib>
-#include <queue>
-#include <limits>
-#include <numeric>
-#include <vector>
-
-#if defined(_WIN32) || defined(_WIN64)
-#include <windows.h>
-#endif
-
-namespace lm {
-namespace ngram {
-namespace trie {
-namespace {
-
-void ReadOrThrow(FILE *from, void *data, size_t size) {
-  UTIL_THROW_IF(1 != std::fread(data, size, 1, from), util::ErrnoException, 
"Short read");
-}
-
-int Compare(unsigned char order, const void *first_void, const void 
*second_void) {
-  const WordIndex *first = reinterpret_cast<const WordIndex*>(first_void), 
*second = reinterpret_cast<const WordIndex*>(second_void);
-  const WordIndex *end = first + order;
-  for (; first != end; ++first, ++second) {
-    if (*first < *second) return -1;
-    if (*first > *second) return 1;
-  }
-  return 0;
-}
-
-struct ProbPointer {
-  unsigned char array;
-  uint64_t index;
-};
-
-// Array of n-grams and float indices.
-class BackoffMessages {
-  public:
-    void Init(std::size_t entry_size) {
-      current_ = NULL;
-      allocated_ = NULL;
-      entry_size_ = entry_size;
-    }
-
-    void Add(const WordIndex *to, ProbPointer index) {
-      while (current_ + entry_size_ > allocated_) {
-        std::size_t allocated_size = allocated_ - (uint8_t*)backing_.get();
-        Resize(std::max<std::size_t>(allocated_size * 2, entry_size_));
-      }
-      memcpy(current_, to, entry_size_ - sizeof(ProbPointer));
-      *reinterpret_cast<ProbPointer*>(current_ + entry_size_ - 
sizeof(ProbPointer)) = index;
-      current_ += entry_size_;
-    }
-
-    void Apply(float *const *const base, FILE *unigrams) {
-      FinishedAdding();
-      if (current_ == allocated_) return;
-      rewind(unigrams);
-      ProbBackoff weights;
-      WordIndex unigram = 0;
-      ReadOrThrow(unigrams, &weights, sizeof(weights));
-      for (; current_ != allocated_; current_ += entry_size_) {
-        const WordIndex &cur_word = *reinterpret_cast<const 
WordIndex*>(current_);
-        for (; unigram < cur_word; ++unigram) {
-          ReadOrThrow(unigrams, &weights, sizeof(weights));
-        }
-        if (!HasExtension(weights.backoff)) {
-          weights.backoff = kExtensionBackoff;
-          UTIL_THROW_IF(fseek(unigrams, -sizeof(weights), SEEK_CUR), 
util::ErrnoException, "Seeking backwards to denote unigram extension failed.");
-          util::WriteOrThrow(unigrams, &weights, sizeof(weights));
-        }
-        const ProbPointer &write_to = *reinterpret_cast<const 
ProbPointer*>(current_ + sizeof(WordIndex));
-        base[write_to.array][write_to.index] += weights.backoff;
-      }
-      backing_.reset();
-    }
-
-    void Apply(float *const *const base, RecordReader &reader) {
-      FinishedAdding();
-      if (current_ == allocated_) return;
-      // We'll also use the same buffer to record messages to blanks that they 
extend.
-      WordIndex *extend_out = reinterpret_cast<WordIndex*>(current_);
-      const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / 
sizeof(WordIndex);
-      for (reader.Rewind(); reader && (current_ != allocated_); ) {
-        switch (Compare(order, reader.Data(), current_)) {
-          case -1:
-            ++reader;
-            break;
-          case 1:
-            // Message but nobody to receive it.  Write it down at the 
beginning of the buffer so we can inform this blank that it extends.
-            for (const WordIndex *w = reinterpret_cast<const WordIndex 
*>(current_); w != reinterpret_cast<const WordIndex *>(current_) + order; ++w, 
++extend_out) *extend_out = *w;
-            current_ += entry_size_;
-            break;
-          case 0:
-            float &backoff = 
reinterpret_cast<ProbBackoff*>((uint8_t*)reader.Data() + order * 
sizeof(WordIndex))->backoff;
-            if (!HasExtension(backoff)) {
-              backoff = kExtensionBackoff;
-              reader.Overwrite(&backoff, sizeof(float));
-            } else {
-              const ProbPointer &write_to = *reinterpret_cast<const 
ProbPointer*>(current_ + entry_size_ - sizeof(ProbPointer));
-              base[write_to.array][write_to.index] += backoff;
-            }
-            current_ += entry_size_;
-            break;
-        }
-      }
-      // Now this is a list of blanks that extend right.
-      entry_size_ = sizeof(WordIndex) * order;
-      Resize(sizeof(WordIndex) * (extend_out - (const 
WordIndex*)backing_.get()));
-      current_ = (uint8_t*)backing_.get();
-    }
-
-    // Call after Apply
-    bool Extends(unsigned char order, const WordIndex *words) {
-      if (current_ == allocated_) return false;
-      assert(order * sizeof(WordIndex) == entry_size_);
-      while (true) {
-        switch(Compare(order, words, current_)) {
-          case 1:
-            current_ += entry_size_;
-            if (current_ == allocated_) return false;
-            break;
-          case -1:
-            return false;
-          case 0:
-            return true;
-        }
-      }
-    }
-
-  private:
-    void FinishedAdding() {
-      Resize(current_ - (uint8_t*)backing_.get());
-      // Sort requests in same order as files.
-      std::sort(
-          util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)),
-          util::SizedIterator(util::SizedProxy(current_, entry_size_)),
-          util::SizedCompare<EntryCompare>(EntryCompare((entry_size_ - 
sizeof(ProbPointer)) / sizeof(WordIndex))));
-      current_ = (uint8_t*)backing_.get();
-    }
-
-    void Resize(std::size_t to) {
-      std::size_t current = current_ - (uint8_t*)backing_.get();
-      backing_.call_realloc(to);
-      current_ = (uint8_t*)backing_.get() + current;
-      allocated_ = (uint8_t*)backing_.get() + to;
-    }
-
-    util::scoped_malloc backing_;
-
-    uint8_t *current_, *allocated_;
-
-    std::size_t entry_size_;
-};
-
-const float kBadProb = std::numeric_limits<float>::infinity();
-
-class SRISucks {
-  public:
-    SRISucks() {
-      for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 
1; ++i)
-        i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1));
-    }
-
-    void Send(unsigned char begin, unsigned char order, const WordIndex *to, 
float prob_basis) {
-      assert(prob_basis != kBadProb);
-      ProbPointer pointer;
-      pointer.array = order - 1;
-      pointer.index = values_[order - 1].size();
-      for (unsigned char i = begin; i < order; ++i) {
-        messages_[i - 1].Add(to, pointer);
-      }
-      values_[order - 1].push_back(prob_basis);
-    }
-
-    void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, 
RecordReader *reader) {
-      for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) {
-        it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
-      }
-      messages_[0].Apply(it_, unigram_file);
-      BackoffMessages *messages = messages_ + 1;
-      const RecordReader *end = reader + total_order - 2 /* exclude unigrams 
and longest order */;
-      for (; reader != end; ++messages, ++reader) {
-        messages->Apply(it_, *reader);
-      }
-    }
-
-    ProbBackoff GetBlank(unsigned char total_order, unsigned char order, const 
WordIndex *indices) {
-      assert(order > 1);
-      ProbBackoff ret;
-      ret.prob = *(it_[order - 1]++);
-      ret.backoff = ((order != total_order - 1) && messages_[order - 
1].Extends(order, indices)) ? kExtensionBackoff : kNoExtensionBackoff;
-      return ret;
-    }
-
-    const std::vector<float> &Values(unsigned char order) const {
-      return values_[order - 1];
-    }
-
-  private:
-    // This used to be one array.  Then I needed to separate it by order for 
quantization to work.
-    std::vector<float> values_[KENLM_MAX_ORDER - 1];
-    BackoffMessages messages_[KENLM_MAX_ORDER - 1];
-
-    float *it_[KENLM_MAX_ORDER - 1];
-};
-
-class FindBlanks {
-  public:
-    FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks 
&messages)
-      : counts_(order), unigrams_(unigrams), sri_(messages) {}
-
-    float UnigramProb(WordIndex index) const {
-      return unigrams_[index].prob;
-    }
-
-    void Unigram(WordIndex /*index*/) {
-      ++counts_[0];
-    }
-
-    void MiddleBlank(const unsigned char order, const WordIndex *indices, 
unsigned char lower, float prob_basis) {
-      sri_.Send(lower, order, indices + 1, prob_basis);
-      ++counts_[order - 1];
-    }
-
-    void Middle(const unsigned char order, const void * /*data*/) {
-      ++counts_[order - 1];
-    }
-
-    void Longest(const void * /*data*/) {
-      ++counts_.back();
-    }
-
-    const std::vector<uint64_t> &Counts() const {
-      return counts_;
-    }
-
-  private:
-    std::vector<uint64_t> counts_;
-
-    const ProbBackoff *unigrams_;
-
-    SRISucks &sri_;
-};
-
-// Phase to actually write n-grams to the trie.
-template <class Quant, class Bhiksha> class WriteEntries {
-  public:
-    WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue 
*unigrams, BitPackedMiddle<Bhiksha> *middle, BitPackedLongest &longest, 
unsigned char order, SRISucks &sri) :
-      contexts_(contexts),
-      quant_(quant),
-      unigrams_(unigrams),
-      middle_(middle),
-      longest_(longest),
-      bigram_pack_((order == 2) ? static_cast<BitPacked&>(longest_) : 
static_cast<BitPacked&>(*middle_)),
-      order_(order),
-      sri_(sri) {}
-
-    float UnigramProb(WordIndex index) const { return 
unigrams_[index].weights.prob; }
-
-    void Unigram(WordIndex word) {
-      unigrams_[word].next = bigram_pack_.InsertIndex();
-    }
-
-    void MiddleBlank(const unsigned char order, const WordIndex *indices, 
unsigned char /*lower*/, float /*prob_base*/) {
-      ProbBackoff weights = sri_.GetBlank(order_, order, indices);
-      typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 
2].Insert(indices[order - 1])).Write(weights.prob, weights.backoff);
-    }
-
-    void Middle(const unsigned char order, const void *data) {
-      RecordReader &context = contexts_[order - 1];
-      const WordIndex *words = reinterpret_cast<const WordIndex*>(data);
-      ProbBackoff weights = *reinterpret_cast<const ProbBackoff*>(words + 
order);
-      if (context && !memcmp(data, context.Data(), sizeof(WordIndex) * order)) 
{
-        SetExtension(weights.backoff);
-        ++context;
-      }
-      typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 
2].Insert(words[order - 1])).Write(weights.prob, weights.backoff);
-    }
-
-    void Longest(const void *data) {
-      const WordIndex *words = reinterpret_cast<const WordIndex*>(data);
-      typename Quant::LongestPointer(quant_, longest_.Insert(words[order_ - 
1])).Write(reinterpret_cast<const Prob*>(words + order_)->prob);
-    }
-
-  private:
-    RecordReader *contexts_;
-    const Quant &quant_;
-    UnigramValue *const unigrams_;
-    BitPackedMiddle<Bhiksha> *const middle_;
-    BitPackedLongest &longest_;
-    BitPacked &bigram_pack_;
-    const unsigned char order_;
-    SRISucks &sri_;
-};
-
-struct Gram {
-  Gram(const WordIndex *in_begin, unsigned char order) : begin(in_begin), 
end(in_begin + order) {}
-
-  const WordIndex *begin, *end;
-
-  // For queue, this is the direction we want.
-  bool operator<(const Gram &other) const {
-    return std::lexicographical_compare(other.begin, other.end, begin, end);
-  }
-};
-
-template <class Doing> class BlankManager {
-  public:
-    BlankManager(unsigned char total_order, Doing &doing) : 
total_order_(total_order), been_length_(0), doing_(doing) {
-      for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = 
kBadProb;
-    }
-
-    void Visit(const WordIndex *to, unsigned char length, float prob) {
-      basis_[length - 1] = prob;
-      unsigned char overlap = std::min<unsigned char>(length - 1, 
been_length_);
-      const WordIndex *cur;
-      WordIndex *pre;
-      for (cur = to, pre = been_; cur != to + overlap; ++cur, ++pre) {
-        if (*pre != *cur) break;
-      }
-      if (cur == to + length - 1) {
-        *pre = *cur;
-        been_length_ = length;
-        return;
-      }
-      // There are blanks to insert starting with order blank.
-      unsigned char blank = cur - to + 1;
-      UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that 
appears as context.");
-      const float *lower_basis;
-      for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; 
--lower_basis) {}
-      unsigned char based_on = lower_basis - basis_ + 1;
-      for (; cur != to + length - 1; ++blank, ++cur, ++pre) {
-        assert(*lower_basis != kBadProb);
-        doing_.MiddleBlank(blank, to, based_on, *lower_basis);
-        *pre = *cur;
-        // Mark that the probability is a blank so it shouldn't be used as the 
basis for a later n-gram.
-        basis_[blank - 1] = kBadProb;
-      }
-      *pre = *cur;
-      been_length_ = length;
-    }
-
-  private:
-    const unsigned char total_order_;
-
-    WordIndex been_[KENLM_MAX_ORDER];
-    unsigned char been_length_;
-
-    float basis_[KENLM_MAX_ORDER];
-
-    Doing &doing_;
-};
-
-template <class Doing> void RecursiveInsert(const unsigned char total_order, 
const WordIndex unigram_count, RecordReader *input, std::ostream *progress_out, 
const char *message, Doing &doing) {
-  util::ErsatzProgress progress(unigram_count + 1, progress_out, message);
-  WordIndex unigram = 0;
-  std::priority_queue<Gram> grams;
-  if (unigram_count) grams.push(Gram(&unigram, 1));
-  for (unsigned char i = 2; i <= total_order; ++i) {
-    if (input[i-2]) grams.push(Gram(reinterpret_cast<const 
WordIndex*>(input[i-2].Data()), i));
-  }
-
-  BlankManager<Doing> blank(total_order, doing);
-
-  while (!grams.empty()) {
-    Gram top = grams.top();
-    grams.pop();
-    unsigned char order = top.end - top.begin;
-    if (order == 1) {
-      blank.Visit(&unigram, 1, doing.UnigramProb(unigram));
-      doing.Unigram(unigram);
-      progress.Set(unigram);
-      if (++unigram < unigram_count) grams.push(top);
-    } else {
-      if (order == total_order) {
-        blank.Visit(top.begin, order, reinterpret_cast<const 
Prob*>(top.end)->prob);
-        doing.Longest(top.begin);
-      } else {
-        blank.Visit(top.begin, order, reinterpret_cast<const 
ProbBackoff*>(top.end)->prob);
-        doing.Middle(order, top.begin);
-      }
-      RecordReader &reader = input[order - 2];
-      if (++reader) grams.push(top);
-    }
-  }
-}
-
-void SanityCheckCounts(const std::vector<uint64_t> &initial, const 
std::vector<uint64_t> &fixed) {
-  if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count 
should be constant but initial is " << initial[0] << " and recounted is " << 
fixed[0]);
-  if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest 
count should be constant but it changed from " << initial.back() << " to " << 
fixed.back());
-  for (unsigned char i = 0; i < initial.size(); ++i) {
-    if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out 
lower than expected.  This shouldn't happen");
-  }
-}
-
-template <class Quant> void TrainQuantizer(uint8_t order, uint64_t count, 
const std::vector<float> &additional, RecordReader &reader, 
util::ErsatzProgress &progress, Quant &quant) {
-  std::vector<float> probs(additional), backoffs;
-  probs.reserve(count + additional.size());
-  backoffs.reserve(count);
-  for (reader.Rewind(); reader; ++reader) {
-    const ProbBackoff &weights = *reinterpret_cast<const 
ProbBackoff*>(reinterpret_cast<const uint8_t*>(reader.Data()) + 
sizeof(WordIndex) * order);
-    probs.push_back(weights.prob);
-    if (weights.backoff != 0.0) backoffs.push_back(weights.backoff);
-    ++progress;
-  }
-  quant.Train(order, probs, backoffs);
-}
-
-template <class Quant> void TrainProbQuantizer(uint8_t order, uint64_t count, 
RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) {
-  std::vector<float> probs, backoffs;
-  probs.reserve(count);
-  for (reader.Rewind(); reader; ++reader) {
-    const Prob &weights = *reinterpret_cast<const 
Prob*>(reinterpret_cast<const uint8_t*>(reader.Data()) + sizeof(WordIndex) * 
order);
-    probs.push_back(weights.prob);
-    ++progress;
-  }
-  quant.TrainProb(order, probs);
-}
-
-void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader 
&contexts, UnigramValue *unigrams) {
-  // Fill unigram probabilities.
-  try {
-    rewind(file);
-    for (WordIndex i = 0; i < unigram_count; ++i) {
-      ReadOrThrow(file, &unigrams[i].weights, sizeof(ProbBackoff));
-      if (contexts && *reinterpret_cast<const WordIndex*>(contexts.Data()) == 
i) {
-        SetExtension(unigrams[i].weights.backoff);
-        ++contexts;
-      }
-    }
-  } catch (util::Exception &e) {
-    e << " while re-reading unigram probabilities";
-    throw;
-  }
-}
-
-} // namespace
-
-template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, 
std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> 
&out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing) {
-  RecordReader inputs[KENLM_MAX_ORDER - 1];
-  RecordReader contexts[KENLM_MAX_ORDER - 1];
-
-  for (unsigned char i = 2; i <= counts.size(); ++i) {
-    inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == 
counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
-    contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex));
-  }
-
-  SRISucks sri;
-  std::vector<uint64_t> fixed_counts;
-  util::scoped_FILE unigram_file;
-  util::scoped_fd unigram_fd(files.StealUnigram());
-  {
-    util::scoped_memory unigrams;
-    MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * 
sizeof(ProbBackoff), unigrams);
-    FindBlanks finder(counts.size(), reinterpret_cast<const 
ProbBackoff*>(unigrams.get()), sri);
-    RecursiveInsert(counts.size(), counts[0], inputs, 
config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder);
-    fixed_counts = finder.Counts();
-  }
-  unigram_file.reset(util::FDOpenOrThrow(unigram_fd));
-  for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {
-    if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie 
implementation: the " << (i - inputs + 2) << "-gram table did not complete 
reading");
-  }
-  SanityCheckCounts(counts, fixed_counts);
-  counts = fixed_counts;
-
-  sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
-
-  void *vocab_relocate;
-  void *search_base = backing.GrowForSearch(TrieSearch<Quant, 
Bhiksha>::Size(fixed_counts, config), vocab.UnkCountChangePadding(), 
vocab_relocate);
-  vocab.Relocate(vocab_relocate);
-  out.SetupMemory(reinterpret_cast<uint8_t*>(search_base), fixed_counts, 
config);
-
-  for (unsigned char i = 2; i <= counts.size(); ++i) {
-    inputs[i-2].Rewind();
-  }
-  if (Quant::kTrain) {
-    util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, 
counts.end(), 0),
-                                  config.ProgressMessages(), "Quantizing");
-    for (unsigned char i = 2; i < counts.size(); ++i) {
-      TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, 
quant);
-    }
-    TrainProbQuantizer(counts.size(), counts.back(), inputs[counts.size() - 
2], progress, quant);
-    quant.FinishedLoading(config);
-  }
-
-  UnigramValue *unigrams = out.unigram_.Raw();
-  PopulateUnigramWeights(unigram_file.get(), counts[0], contexts[0], unigrams);
-  unigram_file.reset();
-
-  for (unsigned char i = 2; i <= counts.size(); ++i) {
-    inputs[i-2].Rewind();
-  }
-  // Fill entries except unigram probabilities.
-  {
-    WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, 
out.middle_begin_, out.longest_, counts.size(), sri);
-    RecursiveInsert(counts.size(), counts[0], inputs, 
config.ProgressMessages(), "Writing trie", writer);
-    // Write the last unigram entry, which is the end pointer for the bigrams.
-    writer.Unigram(counts[0]);
-  }
-
-  // Do not disable this error message or else too little state will be 
returned.  Both WriteEntries::Middle and returning state based on found n-grams 
will need to be fixed to handle this situation.
-  for (unsigned char order = 2; order <= counts.size(); ++order) {
-    const RecordReader &context = contexts[order - 2];
-    if (context) {
-      FormatLoadException e;
-      e << "A " << static_cast<unsigned int>(order) << "-gram has context";
-      const WordIndex *ctx = reinterpret_cast<const 
WordIndex*>(context.Data());
-      for (const WordIndex *i = ctx; i != ctx + order - 1; ++i) {
-        e << ' ' << *i;
-      }
-      e << " so this context must appear in the model as a " << 
static_cast<unsigned int>(order - 1) << "-gram but it does not";
-      throw e;
-    }
-  }
-
-  /* Set ending offsets so the last entry will be sized properly */
-  // Last entry for unigrams was already set.
-  if (out.middle_begin_ != out.middle_end_) {
-    for (typename TrieSearch<Quant, Bhiksha>::Middle *i = out.middle_begin_; i 
!= out.middle_end_ - 1; ++i) {
-      i->FinishedLoading((i+1)->InsertIndex(), config);
-    }
-    (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config);
-  }
-}
-
-template <class Quant, class Bhiksha> uint8_t *TrieSearch<Quant, 
Bhiksha>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, 
const Config &config) {
-  quant_.SetupMemory(start, counts.size(), config);
-  start += Quant::Size(counts.size(), config);
-  unigram_.Init(start);
-  start += Unigram::Size(counts[0]);
-  FreeMiddles();
-  middle_begin_ = static_cast<Middle*>(malloc(sizeof(Middle) * (counts.size() 
- 2)));
-  middle_end_ = middle_begin_ + (counts.size() - 2);
-  std::vector<uint8_t*> middle_starts(counts.size() - 2);
-  for (unsigned char i = 2; i < counts.size(); ++i) {
-    middle_starts[i-2] = start;
-    start += Middle::Size(Quant::MiddleBits(config), counts[i-1], counts[0], 
counts[i], config);
-  }
-  // Crazy backwards thing so we initialize using pointers to ones that have 
already been initialized
-  for (unsigned char i = counts.size() - 1; i >= 2; --i) {
-    // use "placement new" syntax to initalize Middle in an already-allocated 
memory location
-    new (middle_begin_ + i - 2) Middle(
-        middle_starts[i-2],
-        quant_.MiddleBits(config),
-        counts[i-1],
-        counts[0],
-        counts[i],
-        (i == counts.size() - 1) ? static_cast<const BitPacked&>(longest_) : 
static_cast<const BitPacked &>(middle_begin_[i-1]),
-        config);
-  }
-  longest_.Init(start, quant_.LongestBits(config), counts[0]);
-  return start + Longest::Size(Quant::LongestBits(config), counts.back(), 
counts[0]);
-}
-
-template <class Quant, class Bhiksha> void TrieSearch<Quant, 
Bhiksha>::InitializeFromARPA(const char *file, util::FilePiece &f, 
std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, 
BinaryFormat &backing) {
-  std::string temporary_prefix;
-  if (!config.temporary_directory_prefix.empty()) {
-    temporary_prefix = config.temporary_directory_prefix;
-  } else if (config.write_mmap) {
-    temporary_prefix = config.write_mmap;
-  } else {
-    temporary_prefix = file;
-  }
-  // At least 1MB sorting memory.
-  SortedFiles sorted(config, f, counts, 
std::max<size_t>(config.building_memory, 1048576), temporary_prefix, vocab);
-
-  BuildTrie(sorted, counts, config, *this, quant_, vocab, backing);
-}
-
-template class TrieSearch<DontQuantize, DontBhiksha>;
-template class TrieSearch<DontQuantize, ArrayBhiksha>;
-template class TrieSearch<SeparatelyQuantize, DontBhiksha>;
-template class TrieSearch<SeparatelyQuantize, ArrayBhiksha>;
-
-} // namespace trie
-} // namespace ngram
-} // namespace lm


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/search_trie.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/search_trie.hh b/ext/kenlm/lm/search_trie.hh
deleted file mode 100644
index 1adba6e..0000000
--- a/ext/kenlm/lm/search_trie.hh
+++ /dev/null
@@ -1,129 +0,0 @@
-#ifndef LM_SEARCH_TRIE_H
-#define LM_SEARCH_TRIE_H
-
-#include "lm/config.hh"
-#include "lm/model_type.hh"
-#include "lm/return.hh"
-#include "lm/trie.hh"
-#include "lm/weights.hh"
-
-#include "util/file.hh"
-#include "util/file_piece.hh"
-
-#include <vector>
-#include <cstdlib>
-#include <cassert>
-
-namespace lm {
-namespace ngram {
-class BinaryFormat;
-class SortedVocabulary;
-namespace trie {
-
-template <class Quant, class Bhiksha> class TrieSearch;
-class SortedFiles;
-template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, 
std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> 
&out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing);
-
-template <class Quant, class Bhiksha> class TrieSearch {
-  public:
-    typedef NodeRange Node;
-
-    typedef ::lm::ngram::trie::UnigramPointer UnigramPointer;
-    typedef typename Quant::MiddlePointer MiddlePointer;
-    typedef typename Quant::LongestPointer LongestPointer;
-
-    static const bool kDifferentRest = false;
-
-    static const ModelType kModelType = static_cast<ModelType>(TRIE_SORTED + 
Quant::kModelTypeAdd + Bhiksha::kModelTypeAdd);
-
-    static const unsigned int kVersion = 1;
-
-    static void UpdateConfigFromBinary(const BinaryFormat &file, const 
std::vector<uint64_t> &counts, uint64_t offset, Config &config) {
-      Quant::UpdateConfigFromBinary(file, offset, config);
-      // Currently the unigram pointers are not compresssed, so there will 
only be a header for order > 2.
-      if (counts.size() > 2)
-        Bhiksha::UpdateConfigFromBinary(file, offset + 
Quant::Size(counts.size(), config) + Unigram::Size(counts[0]), config);
-    }
-
-    static uint64_t Size(const std::vector<uint64_t> &counts, const Config 
&config) {
-      uint64_t ret = Quant::Size(counts.size(), config) + 
Unigram::Size(counts[0]);
-      for (unsigned char i = 1; i < counts.size() - 1; ++i) {
-        ret += Middle::Size(Quant::MiddleBits(config), counts[i], counts[0], 
counts[i+1], config);
-      }
-      return ret + Longest::Size(Quant::LongestBits(config), counts.back(), 
counts[0]);
-    }
-
-    TrieSearch() : middle_begin_(NULL), middle_end_(NULL) {}
-
-    ~TrieSearch() { FreeMiddles(); }
-
-    uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, 
const Config &config);
-
-    void InitializeFromARPA(const char *file, util::FilePiece &f, 
std::vector<uint64_t> &counts, const Config &config, SortedVocabulary &vocab, 
BinaryFormat &backing);
-
-    unsigned char Order() const {
-      return middle_end_ - middle_begin_ + 2;
-    }
-
-    ProbBackoff &UnknownUnigram() { return unigram_.Unknown(); }
-
-    UnigramPointer LookupUnigram(WordIndex word, Node &next, bool 
&independent_left, uint64_t &extend_left) const {
-      extend_left = static_cast<uint64_t>(word);
-      UnigramPointer ret(unigram_.Find(word, next));
-      independent_left = (next.begin == next.end);
-      return ret;
-    }
-
-    MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, 
Node &node) const {
-      return MiddlePointer(quant_, extend_length - 2, 
middle_begin_[extend_length - 2].ReadEntry(extend_pointer, node));
-    }
-
-    MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, 
Node &node, bool &independent_left, uint64_t &extend_left) const {
-      util::BitAddress address(middle_begin_[order_minus_2].Find(word, node, 
extend_left));
-      independent_left = (address.base == NULL) || (node.begin == node.end);
-      return MiddlePointer(quant_, order_minus_2, address);
-    }
-
-    LongestPointer LookupLongest(WordIndex word, const Node &node) const {
-      return LongestPointer(quant_, longest_.Find(word, node));
-    }
-
-    bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node 
&node) const {
-      assert(begin != end);
-      bool independent_left;
-      uint64_t ignored;
-      LookupUnigram(*begin, node, independent_left, ignored);
-      for (const WordIndex *i = begin + 1; i < end; ++i) {
-        if (independent_left || !LookupMiddle(i - begin - 1, *i, node, 
independent_left, ignored).Found()) return false;
-      }
-      return true;
-    }
-
-  private:
-    friend void BuildTrie<Quant, Bhiksha>(SortedFiles &files, 
std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> 
&out, Quant &quant, SortedVocabulary &vocab, BinaryFormat &backing);
-
-    // Middles are managed manually so we can delay construction and they 
don't have to be copyable.
-    void FreeMiddles() {
-      for (const Middle *i = middle_begin_; i != middle_end_; ++i) {
-        i->~Middle();
-      }
-      std::free(middle_begin_);
-    }
-
-    typedef trie::BitPackedMiddle<Bhiksha> Middle;
-
-    typedef trie::BitPackedLongest Longest;
-    Longest longest_;
-
-    Middle *middle_begin_, *middle_end_;
-    Quant quant_;
-
-    typedef ::lm::ngram::trie::Unigram Unigram;
-    Unigram unigram_;
-};
-
-} // namespace trie
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_SEARCH_TRIE_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/sizes.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/sizes.cc b/ext/kenlm/lm/sizes.cc
deleted file mode 100644
index dd831c5..0000000
--- a/ext/kenlm/lm/sizes.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-#include "lm/sizes.hh"
-#include "lm/model.hh"
-#include "util/file_piece.hh"
-
-#include <vector>
-#include <iomanip>
-
-namespace lm {
-namespace ngram {
-
-void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config 
&config) {
-  uint64_t sizes[6];
-  sizes[0] = ProbingModel::Size(counts, config);
-  sizes[1] = RestProbingModel::Size(counts, config);
-  sizes[2] = TrieModel::Size(counts, config);
-  sizes[3] = QuantTrieModel::Size(counts, config);
-  sizes[4] = ArrayTrieModel::Size(counts, config);
-  sizes[5] = QuantArrayTrieModel::Size(counts, config);
-  uint64_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / 
sizeof(uint64_t));
-  uint64_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / 
sizeof(uint64_t));
-  uint64_t divide;
-  char prefix;
-  if (min_length < (1 << 10) * 10) {
-    prefix = ' ';
-    divide = 1;
-  } else if (min_length < (1 << 20) * 10) {
-    prefix = 'k';
-    divide = 1 << 10;
-  } else if (min_length < (1ULL << 30) * 10) {
-    prefix = 'M';
-    divide = 1 << 20;
-  } else {
-    prefix = 'G';
-    divide = 1 << 30;
-  }
-  long int length = std::max<long int>(2, static_cast<long 
int>(ceil(log10((double) max_length / divide))));
-  std::cerr << "Memory estimate for binary LM:\ntype    ";
-
-  // right align bytes.
-  for (long int i = 0; i < length - 2; ++i) std::cerr << ' ';
-
-  std::cerr << prefix << "B\n"
-    "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " 
<< config.probing_multiplier << "\n"
-    "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r 
models -p " << config.probing_multiplier << "\n"
-    "trie    " << std::setw(length) << (sizes[2] / divide) << " without 
quantization\n"
-    "trie    " << std::setw(length) << (sizes[3] / divide) << " assuming -q " 
<< (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " 
quantization \n"
-    "trie    " << std::setw(length) << (sizes[4] / divide) << " assuming -a " 
<< (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
-    "trie    " << std::setw(length) << (sizes[5] / divide) << " assuming -a " 
<< (unsigned)config.pointer_bhiksha_bits << " -q " << 
(unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array 
pointer compression and quantization\n";
-}
-
-void ShowSizes(const std::vector<uint64_t> &counts) {
-  lm::ngram::Config config;
-  ShowSizes(counts, config);
-}
-
-void ShowSizes(const char *file, const lm::ngram::Config &config) {
-  std::vector<uint64_t> counts;
-  util::FilePiece f(file);
-  lm::ReadARPACounts(f, counts);
-  ShowSizes(counts, config);
-}
-
-}} //namespaces

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/sizes.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/sizes.hh b/ext/kenlm/lm/sizes.hh
deleted file mode 100644
index eb7e99d..0000000
--- a/ext/kenlm/lm/sizes.hh
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef LM_SIZES_H
-#define LM_SIZES_H
-
-#include <vector>
-
-#include <stdint.h>
-
-namespace lm { namespace ngram {
-
-struct Config;
-
-void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config 
&config);
-void ShowSizes(const std::vector<uint64_t> &counts);
-void ShowSizes(const char *file, const lm::ngram::Config &config);
-
-}} // namespaces
-#endif // LM_SIZES_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/state.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/state.hh b/ext/kenlm/lm/state.hh
deleted file mode 100644
index 2195dee..0000000
--- a/ext/kenlm/lm/state.hh
+++ /dev/null
@@ -1,125 +0,0 @@
-#ifndef LM_STATE_H
-#define LM_STATE_H
-
-#include "lm/max_order.hh"
-#include "lm/word_index.hh"
-#include "util/murmur_hash.hh"
-
-#include <cstring>
-
-namespace lm {
-namespace ngram {
-
-// This is a POD but if you want memcmp to return the same as operator==, call
-// ZeroRemaining first.
-class State {
-  public:
-    bool operator==(const State &other) const {
-      if (length != other.length) return false;
-      return !memcmp(words, other.words, length * sizeof(WordIndex));
-    }
-
-    // Three way comparison function.
-    int Compare(const State &other) const {
-      if (length != other.length) return length < other.length ? -1 : 1;
-      return memcmp(words, other.words, length * sizeof(WordIndex));
-    }
-
-    bool operator<(const State &other) const {
-      if (length != other.length) return length < other.length;
-      return memcmp(words, other.words, length * sizeof(WordIndex)) < 0;
-    }
-
-    // Call this before using raw memcmp.
-    void ZeroRemaining() {
-      for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) {
-        words[i] = 0;
-        backoff[i] = 0.0;
-      }
-    }
-
-    unsigned char Length() const { return length; }
-
-    // You shouldn't need to touch anything below this line, but the members 
are public so FullState will qualify as a POD.
-    // This order minimizes total size of the struct if WordIndex is 64 bit, 
float is 32 bit, and alignment of 64 bit integers is 64 bit.
-    WordIndex words[KENLM_MAX_ORDER - 1];
-    float backoff[KENLM_MAX_ORDER - 1];
-    unsigned char length;
-};
-
-typedef State Right;
-
-inline uint64_t hash_value(const State &state, uint64_t seed = 0) {
-  return util::MurmurHashNative(state.words, sizeof(WordIndex) * state.length, 
seed);
-}
-
-struct Left {
-  bool operator==(const Left &other) const {
-    return
-      length == other.length &&
-      (!length || (pointers[length - 1] == other.pointers[length - 1] && full 
== other.full));
-  }
-
-  int Compare(const Left &other) const {
-    if (length < other.length) return -1;
-    if (length > other.length) return 1;
-    if (length == 0) return 0; // Must be full.
-    if (pointers[length - 1] > other.pointers[length - 1]) return 1;
-    if (pointers[length - 1] < other.pointers[length - 1]) return -1;
-    return (int)full - (int)other.full;
-  }
-
-  bool operator<(const Left &other) const {
-    return Compare(other) == -1;
-  }
-
-  void ZeroRemaining() {
-    for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; 
++i)
-      *i = 0;
-  }
-
-  uint64_t pointers[KENLM_MAX_ORDER - 1];
-  unsigned char length;
-  bool full;
-};
-
-inline uint64_t hash_value(const Left &left) {
-  unsigned char add[2];
-  add[0] = left.length;
-  add[1] = left.full;
-  return util::MurmurHashNative(add, 2, left.length ? 
left.pointers[left.length - 1] : 0);
-}
-
-struct ChartState {
-  bool operator==(const ChartState &other) const {
-    return (right == other.right) && (left == other.left);
-  }
-
-  int Compare(const ChartState &other) const {
-    int lres = left.Compare(other.left);
-    if (lres) return lres;
-    return right.Compare(other.right);
-  }
-
-  bool operator<(const ChartState &other) const {
-    return Compare(other) < 0;
-  }
-
-  void ZeroRemaining() {
-    left.ZeroRemaining();
-    right.ZeroRemaining();
-  }
-
-  Left left;
-  State right;
-};
-
-inline uint64_t hash_value(const ChartState &state) {
-  return hash_value(state.right, hash_value(state.left));
-}
-
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_STATE_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/test.arpa
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/test.arpa b/ext/kenlm/lm/test.arpa
deleted file mode 100644
index c4d2e6d..0000000
--- a/ext/kenlm/lm/test.arpa
+++ /dev/null
@@ -1,124 +0,0 @@
-
-\data\
-ngram 1=37
-ngram 2=47
-ngram 3=11
-ngram 4=6
-ngram 5=4
-
-\1-grams:
--1.383514      ,       -0.30103
--1.139057      .       -0.845098
--1.029493      </s>
--99    <s>     -0.4149733
--1.995635      <unk>   -20
--1.285941      a       -0.69897
--1.687872      also    -0.30103
--1.687872      beyond  -0.30103
--1.687872      biarritz        -0.30103
--1.687872      call    -0.30103
--1.687872      concerns        -0.30103
--1.687872      consider        -0.30103
--1.687872      considering     -0.30103
--1.687872      for     -0.30103
--1.509559      higher  -0.30103
--1.687872      however -0.30103
--1.687872      i       -0.30103
--1.687872      immediate       -0.30103
--1.687872      in      -0.30103
--1.687872      is      -0.30103
--1.285941      little  -0.69897
--1.383514      loin    -0.30103
--1.687872      look    -0.30103
--1.285941      looking -0.4771212
--1.206319      more    -0.544068
--1.509559      on      -0.4771212
--1.509559      screening       -0.4771212
--1.687872      small   -0.30103
--1.687872      the     -0.30103
--1.687872      to      -0.30103
--1.687872      watch   -0.30103
--1.687872      watching        -0.30103
--1.687872      what    -0.30103
--1.687872      would   -0.30103
--3.141592      foo
--2.718281      bar     3.0
--6.535897      baz     -0.0
-
-\2-grams:
--0.6925742     , .
--0.7522095     , however
--0.7522095     , is
--0.0602359     . </s>
--0.4846522     <s> looking     -0.4771214
--1.051485      <s> screening
--1.07153       <s> the
--1.07153       <s> watching
--1.07153       <s> what
--0.09132547    a little        -0.69897
--0.2922095     also call
--0.2922095     beyond immediate
--0.2705918     biarritz .
--0.2922095     call for
--0.2922095     concerns in
--0.2922095     consider watch
--0.2922095     considering consider
--0.2834328     for ,
--0.5511513     higher more
--0.5845945     higher small
--0.2834328     however ,
--0.2922095     i would
--0.2922095     immediate concerns
--0.2922095     in biarritz
--0.2922095     is to
--0.09021038    little more     -0.1998621
--0.7273645     loin ,
--0.6925742     loin .
--0.6708385     loin </s>
--0.2922095     look beyond
--0.4638903     looking higher
--0.4638903     looking on      -0.4771212
--0.5136299     more .  -0.4771212
--0.3561665     more loin
--0.1649931     on a    -0.4771213
--0.1649931     screening a     -0.4771213
--0.2705918     small .
--0.287799      the screening
--0.2922095     to look
--0.2622373     watch </s>
--0.2922095     watching considering
--0.2922095     what i
--0.2922095     would also
--2     also would      -6
--15    <unk> <unk>     -2
--4     <unk> however   -1
--6     foo bar
-
-\3-grams:
--0.01916512    more . </s>
--0.0283603     on a little     -0.4771212
--0.0283603     screening a little      -0.4771212
--0.01660496    a little more   -0.09409451
--0.3488368     <s> looking higher
--0.3488368     <s> looking on  -0.4771212
--0.1892331     little more loin
--0.04835128    looking on a    -0.4771212
--3     also would consider     -7
--6     <unk> however <unk>     -12
--7     to look a
-
-\4-grams:
--0.009249173   looking on a little     -0.4771212
--0.005464747   on a little more        -0.4771212
--0.005464747   screening a little more
--0.1453306     a little more loin
--0.01552657    <s> looking on a        -0.4771212
--4     also would consider higher      -8
-
-\5-grams:
--0.003061223   <s> looking on a little
--0.001813953   looking on a little more
--0.0432557     on a little more loin
--5     also would consider higher looking
-
-\end\

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/test_nounk.arpa
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/test_nounk.arpa b/ext/kenlm/lm/test_nounk.arpa
deleted file mode 100644
index e38fc85..0000000
--- a/ext/kenlm/lm/test_nounk.arpa
+++ /dev/null
@@ -1,120 +0,0 @@
-
-\data\
-ngram 1=36
-ngram 2=45
-ngram 3=10
-ngram 4=6
-ngram 5=4
-
-\1-grams:
--1.383514      ,       -0.30103
--1.139057      .       -0.845098
--1.029493      </s>
--99    <s>     -0.4149733
--1.285941      a       -0.69897
--1.687872      also    -0.30103
--1.687872      beyond  -0.30103
--1.687872      biarritz        -0.30103
--1.687872      call    -0.30103
--1.687872      concerns        -0.30103
--1.687872      consider        -0.30103
--1.687872      considering     -0.30103
--1.687872      for     -0.30103
--1.509559      higher  -0.30103
--1.687872      however -0.30103
--1.687872      i       -0.30103
--1.687872      immediate       -0.30103
--1.687872      in      -0.30103
--1.687872      is      -0.30103
--1.285941      little  -0.69897
--1.383514      loin    -0.30103
--1.687872      look    -0.30103
--1.285941      looking -0.4771212
--1.206319      more    -0.544068
--1.509559      on      -0.4771212
--1.509559      screening       -0.4771212
--1.687872      small   -0.30103
--1.687872      the     -0.30103
--1.687872      to      -0.30103
--1.687872      watch   -0.30103
--1.687872      watching        -0.30103
--1.687872      what    -0.30103
--1.687872      would   -0.30103
--3.141592      foo
--2.718281      bar     3.0
--6.535897      baz     -0.0
-
-\2-grams:
--0.6925742     , .
--0.7522095     , however
--0.7522095     , is
--0.0602359     . </s>
--0.4846522     <s> looking     -0.4771214
--1.051485      <s> screening
--1.07153       <s> the
--1.07153       <s> watching
--1.07153       <s> what
--0.09132547    a little        -0.69897
--0.2922095     also call
--0.2922095     beyond immediate
--0.2705918     biarritz .
--0.2922095     call for
--0.2922095     concerns in
--0.2922095     consider watch
--0.2922095     considering consider
--0.2834328     for ,
--0.5511513     higher more
--0.5845945     higher small
--0.2834328     however ,
--0.2922095     i would
--0.2922095     immediate concerns
--0.2922095     in biarritz
--0.2922095     is to
--0.09021038    little more     -0.1998621
--0.7273645     loin ,
--0.6925742     loin .
--0.6708385     loin </s>
--0.2922095     look beyond
--0.4638903     looking higher
--0.4638903     looking on      -0.4771212
--0.5136299     more .  -0.4771212
--0.3561665     more loin
--0.1649931     on a    -0.4771213
--0.1649931     screening a     -0.4771213
--0.2705918     small .
--0.287799      the screening
--0.2922095     to look
--0.2622373     watch </s>
--0.2922095     watching considering
--0.2922095     what i
--0.2922095     would also
--2     also would      -6
--6     foo bar
-
-\3-grams:
--0.01916512    more . </s>
--0.0283603     on a little     -0.4771212
--0.0283603     screening a little      -0.4771212
--0.01660496    a little more   -0.09409451
--0.3488368     <s> looking higher
--0.3488368     <s> looking on  -0.4771212
--0.1892331     little more loin
--0.04835128    looking on a    -0.4771212
--3     also would consider     -7
--7     to look a
-
-\4-grams:
--0.009249173   looking on a little     -0.4771212
--0.005464747   on a little more        -0.4771212
--0.005464747   screening a little more
--0.1453306     a little more loin
--0.01552657    <s> looking on a        -0.4771212
--4     also would consider higher      -8
-
-\5-grams:
--0.003061223   <s> looking on a little
--0.001813953   looking on a little more
--0.0432557     on a little more loin
--5     also would consider higher looking
-
-\end\

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/trie.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/trie.cc b/ext/kenlm/lm/trie.cc
deleted file mode 100644
index 72ad544..0000000
--- a/ext/kenlm/lm/trie.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-#include "lm/trie.hh"
-
-#include "lm/bhiksha.hh"
-#include "util/bit_packing.hh"
-#include "util/exception.hh"
-#include "util/sorted_uniform.hh"
-
-#include <cassert>
-
-namespace lm {
-namespace ngram {
-namespace trie {
-namespace {
-
-class KeyAccessor {
-  public:
-    KeyAccessor(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t 
total_bits)
-      : base_(reinterpret_cast<const uint8_t*>(base)), key_mask_(key_mask), 
key_bits_(key_bits), total_bits_(total_bits) {}
-
-    typedef uint64_t Key;
-
-    Key operator()(uint64_t index) const {
-      return util::ReadInt57(base_, index * 
static_cast<uint64_t>(total_bits_), key_bits_, key_mask_);
-    }
-
-  private:
-    const uint8_t *const base_;
-    const WordIndex key_mask_;
-    const uint8_t key_bits_, total_bits_;
-};
-
-bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t key_bits, 
uint8_t total_bits, uint64_t begin_index, uint64_t end_index, const uint64_t 
max_vocab, const uint64_t key, uint64_t &at_index) {
-  KeyAccessor accessor(base, key_mask, key_bits, total_bits);
-  if (!util::BoundedSortedUniformFind<uint64_t, KeyAccessor, 
util::PivotSelect<sizeof(WordIndex)>::T>(accessor, begin_index - 1, 
(uint64_t)0, end_index, max_vocab, key, at_index)) return false;
-  return true;
-}
-} // namespace
-
-uint64_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t 
remaining_bits) {
-  uint8_t total_bits = util::RequiredBits(max_vocab) + remaining_bits;
-  // Extra entry for next pointer at the end.
-  // +7 then / 8 to round up bits and convert to bytes
-  // +sizeof(uint64_t) so that ReadInt57 etc don't go segfault.
-  // Note that this waste is O(order), not O(number of ngrams).
-  return ((1 + entries) * total_bits + 7) / 8 + sizeof(uint64_t);
-}
-
-void BitPacked::BaseInit(void *base, uint64_t max_vocab, uint8_t 
remaining_bits) {
-  util::BitPackingSanity();
-  word_bits_ = util::RequiredBits(max_vocab);
-  word_mask_ = (1ULL << word_bits_) - 1ULL;
-  if (word_bits_ > 57) UTIL_THROW(util::Exception, "Sorry, word indices more 
than " << (1ULL << 57) << " are not implemented.  Edit util/bit_packing.hh and 
fix the bit packing functions.");
-  total_bits_ = word_bits_ + remaining_bits;
-
-  base_ = static_cast<uint8_t*>(base);
-  insert_index_ = 0;
-  max_vocab_ = max_vocab;
-}
-
-template <class Bhiksha> uint64_t BitPackedMiddle<Bhiksha>::Size(uint8_t 
quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_ptr, const 
Config &config) {
-  return Bhiksha::Size(entries + 1, max_ptr, config) + BaseSize(entries, 
max_vocab, quant_bits + Bhiksha::InlineBits(entries + 1, max_ptr, config));
-}
-
-template <class Bhiksha> BitPackedMiddle<Bhiksha>::BitPackedMiddle(void *base, 
uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, 
const BitPacked &next_source, const Config &config) :
-  BitPacked(),
-  quant_bits_(quant_bits),
-  // If the offset of the method changes, also change 
TrieSearch::UpdateConfigFromBinary.
-  bhiksha_(base, entries + 1, max_next, config),
-  next_source_(&next_source) {
-  if (entries + 1 >= (1ULL << 57) || (max_next >= (1ULL << 57)))  
UTIL_THROW(util::Exception, "Sorry, this does not support more than " << (1ULL 
<< 57) << " n-grams of a particular order.  Edit util/bit_packing.hh and fix 
the bit packing functions.");
-  BaseInit(reinterpret_cast<uint8_t*>(base) + Bhiksha::Size(entries + 1, 
max_next, config), max_vocab, quant_bits_ + bhiksha_.InlineBits());
-}
-
-template <class Bhiksha> util::BitAddress 
BitPackedMiddle<Bhiksha>::Insert(WordIndex word) {
-  assert(word <= word_mask_);
-  uint64_t at_pointer = insert_index_ * total_bits_;
-
-  util::WriteInt57(base_, at_pointer, word_bits_, word);
-  at_pointer += word_bits_;
-  util::BitAddress ret(base_, at_pointer);
-  at_pointer += quant_bits_;
-  uint64_t next = next_source_->InsertIndex();
-  bhiksha_.WriteNext(base_, at_pointer, insert_index_, next);
-  ++insert_index_;
-  return ret;
-}
-
-template <class Bhiksha> util::BitAddress 
BitPackedMiddle<Bhiksha>::Find(WordIndex word, NodeRange &range, uint64_t 
&pointer) const {
-  uint64_t at_pointer;
-  if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, 
range.end, max_vocab_, word, at_pointer)) {
-    return util::BitAddress(NULL, 0);
-  }
-  pointer = at_pointer;
-  at_pointer *= total_bits_;
-  at_pointer += word_bits_;
-  bhiksha_.ReadNext(base_, at_pointer + quant_bits_, pointer, total_bits_, 
range);
-
-  return util::BitAddress(base_, at_pointer);
-}
-
-template <class Bhiksha> void 
BitPackedMiddle<Bhiksha>::FinishedLoading(uint64_t next_end, const Config 
&config) {
-  // Write at insert_index. . .
-  uint64_t last_next_write = insert_index_ * total_bits_ +
-    // at the offset where the next pointers are stored.
-    (total_bits_ - bhiksha_.InlineBits());
-  bhiksha_.WriteNext(base_, last_next_write, insert_index_, next_end);
-  bhiksha_.FinishedLoading(config);
-}
-
-util::BitAddress BitPackedLongest::Insert(WordIndex index) {
-  assert(index <= word_mask_);
-  uint64_t at_pointer = insert_index_ * total_bits_;
-  util::WriteInt57(base_, at_pointer, word_bits_, index);
-  at_pointer += word_bits_;
-  ++insert_index_;
-  return util::BitAddress(base_, at_pointer);
-}
-
-util::BitAddress BitPackedLongest::Find(WordIndex word, const NodeRange 
&range) const {
-  uint64_t at_pointer;
-  if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, 
range.end, max_vocab_, word, at_pointer)) return util::BitAddress(NULL, 0);
-  at_pointer = at_pointer * total_bits_ + word_bits_;
-  return util::BitAddress(base_, at_pointer);
-}
-
-template class BitPackedMiddle<DontBhiksha>;
-template class BitPackedMiddle<ArrayBhiksha>;
-
-} // namespace trie
-} // namespace ngram
-} // namespace lm

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/trie.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/trie.hh b/ext/kenlm/lm/trie.hh
deleted file mode 100644
index b7f0458..0000000
--- a/ext/kenlm/lm/trie.hh
+++ /dev/null
@@ -1,146 +0,0 @@
-#ifndef LM_TRIE_H
-#define LM_TRIE_H
-
-#include "lm/weights.hh"
-#include "lm/word_index.hh"
-#include "util/bit_packing.hh"
-
-#include <cstddef>
-
-#include <stdint.h>
-
-namespace lm {
-namespace ngram {
-struct Config;
-namespace trie {
-
-struct NodeRange {
-  uint64_t begin, end;
-};
-
-// TODO: if the number of unigrams is a concern, also bit pack these records.
-struct UnigramValue {
-  ProbBackoff weights;
-  uint64_t next;
-  uint64_t Next() const { return next; }
-};
-
-class UnigramPointer {
-  public:
-    explicit UnigramPointer(const ProbBackoff &to) : to_(&to) {}
-
-    UnigramPointer() : to_(NULL) {}
-
-    bool Found() const { return to_ != NULL; }
-
-    float Prob() const { return to_->prob; }
-    float Backoff() const { return to_->backoff; }
-    float Rest() const { return Prob(); }
-
-  private:
-    const ProbBackoff *to_;
-};
-
-class Unigram {
-  public:
-    Unigram() {}
-
-    void Init(void *start) {
-      unigram_ = static_cast<UnigramValue*>(start);
-    }
-
-    static uint64_t Size(uint64_t count) {
-      // +1 in case unknown doesn't appear.  +1 for the final next.
-      return (count + 2) * sizeof(UnigramValue);
-    }
-
-    const ProbBackoff &Lookup(WordIndex index) const { return 
unigram_[index].weights; }
-
-    ProbBackoff &Unknown() { return unigram_[0].weights; }
-
-    UnigramValue *Raw() {
-      return unigram_;
-    }
-
-    UnigramPointer Find(WordIndex word, NodeRange &next) const {
-      UnigramValue *val = unigram_ + word;
-      next.begin = val->next;
-      next.end = (val+1)->next;
-      return UnigramPointer(val->weights);
-    }
-
-  private:
-    UnigramValue *unigram_;
-};
-
-class BitPacked {
-  public:
-    BitPacked() {}
-
-    uint64_t InsertIndex() const {
-      return insert_index_;
-    }
-
-  protected:
-    static uint64_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t 
remaining_bits);
-
-    void BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits);
-
-    uint8_t word_bits_;
-    uint8_t total_bits_;
-    uint64_t word_mask_;
-
-    uint8_t *base_;
-
-    uint64_t insert_index_, max_vocab_;
-};
-
-template <class Bhiksha> class BitPackedMiddle : public BitPacked {
-  public:
-    static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t 
max_vocab, uint64_t max_next, const Config &config);
-
-    // next_source need not be initialized.
-    BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t 
max_vocab, uint64_t max_next, const BitPacked &next_source, const Config 
&config);
-
-    util::BitAddress Insert(WordIndex word);
-
-    void FinishedLoading(uint64_t next_end, const Config &config);
-
-    util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) 
const;
-
-    util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) {
-      uint64_t addr = pointer * total_bits_;
-      addr += word_bits_;
-      bhiksha_.ReadNext(base_, addr + quant_bits_, pointer, total_bits_, 
range);
-      return util::BitAddress(base_, addr);
-    }
-
-  private:
-    uint8_t quant_bits_;
-    Bhiksha bhiksha_;
-
-    const BitPacked *next_source_;
-};
-
-class BitPackedLongest : public BitPacked {
-  public:
-    static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t 
max_vocab) {
-      return BaseSize(entries, max_vocab, quant_bits);
-    }
-
-    BitPackedLongest() {}
-
-    void Init(void *base, uint8_t quant_bits, uint64_t max_vocab) {
-      BaseInit(base, max_vocab, quant_bits);
-    }
-
-    util::BitAddress Insert(WordIndex word);
-
-    util::BitAddress Find(WordIndex word, const NodeRange &node) const;
-};
-
-} // namespace trie
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_TRIE_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/trie_sort.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/trie_sort.cc b/ext/kenlm/lm/trie_sort.cc
deleted file mode 100644
index 33a2f96..0000000
--- a/ext/kenlm/lm/trie_sort.cc
+++ /dev/null
@@ -1,304 +0,0 @@
-#include "lm/trie_sort.hh"
-
-#include "lm/config.hh"
-#include "lm/lm_exception.hh"
-#include "lm/read_arpa.hh"
-#include "lm/vocab.hh"
-#include "lm/weights.hh"
-#include "lm/word_index.hh"
-#include "util/file_piece.hh"
-#include "util/mmap.hh"
-#include "util/proxy_iterator.hh"
-#include "util/sized_iterator.hh"
-
-#include <algorithm>
-#include <cstring>
-#include <cstdio>
-#include <cstdlib>
-#include <deque>
-#include <iterator>
-#include <limits>
-#include <vector>
-
-namespace lm {
-namespace ngram {
-namespace trie {
-namespace {
-
-typedef util::SizedIterator NGramIter;
-
-// Proxy for an entry except there is some extra cruft between the entries.  
This is used to sort (n-1)-grams using the same memory as the sorted n-grams.
-class PartialViewProxy {
-  public:
-    PartialViewProxy() : attention_size_(0), inner_() {}
-
-    PartialViewProxy(void *ptr, std::size_t block_size, std::size_t 
attention_size) : attention_size_(attention_size), inner_(ptr, block_size) {}
-
-    operator std::string() const {
-      return std::string(reinterpret_cast<const char*>(inner_.Data()), 
attention_size_);
-    }
-
-    PartialViewProxy &operator=(const PartialViewProxy &from) {
-      memcpy(inner_.Data(), from.inner_.Data(), attention_size_);
-      return *this;
-    }
-
-    PartialViewProxy &operator=(const std::string &from) {
-      memcpy(inner_.Data(), from.data(), attention_size_);
-      return *this;
-    }
-
-    const void *Data() const { return inner_.Data(); }
-    void *Data() { return inner_.Data(); }
-
-    friend void swap(PartialViewProxy first, PartialViewProxy second) {
-      std::swap_ranges(reinterpret_cast<char*>(first.Data()), 
reinterpret_cast<char*>(first.Data()) + first.attention_size_, 
reinterpret_cast<char*>(second.Data()));
-    }
-
-  private:
-    friend class util::ProxyIterator<PartialViewProxy>;
-
-    typedef std::string value_type;
-
-    const std::size_t attention_size_;
-
-    typedef util::SizedInnerIterator InnerIterator;
-    InnerIterator &Inner() { return inner_; }
-    const InnerIterator &Inner() const { return inner_; }
-    InnerIterator inner_;
-};
-
-typedef util::ProxyIterator<PartialViewProxy> PartialIter;
-
-FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string 
&temp_prefix) {
-  util::scoped_fd file(util::MakeTemp(temp_prefix));
-  util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - 
(uint8_t*)mem_begin);
-  return util::FDOpenOrThrow(file);
-}
-
-FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string 
&temp_prefix, std::size_t entry_size, unsigned char order) {
-  const size_t context_size = sizeof(WordIndex) * (order - 1);
-  // Sort just the contexts using the same memory.
-  PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), 
entry_size, context_size));
-  PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), 
entry_size, context_size));
-
-#if defined(_WIN32) || defined(_WIN64)
-  std::stable_sort
-#else
-  std::sort
-#endif
-    (context_begin, context_end, util::SizedCompare<EntryCompare, 
PartialViewProxy>(EntryCompare(order - 1)));
-
-  util::scoped_FILE out(util::FMakeTemp(temp_prefix));
-
-  // Write out to file and uniqueify at the same time.  Could have used 
unique_copy if there was an appropriate OutputIterator.
-  if (context_begin == context_end) return out.release();
-  PartialIter i(context_begin);
-  util::WriteOrThrow(out.get(), i->Data(), context_size);
-  const void *previous = i->Data();
-  ++i;
-  for (; i != context_end; ++i) {
-    if (memcmp(previous, i->Data(), context_size)) {
-      util::WriteOrThrow(out.get(), i->Data(), context_size);
-      previous = i->Data();
-    }
-  }
-  return out.release();
-}
-
-struct ThrowCombine {
-  void operator()(std::size_t entry_size, unsigned char order, const void 
*first, const void *second, FILE * /*out*/) const {
-    const WordIndex *base = reinterpret_cast<const WordIndex*>(first);
-    FormatLoadException e;
-    e << "Duplicate n-gram detected with vocab ids";
-    for (const WordIndex *i = base; i != base + order; ++i) {
-      e << ' ' << *i;
-    }
-    throw e;
-  }
-};
-
-// Useful for context files that just contain records with no value.
-struct FirstCombine {
-  void operator()(std::size_t entry_size, unsigned char /*order*/, const void 
*first, const void * /*second*/, FILE *out) const {
-    util::WriteOrThrow(out, first, entry_size);
-  }
-};
-
-template <class Combine> FILE *MergeSortedFiles(FILE *first_file, FILE 
*second_file, const std::string &temp_prefix, std::size_t weights_size, 
unsigned char order, const Combine &combine) {
-  std::size_t entry_size = sizeof(WordIndex) * order + weights_size;
-  RecordReader first, second;
-  first.Init(first_file, entry_size);
-  second.Init(second_file, entry_size);
-  util::scoped_FILE out_file(util::FMakeTemp(temp_prefix));
-  EntryCompare less(order);
-  while (first && second) {
-    if (less(first.Data(), second.Data())) {
-      util::WriteOrThrow(out_file.get(), first.Data(), entry_size);
-      ++first;
-    } else if (less(second.Data(), first.Data())) {
-      util::WriteOrThrow(out_file.get(), second.Data(), entry_size);
-      ++second;
-    } else {
-      combine(entry_size, order, first.Data(), second.Data(), out_file.get());
-      ++first; ++second;
-    }
-  }
-  for (RecordReader &remains = (first ? first : second); remains; ++remains) {
-    util::WriteOrThrow(out_file.get(), remains.Data(), entry_size);
-  }
-  return out_file.release();
-}
-
-} // namespace
-
-void RecordReader::Init(FILE *file, std::size_t entry_size) {
-  entry_size_ = entry_size;
-  data_.reset(malloc(entry_size));
-  UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read 
buffer");
-  file_ = file;
-  if (file) {
-    rewind(file);
-    remains_ = true;
-    ++*this;
-  } else {
-    remains_ = false;
-  }
-}
-
-void RecordReader::Overwrite(const void *start, std::size_t amount) {
-  long internal = (uint8_t*)start - (uint8_t*)data_.get();
-  UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), 
util::ErrnoException, "Couldn't seek backwards for revision");
-  util::WriteOrThrow(file_, start, amount);
-  long forward = entry_size_ - internal - amount;
-#if !defined(_WIN32) && !defined(_WIN64)
-  if (forward)
-#endif
-    UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, 
"Couldn't seek forwards past revision");
-}
-
-void RecordReader::Rewind() {
-  if (file_) {
-    rewind(file_);
-    remains_ = true;
-    ++*this;
-  } else {
-    remains_ = false;
-  }
-}
-
-SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, 
std::vector<uint64_t> &counts, size_t buffer, const std::string &file_prefix, 
SortedVocabulary &vocab) {
-  PositiveProbWarn warn(config.positive_log_probability);
-  unigram_.reset(util::MakeTemp(file_prefix));
-  {
-    // In case <unk> appears.
-    size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff);
-    util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), 
size_out), size_out);
-    Read1Grams(f, counts[0], vocab, 
reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn);
-    CheckSpecials(config, vocab);
-    if (!vocab.SawUnk()) ++counts[0];
-  }
-
-  // Only use as much buffer as we need.
-  size_t buffer_use = 0;
-  for (unsigned int order = 2; order < counts.size(); ++order) {
-    buffer_use = std::max<size_t>(buffer_use, 
static_cast<size_t>((sizeof(WordIndex) * order + 2 * sizeof(float)) * 
counts[order - 1]));
-  }
-  buffer_use = std::max<size_t>(buffer_use, 
static_cast<size_t>((sizeof(WordIndex) * counts.size() + sizeof(float)) * 
counts.back()));
-  buffer = std::min<size_t>(buffer, buffer_use);
-
-  util::scoped_malloc mem;
-  mem.reset(malloc(buffer));
-  if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort 
buffer size " << buffer);
-
-  for (unsigned char order = 2; order <= counts.size(); ++order) {
-    ConvertToSorted(f, vocab, counts, file_prefix, order, warn, mem.get(), 
buffer);
-  }
-  ReadEnd(f);
-}
-
-namespace {
-class Closer {
-  public:
-    explicit Closer(std::deque<FILE*> &files) : files_(files) {}
-
-    ~Closer() {
-      for (std::deque<FILE*>::iterator i = files_.begin(); i != files_.end(); 
++i) {
-        util::scoped_FILE deleter(*i);
-      }
-    }
-
-    void PopFront() {
-      util::scoped_FILE deleter(files_.front());
-      files_.pop_front();
-    }
-  private:
-    std::deque<FILE*> &files_;
-};
-} // namespace
-
-void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary 
&vocab, const std::vector<uint64_t> &counts, const std::string &file_prefix, 
unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
-  ReadNGramHeader(f, order);
-  const size_t count = counts[order - 1];
-  // Size of weights.  Does it include backoff?
-  const size_t words_size = sizeof(WordIndex) * order;
-  const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : 
sizeof(float));
-  const size_t entry_size = words_size + weights_size;
-  const size_t batch_size = std::min(count, mem_size / entry_size);
-  uint8_t *const begin = reinterpret_cast<uint8_t*>(mem);
-
-  std::deque<FILE*> files, contexts;
-  Closer files_closer(files), contexts_closer(contexts);
-
-  for (std::size_t batch = 0, done = 0; done < count; ++batch) {
-    uint8_t *out = begin;
-    uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size;
-    if (order == counts.size()) {
-      for (; out != out_end; out += entry_size) {
-        std::reverse_iterator<WordIndex*> it(reinterpret_cast<WordIndex*>(out) 
+ order);
-        ReadNGram(f, order, vocab, it, *reinterpret_cast<Prob*>(out + 
words_size), warn);
-      }
-    } else {
-      for (; out != out_end; out += entry_size) {
-        std::reverse_iterator<WordIndex*> it(reinterpret_cast<WordIndex*>(out) 
+ order);
-        ReadNGram(f, order, vocab, it, *reinterpret_cast<ProbBackoff*>(out + 
words_size), warn);
-      }
-    }
-    // Sort full records by full n-gram.
-    util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, 
entry_size);
-    // parallel_sort uses too much RAM.  TODO: figure out why windows sort 
doesn't like my proxies.
-#if defined(_WIN32) || defined(_WIN64)
-    std::stable_sort
-#else
-    std::sort
-#endif
-        (NGramIter(proxy_begin), NGramIter(proxy_end), 
util::SizedCompare<EntryCompare>(EntryCompare(order)));
-    files.push_back(DiskFlush(begin, out_end, file_prefix));
-    contexts.push_back(WriteContextFile(begin, out_end, file_prefix, 
entry_size, order));
-
-    done += (out_end - begin) / entry_size;
-  }
-
-  // All individual files created.  Merge them.
-
-  while (files.size() > 1) {
-    files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, 
weights_size, order, ThrowCombine()));
-    files_closer.PopFront();
-    files_closer.PopFront();
-    contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 
0, order - 1, FirstCombine()));
-    contexts_closer.PopFront();
-    contexts_closer.PopFront();
-  }
-
-  if (!files.empty()) {
-    // Steal from closers.
-    full_[order - 2].reset(files.front());
-    files.pop_front();
-    context_[order - 2].reset(contexts.front());
-    contexts.pop_front();
-  }
-}
-
-} // namespace trie
-} // namespace ngram
-} // namespace lm

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/trie_sort.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/trie_sort.hh b/ext/kenlm/lm/trie_sort.hh
deleted file mode 100644
index 594efee..0000000
--- a/ext/kenlm/lm/trie_sort.hh
+++ /dev/null
@@ -1,114 +0,0 @@
-// Step of trie builder: create sorted files.
-
-#ifndef LM_TRIE_SORT_H
-#define LM_TRIE_SORT_H
-
-#include "lm/max_order.hh"
-#include "lm/word_index.hh"
-
-#include "util/file.hh"
-#include "util/scoped.hh"
-
-#include <cstddef>
-#include <functional>
-#include <string>
-#include <vector>
-
-#include <stdint.h>
-
-namespace util {
-class FilePiece;
-} // namespace util
-
-namespace lm {
-class PositiveProbWarn;
-namespace ngram {
-class SortedVocabulary;
-struct Config;
-
-namespace trie {
-
-class EntryCompare : public std::binary_function<const void*, const void*, 
bool> {
-  public:
-    explicit EntryCompare(unsigned char order) : order_(order) {}
-
-    bool operator()(const void *first_void, const void *second_void) const {
-      const WordIndex *first = static_cast<const WordIndex*>(first_void);
-      const WordIndex *second = static_cast<const WordIndex*>(second_void);
-      const WordIndex *end = first + order_;
-      for (; first != end; ++first, ++second) {
-        if (*first < *second) return true;
-        if (*first > *second) return false;
-      }
-      return false;
-    }
-  private:
-    unsigned char order_;
-};
-
-class RecordReader {
-  public:
-    RecordReader() : remains_(true) {}
-
-    void Init(FILE *file, std::size_t entry_size);
-
-    void *Data() { return data_.get(); }
-    const void *Data() const { return data_.get(); }
-
-    RecordReader &operator++() {
-      std::size_t ret = fread(data_.get(), entry_size_, 1, file_);
-      if (!ret) {
-        UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading 
temporary file");
-        remains_ = false;
-      }
-      return *this;
-    }
-
-    operator bool() const { return remains_; }
-
-    void Rewind();
-
-    std::size_t EntrySize() const { return entry_size_; }
-
-    void Overwrite(const void *start, std::size_t amount);
-
-  private:
-    FILE *file_;
-
-    util::scoped_malloc data_;
-
-    bool remains_;
-
-    std::size_t entry_size_;
-};
-
-class SortedFiles {
-  public:
-    // Build from ARPA
-    SortedFiles(const Config &config, util::FilePiece &f, 
std::vector<uint64_t> &counts, std::size_t buffer, const std::string 
&file_prefix, SortedVocabulary &vocab);
-
-    int StealUnigram() {
-      return unigram_.release();
-    }
-
-    FILE *Full(unsigned char order) {
-      return full_[order - 2].get();
-    }
-
-    FILE *Context(unsigned char of_order) {
-      return context_[of_order - 2].get();
-    }
-
-  private:
-    void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, 
const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char 
order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
-
-    util::scoped_fd unigram_;
-
-    util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 
1];
-};
-
-} // namespace trie
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_TRIE_SORT_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/value.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/value.hh b/ext/kenlm/lm/value.hh
deleted file mode 100644
index d2425cc..0000000
--- a/ext/kenlm/lm/value.hh
+++ /dev/null
@@ -1,158 +0,0 @@
-#ifndef LM_VALUE_H
-#define LM_VALUE_H
-
-#include "lm/config.hh"
-#include "lm/model_type.hh"
-#include "lm/value_build.hh"
-#include "lm/weights.hh"
-#include "util/bit_packing.hh"
-
-#include <stdint.h>
-
-namespace lm {
-namespace ngram {
-
-// Template proxy for probing unigrams and middle.
-template <class Weights> class GenericProbingProxy {
-  public:
-    explicit GenericProbingProxy(const Weights &to) : to_(&to) {}
-
-    GenericProbingProxy() : to_(0) {}
-
-    bool Found() const { return to_ != 0; }
-
-    float Prob() const {
-      util::FloatEnc enc;
-      enc.f = to_->prob;
-      enc.i |= util::kSignBit;
-      return enc.f;
-    }
-
-    float Backoff() const { return to_->backoff; }
-
-    bool IndependentLeft() const {
-      util::FloatEnc enc;
-      enc.f = to_->prob;
-      return enc.i & util::kSignBit;
-    }
-
-  protected:
-    const Weights *to_;
-};
-
-// Basic proxy for trie unigrams.
-template <class Weights> class GenericTrieUnigramProxy {
-  public:
-    explicit GenericTrieUnigramProxy(const Weights &to) : to_(&to) {}
-
-    GenericTrieUnigramProxy() : to_(0) {}
-
-    bool Found() const { return to_ != 0; }
-    float Prob() const { return to_->prob; }
-    float Backoff() const { return to_->backoff; }
-    float Rest() const { return Prob(); }
-
-  protected:
-    const Weights *to_;
-};
-
-struct BackoffValue {
-  typedef ProbBackoff Weights;
-  static const ModelType kProbingModelType = PROBING;
-
-  class ProbingProxy : public GenericProbingProxy<Weights> {
-    public:
-      explicit ProbingProxy(const Weights &to) : 
GenericProbingProxy<Weights>(to) {}
-      ProbingProxy() {}
-      float Rest() const { return Prob(); }
-  };
-
-  class TrieUnigramProxy : public GenericTrieUnigramProxy<Weights> {
-    public:
-      explicit TrieUnigramProxy(const Weights &to) : 
GenericTrieUnigramProxy<Weights>(to) {}
-      TrieUnigramProxy() {}
-      float Rest() const { return Prob(); }
-  };
-
-  struct ProbingEntry {
-    typedef uint64_t Key;
-    typedef Weights Value;
-    uint64_t key;
-    ProbBackoff value;
-    uint64_t GetKey() const { return key; }
-  };
-
-  struct TrieUnigramValue {
-    Weights weights;
-    uint64_t next;
-    uint64_t Next() const { return next; }
-  };
-
-  const static bool kDifferentRest = false;
-
-  template <class Model, class C> void Callback(const Config &, unsigned int, 
typename Model::Vocabulary &, C &callback) {
-    NoRestBuild build;
-    callback(build);
-  }
-};
-
-struct RestValue {
-  typedef RestWeights Weights;
-  static const ModelType kProbingModelType = REST_PROBING;
-
-  class ProbingProxy : public GenericProbingProxy<RestWeights> {
-    public:
-      explicit ProbingProxy(const Weights &to) : 
GenericProbingProxy<RestWeights>(to) {}
-      ProbingProxy() {}
-      float Rest() const { return to_->rest; }
-  };
-
-  class TrieUnigramProxy : public GenericTrieUnigramProxy<Weights> {
-    public:
-      explicit TrieUnigramProxy(const Weights &to) : 
GenericTrieUnigramProxy<Weights>(to) {}
-      TrieUnigramProxy() {}
-      float Rest() const { return to_->rest; }
-  };
-
-// gcc 4.1 doesn't properly back dependent types :-(.
-#pragma pack(push)
-#pragma pack(4)
-  struct ProbingEntry {
-    typedef uint64_t Key;
-    typedef Weights Value;
-    Key key;
-    Value value;
-    Key GetKey() const { return key; }
-  };
-
-  struct TrieUnigramValue {
-    Weights weights;
-    uint64_t next;
-    uint64_t Next() const { return next; }
-  };
-#pragma pack(pop)
-
-  const static bool kDifferentRest = true;
-
-  template <class Model, class C> void Callback(const Config &config, unsigned 
int order, typename Model::Vocabulary &vocab, C &callback) {
-    switch (config.rest_function) {
-      case Config::REST_MAX:
-        {
-          MaxRestBuild build;
-          callback(build);
-        }
-        break;
-      case Config::REST_LOWER:
-        {
-          LowerRestBuild<Model> build(config, order, vocab);
-          callback(build);
-        }
-        break;
-    }
-  }
-};
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_VALUE_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/value_build.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/value_build.cc b/ext/kenlm/lm/value_build.cc
deleted file mode 100644
index ac623a6..0000000
--- a/ext/kenlm/lm/value_build.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-#include "lm/value_build.hh"
-
-#include "lm/model.hh"
-#include "lm/read_arpa.hh"
-
-namespace lm {
-namespace ngram {
-
-template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config 
&config, unsigned int order, const typename Model::Vocabulary &vocab) {
-  UTIL_THROW_IF(config.rest_lower_files.size() != order - 1, ConfigException, 
"This model has order " << order << " so there should be " << (order - 1) << " 
lower-order models for rest cost purposes.");
-  Config for_lower = config;
-  for_lower.write_mmap = NULL;
-  for_lower.rest_lower_files.clear();
-
-  // Unigram models aren't supported, so this is a custom loader.
-  // TODO: optimize the unigram loading?
-  {
-    util::FilePiece uni(config.rest_lower_files[0].c_str());
-    std::vector<uint64_t> number;
-    ReadARPACounts(uni, number);
-    UTIL_THROW_IF(number.size() != 1, FormatLoadException, "Expected the 
unigram model to have order 1, not " << number.size());
-    ReadNGramHeader(uni, 1);
-    unigrams_.resize(number[0]);
-    unigrams_[0] = config.unknown_missing_logprob;
-    PositiveProbWarn warn;
-    for (uint64_t i = 0; i < number[0]; ++i) {
-      WordIndex w;
-      Prob entry;
-      ReadNGram(uni, 1, vocab, &w, entry, warn);
-      unigrams_[w] = entry.prob;
-    }
-  }
-
-  try {
-    for (unsigned int i = 2; i < order; ++i) {
-      models_.push_back(new Model(config.rest_lower_files[i - 1].c_str(), 
for_lower));
-      UTIL_THROW_IF(models_.back()->Order() != i, FormatLoadException, "Lower 
order file " << config.rest_lower_files[i-1] << " should have order " << i);
-    }
-  } catch (...) {
-    for (typename std::vector<const Model*>::const_iterator i = 
models_.begin(); i != models_.end(); ++i) {
-      delete *i;
-    }
-    models_.clear();
-    throw;
-  }
-
-  // TODO: force/check same vocab.
-}
-
-template <class Model> LowerRestBuild<Model>::~LowerRestBuild() {
-  for (typename std::vector<const Model*>::const_iterator i = models_.begin(); 
i != models_.end(); ++i) {
-    delete *i;
-  }
-}
-
-template class LowerRestBuild<ProbingModel>;
-
-} // namespace ngram
-} // namespace lm

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/value_build.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/value_build.hh b/ext/kenlm/lm/value_build.hh
deleted file mode 100644
index 49989ab..0000000
--- a/ext/kenlm/lm/value_build.hh
+++ /dev/null
@@ -1,97 +0,0 @@
-#ifndef LM_VALUE_BUILD_H
-#define LM_VALUE_BUILD_H
-
-#include "lm/weights.hh"
-#include "lm/word_index.hh"
-#include "util/bit_packing.hh"
-
-#include <vector>
-
-namespace lm {
-namespace ngram {
-
-struct Config;
-struct BackoffValue;
-struct RestValue;
-
-class NoRestBuild {
-  public:
-    typedef BackoffValue Value;
-
-    NoRestBuild() {}
-
-    void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const 
{}
-    void SetRest(const WordIndex *, unsigned int, const ProbBackoff &) const {}
-
-    template <class Second> bool MarkExtends(ProbBackoff &weights, const 
Second &) const {
-      util::UnsetSign(weights.prob);
-      return false;
-    }
-
-    // Probing doesn't need to go back to unigram.
-    const static bool kMarkEvenLower = false;
-};
-
-class MaxRestBuild {
-  public:
-    typedef RestValue Value;
-
-    MaxRestBuild() {}
-
-    void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const 
{}
-    void SetRest(const WordIndex *, unsigned int, RestWeights &weights) const {
-      weights.rest = weights.prob;
-      util::SetSign(weights.rest);
-    }
-
-    bool MarkExtends(RestWeights &weights, const RestWeights &to) const {
-      util::UnsetSign(weights.prob);
-      if (weights.rest >= to.rest) return false;
-      weights.rest = to.rest;
-      return true;
-    }
-    bool MarkExtends(RestWeights &weights, const Prob &to) const {
-      util::UnsetSign(weights.prob);
-      if (weights.rest >= to.prob) return false;
-      weights.rest = to.prob;
-      return true;
-    }
-
-    // Probing does need to go back to unigram.
-    const static bool kMarkEvenLower = true;
-};
-
-template <class Model> class LowerRestBuild {
-  public:
-    typedef RestValue Value;
-
-    LowerRestBuild(const Config &config, unsigned int order, const typename 
Model::Vocabulary &vocab);
-
-    ~LowerRestBuild();
-
-    void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const 
{}
-    void SetRest(const WordIndex *vocab_ids, unsigned int n, RestWeights 
&weights) const {
-      typename Model::State ignored;
-      if (n == 1) {
-        weights.rest = unigrams_[*vocab_ids];
-      } else {
-        weights.rest = models_[n-2]->FullScoreForgotState(vocab_ids + 1, 
vocab_ids + n, *vocab_ids, ignored).prob;
-      }
-    }
-
-    template <class Second> bool MarkExtends(RestWeights &weights, const 
Second &) const {
-      util::UnsetSign(weights.prob);
-      return false;
-    }
-
-    const static bool kMarkEvenLower = false;
-
-    std::vector<float> unigrams_;
-
-    std::vector<const Model*> models_;
-};
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_VALUE_BUILD_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/virtual_interface.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/virtual_interface.cc 
b/ext/kenlm/lm/virtual_interface.cc
deleted file mode 100644
index 17a74c3..0000000
--- a/ext/kenlm/lm/virtual_interface.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "lm/virtual_interface.hh"
-
-#include "lm/lm_exception.hh"
-
-namespace lm {
-namespace base {
-
-Vocabulary::~Vocabulary() {}
-
-void Vocabulary::SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, 
WordIndex not_found) {
-  begin_sentence_ = begin_sentence;
-  end_sentence_ = end_sentence;
-  not_found_ = not_found;
-}
-
-Model::~Model() {}
-
-} // namespace base
-} // namespace lm

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/virtual_interface.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/virtual_interface.hh 
b/ext/kenlm/lm/virtual_interface.hh
deleted file mode 100644
index ea491fb..0000000
--- a/ext/kenlm/lm/virtual_interface.hh
+++ /dev/null
@@ -1,160 +0,0 @@
-#ifndef LM_VIRTUAL_INTERFACE_H
-#define LM_VIRTUAL_INTERFACE_H
-
-#include "lm/return.hh"
-#include "lm/word_index.hh"
-#include "util/string_piece.hh"
-
-#include <string>
-#include <cstring>
-
-namespace lm {
-namespace base {
-
-template <class T, class U, class V> class ModelFacade;
-
-/* Vocabulary interface.  Call Index(string) and get a word index for use in
- * calling Model.  It provides faster convenience functions for <s>, </s>, and
- * <unk> although you can also find these using Index.
- *
- * Some models do not load the mapping from index to string.  If you need this,
- * check if the model Vocabulary class implements such a function and access it
- * directly.
- *
- * The Vocabulary object is always owned by the Model and can be retrieved from
- * the Model using BaseVocabulary() for this abstract interface or
- * GetVocabulary() for the actual implementation (in which case you'll need the
- * actual implementation of the Model too).
- */
-class Vocabulary {
-  public:
-    virtual ~Vocabulary();
-
-    WordIndex BeginSentence() const { return begin_sentence_; }
-    WordIndex EndSentence() const { return end_sentence_; }
-    WordIndex NotFound() const { return not_found_; }
-
-    /* Most implementations allow StringPiece lookups and need only override
-     * Index(StringPiece).  SRI requires null termination and overrides all
-     * three methods.
-     */
-    virtual WordIndex Index(const StringPiece &str) const = 0;
-    virtual WordIndex Index(const std::string &str) const {
-      return Index(StringPiece(str));
-    }
-    virtual WordIndex Index(const char *str) const {
-      return Index(StringPiece(str));
-    }
-
-  protected:
-    // Call SetSpecial afterward.
-    Vocabulary() {}
-
-    Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex 
not_found) {
-      SetSpecial(begin_sentence, end_sentence, not_found);
-    }
-
-    void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, 
WordIndex not_found);
-
-    WordIndex begin_sentence_, end_sentence_, not_found_;
-
-  private:
-    // Disable copy constructors.  They're private and undefined.
-    // Ersatz boost::noncopyable.
-    Vocabulary(const Vocabulary &);
-    Vocabulary &operator=(const Vocabulary &);
-};
-
-/* There are two ways to access a Model.
- *
- *
- * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh).
- *
- * Every Model implements the scoring function:
- * float Score(
- *   const Model::State &in_state,
- *   const WordIndex new_word,
- *   Model::State &out_state) const;
- *
- * It can also return the length of n-gram matched by the model:
- * FullScoreReturn FullScore(
- *   const Model::State &in_state,
- *   const WordIndex new_word,
- *   Model::State &out_state) const;
- *
- *
- * There are also accessor functions:
- * const State &BeginSentenceState() const;
- * const State &NullContextState() const;
- * const Vocabulary &GetVocabulary() const;
- * unsigned int Order() const;
- *
- * NB: In case you're wondering why the model implementation looks like it's
- * missing these methods, see facade.hh.
- *
- * This is the fastest way to use a model and presents a normal State class to
- * be included in a hypothesis state structure.
- *
- *
- * OPTION 2: Use the virtual interface below.
- *
- * The virtual interface allow you to decide which Model to use at runtime
- * without templatizing everything on the Model type.  However, each Model has
- * its own State class, so a single State cannot be efficiently provided (it
- * would require using the maximum memory of any Model's State or memory
- * allocation with each lookup).  This means you become responsible for
- * allocating memory with size StateSize() and passing it to the Score or
- * FullScore functions provided here.
- *
- * For example, cdec has a std::string containing the entire state of a
- * hypothesis.  It can reserve StateSize bytes in this string for the model
- * state.
- *
- * All the State objects are POD, so it's ok to use raw memory for storing
- * State.
- * in_state and out_state must not have the same address.
- */
-class Model {
-  public:
-    virtual ~Model();
-
-    size_t StateSize() const { return state_size_; }
-    const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
-    void BeginSentenceWrite(void *to) const { memcpy(to, 
begin_sentence_memory_, StateSize()); }
-    const void *NullContextMemory() const { return null_context_memory_; }
-    void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, 
StateSize()); }
-
-    // Requires in_state != out_state
-    virtual float BaseScore(const void *in_state, const WordIndex new_word, 
void *out_state) const = 0;
-
-    // Requires in_state != out_state
-    virtual FullScoreReturn BaseFullScore(const void *in_state, const 
WordIndex new_word, void *out_state) const = 0;
-
-    // Prefer to use FullScore.  The context words should be provided in 
reverse order.
-    virtual FullScoreReturn BaseFullScoreForgotState(const WordIndex 
*context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void 
*out_state) const = 0;
-
-    unsigned char Order() const { return order_; }
-
-    const Vocabulary &BaseVocabulary() const { return *base_vocab_; }
-
-  private:
-    template <class T, class U, class V> friend class ModelFacade;
-    explicit Model(size_t state_size) : state_size_(state_size) {}
-
-    const size_t state_size_;
-    const void *begin_sentence_memory_, *null_context_memory_;
-
-    const Vocabulary *base_vocab_;
-
-    unsigned char order_;
-
-    // Disable copy constructors.  They're private and undefined.
-    // Ersatz boost::noncopyable.
-    Model(const Model &);
-    Model &operator=(const Model &);
-};
-
-} // mamespace base
-} // namespace lm
-
-#endif // LM_VIRTUAL_INTERFACE_H

[08/51] [partial] incubator-joshua git commit: Converted KenLM into a submodule

Reply via email to