http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/CMakeLists.txt b/ext/kenlm/lm/interpolate/CMakeLists.txt deleted file mode 100644 index c146ab6..0000000 --- a/ext/kenlm/lm/interpolate/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -find_package(Eigen3 REQUIRED) -include_directories(${EIGEN3_INCLUDE_DIR}) - -set(KENLM_INTERPOLATE_SOURCE - backoff_reunification.cc - bounded_sequence_encoding.cc - enumerate_global_vocab.cc - merge_probabilities.cc - merge_vocab.cc - normalize.cc - pipeline.cc - split_worker.cc - tune_derivatives.cc - tune_instance.cc - universal_vocab.cc) - -add_library(kenlm_interpolate OBJECT ${KENLM_INTERPOLATE_SOURCE}) - -set(KENLM_INTERPOLATE_EXES - interpolate - perf_enum_gv - streaming_example - train_params - tune) - -set(KENLM_INTERPOLATE_DEPENDS - $<TARGET_OBJECTS:kenlm> - $<TARGET_OBJECTS:kenlm_util> - $<TARGET_OBJECTS:kenlm_common> - $<TARGET_OBJECTS:kenlm_interpolate>) - -AddExes(EXES ${KENLM_INTERPOLATE_EXES} - DEPENDS ${KENLM_INTERPOLATE_DEPENDS} - LIBRARIES ${Boost_LIBRARIES} pthread) - -if(BUILD_TESTING) - set(KENLM_INTERPOLATE_TESTS - backoff_reunification_test - bounded_sequence_encoding_test - merge_vocab_test - normalize_test - tune_derivatives_test) - - AddTests(TESTS ${KENLM_INTERPOLATE_TESTS} - DEPENDS ${KENLM_INTERPOLATE_DEPENDS} - LIBRARIES ${Boost_LIBRARIES} pthread) - - # tune_instance_test needs an extra command line parameter - KenLMAddTest(TEST tune_instance_test - DEPENDS ${KENLM_INTERPOLATE_DEPENDS} - LIBRARIES ${Boost_LIBRARIES} pthread - TEST_ARGS - ${CMAKE_CURRENT_SOURCE_DIR}/tune_instance_data/toy0.1) -endif()
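For orientation while reading the deleted interpolate sources below: this module implements offline log-linear interpolation, in which each component model contributes its log10 probability scaled by a per-model weight (lambda) and the weighted terms are summed before a later normalization pass. The short C++ sketch that follows illustrates only that combination step, under that assumption about the log10 convention; CombineLog10, the toy probabilities, and the weights are illustrative and are not part of the KenLM API.

#include <cstddef>
#include <iostream>
#include <vector>

// Weighted log-linear combination of one n-gram's probabilities:
// log10 p_I(w|h) (unnormalized) = sum_k lambda_k * log10 p_k(w|h),
// i.e. p_I(w|h) is proportional to prod_k p_k(w|h)^{lambda_k}.
double CombineLog10(const std::vector<double> &log10_probs,
                    const std::vector<double> &lambdas) {
  double sum = 0.0;
  for (std::size_t k = 0; k < log10_probs.size(); ++k)
    sum += lambdas[k] * log10_probs[k];
  return sum;  // still unnormalized; a later pass divides by Z(h)
}

int main() {
  // Two toy models scoring the same n-gram (log10 probabilities).
  std::vector<double> log10_probs;
  log10_probs.push_back(-1.2);
  log10_probs.push_back(-0.8);
  std::vector<double> lambdas;
  lambdas.push_back(0.3);
  lambdas.push_back(0.7);
  std::cout << CombineLog10(log10_probs, lambdas) << std::endl;  // prints -0.92
}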
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/Jamfile ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/Jamfile b/ext/kenlm/lm/interpolate/Jamfile deleted file mode 100644 index 411a346..0000000 --- a/ext/kenlm/lm/interpolate/Jamfile +++ /dev/null @@ -1,22 +0,0 @@ -fakelib interp : ../common//common [ glob *.cc : *_main.cc *_test.cc tune_*.cc ] : <cxxflags>-fopenmp ; - -import testing ; - -local with-eigen = [ option.get "with-eigen" ] ; -if $(with-eigen) { - fakelib tuning : tune_instance.cc tune_derivatives.cc interp ..//kenlm : <include>$(with-eigen) ; - unit-test tune_derivatives_test : tune_derivatives_test.cc tuning /top//boost_unit_test_framework : <include>$(with-eigen) ; - - obj tune_instance_test.o : tune_instance_test.cc /top//boost_unit_test_framework : <include>$(with-eigen) ; - run tune_instance_test.o tuning /top//boost_unit_test_framework : : tune_instance_data/toy0.1 ; - - exe tune : tune_main.cc tuning /top//boost_program_options : <include>$(with-eigen) ; -} - -exe interpolate : interpolate_main.cc interp /top//boost_program_options ; -exe streaming_example : ../builder//builder interp streaming_example_main.cc /top//boost_program_options ; - -unit-test normalize_test : interp normalize_test.cc /top//boost_unit_test_framework ; -unit-test backoff_reunification_test : interp backoff_reunification_test.cc /top//boost_unit_test_framework ; -unit-test bounded_sequence_encoding_test : interp bounded_sequence_encoding_test.cc /top//boost_unit_test_framework ; -run merge_vocab_test.cc interp /top//boost_unit_test_framework : : merge_test/test1 merge_test/test2 merge_test/test3 merge_test/test_bad_order merge_test/test_no_unk ; http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_matrix.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/backoff_matrix.hh b/ext/kenlm/lm/interpolate/backoff_matrix.hh deleted file mode 100644 index c7552df..0000000 --- a/ext/kenlm/lm/interpolate/backoff_matrix.hh +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef LM_INTERPOLATE_BACKOFF_MATRIX_H -#define LM_INTERPOLATE_BACKOFF_MATRIX_H - -#include <cstddef> -#include <vector> - -namespace lm { namespace interpolate { - -class BackoffMatrix { - public: - BackoffMatrix(std::size_t num_models, std::size_t max_order) - : max_order_(max_order), backing_(num_models * max_order) {} - - float &Backoff(std::size_t model, std::size_t order_minus_1) { - return backing_[model * max_order_ + order_minus_1]; - } - - float Backoff(std::size_t model, std::size_t order_minus_1) const { - return backing_[model * max_order_ + order_minus_1]; - } - - private: - const std::size_t max_order_; - std::vector<float> backing_; -}; - -}} // namespaces - -#endif // LM_INTERPOLATE_BACKOFF_MATRIX_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c 
--- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/backoff_reunification.cc b/ext/kenlm/lm/interpolate/backoff_reunification.cc deleted file mode 100644 index edf6ddc..0000000 --- a/ext/kenlm/lm/interpolate/backoff_reunification.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "lm/interpolate/backoff_reunification.hh" -#include "lm/common/model_buffer.hh" -#include "lm/common/ngram_stream.hh" -#include "lm/common/ngram.hh" -#include "lm/common/compare.hh" - -#include <cassert> - -namespace lm { -namespace interpolate { - -namespace { -class MergeWorker { -public: - MergeWorker(std::size_t order, const util::stream::ChainPosition &prob_pos, - const util::stream::ChainPosition &boff_pos) - : order_(order), prob_pos_(prob_pos), boff_pos_(boff_pos) { - // nothing - } - - void Run(const util::stream::ChainPosition &position) { - lm::NGramStream<ProbBackoff> stream(position); - - lm::NGramStream<float> prob_input(prob_pos_); - util::stream::Stream boff_input(boff_pos_); - for (; prob_input && boff_input; ++prob_input, ++boff_input, ++stream) { - std::copy(prob_input->begin(), prob_input->end(), stream->begin()); - stream->Value().prob = prob_input->Value(); - stream->Value().backoff = *reinterpret_cast<float *>(boff_input.Get()); - } - UTIL_THROW_IF2(prob_input || boff_input, - "Streams were not the same size during merging"); - stream.Poison(); - } - -private: - std::size_t order_; - util::stream::ChainPosition prob_pos_; - util::stream::ChainPosition boff_pos_; -}; -} - -// Since we are *adding* something to the output chain here, we pass in the -// chain itself so that we can safely add a new step to the chain without -// creating a deadlock situation (since creating a new ChainPosition will -// make a new input/output pair---we want that position to be created -// *here*, not before). -void ReunifyBackoff(util::stream::ChainPositions &prob_pos, - util::stream::ChainPositions &boff_pos, - util::stream::Chains &output_chains) { - assert(prob_pos.size() == boff_pos.size()); - - for (size_t i = 0; i < prob_pos.size(); ++i) - output_chains[i] >> MergeWorker(i + 1, prob_pos[i], boff_pos[i]); -} -} -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/backoff_reunification.hh b/ext/kenlm/lm/interpolate/backoff_reunification.hh deleted file mode 100644 index 327db65..0000000 --- a/ext/kenlm/lm/interpolate/backoff_reunification.hh +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_ -#define KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_ - -#include "util/stream/stream.hh" -#include "util/stream/multi_stream.hh" - -namespace lm { -namespace interpolate { - -/** - * The third pass for the offline log-linear interpolation algorithm. This - * reads **suffix-ordered** probability values (ngram-id, float) and - * **suffix-ordered** backoff values (float) and writes the merged contents - * to the output. 
- * - * @param prob_pos The chain position for each order from which to read - * the probability values - * @param boff_pos The chain position for each order from which to read - * the backoff values - * @param output_chains The output chains for each order - */ -void ReunifyBackoff(util::stream::ChainPositions &prob_pos, - util::stream::ChainPositions &boff_pos, - util::stream::Chains &output_chains); -} -} -#endif http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/backoff_reunification_test.cc b/ext/kenlm/lm/interpolate/backoff_reunification_test.cc deleted file mode 100644 index d8faf94..0000000 --- a/ext/kenlm/lm/interpolate/backoff_reunification_test.cc +++ /dev/null @@ -1,159 +0,0 @@ -#include "lm/interpolate/backoff_reunification.hh" -#include "lm/common/ngram_stream.hh" - -#define BOOST_TEST_MODULE InterpolateBackoffReunificationTest -#include <boost/test/unit_test.hpp> - -namespace lm { -namespace interpolate { - -namespace { - -// none of this input actually makes sense, all we care about is making -// sure the merging works -template <uint8_t N> -struct Gram { - WordIndex ids[N]; - float prob; - float boff; -}; - -template <uint8_t N> -struct Grams { - const static Gram<N> grams[]; -}; - -template <> -const Gram<1> Grams<1>::grams[] - = {{{0}, 0.1f, 0.1f}, {{1}, 0.4f, 0.2f}, {{2}, 0.5f, 0.1f}}; - -template <> -const Gram<2> Grams<2>::grams[] = {{{0, 0}, 0.05f, 0.05f}, - {{1, 0}, 0.05f, 0.02f}, - {{1, 1}, 0.2f, 0.04f}, - {{2, 2}, 0.2f, 0.01f}}; - -template <> -const Gram<3> Grams<3>::grams[] = {{{0, 0, 0}, 0.001f, 0.005f}, - {{1, 0, 0}, 0.001f, 0.002f}, - {{2, 0, 0}, 0.001f, 0.003f}, - {{0, 1, 0}, 0.1f, 0.008f}, - {{1, 1, 0}, 0.1f, 0.09f}, - {{1, 1, 1}, 0.2f, 0.08f}}; - -template <uint8_t N> -class WriteInput { -public: - void Run(const util::stream::ChainPosition &position) { - lm::NGramStream<float> output(position); - - for (std::size_t i = 0; i < sizeof(Grams<N>::grams) / sizeof(Gram<N>); - ++i, ++output) { - std::copy(Grams<N>::grams[i].ids, Grams<N>::grams[i].ids + N, - output->begin()); - output->Value() = Grams<N>::grams[i].prob; - } - output.Poison(); - } -}; - -template <uint8_t N> -class WriteBackoffs { -public: - void Run(const util::stream::ChainPosition &position) { - util::stream::Stream output(position); - - for (std::size_t i = 0; i < sizeof(Grams<N>::grams) / sizeof(Gram<N>); - ++i, ++output) { - *reinterpret_cast<float *>(output.Get()) = Grams<N>::grams[i].boff; - } - output.Poison(); - } -}; - -template <uint8_t N> -class CheckOutput { -public: - void Run(const util::stream::ChainPosition &position) { - lm::NGramStream<ProbBackoff> stream(position); - - std::size_t i = 0; - for (; stream; ++stream, ++i) { - std::stringstream ss; - for (WordIndex *idx = stream->begin(); idx != stream->end(); ++idx) - ss << "(" << *idx << ")"; - - UTIL_THROW_IF2( - !std::equal(stream->begin(), stream->end(), Grams<N>::grams[i].ids), - "Mismatched id in CheckOutput<" << (int)N << ">: " << ss.str()); - - UTIL_THROW_IF2(stream->Value().prob != Grams<N>::grams[i].prob, - "Mismatched probability in CheckOutput<" - << (int)N << ">, got " << stream->Value().prob - << ", expected " << Grams<N>::grams[i].prob); - - 
UTIL_THROW_IF2(stream->Value().backoff != Grams<N>::grams[i].boff, - "Mismatched backoff in CheckOutput<" - << (int)N << ">, got " << stream->Value().backoff - << ", expected " << Grams<N>::grams[i].boff); - } - UTIL_THROW_IF2(i != sizeof(Grams<N>::grams) / sizeof(Gram<N>), - "Did not get correct number of " - << (int)N << "-grams: expected " - << sizeof(Grams<N>::grams) / sizeof(Gram<N>) - << ", got " << i); - } -}; -} - -BOOST_AUTO_TEST_CASE(BackoffReunificationTest) { - util::stream::ChainConfig config; - config.total_memory = 100; - config.block_count = 1; - - util::stream::Chains prob_chains(3); - config.entry_size = NGram<float>::TotalSize(1); - prob_chains.push_back(config); - prob_chains.back() >> WriteInput<1>(); - - config.entry_size = NGram<float>::TotalSize(2); - prob_chains.push_back(config); - prob_chains.back() >> WriteInput<2>(); - - config.entry_size = NGram<float>::TotalSize(3); - prob_chains.push_back(config); - prob_chains.back() >> WriteInput<3>(); - - util::stream::Chains boff_chains(3); - config.entry_size = sizeof(float); - boff_chains.push_back(config); - boff_chains.back() >> WriteBackoffs<1>(); - - boff_chains.push_back(config); - boff_chains.back() >> WriteBackoffs<2>(); - - boff_chains.push_back(config); - boff_chains.back() >> WriteBackoffs<3>(); - - util::stream::ChainPositions prob_pos(prob_chains); - util::stream::ChainPositions boff_pos(boff_chains); - - util::stream::Chains output_chains(3); - for (std::size_t i = 0; i < 3; ++i) { - config.entry_size = NGram<ProbBackoff>::TotalSize(i + 1); - output_chains.push_back(config); - } - - ReunifyBackoff(prob_pos, boff_pos, output_chains); - - output_chains[0] >> CheckOutput<1>(); - output_chains[1] >> CheckOutput<2>(); - output_chains[2] >> CheckOutput<3>(); - - prob_chains >> util::stream::kRecycle; - boff_chains >> util::stream::kRecycle; - - output_chains.Wait(); -} -} -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc deleted file mode 100644 index aca8ed7..0000000 --- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc +++ /dev/null @@ -1,36 +0,0 @@ -#include "lm/interpolate/bounded_sequence_encoding.hh" - -#include <algorithm> - -namespace lm { namespace interpolate { - -BoundedSequenceEncoding::BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end) - : entries_(bound_end - bound_begin) { - std::size_t full = 0; - Entry entry; - entry.shift = 0; - for (const unsigned char *i = bound_begin; i != bound_end; ++i) { - uint8_t length; - if (*i <= 1) { - length = 0; - } else { - length = sizeof(unsigned int) * 8 - __builtin_clz((unsigned int)*i); - } - entry.mask = (1ULL << length) - 1ULL; - if (entry.shift + length > 64) { - entry.shift = 0; - entry.next = true; - ++full; - } else { - entry.next = false; - } - entries_.push_back(entry); - entry.shift += length; - } - byte_length_ = full * sizeof(uint64_t) + (entry.shift + 7) / 8; - first_copy_ = std::min<std::size_t>(byte_length_, sizeof(uint64_t)); - // Size of last uint64_t. Zero if empty, otherwise [1,8] depending on mod. - overhang_ = byte_length_ == 0 ? 
0 : ((byte_length_ - 1) % 8 + 1); -} - -}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh deleted file mode 100644 index 84dd63a..0000000 --- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H -#define LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H - -/* Encodes fixed-length sequences of integers with known bounds on each entry. - * This is used to encode how far each model has backed off. - * TODO: make this class efficient. Bit-level packing or multiply by bound and - * add. - */ - -#include "util/exception.hh" -#include "util/fixed_array.hh" - -#if BYTE_ORDER != LITTLE_ENDIAN -#warning The interpolation code assumes little endian for now. -#endif - -#include <algorithm> -#include <cstring> - -namespace lm { -namespace interpolate { - -class BoundedSequenceEncoding { - public: - // Encode [0, bound_begin[0]) x [0, bound_begin[1]) x [0, bound_begin[2]) x ... x [0, *(bound_end - 1)) for entries in the sequence - BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end); - - std::size_t Entries() const { return entries_.size(); } - - std::size_t EncodedLength() const { return byte_length_; } - - void Encode(const unsigned char *from, void *to_void) const { - uint8_t *to = static_cast<uint8_t*>(to_void); - uint64_t cur = 0; - for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++from) { - if (UTIL_UNLIKELY(i->next)) { - std::memcpy(to, &cur, sizeof(uint64_t)); - to += sizeof(uint64_t); - cur = 0; - } - cur |= static_cast<uint64_t>(*from) << i->shift; - } - memcpy(to, &cur, overhang_); - } - - void Decode(const void *from_void, unsigned char *to) const { - const uint8_t *from = static_cast<const uint8_t*>(from_void); - uint64_t cur = 0; - memcpy(&cur, from, first_copy_); - for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++to) { - if (UTIL_UNLIKELY(i->next)) { - from += sizeof(uint64_t); - cur = 0; - std::memcpy(&cur, from, - std::min<std::size_t>(sizeof(uint64_t), static_cast<const uint8_t*>(from_void) + byte_length_ - from)); - } - *to = (cur >> i->shift) & i->mask; - } - } - - private: - struct Entry { - bool next; - uint8_t shift; - uint64_t mask; - }; - util::FixedArray<Entry> entries_; - std::size_t byte_length_; - std::size_t first_copy_; - std::size_t overhang_; -}; - - -}} // namespaces - -#endif // LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc b/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc deleted file mode 100644 index 2c4bbd9..0000000 --- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc +++ /dev/null @@ -1,86 
+0,0 @@ -#include "lm/interpolate/bounded_sequence_encoding.hh" - -#include "util/scoped.hh" - -#define BOOST_TEST_MODULE BoundedSequenceEncodingTest -#include <boost/test/unit_test.hpp> - -namespace lm { -namespace interpolate { -namespace { - -void ExhaustiveTest(unsigned char *bound_begin, unsigned char *bound_end) { - BoundedSequenceEncoding enc(bound_begin, bound_end); - util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength())); - std::vector<unsigned char> values(bound_end - bound_begin), - out(bound_end - bound_begin); - while (true) { - enc.Encode(&values[0], backing.get()); - enc.Decode(backing.get(), &out[0]); - for (std::size_t i = 0; i != values.size(); ++i) { - BOOST_CHECK_EQUAL(values[i], out[i]); - } - for (std::size_t i = 0;; ++i) { - if (i == values.size()) return; - ++values[i]; - if (values[i] < bound_begin[i]) break; - values[i] = 0; - } - } -} - -void CheckEncodeDecode(unsigned char *bounds, unsigned char *input, - unsigned char *output, std::size_t len) { - BoundedSequenceEncoding encoder(bounds, bounds + len); - util::scoped_malloc backing(util::MallocOrThrow(encoder.EncodedLength())); - - encoder.Encode(input, backing.get()); - encoder.Decode(backing.get(), output); - - for (std::size_t i = 0; i < len; ++i) { - BOOST_CHECK_EQUAL(input[i], output[i]); - } -} - -BOOST_AUTO_TEST_CASE(Exhaustive) { - unsigned char bounds[] = {5, 2, 3, 9, 7, 20, 8}; - ExhaustiveTest(bounds, bounds + sizeof(bounds) / sizeof(unsigned char)); -} - -BOOST_AUTO_TEST_CASE(LessThan64) { - unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 3}; - unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 2}; - unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0}; - - std::size_t len = sizeof(bounds) / sizeof(unsigned char); - assert(sizeof(input) / sizeof(unsigned char) == len); - assert(sizeof(output) / sizeof(unsigned char) == len); - - CheckEncodeDecode(bounds, input, output, len); -} - -BOOST_AUTO_TEST_CASE(Exactly64) { - unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255}; - unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16}; - unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0}; - - std::size_t len = sizeof(bounds) / sizeof(unsigned char); - assert(sizeof(input) / sizeof(unsigned char) == len); - assert(sizeof(output) / sizeof(unsigned char) == len); - - CheckEncodeDecode(bounds, input, output, len); -} - -BOOST_AUTO_TEST_CASE(MoreThan64) { - unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255, 255}; - unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16, 137}; - unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; - - std::size_t len = sizeof(bounds) / sizeof(unsigned char); - assert(sizeof(input) / sizeof(unsigned char) == len); - assert(sizeof(output) / sizeof(unsigned char) == len); - - CheckEncodeDecode(bounds, input, output, len); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc b/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc deleted file mode 100644 index 26f63c8..0000000 --- a/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc +++ /dev/null @@ -1,48 +0,0 @@ - -#include "lm/interpolate/enumerate_global_vocab.hh" - 
-#include <iostream> -#include <map> - -namespace lm { -//constructor - - EnumerateGlobalVocab::EnumerateGlobalVocab(std::map<std::string, int*> * vm, int nm) { - - vmap = vm; - num_models = nm; - cur_model = 0; //blah - cnt = 0; - std::cerr << "Vocab Builder with models: " << nm << std::endl; - } - - void EnumerateGlobalVocab::Add(WordIndex index, const StringPiece &str) { - - std::string st = str.as_string(); - - //check for existence of key - std::map<std::string, int*>::iterator itr = vmap->find(st); - - //put stuff - if(itr != vmap->end()) { - std::cerr << "Vocab exist: " << str << " M: " << cur_model << " I:" << index << std::endl; - itr->second[cur_model] = index; - } - //new key - else { - - //create model index map for this vocab word - //init to 0, 0 is UNK - int * indices = new int[num_models]; - memset(indices, 0, (sizeof(int)*num_models)); //this still legit? - - indices[cur_model] = index; - (*vmap)[st] = indices; - std::cerr << cnt << ":Vocab add: " << str << " M: " << cur_model << " I:" << index << std::endl; - cnt++; - } - - - } - -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh b/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh deleted file mode 100644 index c37d649..0000000 --- a/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef LM_ENUMERATE_GLOBAL_VOCAB_H -#define LM_ENUMERATE_GLOBAL_VOCAB_H - -#include "lm/enumerate_vocab.hh" -#include <map> - -/* Use this to create a global vocab across models for use when - * calculating lambdas for interpolation. Or other stuff. - */ -namespace lm { - - class EnumerateGlobalVocab : EnumerateVocab { - - public: - - //yes, ugly... - std::map<std::string, int*> * vmap; - int num_models; - int cur_model; - int cnt; //stupid - - ~EnumerateGlobalVocab() {} - - void Add(WordIndex index, const StringPiece & str); - - EnumerateGlobalVocab(std::map<std::string, int*> *, int); - - void SetCurModel(int i) { cur_model = i; } - - protected: - EnumerateGlobalVocab() {} - - }; - -} //namespace lm - -#endif // LM_ENUMERATE_GLOBAL_VOCAB_H - http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/interpolate_info.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/interpolate_info.hh b/ext/kenlm/lm/interpolate/interpolate_info.hh deleted file mode 100644 index ebecd92..0000000 --- a/ext/kenlm/lm/interpolate/interpolate_info.hh +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef KENLM_INTERPOLATE_INTERPOLATE_INFO_H -#define KENLM_INTERPOLATE_INTERPOLATE_INFO_H - -#include <cstddef> -#include <vector> -#include <stdint.h> - -namespace lm { -namespace interpolate { - -/** - * Stores relevant info for interpolating several language models, for use - * during the three-pass offline log-linear interpolation algorithm. 
- */ -struct InterpolateInfo { - /** - * @return the number of models being interpolated - */ - std::size_t Models() const { - return orders.size(); - } - - /** - * The lambda (interpolation weight) for each model. - */ - std::vector<float> lambdas; - - /** - * The maximum ngram order for each model. - */ - std::vector<uint8_t> orders; -}; -} -} -#endif http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/interpolate_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/interpolate_main.cc b/ext/kenlm/lm/interpolate/interpolate_main.cc deleted file mode 100644 index a99b62d..0000000 --- a/ext/kenlm/lm/interpolate/interpolate_main.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "lm/common/model_buffer.hh" -#include "lm/common/size_option.hh" -#include "lm/interpolate/pipeline.hh" -#include "util/fixed_array.hh" -#include "util/usage.hh" - -#include <boost/program_options.hpp> - -#include <iostream> -#include <vector> - -int main(int argc, char *argv[]) { - lm::interpolate::Config config; - std::vector<std::string> input_models; - namespace po = boost::program_options; - po::options_description options("Log-linear interpolation options"); - options.add_options() - ("lambda,w", po::value<std::vector<float> >(&config.lambdas)->multitoken()->required(), "Interpolation weights") - ("model,m", po::value<std::vector<std::string> >(&input_models)->multitoken()->required(), "Models to interpolate") - ("temp_prefix,T", po::value<std::string>(&config.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") - ("memory,S", lm::SizeOption(config.sort.total_memory, util::GuessPhysicalMemory() ? "50%" : "1G"), "Sorting memory") - ("sort_block", lm::SizeOption(config.sort.buffer_size, "64M"), "Block size"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - po::notify(vm); - - if (config.lambdas.size() != input_models.size()) { - std::cerr << "Number of models " << input_models.size() << " should match the number of weights" << config.lambdas.size() << "." 
<< std::endl; - return 1; - } - - util::FixedArray<lm::ModelBuffer> models(input_models.size()); - for (std::size_t i = 0; i < input_models.size(); ++i) { - models.push_back(input_models[i]); - } - lm::interpolate::Pipeline(models, config, 1); -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_probabilities.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_probabilities.cc b/ext/kenlm/lm/interpolate/merge_probabilities.cc deleted file mode 100644 index b6c949f..0000000 --- a/ext/kenlm/lm/interpolate/merge_probabilities.cc +++ /dev/null @@ -1,285 +0,0 @@ -#include "lm/interpolate/merge_probabilities.hh" -#include "lm/common/ngram_stream.hh" -#include "lm/interpolate/bounded_sequence_encoding.hh" -#include "lm/interpolate/interpolate_info.hh" - -#include <algorithm> -#include <limits> -#include <numeric> - -namespace lm { -namespace interpolate { - -/** - * Helper to generate the BoundedSequenceEncoding used for writing the - * from values. - */ -BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order) { - util::FixedArray<uint8_t> max_orders(info.orders.size()); - for (std::size_t i = 0; i < info.orders.size(); ++i) { - max_orders.push_back(std::min(order, info.orders[i])); - } - return BoundedSequenceEncoding(max_orders.begin(), max_orders.end()); -} - -namespace { - -/** - * A simple wrapper class that holds information needed to read and write - * the ngrams of a particular order. This class has the memory needed to - * buffer the data needed for the recursive process of computing the - * probabilities and "from" values for each component model. - * - * "From" values indicate, for each model, what order (as an index, so -1) - * was backed off to in order to arrive at a probability. For example, if a - * 5-gram model (order index 4) backed off twice, we would write a 2. 
- */ -class NGramHandler { -public: - NGramHandler(uint8_t order, const InterpolateInfo &ifo, - util::FixedArray<util::stream::ChainPositions> &models_by_order) - : info(ifo), - encoder(MakeEncoder(info, order)), - out_record(order, encoder.EncodedLength()) { - std::size_t count_has_order = 0; - for (std::size_t i = 0; i < models_by_order.size(); ++i) { - count_has_order += (models_by_order[i].size() >= order); - } - inputs_.Init(count_has_order); - for (std::size_t i = 0; i < models_by_order.size(); ++i) { - if (models_by_order[i].size() < order) - continue; - inputs_.push_back(models_by_order[i][order - 1]); - if (inputs_.back()) { - active_.resize(active_.size() + 1); - active_.back().model = i; - active_.back().stream = &inputs_.back(); - } - } - - // have to init outside since NGramStreams doesn't forward to - // GenericStreams ctor given a ChainPositions - - probs.Init(info.Models()); - from.Init(info.Models()); - for (std::size_t i = 0; i < info.Models(); ++i) { - probs.push_back(0.0); - from.push_back(0); - } - } - - struct StreamIndex { - NGramStream<ProbBackoff> *stream; - NGramStream<ProbBackoff> &Stream() { return *stream; } - std::size_t model; - }; - - std::size_t ActiveSize() const { - return active_.size(); - } - - /** - * @return the input stream for a particular model that corresponds to - * this ngram order - */ - StreamIndex &operator[](std::size_t idx) { - return active_[idx]; - } - - void erase(std::size_t idx) { - active_.erase(active_.begin() + idx); - } - - const InterpolateInfo &info; - BoundedSequenceEncoding encoder; - PartialProbGamma out_record; - util::FixedArray<float> probs; - util::FixedArray<uint8_t> from; - -private: - std::vector<StreamIndex> active_; - NGramStreams<ProbBackoff> inputs_; -}; - -/** - * A collection of NGramHandlers. - */ -class NGramHandlers : public util::FixedArray<NGramHandler> { -public: - explicit NGramHandlers(std::size_t num) - : util::FixedArray<NGramHandler>(num) { - } - - void push_back( - std::size_t order, const InterpolateInfo &info, - util::FixedArray<util::stream::ChainPositions> &models_by_order) { - new (end()) NGramHandler(order, info, models_by_order); - Constructed(); - } -}; - -/** - * The recursive helper function that computes probability and "from" - * values for all ngrams matching a particular suffix. - * - * The current order can be computed as the suffix length + 1. Note that - * the suffix could be empty (suffix_begin == suffix_end == NULL), in which - * case we are handling unigrams with the UNK token as the fallback - * probability. 
- * - * @param handlers The full collection of handlers - * @param suffix_begin A start iterator for the suffix - * @param suffix_end An end iterator for the suffix - * @param fallback_probs The probabilities of this ngram if we need to - * back off (that is, the probability of the suffix) - * @param fallback_from The order that the corresponding fallback - * probability in the fallback_probs is from - * @param combined_fallback interpolated fallback_probs - * @param outputs The output streams, one for each order - */ -void HandleSuffix(NGramHandlers &handlers, WordIndex *suffix_begin, - WordIndex *suffix_end, - const util::FixedArray<float> &fallback_probs, - const util::FixedArray<uint8_t> &fallback_from, - float combined_fallback, - util::stream::Streams &outputs) { - uint8_t order = std::distance(suffix_begin, suffix_end) + 1; - if (order > outputs.size()) return; - - util::stream::Stream &output = outputs[order - 1]; - NGramHandler &handler = handlers[order - 1]; - - while (true) { - // find the next smallest ngram which matches our suffix - // TODO: priority queue driven. - WordIndex *minimum = NULL; - for (std::size_t i = 0; i < handler.ActiveSize(); ++i) { - if (!std::equal(suffix_begin, suffix_end, handler[i].Stream()->begin() + 1)) - continue; - - // if we either haven't set a minimum yet or this one is smaller than - // the minimum we found before, replace it - WordIndex *last = handler[i].Stream()->begin(); - if (!minimum || *last < *minimum) { minimum = handler[i].Stream()->begin(); } - } - - // no more ngrams of this order match our suffix, so we're done - if (!minimum) return; - - handler.out_record.ReBase(output.Get()); - std::copy(minimum, minimum + order, handler.out_record.begin()); - - // Default case is having backed off. - std::copy(fallback_probs.begin(), fallback_probs.end(), handler.probs.begin()); - std::copy(fallback_from.begin(), fallback_from.end(), handler.from.begin()); - - for (std::size_t i = 0; i < handler.ActiveSize();) { - if (std::equal(handler.out_record.begin(), handler.out_record.end(), - handler[i].Stream()->begin())) { - handler.probs[handler[i].model] = handler.info.lambdas[handler[i].model] * handler[i].Stream()->Value().prob; - handler.from[handler[i].model] = order - 1; - if (++handler[i].Stream()) { - ++i; - } else { - handler.erase(i); - } - } else { - ++i; - } - } - handler.out_record.Prob() = std::accumulate(handler.probs.begin(), handler.probs.end(), 0.0); - handler.out_record.LowerProb() = combined_fallback; - handler.encoder.Encode(handler.from.begin(), - handler.out_record.FromBegin()); - - // we've handled this particular ngram, so now recurse to the higher - // order using the current ngram as the suffix - HandleSuffix(handlers, handler.out_record.begin(), handler.out_record.end(), - handler.probs, handler.from, handler.out_record.Prob(), outputs); - // consume the output - ++output; - } -} - -/** - * Kicks off the recursion for computing the probabilities and "from" - * values for each ngram order. We begin by handling the UNK token that - * should be at the front of each of the unigram input streams. This is - * then output to the stream and it is used as the fallback for handling - * our unigram case, the unigram used as the fallback for the bigram case, - * etc. 
- */ -void HandleNGrams(NGramHandlers &handlers, util::stream::Streams &outputs) { - PartialProbGamma unk_record(1, 0); - // First: populate the unk probabilities by reading the first unigram - // from each stream - util::FixedArray<float> unk_probs(handlers[0].info.Models()); - - // start by populating the ngram id from the first stream - lm::NGram<ProbBackoff> ngram = *handlers[0][0].Stream(); - unk_record.ReBase(outputs[0].Get()); - std::copy(ngram.begin(), ngram.end(), unk_record.begin()); - unk_record.Prob() = 0; - - // then populate the probabilities into unk_probs while "multiply" the - // model probabilities together into the unk record - // - // note that from doesn't need to be set for unigrams - assert(handlers[0].ActiveSize() == handlers[0].info.Models()); - for (std::size_t i = 0; i < handlers[0].info.Models();) { - ngram = *handlers[0][i].Stream(); - unk_probs.push_back(handlers[0].info.lambdas[i] * ngram.Value().prob); - unk_record.Prob() += unk_probs[i]; - assert(*ngram.begin() == kUNK); - if (++handlers[0][i].Stream()) { - ++i; - } else { - handlers[0].erase(i); - } - } - float unk_combined = unk_record.Prob(); - unk_record.LowerProb() = unk_combined; - // flush the unk output record - ++outputs[0]; - - // Then, begin outputting everything in lexicographic order: first we'll - // get the unigram then the first bigram with that context, then the - // first trigram with that bigram context, etc., until we exhaust all of - // the ngrams, then all of the (n-1)grams, etc. - // - // This function is the "root" of this recursive process. - util::FixedArray<uint8_t> unk_from(handlers[0].info.Models()); - for (std::size_t i = 0; i < handlers[0].info.Models(); ++i) { - unk_from.push_back(0); - } - - // the two nulls are to encode that our "fallback" word is the "0-gram" - // case, e.g. we "backed off" to UNK - // TODO: stop generating vocab ids and LowerProb for unigrams. - HandleSuffix(handlers, NULL, NULL, unk_probs, unk_from, unk_combined, outputs); - - // Verify we reached the end. And poison! 
- for (std::size_t i = 0; i < handlers.size(); ++i) { - UTIL_THROW_IF2(handlers[i].ActiveSize(), - "MergeProbabilities did not exhaust all ngram streams"); - outputs[i].Poison(); - } -} -} - -void MergeProbabilities( - const InterpolateInfo &info, - util::FixedArray<util::stream::ChainPositions> &models_by_order, - util::stream::Chains &output_chains) { - NGramHandlers handlers(output_chains.size()); - for (std::size_t i = 0; i < output_chains.size(); ++i) { - handlers.push_back(i + 1, info, models_by_order); - } - - util::stream::ChainPositions output_pos(output_chains); - util::stream::Streams outputs(output_pos); - - HandleNGrams(handlers, outputs); -} -} -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_probabilities.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_probabilities.hh b/ext/kenlm/lm/interpolate/merge_probabilities.hh deleted file mode 100644 index 59b8046..0000000 --- a/ext/kenlm/lm/interpolate/merge_probabilities.hh +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H -#define LM_INTERPOLATE_MERGE_PROBABILITIES_H - -#include "lm/common/ngram.hh" -#include "lm/interpolate/bounded_sequence_encoding.hh" -#include "util/fixed_array.hh" -#include "util/stream/multi_stream.hh" - -#include <stdint.h> - -namespace lm { -namespace interpolate { - -struct InterpolateInfo; - -/** - * Make the encoding of backoff values for a given order. This stores values - * in [PartialProbGamma::FromBegin(), PartialProbGamma::FromEnd()) - */ -BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order); - -/** - * The first pass for the offline log-linear interpolation algorithm. This - * reads K **suffix-ordered** streams for each model, for each order, of - * ngram records (ngram-id, prob, backoff). It further assumes that the - * ngram-ids have been unified over all of the stream inputs. - * - * Its output is records of (ngram-id, prob-prod, backoff-level, - * backoff-level, ...) where the backoff-levels (of which there are K) are - * the context length (0 for unigrams) that the corresponding model had to - * back off to in order to obtain a probability for that ngram-id. Each of - * these streams is terminated with a record whose ngram-id is all - * maximum-integers for simplicity in implementation here. - * - * @param models An array of length N (max_i N_i) containing at - * the ChainPositions for the streams for order (i + 1). - * @param output_chains The output chains for each order (of length K) - */ -void MergeProbabilities( - const InterpolateInfo &info, - util::FixedArray<util::stream::ChainPositions> &models_by_order, - util::stream::Chains &output_chains); - -/** - * This class represents the output payload for this pass, which consists - * of an ngram-id, a probability, and then a vector of orders from which - * each of the component models backed off to for this ngram, encoded - * using the BoundedSequenceEncoding class. 
- */ -class PartialProbGamma : public lm::NGramHeader { -public: - PartialProbGamma(std::size_t order, std::size_t backoff_bytes) - : lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) { - // nothing - } - - std::size_t TotalSize() const { - return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_; - } - - // TODO: cache bounded sequence encoding in the pipeline? - static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) { - return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength(); - } - - float &Prob() { return Pay().prob; } - float Prob() const { return Pay().prob; } - - float &LowerProb() { return Pay().lower_prob; } - float LowerProb() const { return Pay().lower_prob; } - - const uint8_t *FromBegin() const { return Pay().from; } - uint8_t *FromBegin() { return Pay().from; } - -private: - struct After { - // Note that backoff_and_normalize assumes this comes first. - float prob; - float lower_prob; - uint8_t from[]; - }; - const After &Pay() const { return *reinterpret_cast<const After *>(end()); } - After &Pay() { return *reinterpret_cast<After*>(end()); } - - std::size_t backoff_bytes_; -}; - -}} // namespaces -#endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test1 ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test1 b/ext/kenlm/lm/interpolate/merge_test/test1 deleted file mode 100644 index 08d114d..0000000 Binary files a/ext/kenlm/lm/interpolate/merge_test/test1 and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test2 ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test2 b/ext/kenlm/lm/interpolate/merge_test/test2 deleted file mode 100644 index fd3a380..0000000 Binary files a/ext/kenlm/lm/interpolate/merge_test/test2 and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test3 ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test3 b/ext/kenlm/lm/interpolate/merge_test/test3 deleted file mode 100644 index 6c89d7f..0000000 Binary files a/ext/kenlm/lm/interpolate/merge_test/test3 and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test_bad_order ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test_bad_order b/ext/kenlm/lm/interpolate/merge_test/test_bad_order deleted file mode 100644 index 
d50a5e8..0000000 Binary files a/ext/kenlm/lm/interpolate/merge_test/test_bad_order and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test_no_unk ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test_no_unk b/ext/kenlm/lm/interpolate/merge_test/test_no_unk deleted file mode 100644 index fbcf12d..0000000 --- a/ext/kenlm/lm/interpolate/merge_test/test_no_unk +++ /dev/null @@ -1 +0,0 @@ -toto http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_vocab.cc b/ext/kenlm/lm/interpolate/merge_vocab.cc deleted file mode 100644 index 3e84389..0000000 --- a/ext/kenlm/lm/interpolate/merge_vocab.cc +++ /dev/null @@ -1,131 +0,0 @@ -#include "lm/interpolate/merge_vocab.hh" - -#include "lm/enumerate_vocab.hh" -#include "lm/interpolate/universal_vocab.hh" -#include "lm/lm_exception.hh" -#include "lm/vocab.hh" -#include "util/file_piece.hh" - -#include <queue> -#include <string> -#include <iostream> -#include <vector> - -namespace lm { -namespace interpolate { -namespace { - -class VocabFileReader { - public: - explicit VocabFileReader(const int fd, size_t model_num, uint64_t offset = 0); - - VocabFileReader &operator++(); - operator bool() const { return !eof_; } - uint64_t operator*() const { return Value(); } - - uint64_t Value() const { return hash_value_; } - size_t ModelNum() const { return model_num_; } - WordIndex CurrentIndex() const { return current_index_; } - - StringPiece Word() const { return word_; } - - private: - uint64_t hash_value_; - WordIndex current_index_; - bool eof_; - size_t model_num_; - StringPiece word_; - util::FilePiece file_piece_; -}; - -VocabFileReader::VocabFileReader(const int fd, const size_t model_num, uint64_t offset) : - hash_value_(0), - current_index_(0), - eof_(false), - model_num_(model_num), - file_piece_(fd) { - word_ = file_piece_.ReadLine('\0'); - UTIL_THROW_IF(word_ != "<unk>", - FormatLoadException, - "Vocabulary words are in the wrong place."); - // setup to initial value - ++*this; -} - -VocabFileReader &VocabFileReader::operator++() { - try { - word_ = file_piece_.ReadLine('\0'); - } catch(util::EndOfFileException &e) { - eof_ = true; - return *this; - } - uint64_t prev_hash_value = hash_value_; - hash_value_ = ngram::detail::HashForVocab(word_.data(), word_.size()); - - // hash values should be monotonically increasing - UTIL_THROW_IF(hash_value_ < prev_hash_value, FormatLoadException, - ": word index not monotonically increasing." 
- << " model_num: " << model_num_ - << " prev hash: " << prev_hash_value - << " new hash: " << hash_value_); - - ++current_index_; - return *this; -} - -class CompareFiles { -public: - bool operator()(const VocabFileReader* x, - const VocabFileReader* y) - { return x->Value() > y->Value(); } -}; - -class Readers : public util::FixedArray<VocabFileReader> { - public: - Readers(std::size_t number) : util::FixedArray<VocabFileReader>(number) {} - void push_back(int fd, std::size_t i) { - new(end()) VocabFileReader(fd, i); - Constructed(); - } -}; - -} // namespace - -WordIndex MergeVocab(util::FixedArray<util::scoped_fd> &files, UniversalVocab &vocab, EnumerateVocab &enumerate) { - typedef std::priority_queue<VocabFileReader*, std::vector<VocabFileReader*>, CompareFiles> HeapType; - HeapType heap; - Readers readers(files.size()); - for (size_t i = 0; i < files.size(); ++i) { - readers.push_back(files[i].release(), i); - heap.push(&readers.back()); - // initialize first index to 0 for <unk> - vocab.InsertUniversalIdx(i, 0, 0); - } - - uint64_t prev_hash_value = 0; - // global_index starts with <unk> which is 0 - WordIndex global_index = 0; - - enumerate.Add(0, "<unk>"); - while (!heap.empty()) { - VocabFileReader* top_vocab_file = heap.top(); - if (top_vocab_file->Value() != prev_hash_value) { - enumerate.Add(++global_index, top_vocab_file->Word()); - } - vocab.InsertUniversalIdx(top_vocab_file->ModelNum(), - top_vocab_file->CurrentIndex(), - global_index); - - prev_hash_value = top_vocab_file->Value(); - - heap.pop(); - if (++(*top_vocab_file)) { - heap.push(top_vocab_file); - } - } - return global_index + 1; -} - -} // namespace interpolate -} // namespace lm - http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_vocab.hh b/ext/kenlm/lm/interpolate/merge_vocab.hh deleted file mode 100644 index cc74d33..0000000 --- a/ext/kenlm/lm/interpolate/merge_vocab.hh +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef LM_INTERPOLATE_MERGE_VOCAB_H -#define LM_INTERPOLATE_MERGE_VOCAB_H - -#include "lm/word_index.hh" -#include "util/file.hh" -#include "util/fixed_array.hh" - -namespace lm { - -class EnumerateVocab; - -namespace interpolate { - -class UniversalVocab; - -// Takes ownership of vocab_files. -// The combined vocabulary is enumerated with enumerate. -// Returns the size of the combined vocabulary. 
-WordIndex MergeVocab(util::FixedArray<util::scoped_fd> &vocab_files, UniversalVocab &vocab, EnumerateVocab &enumerate); - -}} // namespaces - -#endif // LM_INTERPOLATE_MERGE_VOCAB_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_vocab_test.cc b/ext/kenlm/lm/interpolate/merge_vocab_test.cc deleted file mode 100644 index 6df25b2..0000000 --- a/ext/kenlm/lm/interpolate/merge_vocab_test.cc +++ /dev/null @@ -1,126 +0,0 @@ -#define BOOST_TEST_MODULE InterpolateMergeVocabTest -#include <boost/test/unit_test.hpp> - -#include "lm/enumerate_vocab.hh" -#include "lm/interpolate/merge_vocab.hh" -#include "lm/interpolate/universal_vocab.hh" -#include "lm/lm_exception.hh" -#include "lm/vocab.hh" -#include "lm/word_index.hh" -#include "util/file.hh" -#include "util/file_piece.hh" - -#include <cstring> - -namespace lm { -namespace interpolate { -namespace { - -// Stupid bjam permutes the command line arguments randomly. -class TestFiles { - public: - TestFiles() { - char **argv = boost::unit_test::framework::master_test_suite().argv; - int argc = boost::unit_test::framework::master_test_suite().argc; - BOOST_REQUIRE_EQUAL(6, argc); - for (int i = 1; i < argc; ++i) { - EndsWithAssign(argv[i], "test1", test[0]); - EndsWithAssign(argv[i], "test2", test[1]); - EndsWithAssign(argv[i], "test3", test[2]); - EndsWithAssign(argv[i], "no_unk", no_unk); - EndsWithAssign(argv[i], "bad_order", bad_order); - } - } - - void EndsWithAssign(char *arg, StringPiece value, util::scoped_fd &to) { - StringPiece str(arg); - if (str.size() < value.size()) return; - if (std::memcmp(str.data() + str.size() - value.size(), value.data(), value.size())) return; - to.reset(util::OpenReadOrThrow(arg)); - } - - util::scoped_fd test[3], no_unk, bad_order; -}; - -class DoNothingEnumerate : public EnumerateVocab { - public: - void Add(WordIndex, const StringPiece &) {} -}; - -BOOST_AUTO_TEST_CASE(MergeVocabTest) { - TestFiles files; - - util::FixedArray<util::scoped_fd> used_files(3); - used_files.push_back(files.test[0].release()); - used_files.push_back(files.test[1].release()); - used_files.push_back(files.test[2].release()); - - std::vector<lm::WordIndex> model_max_idx; - model_max_idx.push_back(10); - model_max_idx.push_back(10); - model_max_idx.push_back(10); - - util::scoped_fd combined(util::MakeTemp("temporary")); - - UniversalVocab universal_vocab(model_max_idx); - { - ngram::ImmediateWriteWordsWrapper writer(NULL, combined.get(), 0); - MergeVocab(used_files, universal_vocab, writer); - } - - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 0), 0); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 0), 0); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 0), 0); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 1), 1); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 1), 2); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 1), 8); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 5), 11); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 4); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 3), 10); - - util::FilePiece f(combined.release()); - BOOST_CHECK_EQUAL("<unk>", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("a", 
f.ReadLine('\0')); - BOOST_CHECK_EQUAL("is this", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("this a", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("first cut", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("this", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("a first", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("cut", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("is", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("i", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("secd", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("first", f.ReadLine('\0')); - BOOST_CHECK_THROW(f.ReadLine('\0'), util::EndOfFileException); -} - -BOOST_AUTO_TEST_CASE(MergeVocabNoUnkTest) { - TestFiles files; - util::FixedArray<util::scoped_fd> used_files(1); - used_files.push_back(files.no_unk.release()); - - std::vector<lm::WordIndex> model_max_idx; - model_max_idx.push_back(10); - - UniversalVocab universal_vocab(model_max_idx); - DoNothingEnumerate nothing; - BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException); -} - -BOOST_AUTO_TEST_CASE(MergeVocabWrongOrderTest) { - TestFiles files; - - util::FixedArray<util::scoped_fd> used_files(2); - used_files.push_back(files.test[0].release()); - used_files.push_back(files.bad_order.release()); - - std::vector<lm::WordIndex> model_max_idx; - model_max_idx.push_back(10); - model_max_idx.push_back(10); - - lm::interpolate::UniversalVocab universal_vocab(model_max_idx); - DoNothingEnumerate nothing; - BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/normalize.cc b/ext/kenlm/lm/interpolate/normalize.cc deleted file mode 100644 index f683f10..0000000 --- a/ext/kenlm/lm/interpolate/normalize.cc +++ /dev/null @@ -1,384 +0,0 @@ -#include "lm/interpolate/normalize.hh" - -#include "lm/common/compare.hh" -#include "lm/common/ngram_stream.hh" -#include "lm/interpolate/backoff_matrix.hh" -#include "lm/interpolate/bounded_sequence_encoding.hh" -#include "lm/interpolate/interpolate_info.hh" -#include "lm/interpolate/merge_probabilities.hh" -#include "lm/weights.hh" -#include "lm/word_index.hh" -#include "util/fixed_array.hh" -#include "util/scoped.hh" -#include "util/stream/stream.hh" -#include "util/stream/rewindable_stream.hh" - -#include <functional> -#include <queue> -#include <vector> - -namespace lm { namespace interpolate { -namespace { - -struct SuffixLexicographicLess : public std::binary_function<NGramHeader, NGramHeader, bool> { - bool operator()(const NGramHeader first, const NGramHeader second) const { - for (const WordIndex *f = first.end() - 1, *s = second.end() - 1; f >= first.begin() && s >= second.begin(); --f, --s) { - if (*f < *s) return true; - if (*f > *s) return false; - } - return first.size() < second.size(); - } -}; - -class BackoffQueueEntry { - public: - BackoffQueueEntry(float &entry, const util::stream::ChainPosition &position) - : entry_(entry), stream_(position) { - entry_ = 0.0; - } - - operator bool() const { return stream_; } - - NGramHeader operator*() const { return *stream_; } - const NGramHeader *operator->() const { return &*stream_; } - - void Enter() { - entry_ = stream_->Value().backoff; - } - - BackoffQueueEntry 
&Next() { - entry_ = 0.0; - ++stream_; - return *this; - } - - private: - float &entry_; - NGramStream<ProbBackoff> stream_; -}; - -struct PtrGreater : public std::binary_function<const BackoffQueueEntry *, const BackoffQueueEntry *, bool> { - bool operator()(const BackoffQueueEntry *first, const BackoffQueueEntry *second) const { - return SuffixLexicographicLess()(**second, **first); - } -}; - -class EntryOwner : public util::FixedArray<BackoffQueueEntry> { - public: - void push_back(float &entry, const util::stream::ChainPosition &position) { - new (end()) BackoffQueueEntry(entry, position); - Constructed(); - } -}; - -std::size_t MaxOrder(const util::FixedArray<util::stream::ChainPositions> &model) { - std::size_t ret = 0; - for (const util::stream::ChainPositions *m = model.begin(); m != model.end(); ++m) { - ret = std::max(ret, m->size()); - } - return ret; -} - -class BackoffManager { - public: - explicit BackoffManager(const util::FixedArray<util::stream::ChainPositions> &models) - : entered_(MaxOrder(models)), matrix_(models.size(), MaxOrder(models)), skip_write_(MaxOrder(models)) { - std::size_t total = 0; - for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) { - total += m->size(); - } - for (std::size_t i = 0; i < MaxOrder(models); ++i) { - entered_.push_back(models.size()); - } - owner_.Init(total); - for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) { - for (const util::stream::ChainPosition *j = m->begin(); j != m->end(); ++j) { - owner_.push_back(matrix_.Backoff(m - models.begin(), j - m->begin()), *j); - if (owner_.back()) { - queue_.push(&owner_.back()); - } - } - } - } - - void SetupSkip(std::size_t order, util::stream::Stream &stream) { - skip_write_[order - 2] = &stream; - } - - // Move up the backoffs for the given n-gram. The n-grams must be provided - // in suffix lexicographic order. - void Enter(const NGramHeader &to) { - // Check that we exited properly. - for (std::size_t i = to.Order() - 1; i < entered_.size(); ++i) { - assert(entered_[i].empty()); - } - SuffixLexicographicLess less; - while (!queue_.empty() && less(**queue_.top(), to)) - SkipRecord(); - while (TopMatches(to)) { - BackoffQueueEntry *matches = queue_.top(); - entered_[to.Order() - 1].push_back(matches); - matches->Enter(); - queue_.pop(); - } - } - - void Exit(std::size_t order_minus_1) { - for (BackoffQueueEntry **i = entered_[order_minus_1].begin(); i != entered_[order_minus_1].end(); ++i) { - if ((*i)->Next()) - queue_.push(*i); - } - entered_[order_minus_1].clear(); - } - - float Get(std::size_t model, std::size_t order_minus_1) const { - return matrix_.Backoff(model, order_minus_1); - } - - void Finish() { - while (!queue_.empty()) - SkipRecord(); - } - - private: - void SkipRecord() { - BackoffQueueEntry *top = queue_.top(); - queue_.pop(); - // Is this the last instance of the n-gram? - if (!TopMatches(**top)) { - // An n-gram is being skipped. Called once per skipped n-gram, - // regardless of how many models it comes from. 
- *reinterpret_cast<float*>(skip_write_[(*top)->Order() - 1]->Get()) = 0.0; - ++*skip_write_[(*top)->Order() - 1]; - } - if (top->Next()) - queue_.push(top); - } - - bool TopMatches(const NGramHeader &header) const { - return !queue_.empty() && (*queue_.top())->Order() == header.Order() && std::equal(header.begin(), header.end(), (*queue_.top())->begin()); - } - - EntryOwner owner_; - std::priority_queue<BackoffQueueEntry*, std::vector<BackoffQueueEntry*>, PtrGreater> queue_; - - // Indexed by order then just all the matching models. - util::FixedArray<util::FixedArray<BackoffQueueEntry*> > entered_; - - std::size_t order_; - - BackoffMatrix matrix_; - - std::vector<util::stream::Stream*> skip_write_; -}; - -typedef long double Accum; - -// Handles n-grams of the same order, using recursion to call another instance -// for higher orders. -class Recurse { - public: - Recurse( - const InterpolateInfo &info, // Must stay alive the entire time. - std::size_t order, - const util::stream::ChainPosition &merged_probs, - const util::stream::ChainPosition &prob_out, - const util::stream::ChainPosition &backoff_out, - BackoffManager &backoffs, - Recurse *higher) // higher is null for the highest order. - : order_(order), - encoding_(MakeEncoder(info, order)), - input_(merged_probs, PartialProbGamma(order, encoding_.EncodedLength())), - prob_out_(prob_out), - backoff_out_(backoff_out), - backoffs_(backoffs), - lambdas_(&*info.lambdas.begin()), - higher_(higher), - decoded_backoffs_(info.Models()), - extended_context_(order - 1) { - // This is only for bigrams and above. Summing unigrams is a much easier case. - assert(order >= 2); - } - - // context = w_1^{n-1} - // z_lower = Z(w_2^{n-1}) - // Input: - // Merged probabilities without backoff applied in input_. - // Backoffs via backoffs_. - // Calculates: - // Z(w_1^{n-1}): intermediate only. - // p_I(x | w_1^{n-1}) for all x: w_1^{n-1}x exists: Written to prob_out_. - // b_I(w_1^{n-1}): Written to backoff_out_. - void SameContext(const NGramHeader &context, Accum z_lower) { - assert(context.size() == order_ - 1); - backoffs_.Enter(context); - prob_out_.Mark(); - - // This is the backoff term that applies when one assumes everything backs off: - // \prod_i b_i(w_1^{n-1})^{\lambda_i}. - Accum backoff_once = 0.0; - for (std::size_t m = 0; m < decoded_backoffs_.size(); ++m) { - backoff_once += lambdas_[m] * backoffs_.Get(m, order_ - 2); - } - - Accum z_delta = 0.0; - std::size_t count = 0; - for (; input_ && std::equal(context.begin(), context.end(), input_->begin()); ++input_, ++prob_out_, ++count) { - // Apply backoffs to probabilities. - // TODO: change bounded sequence encoding to have an iterator for decoding instead of doing a copy here. - encoding_.Decode(input_->FromBegin(), &*decoded_backoffs_.begin()); - for (std::size_t m = 0; m < NumModels(); ++m) { - // Apply the backoffs as instructed for model m. - float accumulated = 0.0; - // Change backoffs for [order it backed off to, order - 1) except - // with 0-indexing. There is still the potential to charge backoff - // for order - 1, which is done later. The backoffs charged here - // are b_m(w_{n-1}^{n-1}) ... b_m(w_2^{n-1}) - for (unsigned char backed_to = decoded_backoffs_[m]; backed_to < order_ - 2; ++backed_to) { - accumulated += backoffs_.Get(m, backed_to); - } - float lambda = lambdas_[m]; - // Lower p(x | w_2^{n-1}) gets all the backoffs except the highest. 
- input_->LowerProb() += accumulated * lambda; - // Charge the backoff b(w_1^{n-1}) if applicable, but only to attain p(x | w_1^{n-1}) - if (decoded_backoffs_[m] < order_ - 1) { - accumulated += backoffs_.Get(m, order_ - 2); - } - input_->Prob() += accumulated * lambda; - } - // TODO: better precision/less operations here. - z_delta += pow(10.0, input_->Prob()) - pow(10.0, input_->LowerProb() + backoff_once); - - // Write unnormalized probability record. - std::copy(input_->begin(), input_->end(), reinterpret_cast<WordIndex*>(prob_out_.Get())); - ProbWrite() = input_->Prob(); - } - // TODO numerical precision. - Accum z = log10(pow(10.0, z_lower + backoff_once) + z_delta); - - // Normalize. - prob_out_.Rewind(); - for (std::size_t i = 0; i < count; ++i, ++prob_out_) { - ProbWrite() -= z; - } - // This allows the stream to release data. - prob_out_.Mark(); - - // Output backoff. - *reinterpret_cast<float*>(backoff_out_.Get()) = z_lower + backoff_once - z; - ++backoff_out_; - - if (higher_.get()) - higher_->ExtendContext(context, z); - - backoffs_.Exit(order_ - 2); - } - - // Call is given a context and z(context). - // Evaluates y context x for all y,x. - void ExtendContext(const NGramHeader &middle, Accum z_lower) { - assert(middle.size() == order_ - 2); - // Copy because the input will advance. TODO avoid this copy by sharing amongst classes. - std::copy(middle.begin(), middle.end(), extended_context_.begin() + 1); - while (input_ && std::equal(middle.begin(), middle.end(), input_->begin() + 1)) { - *extended_context_.begin() = *input_->begin(); - SameContext(NGramHeader(&*extended_context_.begin(), order_ - 1), z_lower); - } - } - - void Finish() { - assert(!input_); - prob_out_.Poison(); - backoff_out_.Poison(); - if (higher_.get()) - higher_->Finish(); - } - - // The BackoffManager class also injects backoffs when it skips ahead e.g. b(</s>) = 1 - util::stream::Stream &BackoffStream() { return backoff_out_; } - - private: - // Write the probability to the correct place in prob_out_. Should use a proxy but currently incompatible with RewindableStream. - float &ProbWrite() { - return *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_out_.Get()) + order_ * sizeof(WordIndex)); - } - - std::size_t NumModels() const { return decoded_backoffs_.size(); } - - const std::size_t order_; - - const BoundedSequenceEncoding encoding_; - - ProxyStream<PartialProbGamma> input_; - util::stream::RewindableStream prob_out_; - util::stream::Stream backoff_out_; - - BackoffManager &backoffs_; - const float *const lambdas_; - - // Higher order instance of this same class. - util::scoped_ptr<Recurse> higher_; - - // Temporary in SameContext. - std::vector<unsigned char> decoded_backoffs_; - // Temporary in ExtendContext. - std::vector<WordIndex> extended_context_; -}; - -class Thread { - public: - Thread(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) - : info_(info), models_by_order_(models_by_order), prob_out_(prob_out), backoff_out_(backoff_out) {} - - void Run(const util::stream::ChainPositions &merged_probabilities) { - // Unigrams do not have enocded backoff info. 
- ProxyStream<PartialProbGamma> in(merged_probabilities[0], PartialProbGamma(1, 0)); - util::stream::RewindableStream prob_write(prob_out_[0]); - Accum z = 0.0; - prob_write.Mark(); - WordIndex count = 0; - for (; in; ++in, ++prob_write, ++count) { - // Note assumption that probabilitity comes first - memcpy(prob_write.Get(), in.Get(), sizeof(WordIndex) + sizeof(float)); - z += pow(10.0, in->Prob()); - } - // TODO HACK TODO: lmplz outputs p(<s>) = 1 to get q to compute nicely. That will always result in 1.0 more than it should be. - z -= 1.0; - float log_z = log10(z); - prob_write.Rewind(); - // Normalize unigram probabilities. - for (WordIndex i = 0; i < count; ++i, ++prob_write) { - *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_write.Get()) + sizeof(WordIndex)) -= log_z; - } - prob_write.Poison(); - - // Now setup the higher orders. - util::scoped_ptr<Recurse> higher_order; - BackoffManager backoffs(models_by_order_); - std::size_t max_order = merged_probabilities.size(); - for (std::size_t order = max_order; order >= 2; --order) { - higher_order.reset(new Recurse(info_, order, merged_probabilities[order - 1], prob_out_[order - 1], backoff_out_[order - 2], backoffs, higher_order.release())); - backoffs.SetupSkip(order, higher_order->BackoffStream()); - } - if (max_order > 1) { - higher_order->ExtendContext(NGramHeader(NULL, 0), log_z); - higher_order->Finish(); - } - } - - private: - const InterpolateInfo info_; - util::FixedArray<util::stream::ChainPositions> &models_by_order_; - util::stream::ChainPositions prob_out_; - util::stream::ChainPositions backoff_out_; -}; - -} // namespace - -void Normalize(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &merged_probabilities, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) { - assert(prob_out.size() == backoff_out.size() + 1); - // Arbitrarily put the thread on the merged_probabilities Chains. - merged_probabilities >> Thread(info, models_by_order, prob_out, backoff_out); -} - -}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/normalize.hh b/ext/kenlm/lm/interpolate/normalize.hh deleted file mode 100644 index dd26b25..0000000 --- a/ext/kenlm/lm/interpolate/normalize.hh +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef LM_INTERPOLATE_NORMALIZE_H -#define LM_INTERPOLATE_NORMALIZE_H - -#include "util/fixed_array.hh" - -/* Pass 2: - * - Multiply backoff weights by the backed off probabilities from pass 1. - * - Compute the normalization factor Z. - * - Send Z to the next highest order. - * - Rewind and divide by Z. - */ - -namespace util { namespace stream { -class ChainPositions; -class Chains; -}} // namespaces - -namespace lm { namespace interpolate { - -struct InterpolateInfo; - -void Normalize( - const InterpolateInfo &info, - // Input full models for backoffs. Assumes that renumbering has been done. Suffix order. - util::FixedArray<util::stream::ChainPositions> &models_by_order, - // Input PartialProbGamma from MergeProbabilities. Context order. - util::stream::Chains &merged_probabilities, - // Output NGram<float> with normalized probabilities. Context order. 
- util::stream::Chains &probabilities_out, - // Output bare floats with backoffs. Note backoffs.size() == order - 1. Suffix order. - util::stream::Chains &backoffs_out); - -}} // namespaces - -#endif // LM_INTERPOLATE_NORMALIZE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/normalize_test.cc b/ext/kenlm/lm/interpolate/normalize_test.cc deleted file mode 100644 index fe220f3..0000000 --- a/ext/kenlm/lm/interpolate/normalize_test.cc +++ /dev/null @@ -1,86 +0,0 @@ -#include "lm/interpolate/normalize.hh" - -#include "lm/interpolate/interpolate_info.hh" -#include "lm/interpolate/merge_probabilities.hh" -#include "lm/common/ngram_stream.hh" -#include "util/stream/chain.hh" -#include "util/stream/multi_stream.hh" - -#define BOOST_TEST_MODULE NormalizeTest -#include <boost/test/unit_test.hpp> - -namespace lm { namespace interpolate { namespace { - -// log without backoff -const float kInputs[] = {-0.3, 1.2, -9.8, 4.0, -7.0, 0.0}; - -class WriteInput { - public: - WriteInput() {} - void Run(const util::stream::ChainPosition &to) { - util::stream::Stream out(to); - for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float); ++i, ++out) { - memcpy(out.Get(), &i, sizeof(WordIndex)); - memcpy((uint8_t*)out.Get() + sizeof(WordIndex), &kInputs[i], sizeof(float)); - } - out.Poison(); - } -}; - -void CheckOutput(const util::stream::ChainPosition &from) { - NGramStream<float> in(from); - float sum = 0.0; - for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float) - 1 /* <s> at the end */; ++i) { - sum += pow(10.0, kInputs[i]); - } - sum = log10(sum); - BOOST_REQUIRE(in); - BOOST_CHECK_CLOSE(kInputs[0] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[1] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[2] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[3] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[4] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[5] - sum, in->Value(), 0.0001); - BOOST_CHECK(!++in); -} - -BOOST_AUTO_TEST_CASE(Unigrams) { - InterpolateInfo info; - info.lambdas.push_back(2.0); - info.lambdas.push_back(-0.1); - info.orders.push_back(1); - info.orders.push_back(1); - - BOOST_CHECK_EQUAL(0, MakeEncoder(info, 1).EncodedLength()); - - // No backoffs. 
- util::stream::Chains blank(0); - util::FixedArray<util::stream::ChainPositions> models_by_order(2); - models_by_order.push_back(blank); - models_by_order.push_back(blank); - - util::stream::Chains merged_probabilities(1); - util::stream::Chains probabilities_out(1); - util::stream::Chains backoffs_out(0); - - merged_probabilities.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float) + sizeof(float), 2, 24)); - probabilities_out.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float), 2, 100)); - - merged_probabilities[0] >> WriteInput(); - Normalize(info, models_by_order, merged_probabilities, probabilities_out, backoffs_out); - - util::stream::ChainPosition checker(probabilities_out[0].Add()); - - merged_probabilities >> util::stream::kRecycle; - probabilities_out >> util::stream::kRecycle; - - CheckOutput(checker); - probabilities_out.Wait(); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc b/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc deleted file mode 100644 index a68e9e9..0000000 --- a/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc +++ /dev/null @@ -1,215 +0,0 @@ -/* -Usage example -1) Download from http://www.gwinnup.org/lminterp/train-params-output.tar.bz2 -2) then run perf_enum_gv -t lm.en.dev -m model-a.3.srilm -m model-b.3.srilm -m model-c.3.srilm - */ - -#include "lm/ngram_query.hh" -#include "lm/model.hh" -#include "lm/word_index.hh" -#include "lm/interpolate/enumerate_global_vocab.hh" - -#include "util/fixed_array.hh" -#include "util/usage.hh" - -#include <string> -#include <vector> -#include <iostream> -#include <fstream> -#include <map> - -#include <boost/program_options.hpp> -#include <boost/version.hpp> -#include <boost/foreach.hpp> - -#include <Eigen/Eigen> - -#include <iostream> -#include <sys/time.h> - -inline double deltaTV(const timeval& s, const timeval& e) -{ - return (e.tv_sec - s.tv_sec)*1000.0 + (e.tv_usec - s.tv_usec)/1000.0; -} - -typedef struct timeval Wall; -Wall GetWall() { - struct timeval tv; - gettimeofday(&tv, NULL); - return tv; -} - -typedef Eigen::MatrixXf FMatrix; -typedef Eigen::VectorXf FVector; - - -bool HAS_BIAS = true; - -using namespace lm::ngram; -using namespace lm; - -inline void logProb(Model * model, const std::vector<std::string>& ctx, const std::string& word) { - - // Horribly inefficient - const Vocabulary &vocab = model->GetVocabulary(); - - State nextState; //throwaway - - WordIndex word_idx = vocab.Index(word); - WordIndex context_idx[ctx.size()]; - - //reverse context - for(unsigned int i = 0; i < ctx.size(); i++) { - context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]); - } - FullScoreReturn score = model->FullScoreForgotState(context_idx, &(context_idx[ctx.size() -1]), word_idx, nextState); -} - -void set_features(const std::vector<std::string>& ctx, - const std::string& word, - const std::vector<Model *>& models, - FVector& v) { - - for (unsigned i=0; i < models.size(); ++i) - logProb(models[i], ctx, word); - -} - -//const util::FixedArray<Model *>& models) -void train_params( - const std::vector<std::vector<std::string> >& corpus, - const std::vector<std::string>& vocab, - const std::vector<Model 
*>& models) { - using namespace std; - - vector<string> context(5, "<s>"); - const int ITERATIONS = 10; - const int nlambdas = models.size(); // #models - FVector params = FVector::Zero(nlambdas); - vector<FVector> feats(vocab.size(), params); - static Wall start,stop; - - for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations - std::cout << "iteration: " << iter - << " corpus size " << corpus.size() - << std::endl; - for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus - const vector<string>& sentence = corpus[ci]; - context.resize(5); - for (unsigned t = 0; t < sentence.size(); ++t) { // words in sentence - std::cout << "sentence " << ci << " word " << t << std::endl; - start = GetWall(); - const string& ref_word_string = sentence[t]; - for (unsigned i = 0; i < vocab.size(); ++i) { // vocab - set_features(context, vocab[i], models, feats[i]); - } - stop = GetWall(); - std::cout << " time elapsed = " << deltaTV(start,stop) << std::endl; - context.push_back(ref_word_string); - } - } - } -} - -int main(int argc, char** argv) { - - std::string tuning_data; - std::vector<std::string> lms; - - try { - namespace po = boost::program_options; - po::options_description options("train-params"); - - options.add_options() - ("help,h", po::bool_switch(), "Show this help message") - ("no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature") - ("tuning_data,t", po::value<std::string>(&tuning_data), "File to tune perplexity on") - ("model,m", po::value<std::vector<std::string> >(&lms), "Language models in KenLM format to interpolate"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - - // Display help - if(argc == 1 || vm["help"].as<bool>()) { - std::cerr << options << std::endl; - return 1; - } - if (vm["no_bias_term"].as<bool>()) - HAS_BIAS = false; - lms = vm["model"].as<std::vector<std::string> >(); - tuning_data = vm["tuning_data"].as<std::string>(); - } - catch(const std::exception &e) { - - std::cerr << e.what() << std::endl; - return 1; - - } - if (lms.size() < 2) { - std::cerr << "Please specify at least two language model files with -m LM.KLM\n"; - return 1; - } - if (tuning_data.empty()) { - std::cerr << "Please specify tuning set with -t FILE.TXT\n"; - return 1; - } - - std::map<std::string, int*> vmap; - util::FixedArray<WordIndex> vm(2); - - //stuff it into the - EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(&vmap, lms.size()); - // EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(vm); - - Config cfg; - cfg.enumerate_vocab = (EnumerateVocab *) globalVocabBuilder; - - //load models - //util::FixedArray<Model *> models(lms.size()); - std::vector<Model *> models; - for(int i=0; i < lms.size(); i++) { - std::cerr << "Loading LM file: " << lms[i] << std::endl; - - //haaaack - globalVocabBuilder->SetCurModel(i); //yes this is dumb - - //models[i] = new Model(lms[i].c_str()); - Model * this_model = new Model(lms[i].c_str(), cfg); - models.push_back( this_model ); - - } - - //assemble vocabulary vector - std::vector<std::string> vocab; - std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl; - - std::pair<StringPiece,int *> me; - - for(std::map<std::string, int*>::iterator iter = vmap.begin(); iter != vmap.end(); ++iter) { - vocab.push_back(iter->first); - } - std::cerr << "Vocab vector has size: " << vocab.size() << std::endl; - - //load context sorted ngrams into vector of vectors - std::vector<std::vector<std::string> > corpus; - - std::cerr << 
"Loading context-sorted ngrams: " << tuning_data << std::endl; - std::ifstream infile(tuning_data); - - for(std::string line; std::getline(infile, line); ) { - - std::vector<std::string> words; { - - std::stringstream stream(line); - std::string word; - - while(stream >> word) { - words.push_back(word); - } - } - corpus.push_back(words); - } - - train_params(corpus, vocab, models); - - return 0; -}

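Streams feeding the BackoffManager in the deleted normalize.cc are consumed in suffix lexicographic order (SuffixLexicographicLess): n-grams are compared from the rightmost word backwards, and a proper suffix sorts before any longer n-gram it terminates. Below is a small illustrative sketch of that comparison, using plain std::vector<WordIndex> in place of NGramHeader; the names are mine, not part of kenlm.

#include <cstddef>
#include <iostream>
#include <vector>

typedef unsigned int WordIndex;

// Same comparison as the deleted SuffixLexicographicLess, written over
// index-addressed vectors instead of NGramHeader pointer ranges.
bool SuffixLess(const std::vector<WordIndex> &first,
                const std::vector<WordIndex> &second) {
  for (std::size_t f = first.size(), s = second.size(); f && s; --f, --s) {
    if (first[f - 1] < second[s - 1]) return true;
    if (first[f - 1] > second[s - 1]) return false;
  }
  return first.size() < second.size();
}

int main() {
  std::vector<WordIndex> bigram;  bigram.push_back(7);  bigram.push_back(3);
  std::vector<WordIndex> unigram; unigram.push_back(3);  // suffix of the bigram
  std::vector<WordIndex> other;   other.push_back(7);    other.push_back(4);

  std::cout << SuffixLess(unigram, bigram) << "\n";  // 1: the suffix sorts first
  std::cout << SuffixLess(bigram, other) << "\n";    // 1: last words compare 3 < 4
  std::cout << SuffixLess(other, bigram) << "\n";    // 0
  return 0;
}

This is the ordering that BackoffManager::Enter documents as required for its input and that TopMatches and the priority queue rely on.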