http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/CMakeLists.txt b/ext/kenlm/lm/interpolate/CMakeLists.txt deleted file mode 100644 index c146ab6..0000000 --- a/ext/kenlm/lm/interpolate/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -find_package(Eigen3 REQUIRED) -include_directories(${EIGEN3_INCLUDE_DIR}) - -set(KENLM_INTERPOLATE_SOURCE - backoff_reunification.cc - bounded_sequence_encoding.cc - enumerate_global_vocab.cc - merge_probabilities.cc - merge_vocab.cc - normalize.cc - pipeline.cc - split_worker.cc - tune_derivatives.cc - tune_instance.cc - universal_vocab.cc) - -add_library(kenlm_interpolate OBJECT ${KENLM_INTERPOLATE_SOURCE}) - -set(KENLM_INTERPOLATE_EXES - interpolate - perf_enum_gv - streaming_example - train_params - tune) - -set(KENLM_INTERPOLATE_DEPENDS - $<TARGET_OBJECTS:kenlm> - $<TARGET_OBJECTS:kenlm_util> - $<TARGET_OBJECTS:kenlm_common> - $<TARGET_OBJECTS:kenlm_interpolate>) - -AddExes(EXES ${KENLM_INTERPOLATE_EXES} - DEPENDS ${KENLM_INTERPOLATE_DEPENDS} - LIBRARIES ${Boost_LIBRARIES} pthread) - -if(BUILD_TESTING) - set(KENLM_INTERPOLATE_TESTS - backoff_reunification_test - bounded_sequence_encoding_test - merge_vocab_test - normalize_test - tune_derivatives_test) - - AddTests(TESTS ${KENLM_INTERPOLATE_TESTS} - DEPENDS ${KENLM_INTERPOLATE_DEPENDS} - LIBRARIES ${Boost_LIBRARIES} pthread) - - # tune_instance_test needs an extra command line parameter - KenLMAddTest(TEST tune_instance_test - DEPENDS ${KENLM_INTERPOLATE_DEPENDS} - LIBRARIES ${Boost_LIBRARIES} pthread - TEST_ARGS - ${CMAKE_CURRENT_SOURCE_DIR}/tune_instance_data/toy0.1) -endif()
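For orientation while reading the deleted interpolate sources below: this module implements offline log-linear interpolation, in which each component model contributes its log10 probability scaled by a per-model weight (lambda) and the weighted terms are summed before a later normalization pass. The short C++ sketch that follows illustrates only that combination step, under that assumption about the log10 convention; CombineLog10, the toy probabilities, and the weights are illustrative and are not part of the KenLM API.

#include <cstddef>
#include <iostream>
#include <vector>

// Weighted log-linear combination of one n-gram's probabilities:
// log10 p_I(w|h) (unnormalized) = sum_k lambda_k * log10 p_k(w|h),
// i.e. p_I(w|h) is proportional to prod_k p_k(w|h)^{lambda_k}.
double CombineLog10(const std::vector<double> &log10_probs,
                    const std::vector<double> &lambdas) {
  double sum = 0.0;
  for (std::size_t k = 0; k < log10_probs.size(); ++k)
    sum += lambdas[k] * log10_probs[k];
  return sum;  // still unnormalized; a later pass divides by Z(h)
}

int main() {
  // Two toy models scoring the same n-gram (log10 probabilities).
  std::vector<double> log10_probs;
  log10_probs.push_back(-1.2);
  log10_probs.push_back(-0.8);
  std::vector<double> lambdas;
  lambdas.push_back(0.3);
  lambdas.push_back(0.7);
  std::cout << CombineLog10(log10_probs, lambdas) << std::endl;  // prints -0.92
}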
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/Jamfile ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/Jamfile b/ext/kenlm/lm/interpolate/Jamfile deleted file mode 100644 index 411a346..0000000 --- a/ext/kenlm/lm/interpolate/Jamfile +++ /dev/null @@ -1,22 +0,0 @@ -fakelib interp : ../common//common [ glob *.cc : *_main.cc *_test.cc tune_*.cc ] : <cxxflags>-fopenmp ; - -import testing ; - -local with-eigen = [ option.get "with-eigen" ] ; -if $(with-eigen) { - fakelib tuning : tune_instance.cc tune_derivatives.cc interp ..//kenlm : <include>$(with-eigen) ; - unit-test tune_derivatives_test : tune_derivatives_test.cc tuning /top//boost_unit_test_framework : <include>$(with-eigen) ; - - obj tune_instance_test.o : tune_instance_test.cc /top//boost_unit_test_framework : <include>$(with-eigen) ; - run tune_instance_test.o tuning /top//boost_unit_test_framework : : tune_instance_data/toy0.1 ; - - exe tune : tune_main.cc tuning /top//boost_program_options : <include>$(with-eigen) ; -} - -exe interpolate : interpolate_main.cc interp /top//boost_program_options ; -exe streaming_example : ../builder//builder interp streaming_example_main.cc /top//boost_program_options ; - -unit-test normalize_test : interp normalize_test.cc /top//boost_unit_test_framework ; -unit-test backoff_reunification_test : interp backoff_reunification_test.cc /top//boost_unit_test_framework ; -unit-test bounded_sequence_encoding_test : interp bounded_sequence_encoding_test.cc /top//boost_unit_test_framework ; -run merge_vocab_test.cc interp /top//boost_unit_test_framework : : merge_test/test1 merge_test/test2 merge_test/test3 merge_test/test_bad_order merge_test/test_no_unk ; http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_matrix.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/backoff_matrix.hh b/ext/kenlm/lm/interpolate/backoff_matrix.hh deleted file mode 100644 index c7552df..0000000 --- a/ext/kenlm/lm/interpolate/backoff_matrix.hh +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef LM_INTERPOLATE_BACKOFF_MATRIX_H -#define LM_INTERPOLATE_BACKOFF_MATRIX_H - -#include <cstddef> -#include <vector> - -namespace lm { namespace interpolate { - -class BackoffMatrix { - public: - BackoffMatrix(std::size_t num_models, std::size_t max_order) - : max_order_(max_order), backing_(num_models * max_order) {} - - float &Backoff(std::size_t model, std::size_t order_minus_1) { - return backing_[model * max_order_ + order_minus_1]; - } - - float Backoff(std::size_t model, std::size_t order_minus_1) const { - return backing_[model * max_order_ + order_minus_1]; - } - - private: - const std::size_t max_order_; - std::vector<float> backing_; -}; - -}} // namespaces - -#endif // LM_INTERPOLATE_BACKOFF_MATRIX_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c 
--- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/backoff_reunification.cc b/ext/kenlm/lm/interpolate/backoff_reunification.cc deleted file mode 100644 index edf6ddc..0000000 --- a/ext/kenlm/lm/interpolate/backoff_reunification.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "lm/interpolate/backoff_reunification.hh" -#include "lm/common/model_buffer.hh" -#include "lm/common/ngram_stream.hh" -#include "lm/common/ngram.hh" -#include "lm/common/compare.hh" - -#include <cassert> - -namespace lm { -namespace interpolate { - -namespace { -class MergeWorker { -public: - MergeWorker(std::size_t order, const util::stream::ChainPosition &prob_pos, - const util::stream::ChainPosition &boff_pos) - : order_(order), prob_pos_(prob_pos), boff_pos_(boff_pos) { - // nothing - } - - void Run(const util::stream::ChainPosition &position) { - lm::NGramStream<ProbBackoff> stream(position); - - lm::NGramStream<float> prob_input(prob_pos_); - util::stream::Stream boff_input(boff_pos_); - for (; prob_input && boff_input; ++prob_input, ++boff_input, ++stream) { - std::copy(prob_input->begin(), prob_input->end(), stream->begin()); - stream->Value().prob = prob_input->Value(); - stream->Value().backoff = *reinterpret_cast<float *>(boff_input.Get()); - } - UTIL_THROW_IF2(prob_input || boff_input, - "Streams were not the same size during merging"); - stream.Poison(); - } - -private: - std::size_t order_; - util::stream::ChainPosition prob_pos_; - util::stream::ChainPosition boff_pos_; -}; -} - -// Since we are *adding* something to the output chain here, we pass in the -// chain itself so that we can safely add a new step to the chain without -// creating a deadlock situation (since creating a new ChainPosition will -// make a new input/output pair---we want that position to be created -// *here*, not before). -void ReunifyBackoff(util::stream::ChainPositions &prob_pos, - util::stream::ChainPositions &boff_pos, - util::stream::Chains &output_chains) { - assert(prob_pos.size() == boff_pos.size()); - - for (size_t i = 0; i < prob_pos.size(); ++i) - output_chains[i] >> MergeWorker(i + 1, prob_pos[i], boff_pos[i]); -} -} -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/backoff_reunification.hh b/ext/kenlm/lm/interpolate/backoff_reunification.hh deleted file mode 100644 index 327db65..0000000 --- a/ext/kenlm/lm/interpolate/backoff_reunification.hh +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_ -#define KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_ - -#include "util/stream/stream.hh" -#include "util/stream/multi_stream.hh" - -namespace lm { -namespace interpolate { - -/** - * The third pass for the offline log-linear interpolation algorithm. This - * reads **suffix-ordered** probability values (ngram-id, float) and - * **suffix-ordered** backoff values (float) and writes the merged contents - * to the output. 
- * - * @param prob_pos The chain position for each order from which to read - * the probability values - * @param boff_pos The chain position for each order from which to read - * the backoff values - * @param output_chains The output chains for each order - */ -void ReunifyBackoff(util::stream::ChainPositions &prob_pos, - util::stream::ChainPositions &boff_pos, - util::stream::Chains &output_chains); -} -} -#endif http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/backoff_reunification_test.cc b/ext/kenlm/lm/interpolate/backoff_reunification_test.cc deleted file mode 100644 index d8faf94..0000000 --- a/ext/kenlm/lm/interpolate/backoff_reunification_test.cc +++ /dev/null @@ -1,159 +0,0 @@ -#include "lm/interpolate/backoff_reunification.hh" -#include "lm/common/ngram_stream.hh" - -#define BOOST_TEST_MODULE InterpolateBackoffReunificationTest -#include <boost/test/unit_test.hpp> - -namespace lm { -namespace interpolate { - -namespace { - -// none of this input actually makes sense, all we care about is making -// sure the merging works -template <uint8_t N> -struct Gram { - WordIndex ids[N]; - float prob; - float boff; -}; - -template <uint8_t N> -struct Grams { - const static Gram<N> grams[]; -}; - -template <> -const Gram<1> Grams<1>::grams[] - = {{{0}, 0.1f, 0.1f}, {{1}, 0.4f, 0.2f}, {{2}, 0.5f, 0.1f}}; - -template <> -const Gram<2> Grams<2>::grams[] = {{{0, 0}, 0.05f, 0.05f}, - {{1, 0}, 0.05f, 0.02f}, - {{1, 1}, 0.2f, 0.04f}, - {{2, 2}, 0.2f, 0.01f}}; - -template <> -const Gram<3> Grams<3>::grams[] = {{{0, 0, 0}, 0.001f, 0.005f}, - {{1, 0, 0}, 0.001f, 0.002f}, - {{2, 0, 0}, 0.001f, 0.003f}, - {{0, 1, 0}, 0.1f, 0.008f}, - {{1, 1, 0}, 0.1f, 0.09f}, - {{1, 1, 1}, 0.2f, 0.08f}}; - -template <uint8_t N> -class WriteInput { -public: - void Run(const util::stream::ChainPosition &position) { - lm::NGramStream<float> output(position); - - for (std::size_t i = 0; i < sizeof(Grams<N>::grams) / sizeof(Gram<N>); - ++i, ++output) { - std::copy(Grams<N>::grams[i].ids, Grams<N>::grams[i].ids + N, - output->begin()); - output->Value() = Grams<N>::grams[i].prob; - } - output.Poison(); - } -}; - -template <uint8_t N> -class WriteBackoffs { -public: - void Run(const util::stream::ChainPosition &position) { - util::stream::Stream output(position); - - for (std::size_t i = 0; i < sizeof(Grams<N>::grams) / sizeof(Gram<N>); - ++i, ++output) { - *reinterpret_cast<float *>(output.Get()) = Grams<N>::grams[i].boff; - } - output.Poison(); - } -}; - -template <uint8_t N> -class CheckOutput { -public: - void Run(const util::stream::ChainPosition &position) { - lm::NGramStream<ProbBackoff> stream(position); - - std::size_t i = 0; - for (; stream; ++stream, ++i) { - std::stringstream ss; - for (WordIndex *idx = stream->begin(); idx != stream->end(); ++idx) - ss << "(" << *idx << ")"; - - UTIL_THROW_IF2( - !std::equal(stream->begin(), stream->end(), Grams<N>::grams[i].ids), - "Mismatched id in CheckOutput<" << (int)N << ">: " << ss.str()); - - UTIL_THROW_IF2(stream->Value().prob != Grams<N>::grams[i].prob, - "Mismatched probability in CheckOutput<" - << (int)N << ">, got " << stream->Value().prob - << ", expected " << Grams<N>::grams[i].prob); - - 
UTIL_THROW_IF2(stream->Value().backoff != Grams<N>::grams[i].boff, - "Mismatched backoff in CheckOutput<" - << (int)N << ">, got " << stream->Value().backoff - << ", expected " << Grams<N>::grams[i].boff); - } - UTIL_THROW_IF2(i != sizeof(Grams<N>::grams) / sizeof(Gram<N>), - "Did not get correct number of " - << (int)N << "-grams: expected " - << sizeof(Grams<N>::grams) / sizeof(Gram<N>) - << ", got " << i); - } -}; -} - -BOOST_AUTO_TEST_CASE(BackoffReunificationTest) { - util::stream::ChainConfig config; - config.total_memory = 100; - config.block_count = 1; - - util::stream::Chains prob_chains(3); - config.entry_size = NGram<float>::TotalSize(1); - prob_chains.push_back(config); - prob_chains.back() >> WriteInput<1>(); - - config.entry_size = NGram<float>::TotalSize(2); - prob_chains.push_back(config); - prob_chains.back() >> WriteInput<2>(); - - config.entry_size = NGram<float>::TotalSize(3); - prob_chains.push_back(config); - prob_chains.back() >> WriteInput<3>(); - - util::stream::Chains boff_chains(3); - config.entry_size = sizeof(float); - boff_chains.push_back(config); - boff_chains.back() >> WriteBackoffs<1>(); - - boff_chains.push_back(config); - boff_chains.back() >> WriteBackoffs<2>(); - - boff_chains.push_back(config); - boff_chains.back() >> WriteBackoffs<3>(); - - util::stream::ChainPositions prob_pos(prob_chains); - util::stream::ChainPositions boff_pos(boff_chains); - - util::stream::Chains output_chains(3); - for (std::size_t i = 0; i < 3; ++i) { - config.entry_size = NGram<ProbBackoff>::TotalSize(i + 1); - output_chains.push_back(config); - } - - ReunifyBackoff(prob_pos, boff_pos, output_chains); - - output_chains[0] >> CheckOutput<1>(); - output_chains[1] >> CheckOutput<2>(); - output_chains[2] >> CheckOutput<3>(); - - prob_chains >> util::stream::kRecycle; - boff_chains >> util::stream::kRecycle; - - output_chains.Wait(); -} -} -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc deleted file mode 100644 index aca8ed7..0000000 --- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc +++ /dev/null @@ -1,36 +0,0 @@ -#include "lm/interpolate/bounded_sequence_encoding.hh" - -#include <algorithm> - -namespace lm { namespace interpolate { - -BoundedSequenceEncoding::BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end) - : entries_(bound_end - bound_begin) { - std::size_t full = 0; - Entry entry; - entry.shift = 0; - for (const unsigned char *i = bound_begin; i != bound_end; ++i) { - uint8_t length; - if (*i <= 1) { - length = 0; - } else { - length = sizeof(unsigned int) * 8 - __builtin_clz((unsigned int)*i); - } - entry.mask = (1ULL << length) - 1ULL; - if (entry.shift + length > 64) { - entry.shift = 0; - entry.next = true; - ++full; - } else { - entry.next = false; - } - entries_.push_back(entry); - entry.shift += length; - } - byte_length_ = full * sizeof(uint64_t) + (entry.shift + 7) / 8; - first_copy_ = std::min<std::size_t>(byte_length_, sizeof(uint64_t)); - // Size of last uint64_t. Zero if empty, otherwise [1,8] depending on mod. - overhang_ = byte_length_ == 0 ? 
0 : ((byte_length_ - 1) % 8 + 1); -} - -}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh deleted file mode 100644 index 84dd63a..0000000 --- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H -#define LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H - -/* Encodes fixed-length sequences of integers with known bounds on each entry. - * This is used to encode how far each model has backed off. - * TODO: make this class efficient. Bit-level packing or multiply by bound and - * add. - */ - -#include "util/exception.hh" -#include "util/fixed_array.hh" - -#if BYTE_ORDER != LITTLE_ENDIAN -#warning The interpolation code assumes little endian for now. -#endif - -#include <algorithm> -#include <cstring> - -namespace lm { -namespace interpolate { - -class BoundedSequenceEncoding { - public: - // Encode [0, bound_begin[0]) x [0, bound_begin[1]) x [0, bound_begin[2]) x ... x [0, *(bound_end - 1)) for entries in the sequence - BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end); - - std::size_t Entries() const { return entries_.size(); } - - std::size_t EncodedLength() const { return byte_length_; } - - void Encode(const unsigned char *from, void *to_void) const { - uint8_t *to = static_cast<uint8_t*>(to_void); - uint64_t cur = 0; - for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++from) { - if (UTIL_UNLIKELY(i->next)) { - std::memcpy(to, &cur, sizeof(uint64_t)); - to += sizeof(uint64_t); - cur = 0; - } - cur |= static_cast<uint64_t>(*from) << i->shift; - } - memcpy(to, &cur, overhang_); - } - - void Decode(const void *from_void, unsigned char *to) const { - const uint8_t *from = static_cast<const uint8_t*>(from_void); - uint64_t cur = 0; - memcpy(&cur, from, first_copy_); - for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++to) { - if (UTIL_UNLIKELY(i->next)) { - from += sizeof(uint64_t); - cur = 0; - std::memcpy(&cur, from, - std::min<std::size_t>(sizeof(uint64_t), static_cast<const uint8_t*>(from_void) + byte_length_ - from)); - } - *to = (cur >> i->shift) & i->mask; - } - } - - private: - struct Entry { - bool next; - uint8_t shift; - uint64_t mask; - }; - util::FixedArray<Entry> entries_; - std::size_t byte_length_; - std::size_t first_copy_; - std::size_t overhang_; -}; - - -}} // namespaces - -#endif // LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc b/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc deleted file mode 100644 index 2c4bbd9..0000000 --- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc +++ /dev/null @@ -1,86 
+0,0 @@ -#include "lm/interpolate/bounded_sequence_encoding.hh" - -#include "util/scoped.hh" - -#define BOOST_TEST_MODULE BoundedSequenceEncodingTest -#include <boost/test/unit_test.hpp> - -namespace lm { -namespace interpolate { -namespace { - -void ExhaustiveTest(unsigned char *bound_begin, unsigned char *bound_end) { - BoundedSequenceEncoding enc(bound_begin, bound_end); - util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength())); - std::vector<unsigned char> values(bound_end - bound_begin), - out(bound_end - bound_begin); - while (true) { - enc.Encode(&values[0], backing.get()); - enc.Decode(backing.get(), &out[0]); - for (std::size_t i = 0; i != values.size(); ++i) { - BOOST_CHECK_EQUAL(values[i], out[i]); - } - for (std::size_t i = 0;; ++i) { - if (i == values.size()) return; - ++values[i]; - if (values[i] < bound_begin[i]) break; - values[i] = 0; - } - } -} - -void CheckEncodeDecode(unsigned char *bounds, unsigned char *input, - unsigned char *output, std::size_t len) { - BoundedSequenceEncoding encoder(bounds, bounds + len); - util::scoped_malloc backing(util::MallocOrThrow(encoder.EncodedLength())); - - encoder.Encode(input, backing.get()); - encoder.Decode(backing.get(), output); - - for (std::size_t i = 0; i < len; ++i) { - BOOST_CHECK_EQUAL(input[i], output[i]); - } -} - -BOOST_AUTO_TEST_CASE(Exhaustive) { - unsigned char bounds[] = {5, 2, 3, 9, 7, 20, 8}; - ExhaustiveTest(bounds, bounds + sizeof(bounds) / sizeof(unsigned char)); -} - -BOOST_AUTO_TEST_CASE(LessThan64) { - unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 3}; - unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 2}; - unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0}; - - std::size_t len = sizeof(bounds) / sizeof(unsigned char); - assert(sizeof(input) / sizeof(unsigned char) == len); - assert(sizeof(output) / sizeof(unsigned char) == len); - - CheckEncodeDecode(bounds, input, output, len); -} - -BOOST_AUTO_TEST_CASE(Exactly64) { - unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255}; - unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16}; - unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0}; - - std::size_t len = sizeof(bounds) / sizeof(unsigned char); - assert(sizeof(input) / sizeof(unsigned char) == len); - assert(sizeof(output) / sizeof(unsigned char) == len); - - CheckEncodeDecode(bounds, input, output, len); -} - -BOOST_AUTO_TEST_CASE(MoreThan64) { - unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255, 255}; - unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16, 137}; - unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; - - std::size_t len = sizeof(bounds) / sizeof(unsigned char); - assert(sizeof(input) / sizeof(unsigned char) == len); - assert(sizeof(output) / sizeof(unsigned char) == len); - - CheckEncodeDecode(bounds, input, output, len); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc b/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc deleted file mode 100644 index 26f63c8..0000000 --- a/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc +++ /dev/null @@ -1,48 +0,0 @@ - -#include "lm/interpolate/enumerate_global_vocab.hh" - 
-#include <iostream> -#include <map> - -namespace lm { -//constructor - - EnumerateGlobalVocab::EnumerateGlobalVocab(std::map<std::string, int*> * vm, int nm) { - - vmap = vm; - num_models = nm; - cur_model = 0; //blah - cnt = 0; - std::cerr << "Vocab Builder with models: " << nm << std::endl; - } - - void EnumerateGlobalVocab::Add(WordIndex index, const StringPiece &str) { - - std::string st = str.as_string(); - - //check for existence of key - std::map<std::string, int*>::iterator itr = vmap->find(st); - - //put stuff - if(itr != vmap->end()) { - std::cerr << "Vocab exist: " << str << " M: " << cur_model << " I:" << index << std::endl; - itr->second[cur_model] = index; - } - //new key - else { - - //create model index map for this vocab word - //init to 0, 0 is UNK - int * indices = new int[num_models]; - memset(indices, 0, (sizeof(int)*num_models)); //this still legit? - - indices[cur_model] = index; - (*vmap)[st] = indices; - std::cerr << cnt << ":Vocab add: " << str << " M: " << cur_model << " I:" << index << std::endl; - cnt++; - } - - - } - -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh b/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh deleted file mode 100644 index c37d649..0000000 --- a/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef LM_ENUMERATE_GLOBAL_VOCAB_H -#define LM_ENUMERATE_GLOBAL_VOCAB_H - -#include "lm/enumerate_vocab.hh" -#include <map> - -/* Use this to create a global vocab across models for use when - * calculating lambdas for interpolation. Or other stuff. - */ -namespace lm { - - class EnumerateGlobalVocab : EnumerateVocab { - - public: - - //yes, ugly... - std::map<std::string, int*> * vmap; - int num_models; - int cur_model; - int cnt; //stupid - - ~EnumerateGlobalVocab() {} - - void Add(WordIndex index, const StringPiece & str); - - EnumerateGlobalVocab(std::map<std::string, int*> *, int); - - void SetCurModel(int i) { cur_model = i; } - - protected: - EnumerateGlobalVocab() {} - - }; - -} //namespace lm - -#endif // LM_ENUMERATE_GLOBAL_VOCAB_H - http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/interpolate_info.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/interpolate_info.hh b/ext/kenlm/lm/interpolate/interpolate_info.hh deleted file mode 100644 index ebecd92..0000000 --- a/ext/kenlm/lm/interpolate/interpolate_info.hh +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef KENLM_INTERPOLATE_INTERPOLATE_INFO_H -#define KENLM_INTERPOLATE_INTERPOLATE_INFO_H - -#include <cstddef> -#include <vector> -#include <stdint.h> - -namespace lm { -namespace interpolate { - -/** - * Stores relevant info for interpolating several language models, for use - * during the three-pass offline log-linear interpolation algorithm. 
- */ -struct InterpolateInfo { - /** - * @return the number of models being interpolated - */ - std::size_t Models() const { - return orders.size(); - } - - /** - * The lambda (interpolation weight) for each model. - */ - std::vector<float> lambdas; - - /** - * The maximum ngram order for each model. - */ - std::vector<uint8_t> orders; -}; -} -} -#endif http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/interpolate_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/interpolate_main.cc b/ext/kenlm/lm/interpolate/interpolate_main.cc deleted file mode 100644 index a99b62d..0000000 --- a/ext/kenlm/lm/interpolate/interpolate_main.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "lm/common/model_buffer.hh" -#include "lm/common/size_option.hh" -#include "lm/interpolate/pipeline.hh" -#include "util/fixed_array.hh" -#include "util/usage.hh" - -#include <boost/program_options.hpp> - -#include <iostream> -#include <vector> - -int main(int argc, char *argv[]) { - lm::interpolate::Config config; - std::vector<std::string> input_models; - namespace po = boost::program_options; - po::options_description options("Log-linear interpolation options"); - options.add_options() - ("lambda,w", po::value<std::vector<float> >(&config.lambdas)->multitoken()->required(), "Interpolation weights") - ("model,m", po::value<std::vector<std::string> >(&input_models)->multitoken()->required(), "Models to interpolate") - ("temp_prefix,T", po::value<std::string>(&config.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix") - ("memory,S", lm::SizeOption(config.sort.total_memory, util::GuessPhysicalMemory() ? "50%" : "1G"), "Sorting memory") - ("sort_block", lm::SizeOption(config.sort.buffer_size, "64M"), "Block size"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - po::notify(vm); - - if (config.lambdas.size() != input_models.size()) { - std::cerr << "Number of models " << input_models.size() << " should match the number of weights" << config.lambdas.size() << "." 
<< std::endl; - return 1; - } - - util::FixedArray<lm::ModelBuffer> models(input_models.size()); - for (std::size_t i = 0; i < input_models.size(); ++i) { - models.push_back(input_models[i]); - } - lm::interpolate::Pipeline(models, config, 1); -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_probabilities.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_probabilities.cc b/ext/kenlm/lm/interpolate/merge_probabilities.cc deleted file mode 100644 index b6c949f..0000000 --- a/ext/kenlm/lm/interpolate/merge_probabilities.cc +++ /dev/null @@ -1,285 +0,0 @@ -#include "lm/interpolate/merge_probabilities.hh" -#include "lm/common/ngram_stream.hh" -#include "lm/interpolate/bounded_sequence_encoding.hh" -#include "lm/interpolate/interpolate_info.hh" - -#include <algorithm> -#include <limits> -#include <numeric> - -namespace lm { -namespace interpolate { - -/** - * Helper to generate the BoundedSequenceEncoding used for writing the - * from values. - */ -BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order) { - util::FixedArray<uint8_t> max_orders(info.orders.size()); - for (std::size_t i = 0; i < info.orders.size(); ++i) { - max_orders.push_back(std::min(order, info.orders[i])); - } - return BoundedSequenceEncoding(max_orders.begin(), max_orders.end()); -} - -namespace { - -/** - * A simple wrapper class that holds information needed to read and write - * the ngrams of a particular order. This class has the memory needed to - * buffer the data needed for the recursive process of computing the - * probabilities and "from" values for each component model. - * - * "From" values indicate, for each model, what order (as an index, so -1) - * was backed off to in order to arrive at a probability. For example, if a - * 5-gram model (order index 4) backed off twice, we would write a 2. 
- */ -class NGramHandler { -public: - NGramHandler(uint8_t order, const InterpolateInfo &ifo, - util::FixedArray<util::stream::ChainPositions> &models_by_order) - : info(ifo), - encoder(MakeEncoder(info, order)), - out_record(order, encoder.EncodedLength()) { - std::size_t count_has_order = 0; - for (std::size_t i = 0; i < models_by_order.size(); ++i) { - count_has_order += (models_by_order[i].size() >= order); - } - inputs_.Init(count_has_order); - for (std::size_t i = 0; i < models_by_order.size(); ++i) { - if (models_by_order[i].size() < order) - continue; - inputs_.push_back(models_by_order[i][order - 1]); - if (inputs_.back()) { - active_.resize(active_.size() + 1); - active_.back().model = i; - active_.back().stream = &inputs_.back(); - } - } - - // have to init outside since NGramStreams doesn't forward to - // GenericStreams ctor given a ChainPositions - - probs.Init(info.Models()); - from.Init(info.Models()); - for (std::size_t i = 0; i < info.Models(); ++i) { - probs.push_back(0.0); - from.push_back(0); - } - } - - struct StreamIndex { - NGramStream<ProbBackoff> *stream; - NGramStream<ProbBackoff> &Stream() { return *stream; } - std::size_t model; - }; - - std::size_t ActiveSize() const { - return active_.size(); - } - - /** - * @return the input stream for a particular model that corresponds to - * this ngram order - */ - StreamIndex &operator[](std::size_t idx) { - return active_[idx]; - } - - void erase(std::size_t idx) { - active_.erase(active_.begin() + idx); - } - - const InterpolateInfo &info; - BoundedSequenceEncoding encoder; - PartialProbGamma out_record; - util::FixedArray<float> probs; - util::FixedArray<uint8_t> from; - -private: - std::vector<StreamIndex> active_; - NGramStreams<ProbBackoff> inputs_; -}; - -/** - * A collection of NGramHandlers. - */ -class NGramHandlers : public util::FixedArray<NGramHandler> { -public: - explicit NGramHandlers(std::size_t num) - : util::FixedArray<NGramHandler>(num) { - } - - void push_back( - std::size_t order, const InterpolateInfo &info, - util::FixedArray<util::stream::ChainPositions> &models_by_order) { - new (end()) NGramHandler(order, info, models_by_order); - Constructed(); - } -}; - -/** - * The recursive helper function that computes probability and "from" - * values for all ngrams matching a particular suffix. - * - * The current order can be computed as the suffix length + 1. Note that - * the suffix could be empty (suffix_begin == suffix_end == NULL), in which - * case we are handling unigrams with the UNK token as the fallback - * probability. 
- * - * @param handlers The full collection of handlers - * @param suffix_begin A start iterator for the suffix - * @param suffix_end An end iterator for the suffix - * @param fallback_probs The probabilities of this ngram if we need to - * back off (that is, the probability of the suffix) - * @param fallback_from The order that the corresponding fallback - * probability in the fallback_probs is from - * @param combined_fallback interpolated fallback_probs - * @param outputs The output streams, one for each order - */ -void HandleSuffix(NGramHandlers &handlers, WordIndex *suffix_begin, - WordIndex *suffix_end, - const util::FixedArray<float> &fallback_probs, - const util::FixedArray<uint8_t> &fallback_from, - float combined_fallback, - util::stream::Streams &outputs) { - uint8_t order = std::distance(suffix_begin, suffix_end) + 1; - if (order > outputs.size()) return; - - util::stream::Stream &output = outputs[order - 1]; - NGramHandler &handler = handlers[order - 1]; - - while (true) { - // find the next smallest ngram which matches our suffix - // TODO: priority queue driven. - WordIndex *minimum = NULL; - for (std::size_t i = 0; i < handler.ActiveSize(); ++i) { - if (!std::equal(suffix_begin, suffix_end, handler[i].Stream()->begin() + 1)) - continue; - - // if we either haven't set a minimum yet or this one is smaller than - // the minimum we found before, replace it - WordIndex *last = handler[i].Stream()->begin(); - if (!minimum || *last < *minimum) { minimum = handler[i].Stream()->begin(); } - } - - // no more ngrams of this order match our suffix, so we're done - if (!minimum) return; - - handler.out_record.ReBase(output.Get()); - std::copy(minimum, minimum + order, handler.out_record.begin()); - - // Default case is having backed off. - std::copy(fallback_probs.begin(), fallback_probs.end(), handler.probs.begin()); - std::copy(fallback_from.begin(), fallback_from.end(), handler.from.begin()); - - for (std::size_t i = 0; i < handler.ActiveSize();) { - if (std::equal(handler.out_record.begin(), handler.out_record.end(), - handler[i].Stream()->begin())) { - handler.probs[handler[i].model] = handler.info.lambdas[handler[i].model] * handler[i].Stream()->Value().prob; - handler.from[handler[i].model] = order - 1; - if (++handler[i].Stream()) { - ++i; - } else { - handler.erase(i); - } - } else { - ++i; - } - } - handler.out_record.Prob() = std::accumulate(handler.probs.begin(), handler.probs.end(), 0.0); - handler.out_record.LowerProb() = combined_fallback; - handler.encoder.Encode(handler.from.begin(), - handler.out_record.FromBegin()); - - // we've handled this particular ngram, so now recurse to the higher - // order using the current ngram as the suffix - HandleSuffix(handlers, handler.out_record.begin(), handler.out_record.end(), - handler.probs, handler.from, handler.out_record.Prob(), outputs); - // consume the output - ++output; - } -} - -/** - * Kicks off the recursion for computing the probabilities and "from" - * values for each ngram order. We begin by handling the UNK token that - * should be at the front of each of the unigram input streams. This is - * then output to the stream and it is used as the fallback for handling - * our unigram case, the unigram used as the fallback for the bigram case, - * etc. 
- */ -void HandleNGrams(NGramHandlers &handlers, util::stream::Streams &outputs) { - PartialProbGamma unk_record(1, 0); - // First: populate the unk probabilities by reading the first unigram - // from each stream - util::FixedArray<float> unk_probs(handlers[0].info.Models()); - - // start by populating the ngram id from the first stream - lm::NGram<ProbBackoff> ngram = *handlers[0][0].Stream(); - unk_record.ReBase(outputs[0].Get()); - std::copy(ngram.begin(), ngram.end(), unk_record.begin()); - unk_record.Prob() = 0; - - // then populate the probabilities into unk_probs while "multiply" the - // model probabilities together into the unk record - // - // note that from doesn't need to be set for unigrams - assert(handlers[0].ActiveSize() == handlers[0].info.Models()); - for (std::size_t i = 0; i < handlers[0].info.Models();) { - ngram = *handlers[0][i].Stream(); - unk_probs.push_back(handlers[0].info.lambdas[i] * ngram.Value().prob); - unk_record.Prob() += unk_probs[i]; - assert(*ngram.begin() == kUNK); - if (++handlers[0][i].Stream()) { - ++i; - } else { - handlers[0].erase(i); - } - } - float unk_combined = unk_record.Prob(); - unk_record.LowerProb() = unk_combined; - // flush the unk output record - ++outputs[0]; - - // Then, begin outputting everything in lexicographic order: first we'll - // get the unigram then the first bigram with that context, then the - // first trigram with that bigram context, etc., until we exhaust all of - // the ngrams, then all of the (n-1)grams, etc. - // - // This function is the "root" of this recursive process. - util::FixedArray<uint8_t> unk_from(handlers[0].info.Models()); - for (std::size_t i = 0; i < handlers[0].info.Models(); ++i) { - unk_from.push_back(0); - } - - // the two nulls are to encode that our "fallback" word is the "0-gram" - // case, e.g. we "backed off" to UNK - // TODO: stop generating vocab ids and LowerProb for unigrams. - HandleSuffix(handlers, NULL, NULL, unk_probs, unk_from, unk_combined, outputs); - - // Verify we reached the end. And poison! 
- for (std::size_t i = 0; i < handlers.size(); ++i) { - UTIL_THROW_IF2(handlers[i].ActiveSize(), - "MergeProbabilities did not exhaust all ngram streams"); - outputs[i].Poison(); - } -} -} - -void MergeProbabilities( - const InterpolateInfo &info, - util::FixedArray<util::stream::ChainPositions> &models_by_order, - util::stream::Chains &output_chains) { - NGramHandlers handlers(output_chains.size()); - for (std::size_t i = 0; i < output_chains.size(); ++i) { - handlers.push_back(i + 1, info, models_by_order); - } - - util::stream::ChainPositions output_pos(output_chains); - util::stream::Streams outputs(output_pos); - - HandleNGrams(handlers, outputs); -} -} -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_probabilities.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_probabilities.hh b/ext/kenlm/lm/interpolate/merge_probabilities.hh deleted file mode 100644 index 59b8046..0000000 --- a/ext/kenlm/lm/interpolate/merge_probabilities.hh +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H -#define LM_INTERPOLATE_MERGE_PROBABILITIES_H - -#include "lm/common/ngram.hh" -#include "lm/interpolate/bounded_sequence_encoding.hh" -#include "util/fixed_array.hh" -#include "util/stream/multi_stream.hh" - -#include <stdint.h> - -namespace lm { -namespace interpolate { - -struct InterpolateInfo; - -/** - * Make the encoding of backoff values for a given order. This stores values - * in [PartialProbGamma::FromBegin(), PartialProbGamma::FromEnd()) - */ -BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order); - -/** - * The first pass for the offline log-linear interpolation algorithm. This - * reads K **suffix-ordered** streams for each model, for each order, of - * ngram records (ngram-id, prob, backoff). It further assumes that the - * ngram-ids have been unified over all of the stream inputs. - * - * Its output is records of (ngram-id, prob-prod, backoff-level, - * backoff-level, ...) where the backoff-levels (of which there are K) are - * the context length (0 for unigrams) that the corresponding model had to - * back off to in order to obtain a probability for that ngram-id. Each of - * these streams is terminated with a record whose ngram-id is all - * maximum-integers for simplicity in implementation here. - * - * @param models An array of length N (max_i N_i) containing at - * the ChainPositions for the streams for order (i + 1). - * @param output_chains The output chains for each order (of length K) - */ -void MergeProbabilities( - const InterpolateInfo &info, - util::FixedArray<util::stream::ChainPositions> &models_by_order, - util::stream::Chains &output_chains); - -/** - * This class represents the output payload for this pass, which consists - * of an ngram-id, a probability, and then a vector of orders from which - * each of the component models backed off to for this ngram, encoded - * using the BoundedSequenceEncoding class. 
- */ -class PartialProbGamma : public lm::NGramHeader { -public: - PartialProbGamma(std::size_t order, std::size_t backoff_bytes) - : lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) { - // nothing - } - - std::size_t TotalSize() const { - return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_; - } - - // TODO: cache bounded sequence encoding in the pipeline? - static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) { - return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength(); - } - - float &Prob() { return Pay().prob; } - float Prob() const { return Pay().prob; } - - float &LowerProb() { return Pay().lower_prob; } - float LowerProb() const { return Pay().lower_prob; } - - const uint8_t *FromBegin() const { return Pay().from; } - uint8_t *FromBegin() { return Pay().from; } - -private: - struct After { - // Note that backoff_and_normalize assumes this comes first. - float prob; - float lower_prob; - uint8_t from[]; - }; - const After &Pay() const { return *reinterpret_cast<const After *>(end()); } - After &Pay() { return *reinterpret_cast<After*>(end()); } - - std::size_t backoff_bytes_; -}; - -}} // namespaces -#endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test1 ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test1 b/ext/kenlm/lm/interpolate/merge_test/test1 deleted file mode 100644 index 08d114d..0000000 Binary files a/ext/kenlm/lm/interpolate/merge_test/test1 and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test2 ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test2 b/ext/kenlm/lm/interpolate/merge_test/test2 deleted file mode 100644 index fd3a380..0000000 Binary files a/ext/kenlm/lm/interpolate/merge_test/test2 and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test3 ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test3 b/ext/kenlm/lm/interpolate/merge_test/test3 deleted file mode 100644 index 6c89d7f..0000000 Binary files a/ext/kenlm/lm/interpolate/merge_test/test3 and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test_bad_order ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test_bad_order b/ext/kenlm/lm/interpolate/merge_test/test_bad_order deleted file mode 100644 index 
d50a5e8..0000000 Binary files a/ext/kenlm/lm/interpolate/merge_test/test_bad_order and /dev/null differ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test_no_unk ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_test/test_no_unk b/ext/kenlm/lm/interpolate/merge_test/test_no_unk deleted file mode 100644 index fbcf12d..0000000 --- a/ext/kenlm/lm/interpolate/merge_test/test_no_unk +++ /dev/null @@ -1 +0,0 @@ -toto http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_vocab.cc b/ext/kenlm/lm/interpolate/merge_vocab.cc deleted file mode 100644 index 3e84389..0000000 --- a/ext/kenlm/lm/interpolate/merge_vocab.cc +++ /dev/null @@ -1,131 +0,0 @@ -#include "lm/interpolate/merge_vocab.hh" - -#include "lm/enumerate_vocab.hh" -#include "lm/interpolate/universal_vocab.hh" -#include "lm/lm_exception.hh" -#include "lm/vocab.hh" -#include "util/file_piece.hh" - -#include <queue> -#include <string> -#include <iostream> -#include <vector> - -namespace lm { -namespace interpolate { -namespace { - -class VocabFileReader { - public: - explicit VocabFileReader(const int fd, size_t model_num, uint64_t offset = 0); - - VocabFileReader &operator++(); - operator bool() const { return !eof_; } - uint64_t operator*() const { return Value(); } - - uint64_t Value() const { return hash_value_; } - size_t ModelNum() const { return model_num_; } - WordIndex CurrentIndex() const { return current_index_; } - - StringPiece Word() const { return word_; } - - private: - uint64_t hash_value_; - WordIndex current_index_; - bool eof_; - size_t model_num_; - StringPiece word_; - util::FilePiece file_piece_; -}; - -VocabFileReader::VocabFileReader(const int fd, const size_t model_num, uint64_t offset) : - hash_value_(0), - current_index_(0), - eof_(false), - model_num_(model_num), - file_piece_(fd) { - word_ = file_piece_.ReadLine('\0'); - UTIL_THROW_IF(word_ != "<unk>", - FormatLoadException, - "Vocabulary words are in the wrong place."); - // setup to initial value - ++*this; -} - -VocabFileReader &VocabFileReader::operator++() { - try { - word_ = file_piece_.ReadLine('\0'); - } catch(util::EndOfFileException &e) { - eof_ = true; - return *this; - } - uint64_t prev_hash_value = hash_value_; - hash_value_ = ngram::detail::HashForVocab(word_.data(), word_.size()); - - // hash values should be monotonically increasing - UTIL_THROW_IF(hash_value_ < prev_hash_value, FormatLoadException, - ": word index not monotonically increasing." 
- << " model_num: " << model_num_ - << " prev hash: " << prev_hash_value - << " new hash: " << hash_value_); - - ++current_index_; - return *this; -} - -class CompareFiles { -public: - bool operator()(const VocabFileReader* x, - const VocabFileReader* y) - { return x->Value() > y->Value(); } -}; - -class Readers : public util::FixedArray<VocabFileReader> { - public: - Readers(std::size_t number) : util::FixedArray<VocabFileReader>(number) {} - void push_back(int fd, std::size_t i) { - new(end()) VocabFileReader(fd, i); - Constructed(); - } -}; - -} // namespace - -WordIndex MergeVocab(util::FixedArray<util::scoped_fd> &files, UniversalVocab &vocab, EnumerateVocab &enumerate) { - typedef std::priority_queue<VocabFileReader*, std::vector<VocabFileReader*>, CompareFiles> HeapType; - HeapType heap; - Readers readers(files.size()); - for (size_t i = 0; i < files.size(); ++i) { - readers.push_back(files[i].release(), i); - heap.push(&readers.back()); - // initialize first index to 0 for <unk> - vocab.InsertUniversalIdx(i, 0, 0); - } - - uint64_t prev_hash_value = 0; - // global_index starts with <unk> which is 0 - WordIndex global_index = 0; - - enumerate.Add(0, "<unk>"); - while (!heap.empty()) { - VocabFileReader* top_vocab_file = heap.top(); - if (top_vocab_file->Value() != prev_hash_value) { - enumerate.Add(++global_index, top_vocab_file->Word()); - } - vocab.InsertUniversalIdx(top_vocab_file->ModelNum(), - top_vocab_file->CurrentIndex(), - global_index); - - prev_hash_value = top_vocab_file->Value(); - - heap.pop(); - if (++(*top_vocab_file)) { - heap.push(top_vocab_file); - } - } - return global_index + 1; -} - -} // namespace interpolate -} // namespace lm - http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_vocab.hh b/ext/kenlm/lm/interpolate/merge_vocab.hh deleted file mode 100644 index cc74d33..0000000 --- a/ext/kenlm/lm/interpolate/merge_vocab.hh +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef LM_INTERPOLATE_MERGE_VOCAB_H -#define LM_INTERPOLATE_MERGE_VOCAB_H - -#include "lm/word_index.hh" -#include "util/file.hh" -#include "util/fixed_array.hh" - -namespace lm { - -class EnumerateVocab; - -namespace interpolate { - -class UniversalVocab; - -// Takes ownership of vocab_files. -// The combined vocabulary is enumerated with enumerate. -// Returns the size of the combined vocabulary. 
-WordIndex MergeVocab(util::FixedArray<util::scoped_fd> &vocab_files, UniversalVocab &vocab, EnumerateVocab &enumerate); - -}} // namespaces - -#endif // LM_INTERPOLATE_MERGE_VOCAB_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/merge_vocab_test.cc b/ext/kenlm/lm/interpolate/merge_vocab_test.cc deleted file mode 100644 index 6df25b2..0000000 --- a/ext/kenlm/lm/interpolate/merge_vocab_test.cc +++ /dev/null @@ -1,126 +0,0 @@ -#define BOOST_TEST_MODULE InterpolateMergeVocabTest -#include <boost/test/unit_test.hpp> - -#include "lm/enumerate_vocab.hh" -#include "lm/interpolate/merge_vocab.hh" -#include "lm/interpolate/universal_vocab.hh" -#include "lm/lm_exception.hh" -#include "lm/vocab.hh" -#include "lm/word_index.hh" -#include "util/file.hh" -#include "util/file_piece.hh" - -#include <cstring> - -namespace lm { -namespace interpolate { -namespace { - -// Stupid bjam permutes the command line arguments randomly. -class TestFiles { - public: - TestFiles() { - char **argv = boost::unit_test::framework::master_test_suite().argv; - int argc = boost::unit_test::framework::master_test_suite().argc; - BOOST_REQUIRE_EQUAL(6, argc); - for (int i = 1; i < argc; ++i) { - EndsWithAssign(argv[i], "test1", test[0]); - EndsWithAssign(argv[i], "test2", test[1]); - EndsWithAssign(argv[i], "test3", test[2]); - EndsWithAssign(argv[i], "no_unk", no_unk); - EndsWithAssign(argv[i], "bad_order", bad_order); - } - } - - void EndsWithAssign(char *arg, StringPiece value, util::scoped_fd &to) { - StringPiece str(arg); - if (str.size() < value.size()) return; - if (std::memcmp(str.data() + str.size() - value.size(), value.data(), value.size())) return; - to.reset(util::OpenReadOrThrow(arg)); - } - - util::scoped_fd test[3], no_unk, bad_order; -}; - -class DoNothingEnumerate : public EnumerateVocab { - public: - void Add(WordIndex, const StringPiece &) {} -}; - -BOOST_AUTO_TEST_CASE(MergeVocabTest) { - TestFiles files; - - util::FixedArray<util::scoped_fd> used_files(3); - used_files.push_back(files.test[0].release()); - used_files.push_back(files.test[1].release()); - used_files.push_back(files.test[2].release()); - - std::vector<lm::WordIndex> model_max_idx; - model_max_idx.push_back(10); - model_max_idx.push_back(10); - model_max_idx.push_back(10); - - util::scoped_fd combined(util::MakeTemp("temporary")); - - UniversalVocab universal_vocab(model_max_idx); - { - ngram::ImmediateWriteWordsWrapper writer(NULL, combined.get(), 0); - MergeVocab(used_files, universal_vocab, writer); - } - - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 0), 0); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 0), 0); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 0), 0); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 1), 1); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 1), 2); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 1), 8); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 5), 11); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 4); - BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 3), 10); - - util::FilePiece f(combined.release()); - BOOST_CHECK_EQUAL("<unk>", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("a", 
f.ReadLine('\0')); - BOOST_CHECK_EQUAL("is this", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("this a", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("first cut", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("this", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("a first", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("cut", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("is", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("i", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("secd", f.ReadLine('\0')); - BOOST_CHECK_EQUAL("first", f.ReadLine('\0')); - BOOST_CHECK_THROW(f.ReadLine('\0'), util::EndOfFileException); -} - -BOOST_AUTO_TEST_CASE(MergeVocabNoUnkTest) { - TestFiles files; - util::FixedArray<util::scoped_fd> used_files(1); - used_files.push_back(files.no_unk.release()); - - std::vector<lm::WordIndex> model_max_idx; - model_max_idx.push_back(10); - - UniversalVocab universal_vocab(model_max_idx); - DoNothingEnumerate nothing; - BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException); -} - -BOOST_AUTO_TEST_CASE(MergeVocabWrongOrderTest) { - TestFiles files; - - util::FixedArray<util::scoped_fd> used_files(2); - used_files.push_back(files.test[0].release()); - used_files.push_back(files.bad_order.release()); - - std::vector<lm::WordIndex> model_max_idx; - model_max_idx.push_back(10); - model_max_idx.push_back(10); - - lm::interpolate::UniversalVocab universal_vocab(model_max_idx); - DoNothingEnumerate nothing; - BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/normalize.cc b/ext/kenlm/lm/interpolate/normalize.cc deleted file mode 100644 index f683f10..0000000 --- a/ext/kenlm/lm/interpolate/normalize.cc +++ /dev/null @@ -1,384 +0,0 @@ -#include "lm/interpolate/normalize.hh" - -#include "lm/common/compare.hh" -#include "lm/common/ngram_stream.hh" -#include "lm/interpolate/backoff_matrix.hh" -#include "lm/interpolate/bounded_sequence_encoding.hh" -#include "lm/interpolate/interpolate_info.hh" -#include "lm/interpolate/merge_probabilities.hh" -#include "lm/weights.hh" -#include "lm/word_index.hh" -#include "util/fixed_array.hh" -#include "util/scoped.hh" -#include "util/stream/stream.hh" -#include "util/stream/rewindable_stream.hh" - -#include <functional> -#include <queue> -#include <vector> - -namespace lm { namespace interpolate { -namespace { - -struct SuffixLexicographicLess : public std::binary_function<NGramHeader, NGramHeader, bool> { - bool operator()(const NGramHeader first, const NGramHeader second) const { - for (const WordIndex *f = first.end() - 1, *s = second.end() - 1; f >= first.begin() && s >= second.begin(); --f, --s) { - if (*f < *s) return true; - if (*f > *s) return false; - } - return first.size() < second.size(); - } -}; - -class BackoffQueueEntry { - public: - BackoffQueueEntry(float &entry, const util::stream::ChainPosition &position) - : entry_(entry), stream_(position) { - entry_ = 0.0; - } - - operator bool() const { return stream_; } - - NGramHeader operator*() const { return *stream_; } - const NGramHeader *operator->() const { return &*stream_; } - - void Enter() { - entry_ = stream_->Value().backoff; - } - - BackoffQueueEntry 
&Next() { - entry_ = 0.0; - ++stream_; - return *this; - } - - private: - float &entry_; - NGramStream<ProbBackoff> stream_; -}; - -struct PtrGreater : public std::binary_function<const BackoffQueueEntry *, const BackoffQueueEntry *, bool> { - bool operator()(const BackoffQueueEntry *first, const BackoffQueueEntry *second) const { - return SuffixLexicographicLess()(**second, **first); - } -}; - -class EntryOwner : public util::FixedArray<BackoffQueueEntry> { - public: - void push_back(float &entry, const util::stream::ChainPosition &position) { - new (end()) BackoffQueueEntry(entry, position); - Constructed(); - } -}; - -std::size_t MaxOrder(const util::FixedArray<util::stream::ChainPositions> &model) { - std::size_t ret = 0; - for (const util::stream::ChainPositions *m = model.begin(); m != model.end(); ++m) { - ret = std::max(ret, m->size()); - } - return ret; -} - -class BackoffManager { - public: - explicit BackoffManager(const util::FixedArray<util::stream::ChainPositions> &models) - : entered_(MaxOrder(models)), matrix_(models.size(), MaxOrder(models)), skip_write_(MaxOrder(models)) { - std::size_t total = 0; - for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) { - total += m->size(); - } - for (std::size_t i = 0; i < MaxOrder(models); ++i) { - entered_.push_back(models.size()); - } - owner_.Init(total); - for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) { - for (const util::stream::ChainPosition *j = m->begin(); j != m->end(); ++j) { - owner_.push_back(matrix_.Backoff(m - models.begin(), j - m->begin()), *j); - if (owner_.back()) { - queue_.push(&owner_.back()); - } - } - } - } - - void SetupSkip(std::size_t order, util::stream::Stream &stream) { - skip_write_[order - 2] = &stream; - } - - // Move up the backoffs for the given n-gram. The n-grams must be provided - // in suffix lexicographic order. - void Enter(const NGramHeader &to) { - // Check that we exited properly. - for (std::size_t i = to.Order() - 1; i < entered_.size(); ++i) { - assert(entered_[i].empty()); - } - SuffixLexicographicLess less; - while (!queue_.empty() && less(**queue_.top(), to)) - SkipRecord(); - while (TopMatches(to)) { - BackoffQueueEntry *matches = queue_.top(); - entered_[to.Order() - 1].push_back(matches); - matches->Enter(); - queue_.pop(); - } - } - - void Exit(std::size_t order_minus_1) { - for (BackoffQueueEntry **i = entered_[order_minus_1].begin(); i != entered_[order_minus_1].end(); ++i) { - if ((*i)->Next()) - queue_.push(*i); - } - entered_[order_minus_1].clear(); - } - - float Get(std::size_t model, std::size_t order_minus_1) const { - return matrix_.Backoff(model, order_minus_1); - } - - void Finish() { - while (!queue_.empty()) - SkipRecord(); - } - - private: - void SkipRecord() { - BackoffQueueEntry *top = queue_.top(); - queue_.pop(); - // Is this the last instance of the n-gram? - if (!TopMatches(**top)) { - // An n-gram is being skipped. Called once per skipped n-gram, - // regardless of how many models it comes from. 
- *reinterpret_cast<float*>(skip_write_[(*top)->Order() - 1]->Get()) = 0.0; - ++*skip_write_[(*top)->Order() - 1]; - } - if (top->Next()) - queue_.push(top); - } - - bool TopMatches(const NGramHeader &header) const { - return !queue_.empty() && (*queue_.top())->Order() == header.Order() && std::equal(header.begin(), header.end(), (*queue_.top())->begin()); - } - - EntryOwner owner_; - std::priority_queue<BackoffQueueEntry*, std::vector<BackoffQueueEntry*>, PtrGreater> queue_; - - // Indexed by order then just all the matching models. - util::FixedArray<util::FixedArray<BackoffQueueEntry*> > entered_; - - std::size_t order_; - - BackoffMatrix matrix_; - - std::vector<util::stream::Stream*> skip_write_; -}; - -typedef long double Accum; - -// Handles n-grams of the same order, using recursion to call another instance -// for higher orders. -class Recurse { - public: - Recurse( - const InterpolateInfo &info, // Must stay alive the entire time. - std::size_t order, - const util::stream::ChainPosition &merged_probs, - const util::stream::ChainPosition &prob_out, - const util::stream::ChainPosition &backoff_out, - BackoffManager &backoffs, - Recurse *higher) // higher is null for the highest order. - : order_(order), - encoding_(MakeEncoder(info, order)), - input_(merged_probs, PartialProbGamma(order, encoding_.EncodedLength())), - prob_out_(prob_out), - backoff_out_(backoff_out), - backoffs_(backoffs), - lambdas_(&*info.lambdas.begin()), - higher_(higher), - decoded_backoffs_(info.Models()), - extended_context_(order - 1) { - // This is only for bigrams and above. Summing unigrams is a much easier case. - assert(order >= 2); - } - - // context = w_1^{n-1} - // z_lower = Z(w_2^{n-1}) - // Input: - // Merged probabilities without backoff applied in input_. - // Backoffs via backoffs_. - // Calculates: - // Z(w_1^{n-1}): intermediate only. - // p_I(x | w_1^{n-1}) for all x: w_1^{n-1}x exists: Written to prob_out_. - // b_I(w_1^{n-1}): Written to backoff_out_. - void SameContext(const NGramHeader &context, Accum z_lower) { - assert(context.size() == order_ - 1); - backoffs_.Enter(context); - prob_out_.Mark(); - - // This is the backoff term that applies when one assumes everything backs off: - // \prod_i b_i(w_1^{n-1})^{\lambda_i}. - Accum backoff_once = 0.0; - for (std::size_t m = 0; m < decoded_backoffs_.size(); ++m) { - backoff_once += lambdas_[m] * backoffs_.Get(m, order_ - 2); - } - - Accum z_delta = 0.0; - std::size_t count = 0; - for (; input_ && std::equal(context.begin(), context.end(), input_->begin()); ++input_, ++prob_out_, ++count) { - // Apply backoffs to probabilities. - // TODO: change bounded sequence encoding to have an iterator for decoding instead of doing a copy here. - encoding_.Decode(input_->FromBegin(), &*decoded_backoffs_.begin()); - for (std::size_t m = 0; m < NumModels(); ++m) { - // Apply the backoffs as instructed for model m. - float accumulated = 0.0; - // Change backoffs for [order it backed off to, order - 1) except - // with 0-indexing. There is still the potential to charge backoff - // for order - 1, which is done later. The backoffs charged here - // are b_m(w_{n-1}^{n-1}) ... b_m(w_2^{n-1}) - for (unsigned char backed_to = decoded_backoffs_[m]; backed_to < order_ - 2; ++backed_to) { - accumulated += backoffs_.Get(m, backed_to); - } - float lambda = lambdas_[m]; - // Lower p(x | w_2^{n-1}) gets all the backoffs except the highest. 
- input_->LowerProb() += accumulated * lambda; - // Charge the backoff b(w_1^{n-1}) if applicable, but only to attain p(x | w_1^{n-1}) - if (decoded_backoffs_[m] < order_ - 1) { - accumulated += backoffs_.Get(m, order_ - 2); - } - input_->Prob() += accumulated * lambda; - } - // TODO: better precision/less operations here. - z_delta += pow(10.0, input_->Prob()) - pow(10.0, input_->LowerProb() + backoff_once); - - // Write unnormalized probability record. - std::copy(input_->begin(), input_->end(), reinterpret_cast<WordIndex*>(prob_out_.Get())); - ProbWrite() = input_->Prob(); - } - // TODO numerical precision. - Accum z = log10(pow(10.0, z_lower + backoff_once) + z_delta); - - // Normalize. - prob_out_.Rewind(); - for (std::size_t i = 0; i < count; ++i, ++prob_out_) { - ProbWrite() -= z; - } - // This allows the stream to release data. - prob_out_.Mark(); - - // Output backoff. - *reinterpret_cast<float*>(backoff_out_.Get()) = z_lower + backoff_once - z; - ++backoff_out_; - - if (higher_.get()) - higher_->ExtendContext(context, z); - - backoffs_.Exit(order_ - 2); - } - - // Call is given a context and z(context). - // Evaluates y context x for all y,x. - void ExtendContext(const NGramHeader &middle, Accum z_lower) { - assert(middle.size() == order_ - 2); - // Copy because the input will advance. TODO avoid this copy by sharing amongst classes. - std::copy(middle.begin(), middle.end(), extended_context_.begin() + 1); - while (input_ && std::equal(middle.begin(), middle.end(), input_->begin() + 1)) { - *extended_context_.begin() = *input_->begin(); - SameContext(NGramHeader(&*extended_context_.begin(), order_ - 1), z_lower); - } - } - - void Finish() { - assert(!input_); - prob_out_.Poison(); - backoff_out_.Poison(); - if (higher_.get()) - higher_->Finish(); - } - - // The BackoffManager class also injects backoffs when it skips ahead e.g. b(</s>) = 1 - util::stream::Stream &BackoffStream() { return backoff_out_; } - - private: - // Write the probability to the correct place in prob_out_. Should use a proxy but currently incompatible with RewindableStream. - float &ProbWrite() { - return *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_out_.Get()) + order_ * sizeof(WordIndex)); - } - - std::size_t NumModels() const { return decoded_backoffs_.size(); } - - const std::size_t order_; - - const BoundedSequenceEncoding encoding_; - - ProxyStream<PartialProbGamma> input_; - util::stream::RewindableStream prob_out_; - util::stream::Stream backoff_out_; - - BackoffManager &backoffs_; - const float *const lambdas_; - - // Higher order instance of this same class. - util::scoped_ptr<Recurse> higher_; - - // Temporary in SameContext. - std::vector<unsigned char> decoded_backoffs_; - // Temporary in ExtendContext. - std::vector<WordIndex> extended_context_; -}; - -class Thread { - public: - Thread(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) - : info_(info), models_by_order_(models_by_order), prob_out_(prob_out), backoff_out_(backoff_out) {} - - void Run(const util::stream::ChainPositions &merged_probabilities) { - // Unigrams do not have enocded backoff info. 
- ProxyStream<PartialProbGamma> in(merged_probabilities[0], PartialProbGamma(1, 0)); - util::stream::RewindableStream prob_write(prob_out_[0]); - Accum z = 0.0; - prob_write.Mark(); - WordIndex count = 0; - for (; in; ++in, ++prob_write, ++count) { - // Note assumption that probabilitity comes first - memcpy(prob_write.Get(), in.Get(), sizeof(WordIndex) + sizeof(float)); - z += pow(10.0, in->Prob()); - } - // TODO HACK TODO: lmplz outputs p(<s>) = 1 to get q to compute nicely. That will always result in 1.0 more than it should be. - z -= 1.0; - float log_z = log10(z); - prob_write.Rewind(); - // Normalize unigram probabilities. - for (WordIndex i = 0; i < count; ++i, ++prob_write) { - *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_write.Get()) + sizeof(WordIndex)) -= log_z; - } - prob_write.Poison(); - - // Now setup the higher orders. - util::scoped_ptr<Recurse> higher_order; - BackoffManager backoffs(models_by_order_); - std::size_t max_order = merged_probabilities.size(); - for (std::size_t order = max_order; order >= 2; --order) { - higher_order.reset(new Recurse(info_, order, merged_probabilities[order - 1], prob_out_[order - 1], backoff_out_[order - 2], backoffs, higher_order.release())); - backoffs.SetupSkip(order, higher_order->BackoffStream()); - } - if (max_order > 1) { - higher_order->ExtendContext(NGramHeader(NULL, 0), log_z); - higher_order->Finish(); - } - } - - private: - const InterpolateInfo info_; - util::FixedArray<util::stream::ChainPositions> &models_by_order_; - util::stream::ChainPositions prob_out_; - util::stream::ChainPositions backoff_out_; -}; - -} // namespace - -void Normalize(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &merged_probabilities, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) { - assert(prob_out.size() == backoff_out.size() + 1); - // Arbitrarily put the thread on the merged_probabilities Chains. - merged_probabilities >> Thread(info, models_by_order, prob_out, backoff_out); -} - -}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/normalize.hh b/ext/kenlm/lm/interpolate/normalize.hh deleted file mode 100644 index dd26b25..0000000 --- a/ext/kenlm/lm/interpolate/normalize.hh +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef LM_INTERPOLATE_NORMALIZE_H -#define LM_INTERPOLATE_NORMALIZE_H - -#include "util/fixed_array.hh" - -/* Pass 2: - * - Multiply backoff weights by the backed off probabilities from pass 1. - * - Compute the normalization factor Z. - * - Send Z to the next highest order. - * - Rewind and divide by Z. - */ - -namespace util { namespace stream { -class ChainPositions; -class Chains; -}} // namespaces - -namespace lm { namespace interpolate { - -struct InterpolateInfo; - -void Normalize( - const InterpolateInfo &info, - // Input full models for backoffs. Assumes that renumbering has been done. Suffix order. - util::FixedArray<util::stream::ChainPositions> &models_by_order, - // Input PartialProbGamma from MergeProbabilities. Context order. - util::stream::Chains &merged_probabilities, - // Output NGram<float> with normalized probabilities. Context order. 
- util::stream::Chains &probabilities_out, - // Output bare floats with backoffs. Note backoffs.size() == order - 1. Suffix order. - util::stream::Chains &backoffs_out); - -}} // namespaces - -#endif // LM_INTERPOLATE_NORMALIZE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/normalize_test.cc b/ext/kenlm/lm/interpolate/normalize_test.cc deleted file mode 100644 index fe220f3..0000000 --- a/ext/kenlm/lm/interpolate/normalize_test.cc +++ /dev/null @@ -1,86 +0,0 @@ -#include "lm/interpolate/normalize.hh" - -#include "lm/interpolate/interpolate_info.hh" -#include "lm/interpolate/merge_probabilities.hh" -#include "lm/common/ngram_stream.hh" -#include "util/stream/chain.hh" -#include "util/stream/multi_stream.hh" - -#define BOOST_TEST_MODULE NormalizeTest -#include <boost/test/unit_test.hpp> - -namespace lm { namespace interpolate { namespace { - -// log without backoff -const float kInputs[] = {-0.3, 1.2, -9.8, 4.0, -7.0, 0.0}; - -class WriteInput { - public: - WriteInput() {} - void Run(const util::stream::ChainPosition &to) { - util::stream::Stream out(to); - for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float); ++i, ++out) { - memcpy(out.Get(), &i, sizeof(WordIndex)); - memcpy((uint8_t*)out.Get() + sizeof(WordIndex), &kInputs[i], sizeof(float)); - } - out.Poison(); - } -}; - -void CheckOutput(const util::stream::ChainPosition &from) { - NGramStream<float> in(from); - float sum = 0.0; - for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float) - 1 /* <s> at the end */; ++i) { - sum += pow(10.0, kInputs[i]); - } - sum = log10(sum); - BOOST_REQUIRE(in); - BOOST_CHECK_CLOSE(kInputs[0] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[1] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[2] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[3] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[4] - sum, in->Value(), 0.0001); - BOOST_REQUIRE(++in); - BOOST_CHECK_CLOSE(kInputs[5] - sum, in->Value(), 0.0001); - BOOST_CHECK(!++in); -} - -BOOST_AUTO_TEST_CASE(Unigrams) { - InterpolateInfo info; - info.lambdas.push_back(2.0); - info.lambdas.push_back(-0.1); - info.orders.push_back(1); - info.orders.push_back(1); - - BOOST_CHECK_EQUAL(0, MakeEncoder(info, 1).EncodedLength()); - - // No backoffs. 
- util::stream::Chains blank(0); - util::FixedArray<util::stream::ChainPositions> models_by_order(2); - models_by_order.push_back(blank); - models_by_order.push_back(blank); - - util::stream::Chains merged_probabilities(1); - util::stream::Chains probabilities_out(1); - util::stream::Chains backoffs_out(0); - - merged_probabilities.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float) + sizeof(float), 2, 24)); - probabilities_out.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float), 2, 100)); - - merged_probabilities[0] >> WriteInput(); - Normalize(info, models_by_order, merged_probabilities, probabilities_out, backoffs_out); - - util::stream::ChainPosition checker(probabilities_out[0].Add()); - - merged_probabilities >> util::stream::kRecycle; - probabilities_out >> util::stream::kRecycle; - - CheckOutput(checker); - probabilities_out.Wait(); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc b/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc deleted file mode 100644 index a68e9e9..0000000 --- a/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc +++ /dev/null @@ -1,215 +0,0 @@ -/* -Usage example -1) Download from http://www.gwinnup.org/lminterp/train-params-output.tar.bz2 -2) then run perf_enum_gv -t lm.en.dev -m model-a.3.srilm -m model-b.3.srilm -m model-c.3.srilm - */ - -#include "lm/ngram_query.hh" -#include "lm/model.hh" -#include "lm/word_index.hh" -#include "lm/interpolate/enumerate_global_vocab.hh" - -#include "util/fixed_array.hh" -#include "util/usage.hh" - -#include <string> -#include <vector> -#include <iostream> -#include <fstream> -#include <map> - -#include <boost/program_options.hpp> -#include <boost/version.hpp> -#include <boost/foreach.hpp> - -#include <Eigen/Eigen> - -#include <iostream> -#include <sys/time.h> - -inline double deltaTV(const timeval& s, const timeval& e) -{ - return (e.tv_sec - s.tv_sec)*1000.0 + (e.tv_usec - s.tv_usec)/1000.0; -} - -typedef struct timeval Wall; -Wall GetWall() { - struct timeval tv; - gettimeofday(&tv, NULL); - return tv; -} - -typedef Eigen::MatrixXf FMatrix; -typedef Eigen::VectorXf FVector; - - -bool HAS_BIAS = true; - -using namespace lm::ngram; -using namespace lm; - -inline void logProb(Model * model, const std::vector<std::string>& ctx, const std::string& word) { - - // Horribly inefficient - const Vocabulary &vocab = model->GetVocabulary(); - - State nextState; //throwaway - - WordIndex word_idx = vocab.Index(word); - WordIndex context_idx[ctx.size()]; - - //reverse context - for(unsigned int i = 0; i < ctx.size(); i++) { - context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]); - } - FullScoreReturn score = model->FullScoreForgotState(context_idx, &(context_idx[ctx.size() -1]), word_idx, nextState); -} - -void set_features(const std::vector<std::string>& ctx, - const std::string& word, - const std::vector<Model *>& models, - FVector& v) { - - for (unsigned i=0; i < models.size(); ++i) - logProb(models[i], ctx, word); - -} - -//const util::FixedArray<Model *>& models) -void train_params( - const std::vector<std::vector<std::string> >& corpus, - const std::vector<std::string>& vocab, - const std::vector<Model 
*>& models) { - using namespace std; - - vector<string> context(5, "<s>"); - const int ITERATIONS = 10; - const int nlambdas = models.size(); // #models - FVector params = FVector::Zero(nlambdas); - vector<FVector> feats(vocab.size(), params); - static Wall start,stop; - - for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations - std::cout << "iteration: " << iter - << " corpus size " << corpus.size() - << std::endl; - for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus - const vector<string>& sentence = corpus[ci]; - context.resize(5); - for (unsigned t = 0; t < sentence.size(); ++t) { // words in sentence - std::cout << "sentence " << ci << " word " << t << std::endl; - start = GetWall(); - const string& ref_word_string = sentence[t]; - for (unsigned i = 0; i < vocab.size(); ++i) { // vocab - set_features(context, vocab[i], models, feats[i]); - } - stop = GetWall(); - std::cout << " time elapsed = " << deltaTV(start,stop) << std::endl; - context.push_back(ref_word_string); - } - } - } -} - -int main(int argc, char** argv) { - - std::string tuning_data; - std::vector<std::string> lms; - - try { - namespace po = boost::program_options; - po::options_description options("train-params"); - - options.add_options() - ("help,h", po::bool_switch(), "Show this help message") - ("no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature") - ("tuning_data,t", po::value<std::string>(&tuning_data), "File to tune perplexity on") - ("model,m", po::value<std::vector<std::string> >(&lms), "Language models in KenLM format to interpolate"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, options), vm); - - // Display help - if(argc == 1 || vm["help"].as<bool>()) { - std::cerr << options << std::endl; - return 1; - } - if (vm["no_bias_term"].as<bool>()) - HAS_BIAS = false; - lms = vm["model"].as<std::vector<std::string> >(); - tuning_data = vm["tuning_data"].as<std::string>(); - } - catch(const std::exception &e) { - - std::cerr << e.what() << std::endl; - return 1; - - } - if (lms.size() < 2) { - std::cerr << "Please specify at least two language model files with -m LM.KLM\n"; - return 1; - } - if (tuning_data.empty()) { - std::cerr << "Please specify tuning set with -t FILE.TXT\n"; - return 1; - } - - std::map<std::string, int*> vmap; - util::FixedArray<WordIndex> vm(2); - - //stuff it into the - EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(&vmap, lms.size()); - // EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(vm); - - Config cfg; - cfg.enumerate_vocab = (EnumerateVocab *) globalVocabBuilder; - - //load models - //util::FixedArray<Model *> models(lms.size()); - std::vector<Model *> models; - for(int i=0; i < lms.size(); i++) { - std::cerr << "Loading LM file: " << lms[i] << std::endl; - - //haaaack - globalVocabBuilder->SetCurModel(i); //yes this is dumb - - //models[i] = new Model(lms[i].c_str()); - Model * this_model = new Model(lms[i].c_str(), cfg); - models.push_back( this_model ); - - } - - //assemble vocabulary vector - std::vector<std::string> vocab; - std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl; - - std::pair<StringPiece,int *> me; - - for(std::map<std::string, int*>::iterator iter = vmap.begin(); iter != vmap.end(); ++iter) { - vocab.push_back(iter->first); - } - std::cerr << "Vocab vector has size: " << vocab.size() << std::endl; - - //load context sorted ngrams into vector of vectors - std::vector<std::vector<std::string> > corpus; - - std::cerr << 
"Loading context-sorted ngrams: " << tuning_data << std::endl; - std::ifstream infile(tuning_data); - - for(std::string line; std::getline(infile, line); ) { - - std::vector<std::string> words; { - - std::stringstream stream(line); - std::string word; - - while(stream >> word) { - words.push_back(word); - } - } - corpus.push_back(words); - } - - train_params(corpus, vocab, models); - - return 0; -}

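Streams feeding the BackoffManager in the deleted normalize.cc are consumed in suffix lexicographic order (SuffixLexicographicLess): n-grams are compared from the rightmost word backwards, and a proper suffix sorts before any longer n-gram it terminates. Below is a small illustrative sketch of that comparison, using plain std::vector<WordIndex> in place of NGramHeader; the names are mine, not part of kenlm.

#include <cstddef>
#include <iostream>
#include <vector>

typedef unsigned int WordIndex;

// Same comparison as the deleted SuffixLexicographicLess, written over
// index-addressed vectors instead of NGramHeader pointer ranges.
bool SuffixLess(const std::vector<WordIndex> &first,
                const std::vector<WordIndex> &second) {
  for (std::size_t f = first.size(), s = second.size(); f && s; --f, --s) {
    if (first[f - 1] < second[s - 1]) return true;
    if (first[f - 1] > second[s - 1]) return false;
  }
  return first.size() < second.size();
}

int main() {
  std::vector<WordIndex> bigram;  bigram.push_back(7);  bigram.push_back(3);
  std::vector<WordIndex> unigram; unigram.push_back(3);  // suffix of the bigram
  std::vector<WordIndex> other;   other.push_back(7);    other.push_back(4);

  std::cout << SuffixLess(unigram, bigram) << "\n";  // 1: the suffix sorts first
  std::cout << SuffixLess(bigram, other) << "\n";    // 1: last words compare 3 < 4
  std::cout << SuffixLess(other, bigram) << "\n";    // 0
  return 0;
}

This is the ordering that BackoffManager::Enter documents as required for its input and that TopMatches and the priority queue rely on.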