http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/print.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/common/print.cc b/ext/kenlm/lm/common/print.cc deleted file mode 100644 index 518b62f..0000000 --- a/ext/kenlm/lm/common/print.cc +++ /dev/null @@ -1,62 +0,0 @@ -#include "lm/common/print.hh" - -#include "lm/common/ngram_stream.hh" -#include "util/file_stream.hh" -#include "util/file.hh" -#include "util/mmap.hh" -#include "util/scoped.hh" - -#include <sstream> -#include <cstring> - -namespace lm { - -VocabReconstitute::VocabReconstitute(int fd) { - uint64_t size = util::SizeOrThrow(fd); - util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_); - const char *const start = static_cast<const char*>(memory_.get()); - const char *i; - for (i = start; i != start + size; i += strlen(i) + 1) { - map_.push_back(i); - } - // Last one for LookupPiece. 
- map_.push_back(i); -} - -namespace { -template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FileStream &out) { - out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin()); - for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) { - out << ' ' << vocab.Lookup(*i); - } -} -} // namespace - -void PrintARPA::Run(const util::stream::ChainPositions &positions) { - VocabReconstitute vocab(vocab_fd_); - util::FileStream out(out_fd_); - out << "\\data\\\n"; - for (size_t i = 0; i < positions.size(); ++i) { - out << "ngram " << (i+1) << '=' << counts_[i] << '\n'; - } - out << '\n'; - - for (unsigned order = 1; order < positions.size(); ++order) { - out << "\\" << order << "-grams:" << '\n'; - for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) { - PrintLead(vocab, stream, out); - out << '\t' << stream->Value().backoff << '\n'; - } - out << '\n'; - } - - out << "\\" << positions.size() << "-grams:" << '\n'; - for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) { - PrintLead(vocab, stream, out); - out << '\n'; - } - out << '\n'; - out << "\\end\\\n"; -} - -} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/print.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/common/print.hh b/ext/kenlm/lm/common/print.hh deleted file mode 100644 index 6aa08b3..0000000 --- a/ext/kenlm/lm/common/print.hh +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef LM_COMMON_PRINT_H -#define LM_COMMON_PRINT_H - -#include "lm/word_index.hh" -#include "util/mmap.hh" -#include "util/string_piece.hh" - -#include <cassert> -#include <vector> - -namespace util { namespace stream { class ChainPositions; }} - -// Warning: PrintARPA routines read all unigrams before all bigrams before all -// trigrams etc. So if other parts of the chain move jointly, you'll have to -// buffer. - -namespace lm { - -class VocabReconstitute { - public: - // fd must be alive for life of this object; does not take ownership. - explicit VocabReconstitute(int fd); - - const char *Lookup(WordIndex index) const { - assert(index < map_.size() - 1); - return map_[index]; - } - - StringPiece LookupPiece(WordIndex index) const { - return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]); - } - - std::size_t Size() const { - // There's an extra entry to support StringPiece lengths. - return map_.size() - 1; - } - - private: - util::scoped_memory memory_; - std::vector<const char*> map_; -}; - -class PrintARPA { - public: - // Does not take ownership of vocab_fd or out_fd. 
- explicit PrintARPA(int vocab_fd, int out_fd, const std::vector<uint64_t> &counts) - : vocab_fd_(vocab_fd), out_fd_(out_fd), counts_(counts) {} - - void Run(const util::stream::ChainPositions &positions); - - private: - int vocab_fd_; - int out_fd_; - std::vector<uint64_t> counts_; -}; - -} // namespace lm -#endif // LM_COMMON_PRINT_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/renumber.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/common/renumber.cc b/ext/kenlm/lm/common/renumber.cc deleted file mode 100644 index 0632a14..0000000 --- a/ext/kenlm/lm/common/renumber.cc +++ /dev/null @@ -1,17 +0,0 @@ -#include "lm/common/renumber.hh" -#include "lm/common/ngram.hh" - -#include "util/stream/stream.hh" - -namespace lm { - -void Renumber::Run(const util::stream::ChainPosition &position) { - for (util::stream::Stream stream(position); stream; ++stream) { - NGramHeader gram(stream.Get(), order_); - for (WordIndex *w = gram.begin(); w != gram.end(); ++w) { - *w = new_numbers_[*w]; - } - } -} - -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/renumber.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/common/renumber.hh b/ext/kenlm/lm/common/renumber.hh deleted file mode 100644 index ca25c4d..0000000 --- a/ext/kenlm/lm/common/renumber.hh +++ /dev/null @@ -1,30 +0,0 @@ -/* Map vocab ids. This is useful to merge independently collected counts or - * change the vocab ids to the order used by the trie. 
- */ -#ifndef LM_COMMON_RENUMBER_H -#define LM_COMMON_RENUMBER_H - -#include "lm/word_index.hh" - -#include <cstddef> - -namespace util { namespace stream { class ChainPosition; }} - -namespace lm { - -class Renumber { - public: - // Assumes the array is large enough to map all words and stays alive while - // the thread is active. - Renumber(const WordIndex *new_numbers, std::size_t order) - : new_numbers_(new_numbers), order_(order) {} - - void Run(const util::stream::ChainPosition &position); - - private: - const WordIndex *new_numbers_; - std::size_t order_; -}; - -} // namespace lm -#endif // LM_COMMON_RENUMBER_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/size_option.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/common/size_option.cc b/ext/kenlm/lm/common/size_option.cc deleted file mode 100644 index 46a920e..0000000 --- a/ext/kenlm/lm/common/size_option.cc +++ /dev/null @@ -1,24 +0,0 @@ -#include <boost/program_options.hpp> -#include "util/usage.hh" - -namespace lm { - -namespace { -class SizeNotify { - public: - explicit SizeNotify(std::size_t &out) : behind_(out) {} - - void operator()(const std::string &from) { - behind_ = util::ParseSize(from); - } - - private: - std::size_t &behind_; -}; -} - -boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) { - return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value); -} - -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/size_option.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 
0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/common/size_option.hh b/ext/kenlm/lm/common/size_option.hh deleted file mode 100644 index d3b8e33..0000000 --- a/ext/kenlm/lm/common/size_option.hh +++ /dev/null @@ -1,11 +0,0 @@ -#include <boost/program_options.hpp> - -#include <cstddef> -#include <string> - -namespace lm { - -// Create a boost program option for data sizes. This parses sizes like 1T and 10k. -boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value); - -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/special.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/common/special.hh b/ext/kenlm/lm/common/special.hh deleted file mode 100644 index 0677cd7..0000000 --- a/ext/kenlm/lm/common/special.hh +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef LM_COMMON_SPECIAL_H -#define LM_COMMON_SPECIAL_H - -#include "lm/word_index.hh" - -namespace lm { - -class SpecialVocab { - public: - SpecialVocab(WordIndex bos, WordIndex eos) : bos_(bos), eos_(eos) {} - - bool IsSpecial(WordIndex word) const { - return word == kUNK || word == bos_ || word == eos_; - } - - WordIndex UNK() const { return kUNK; } - WordIndex BOS() const { return bos_; } - WordIndex EOS() const { return eos_; } - - private: - WordIndex bos_; - WordIndex eos_; -}; - -} // namespace lm - -#endif // LM_COMMON_SPECIAL_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/config.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ 
b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/config.cc b/ext/kenlm/lm/config.cc
deleted file mode 100644
index 6c695ed..0000000
--- a/ext/kenlm/lm/config.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-#include "lm/config.hh"
-
-#include <iostream>
-
-namespace lm {
-namespace ngram {
-
-// Default configuration.  The initializers are listed in the same order as
-// the members are declared in config.hh; keep the two in step.
-Config::Config() :
-  show_progress(true),
-  messages(&std::cerr),
-  enumerate_vocab(NULL),
-  unknown_missing(COMPLAIN),
-  sentence_marker_missing(THROW_UP),
-  positive_log_probability(THROW_UP),
-  unknown_missing_logprob(-100.0),
-  probing_multiplier(1.5),
-  building_memory(1073741824ULL), // 1 GB
-  temporary_directory_prefix(""),
-  arpa_complain(ALL),
-  write_mmap(NULL),
-  write_method(WRITE_AFTER),
-  include_vocab(true),
-  rest_function(REST_MAX),
-  prob_bits(8),
-  backoff_bits(8),
-  pointer_bhiksha_bits(22),
-  load_method(util::POPULATE_OR_READ) {}
-
-} // namespace ngram
-} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/config.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/config.hh b/ext/kenlm/lm/config.hh
deleted file mode 100644
index 21b9e7e..0000000
--- a/ext/kenlm/lm/config.hh
+++ /dev/null
@@ -1,124 +0,0 @@
-#ifndef LM_CONFIG_H
-#define LM_CONFIG_H
-
-#include "lm/lm_exception.hh"
-#include "util/mmap.hh"
-
-#include <iosfwd>
-#include <string>
-#include <vector>
-
-/* Configuration for ngram model. Separate header to reduce pollution. */
-
-namespace lm {
-
-class EnumerateVocab;
-
-namespace ngram {
-
-struct Config {
-  // EFFECTIVE FOR BOTH ARPA AND BINARY READS
-
-  // (default true) print progress bar to messages
-  bool show_progress;
-
-  // Where to log messages including the progress bar. Set to NULL for
-  // silence.
-  std::ostream *messages;
-
-  // Stream to draw the progress bar on: messages when show_progress is set,
-  // otherwise a null pointer.
-  std::ostream *ProgressMessages() const {
-    return show_progress ? messages : 0;
-  }
-
-  // This will be called with every string in the vocabulary by the
-  // constructor; it need only exist for the lifetime of the constructor.
-  // See enumerate_vocab.hh for more detail. Config does not take ownership;
-  // just delete/let it go out of scope after the constructor exits.
-  EnumerateVocab *enumerate_vocab;
-
-
-  // ONLY EFFECTIVE WHEN READING ARPA
-
-  // What to do when <unk> isn't in the provided model.
-  WarningAction unknown_missing;
-  // What to do when <s> or </s> is missing from the model.
-  // If THROW_UP, the exception will be of type util::SpecialWordMissingException.
-  WarningAction sentence_marker_missing;
-
-  // What to do with a positive log probability. For COMPLAIN and SILENT, map
-  // to 0.
-  WarningAction positive_log_probability;
-
-  // The probability to substitute for <unk> if it's missing from the model.
-  // No effect if the model has <unk> or unknown_missing == THROW_UP.
-  float unknown_missing_logprob;
-
-  // Size multiplier for probing hash table. Must be > 1. Space is linear in
-  // this. Time is probing_multiplier / (probing_multiplier - 1). No effect
-  // for sorted variant.
-  // If you find yourself setting this to a low number, consider using the
-  // TrieModel which has lower memory consumption.
-  float probing_multiplier;
-
-  // Amount of memory to use for building. The actual memory usage will be
-  // higher since this just sets sort buffer size. Only applies to trie
-  // models.
-  std::size_t building_memory;
-
-  // Template for temporary directory appropriate for passing to mkdtemp.
-  // The characters XXXXXX are appended before passing to mkdtemp. Only
-  // applies to trie. If empty, defaults to write_mmap. If that's NULL,
-  // defaults to input file name.
-  std::string temporary_directory_prefix;
-
-  // Level of complaining to do when loading from ARPA instead of binary format.
-  enum ARPALoadComplain {ALL, EXPENSIVE, NONE};
-  ARPALoadComplain arpa_complain;
-
-  // While loading an ARPA file, also write out this binary format file. Set
-  // to NULL to disable.
-  const char *write_mmap;
-
-  enum WriteMethod {
-    WRITE_MMAP, // Map the file directly.
-    WRITE_AFTER // Write after we're done.
-  };
-  WriteMethod write_method;
-
-  // Include the vocab in the binary file? Only effective if write_mmap != NULL.
-  bool include_vocab;
-
-
-  // Left rest options. Only used when the model includes rest costs.
-  enum RestFunction {
-    REST_MAX, // Maximum of any score to the left
-    REST_LOWER, // Use lower-order files given below.
-  };
-  RestFunction rest_function;
-  // Only used for REST_LOWER.
-  std::vector<std::string> rest_lower_files;
-
-
-  // Quantization options. Only effective for QuantTrieModel. One value is
-  // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used
-  // to quantize (and one of the remaining backoffs will be 0).
-  uint8_t prob_bits, backoff_bits;
-
-  // Bhiksha compression (simple form). Only works with trie.
-  uint8_t pointer_bhiksha_bits;
-
-
-  // ONLY EFFECTIVE WHEN READING BINARY
-
-  // How to get the giant array into memory: lazy mmap, populate, read etc.
-  // See util/mmap.hh for details of LoadMethod.
-  util::LoadMethod load_method;
-
-
-  // Set defaults.
-  Config();
-};
-
-} /* namespace ngram */ } /* namespace lm */
-
-#endif // LM_CONFIG_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/enumerate_vocab.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/enumerate_vocab.hh b/ext/kenlm/lm/enumerate_vocab.hh
deleted file mode 100644
index f4c94cd..0000000
--- a/ext/kenlm/lm/enumerate_vocab.hh
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef LM_ENUMERATE_VOCAB_H
-#define LM_ENUMERATE_VOCAB_H
-
-#include "lm/word_index.hh"
-#include "util/string_piece.hh"
-
-namespace lm {
-
-/* If you need the actual strings in the vocabulary, inherit from this class
- * and implement Add. Then put a pointer in Config.enumerate_vocab; it does
- * not take ownership. Add is called once per vocab word. index starts at 0
- * and increases by 1 each time. This is only used by the Model constructor;
- * the pointer is not retained by the class.
- */
-class EnumerateVocab {
-  public:
-    virtual ~EnumerateVocab() {}
-
-    // Callback receiving one vocabulary entry: its id and its string.
-    virtual void Add(WordIndex index, const StringPiece &str) = 0;
-
-  protected:
-    // Only constructible through a derived class.
-    EnumerateVocab() {}
-};
-
-} // namespace lm
-
-#endif // LM_ENUMERATE_VOCAB_H
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/facade.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/facade.hh b/ext/kenlm/lm/facade.hh
deleted file mode 100644
index 325ef15..0000000
--- a/ext/kenlm/lm/facade.hh
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef LM_FACADE_H
-#define LM_FACADE_H
-
-#include "lm/virtual_interface.hh"
-#include "util/string_piece.hh"
-
-#include <string>
-
-namespace lm {
-namespace base {
-
-// Common model interface that depends on knowing the specific classes.
-// Curiously recurring template pattern.
-// Child is the concrete model; the Base* methods below forward the
-// void*-typed calls to Child's typed methods.
-template <class Child, class StateT, class VocabularyT> class ModelFacade : public Model {
-  public:
-    typedef StateT State;
-    typedef VocabularyT Vocabulary;
-
-    /* Translate from void* to State */
-    // in_state and out_state are reinterpreted as State objects; no check
-    // is made here that they really point at one.
-    FullScoreReturn BaseFullScore(const void *in_state, const WordIndex new_word, void *out_state) const {
-      return static_cast<const Child*>(this)->FullScore(
-          *reinterpret_cast<const State*>(in_state),
-          new_word,
-          *reinterpret_cast<State*>(out_state));
-    }
-
-    // Forwards to Child::FullScoreForgotState; only out_state is reinterpreted.
-    FullScoreReturn BaseFullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, void *out_state) const {
-      return static_cast<const Child*>(this)->FullScoreForgotState(
-          context_rbegin,
-          context_rend,
-          new_word,
-          *reinterpret_cast<State*>(out_state));
-    }
-
-    // Default Score function calls FullScore. Model can override this.
-    float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
-      return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
-    }
-
-    // void* entry point for Score; dispatches to Child::Score so that a
-    // Score defined in Child is the one called.
-    float BaseScore(const void *in_state, const WordIndex new_word, void *out_state) const {
-      return static_cast<const Child*>(this)->Score(
-          *reinterpret_cast<const State*>(in_state),
-          new_word,
-          *reinterpret_cast<State*>(out_state));
-    }
-
-    const State &BeginSentenceState() const { return begin_sentence_; }
-    const State &NullContextState() const { return null_context_; }
-    // Downcasts the base vocabulary to the concrete type registered via Init.
-    const Vocabulary &GetVocabulary() const { return *static_cast<const Vocabulary*>(&BaseVocabulary()); }
-
-  protected:
-    // Passes the concrete state size to the base Model.
-    ModelFacade() : Model(sizeof(State)) {}
-
-    virtual ~ModelFacade() {}
-
-    // begin_sentence and null_context can disappear after. vocab should stay.
-    // The two states are copied into this object and the base-class pointers
-    // are aimed at those copies; vocab is stored by address only.
-    void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) {
-      begin_sentence_ = begin_sentence;
-      null_context_ = null_context;
-      begin_sentence_memory_ = &begin_sentence_;
-      null_context_memory_ = &null_context_;
-      base_vocab_ = &vocab;
-      order_ = order;
-    }
-
-  private:
-    State begin_sentence_, null_context_;
-};
-
-} // namespace base
-} // namespace lm
-
-#endif // LM_FACADE_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/filter/CMakeLists.txt b/ext/kenlm/lm/filter/CMakeLists.txt
deleted file mode 100644
index d4616cc..0000000
--- a/ext/kenlm/lm/filter/CMakeLists.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-cmake_minimum_required(VERSION 2.8.8)
-#
-# The KenLM cmake files make use of add_library(... OBJECT ...)
-#
-# This syntax allows grouping of source files when compiling
-# (effectively creating "fake" libraries based on source subdirs).
-#
-# This syntax was only added in cmake version 2.8.8
-#
-# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
-
-
-# This CMake file was created by Lane Schwartz <[email protected]>
-
-# Explicitly list the source files for this subdirectory
-#
-# If you add any source files to this subdirectory
-# that should be included in the kenlm library,
-# (this excludes any unit test files)
-# you should add them to the following list:
-#
-# In order to set correct paths to these files
-# in case this variable is referenced by CMake files in the parent directory,
-# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
-#
-set(KENLM_FILTER_SOURCE
-    ${CMAKE_CURRENT_SOURCE_DIR}/arpa_io.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/phrase.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc
-  )
-
-
-# Group these objects together for later use.
-#
-# Given add_library(foo OBJECT ${my_foo_sources}),
-# refer to these objects as $<TARGET_OBJECTS:foo>
-#
-add_library(kenlm_filter OBJECT ${KENLM_FILTER_SOURCE})
-
-
-# Explicitly list the executable files to be compiled
-# Each name NAME listed here is built from NAME_main.cc in this directory.
-set(EXE_LIST
-  filter
-  phrase_table_vocab
-)
-
-
-# Iterate through the executable list
-foreach(exe ${EXE_LIST})
-
-  # Compile the executable, linking against the requisite dependent object files
-  # (the kenlm and kenlm_util object libraries are not defined in this file;
-  # presumably they come from sibling CMake files -- confirm)
-  add_executable(${exe} ${exe}_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_filter> $<TARGET_OBJECTS:kenlm_util>)
-
-  # Link the executable against boost and pthread
-  target_link_libraries(${exe} ${Boost_LIBRARIES} pthread)
-
-  # Group executables together
-  set_target_properties(${exe} PROPERTIES FOLDER executables)
-
-# End for loop
-endforeach(exe)
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/Jamfile
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- 
/dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/filter/Jamfile b/ext/kenlm/lm/filter/Jamfile
deleted file mode 100644
index bcf62da..0000000
--- a/ext/kenlm/lm/filter/Jamfile
+++ /dev/null
@@ -1,7 +0,0 @@
-# Filter sources shared by the executables below.
-fakelib lm_filter : phrase.cc vocab.cc arpa_io.cc ../../util//kenutil : <threading>multi:<library>/top//boost_thread ;
-
-# filter_main.cc is compiled with NTHREAD defined for single-threaded builds.
-obj main : filter_main.cc : <threading>single:<define>NTHREAD <include>../.. ;
-
-exe filter : main lm_filter ../../util//kenutil ..//kenlm : <threading>multi:<library>/top//boost_thread ;
-
-exe phrase_table_vocab : phrase_table_vocab_main.cc ../../util//kenutil ;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/arpa_io.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/filter/arpa_io.cc b/ext/kenlm/lm/filter/arpa_io.cc
deleted file mode 100644
index 2cae60f..0000000
--- a/ext/kenlm/lm/filter/arpa_io.cc
+++ /dev/null
@@ -1,77 +0,0 @@
-#include "lm/filter/arpa_io.hh"
-#include "util/file_piece.hh"
-#include "util/string_stream.hh"
-
-#include <iostream>
-#include <ostream>
-#include <string>
-#include <vector>
-
-#include <cctype>
-#include <cerrno>
-#include <cstring>
-
-namespace lm {
-
-// Exception text is just the message.
-ARPAInputException::ARPAInputException(const StringPiece &message) throw() {
-  *this << message;
-}
-
-// Exception text is "<message> in line <line>".
-ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() {
-  *this << message << " in line " << line;
-}
-
-ARPAInputException::~ARPAInputException() throw() {}
-
-// Seeking is the responsibility of the caller.
-template <class Stream> void WriteCounts(Stream &out, const std::vector<uint64_t> &number) { - out << "\n\\data\\\n"; - for (unsigned int i = 0; i < number.size(); ++i) { - out << "ngram " << i+1 << "=" << number[i] << '\n'; - } - out << '\n'; -} - -size_t SizeNeededForCounts(const std::vector<uint64_t> &number) { - std::string buf; - util::StringStream stream(buf); - WriteCounts(stream, number); - return buf.size(); -} - -bool IsEntirelyWhiteSpace(const StringPiece &line) { - for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) { - if (!isspace(line.data()[i])) return false; - } - return true; -} - -ARPAOutput::ARPAOutput(const char *name, size_t buffer_size) - : file_backing_(util::CreateOrThrow(name)), file_(file_backing_.get(), buffer_size) {} - -void ARPAOutput::ReserveForCounts(std::streampos reserve) { - for (std::streampos i = 0; i < reserve; i += std::streampos(1)) { - file_ << '\n'; - } -} - -void ARPAOutput::BeginLength(unsigned int length) { - file_ << '\\' << length << "-grams:" << '\n'; -} - -void ARPAOutput::EndLength(unsigned int length) { - file_ << '\n'; - if (length > counts_.size()) { - counts_.resize(length); - } - counts_[length - 1] = fast_counter_; -} - -void ARPAOutput::Finish() { - file_ << "\\end\\\n"; - file_.seekp(0); - WriteCounts(file_, counts_); - file_.flush(); -} - -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/arpa_io.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/arpa_io.hh b/ext/kenlm/lm/filter/arpa_io.hh deleted file mode 100644 index 7489270..0000000 --- a/ext/kenlm/lm/filter/arpa_io.hh +++ /dev/null @@ -1,99 +0,0 @@ -#ifndef LM_FILTER_ARPA_IO_H -#define LM_FILTER_ARPA_IO_H -/* Input and output for ARPA format 
language model files.
- */
-#include "lm/read_arpa.hh"
-#include "util/exception.hh"
-#include "util/file_stream.hh"
-#include "util/string_piece.hh"
-#include "util/tokenize_piece.hh"
-
-#include <boost/noncopyable.hpp>
-#include <boost/scoped_array.hpp>
-
-#include <fstream>
-#include <string>
-#include <vector>
-
-#include <cstring>
-#include <stdint.h>
-
-namespace util { class FilePiece; }
-
-namespace lm {
-
-// Thrown by ReadNGrams for a malformed n-gram line.
-class ARPAInputException : public util::Exception {
-  public:
-    explicit ARPAInputException(const StringPiece &message) throw();
-    explicit ARPAInputException(const StringPiece &message, const StringPiece &line) throw();
-    virtual ~ARPAInputException() throw();
-};
-
-// Handling for the counts of n-grams at the beginning of ARPA files.
-size_t SizeNeededForCounts(const std::vector<uint64_t> &number);
-
-/* Writes an ARPA file. This has to be seekable so the counts can be written
- * at the end. Hence, it owns the output file itself (a util::scoped_fd
- * wrapped by a util::FileStream) instead of accepting a separately held
- * stream.
- */
-class ARPAOutput : boost::noncopyable {
-  public:
-    explicit ARPAOutput(const char *name, size_t buffer_size = 65536);
-
-    void ReserveForCounts(std::streampos reserve);
-
-    void BeginLength(unsigned int length);
-
-    // Writes line followed by a newline and counts it.
-    void AddNGram(const StringPiece &line) {
-      file_ << line << '\n';
-      ++fast_counter_;
-    }
-
-    // The next two overloads ignore everything but line; they exist so this
-    // class accepts the same AddNGram calls as other outputs.
-    void AddNGram(const StringPiece &ngram, const StringPiece &line) {
-      AddNGram(line);
-    }
-
-    template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) {
-      AddNGram(line);
-    }
-
-    void EndLength(unsigned int length);
-
-    void Finish();
-
-  private:
-    // Declared before file_ so the descriptor exists when file_ is built.
-    util::scoped_fd file_backing_;
-    util::FileStream file_;
-    // Incremented by AddNGram; EndLength copies it into counts_.
-    size_t fast_counter_;
-    std::vector<uint64_t> counts_;
-};
-
-
-// Reads the section header for `length` and then `number` n-gram lines,
-// passing each to out.  Throws ARPAInputException when a line has no token
-// at all or no second tab-separated field.
-template <class Output> void ReadNGrams(util::FilePiece &in, unsigned int length, uint64_t number, Output &out) {
-  ReadNGramHeader(in, length);
-  out.BeginLength(length);
-  for (uint64_t i = 0; i < number; ++i) {
-    StringPiece line = in.ReadLine();
-    util::TokenIter<util::SingleCharacter> tabber(line, '\t');
-    if (!tabber) throw ARPAInputException("blank line", line);
-    // Advance past the first field; the second field is the n-gram text.
-    if (!++tabber) throw ARPAInputException("no tab", line);
-
-    out.AddNGram(*tabber, line);
-  }
-  out.EndLength(length);
-}
-
-// Copies a whole ARPA file from in_lm to out: reads the counts, reserves
-// header space sized from them, streams every order, then finishes.
-template <class Output> void ReadARPA(util::FilePiece &in_lm, Output &out) {
-  std::vector<uint64_t> number;
-  ReadARPACounts(in_lm, number);
-  out.ReserveForCounts(SizeNeededForCounts(number));
-  for (unsigned int i = 0; i < number.size(); ++i) {
-    ReadNGrams(in_lm, i + 1, number[i], out);
-  }
-  ReadEnd(in_lm);
-  out.Finish();
-}
-
-} // namespace lm
-
-#endif // LM_FILTER_ARPA_IO_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/count_io.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git 
a/ext/kenlm/lm/filter/count_io.hh b/ext/kenlm/lm/filter/count_io.hh deleted file mode 100644 index 1af6676..0000000 --- a/ext/kenlm/lm/filter/count_io.hh +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef LM_FILTER_COUNT_IO_H -#define LM_FILTER_COUNT_IO_H - -#include <fstream> -#include <iostream> -#include <string> - -#include "util/file_stream.hh" -#include "util/file.hh" -#include "util/file_piece.hh" - -namespace lm { - -class CountOutput : boost::noncopyable { - public: - explicit CountOutput(const char *name) : file_(util::CreateOrThrow(name)) {} - - void AddNGram(const StringPiece &line) { - file_ << line << '\n'; - } - - template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { - AddNGram(line); - } - - void AddNGram(const StringPiece &ngram, const StringPiece &line) { - AddNGram(line); - } - - private: - util::FileStream file_; -}; - -class CountBatch { - public: - explicit CountBatch(std::streamsize initial_read) - : initial_read_(initial_read) { - buffer_.reserve(initial_read); - } - - void Read(std::istream &in) { - buffer_.resize(initial_read_); - in.read(&*buffer_.begin(), initial_read_); - buffer_.resize(in.gcount()); - char got; - while (in.get(got) && got != '\n') - buffer_.push_back(got); - } - - template <class Output> void Send(Output &out) { - for (util::TokenIter<util::SingleCharacter> line(StringPiece(&*buffer_.begin(), buffer_.size()), '\n'); line; ++line) { - util::TokenIter<util::SingleCharacter> tabber(*line, '\t'); - if (!tabber) { - std::cerr << "Warning: empty n-gram count line being removed\n"; - continue; - } - util::TokenIter<util::SingleCharacter, true> words(*tabber, ' '); - if (!words) { - std::cerr << "Line has a tab but no words.\n"; - continue; - } - out.AddNGram(words, util::TokenIter<util::SingleCharacter, true>::end(), *line); - } - } - - private: - std::streamsize initial_read_; - - // This could have been a std::string but that's less happy with raw writes. 
- std::vector<char> buffer_; -}; - -template <class Output> void ReadCount(util::FilePiece &in_file, Output &out) { - try { - while (true) { - StringPiece line = in_file.ReadLine(); - util::TokenIter<util::SingleCharacter> tabber(line, '\t'); - if (!tabber) { - std::cerr << "Warning: empty n-gram count line being removed\n"; - continue; - } - out.AddNGram(*tabber, line); - } - } catch (const util::EndOfFileException &e) {} -} - -} // namespace lm - -#endif // LM_FILTER_COUNT_IO_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/filter_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/filter_main.cc b/ext/kenlm/lm/filter/filter_main.cc deleted file mode 100644 index 6e89d1f..0000000 --- a/ext/kenlm/lm/filter/filter_main.cc +++ /dev/null @@ -1,253 +0,0 @@ -#include "lm/filter/arpa_io.hh" -#include "lm/filter/format.hh" -#include "lm/filter/phrase.hh" -#ifndef NTHREAD -#include "lm/filter/thread.hh" -#endif -#include "lm/filter/vocab.hh" -#include "lm/filter/wrapper.hh" -#include "util/exception.hh" -#include "util/file_piece.hh" - -#include <boost/ptr_container/ptr_vector.hpp> - -#include <cstring> -#include <fstream> -#include <iostream> -#include <memory> - -namespace lm { -namespace { - -void DisplayHelp(const char *name) { - std::cerr - << "Usage: " << name << " mode [context] [phrase] [raw|arpa] [threads:m] [batch_size:m] (vocab|model):input_file output_file\n\n" - "copy mode just copies, but makes the format nicer for e.g. irstlm's broken\n" - " parser.\n" - "single mode treats the entire input as a single sentence.\n" - "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" - " a separate line. 
A separate file is created for each sentence by appending\n" - " the 0-indexed line number to the output file name.\n" - "union mode produces one filtered model that is the union of models created by\n" - " multiple mode.\n\n" - "context means only the context (all but last word) has to pass the filter, but\n" - " the entire n-gram is output.\n\n" - "phrase means that the vocabulary is actually tab-delimited phrases and that the\n" - " phrases can generate the n-gram when assembled in arbitrary order and\n" - " clipped. Currently works with multiple or union mode.\n\n" - "The file format is set by [raw|arpa] with default arpa:\n" - "raw means space-separated tokens, optionally followed by a tab and arbitrary\n" - " text. This is useful for ngram count files.\n" - "arpa means the ARPA file format for n-gram language models.\n\n" -#ifndef NTHREAD - "threads:m sets m threads (default: conccurrency detected by boost)\n" - "batch_size:m sets the batch size for threading. Expect memory usage from this\n" - " of 2*threads*batch_size n-grams.\n\n" -#else - "This binary was compiled with -DNTHREAD, disabling threading. If you wanted\n" - " threading, compile without this flag against Boost >=1.42.0.\n\n" -#endif - "There are two inputs: vocabulary and model. Either may be given as a file\n" - " while the other is on stdin. Specify the type given as a file using\n" - " vocab: or model: before the file name. \n\n" - "For ARPA format, the output must be seekable. For raw format, it can be a\n" - " stream i.e. 
/dev/stdout\n"; -} - -typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} FilterMode; -typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format; - -struct Config { - Config() : -#ifndef NTHREAD - batch_size(25000), - threads(boost::thread::hardware_concurrency()), -#endif - phrase(false), - context(false), - format(FORMAT_ARPA) - { -#ifndef NTHREAD - if (!threads) threads = 1; -#endif - } - -#ifndef NTHREAD - size_t batch_size; - size_t threads; -#endif - bool phrase; - bool context; - FilterMode mode; - Format format; -}; - -template <class Format, class Filter, class OutputBuffer, class Output> void RunThreadedFilter(const Config &config, util::FilePiece &in_lm, Filter &filter, Output &output) { -#ifndef NTHREAD - if (config.threads == 1) { -#endif - Format::RunFilter(in_lm, filter, output); -#ifndef NTHREAD - } else { - typedef Controller<Filter, OutputBuffer, Output> Threaded; - Threaded threading(config.batch_size, config.threads * 2, config.threads, filter, output); - Format::RunFilter(in_lm, threading, output); - } -#endif -} - -template <class Format, class Filter, class OutputBuffer, class Output> void RunContextFilter(const Config &config, util::FilePiece &in_lm, Filter filter, Output &output) { - if (config.context) { - ContextFilter<Filter> context_filter(filter); - RunThreadedFilter<Format, ContextFilter<Filter>, OutputBuffer, Output>(config, in_lm, context_filter, output); - } else { - RunThreadedFilter<Format, Filter, OutputBuffer, Output>(config, in_lm, filter, output); - } -} - -template <class Format, class Binary> void DispatchBinaryFilter(const Config &config, util::FilePiece &in_lm, const Binary &binary, typename Format::Output &out) { - typedef BinaryFilter<Binary> Filter; - RunContextFilter<Format, Filter, BinaryOutputBuffer, typename Format::Output>(config, in_lm, Filter(binary), out); -} - -template <class Format> void DispatchFilterModes(const Config &config, std::istream &in_vocab, util::FilePiece &in_lm, const char 
*out_name) { - if (config.mode == MODE_MULTIPLE) { - if (config.phrase) { - typedef phrase::Multiple Filter; - phrase::Substrings substrings; - typename Format::Multiple out(out_name, phrase::ReadMultiple(in_vocab, substrings)); - RunContextFilter<Format, Filter, MultipleOutputBuffer, typename Format::Multiple>(config, in_lm, Filter(substrings), out); - } else { - typedef vocab::Multiple Filter; - boost::unordered_map<std::string, std::vector<unsigned int> > words; - typename Format::Multiple out(out_name, vocab::ReadMultiple(in_vocab, words)); - RunContextFilter<Format, Filter, MultipleOutputBuffer, typename Format::Multiple>(config, in_lm, Filter(words), out); - } - return; - } - - typename Format::Output out(out_name); - - if (config.mode == MODE_COPY) { - Format::Copy(in_lm, out); - return; - } - - if (config.mode == MODE_SINGLE) { - vocab::Single::Words words; - vocab::ReadSingle(in_vocab, words); - DispatchBinaryFilter<Format, vocab::Single>(config, in_lm, vocab::Single(words), out); - return; - } - - if (config.mode == MODE_UNION) { - if (config.phrase) { - phrase::Substrings substrings; - phrase::ReadMultiple(in_vocab, substrings); - DispatchBinaryFilter<Format, phrase::Union>(config, in_lm, phrase::Union(substrings), out); - } else { - vocab::Union::Words words; - vocab::ReadMultiple(in_vocab, words); - DispatchBinaryFilter<Format, vocab::Union>(config, in_lm, vocab::Union(words), out); - } - return; - } -} - -} // namespace -} // namespace lm - -int main(int argc, char *argv[]) { - try { - if (argc < 4) { - lm::DisplayHelp(argv[0]); - return 1; - } - - // I used to have boost::program_options, but some users didn't want to compile boost. 
- lm::Config config; - config.mode = lm::MODE_UNSET; - for (int i = 1; i < argc - 2; ++i) { - const char *str = argv[i]; - if (!std::strcmp(str, "copy")) { - config.mode = lm::MODE_COPY; - } else if (!std::strcmp(str, "single")) { - config.mode = lm::MODE_SINGLE; - } else if (!std::strcmp(str, "multiple")) { - config.mode = lm::MODE_MULTIPLE; - } else if (!std::strcmp(str, "union")) { - config.mode = lm::MODE_UNION; - } else if (!std::strcmp(str, "phrase")) { - config.phrase = true; - } else if (!std::strcmp(str, "context")) { - config.context = true; - } else if (!std::strcmp(str, "arpa")) { - config.format = lm::FORMAT_ARPA; - } else if (!std::strcmp(str, "raw")) { - config.format = lm::FORMAT_COUNT; -#ifndef NTHREAD - } else if (!std::strncmp(str, "threads:", 8)) { - config.threads = boost::lexical_cast<size_t>(str + 8); - if (!config.threads) { - std::cerr << "Specify at least one thread." << std::endl; - return 1; - } - } else if (!std::strncmp(str, "batch_size:", 11)) { - config.batch_size = boost::lexical_cast<size_t>(str + 11); - if (config.batch_size < 5000) { - std::cerr << "Batch size must be at least one and should probably be >= 5000" << std::endl; - if (!config.batch_size) return 1; - } -#endif - } else { - lm::DisplayHelp(argv[0]); - return 1; - } - } - - if (config.mode == lm::MODE_UNSET) { - lm::DisplayHelp(argv[0]); - return 1; - } - - if (config.phrase && config.mode != lm::MODE_UNION && config.mode != lm::MODE_MULTIPLE) { - std::cerr << "Phrase constraint currently only works in multiple or union mode. If you really need it for single, put everything on one line and use union." 
<< std::endl; - return 1; - } - - bool cmd_is_model = true; - const char *cmd_input = argv[argc - 2]; - if (!strncmp(cmd_input, "vocab:", 6)) { - cmd_is_model = false; - cmd_input += 6; - } else if (!strncmp(cmd_input, "model:", 6)) { - cmd_input += 6; - } else if (strchr(cmd_input, ':')) { - std::cerr << "Specify vocab: or model: before the input file name, not " << cmd_input << std::endl; - return 1; - } else { - std::cerr << "Assuming that " << cmd_input << " is a model file" << std::endl; - } - std::ifstream cmd_file; - std::istream *vocab; - if (cmd_is_model) { - vocab = &std::cin; - } else { - cmd_file.open(cmd_input, std::ios::in); - UTIL_THROW_IF(!cmd_file, util::ErrnoException, "Failed to open " << cmd_input); - vocab = &cmd_file; - } - - util::FilePiece model(cmd_is_model ? util::OpenReadOrThrow(cmd_input) : 0, cmd_is_model ? cmd_input : NULL, &std::cerr); - - if (config.format == lm::FORMAT_ARPA) { - lm::DispatchFilterModes<lm::ARPAFormat>(config, *vocab, model, argv[argc - 1]); - } else if (config.format == lm::FORMAT_COUNT) { - lm::DispatchFilterModes<lm::CountFormat>(config, *vocab, model, argv[argc - 1]); - } - return 0; - } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - return 1; - } -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/format.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/format.hh b/ext/kenlm/lm/filter/format.hh deleted file mode 100644 index d453f05..0000000 --- a/ext/kenlm/lm/filter/format.hh +++ /dev/null @@ -1,250 +0,0 @@ -#ifndef LM_FILTER_FORMAT_H -#define LM_FILTER_FORMAT_H - -#include "lm/filter/arpa_io.hh" -#include "lm/filter/count_io.hh" - -#include <boost/lexical_cast.hpp> -#include 
<boost/ptr_container/ptr_vector.hpp> - -#include <iosfwd> - -namespace lm { - -template <class Single> class MultipleOutput { - private: - typedef boost::ptr_vector<Single> Singles; - typedef typename Singles::iterator SinglesIterator; - - public: - MultipleOutput(const char *prefix, size_t number) { - files_.reserve(number); - std::string tmp; - for (unsigned int i = 0; i < number; ++i) { - tmp = prefix; - tmp += boost::lexical_cast<std::string>(i); - files_.push_back(new Single(tmp.c_str())); - } - } - - void AddNGram(const StringPiece &line) { - for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) - i->AddNGram(line); - } - - template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { - for (SinglesIterator i = files_.begin(); i != files_.end(); ++i) - i->AddNGram(begin, end, line); - } - - void SingleAddNGram(size_t offset, const StringPiece &line) { - files_[offset].AddNGram(line); - } - - template <class Iterator> void SingleAddNGram(size_t offset, const Iterator &begin, const Iterator &end, const StringPiece &line) { - files_[offset].AddNGram(begin, end, line); - } - - protected: - Singles files_; -}; - -class MultipleARPAOutput : public MultipleOutput<ARPAOutput> { - public: - MultipleARPAOutput(const char *prefix, size_t number) : MultipleOutput<ARPAOutput>(prefix, number) {} - - void ReserveForCounts(std::streampos reserve) { - for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i) - i->ReserveForCounts(reserve); - } - - void BeginLength(unsigned int length) { - for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i) - i->BeginLength(length); - } - - void EndLength(unsigned int length) { - for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i) - i->EndLength(length); - } - - void Finish() { - for (boost::ptr_vector<ARPAOutput>::iterator i = files_.begin(); i != files_.end(); ++i) - i->Finish(); 
- } -}; - -template <class Filter, class Output> class DispatchInput { - public: - DispatchInput(Filter &filter, Output &output) : filter_(filter), output_(output) {} - -/* template <class Iterator> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line) { - filter_.AddNGram(begin, end, line, output_); - }*/ - - void AddNGram(const StringPiece &ngram, const StringPiece &line) { - filter_.AddNGram(ngram, line, output_); - } - - protected: - Filter &filter_; - Output &output_; -}; - -template <class Filter, class Output> class DispatchARPAInput : public DispatchInput<Filter, Output> { - private: - typedef DispatchInput<Filter, Output> B; - - public: - DispatchARPAInput(Filter &filter, Output &output) : B(filter, output) {} - - void ReserveForCounts(std::streampos reserve) { B::output_.ReserveForCounts(reserve); } - void BeginLength(unsigned int length) { B::output_.BeginLength(length); } - - void EndLength(unsigned int length) { - B::filter_.Flush(); - B::output_.EndLength(length); - } - void Finish() { B::output_.Finish(); } -}; - -struct ARPAFormat { - typedef ARPAOutput Output; - typedef MultipleARPAOutput Multiple; - static void Copy(util::FilePiece &in, Output &out) { - ReadARPA(in, out); - } - template <class Filter, class Out> static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { - DispatchARPAInput<Filter, Out> dispatcher(filter, output); - ReadARPA(in, dispatcher); - } -}; - -struct CountFormat { - typedef CountOutput Output; - typedef MultipleOutput<Output> Multiple; - static void Copy(util::FilePiece &in, Output &out) { - ReadCount(in, out); - } - template <class Filter, class Out> static void RunFilter(util::FilePiece &in, Filter &filter, Out &output) { - DispatchInput<Filter, Out> dispatcher(filter, output); - ReadCount(in, dispatcher); - } -}; - -/* For multithreading, the buffer classes hold batches of filter inputs and - * outputs in memory. 
The strings get reused a lot, so keep them around - * instead of clearing each time. - */ -class InputBuffer { - public: - InputBuffer() : actual_(0) {} - - void Reserve(size_t size) { lines_.reserve(size); } - - template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { - if (lines_.size() == actual_) lines_.resize(lines_.size() + 1); - // TODO avoid this copy. - std::string &copied = lines_[actual_].line; - copied.assign(line.data(), line.size()); - lines_[actual_].ngram.set(copied.data() + (ngram.data() - line.data()), ngram.size()); - ++actual_; - } - - template <class Filter, class Output> void CallFilter(Filter &filter, Output &output) const { - for (std::vector<Line>::const_iterator i = lines_.begin(); i != lines_.begin() + actual_; ++i) { - filter.AddNGram(i->ngram, i->line, output); - } - } - - void Clear() { actual_ = 0; } - bool Empty() { return actual_ == 0; } - size_t Size() { return actual_; } - - private: - struct Line { - std::string line; - StringPiece ngram; - }; - - size_t actual_; - - std::vector<Line> lines_; -}; - -class BinaryOutputBuffer { - public: - BinaryOutputBuffer() {} - - void Reserve(size_t size) { - lines_.reserve(size); - } - - void AddNGram(const StringPiece &line) { - lines_.push_back(line); - } - - template <class Output> void Flush(Output &output) { - for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) { - output.AddNGram(*i); - } - lines_.clear(); - } - - private: - std::vector<StringPiece> lines_; -}; - -class MultipleOutputBuffer { - public: - MultipleOutputBuffer() : last_(NULL) {} - - void Reserve(size_t size) { - annotated_.reserve(size); - } - - void AddNGram(const StringPiece &line) { - annotated_.resize(annotated_.size() + 1); - annotated_.back().line = line; - } - - void SingleAddNGram(size_t offset, const StringPiece &line) { - if ((line.data() == last_.data()) && (line.length() == last_.length())) { - 
annotated_.back().systems.push_back(offset); - } else { - annotated_.resize(annotated_.size() + 1); - annotated_.back().systems.push_back(offset); - annotated_.back().line = line; - last_ = line; - } - } - - template <class Output> void Flush(Output &output) { - for (std::vector<Annotated>::const_iterator i = annotated_.begin(); i != annotated_.end(); ++i) { - if (i->systems.empty()) { - output.AddNGram(i->line); - } else { - for (std::vector<size_t>::const_iterator j = i->systems.begin(); j != i->systems.end(); ++j) { - output.SingleAddNGram(*j, i->line); - } - } - } - annotated_.clear(); - } - - private: - struct Annotated { - // If this is empty, send to all systems. - // A filter should never send to all systems and send to a single one. - std::vector<size_t> systems; - StringPiece line; - }; - - StringPiece last_; - - std::vector<Annotated> annotated_; -}; - -} // namespace lm - -#endif // LM_FILTER_FORMAT_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/phrase.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/phrase.cc b/ext/kenlm/lm/filter/phrase.cc deleted file mode 100644 index d8260d5..0000000 --- a/ext/kenlm/lm/filter/phrase.cc +++ /dev/null @@ -1,292 +0,0 @@ -#include "lm/filter/phrase.hh" - -#include "lm/filter/format.hh" - -#include <algorithm> -#include <functional> -#include <iostream> -#include <queue> -#include <string> -#include <vector> - -#include <cctype> - -namespace lm { -namespace phrase { - -unsigned int ReadMultiple(std::istream &in, Substrings &out) { - bool sentence_content = false; - unsigned int sentence_id = 0; - std::vector<Hash> phrase; - std::string word; - while (in) { - char c; - // Gather a word. 
- while (!isspace(c = in.get()) && in) word += c; - // Treat EOF like a newline. - if (!in) c = '\n'; - // Add the word to the phrase. - if (!word.empty()) { - phrase.push_back(util::MurmurHashNative(word.data(), word.size())); - word.clear(); - } - if (c == ' ') continue; - // It's more than just a space. Close out the phrase. - if (!phrase.empty()) { - sentence_content = true; - out.AddPhrase(sentence_id, phrase.begin(), phrase.end()); - phrase.clear(); - } - if (c == '\t' || c == '\v') continue; - // It's more than a space or tab: a newline. - if (sentence_content) { - ++sentence_id; - sentence_content = false; - } - } - if (!in.eof()) in.exceptions(std::istream::failbit | std::istream::badbit); - return sentence_id + sentence_content; -} - -namespace { -typedef unsigned int Sentence; -typedef std::vector<Sentence> Sentences; -} // namespace - -namespace detail { - -const StringPiece kEndSentence("</s>"); - -class Arc { - public: - Arc() {} - - // For arcs from one vertex to another. - void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) { - Set(to, intersect); - from_ = &from; - } - - /* For arcs from before the n-gram begins to somewhere in the n-gram (right - * aligned). These have no from_ vertex; it implictly matches every - * sentence. This also handles when the n-gram is a substring of a phrase. - */ - void SetRight(detail::Vertex &to, const Sentences &complete) { - Set(to, complete); - from_ = NULL; - } - - Sentence Current() const { - return *current_; - } - - bool Empty() const { - return current_ == last_; - } - - /* When this function returns: - * If Empty() then there's nothing left from this intersection. - * - * If Current() == to then to is part of the intersection. - * - * Otherwise, Current() > to. In this case, to is not part of the - * intersection and neither is anything < Current(). To determine if - * any value >= Current() is in the intersection, call LowerBound again - * with the value. 
- */ - void LowerBound(const Sentence to); - - private: - void Set(detail::Vertex &to, const Sentences &sentences); - - const Sentence *current_; - const Sentence *last_; - detail::Vertex *from_; -}; - -struct ArcGreater : public std::binary_function<const Arc *, const Arc *, bool> { - bool operator()(const Arc *first, const Arc *second) const { - return first->Current() > second->Current(); - } -}; - -class Vertex { - public: - Vertex() : current_(0) {} - - Sentence Current() const { - return current_; - } - - bool Empty() const { - return incoming_.empty(); - } - - void LowerBound(const Sentence to); - - private: - friend class Arc; - - void AddIncoming(Arc *arc) { - if (!arc->Empty()) incoming_.push(arc); - } - - unsigned int current_; - std::priority_queue<Arc*, std::vector<Arc*>, ArcGreater> incoming_; -}; - -void Arc::LowerBound(const Sentence to) { - current_ = std::lower_bound(current_, last_, to); - // If *current_ > to, don't advance from_. The intervening values of - // from_ may be useful for another one of its outgoing arcs. - if (!from_ || Empty() || (Current() > to)) return; - assert(Current() == to); - from_->LowerBound(to); - if (from_->Empty()) { - current_ = last_; - return; - } - assert(from_->Current() >= to); - if (from_->Current() > to) { - current_ = std::lower_bound(current_ + 1, last_, from_->Current()); - } -} - -void Arc::Set(Vertex &to, const Sentences &sentences) { - current_ = &*sentences.begin(); - last_ = &*sentences.end(); - to.AddIncoming(this); -} - -void Vertex::LowerBound(const Sentence to) { - if (Empty()) return; - // Union lower bound. - while (true) { - Arc *top = incoming_.top(); - if (top->Current() > to) { - current_ = top->Current(); - return; - } - // If top->Current() == to, we still need to verify that's an actual - // element and not just a bound. 
- incoming_.pop(); - top->LowerBound(to); - if (!top->Empty()) { - incoming_.push(top); - if (top->Current() == to) { - current_ = to; - return; - } - } else if (Empty()) { - return; - } - } -} - -} // namespace detail - -namespace { - -void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detail::Vertex *const vertices, detail::Arc *free_arc) { - using detail::Vertex; - using detail::Arc; - assert(!hashes.empty()); - - const Hash *const first_word = &*hashes.begin(); - const Hash *const last_word = &*hashes.end() - 1; - - Hash hash = 0; - const Sentences *found; - // Phrases starting at or before the first word in the n-gram. - { - Vertex *vertex = vertices; - for (const Hash *word = first_word; ; ++word, ++vertex) { - hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word); - // Now hash is [hashes.begin(), word]. - if (word == last_word) { - if (phrase.FindSubstring(hash, found)) - (free_arc++)->SetRight(*vertex, *found); - break; - } - if (!phrase.FindRight(hash, found)) break; - (free_arc++)->SetRight(*vertex, *found); - } - } - - // Phrases starting at the second or later word in the n-gram. - Vertex *vertex_from = vertices; - for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) { - hash = 0; - Vertex *vertex_to = vertex_from + 1; - for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) { - // Notice that word_to and vertex_to have the same index. - hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to); - // Now hash covers [word_from, word_to]. - if (word_to == last_word) { - if (phrase.FindLeft(hash, found)) - (free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found); - break; - } - if (!phrase.FindPhrase(hash, found)) break; - (free_arc++)->SetPhrase(*vertex_from, *vertex_to, *found); - } - } -} - -} // namespace - -namespace detail { - -// Here instead of header due to forward declaration. 
-ConditionCommon::ConditionCommon(const Substrings &substrings) : substrings_(substrings) {} - -// Rest of the variables are temporaries anyway -ConditionCommon::ConditionCommon(const ConditionCommon &from) : substrings_(from.substrings_) {} - -ConditionCommon::~ConditionCommon() {} - -detail::Vertex &ConditionCommon::MakeGraph() { - assert(!hashes_.empty()); - vertices_.clear(); - vertices_.resize(hashes_.size()); - arcs_.clear(); - // One for every substring. - arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2); - BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin()); - return vertices_[hashes_.size() - 1]; -} - -} // namespace detail - -bool Union::Evaluate() { - detail::Vertex &last_vertex = MakeGraph(); - unsigned int lower = 0; - while (true) { - last_vertex.LowerBound(lower); - if (last_vertex.Empty()) return false; - if (last_vertex.Current() == lower) return true; - lower = last_vertex.Current(); - } -} - -template <class Output> void Multiple::Evaluate(const StringPiece &line, Output &output) { - detail::Vertex &last_vertex = MakeGraph(); - unsigned int lower = 0; - while (true) { - last_vertex.LowerBound(lower); - if (last_vertex.Empty()) return; - if (last_vertex.Current() == lower) { - output.SingleAddNGram(lower, line); - ++lower; - } else { - lower = last_vertex.Current(); - } - } -} - -template void Multiple::Evaluate<CountFormat::Multiple>(const StringPiece &line, CountFormat::Multiple &output); -template void Multiple::Evaluate<ARPAFormat::Multiple>(const StringPiece &line, ARPAFormat::Multiple &output); -template void Multiple::Evaluate<MultipleOutputBuffer>(const StringPiece &line, MultipleOutputBuffer &output); - -} // namespace phrase -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/phrase.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ 
b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/phrase.hh b/ext/kenlm/lm/filter/phrase.hh deleted file mode 100644 index 5227ab2..0000000 --- a/ext/kenlm/lm/filter/phrase.hh +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef LM_FILTER_PHRASE_H -#define LM_FILTER_PHRASE_H - -#include "util/murmur_hash.hh" -#include "util/string_piece.hh" -#include "util/tokenize_piece.hh" - -#include <boost/unordered_map.hpp> - -#include <iosfwd> -#include <vector> - -#define LM_FILTER_PHRASE_METHOD(caps, lower) \ -bool Find##caps(Hash key, const std::vector<unsigned int> *&out) const {\ - Table::const_iterator i(table_.find(key));\ - if (i==table_.end()) return false; \ - out = &i->second.lower; \ - return true; \ -} - -namespace lm { -namespace phrase { - -typedef uint64_t Hash; - -class Substrings { - private: - /* This is the value in a hash table where the key is a string. It indicates - * four sets of sentences: - * substring is sentences with a phrase containing the key as a substring. - * left is sentencess with a phrase that begins with the key (left aligned). - * right is sentences with a phrase that ends with the key (right aligned). - * phrase is sentences where the key is a phrase. - * Each set is encoded as a vector of sentence ids in increasing order. - */ - struct SentenceRelation { - std::vector<unsigned int> substring, left, right, phrase; - }; - /* Most of the CPU is hash table lookups, so let's not complicate it with - * vector equality comparisons. If a collision happens, the SentenceRelation - * structure will contain the union of sentence ids over the colliding strings. - * In that case, the filter will be slightly more permissive. - * The key here is the same as boost's hash of std::vector<std::string>. - */ - typedef boost::unordered_map<Hash, SentenceRelation> Table; - - public: - Substrings() {} - - /* If the string isn't a substring of any phrase, return NULL. 
Otherwise, - * return a pointer to std::vector<unsigned int> listing sentences with - * matching phrases. This set may be empty for Left, Right, or Phrase. - * Example: const std::vector<unsigned int> *FindSubstring(Hash key) - */ - LM_FILTER_PHRASE_METHOD(Substring, substring) - LM_FILTER_PHRASE_METHOD(Left, left) - LM_FILTER_PHRASE_METHOD(Right, right) - LM_FILTER_PHRASE_METHOD(Phrase, phrase) - -#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization - // sentence_id must be non-decreasing. Iterators are over words in the phrase. - template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) { - // Iterate over all substrings. - for (Iterator start = begin; start != end; ++start) { - Hash hash = 0; - SentenceRelation *relation; - for (Iterator finish = start; finish != end; ++finish) { - hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *finish); - // Now hash is of [start, finish]. - relation = &table_[hash]; - AppendSentence(relation->substring, sentence_id); - if (start == begin) AppendSentence(relation->left, sentence_id); - } - AppendSentence(relation->right, sentence_id); - if (start == begin) AppendSentence(relation->phrase, sentence_id); - } - } - - private: - void AppendSentence(std::vector<unsigned int> &vec, unsigned int sentence_id) { - if (vec.empty() || vec.back() != sentence_id) vec.push_back(sentence_id); - } - - Table table_; -}; - -// Read a file with one sentence per line containing tab-delimited phrases of -// space-separated words. -unsigned int ReadMultiple(std::istream &in, Substrings &out); - -namespace detail { -extern const StringPiece kEndSentence; - -template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) { - hashes.clear(); - if (i == end) return; - // TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags. 
- if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) { - ++i; - } - for (; i != end && (*i != kEndSentence); ++i) { - hashes.push_back(util::MurmurHashNative(i->data(), i->size())); - } -} - -class Vertex; -class Arc; - -class ConditionCommon { - protected: - ConditionCommon(const Substrings &substrings); - ConditionCommon(const ConditionCommon &from); - - ~ConditionCommon(); - - detail::Vertex &MakeGraph(); - - // Temporaries in PassNGram and Evaluate to avoid reallocation. - std::vector<Hash> hashes_; - - private: - std::vector<detail::Vertex> vertices_; - std::vector<detail::Arc> arcs_; - - const Substrings &substrings_; -}; - -} // namespace detail - -class Union : public detail::ConditionCommon { - public: - explicit Union(const Substrings &substrings) : detail::ConditionCommon(substrings) {} - - template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { - detail::MakeHashes(begin, end, hashes_); - return hashes_.empty() || Evaluate(); - } - - private: - bool Evaluate(); -}; - -class Multiple : public detail::ConditionCommon { - public: - explicit Multiple(const Substrings &substrings) : detail::ConditionCommon(substrings) {} - - template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { - detail::MakeHashes(begin, end, hashes_); - if (hashes_.empty()) { - output.AddNGram(line); - } else { - Evaluate(line, output); - } - } - - template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { - AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output); - } - - void Flush() const {} - - private: - template <class Output> void Evaluate(const StringPiece &line, Output &output); -}; - -} // namespace phrase -} // namespace lm -#endif // LM_FILTER_PHRASE_H 
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/phrase_table_vocab_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/phrase_table_vocab_main.cc b/ext/kenlm/lm/filter/phrase_table_vocab_main.cc deleted file mode 100644 index 9ffa35f..0000000 --- a/ext/kenlm/lm/filter/phrase_table_vocab_main.cc +++ /dev/null @@ -1,165 +0,0 @@ -#include "util/file_stream.hh" -#include "util/file_piece.hh" -#include "util/murmur_hash.hh" -#include "util/pool.hh" -#include "util/string_piece.hh" -#include "util/string_piece_hash.hh" -#include "util/tokenize_piece.hh" - -#include <boost/unordered_map.hpp> -#include <boost/unordered_set.hpp> - -#include <cstddef> -#include <vector> - -namespace { - -struct MutablePiece { - mutable StringPiece behind; - bool operator==(const MutablePiece &other) const { - return behind == other.behind; - } -}; - -std::size_t hash_value(const MutablePiece &m) { - return hash_value(m.behind); -} - -class InternString { - public: - const char *Add(StringPiece str) { - MutablePiece mut; - mut.behind = str; - std::pair<boost::unordered_set<MutablePiece>::iterator, bool> res(strs_.insert(mut)); - if (res.second) { - void *mem = backing_.Allocate(str.size() + 1); - memcpy(mem, str.data(), str.size()); - static_cast<char*>(mem)[str.size()] = 0; - res.first->behind = StringPiece(static_cast<char*>(mem), str.size()); - } - return res.first->behind.data(); - } - - private: - util::Pool backing_; - boost::unordered_set<MutablePiece> strs_; -}; - -class TargetWords { - public: - void Introduce(StringPiece source) { - vocab_.resize(vocab_.size() + 1); - std::vector<unsigned int> temp(1, vocab_.size() - 1); - Add(temp, source); - } - - void Add(const std::vector<unsigned int> &sentences, 
StringPiece target) { - if (sentences.empty()) return; - interns_.clear(); - for (util::TokenIter<util::SingleCharacter, true> i(target, ' '); i; ++i) { - interns_.push_back(intern_.Add(*i)); - } - for (std::vector<unsigned int>::const_iterator i(sentences.begin()); i != sentences.end(); ++i) { - boost::unordered_set<const char *> &vocab = vocab_[*i]; - for (std::vector<const char *>::const_iterator j = interns_.begin(); j != interns_.end(); ++j) { - vocab.insert(*j); - } - } - } - - void Print() const { - util::FileStream out(1); - for (std::vector<boost::unordered_set<const char *> >::const_iterator i = vocab_.begin(); i != vocab_.end(); ++i) { - for (boost::unordered_set<const char *>::const_iterator j = i->begin(); j != i->end(); ++j) { - out << *j << ' '; - } - out << '\n'; - } - } - - private: - InternString intern_; - - std::vector<boost::unordered_set<const char *> > vocab_; - - // Temporary in Add. - std::vector<const char *> interns_; -}; - -class Input { - public: - explicit Input(std::size_t max_length) - : max_length_(max_length), sentence_id_(0), empty_() {} - - void AddSentence(StringPiece sentence, TargetWords &targets) { - canonical_.clear(); - starts_.clear(); - starts_.push_back(0); - for (util::TokenIter<util::AnyCharacter, true> i(sentence, StringPiece("\0 \t", 3)); i; ++i) { - canonical_.append(i->data(), i->size()); - canonical_ += ' '; - starts_.push_back(canonical_.size()); - } - targets.Introduce(canonical_); - for (std::size_t i = 0; i < starts_.size() - 1; ++i) { - std::size_t subtract = starts_[i]; - const char *start = &canonical_[subtract]; - for (std::size_t j = i + 1; j < std::min(starts_.size(), i + max_length_ + 1); ++j) { - map_[util::MurmurHash64A(start, &canonical_[starts_[j]] - start - 1)].push_back(sentence_id_); - } - } - ++sentence_id_; - } - - // Assumes single space-delimited phrase with no space at the beginning or end. 
- const std::vector<unsigned int> &Matches(StringPiece phrase) const { - Map::const_iterator i = map_.find(util::MurmurHash64A(phrase.data(), phrase.size())); - return i == map_.end() ? empty_ : i->second; - } - - private: - const std::size_t max_length_; - - // hash of phrase is the key, array of sentences is the value. - typedef boost::unordered_map<uint64_t, std::vector<unsigned int> > Map; - Map map_; - - std::size_t sentence_id_; - - // Temporaries in AddSentence. - std::string canonical_; - std::vector<std::size_t> starts_; - - const std::vector<unsigned int> empty_; -}; - -} // namespace - -int main(int argc, char *argv[]) { - if (argc != 2) { - std::cerr << "Expected source text on the command line" << std::endl; - return 1; - } - Input input(7); - TargetWords targets; - try { - util::FilePiece inputs(argv[1], &std::cerr); - while (true) - input.AddSentence(inputs.ReadLine(), targets); - } catch (const util::EndOfFileException &e) {} - - util::FilePiece table(0, NULL, &std::cerr); - StringPiece line; - const StringPiece pipes("|||"); - while (true) { - try { - line = table.ReadLine(); - } catch (const util::EndOfFileException &e) { break; } - util::TokenIter<util::MultiCharacter> it(line, pipes); - StringPiece source(*it); - if (!source.empty() && source[source.size() - 1] == ' ') - source.remove_suffix(1); - targets.Add(input.Matches(source), *++it); - } - targets.Print(); -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/thread.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/thread.hh b/ext/kenlm/lm/filter/thread.hh deleted file mode 100644 index 88e069c..0000000 --- a/ext/kenlm/lm/filter/thread.hh +++ /dev/null @@ -1,167 +0,0 @@ -#ifndef LM_FILTER_THREAD_H -#define 
LM_FILTER_THREAD_H - -#include "util/thread_pool.hh" - -#include <boost/utility/in_place_factory.hpp> - -#include <deque> -#include <stack> - -namespace lm { - -template <class OutputBuffer> class ThreadBatch { - public: - ThreadBatch() {} - - void Reserve(size_t size) { - input_.Reserve(size); - output_.Reserve(size); - } - - // File reading thread. - InputBuffer &Fill(uint64_t sequence) { - sequence_ = sequence; - // Why wait until now to clear instead of after output? free in the same - // thread as allocated. - input_.Clear(); - return input_; - } - - // Filter worker thread. - template <class Filter> void CallFilter(Filter &filter) { - input_.CallFilter(filter, output_); - } - - uint64_t Sequence() const { return sequence_; } - - // File writing thread. - template <class RealOutput> void Flush(RealOutput &output) { - output_.Flush(output); - } - - private: - InputBuffer input_; - OutputBuffer output_; - - uint64_t sequence_; -}; - -template <class Batch, class Filter> class FilterWorker { - public: - typedef Batch *Request; - - FilterWorker(const Filter &filter, util::PCQueue<Request> &done) : filter_(filter), done_(done) {} - - void operator()(Request request) { - request->CallFilter(filter_); - done_.Produce(request); - } - - private: - Filter filter_; - - util::PCQueue<Request> &done_; -}; - -// There should only be one OutputWorker. -template <class Batch, class Output> class OutputWorker { - public: - typedef Batch *Request; - - OutputWorker(Output &output, util::PCQueue<Request> &done) : output_(output), done_(done), base_sequence_(0) {} - - void operator()(Request request) { - assert(request->Sequence() >= base_sequence_); - // Assemble the output in order. 
- uint64_t pos = request->Sequence() - base_sequence_; - if (pos >= ordering_.size()) { - ordering_.resize(pos + 1, NULL); - } - ordering_[pos] = request; - while (!ordering_.empty() && ordering_.front()) { - ordering_.front()->Flush(output_); - done_.Produce(ordering_.front()); - ordering_.pop_front(); - ++base_sequence_; - } - } - - private: - Output &output_; - - util::PCQueue<Request> &done_; - - std::deque<Request> ordering_; - - uint64_t base_sequence_; -}; - -template <class Filter, class OutputBuffer, class RealOutput> class Controller : boost::noncopyable { - private: - typedef ThreadBatch<OutputBuffer> Batch; - - public: - Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output) - : batch_size_(batch_size), queue_size_(queue), - batches_(queue), - to_read_(queue), - output_(queue, 1, boost::in_place(boost::ref(output), boost::ref(to_read_)), NULL), - filter_(queue, workers, boost::in_place(boost::ref(filter), boost::ref(output_.In())), NULL), - sequence_(0) { - for (size_t i = 0; i < queue; ++i) { - batches_[i].Reserve(batch_size); - local_read_.push(&batches_[i]); - } - NewInput(); - } - - void AddNGram(const StringPiece &ngram, const StringPiece &line, RealOutput &output) { - input_->AddNGram(ngram, line, output); - if (input_->Size() == batch_size_) { - FlushInput(); - NewInput(); - } - } - - void Flush() { - FlushInput(); - while (local_read_.size() < queue_size_) { - MoveRead(); - } - NewInput(); - } - - private: - void FlushInput() { - if (input_->Empty()) return; - filter_.Produce(local_read_.top()); - local_read_.pop(); - if (local_read_.empty()) MoveRead(); - } - - void NewInput() { - input_ = &local_read_.top()->Fill(sequence_++); - } - - void MoveRead() { - local_read_.push(to_read_.Consume()); - } - - const size_t batch_size_; - const size_t queue_size_; - - std::vector<Batch> batches_; - - util::PCQueue<Batch*> to_read_; - std::stack<Batch*> local_read_; - util::ThreadPool<OutputWorker<Batch, 
RealOutput> > output_; - util::ThreadPool<FilterWorker<Batch, Filter> > filter_; - - uint64_t sequence_; - InputBuffer *input_; -}; - -} // namespace lm - -#endif // LM_FILTER_THREAD_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/vocab.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/vocab.cc b/ext/kenlm/lm/filter/vocab.cc deleted file mode 100644 index 2aca4fc..0000000 --- a/ext/kenlm/lm/filter/vocab.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "lm/filter/vocab.hh" - -#include <istream> -#include <iostream> - -#include <cctype> - -namespace lm { -namespace vocab { - -void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out) { - in.exceptions(std::istream::badbit); - std::string word; - while (in >> word) { - out.insert(word); - } -} - -namespace { -bool IsLineEnd(std::istream &in) { - int got; - do { - got = in.get(); - if (!in) return true; - if (got == '\n') return true; - } while (isspace(got)); - in.unget(); - return false; -} -}// namespace - -// Read space separated words in enter separated lines. These lines can be -// very long, so don't read an entire line at a time. 
-unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) { - in.exceptions(std::istream::badbit); - unsigned int sentence = 0; - bool used_id = false; - std::string word; - while (in >> word) { - used_id = true; - std::vector<unsigned int> &posting = out[word]; - if (posting.empty() || (posting.back() != sentence)) - posting.push_back(sentence); - if (IsLineEnd(in)) { - ++sentence; - used_id = false; - } - } - return sentence + used_id; -} - -} // namespace vocab -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/vocab.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/vocab.hh b/ext/kenlm/lm/filter/vocab.hh deleted file mode 100644 index 397a932..0000000 --- a/ext/kenlm/lm/filter/vocab.hh +++ /dev/null @@ -1,133 +0,0 @@ -#ifndef LM_FILTER_VOCAB_H -#define LM_FILTER_VOCAB_H - -// Vocabulary-based filters for language models. - -#include "util/multi_intersection.hh" -#include "util/string_piece.hh" -#include "util/string_piece_hash.hh" -#include "util/tokenize_piece.hh" - -#include <boost/noncopyable.hpp> -#include <boost/range/iterator_range.hpp> -#include <boost/unordered/unordered_map.hpp> -#include <boost/unordered/unordered_set.hpp> - -#include <string> -#include <vector> - -namespace lm { -namespace vocab { - -void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out); - -// Read one sentence vocabulary per line. Return the number of sentences. -unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out); - -/* Is this a special tag like <s> or <UNK>? 
This actually includes anything - * surrounded with < and >, which most tokenizers separate for real words, so - * this should not catch real words as it looks at a single token. - */ -inline bool IsTag(const StringPiece &value) { - // The parser should never give an empty string. - assert(!value.empty()); - return (value.data()[0] == '<' && value.data()[value.size() - 1] == '>'); -} - -class Single { - public: - typedef boost::unordered_set<std::string> Words; - - explicit Single(const Words &vocab) : vocab_(vocab) {} - - template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { - for (Iterator i = begin; i != end; ++i) { - if (IsTag(*i)) continue; - if (FindStringPiece(vocab_, *i) == vocab_.end()) return false; - } - return true; - } - - private: - const Words &vocab_; -}; - -class Union { - public: - typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words; - - explicit Union(const Words &vocabs) : vocabs_(vocabs) {} - - template <class Iterator> bool PassNGram(const Iterator &begin, const Iterator &end) { - sets_.clear(); - - for (Iterator i(begin); i != end; ++i) { - if (IsTag(*i)) continue; - Words::const_iterator found(FindStringPiece(vocabs_, *i)); - if (vocabs_.end() == found) return false; - sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end())); - } - return (sets_.empty() || util::FirstIntersection(sets_)); - } - - private: - const Words &vocabs_; - - std::vector<boost::iterator_range<const unsigned int*> > sets_; -}; - -class Multiple { - public: - typedef boost::unordered_map<std::string, std::vector<unsigned int> > Words; - - Multiple(const Words &vocabs) : vocabs_(vocabs) {} - - private: - // Callback from AllIntersection that does AddNGram. 
- template <class Output> class Callback { - public: - Callback(Output &out, const StringPiece &line) : out_(out), line_(line) {} - - void operator()(unsigned int index) { - out_.SingleAddNGram(index, line_); - } - - private: - Output &out_; - const StringPiece &line_; - }; - - public: - template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { - sets_.clear(); - for (Iterator i(begin); i != end; ++i) { - if (IsTag(*i)) continue; - Words::const_iterator found(FindStringPiece(vocabs_, *i)); - if (vocabs_.end() == found) return; - sets_.push_back(boost::iterator_range<const unsigned int*>(&*found->second.begin(), &*found->second.end())); - } - if (sets_.empty()) { - output.AddNGram(line); - return; - } - - Callback<Output> cb(output, line); - util::AllIntersection(sets_, cb); - } - - template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { - AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output); - } - - void Flush() const {} - - private: - const Words &vocabs_; - - std::vector<boost::iterator_range<const unsigned int*> > sets_; -}; - -} // namespace vocab -} // namespace lm - -#endif // LM_FILTER_VOCAB_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/filter/wrapper.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/filter/wrapper.hh b/ext/kenlm/lm/filter/wrapper.hh deleted file mode 100644 index 227ec8e..0000000 --- a/ext/kenlm/lm/filter/wrapper.hh +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef LM_FILTER_WRAPPER_H -#define LM_FILTER_WRAPPER_H - -#include "util/string_piece.hh" - -#include <algorithm> 
-#include <string> -#include <vector> - -namespace lm { - -// Provide a single-output filter with the same interface as a -// multiple-output filter so clients code against one interface. -template <class Binary> class BinaryFilter { - public: - // Binary modes are just references (and a set) and it makes the API cleaner to copy them. - explicit BinaryFilter(Binary binary) : binary_(binary) {} - - template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { - if (binary_.PassNGram(begin, end)) - output.AddNGram(line); - } - - template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { - AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output); - } - - void Flush() const {} - - private: - Binary binary_; -}; - -// Wrap another filter to pay attention only to context words -template <class FilterT> class ContextFilter { - public: - typedef FilterT Filter; - - explicit ContextFilter(Filter &backend) : backend_(backend) {} - - template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { - // Find beginning of string or last space. 
- const char *last_space; - for (last_space = ngram.data() + ngram.size() - 1; last_space > ngram.data() && *last_space != ' '; --last_space) {} - backend_.AddNGram(StringPiece(ngram.data(), last_space - ngram.data()), line, output); - } - - void Flush() const {} - - private: - Filter backend_; -}; - -} // namespace lm - -#endif // LM_FILTER_WRAPPER_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/fragment_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/fragment_main.cc b/ext/kenlm/lm/fragment_main.cc deleted file mode 100644 index 0267cd4..0000000 --- a/ext/kenlm/lm/fragment_main.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include "lm/binary_format.hh" -#include "lm/model.hh" -#include "lm/left.hh" -#include "util/tokenize_piece.hh" - -template <class Model> void Query(const char *name) { - Model model(name); - std::string line; - lm::ngram::ChartState ignored; - while (getline(std::cin, line)) { - lm::ngram::RuleScore<Model> scorer(model, ignored); - for (util::TokenIter<util::SingleCharacter, true> i(line, ' '); i; ++i) { - scorer.Terminal(model.GetVocabulary().Index(*i)); - } - std::cout << scorer.Finish() << '\n'; - } -} - -int main(int argc, char *argv[]) { - if (argc != 2) { - std::cerr << "Expected model file name." << std::endl; - return 1; - } - const char *name = argv[1]; - lm::ngram::ModelType model_type = lm::ngram::PROBING; - lm::ngram::RecognizeBinary(name, model_type); - switch (model_type) { - case lm::ngram::PROBING: - Query<lm::ngram::ProbingModel>(name); - break; - case lm::ngram::REST_PROBING: - Query<lm::ngram::RestProbingModel>(name); - break; - default: - std::cerr << "Model type not supported yet." << std::endl; - } -}
