http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/binary_format.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/binary_format.cc b/ext/kenlm/lm/binary_format.cc deleted file mode 100644 index 802943f..0000000 --- a/ext/kenlm/lm/binary_format.cc +++ /dev/null @@ -1,302 +0,0 @@ -#include "lm/binary_format.hh" - -#include "lm/lm_exception.hh" -#include "util/file.hh" -#include "util/file_piece.hh" - -#include <cstddef> -#include <cstring> -#include <limits> -#include <string> -#include <cstdlib> - -#include <stdint.h> - -namespace lm { -namespace ngram { - -const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; - -namespace { -const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version"; -const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0"; -// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed). -const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n"; -const long int kMagicVersion = 5; - -// Old binary files built on 32-bit machines have this header. -// TODO: eliminate with next binary release. -struct OldSanity { - char magic[sizeof(kMagicBytes)]; - float zero_f, one_f, minus_half_f; - WordIndex one_word_index, max_word_index; - uint64_t one_uint64; - - void SetToReference() { - std::memset(this, 0, sizeof(OldSanity)); - std::memcpy(magic, kMagicBytes, sizeof(magic)); - zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; - one_word_index = 1; - max_word_index = std::numeric_limits<WordIndex>::max(); - one_uint64 = 1; - } -}; - - -// Test values aligned to 8 bytes. -struct Sanity { - char magic[ALIGN8(sizeof(kMagicBytes))]; - float zero_f, one_f, minus_half_f; - WordIndex one_word_index, max_word_index, padding_to_8; - uint64_t one_uint64; - - void SetToReference() { - std::memset(this, 0, sizeof(Sanity)); - std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes)); - zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; - one_word_index = 1; - max_word_index = std::numeric_limits<WordIndex>::max(); - padding_to_8 = 0; - one_uint64 = 1; - } -}; - -std::size_t TotalHeaderSize(unsigned char order) { - return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); -} - -void WriteHeader(void *to, const Parameters &params) { - Sanity header = Sanity(); - header.SetToReference(); - std::memcpy(to, &header, sizeof(Sanity)); - char *out = reinterpret_cast<char*>(to) + sizeof(Sanity); - - *reinterpret_cast<FixedWidthParameters*>(out) = params.fixed; - out += sizeof(FixedWidthParameters); - - uint64_t *counts = reinterpret_cast<uint64_t*>(out); - for (std::size_t i = 0; i < params.counts.size(); ++i) { - counts[i] = params.counts[i]; - } -} - -} // namespace - -bool IsBinaryFormat(int fd) { - const uint64_t size = util::SizeFile(fd); - if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false; - // Try reading the header.
- util::scoped_memory memory; - try { - util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory); - } catch (const util::Exception &e) { - return false; - } - Sanity reference_header = Sanity(); - reference_header.SetToReference(); - if (!std::memcmp(memory.get(), &reference_header, sizeof(Sanity))) return true; - if (!std::memcmp(memory.get(), kMagicIncomplete, strlen(kMagicIncomplete))) { - UTIL_THROW(FormatLoadException, "This binary file did not finish building"); - } - if (!std::memcmp(memory.get(), kMagicBeforeVersion, strlen(kMagicBeforeVersion))) { - char *end_ptr; - const char *begin_version = static_cast<const char*>(memory.get()) + strlen(kMagicBeforeVersion); - long int version = std::strtol(begin_version, &end_ptr, 10); - if ((end_ptr != begin_version) && version != kMagicVersion) { - UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA file to rebuild your binary"); - } - - OldSanity old_sanity = OldSanity(); - old_sanity.SetToReference(); - UTIL_THROW_IF(!std::memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format. The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable."); - UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. Try rebuilding the binary format LM using the same code revision, compiler, and architecture."); - } - return false; -} - -void ReadHeader(int fd, Parameters &out) { - util::SeekOrThrow(fd, sizeof(Sanity)); - util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed)); - if (out.fixed.probing_multiplier < 1.0) - UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0."); - - out.counts.resize(static_cast<std::size_t>(out.fixed.order)); - if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order); -} - -void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters &params) { - if (params.fixed.model_type != model_type) { - if (static_cast<unsigned int>(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *))) - UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast<unsigned int>(params.fixed.model_type) << " but this model type is not implemented in this inference code."); - UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]); - } - UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version); -} - -const std::size_t kInvalidSize = static_cast<std::size_t>(-1); - -BinaryFormat::BinaryFormat(const Config &config) - : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method), - header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {} - -void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params) { - file_.reset(fd); - write_mmap_ = NULL; // Ignore write requests; this is already in binary format.
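// (The rest of BinaryFormat branches on write_mmap_ == NULL, so clearing the
// pointer here is what turns a requested write into a no-op read.)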
- ReadHeader(fd, params); - MatchCheck(model_type, search_version, params); - header_size_ = TotalHeaderSize(params.counts.size()); -} - -void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const { - assert(header_size_ != kInvalidSize); - util::ErsatzPRead(file_.get(), to, amount, offset_excluding_header + header_size_); -} - -void *BinaryFormat::LoadBinary(std::size_t size) { - assert(header_size_ != kInvalidSize); - const uint64_t file_size = util::SizeFile(file_.get()); - // The header is smaller than a page, so we have to map the whole header as well. - uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size); - UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); - - util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_); - - vocab_string_offset_ = total_map; - return reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_; -} - -void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) { - vocab_size_ = memory_size; - if (!write_mmap_) { - header_size_ = 0; - util::HugeMalloc(memory_size, true, memory_vocab_); - return reinterpret_cast<uint8_t*>(memory_vocab_.get()); - } - header_size_ = TotalHeaderSize(order); - std::size_t total = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size)); - file_.reset(util::CreateOrThrow(write_mmap_)); - // some gccs complain about uninitialized variables even though all enum values are covered. - void *vocab_base = NULL; - switch (write_method_) { - case Config::WRITE_MMAP: - mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED); - vocab_base = mapping_.get(); - util::AdviseHugePages(vocab_base, total); - break; - case Config::WRITE_AFTER: - util::ResizeOrThrow(file_.get(), 0); - util::HugeMalloc(total, true, memory_vocab_); - vocab_base = memory_vocab_.get(); - break; - } - strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_); - return reinterpret_cast<uint8_t*>(vocab_base) + header_size_; -} - -void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) { - assert(vocab_size_ != kInvalidSize); - vocab_pad_ = vocab_pad; - std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size; - vocab_string_offset_ = new_size; - if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) { - util::HugeMalloc(memory_size, true, memory_search_); - assert(header_size_ == 0 || write_mmap_); - vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_; - util::AdviseHugePages(memory_search_.get(), memory_size); - return reinterpret_cast<uint8_t*>(memory_search_.get()); - } - - assert(write_method_ == Config::WRITE_MMAP); - // Also known as total size without vocab words. - // Grow the file to accommodate the search, using zeros. - // According to man mmap, behavior is undefined when the file is resized - // underneath a mmap that is not a multiple of the page size. So to be - // safe, we'll unmap it and map it again.
- mapping_.reset(); - util::ResizeOrThrow(file_.get(), new_size); - void *ret; - MapFile(vocab_base, ret); - util::AdviseHugePages(ret, new_size); - return ret; -} - -void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) { - // Checking Config's include_vocab is the responsibility of the caller. - assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize); - if (!write_mmap_) { - // Unchanged base. - vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()); - search_base = reinterpret_cast<uint8_t*>(memory_search_.get()); - return; - } - if (write_method_ == Config::WRITE_MMAP) { - mapping_.reset(); - } - util::SeekOrThrow(file_.get(), VocabStringReadingOffset()); - util::WriteOrThrow(file_.get(), &buffer[0], buffer.size()); - if (write_method_ == Config::WRITE_MMAP) { - MapFile(vocab_base, search_base); - } else { - vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_; - search_base = reinterpret_cast<uint8_t*>(memory_search_.get()); - } -} - -void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) { - if (!write_mmap_) return; - switch (write_method_) { - case Config::WRITE_MMAP: - util::SyncOrThrow(mapping_.get(), mapping_.size()); - break; - case Config::WRITE_AFTER: - util::SeekOrThrow(file_.get(), 0); - util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size()); - util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_); - util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size()); - util::FSyncOrThrow(file_.get()); - break; - } - // header and vocab share the same mmap. - Parameters params = Parameters(); - memset(&params, 0, sizeof(Parameters)); - params.counts = counts; - params.fixed.order = counts.size(); - params.fixed.probing_multiplier = config.probing_multiplier; - params.fixed.model_type = model_type; - params.fixed.has_vocabulary = config.include_vocab; - params.fixed.search_version = search_version; - switch (write_method_) { - case Config::WRITE_MMAP: - WriteHeader(mapping_.get(), params); - util::SyncOrThrow(mapping_.get(), mapping_.size()); - break; - case Config::WRITE_AFTER: - { - std::vector<uint8_t> buffer(TotalHeaderSize(counts.size())); - WriteHeader(&buffer[0], params); - util::SeekOrThrow(file_.get(), 0); - util::WriteOrThrow(file_.get(), &buffer[0], buffer.size()); - } - break; - } -} - -void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) { - mapping_.reset(util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()), vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED); - vocab_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_; - search_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_ + vocab_size_ + vocab_pad_; -} - -bool RecognizeBinary(const char *file, ModelType &recognized) { - util::scoped_fd fd(util::OpenReadOrThrow(file)); - if (!IsBinaryFormat(fd.get())) { - return false; - } - Parameters params; - ReadHeader(fd.get(), params); - recognized = params.fixed.model_type; - return true; -} - -} // namespace ngram
-} // namespace lm
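RecognizeBinary above is the only call in this file intended for decoder authors. A minimal sketch of how a caller might drive it; this program is illustrative and not part of the diff, but the kModelNames indexing mirrors what MatchCheck does:

```cpp
// Hedged sketch: probe a file with lm::ngram::RecognizeBinary before deciding
// how to load it. Assumes the usual KenLM headers are on the include path.
#include "lm/binary_format.hh"
#include "util/exception.hh"

#include <iostream>

int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " lm_file" << std::endl;
    return 1;
  }
  lm::ngram::ModelType type;
  try {
    if (lm::ngram::RecognizeBinary(argv[1], type)) {
      // kModelNames is indexed by ModelType, as in MatchCheck.
      std::cout << "KenLM binary: " << lm::ngram::kModelNames[type] << std::endl;
    } else {
      // Not a binary; a decoder would fall back to ARPA loading here.
      std::cout << "Not a KenLM binary; treat it as ARPA text." << std::endl;
    }
  } catch (const util::Exception &e) {
    // Thrown for incomplete builds, old 32-bit headers, or mismatched sanity values.
    std::cerr << e.what() << std::endl;
    return 1;
  }
  return 0;
}
```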
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/binary_format.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/binary_format.hh b/ext/kenlm/lm/binary_format.hh deleted file mode 100644 index ff99b95..0000000 --- a/ext/kenlm/lm/binary_format.hh +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef LM_BINARY_FORMAT_H -#define LM_BINARY_FORMAT_H - -#include "lm/config.hh" -#include "lm/model_type.hh" -#include "lm/read_arpa.hh" - -#include "util/file_piece.hh" -#include "util/mmap.hh" -#include "util/scoped.hh" - -#include <cstddef> -#include <vector> - -#include <stdint.h> - -namespace lm { -namespace ngram { - -extern const char *kModelNames[6]; - -/* Inspect a file to determine if it is a binary lm. If not, return false. - * If so, return true and set recognized to the type. This is the only API in - * this header designed for use by decoder authors. - */ -bool RecognizeBinary(const char *file, ModelType &recognized); - -struct FixedWidthParameters { - unsigned char order; - float probing_multiplier; - // What type of model is this? - ModelType model_type; - // Does the end of the file have the actual strings in the vocabulary? - bool has_vocabulary; - unsigned int search_version; -}; - -// This is a macro instead of an inline function so constants can be assigned using it. -#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8) - -// Parameters stored in the header of a binary file. -struct Parameters { - FixedWidthParameters fixed; - std::vector<uint64_t> counts; -}; - -class BinaryFormat { - public: - explicit BinaryFormat(const Config &config); - - // Reading a binary file: - // Takes ownership of fd. - void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters &params); - // Used to read parts of the file to update the config object before figuring out full size. - void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const; - // Actually load the binary file and return a pointer to the beginning of the search area. - void *LoadBinary(std::size_t size); - - uint64_t VocabStringReadingOffset() const { - assert(vocab_string_offset_ != kInvalidOffset); - return vocab_string_offset_; - } - - // Writing a binary file or initializing in RAM from ARPA: - // Size for vocabulary. - void *SetupJustVocab(std::size_t memory_size, uint8_t order); - // Warning: can change the vocabulary base pointer. - void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base); - // Warning: can change vocabulary and search base addresses. - void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base); - // Write the header at the beginning of the file. - void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts); - - private: - void MapFile(void *&vocab_base, void *&search_base); - - // Copied from configuration. - const Config::WriteMethod write_method_; - const char *write_mmap_; - util::LoadMethod load_method_; - - // File behind memory, if any. - util::scoped_fd file_; - - // If there is a file involved, a single mapping.
- util::scoped_memory mapping_; - - // If the data is only in memory, separately allocate each because the trie - // knows vocab's size before it knows search's size (because SRILM might - // have pruned). - util::scoped_memory memory_vocab_, memory_search_; - - // Memory ranges. Note that these may not be contiguous and may not all - // exist. - std::size_t header_size_, vocab_size_, vocab_pad_; - // aka end of search. - uint64_t vocab_string_offset_; - - static const uint64_t kInvalidOffset = (uint64_t)-1; -}; - -bool IsBinaryFormat(int fd); - -} // namespace ngram -} // namespace lm -#endif // LM_BINARY_FORMAT_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/blank.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/blank.hh b/ext/kenlm/lm/blank.hh deleted file mode 100644 index e09054c..0000000 --- a/ext/kenlm/lm/blank.hh +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef LM_BLANK_H -#define LM_BLANK_H - -#include <limits> -#include <stdint.h> -#include <cmath> - -namespace lm { -namespace ngram { - -/* Suppose "foo bar" appears with zero backoff but there is no trigram - * beginning with these words. Then, when scoring "foo bar", the model could - * return out_state containing "bar" or even null context if "bar" also has no - * backoff and is never followed by another word. Then the backoff is set to - * kNoExtensionBackoff. If the n-gram might be extended, then out_state must - * contain the full n-gram, in which case kExtensionBackoff is set. In any - * case, if an n-gram has non-zero backoff, the full state is returned so - * backoff can be properly charged. - * These differ only in sign bit because the backoff is in fact zero in either - * case. - */ -const float kNoExtensionBackoff = -0.0; -const float kExtensionBackoff = 0.0; -const uint64_t kNoExtensionQuant = 0; -const uint64_t kExtensionQuant = 1; - -inline void SetExtension(float &backoff) { - if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff; -} - -// This compiles down nicely. 
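// Under IEEE 754, -0.0f == 0.0f, so a plain float comparison cannot tell
// kNoExtensionBackoff from kExtensionBackoff; comparing the raw bit patterns
// through a union distinguishes the sign bit instead.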
-inline bool HasExtension(const float &backoff) { - typedef union { float f; uint32_t i; } UnionValue; - UnionValue compare, interpret; - compare.f = kNoExtensionBackoff; - interpret.f = backoff; - return compare.i != interpret.i; -} - -} // namespace ngram -} // namespace lm -#endif // LM_BLANK_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/build_binary_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/build_binary_main.cc b/ext/kenlm/lm/build_binary_main.cc deleted file mode 100644 index 35206e6..0000000 --- a/ext/kenlm/lm/build_binary_main.cc +++ /dev/null @@ -1,234 +0,0 @@ -#include "lm/model.hh" -#include "lm/sizes.hh" -#include "util/file_piece.hh" -#include "util/usage.hh" - -#include <algorithm> -#include <cstdlib> -#include <exception> -#include <iostream> -#include <iomanip> -#include <limits> -#include <cmath> -#include <cstdlib> - -#ifdef WIN32 -#include "util/getopt.hh" -#else -#include <unistd.h> -#endif - -namespace lm { -namespace ngram { -namespace { - -void Usage(const char *name, const char *default_mem) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" -"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n" -" Default is -100. The ARPA file will always take precedence.\n" -"-s allows models to be built even if they do not have <s> and </s>.\n" -"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" -"-w mmap|after determines how writing is done.\n" -" mmap maps the binary file and writes to it. Default for trie.\n" -" after allocates anonymous memory, builds, and writes. Default for probing.\n" -"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" -" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n" -" the same data structure as being built. All files must have the same\n" -" vocabulary. For probing, the unigrams must be in the same order.\n\n" -"type is either probing or trie. Default is probing.\n\n" -"probing uses a probing hash table. It is the fastest but uses the most memory.\n" -"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" -"trie is a straightforward trie with bit-level packing. It uses the least\n" -"memory and is still faster than SRI or IRST. Building the trie format uses an\n" -"on-disk sort to save memory.\n" -"-T is the temporary directory prefix. Default is the output file name.\n" -"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" -" with GNU sort. The number is followed by a unit: \% for percent of physical\n" -" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" -" Default unit is K for Kilobytes.\n" -"-q turns quantization on and sets the number of bits (e.g. -q 8).\n" -"-b sets backoff quantization bits. Requires -q and defaults to that value.\n" -"-a compresses pointers using an array of offsets. The parameter is the\n" -" maximum number of bits encoded by the array. 
Memory is minimized subject\n" " to the maximum, so pick 255 to minimize memory.\n\n" "-h print this help message.\n\n" "Get a memory estimate by passing an ARPA file without an output file name.\n"; - exit(1); -} - -// I could really use boost::lexical_cast right about now. -float ParseFloat(const char *from) { - char *end; - float ret = strtod(from, &end); - if (*end) throw util::ParseNumberException(from); - return ret; -} -unsigned long int ParseUInt(const char *from) { - char *end; - unsigned long int ret = strtoul(from, &end, 10); - if (*end) throw util::ParseNumberException(from); - return ret; -} - -uint8_t ParseBitCount(const char *from) { - unsigned long val = ParseUInt(from); - if (val > 25) { - util::ParseNumberException e(from); - e << " bit counts are limited to 25."; - throw e; - } - return val; -} - -void ParseFileList(const char *from, std::vector<std::string> &to) { - to.clear(); - while (true) { - const char *i; - for (i = from; *i && *i != ' '; ++i) {} - to.push_back(std::string(from, i - from)); - if (!*i) break; - from = i + 1; - } -} - -void ProbingQuantizationUnsupported() { - std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; - exit(1); -} - -} // namespace -} // namespace ngram -} // namespace lm - -int main(int argc, char *argv[]) { - using namespace lm::ngram; - - const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G"; - - if (argc == 2 && !strcmp(argv[1], "--help")) - Usage(argv[0], default_mem); - - try { - bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; - lm::ngram::Config config; - config.building_memory = util::ParseSize(default_mem); - int opt; - while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:h")) != -1) { - switch(opt) { - case 'q': - config.prob_bits = ParseBitCount(optarg); - if (!set_backoff_bits) config.backoff_bits = config.prob_bits; - quantize = true; - break; - case 'b': - config.backoff_bits = ParseBitCount(optarg); - set_backoff_bits = true; - break; - case 'a': - config.pointer_bhiksha_bits = ParseBitCount(optarg); - bhiksha = true; - break; - case 'u': - config.unknown_missing_logprob = ParseFloat(optarg); - break; - case 'p': - config.probing_multiplier = ParseFloat(optarg); - break; - case 't': // legacy - case 'T': - config.temporary_directory_prefix = optarg; - util::NormalizeTempPrefix(config.temporary_directory_prefix); - break; - case 'm': // legacy - config.building_memory = ParseUInt(optarg) * 1048576; - break; - case 'S': - config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg)); - break; - case 'w': - set_write_method = true; - if (!strcmp(optarg, "mmap")) { - config.write_method = Config::WRITE_MMAP; - } else if (!strcmp(optarg, "after")) { - config.write_method = Config::WRITE_AFTER; - } else { - Usage(argv[0], default_mem); - } - break; - case 's': - config.sentence_marker_missing = lm::SILENT; - break; - case 'i': - config.positive_log_probability = lm::SILENT; - break; - case 'r': - rest = true; - ParseFileList(optarg, config.rest_lower_files); - config.rest_function = Config::REST_LOWER; - break; - case 'h': // help - default: - Usage(argv[0], default_mem); - } - } - if (!quantize && set_backoff_bits) { - std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl; - abort(); - } - if (optind + 1 == argc) { - ShowSizes(argv[optind], config); - return 0; - } - const char *model_type; - const char
*from_file; - - if (optind + 2 == argc) { - model_type = "probing"; - from_file = argv[optind]; - config.write_mmap = argv[optind + 1]; - } else if (optind + 3 == argc) { - model_type = argv[optind]; - from_file = argv[optind + 1]; - config.write_mmap = argv[optind + 2]; - } else { - Usage(argv[0], default_mem); - return 1; - } - if (!strcmp(model_type, "probing")) { - if (!set_write_method) config.write_method = Config::WRITE_AFTER; - if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); - if (rest) { - RestProbingModel(from_file, config); - } else { - ProbingModel(from_file, config); - } - } else if (!strcmp(model_type, "trie")) { - if (rest) { - std::cerr << "Rest + trie is not supported yet." << std::endl; - return 1; - } - if (!set_write_method) config.write_method = Config::WRITE_MMAP; - if (quantize) { - if (bhiksha) { - QuantArrayTrieModel(from_file, config); - } else { - QuantTrieModel(from_file, config); - } - } else { - if (bhiksha) { - ArrayTrieModel(from_file, config); - } else { - TrieModel(from_file, config); - } - } - } else { - Usage(argv[0], default_mem); - } - } - catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - std::cerr << "ERROR" << std::endl; - return 1; - } - std::cerr << "SUCCESS" << std::endl; - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/CMakeLists.txt b/ext/kenlm/lm/builder/CMakeLists.txt deleted file mode 100644 index cc0d3ed..0000000 --- a/ext/kenlm/lm/builder/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -cmake_minimum_required(VERSION 2.8.8) -# -# The KenLM cmake files make use of add_library(... OBJECTS ...) -# -# This syntax allows grouping of source files when compiling -# (effectively creating "fake" libraries based on source subdirs). -# -# This syntax was only added in cmake version 2.8.8 -# -# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library - - -# This CMake file was created by Lane Schwartz <[email protected]> - -# Explicitly list the source files for this subdirectory -# -# If you add any source files to this subdirectory -# that should be included in the kenlm library, -# (this excludes any unit test files) -# you should add them to the following list: -# -# In order to set correct paths to these files -# in case this variable is referenced by CMake files in the parent directory, -# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. -# -set(KENLM_BUILDER_SOURCE - ${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc - ${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc - ${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc - ${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc - ${CMAKE_CURRENT_SOURCE_DIR}/output.cc - ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc - ) - - -# Group these objects together for later use. 
-# -# Given add_library(foo OBJECT ${my_foo_sources}), -# refer to these objects as $<TARGET_OBJECTS:foo> -# -add_library(kenlm_builder OBJECT ${KENLM_BUILDER_SOURCE}) - - -# Compile the executable, linking against the requisite dependent object files -add_executable(lmplz lmplz_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_common> $<TARGET_OBJECTS:kenlm_builder> $<TARGET_OBJECTS:kenlm_util>) - -# Link the executable against boost -target_link_libraries(lmplz ${Boost_LIBRARIES} pthread) - -# Group executables together -set_target_properties(lmplz PROPERTIES FOLDER executables) - -if(BUILD_TESTING) - - # Explicitly list the Boost test files to be compiled - set(KENLM_BOOST_TESTS_LIST - adjust_counts_test - corpus_count_test - ) - - AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} - DEPENDS $<TARGET_OBJECTS:kenlm> - $<TARGET_OBJECTS:kenlm_common> - $<TARGET_OBJECTS:kenlm_util> - $<TARGET_OBJECTS:kenlm_builder> - LIBRARIES ${Boost_LIBRARIES} pthread) -endif() http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/Jamfile ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/Jamfile b/ext/kenlm/lm/builder/Jamfile deleted file mode 100644 index 329a8e0..0000000 --- a/ext/kenlm/lm/builder/Jamfile +++ /dev/null @@ -1,13 +0,0 @@ -fakelib builder : [ glob *.cc : *test.cc *main.cc ] - ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm ../common//common - : : : <library>/top//boost_thread $(timer-link) ; - -exe lmplz : lmplz_main.cc builder /top//boost_program_options ; - -exe dump_counts : dump_counts_main.cc builder ; - -alias programs : lmplz dump_counts ; - -import testing ; -unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ; -unit-test adjust_counts_test : adjust_counts_test.cc builder /top//boost_unit_test_framework ; http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/README.md ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/README.md b/ext/kenlm/lm/builder/README.md deleted file mode 100644 index be0d35e..0000000 --- a/ext/kenlm/lm/builder/README.md +++ /dev/null @@ -1,47 +0,0 @@ -Dependencies -============ - -Boost >= 1.42.0 is required. - -For Ubuntu, -```bash -sudo apt-get install libboost1.48-all-dev -``` - -Alternatively, you can download, compile, and install it yourself: - -```bash -wget http://sourceforge.net/projects/boost/files/boost/1.52.0/boost_1_52_0.tar.gz/download -O boost_1_52_0.tar.gz -tar -xvzf boost_1_52_0.tar.gz -cd boost_1_52_0 -./bootstrap.sh -./b2 -sudo ./b2 install -``` - -Local install options (in a user-space prefix directory) are also possible. See http://www.boost.org/doc/libs/1_52_0/doc/html/bbv2/installation.html. - - -Building -======== - -```bash -bjam -``` -Your distribution might package bjam and boost-build separately from Boost. Both are required. 
- -Usage -===== - -Run -```bash -$ bin/lmplz -``` -to see command line arguments - -Running -======= - -```bash -bin/lmplz -o 5 <text >text.arpa -``` http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/TODO ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/TODO b/ext/kenlm/lm/builder/TODO deleted file mode 100644 index cb5aef3..0000000 --- a/ext/kenlm/lm/builder/TODO +++ /dev/null @@ -1,5 +0,0 @@ -More tests! -Sharding. -Some way to manage all the crazy config options. -Option to build the binary file directly. -Interpolation of different orders. http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/adjust_counts.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/adjust_counts.cc b/ext/kenlm/lm/builder/adjust_counts.cc deleted file mode 100644 index b4c5ba8..0000000 --- a/ext/kenlm/lm/builder/adjust_counts.cc +++ /dev/null @@ -1,353 +0,0 @@ -#include "lm/builder/adjust_counts.hh" -#include "lm/common/ngram_stream.hh" -#include "lm/builder/payload.hh" -#include "util/stream/timer.hh" - -#include <algorithm> -#include <iostream> -#include <limits> - -namespace lm { namespace builder { - -BadDiscountException::BadDiscountException() throw() {} -BadDiscountException::~BadDiscountException() throw() {} - -namespace { -// Return last word in full that is different. -const WordIndex* FindDifference(const NGram<BuildingPayload> &full, const NGram<BuildingPayload> &lower_last) { - const WordIndex *cur_word = full.end() - 1; - const WordIndex *pre_word = lower_last.end() - 1; - // Find last difference. - for (; pre_word >= lower_last.begin() && *pre_word == *cur_word; --cur_word, --pre_word) {} - return cur_word; -} - -class StatCollector { - public: - StatCollector(std::size_t order, std::vector<uint64_t> &counts, std::vector<uint64_t> &counts_pruned, std::vector<Discount> &discounts) - : orders_(order), full_(orders_.back()), counts_(counts), counts_pruned_(counts_pruned), discounts_(discounts) { - memset(&orders_[0], 0, sizeof(OrderStat) * order); - } - - ~StatCollector() {} - - void CalculateDiscounts(const DiscountConfig &config) { - counts_.resize(orders_.size()); - counts_pruned_.resize(orders_.size()); - for (std::size_t i = 0; i < orders_.size(); ++i) { - const OrderStat &s = orders_[i]; - counts_[i] = s.count; - counts_pruned_[i] = s.count_pruned; - } - - discounts_ = config.overwrite; - discounts_.resize(orders_.size()); - for (std::size_t i = config.overwrite.size(); i < orders_.size(); ++i) { - const OrderStat &s = orders_[i]; - try { - for (unsigned j = 1; j < 4; ++j) { - // TODO: Specialize error message for j == 3, meaning 3+ - UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for " - << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any " - << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?\n" - << "Try deduplicating the input. To override this error for e.g. 
a class-based model, rerun with --discount_fallback\n"); - } - - // See equation (26) in Chen and Goodman. - discounts_[i].amount[0] = 0.0; - float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]); - for (unsigned j = 1; j < 4; ++j) { - discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]); - UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]); - } - } catch (const BadDiscountException &e) { - switch (config.bad_action) { - case THROW_UP: - throw; - case COMPLAIN: - std::cerr << "Substituting fallback discounts for order " << i << ": D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl; - case SILENT: - break; - } - discounts_[i] = config.fallback; - } - } - } - - void Add(std::size_t order_minus_1, uint64_t count, bool pruned = false) { - OrderStat &stat = orders_[order_minus_1]; - ++stat.count; - if (!pruned) - ++stat.count_pruned; - if (count < 5) ++stat.n[count]; - } - - void AddFull(uint64_t count, bool pruned = false) { - ++full_.count; - if (!pruned) - ++full_.count_pruned; - if (count < 5) ++full_.n[count]; - } - - private: - struct OrderStat { - // n_1 in equation 26 of Chen and Goodman etc - uint64_t n[5]; - uint64_t count; - uint64_t count_pruned; - }; - - std::vector<OrderStat> orders_; - OrderStat &full_; - - std::vector<uint64_t> &counts_; - std::vector<uint64_t> &counts_pruned_; - std::vector<Discount> &discounts_; -}; - -// Reads all entries in order like NGramStream does. -// But deletes any entries that have <s> in the 1st (not 0th) position on the -// way out by putting other entries in their place. This disrupts the sort -// order but we don't care because the data is going to be sorted again. 
-class CollapseStream { - public: - CollapseStream(const util::stream::ChainPosition &position, uint64_t prune_threshold, const std::vector<bool>& prune_words) : - current_(NULL, NGram<BuildingPayload>::OrderFromSize(position.GetChain().EntrySize())), - prune_threshold_(prune_threshold), - prune_words_(prune_words), - block_(position) { - StartBlock(); - } - - const NGram<BuildingPayload> &operator*() const { return current_; } - const NGram<BuildingPayload> *operator->() const { return &current_; } - - operator bool() const { return block_; } - - CollapseStream &operator++() { - assert(block_); - - if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) { - memcpy(current_.Base(), copy_from_, current_.TotalSize()); - UpdateCopyFrom(); - - // Mark highest order n-grams for later pruning - if(current_.Value().count <= prune_threshold_) { - current_.Value().Mark(); - } - - if(!prune_words_.empty()) { - for(WordIndex* i = current_.begin(); i != current_.end(); i++) { - if(prune_words_[*i]) { - current_.Value().Mark(); - break; - } - } - } - - } - - current_.NextInMemory(); - uint8_t *block_base = static_cast<uint8_t*>(block_->Get()); - if (current_.Base() == block_base + block_->ValidSize()) { - block_->SetValidSize(copy_from_ + current_.TotalSize() - block_base); - ++block_; - StartBlock(); - } - - // Mark highest order n-grams for later pruning - if(current_.Value().count <= prune_threshold_) { - current_.Value().Mark(); - } - - if(!prune_words_.empty()) { - for(WordIndex* i = current_.begin(); i != current_.end(); i++) { - if(prune_words_[*i]) { - current_.Value().Mark(); - break; - } - } - } - - return *this; - } - - private: - void StartBlock() { - for (; ; ++block_) { - if (!block_) return; - if (block_->ValidSize()) break; - } - current_.ReBase(block_->Get()); - copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize(); - UpdateCopyFrom(); - - // Mark highest order n-grams for later pruning - if(current_.Value().count <= prune_threshold_) { - current_.Value().Mark(); - } - - if(!prune_words_.empty()) { - for(WordIndex* i = current_.begin(); i != current_.end(); i++) { - if(prune_words_[*i]) { - current_.Value().Mark(); - break; - } - } - } - - } - - // Find last without bos. - void UpdateCopyFrom() { - for (copy_from_ -= current_.TotalSize(); copy_from_ >= current_.Base(); copy_from_ -= current_.TotalSize()) { - if (NGram<BuildingPayload>(copy_from_, current_.Order()).begin()[1] != kBOS) break; - } - } - - NGram<BuildingPayload> current_; - - // Goes backwards in the block - uint8_t *copy_from_; - uint64_t prune_threshold_; - const std::vector<bool>& prune_words_; - util::stream::Link block_; -}; - -} // namespace - -void AdjustCounts::Run(const util::stream::ChainPositions &positions) { - UTIL_TIMER("(%w s) Adjusted counts\n"); - - const std::size_t order = positions.size(); - StatCollector stats(order, counts_, counts_pruned_, discounts_); - if (order == 1) { - - // Only unigrams. Just collect stats.
- for (NGramStream<BuildingPayload> full(positions[0]); full; ++full) { - - // Do not prune <s> </s> <unk> - if(*full->begin() > 2) { - if(full->Value().count <= prune_thresholds_[0]) - full->Value().Mark(); - - if(!prune_words_.empty() && prune_words_[*full->begin()]) - full->Value().Mark(); - } - - stats.AddFull(full->Value().UnmarkedCount(), full->Value().IsMarked()); - } - - stats.CalculateDiscounts(discount_config_); - return; - } - - NGramStreams<BuildingPayload> streams; - streams.Init(positions, positions.size() - 1); - - CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_); - - // Initialization: <unk> has count 0 and so does <s>. - NGramStream<BuildingPayload> *lower_valid = streams.begin(); - const NGramStream<BuildingPayload> *const streams_begin = streams.begin(); - streams[0]->Value().count = 0; - *streams[0]->begin() = kUNK; - stats.Add(0, 0); - (++streams[0])->Value().count = 0; - *streams[0]->begin() = kBOS; - // <s> is not in stats yet because it will get put in later. - - // This keeps track of actual counts for lower orders. It is not output - // (only adjusted counts are), but used to determine pruning. - std::vector<uint64_t> actual_counts(positions.size(), 0); - // Something of a hack: don't prune <s>. - actual_counts[0] = std::numeric_limits<uint64_t>::max(); - - // Iterate over full (the stream of the highest order ngrams) - for (; full; ++full) { - const WordIndex *different = FindDifference(*full, **lower_valid); - std::size_t same = full->end() - 1 - different; - - // STEP 1: Output all the n-grams that changed. - for (; lower_valid >= streams.begin() + same; --lower_valid) { - uint64_t order_minus_1 = lower_valid - streams_begin; - if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1]) - (*lower_valid)->Value().Mark(); - - if(!prune_words_.empty()) { - for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) { - if(prune_words_[*i]) { - (*lower_valid)->Value().Mark(); - break; - } - } - } - - stats.Add(order_minus_1, (*lower_valid)->Value().UnmarkedCount(), (*lower_valid)->Value().IsMarked()); - ++*lower_valid; - } - - // STEP 2: Update n-grams that still match. - // n-grams that match get count from the full entry. - for (std::size_t i = 0; i < same; ++i) { - actual_counts[i] += full->Value().UnmarkedCount(); - } - // Increment the number of unique extensions for the longest match. - if (same) ++streams[same - 1]->Value().count; - - // STEP 3: Initialize new n-grams. - // This is here because bos is also const WordIndex *, so copy gets - // consistent argument types. - const WordIndex *full_end = full->end(); - // Initialize and mark as valid up to bos. - const WordIndex *bos; - for (bos = different; (bos > full->begin()) && (*bos != kBOS); --bos) { - NGramStream<BuildingPayload> &to = *++lower_valid; - std::copy(bos, full_end, to->begin()); - to->Value().count = 1; - actual_counts[lower_valid - streams_begin] = full->Value().UnmarkedCount(); - } - // Now bos indicates where <s> is or is the 0th word of full. - if (bos != full->begin()) { - // There is an <s> beyond the 0th word. - NGramStream<BuildingPayload> &to = *++lower_valid; - std::copy(bos, full_end, to->begin()); - - // Anything that begins with <s> has full non adjusted count. 
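// (Adjusted counts are counts of unique left extensions, and no word can
// precede <s>, so <s>-initial n-grams keep their raw counts.)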
- to->Value().count = full->Value().UnmarkedCount(); - actual_counts[lower_valid - streams_begin] = full->Value().UnmarkedCount(); - } else { - stats.AddFull(full->Value().UnmarkedCount(), full->Value().IsMarked()); - } - assert(lower_valid >= &streams[0]); - } - - // The above loop outputs n-grams when it observes changes. This outputs - // the last n-grams. - for (NGramStream<BuildingPayload> *s = streams.begin(); s <= lower_valid; ++s) { - uint64_t lower_count = actual_counts[(*s)->Order() - 1]; - if(lower_count <= prune_thresholds_[(*s)->Order() - 1]) - (*s)->Value().Mark(); - - if(!prune_words_.empty()) { - for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) { - if(prune_words_[*i]) { - (*s)->Value().Mark(); - break; - } - } - } - - stats.Add(s - streams.begin(), lower_count, (*s)->Value().IsMarked()); - ++*s; - } - // Poison everyone! Except the N-grams which were already poisoned by the input. - for (NGramStream<BuildingPayload> *s = streams.begin(); s != streams.end(); ++s) - s->Poison(); - - stats.CalculateDiscounts(discount_config_); - - // NOTE: See special early-return case for unigrams near the top of this function -} - -}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/adjust_counts.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/adjust_counts.hh b/ext/kenlm/lm/builder/adjust_counts.hh deleted file mode 100644 index 29319ba..0000000 --- a/ext/kenlm/lm/builder/adjust_counts.hh +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef LM_BUILDER_ADJUST_COUNTS_H -#define LM_BUILDER_ADJUST_COUNTS_H - -#include "lm/builder/discount.hh" -#include "lm/lm_exception.hh" -#include "util/exception.hh" - -#include <vector> - -#include <stdint.h> - -namespace util { namespace stream { class ChainPositions; } } - -namespace lm { -namespace builder { - -class BadDiscountException : public util::Exception { - public: - BadDiscountException() throw(); - ~BadDiscountException() throw(); -}; - -struct DiscountConfig { - // Overrides discounts for orders [1,overwrite.size()]. - std::vector<Discount> overwrite; - // If discounting fails for an order, copy them from here. - Discount fallback; - // What to do when discounts are out of range or would trigger division by - // zero. If it does something other than THROW_UP, fallback is used. - WarningAction bad_action; -}; - -/* Compute adjusted counts. - * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts. - * Output: [1,N]-grams with adjusted counts. - * [1,N)-grams are in suffix order - * N-grams are in undefined order (they're going to be sorted anyway). - */ -class AdjustCounts { - public: - // counts: output - // counts_pruned: output - // discounts: mostly output. If the input already has entries, they will be kept. - // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned.
- AdjustCounts( - const std::vector<uint64_t> &prune_thresholds, - std::vector<uint64_t> &counts, - std::vector<uint64_t> &counts_pruned, - const std::vector<bool> &prune_words, - const DiscountConfig &discount_config, - std::vector<Discount> &discounts) - : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), - prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts) - {} - - void Run(const util::stream::ChainPositions &positions); - - private: - const std::vector<uint64_t> &prune_thresholds_; - std::vector<uint64_t> &counts_; - std::vector<uint64_t> &counts_pruned_; - const std::vector<bool> &prune_words_; - - DiscountConfig discount_config_; - std::vector<Discount> &discounts_; -}; - -} // namespace builder -} // namespace lm - -#endif // LM_BUILDER_ADJUST_COUNTS_H - http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/adjust_counts_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/adjust_counts_test.cc b/ext/kenlm/lm/builder/adjust_counts_test.cc deleted file mode 100644 index fff551f..0000000 --- a/ext/kenlm/lm/builder/adjust_counts_test.cc +++ /dev/null @@ -1,112 +0,0 @@ -#include "lm/builder/adjust_counts.hh" - -#include "lm/common/ngram_stream.hh" -#include "lm/builder/payload.hh" -#include "util/scoped.hh" - -#include <boost/thread/thread.hpp> -#define BOOST_TEST_MODULE AdjustCounts -#include <boost/test/unit_test.hpp> - -namespace lm { namespace builder { namespace { - -class KeepCopy { - public: - KeepCopy() : size_(0) {} - - void Run(const util::stream::ChainPosition &position) { - for (util::stream::Link link(position); link; ++link) { - mem_.call_realloc(size_ + link->ValidSize()); - memcpy(static_cast<uint8_t*>(mem_.get()) + size_, link->Get(), link->ValidSize()); - size_ += link->ValidSize(); - } - } - - uint8_t *Get() { return static_cast<uint8_t*>(mem_.get()); } - std::size_t Size() const { return size_; } - - private: - util::scoped_malloc mem_; - std::size_t size_; -}; - -struct Gram4 { - WordIndex ids[4]; - uint64_t count; -}; - -class WriteInput { - public: - void Run(const util::stream::ChainPosition &position) { - NGramStream<BuildingPayload> input(position); - Gram4 grams[] = { - {{0,0,0,0},10}, - {{0,0,3,0},3}, - // bos - {{1,1,1,2},5}, - {{0,0,3,2},5}, - }; - for (size_t i = 0; i < sizeof(grams) / sizeof(Gram4); ++i, ++input) { - memcpy(input->begin(), grams[i].ids, sizeof(WordIndex) * 4); - input->Value().count = grams[i].count; - } - input.Poison(); - } -}; - -BOOST_AUTO_TEST_CASE(Simple) { - KeepCopy outputs[4]; - std::vector<uint64_t> counts; - std::vector<Discount> discount; - { - util::stream::ChainConfig config; - config.total_memory = 100; - config.block_count = 1; - util::stream::Chains chains(4); - for (unsigned i = 0; i < 4; ++i) { - config.entry_size = NGram<BuildingPayload>::TotalSize(i + 1); - chains.push_back(config); - } - - chains[3] >> WriteInput(); - util::stream::ChainPositions for_adjust(chains); - for (unsigned i = 0; i < 4; ++i) { - chains[i] >> boost::ref(outputs[i]); - } - chains >> util::stream::kRecycle; - std::vector<uint64_t> counts_pruned(4); - std::vector<uint64_t> prune_thresholds(4); - DiscountConfig discount_config; - discount_config.fallback = Discount(); - discount_config.bad_action = 
THROW_UP; - BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, std::vector<bool>(), discount_config, discount).Run(for_adjust), BadDiscountException); - } - BOOST_REQUIRE_EQUAL(4UL, counts.size()); - BOOST_CHECK_EQUAL(4UL, counts[0]); - // These are no longer set because the discounts are bad. -/* BOOST_CHECK_EQUAL(4UL, counts[1]); - BOOST_CHECK_EQUAL(3UL, counts[2]); - BOOST_CHECK_EQUAL(3UL, counts[3]);*/ - BOOST_REQUIRE_EQUAL(NGram<BuildingPayload>::TotalSize(1) * 4, outputs[0].Size()); - NGram<BuildingPayload> uni(outputs[0].Get(), 1); - BOOST_CHECK_EQUAL(kUNK, *uni.begin()); - BOOST_CHECK_EQUAL(0ULL, uni.Value().count); - uni.NextInMemory(); - BOOST_CHECK_EQUAL(kBOS, *uni.begin()); - BOOST_CHECK_EQUAL(0ULL, uni.Value().count); - uni.NextInMemory(); - BOOST_CHECK_EQUAL(0UL, *uni.begin()); - BOOST_CHECK_EQUAL(2ULL, uni.Value().count); - uni.NextInMemory(); - BOOST_CHECK_EQUAL(2ULL, uni.Value().count); - BOOST_CHECK_EQUAL(2UL, *uni.begin()); - - BOOST_REQUIRE_EQUAL(NGram<BuildingPayload>::TotalSize(2) * 4, outputs[1].Size()); - NGram<BuildingPayload> bi(outputs[1].Get(), 2); - BOOST_CHECK_EQUAL(0UL, *bi.begin()); - BOOST_CHECK_EQUAL(0UL, *(bi.begin() + 1)); - BOOST_CHECK_EQUAL(1ULL, bi.Value().count); - bi.NextInMemory(); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/combine_counts.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/combine_counts.hh b/ext/kenlm/lm/builder/combine_counts.hh deleted file mode 100644 index 2eda517..0000000 --- a/ext/kenlm/lm/builder/combine_counts.hh +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef LM_BUILDER_COMBINE_COUNTS_H -#define LM_BUILDER_COMBINE_COUNTS_H - -#include "lm/builder/payload.hh" -#include "lm/common/ngram.hh" -#include "lm/common/compare.hh" -#include "lm/word_index.hh" -#include "util/stream/sort.hh" - -#include <functional> -#include <string> - -namespace lm { -namespace builder { - -// Sum counts for the same n-gram. -struct CombineCounts { - bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const { - NGram<BuildingPayload> first(first_void, compare.Order()); - // There isn't a const version of NGram. 
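// The const_cast is safe: second is only read (memcmp and count), never written through.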
- NGram<BuildingPayload> second(const_cast<void*>(second_void), compare.Order()); - if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false; - first.Value().count += second.Value().count; - return true; - } -}; - -} // namespace builder -} // namespace lm - -#endif // LM_BUILDER_COMBINE_COUNTS_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/corpus_count.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/corpus_count.cc b/ext/kenlm/lm/builder/corpus_count.cc deleted file mode 100644 index 0414c22..0000000 --- a/ext/kenlm/lm/builder/corpus_count.cc +++ /dev/null @@ -1,239 +0,0 @@ -#include "lm/builder/corpus_count.hh" - -#include "lm/builder/payload.hh" -#include "lm/common/ngram.hh" -#include "lm/lm_exception.hh" -#include "lm/vocab.hh" -#include "lm/word_index.hh" -#include "util/file_stream.hh" -#include "util/file.hh" -#include "util/file_piece.hh" -#include "util/murmur_hash.hh" -#include "util/probing_hash_table.hh" -#include "util/scoped.hh" -#include "util/stream/chain.hh" -#include "util/stream/timer.hh" -#include "util/tokenize_piece.hh" - -#include <functional> - -#include <stdint.h> - -namespace lm { -namespace builder { -namespace { - -class DedupeHash : public std::unary_function<const WordIndex *, bool> { - public: - explicit DedupeHash(std::size_t order) : size_(order * sizeof(WordIndex)) {} - - std::size_t operator()(const WordIndex *start) const { - return util::MurmurHashNative(start, size_); - } - - private: - const std::size_t size_; -}; - -class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> { - public: - explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {} - - bool operator()(const WordIndex *first, const WordIndex *second) const { - return !memcmp(first, second, size_); - } - - private: - const std::size_t size_; -}; - -struct DedupeEntry { - typedef WordIndex *Key; - Key GetKey() const { return key; } - void SetKey(WordIndex *to) { key = to; } - Key key; - static DedupeEntry Construct(WordIndex *at) { - DedupeEntry ret; - ret.key = at; - return ret; - } -}; - - -// TODO: don't have this here, should be with probing hash table defaults? -const float kProbingMultiplier = 1.5; - -typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe; - -class Writer { - public: - Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size) - : block_(position), gram_(block_->Get(), order), - dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()), - dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)), - buffer_(new WordIndex[order - 1]), - block_size_(position.GetChain().BlockSize()) { - dedupe_.Clear(); - assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size); - if (order == 1) { - // Add special words. AdjustCounts is responsible if order != 1. 
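// (When order > 1 this stream carries no unigrams at all; AdjustCounts
// synthesizes the <unk> and <s> entries while producing the lower orders.)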
- AddUnigramWord(kUNK); - AddUnigramWord(kBOS); - } - } - - ~Writer() { - block_->SetValidSize(reinterpret_cast<const uint8_t*>(gram_.begin()) - static_cast<const uint8_t*>(block_->Get())); - (++block_).Poison(); - } - - // Write context with a bunch of <s> - void StartSentence() { - for (WordIndex *i = gram_.begin(); i != gram_.end() - 1; ++i) { - *i = kBOS; - } - } - - void Append(WordIndex word) { - *(gram_.end() - 1) = word; - Dedupe::MutableIterator at; - bool found = dedupe_.FindOrInsert(DedupeEntry::Construct(gram_.begin()), at); - if (found) { - // Already present. - NGram<BuildingPayload> already(at->key, gram_.Order()); - ++(already.Value().count); - // Shift left by one. - memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1)); - return; - } - // Complete the write. - gram_.Value().count = 1; - // Prepare the next n-gram. - if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) { - NGram<BuildingPayload> last(gram_); - gram_.NextInMemory(); - std::copy(last.begin() + 1, last.end(), gram_.begin()); - return; - } - // Block end. Need to store the context in a temporary buffer. - std::copy(gram_.begin() + 1, gram_.end(), buffer_.get()); - dedupe_.Clear(); - block_->SetValidSize(block_size_); - gram_.ReBase((++block_)->Get()); - std::copy(buffer_.get(), buffer_.get() + gram_.Order() - 1, gram_.begin()); - } - - private: - void AddUnigramWord(WordIndex index) { - *gram_.begin() = index; - gram_.Value().count = 0; - gram_.NextInMemory(); - if (gram_.Base() == static_cast<uint8_t*>(block_->Get()) + block_size_) { - block_->SetValidSize(block_size_); - gram_.ReBase((++block_)->Get()); - } - } - - util::stream::Link block_; - - NGram<BuildingPayload> gram_; - - // This is the memory behind the invalid value in dedupe_. - std::vector<WordIndex> dedupe_invalid_; - // Hash table combiner implementation. - Dedupe dedupe_; - - // Small buffer to hold existing ngrams when shifting across a block boundary. - boost::scoped_array<WordIndex> buffer_; - - const std::size_t block_size_; -}; - -} // namespace - -float CorpusCount::DedupeMultiplier(std::size_t order) { - return kProbingMultiplier * static_cast<float>(sizeof(DedupeEntry)) / static_cast<float>(NGram<BuildingPayload>::TotalSize(order)); -} - -std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) { - return ngram::GrowableVocab<ngram::WriteUniqueWords>::MemUsage(vocab_estimate); -} - -CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol) - : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count), - prune_words_(prune_words), prune_vocab_filename_(prune_vocab_filename), - dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)), - dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)), - disallowed_symbol_action_(disallowed_symbol) { -} - -namespace { - void ComplainDisallowed(StringPiece word, WarningAction &action) { - switch (action) { - case SILENT: - return; - case COMPLAIN: - std::cerr << "Warning: " << word << " appears in the input. All instances of <s>, </s>, and <unk> will be interpreted as whitespace." << std::endl; - action = SILENT; - return; - case THROW_UP: - UTIL_THROW(FormatLoadException, "Special word " << word << " is not allowed in the corpus. 
I plan to support models containing <unk> in the future. Pass --skip_symbols to convert these symbols to whitespace."); - } - } -} // namespace - -void CorpusCount::Run(const util::stream::ChainPosition &position) { - ngram::GrowableVocab<ngram::WriteUniqueWords> vocab(type_count_, vocab_write_); - token_count_ = 0; - type_count_ = 0; - const WordIndex end_sentence = vocab.FindOrInsert("</s>"); - Writer writer(NGram<BuildingPayload>::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_); - uint64_t count = 0; - bool delimiters[256]; - util::BoolCharacter::Build("\0\t\n\r ", delimiters); - try { - while(true) { - StringPiece line(from_.ReadLine()); - writer.StartSentence(); - for (util::TokenIter<util::BoolCharacter, true> w(line, delimiters); w; ++w) { - WordIndex word = vocab.FindOrInsert(*w); - if (word <= 2) { - ComplainDisallowed(*w, disallowed_symbol_action_); - continue; - } - writer.Append(word); - ++count; - } - writer.Append(end_sentence); - } - } catch (const util::EndOfFileException &e) {} - token_count_ = count; - type_count_ = vocab.Size(); - - // Create list of unigrams that are supposed to be pruned - if (!prune_vocab_filename_.empty()) { - try { - util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str()); - - prune_words_.resize(vocab.Size(), true); - try { - while (true) { - StringPiece word(prune_vocab_file.ReadDelimited(delimiters)); - prune_words_[vocab.Index(word)] = false; - } - } catch (const util::EndOfFileException &e) {} - - // Never prune <unk>, <s>, </s> - prune_words_[kUNK] = false; - prune_words_[kBOS] = false; - prune_words_[kEOS] = false; - - } catch (const util::Exception &e) { - std::cerr << e.what() << std::endl; - abort(); - } - } -} - -} // namespace builder -} // namespace lm http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/corpus_count.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/corpus_count.hh b/ext/kenlm/lm/builder/corpus_count.hh deleted file mode 100644 index 165505c..0000000 --- a/ext/kenlm/lm/builder/corpus_count.hh +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef LM_BUILDER_CORPUS_COUNT_H -#define LM_BUILDER_CORPUS_COUNT_H - -#include "lm/lm_exception.hh" -#include "lm/word_index.hh" -#include "util/scoped.hh" - -#include <cstddef> -#include <string> -#include <stdint.h> -#include <vector> - -namespace util { -class FilePiece; -namespace stream { -class ChainPosition; -} // namespace stream -} // namespace util - -namespace lm { -namespace builder { - -class CorpusCount { - public: - // Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size - static float DedupeMultiplier(std::size_t order); - - // How much memory vocabulary will use based on estimated size of the vocab. - static std::size_t VocabUsage(std::size_t vocab_estimate); - - // token_count: out. - // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
- CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); - - void Run(const util::stream::ChainPosition &position); - - private: - util::FilePiece &from_; - int vocab_write_; - uint64_t &token_count_; - WordIndex &type_count_; - std::vector<bool>& prune_words_; - const std::string& prune_vocab_filename_; - - std::size_t dedupe_mem_size_; - util::scoped_malloc dedupe_mem_; - - WarningAction disallowed_symbol_action_; -}; - -} // namespace builder -} // namespace lm -#endif // LM_BUILDER_CORPUS_COUNT_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/corpus_count_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/corpus_count_test.cc b/ext/kenlm/lm/builder/corpus_count_test.cc deleted file mode 100644 index 88bcf96..0000000 --- a/ext/kenlm/lm/builder/corpus_count_test.cc +++ /dev/null @@ -1,79 +0,0 @@ -#include "lm/builder/corpus_count.hh" - -#include "lm/builder/payload.hh" -#include "lm/common/ngram_stream.hh" -#include "lm/common/ngram.hh" - -#include "util/file.hh" -#include "util/file_piece.hh" -#include "util/tokenize_piece.hh" -#include "util/stream/chain.hh" -#include "util/stream/stream.hh" - -#define BOOST_TEST_MODULE CorpusCountTest -#include <boost/test/unit_test.hpp> - -namespace lm { namespace builder { namespace { - -#define Check(str, cnt) { \ - BOOST_REQUIRE(stream); \ - w = stream->begin(); \ - for (util::TokenIter<util::AnyCharacter, true> t(str, " "); t; ++t, ++w) { \ - BOOST_CHECK_EQUAL(*t, v[*w]); \ - } \ - BOOST_CHECK_EQUAL((uint64_t)cnt, stream->Value().count); \ - ++stream; \ -} - -BOOST_AUTO_TEST_CASE(Short) { - util::scoped_fd input_file(util::MakeTemp("corpus_count_test_temp")); - const char input[] = "looking on a little more loin\non a little more loin\non foo little more loin\nbar\n\n"; - // Blocks of 10 are - // looking on a little more loin </s> on a little[duplicate] more[duplicate] loin[duplicate] </s>[duplicate] on[duplicate] foo - // little more loin </s> bar </s> </s> - - util::WriteOrThrow(input_file.get(), input, sizeof(input) - 1); - util::FilePiece input_piece(input_file.release(), "temp file"); - - util::stream::ChainConfig config; - config.entry_size = NGram<BuildingPayload>::TotalSize(3); - config.total_memory = config.entry_size * 20; - config.block_count = 2; - - util::scoped_fd vocab(util::MakeTemp("corpus_count_test_vocab")); - - util::stream::Chain chain(config); - uint64_t token_count; - WordIndex type_count = 10; - std::vector<bool> prune_words; - CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT); - chain >> boost::ref(counter); - NGramStream<BuildingPayload> stream(chain.Add()); - chain >> util::stream::kRecycle; - - const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"}; - - WordIndex *w; - - Check("<s> <s> looking", 1); - Check("<s> looking on", 1); - Check("looking on a", 1); - Check("on a little", 2); - Check("a little more", 2); - Check("little more loin", 2); - Check("more loin </s>", 2); - Check("<s> <s> on", 2); - Check("<s> on a", 1); - 
Check("<s> on foo", 1); - Check("on foo little", 1); - Check("foo little more", 1); - Check("little more loin", 1); - Check("more loin </s>", 1); - Check("<s> <s> bar", 1); - Check("<s> bar </s>", 1); - Check("<s> <s> </s>", 1); - BOOST_CHECK(!stream); - BOOST_CHECK_EQUAL(sizeof(v) / sizeof(const char*), type_count); -} - -}}} // namespaces http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/debug_print.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/debug_print.hh b/ext/kenlm/lm/builder/debug_print.hh deleted file mode 100644 index 4b9f306..0000000 --- a/ext/kenlm/lm/builder/debug_print.hh +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef LM_BUILDER_DEBUG_PRINT_H -#define LM_BUILDER_DEBUG_PRINT_H - -#include "lm/builder/payload.hh" -#include "lm/common/print.hh" -#include "lm/common/ngram_stream.hh" -#include "util/file_stream.hh" -#include "util/file.hh" - -#include <boost/lexical_cast.hpp> - -namespace lm { namespace builder { -// Not defined, only specialized. -template <class T> void PrintPayload(util::FileStream &to, const BuildingPayload &payload); -template <> inline void PrintPayload<uint64_t>(util::FileStream &to, const BuildingPayload &payload) { - to << payload.count; -} -template <> inline void PrintPayload<Uninterpolated>(util::FileStream &to, const BuildingPayload &payload) { - to << log10(payload.uninterp.prob) << ' ' << log10(payload.uninterp.gamma); -} -template <> inline void PrintPayload<ProbBackoff>(util::FileStream &to, const BuildingPayload &payload) { - to << payload.complete.prob << ' ' << payload.complete.backoff; -} - -// template parameter is the type stored. 
-template <class V> class Print { - public: - static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) { - for (unsigned int i = 0; i < chains.size(); ++i) { - std::string file(file_base + boost::lexical_cast<std::string>(i)); - chains[i] >> Print(vocab, util::CreateOrThrow(file.c_str())); - } - } - - explicit Print(const VocabReconstitute &vocab, int fd) : vocab_(vocab), to_(fd) {} - - void Run(const util::stream::ChainPositions &chains) { - util::scoped_fd fd(to_); - util::FileStream out(to_); - NGramStreams<BuildingPayload> streams(chains); - for (NGramStream<BuildingPayload> *s = streams.begin(); s != streams.end(); ++s) { - DumpStream(*s, out); - } - } - - void Run(const util::stream::ChainPosition &position) { - util::scoped_fd fd(to_); - util::FileStream out(to_); - NGramStream<BuildingPayload> stream(position); - DumpStream(stream, out); - } - - private: - void DumpStream(NGramStream<BuildingPayload> &stream, util::FileStream &to) { - for (; stream; ++stream) { - PrintPayload<V>(to, stream->Value()); - for (const WordIndex *w = stream->begin(); w != stream->end(); ++w) { - to << ' ' << vocab_.Lookup(*w) << '=' << *w; - } - to << '\n'; - } - } - - const VocabReconstitute &vocab_; - int to_; -}; - -}} // namespaces - -#endif // LM_BUILDER_DEBUG_PRINT_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/discount.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/discount.hh b/ext/kenlm/lm/builder/discount.hh deleted file mode 100644 index e2f4084..0000000 --- a/ext/kenlm/lm/builder/discount.hh +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef LM_BUILDER_DISCOUNT_H -#define LM_BUILDER_DISCOUNT_H - -#include <algorithm> - -#include <stdint.h> - -namespace lm { -namespace builder { - -struct Discount { - float amount[4]; - - float Get(uint64_t count) const { - return amount[std::min<uint64_t>(count, 3)]; - } - - float Apply(uint64_t count) const { - return static_cast<float>(count) - Get(count); - } -}; - -} // namespace builder -} // namespace lm - -#endif // LM_BUILDER_DISCOUNT_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/dump_counts_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/dump_counts_main.cc b/ext/kenlm/lm/builder/dump_counts_main.cc deleted file mode 100644 index 26078d0..0000000 --- a/ext/kenlm/lm/builder/dump_counts_main.cc +++ /dev/null @@ -1,36 +0,0 @@ -#include "lm/common/print.hh" -#include "lm/word_index.hh" -#include "util/file.hh" -#include "util/read_compressed.hh" - -#include <boost/lexical_cast.hpp> - -#include <iostream> -#include <vector> - -int main(int argc, char *argv[]) { - if (argc != 4) { - std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n" - "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n" - "counts. 
Each record has order many vocabulary ids.\n" - "The vocabulary file contains the words delimited by NULL in order of id.\n" - "The vocabulary file may not be compressed because it is mmapped but the counts\n" - "file can be compressed.\n"; - return 1; - } - util::ReadCompressed counts(util::OpenReadOrThrow(argv[1])); - util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2])); - lm::VocabReconstitute vocab(vocab_file.get()); - unsigned int order = boost::lexical_cast<unsigned int>(argv[3]); - std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t)); - while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) { - UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size()); - const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin()); - for (const lm::WordIndex *i = words; i != words + order; ++i) { - UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?"); - std::cout << vocab.Lookup(*i) << ' '; - } - // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream. - std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n'; - } -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/hash_gamma.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/hash_gamma.hh b/ext/kenlm/lm/builder/hash_gamma.hh deleted file mode 100644 index 4bef47e..0000000 --- a/ext/kenlm/lm/builder/hash_gamma.hh +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LM_BUILDER_HASH_GAMMA__ -#define LM_BUILDER_HASH_GAMMA__ - -#include <stdint.h> - -namespace lm { namespace builder { - -#pragma pack(push) -#pragma pack(4) - -struct HashGamma { - uint64_t hash_value; - float gamma; -}; - -#pragma pack(pop) - -}} // namespaces -#endif // LM_BUILDER_HASH_GAMMA__ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/header_info.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/lm/builder/header_info.hh b/ext/kenlm/lm/builder/header_info.hh deleted file mode 100644 index d01d049..0000000 --- a/ext/kenlm/lm/builder/header_info.hh +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef LM_BUILDER_HEADER_INFO_H -#define LM_BUILDER_HEADER_INFO_H - -#include <string> -#include <vector> -#include <stdint.h> - -namespace lm { namespace builder { - -// Some configuration info that is used to add -// comments to the beginning of an ARPA file -struct HeaderInfo { - std::string input_file; - uint64_t token_count; - std::vector<uint64_t> counts_pruned; - - HeaderInfo() {} - - HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector<uint64_t> &counts_pruned_in) - : input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {} - - // TODO: Add smoothing type - // TODO: More info if multiple models were interpolated -}; - 
-}} // namespaces - -#endif
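
The Discount struct deleted above (lm/builder/discount.hh) clamps a count at 3, so counts of three or more all share the discount stored in amount[3]. A minimal standalone sketch of that behavior follows; the discount values are made up for illustration (in the real pipeline they come from count-of-count statistics during estimation):

#include <algorithm>
#include <cstdio>
#include <stdint.h>

// Mirrors the struct from the deleted lm/builder/discount.hh.
struct Discount {
  float amount[4];
  // Counts >= 3 are clamped to index 3, so they share one discount.
  float Get(uint64_t count) const {
    return amount[std::min<uint64_t>(count, 3)];
  }
  // The discounted count used by the estimator.
  float Apply(uint64_t count) const {
    return static_cast<float>(count) - Get(count);
  }
};

int main() {
  // Illustrative values only; amount[0] corresponds to a zero count
  // and is never applied in practice.
  Discount d = {{0.0f, 0.7f, 1.1f, 1.4f}};
  for (uint64_t c = 1; c <= 5; ++c)
    std::printf("count %llu -> discounted %.2f\n",
                (unsigned long long)c, d.Apply(c));
  return 0;
}

Counts 4 and 5 both subtract amount[3], matching the std::min clamp in Get.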

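The usage text in the deleted dump_counts_main.cc describes the raw counts record it reads: `order` 4-byte vocabulary ids followed by one 8-byte count. A short sketch that packs and then unpacks one such record the same way the tool does (the ids and count are invented, and uint32_t stands in here for lm::WordIndex):

#include <cstdio>
#include <cstring>
#include <stdint.h>
#include <vector>

typedef uint32_t WordId; // matches the 4-byte ids described in the usage text

int main() {
  const unsigned order = 3;
  // Hypothetical trigram record: ids 5, 7, 2 observed 42 times.
  WordId ids[order] = {5, 7, 2};
  uint64_t count = 42;

  // One record: order * sizeof(WordId) bytes of ids, then the 8-byte count.
  std::vector<char> record(sizeof(WordId) * order + sizeof(uint64_t));
  std::memcpy(&record[0], ids, sizeof(ids));
  std::memcpy(&record[sizeof(ids)], &count, sizeof(count));

  // Decode as dump_counts does: ids first, count at the end of the record.
  const WordId *words = reinterpret_cast<const WordId*>(&record[0]);
  uint64_t decoded;
  std::memcpy(&decoded, words + order, sizeof(decoded));
  for (unsigned i = 0; i < order; ++i) std::printf("%u ", words[i]);
  std::printf("%llu\n", (unsigned long long)decoded);
  return 0;
}

This also shows why the vocabulary file must be uncompressed while the counts file may be compressed: ids are resolved by offset into the mmapped vocabulary, whereas the counts are consumed as a sequential stream of fixed-size records.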