http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/python/kenlm.pyx ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/python/kenlm.pyx b/ext/kenlm/python/kenlm.pyx deleted file mode 100644 index f312c90..0000000 --- a/ext/kenlm/python/kenlm.pyx +++ /dev/null @@ -1,231 +0,0 @@ -import os -cimport _kenlm - -cdef bytes as_str(data): - if isinstance(data, bytes): - return data - elif isinstance(data, unicode): - return data.encode('utf8') - raise TypeError('Cannot convert %s to string' % type(data)) - -cdef class FullScoreReturn: - """ - Wrapper around FullScoreReturn. - - Notes: - `prob` has been renamed to `log_prob` - `oov` has been added to flag whether the word is OOV - """ - - cdef float log_prob - cdef int ngram_length - cdef bint oov - - def __cinit__(self, log_prob, ngram_length, oov): - self.log_prob = log_prob - self.ngram_length = ngram_length - self.oov = oov - - def __repr__(self): - return '{0}({1}, {2}, {3})'.format(self.__class__.__name__, repr(self.log_prob), repr(self.ngram_length), repr(self.oov)) - - property log_prob: - def __get__(self): - return self.log_prob - - property ngram_length: - def __get__(self): - return self.ngram_length - - property oov: - def __get__(self): - return self.oov - -cdef class State: - """ - Wrapper around lm::ngram::State so that python code can make incremental queries. - - Notes: - * rich comparisons - * hashable - """ - - cdef _kenlm.State _c_state - - def __richcmp__(State qa, State qb, int op): - r = qa._c_state.Compare(qb._c_state) - if op == 0: # < - return r < 0 - elif op == 1: # <= - return r <= 0 - elif op == 2: # == - return r == 0 - elif op == 3: # != - return r != 0 - elif op == 4: # > - return r > 0 - else: # >= - return r >= 0 - - def __hash__(self): - return _kenlm.hash_value(self._c_state) - - -cdef class LanguageModel: - """ - This is not a strict wrapper, the interface is more pythonic. - It loads models and queries full sentences. - """ - - cdef _kenlm.Model* model - cdef public bytes path - cdef _kenlm.const_Vocabulary* vocab - - def __init__(self, path): - """ - Load the language model. - - :param path: path to an arpa file or a kenlm binary file. - """ - self.path = os.path.abspath(as_str(path)) - try: - self.model = _kenlm.LoadVirtual(self.path) - except RuntimeError as exception: - exception_message = str(exception).replace('\n', ' ') - raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ - from exception - self.vocab = &self.model.BaseVocabulary() - - def __dealloc__(self): - del self.model - - property order: - def __get__(self): - return self.model.Order() - - def score(self, sentence, bos = True, eos = True): - cdef list words = as_str(sentence).split() - cdef _kenlm.State state - if bos: - self.model.BeginSentenceWrite(&state) - else: - self.model.NullContextWrite(&state) - cdef _kenlm.State out_state - cdef float total = 0 - for word in words: - total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state) - state = out_state - if eos: - total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state) - return total - - def full_scores(self, sentence, bos = True, eos = True): - """ - full_scores(sentence, bos = True, eos = Ture) -> generate full scores (prob, ngram length, oov) - @param sentence is a string (do not use boundary symbols) - @param bos should kenlm add a bos state - @param eos should kenlm add an eos state - """ - cdef list words = as_str(sentence).split() - cdef _kenlm.State state - if bos: - self.model.BeginSentenceWrite(&state) - else: - self.model.NullContextWrite(&state) - cdef _kenlm.State out_state - cdef _kenlm.FullScoreReturn ret - cdef float total = 0 - cdef _kenlm.WordIndex wid - for word in words: - wid = self.vocab.Index(word) - ret = self.model.BaseFullScore(&state, wid, &out_state) - yield (ret.prob, ret.ngram_length, wid == 0) - state = out_state - if eos: - ret = self.model.BaseFullScore(&state, - self.vocab.EndSentence(), &out_state) - yield (ret.prob, ret.ngram_length, False) - - def __contains__(self, word): - cdef bytes w = as_str(word) - return (self.vocab.Index(w) != 0) - - def __repr__(self): - return '<LanguageModel from {0}>'.format(os.path.basename(self.path)) - - def __reduce__(self): - return (_kenlm.LanguageModel, (self.path,)) - -cdef class Model: - """ - This is closer to a wrapper around lm::ngram::Model. - """ - - cdef _kenlm.Model* model - cdef public bytes path - cdef _kenlm.const_Vocabulary* vocab - - def __init__(self, path): - """ - Load the language model. - - :param path: path to an arpa file or a kenlm binary file. - """ - self.path = os.path.abspath(as_str(path)) - try: - self.model = _kenlm.LoadVirtual(self.path) - except RuntimeError as exception: - exception_message = str(exception).replace('\n', ' ') - raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\ - from exception - self.vocab = &self.model.BaseVocabulary() - - def __dealloc__(self): - del self.model - - property order: - def __get__(self): - return self.model.Order() - - def BeginSentenceWrite(self, State state): - """Change the given state to a BOS state.""" - self.model.BeginSentenceWrite(&state._c_state) - - def NullContextWrite(self, State state): - """Change the given state to a NULL state.""" - self.model.NullContextWrite(&state._c_state) - - def BaseScore(self, State in_state, str word, State out_state): - """ - Return p(word|in_state) and update the output state. - Wrapper around model.BaseScore(in_state, Index(word), out_state) - - :param word: the suffix - :param state: the context (defaults to NullContext) - :returns: p(word|state) - """ - cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state) - return total - - def BaseFullScore(self, State in_state, str word, State out_state): - """ - Wrapper around model.BaseScore(in_state, Index(word), out_state) - - :param word: the suffix - :param state: the context (defaults to NullContext) - :returns: FullScoreReturn(word|state) - """ - cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word)) - cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state) - return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0) - - def __contains__(self, word): - cdef bytes w = as_str(word) - return (self.vocab.Index(w) != 0) - - def __repr__(self): - return '<Model from {0}>'.format(os.path.basename(self.path)) - - def __reduce__(self): - return (_kenlm.LanguageModel, (self.path,)) -
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/setup.py ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/setup.py b/ext/kenlm/setup.py deleted file mode 100644 index ee061a8..0000000 --- a/ext/kenlm/setup.py +++ /dev/null @@ -1,48 +0,0 @@ -from distutils.core import setup -from distutils.extension import Extension -import glob -import platform -import os - -#Does gcc compile with this header and library? -def compile_test(header, library): - dummy_path = os.path.join(os.path.dirname(__file__), "dummy") - command = "bash -c \"g++ -include " + header + " -l" + library + " -x c++ - <<<'int main() {}' -o " + dummy_path + " >/dev/null 2>/dev/null && rm " + dummy_path + " 2>/dev/null\"" - return os.system(command) == 0 - - -FILES = glob.glob('util/*.cc') + glob.glob('lm/*.cc') + glob.glob('util/double-conversion/*.cc') -FILES = [fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc'))] - -LIBS = ['stdc++'] -if platform.system() != 'Darwin': - LIBS.append('rt') - - -ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6'] - -if compile_test('zlib.h', 'z'): - ARGS.append('-DHAVE_ZLIB') - LIBS.append('z') - -if compile_test('bzlib.h', 'bz2'): - ARGS.append('-DHAVE_BZLIB') - LIBS.append('bz2') - -if compile_test('lzma.h', 'lzma'): - ARGS.append('-DHAVE_XZLIB') - LIBS.append('lzma') - -ext_modules = [ - Extension(name='kenlm', - sources=FILES + ['python/kenlm.cpp'], - language='C++', - include_dirs=['.'], - libraries=LIBS, - extra_compile_args=ARGS) -] - -setup( - name='kenlm', - ext_modules=ext_modules -) http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/CMakeLists.txt b/ext/kenlm/util/CMakeLists.txt deleted file mode 100644 index 8a544aa..0000000 --- a/ext/kenlm/util/CMakeLists.txt +++ /dev/null @@ -1,81 +0,0 @@ -cmake_minimum_required(VERSION 2.8.8) -# -# The KenLM cmake files make use of add_library(... OBJECTS ...) -# -# This syntax allows grouping of source files when compiling -# (effectively creating "fake" libraries based on source subdirs). -# -# This syntax was only added in cmake version 2.8.8 -# -# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library - - -# This CMake file was created by Lane Schwartz <[email protected]> - - -# Explicitly list the source files for this subdirectory -# -# If you add any source files to this subdirectory -# that should be included in the kenlm library, -# (this excludes any unit test files) -# you should add them to the following list: -# -# Because we do not set PARENT_SCOPE in the following definition, -# CMake files in the parent directory won't be able to access this variable. -# -set(KENLM_UTIL_SOURCE - bit_packing.cc - ersatz_progress.cc - exception.cc - file.cc - file_piece.cc - float_to_string.cc - integer_to_string.cc - mmap.cc - murmur_hash.cc - parallel_read.cc - pool.cc - read_compressed.cc - scoped.cc - string_piece.cc - usage.cc - ) - -# This directory has children that need to be processed -add_subdirectory(double-conversion) -add_subdirectory(stream) - - -# Group these objects together for later use. -# -# Given add_library(foo OBJECT ${my_foo_sources}), -# refer to these objects as $<TARGET_OBJECTS:foo> -# -add_library(kenlm_util OBJECT ${KENLM_UTIL_DOUBLECONVERSION_SOURCE} ${KENLM_UTIL_STREAM_SOURCE} ${KENLM_UTIL_SOURCE}) - - - -# Only compile and run unit tests if tests should be run -if(BUILD_TESTING) - - # Explicitly list the Boost test files to be compiled - set(KENLM_BOOST_TESTS_LIST - bit_packing_test - joint_sort_test - multi_intersection_test - probing_hash_table_test - read_compressed_test - sorted_uniform_test - tokenize_piece_test - ) - - AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} - DEPENDS $<TARGET_OBJECTS:kenlm_util> - LIBRARIES ${Boost_LIBRARIES} pthread) - - # file_piece_test requires an extra command line parameter - KenLMAddTest(TEST file_piece_test - DEPENDS $<TARGET_OBJECTS:kenlm_util> - LIBRARIES ${Boost_LIBRARIES} pthread - TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/file_piece.cc) -endif() http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/Jamfile ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/Jamfile b/ext/kenlm/util/Jamfile deleted file mode 100644 index 2eeccf4..0000000 --- a/ext/kenlm/util/Jamfile +++ /dev/null @@ -1,41 +0,0 @@ -local compressed_flags = <include>.. <define>HAVE_ZLIB ; -local compressed_deps = /top//z ; -if [ test_library "bz2" ] && [ test_header "bzlib.h" ] { - external-lib bz2 ; - compressed_flags += <define>HAVE_BZLIB ; - compressed_deps += bz2 ; -} -if [ test_library "lzma" ] && [ test_header "lzma.h" ] { - external-lib lzma ; - compressed_flags += <define>HAVE_XZLIB ; - compressed_deps += lzma ; -} - -#rt is needed for clock_gettime on linux. But it's already included with threading=multi -lib rt ; - -obj read_compressed.o : read_compressed.cc : $(compressed_flags) ; -alias read_compressed : read_compressed.o $(compressed_deps) ; -obj read_compressed_test.o : read_compressed_test.cc /top//boost_unit_test_framework : $(compressed_flags) ; -obj file_piece_test.o : file_piece_test.cc /top//boost_unit_test_framework : $(compressed_flags) ; - -fakelib parallel_read : parallel_read.cc : <threading>multi:<source>/top//boost_thread <threading>multi:<define>WITH_THREADS : : <include>.. ; - -fakelib kenutil : [ glob *.cc : parallel_read.cc read_compressed.cc *_main.cc *_test.cc ] read_compressed parallel_read double-conversion//double-conversion : <include>.. <os>LINUX,<threading>single:<source>rt : : <include>.. ; - -exe cat_compressed : cat_compressed_main.cc kenutil ; - -#Does not install this -exe probing_hash_table_benchmark : probing_hash_table_benchmark_main.cc kenutil ; - -alias programs : cat_compressed ; - -import testing ; - -run file_piece_test.o kenutil /top//boost_unit_test_framework : : file_piece.cc ; -for local t in [ glob *_test.cc : file_piece_test.cc read_compressed_test.cc ] { - local name = [ MATCH "(.*)\.cc" : $(t) ] ; - unit-test $(name) : $(t) kenutil /top//boost_unit_test_framework /top//boost_system ; -} - -build-project stream ; http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/bit_packing.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/bit_packing.cc b/ext/kenlm/util/bit_packing.cc deleted file mode 100644 index cffd9cf..0000000 --- a/ext/kenlm/util/bit_packing.cc +++ /dev/null @@ -1,40 +0,0 @@ -#include "util/bit_packing.hh" -#include "util/exception.hh" - -#include <cstring> - -namespace util { - -namespace { -template <bool> struct StaticCheck {}; -template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; }; - -// If your float isn't 4 bytes, we're hosed. -typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize; - -} // namespace - -uint8_t RequiredBits(uint64_t max_value) { - if (!max_value) return 0; - uint8_t ret = 1; - while (max_value >>= 1) ++ret; - return ret; -} - -void BitPackingSanity() { - const FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 }; - if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000"); - char mem[57+8]; - memset(mem, 0, sizeof(mem)); - const uint64_t test57 = 0x123456789abcdefULL; - for (uint64_t b = 0; b < 57 * 8; b += 57) { - WriteInt57(mem, b, 57, test57); - } - for (uint64_t b = 0; b < 57 * 8; b += 57) { - if (test57 != ReadInt57(mem, b, 57, (1ULL << 57) - 1)) - UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler."); - } - // TODO: more checks. -} - -} // namespace util http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/bit_packing.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/bit_packing.hh b/ext/kenlm/util/bit_packing.hh deleted file mode 100644 index b24fd9c..0000000 --- a/ext/kenlm/util/bit_packing.hh +++ /dev/null @@ -1,185 +0,0 @@ -#ifndef UTIL_BIT_PACKING_H -#define UTIL_BIT_PACKING_H - -/* Bit-level packing routines - * - * WARNING WARNING WARNING: - * The write functions assume that memory is zero initially. This makes them - * faster and is the appropriate case for mmapped language model construction. - * These routines assume that unaligned access to uint64_t is fast. This is - * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on - * x86 but my target audience is large language models for which 64-bit is - * necessary. - * - * Call the BitPackingSanity function to sanity check. Calling once suffices, - * but it may be called multiple times when that's inconvenient. - * - * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at - * NICT. - */ - -#include <cassert> -#ifdef __APPLE__ -#include <architecture/byte_order.h> -#elif __linux__ -#include <endian.h> -#elif !defined(_WIN32) && !defined(_WIN64) -#include <arpa/nameser_compat.h> -#endif - -#include <stdint.h> -#include <cstring> - -namespace util { - -// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. -#if BYTE_ORDER == LITTLE_ENDIAN -inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) { - return bit; -} -#elif BYTE_ORDER == BIG_ENDIAN -inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { - return 64 - length - bit; -} -#else -#error "Bit packing code isn't written for your byte order." -#endif - -inline uint64_t ReadOff(const void *base, uint64_t bit_off) { -#if defined(__arm) || defined(__arm__) - const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3); - uint64_t value64; - memcpy(&value64, base_off, sizeof(value64)); - return value64; -#else - return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)); -#endif -} - -/* Pack integers up to 57 bits using their least significant digits. - * The length is specified using mask: - * Assumes mask == (1 << length) - 1 where length <= 57. - */ -inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, uint64_t mask) { - return (ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, length)) & mask; -} -/* Assumes value < (1 << length) and length <= 57. - * Assumes the memory is zero initially. - */ -inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) { -#if defined(__arm) || defined(__arm__) - uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3); - uint64_t value64; - memcpy(&value64, base_off, sizeof(value64)); - value64 |= (value << BitPackShift(bit_off & 7, length)); - memcpy(base_off, &value64, sizeof(value64)); -#else - *reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= - (value << BitPackShift(bit_off & 7, length)); -#endif -} - -/* Same caveats as above, but for a 25 bit limit. */ -inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) { -#if defined(__arm) || defined(__arm__) - const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3); - uint32_t value32; - memcpy(&value32, base_off, sizeof(value32)); - return (value32 >> BitPackShift(bit_off & 7, length)) & mask; -#else - return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask; -#endif -} - -inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) { -#if defined(__arm) || defined(__arm__) - uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3); - uint32_t value32; - memcpy(&value32, base_off, sizeof(value32)); - value32 |= (value << BitPackShift(bit_off & 7, length)); - memcpy(base_off, &value32, sizeof(value32)); -#else - *reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= - (value << BitPackShift(bit_off & 7, length)); -#endif -} - -typedef union { float f; uint32_t i; } FloatEnc; - -inline float ReadFloat32(const void *base, uint64_t bit_off) { - FloatEnc encoded; - encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32); - return encoded.f; -} -inline void WriteFloat32(void *base, uint64_t bit_off, float value) { - FloatEnc encoded; - encoded.f = value; - WriteInt57(base, bit_off, 32, encoded.i); -} - -const uint32_t kSignBit = 0x80000000; - -inline void SetSign(float &to) { - FloatEnc enc; - enc.f = to; - enc.i |= kSignBit; - to = enc.f; -} - -inline void UnsetSign(float &to) { - FloatEnc enc; - enc.f = to; - enc.i &= ~kSignBit; - to = enc.f; -} - -inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) { - FloatEnc encoded; - encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31); - // Sign bit set means negative. - encoded.i |= kSignBit; - return encoded.f; -} -inline void WriteNonPositiveFloat31(void *base, uint64_t bit_off, float value) { - FloatEnc encoded; - encoded.f = value; - encoded.i &= ~kSignBit; - WriteInt57(base, bit_off, 31, encoded.i); -} - -void BitPackingSanity(); - -// Return bits required to store integers upto max_value. Not the most -// efficient implementation, but this is only called a few times to size tries. -uint8_t RequiredBits(uint64_t max_value); - -struct BitsMask { - static BitsMask ByMax(uint64_t max_value) { - BitsMask ret; - ret.FromMax(max_value); - return ret; - } - static BitsMask ByBits(uint8_t bits) { - BitsMask ret; - ret.bits = bits; - ret.mask = (1ULL << bits) - 1; - return ret; - } - void FromMax(uint64_t max_value) { - bits = RequiredBits(max_value); - mask = (1ULL << bits) - 1; - } - uint8_t bits; - uint64_t mask; -}; - -struct BitAddress { - BitAddress(void *in_base, uint64_t in_offset) : base(in_base), offset(in_offset) {} - - void *base; - uint64_t offset; -}; - -} // namespace util - -#endif // UTIL_BIT_PACKING_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/bit_packing_test.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/bit_packing_test.cc b/ext/kenlm/util/bit_packing_test.cc deleted file mode 100644 index c4494b6..0000000 --- a/ext/kenlm/util/bit_packing_test.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "util/bit_packing.hh" - -#define BOOST_TEST_MODULE BitPackingTest -#include <boost/test/unit_test.hpp> - -#include <cstring> - -namespace util { -namespace { - -const uint64_t test57 = 0x123456789abcdefULL; -const uint32_t test25 = 0x1234567; - -BOOST_AUTO_TEST_CASE(ZeroBit57) { - char mem[16]; - memset(mem, 0, sizeof(mem)); - WriteInt57(mem, 0, 57, test57); - BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1)); -} - -BOOST_AUTO_TEST_CASE(EachBit57) { - char mem[16]; - for (uint8_t b = 0; b < 8; ++b) { - memset(mem, 0, sizeof(mem)); - WriteInt57(mem, b, 57, test57); - BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); - } -} - -BOOST_AUTO_TEST_CASE(Consecutive57) { - char mem[57+8]; - memset(mem, 0, sizeof(mem)); - for (uint64_t b = 0; b < 57 * 8; b += 57) { - WriteInt57(mem, b, 57, test57); - BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); - } - for (uint64_t b = 0; b < 57 * 8; b += 57) { - BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); - } -} - -BOOST_AUTO_TEST_CASE(Consecutive25) { - char mem[25+8]; - memset(mem, 0, sizeof(mem)); - for (uint64_t b = 0; b < 25 * 8; b += 25) { - WriteInt25(mem, b, 25, test25); - BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1)); - } - for (uint64_t b = 0; b < 25 * 8; b += 25) { - BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1)); - } -} - -BOOST_AUTO_TEST_CASE(Sanity) { - BitPackingSanity(); -} - -} // namespace -} // namespace util http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/cat_compressed_main.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/cat_compressed_main.cc b/ext/kenlm/util/cat_compressed_main.cc deleted file mode 100644 index 0c7cda9..0000000 --- a/ext/kenlm/util/cat_compressed_main.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Like cat but interprets compressed files. -#include "util/file.hh" -#include "util/read_compressed.hh" - -#include <cstring> -#include <iostream> - -namespace { -const std::size_t kBufSize = 16384; -void Copy(util::ReadCompressed &from, int to) { - util::scoped_malloc buffer(util::MallocOrThrow(kBufSize)); - while (std::size_t amount = from.Read(buffer.get(), kBufSize)) { - util::WriteOrThrow(to, buffer.get(), amount); - } -} -} // namespace - -int main(int argc, char *argv[]) { - // Lane Schwartz likes -h and --help - for (int i = 1; i < argc; ++i) { - char *arg = argv[i]; - if (!strcmp(arg, "--")) break; - if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) { - std::cerr << - "A cat implementation that interprets compressed files.\n" - "Usage: " << argv[0] << " [file1] [file2] ...\n" - "If no file is provided, then stdin is read.\n"; - return 1; - } - } - - try { - if (argc == 1) { - util::ReadCompressed in(0); - Copy(in, 1); - } else { - for (int i = 1; i < argc; ++i) { - util::ReadCompressed in(util::OpenReadOrThrow(argv[i])); - Copy(in, 1); - } - } - } catch (const std::exception &e) { - std::cerr << e.what() << std::endl; - return 2; - } - return 0; -} http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/CMakeLists.txt b/ext/kenlm/util/double-conversion/CMakeLists.txt deleted file mode 100644 index e2cf02a..0000000 --- a/ext/kenlm/util/double-conversion/CMakeLists.txt +++ /dev/null @@ -1,39 +0,0 @@ -cmake_minimum_required(VERSION 2.8.8) -# -# The KenLM cmake files make use of add_library(... OBJECTS ...) -# -# This syntax allows grouping of source files when compiling -# (effectively creating "fake" libraries based on source subdirs). -# -# This syntax was only added in cmake version 2.8.8 -# -# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library - - -# This CMake file was created by Lane Schwartz <[email protected]> - -# Explicitly list the source files for this subdirectory -# -# If you add any source files to this subdirectory -# that should be included in the kenlm library, -# (this excludes any unit test files) -# you should add them to the following list: -# -# In order to allow CMake files in the parent directory -# to see this variable definition, we set PARENT_SCOPE. -# -# In order to set correct paths to these files -# when this variable is referenced by CMake files in the parent directory, -# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. -# -set(KENLM_UTIL_DOUBLECONVERSION_SOURCE - ${CMAKE_CURRENT_SOURCE_DIR}/bignum-dtoa.cc - ${CMAKE_CURRENT_SOURCE_DIR}/bignum.cc - ${CMAKE_CURRENT_SOURCE_DIR}/cached-powers.cc - ${CMAKE_CURRENT_SOURCE_DIR}/diy-fp.cc - ${CMAKE_CURRENT_SOURCE_DIR}/double-conversion.cc - ${CMAKE_CURRENT_SOURCE_DIR}/fast-dtoa.cc - ${CMAKE_CURRENT_SOURCE_DIR}/fixed-dtoa.cc - ${CMAKE_CURRENT_SOURCE_DIR}/strtod.cc - PARENT_SCOPE) - http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/Jamfile ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/Jamfile b/ext/kenlm/util/double-conversion/Jamfile deleted file mode 100644 index 7a92afa..0000000 --- a/ext/kenlm/util/double-conversion/Jamfile +++ /dev/null @@ -1 +0,0 @@ -fakelib double-conversion : [ glob *.cc ] : : : <include>. ; http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/LICENSE ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/LICENSE b/ext/kenlm/util/double-conversion/LICENSE deleted file mode 100644 index 933718a..0000000 --- a/ext/kenlm/util/double-conversion/LICENSE +++ /dev/null @@ -1,26 +0,0 @@ -Copyright 2006-2011, the V8 project authors. All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - * Neither the name of Google Inc. nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/bignum-dtoa.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/bignum-dtoa.cc b/ext/kenlm/util/double-conversion/bignum-dtoa.cc deleted file mode 100644 index 3d217bf..0000000 --- a/ext/kenlm/util/double-conversion/bignum-dtoa.cc +++ /dev/null @@ -1,640 +0,0 @@ -// Copyright 2010 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include <cmath> - -#include "bignum-dtoa.h" - -#include "bignum.h" -#include "ieee.h" - -namespace double_conversion { - -static int NormalizedExponent(uint64_t significand, int exponent) { - ASSERT(significand != 0); - while ((significand & Double::kHiddenBit) == 0) { - significand = significand << 1; - exponent = exponent - 1; - } - return exponent; -} - - -// Forward declarations: -// Returns an estimation of k such that 10^(k-1) <= v < 10^k. -static int EstimatePower(int exponent); -// Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator -// and denominator. -static void InitialScaledStartValues(uint64_t significand, - int exponent, - bool lower_boundary_is_closer, - int estimated_power, - bool need_boundary_deltas, - Bignum* numerator, - Bignum* denominator, - Bignum* delta_minus, - Bignum* delta_plus); -// Multiplies numerator/denominator so that its values lies in the range 1-10. -// Returns decimal_point s.t. -// v = numerator'/denominator' * 10^(decimal_point-1) -// where numerator' and denominator' are the values of numerator and -// denominator after the call to this function. -static void FixupMultiply10(int estimated_power, bool is_even, - int* decimal_point, - Bignum* numerator, Bignum* denominator, - Bignum* delta_minus, Bignum* delta_plus); -// Generates digits from the left to the right and stops when the generated -// digits yield the shortest decimal representation of v. -static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, - Bignum* delta_minus, Bignum* delta_plus, - bool is_even, - Vector<char> buffer, int* length); -// Generates 'requested_digits' after the decimal point. -static void BignumToFixed(int requested_digits, int* decimal_point, - Bignum* numerator, Bignum* denominator, - Vector<char>(buffer), int* length); -// Generates 'count' digits of numerator/denominator. -// Once 'count' digits have been produced rounds the result depending on the -// remainder (remainders of exactly .5 round upwards). Might update the -// decimal_point when rounding up (for example for 0.9999). -static void GenerateCountedDigits(int count, int* decimal_point, - Bignum* numerator, Bignum* denominator, - Vector<char>(buffer), int* length); - - -void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits, - Vector<char> buffer, int* length, int* decimal_point) { - ASSERT(v > 0); - ASSERT(!Double(v).IsSpecial()); - uint64_t significand; - int exponent; - bool lower_boundary_is_closer; - if (mode == BIGNUM_DTOA_SHORTEST_SINGLE) { - float f = static_cast<float>(v); - ASSERT(f == v); - significand = Single(f).Significand(); - exponent = Single(f).Exponent(); - lower_boundary_is_closer = Single(f).LowerBoundaryIsCloser(); - } else { - significand = Double(v).Significand(); - exponent = Double(v).Exponent(); - lower_boundary_is_closer = Double(v).LowerBoundaryIsCloser(); - } - bool need_boundary_deltas = - (mode == BIGNUM_DTOA_SHORTEST || mode == BIGNUM_DTOA_SHORTEST_SINGLE); - - bool is_even = (significand & 1) == 0; - int normalized_exponent = NormalizedExponent(significand, exponent); - // estimated_power might be too low by 1. - int estimated_power = EstimatePower(normalized_exponent); - - // Shortcut for Fixed. - // The requested digits correspond to the digits after the point. If the - // number is much too small, then there is no need in trying to get any - // digits. - if (mode == BIGNUM_DTOA_FIXED && -estimated_power - 1 > requested_digits) { - buffer[0] = '\0'; - *length = 0; - // Set decimal-point to -requested_digits. This is what Gay does. - // Note that it should not have any effect anyways since the string is - // empty. - *decimal_point = -requested_digits; - return; - } - - Bignum numerator; - Bignum denominator; - Bignum delta_minus; - Bignum delta_plus; - // Make sure the bignum can grow large enough. The smallest double equals - // 4e-324. In this case the denominator needs fewer than 324*4 binary digits. - // The maximum double is 1.7976931348623157e308 which needs fewer than - // 308*4 binary digits. - ASSERT(Bignum::kMaxSignificantBits >= 324*4); - InitialScaledStartValues(significand, exponent, lower_boundary_is_closer, - estimated_power, need_boundary_deltas, - &numerator, &denominator, - &delta_minus, &delta_plus); - // We now have v = (numerator / denominator) * 10^estimated_power. - FixupMultiply10(estimated_power, is_even, decimal_point, - &numerator, &denominator, - &delta_minus, &delta_plus); - // We now have v = (numerator / denominator) * 10^(decimal_point-1), and - // 1 <= (numerator + delta_plus) / denominator < 10 - switch (mode) { - case BIGNUM_DTOA_SHORTEST: - case BIGNUM_DTOA_SHORTEST_SINGLE: - GenerateShortestDigits(&numerator, &denominator, - &delta_minus, &delta_plus, - is_even, buffer, length); - break; - case BIGNUM_DTOA_FIXED: - BignumToFixed(requested_digits, decimal_point, - &numerator, &denominator, - buffer, length); - break; - case BIGNUM_DTOA_PRECISION: - GenerateCountedDigits(requested_digits, decimal_point, - &numerator, &denominator, - buffer, length); - break; - default: - UNREACHABLE(); - } - buffer[*length] = '\0'; -} - - -// The procedure starts generating digits from the left to the right and stops -// when the generated digits yield the shortest decimal representation of v. A -// decimal representation of v is a number lying closer to v than to any other -// double, so it converts to v when read. -// -// This is true if d, the decimal representation, is between m- and m+, the -// upper and lower boundaries. d must be strictly between them if !is_even. -// m- := (numerator - delta_minus) / denominator -// m+ := (numerator + delta_plus) / denominator -// -// Precondition: 0 <= (numerator+delta_plus) / denominator < 10. -// If 1 <= (numerator+delta_plus) / denominator < 10 then no leading 0 digit -// will be produced. This should be the standard precondition. -static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, - Bignum* delta_minus, Bignum* delta_plus, - bool is_even, - Vector<char> buffer, int* length) { - // Small optimization: if delta_minus and delta_plus are the same just reuse - // one of the two bignums. - if (Bignum::Equal(*delta_minus, *delta_plus)) { - delta_plus = delta_minus; - } - *length = 0; - while (true) { - uint16_t digit; - digit = numerator->DivideModuloIntBignum(*denominator); - ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. - // digit = numerator / denominator (integer division). - // numerator = numerator % denominator. - buffer[(*length)++] = digit + '0'; - - // Can we stop already? - // If the remainder of the division is less than the distance to the lower - // boundary we can stop. In this case we simply round down (discarding the - // remainder). - // Similarly we test if we can round up (using the upper boundary). - bool in_delta_room_minus; - bool in_delta_room_plus; - if (is_even) { - in_delta_room_minus = Bignum::LessEqual(*numerator, *delta_minus); - } else { - in_delta_room_minus = Bignum::Less(*numerator, *delta_minus); - } - if (is_even) { - in_delta_room_plus = - Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0; - } else { - in_delta_room_plus = - Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0; - } - if (!in_delta_room_minus && !in_delta_room_plus) { - // Prepare for next iteration. - numerator->Times10(); - delta_minus->Times10(); - // We optimized delta_plus to be equal to delta_minus (if they share the - // same value). So don't multiply delta_plus if they point to the same - // object. - if (delta_minus != delta_plus) { - delta_plus->Times10(); - } - } else if (in_delta_room_minus && in_delta_room_plus) { - // Let's see if 2*numerator < denominator. - // If yes, then the next digit would be < 5 and we can round down. - int compare = Bignum::PlusCompare(*numerator, *numerator, *denominator); - if (compare < 0) { - // Remaining digits are less than .5. -> Round down (== do nothing). - } else if (compare > 0) { - // Remaining digits are more than .5 of denominator. -> Round up. - // Note that the last digit could not be a '9' as otherwise the whole - // loop would have stopped earlier. - // We still have an assert here in case the preconditions were not - // satisfied. - ASSERT(buffer[(*length) - 1] != '9'); - buffer[(*length) - 1]++; - } else { - // Halfway case. - // TODO(floitsch): need a way to solve half-way cases. - // For now let's round towards even (since this is what Gay seems to - // do). - - if ((buffer[(*length) - 1] - '0') % 2 == 0) { - // Round down => Do nothing. - } else { - ASSERT(buffer[(*length) - 1] != '9'); - buffer[(*length) - 1]++; - } - } - return; - } else if (in_delta_room_minus) { - // Round down (== do nothing). - return; - } else { // in_delta_room_plus - // Round up. - // Note again that the last digit could not be '9' since this would have - // stopped the loop earlier. - // We still have an ASSERT here, in case the preconditions were not - // satisfied. - ASSERT(buffer[(*length) -1] != '9'); - buffer[(*length) - 1]++; - return; - } - } -} - - -// Let v = numerator / denominator < 10. -// Then we generate 'count' digits of d = x.xxxxx... (without the decimal point) -// from left to right. Once 'count' digits have been produced we decide wether -// to round up or down. Remainders of exactly .5 round upwards. Numbers such -// as 9.999999 propagate a carry all the way, and change the -// exponent (decimal_point), when rounding upwards. -static void GenerateCountedDigits(int count, int* decimal_point, - Bignum* numerator, Bignum* denominator, - Vector<char>(buffer), int* length) { - ASSERT(count >= 0); - for (int i = 0; i < count - 1; ++i) { - uint16_t digit; - digit = numerator->DivideModuloIntBignum(*denominator); - ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. - // digit = numerator / denominator (integer division). - // numerator = numerator % denominator. - buffer[i] = digit + '0'; - // Prepare for next iteration. - numerator->Times10(); - } - // Generate the last digit. - uint16_t digit; - digit = numerator->DivideModuloIntBignum(*denominator); - if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { - digit++; - } - buffer[count - 1] = digit + '0'; - // Correct bad digits (in case we had a sequence of '9's). Propagate the - // carry until we hat a non-'9' or til we reach the first digit. - for (int i = count - 1; i > 0; --i) { - if (buffer[i] != '0' + 10) break; - buffer[i] = '0'; - buffer[i - 1]++; - } - if (buffer[0] == '0' + 10) { - // Propagate a carry past the top place. - buffer[0] = '1'; - (*decimal_point)++; - } - *length = count; -} - - -// Generates 'requested_digits' after the decimal point. It might omit -// trailing '0's. If the input number is too small then no digits at all are -// generated (ex.: 2 fixed digits for 0.00001). -// -// Input verifies: 1 <= (numerator + delta) / denominator < 10. -static void BignumToFixed(int requested_digits, int* decimal_point, - Bignum* numerator, Bignum* denominator, - Vector<char>(buffer), int* length) { - // Note that we have to look at more than just the requested_digits, since - // a number could be rounded up. Example: v=0.5 with requested_digits=0. - // Even though the power of v equals 0 we can't just stop here. - if (-(*decimal_point) > requested_digits) { - // The number is definitively too small. - // Ex: 0.001 with requested_digits == 1. - // Set decimal-point to -requested_digits. This is what Gay does. - // Note that it should not have any effect anyways since the string is - // empty. - *decimal_point = -requested_digits; - *length = 0; - return; - } else if (-(*decimal_point) == requested_digits) { - // We only need to verify if the number rounds down or up. - // Ex: 0.04 and 0.06 with requested_digits == 1. - ASSERT(*decimal_point == -requested_digits); - // Initially the fraction lies in range (1, 10]. Multiply the denominator - // by 10 so that we can compare more easily. - denominator->Times10(); - if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { - // If the fraction is >= 0.5 then we have to include the rounded - // digit. - buffer[0] = '1'; - *length = 1; - (*decimal_point)++; - } else { - // Note that we caught most of similar cases earlier. - *length = 0; - } - return; - } else { - // The requested digits correspond to the digits after the point. - // The variable 'needed_digits' includes the digits before the point. - int needed_digits = (*decimal_point) + requested_digits; - GenerateCountedDigits(needed_digits, decimal_point, - numerator, denominator, - buffer, length); - } -} - - -// Returns an estimation of k such that 10^(k-1) <= v < 10^k where -// v = f * 2^exponent and 2^52 <= f < 2^53. -// v is hence a normalized double with the given exponent. The output is an -// approximation for the exponent of the decimal approimation .digits * 10^k. -// -// The result might undershoot by 1 in which case 10^k <= v < 10^k+1. -// Note: this property holds for v's upper boundary m+ too. -// 10^k <= m+ < 10^k+1. -// (see explanation below). -// -// Examples: -// EstimatePower(0) => 16 -// EstimatePower(-52) => 0 -// -// Note: e >= 0 => EstimatedPower(e) > 0. No similar claim can be made for e<0. -static int EstimatePower(int exponent) { - // This function estimates log10 of v where v = f*2^e (with e == exponent). - // Note that 10^floor(log10(v)) <= v, but v <= 10^ceil(log10(v)). - // Note that f is bounded by its container size. Let p = 53 (the double's - // significand size). Then 2^(p-1) <= f < 2^p. - // - // Given that log10(v) == log2(v)/log2(10) and e+(len(f)-1) is quite close - // to log2(v) the function is simplified to (e+(len(f)-1)/log2(10)). - // The computed number undershoots by less than 0.631 (when we compute log3 - // and not log10). - // - // Optimization: since we only need an approximated result this computation - // can be performed on 64 bit integers. On x86/x64 architecture the speedup is - // not really measurable, though. - // - // Since we want to avoid overshooting we decrement by 1e10 so that - // floating-point imprecisions don't affect us. - // - // Explanation for v's boundary m+: the computation takes advantage of - // the fact that 2^(p-1) <= f < 2^p. Boundaries still satisfy this requirement - // (even for denormals where the delta can be much more important). - - const double k1Log10 = 0.30102999566398114; // 1/lg(10) - - // For doubles len(f) == 53 (don't forget the hidden bit). - const int kSignificandSize = Double::kSignificandSize; - double estimate = ceil((exponent + kSignificandSize - 1) * k1Log10 - 1e-10); - return static_cast<int>(estimate); -} - - -// See comments for InitialScaledStartValues. -static void InitialScaledStartValuesPositiveExponent( - uint64_t significand, int exponent, - int estimated_power, bool need_boundary_deltas, - Bignum* numerator, Bignum* denominator, - Bignum* delta_minus, Bignum* delta_plus) { - // A positive exponent implies a positive power. - ASSERT(estimated_power >= 0); - // Since the estimated_power is positive we simply multiply the denominator - // by 10^estimated_power. - - // numerator = v. - numerator->AssignUInt64(significand); - numerator->ShiftLeft(exponent); - // denominator = 10^estimated_power. - denominator->AssignPowerUInt16(10, estimated_power); - - if (need_boundary_deltas) { - // Introduce a common denominator so that the deltas to the boundaries are - // integers. - denominator->ShiftLeft(1); - numerator->ShiftLeft(1); - // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common - // denominator (of 2) delta_plus equals 2^e. - delta_plus->AssignUInt16(1); - delta_plus->ShiftLeft(exponent); - // Same for delta_minus. The adjustments if f == 2^p-1 are done later. - delta_minus->AssignUInt16(1); - delta_minus->ShiftLeft(exponent); - } -} - - -// See comments for InitialScaledStartValues -static void InitialScaledStartValuesNegativeExponentPositivePower( - uint64_t significand, int exponent, - int estimated_power, bool need_boundary_deltas, - Bignum* numerator, Bignum* denominator, - Bignum* delta_minus, Bignum* delta_plus) { - // v = f * 2^e with e < 0, and with estimated_power >= 0. - // This means that e is close to 0 (have a look at how estimated_power is - // computed). - - // numerator = significand - // since v = significand * 2^exponent this is equivalent to - // numerator = v * / 2^-exponent - numerator->AssignUInt64(significand); - // denominator = 10^estimated_power * 2^-exponent (with exponent < 0) - denominator->AssignPowerUInt16(10, estimated_power); - denominator->ShiftLeft(-exponent); - - if (need_boundary_deltas) { - // Introduce a common denominator so that the deltas to the boundaries are - // integers. - denominator->ShiftLeft(1); - numerator->ShiftLeft(1); - // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common - // denominator (of 2) delta_plus equals 2^e. - // Given that the denominator already includes v's exponent the distance - // to the boundaries is simply 1. - delta_plus->AssignUInt16(1); - // Same for delta_minus. The adjustments if f == 2^p-1 are done later. - delta_minus->AssignUInt16(1); - } -} - - -// See comments for InitialScaledStartValues -static void InitialScaledStartValuesNegativeExponentNegativePower( - uint64_t significand, int exponent, - int estimated_power, bool need_boundary_deltas, - Bignum* numerator, Bignum* denominator, - Bignum* delta_minus, Bignum* delta_plus) { - // Instead of multiplying the denominator with 10^estimated_power we - // multiply all values (numerator and deltas) by 10^-estimated_power. - - // Use numerator as temporary container for power_ten. - Bignum* power_ten = numerator; - power_ten->AssignPowerUInt16(10, -estimated_power); - - if (need_boundary_deltas) { - // Since power_ten == numerator we must make a copy of 10^estimated_power - // before we complete the computation of the numerator. - // delta_plus = delta_minus = 10^estimated_power - delta_plus->AssignBignum(*power_ten); - delta_minus->AssignBignum(*power_ten); - } - - // numerator = significand * 2 * 10^-estimated_power - // since v = significand * 2^exponent this is equivalent to - // numerator = v * 10^-estimated_power * 2 * 2^-exponent. - // Remember: numerator has been abused as power_ten. So no need to assign it - // to itself. - ASSERT(numerator == power_ten); - numerator->MultiplyByUInt64(significand); - - // denominator = 2 * 2^-exponent with exponent < 0. - denominator->AssignUInt16(1); - denominator->ShiftLeft(-exponent); - - if (need_boundary_deltas) { - // Introduce a common denominator so that the deltas to the boundaries are - // integers. - numerator->ShiftLeft(1); - denominator->ShiftLeft(1); - // With this shift the boundaries have their correct value, since - // delta_plus = 10^-estimated_power, and - // delta_minus = 10^-estimated_power. - // These assignments have been done earlier. - // The adjustments if f == 2^p-1 (lower boundary is closer) are done later. - } -} - - -// Let v = significand * 2^exponent. -// Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator -// and denominator. The functions GenerateShortestDigits and -// GenerateCountedDigits will then convert this ratio to its decimal -// representation d, with the required accuracy. -// Then d * 10^estimated_power is the representation of v. -// (Note: the fraction and the estimated_power might get adjusted before -// generating the decimal representation.) -// -// The initial start values consist of: -// - a scaled numerator: s.t. numerator/denominator == v / 10^estimated_power. -// - a scaled (common) denominator. -// optionally (used by GenerateShortestDigits to decide if it has the shortest -// decimal converting back to v): -// - v - m-: the distance to the lower boundary. -// - m+ - v: the distance to the upper boundary. -// -// v, m+, m-, and therefore v - m- and m+ - v all share the same denominator. -// -// Let ep == estimated_power, then the returned values will satisfy: -// v / 10^ep = numerator / denominator. -// v's boundarys m- and m+: -// m- / 10^ep == v / 10^ep - delta_minus / denominator -// m+ / 10^ep == v / 10^ep + delta_plus / denominator -// Or in other words: -// m- == v - delta_minus * 10^ep / denominator; -// m+ == v + delta_plus * 10^ep / denominator; -// -// Since 10^(k-1) <= v < 10^k (with k == estimated_power) -// or 10^k <= v < 10^(k+1) -// we then have 0.1 <= numerator/denominator < 1 -// or 1 <= numerator/denominator < 10 -// -// It is then easy to kickstart the digit-generation routine. -// -// The boundary-deltas are only filled if the mode equals BIGNUM_DTOA_SHORTEST -// or BIGNUM_DTOA_SHORTEST_SINGLE. - -static void InitialScaledStartValues(uint64_t significand, - int exponent, - bool lower_boundary_is_closer, - int estimated_power, - bool need_boundary_deltas, - Bignum* numerator, - Bignum* denominator, - Bignum* delta_minus, - Bignum* delta_plus) { - if (exponent >= 0) { - InitialScaledStartValuesPositiveExponent( - significand, exponent, estimated_power, need_boundary_deltas, - numerator, denominator, delta_minus, delta_plus); - } else if (estimated_power >= 0) { - InitialScaledStartValuesNegativeExponentPositivePower( - significand, exponent, estimated_power, need_boundary_deltas, - numerator, denominator, delta_minus, delta_plus); - } else { - InitialScaledStartValuesNegativeExponentNegativePower( - significand, exponent, estimated_power, need_boundary_deltas, - numerator, denominator, delta_minus, delta_plus); - } - - if (need_boundary_deltas && lower_boundary_is_closer) { - // The lower boundary is closer at half the distance of "normal" numbers. - // Increase the common denominator and adapt all but the delta_minus. - denominator->ShiftLeft(1); // *2 - numerator->ShiftLeft(1); // *2 - delta_plus->ShiftLeft(1); // *2 - } -} - - -// This routine multiplies numerator/denominator so that its values lies in the -// range 1-10. That is after a call to this function we have: -// 1 <= (numerator + delta_plus) /denominator < 10. -// Let numerator the input before modification and numerator' the argument -// after modification, then the output-parameter decimal_point is such that -// numerator / denominator * 10^estimated_power == -// numerator' / denominator' * 10^(decimal_point - 1) -// In some cases estimated_power was too low, and this is already the case. We -// then simply adjust the power so that 10^(k-1) <= v < 10^k (with k == -// estimated_power) but do not touch the numerator or denominator. -// Otherwise the routine multiplies the numerator and the deltas by 10. -static void FixupMultiply10(int estimated_power, bool is_even, - int* decimal_point, - Bignum* numerator, Bignum* denominator, - Bignum* delta_minus, Bignum* delta_plus) { - bool in_range; - if (is_even) { - // For IEEE doubles half-way cases (in decimal system numbers ending with 5) - // are rounded to the closest floating-point number with even significand. - in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0; - } else { - in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0; - } - if (in_range) { - // Since numerator + delta_plus >= denominator we already have - // 1 <= numerator/denominator < 10. Simply update the estimated_power. - *decimal_point = estimated_power + 1; - } else { - *decimal_point = estimated_power; - numerator->Times10(); - if (Bignum::Equal(*delta_minus, *delta_plus)) { - delta_minus->Times10(); - delta_plus->AssignBignum(*delta_minus); - } else { - delta_minus->Times10(); - delta_plus->Times10(); - } - } -} - -} // namespace double_conversion http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/bignum-dtoa.h ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/bignum-dtoa.h b/ext/kenlm/util/double-conversion/bignum-dtoa.h deleted file mode 100644 index 34b9619..0000000 --- a/ext/kenlm/util/double-conversion/bignum-dtoa.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2010 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef DOUBLE_CONVERSION_BIGNUM_DTOA_H_ -#define DOUBLE_CONVERSION_BIGNUM_DTOA_H_ - -#include "utils.h" - -namespace double_conversion { - -enum BignumDtoaMode { - // Return the shortest correct representation. - // For example the output of 0.299999999999999988897 is (the less accurate but - // correct) 0.3. - BIGNUM_DTOA_SHORTEST, - // Same as BIGNUM_DTOA_SHORTEST but for single-precision floats. - BIGNUM_DTOA_SHORTEST_SINGLE, - // Return a fixed number of digits after the decimal point. - // For instance fixed(0.1, 4) becomes 0.1000 - // If the input number is big, the output will be big. - BIGNUM_DTOA_FIXED, - // Return a fixed number of digits, no matter what the exponent is. - BIGNUM_DTOA_PRECISION -}; - -// Converts the given double 'v' to ascii. -// The result should be interpreted as buffer * 10^(point-length). -// The buffer will be null-terminated. -// -// The input v must be > 0 and different from NaN, and Infinity. -// -// The output depends on the given mode: -// - SHORTEST: produce the least amount of digits for which the internal -// identity requirement is still satisfied. If the digits are printed -// (together with the correct exponent) then reading this number will give -// 'v' again. The buffer will choose the representation that is closest to -// 'v'. If there are two at the same distance, than the number is round up. -// In this mode the 'requested_digits' parameter is ignored. -// - FIXED: produces digits necessary to print a given number with -// 'requested_digits' digits after the decimal point. The produced digits -// might be too short in which case the caller has to fill the gaps with '0's. -// Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2. -// Halfway cases are rounded up. The call toFixed(0.15, 2) thus returns -// buffer="2", point=0. -// Note: the length of the returned buffer has no meaning wrt the significance -// of its digits. That is, just because it contains '0's does not mean that -// any other digit would not satisfy the internal identity requirement. -// - PRECISION: produces 'requested_digits' where the first digit is not '0'. -// Even though the length of produced digits usually equals -// 'requested_digits', the function is allowed to return fewer digits, in -// which case the caller has to fill the missing digits with '0's. -// Halfway cases are again rounded up. -// 'BignumDtoa' expects the given buffer to be big enough to hold all digits -// and a terminating null-character. -void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits, - Vector<char> buffer, int* length, int* point); - -} // namespace double_conversion - -#endif // DOUBLE_CONVERSION_BIGNUM_DTOA_H_ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/bignum.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/bignum.cc b/ext/kenlm/util/double-conversion/bignum.cc deleted file mode 100644 index 747491a..0000000 --- a/ext/kenlm/util/double-conversion/bignum.cc +++ /dev/null @@ -1,764 +0,0 @@ -// Copyright 2010 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include "bignum.h" -#include "utils.h" - -namespace double_conversion { - -Bignum::Bignum() - : bigits_(bigits_buffer_, kBigitCapacity), used_digits_(0), exponent_(0) { - for (int i = 0; i < kBigitCapacity; ++i) { - bigits_[i] = 0; - } -} - - -template<typename S> -static int BitSize(S value) { - return 8 * sizeof(value); -} - -// Guaranteed to lie in one Bigit. -void Bignum::AssignUInt16(uint16_t value) { - ASSERT(kBigitSize >= BitSize(value)); - Zero(); - if (value == 0) return; - - EnsureCapacity(1); - bigits_[0] = value; - used_digits_ = 1; -} - - -void Bignum::AssignUInt64(uint64_t value) { - const int kUInt64Size = 64; - - Zero(); - if (value == 0) return; - - int needed_bigits = kUInt64Size / kBigitSize + 1; - EnsureCapacity(needed_bigits); - for (int i = 0; i < needed_bigits; ++i) { - bigits_[i] = value & kBigitMask; - value = value >> kBigitSize; - } - used_digits_ = needed_bigits; - Clamp(); -} - - -void Bignum::AssignBignum(const Bignum& other) { - exponent_ = other.exponent_; - for (int i = 0; i < other.used_digits_; ++i) { - bigits_[i] = other.bigits_[i]; - } - // Clear the excess digits (if there were any). - for (int i = other.used_digits_; i < used_digits_; ++i) { - bigits_[i] = 0; - } - used_digits_ = other.used_digits_; -} - - -static uint64_t ReadUInt64(Vector<const char> buffer, - int from, - int digits_to_read) { - uint64_t result = 0; - for (int i = from; i < from + digits_to_read; ++i) { - int digit = buffer[i] - '0'; - ASSERT(0 <= digit && digit <= 9); - result = result * 10 + digit; - } - return result; -} - - -void Bignum::AssignDecimalString(Vector<const char> value) { - // 2^64 = 18446744073709551616 > 10^19 - const int kMaxUint64DecimalDigits = 19; - Zero(); - int length = value.length(); - int pos = 0; - // Let's just say that each digit needs 4 bits. - while (length >= kMaxUint64DecimalDigits) { - uint64_t digits = ReadUInt64(value, pos, kMaxUint64DecimalDigits); - pos += kMaxUint64DecimalDigits; - length -= kMaxUint64DecimalDigits; - MultiplyByPowerOfTen(kMaxUint64DecimalDigits); - AddUInt64(digits); - } - uint64_t digits = ReadUInt64(value, pos, length); - MultiplyByPowerOfTen(length); - AddUInt64(digits); - Clamp(); -} - - -static int HexCharValue(char c) { - if ('0' <= c && c <= '9') return c - '0'; - if ('a' <= c && c <= 'f') return 10 + c - 'a'; - if ('A' <= c && c <= 'F') return 10 + c - 'A'; - UNREACHABLE(); - return 0; // To make compiler happy. -} - - -void Bignum::AssignHexString(Vector<const char> value) { - Zero(); - int length = value.length(); - - int needed_bigits = length * 4 / kBigitSize + 1; - EnsureCapacity(needed_bigits); - int string_index = length - 1; - for (int i = 0; i < needed_bigits - 1; ++i) { - // These bigits are guaranteed to be "full". - Chunk current_bigit = 0; - for (int j = 0; j < kBigitSize / 4; j++) { - current_bigit += HexCharValue(value[string_index--]) << (j * 4); - } - bigits_[i] = current_bigit; - } - used_digits_ = needed_bigits - 1; - - Chunk most_significant_bigit = 0; // Could be = 0; - for (int j = 0; j <= string_index; ++j) { - most_significant_bigit <<= 4; - most_significant_bigit += HexCharValue(value[j]); - } - if (most_significant_bigit != 0) { - bigits_[used_digits_] = most_significant_bigit; - used_digits_++; - } - Clamp(); -} - - -void Bignum::AddUInt64(uint64_t operand) { - if (operand == 0) return; - Bignum other; - other.AssignUInt64(operand); - AddBignum(other); -} - - -void Bignum::AddBignum(const Bignum& other) { - ASSERT(IsClamped()); - ASSERT(other.IsClamped()); - - // If this has a greater exponent than other append zero-bigits to this. - // After this call exponent_ <= other.exponent_. - Align(other); - - // There are two possibilities: - // aaaaaaaaaaa 0000 (where the 0s represent a's exponent) - // bbbbb 00000000 - // ---------------- - // ccccccccccc 0000 - // or - // aaaaaaaaaa 0000 - // bbbbbbbbb 0000000 - // ----------------- - // cccccccccccc 0000 - // In both cases we might need a carry bigit. - - EnsureCapacity(1 + Max(BigitLength(), other.BigitLength()) - exponent_); - Chunk carry = 0; - int bigit_pos = other.exponent_ - exponent_; - ASSERT(bigit_pos >= 0); - for (int i = 0; i < other.used_digits_; ++i) { - Chunk sum = bigits_[bigit_pos] + other.bigits_[i] + carry; - bigits_[bigit_pos] = sum & kBigitMask; - carry = sum >> kBigitSize; - bigit_pos++; - } - - while (carry != 0) { - Chunk sum = bigits_[bigit_pos] + carry; - bigits_[bigit_pos] = sum & kBigitMask; - carry = sum >> kBigitSize; - bigit_pos++; - } - used_digits_ = Max(bigit_pos, used_digits_); - ASSERT(IsClamped()); -} - - -void Bignum::SubtractBignum(const Bignum& other) { - ASSERT(IsClamped()); - ASSERT(other.IsClamped()); - // We require this to be bigger than other. - ASSERT(LessEqual(other, *this)); - - Align(other); - - int offset = other.exponent_ - exponent_; - Chunk borrow = 0; - int i; - for (i = 0; i < other.used_digits_; ++i) { - ASSERT((borrow == 0) || (borrow == 1)); - Chunk difference = bigits_[i + offset] - other.bigits_[i] - borrow; - bigits_[i + offset] = difference & kBigitMask; - borrow = difference >> (kChunkSize - 1); - } - while (borrow != 0) { - Chunk difference = bigits_[i + offset] - borrow; - bigits_[i + offset] = difference & kBigitMask; - borrow = difference >> (kChunkSize - 1); - ++i; - } - Clamp(); -} - - -void Bignum::ShiftLeft(int shift_amount) { - if (used_digits_ == 0) return; - exponent_ += shift_amount / kBigitSize; - int local_shift = shift_amount % kBigitSize; - EnsureCapacity(used_digits_ + 1); - BigitsShiftLeft(local_shift); -} - - -void Bignum::MultiplyByUInt32(uint32_t factor) { - if (factor == 1) return; - if (factor == 0) { - Zero(); - return; - } - if (used_digits_ == 0) return; - - // The product of a bigit with the factor is of size kBigitSize + 32. - // Assert that this number + 1 (for the carry) fits into double chunk. - ASSERT(kDoubleChunkSize >= kBigitSize + 32 + 1); - DoubleChunk carry = 0; - for (int i = 0; i < used_digits_; ++i) { - DoubleChunk product = static_cast<DoubleChunk>(factor) * bigits_[i] + carry; - bigits_[i] = static_cast<Chunk>(product & kBigitMask); - carry = (product >> kBigitSize); - } - while (carry != 0) { - EnsureCapacity(used_digits_ + 1); - bigits_[used_digits_] = carry & kBigitMask; - used_digits_++; - carry >>= kBigitSize; - } -} - - -void Bignum::MultiplyByUInt64(uint64_t factor) { - if (factor == 1) return; - if (factor == 0) { - Zero(); - return; - } - ASSERT(kBigitSize < 32); - uint64_t carry = 0; - uint64_t low = factor & 0xFFFFFFFF; - uint64_t high = factor >> 32; - for (int i = 0; i < used_digits_; ++i) { - uint64_t product_low = low * bigits_[i]; - uint64_t product_high = high * bigits_[i]; - uint64_t tmp = (carry & kBigitMask) + product_low; - bigits_[i] = tmp & kBigitMask; - carry = (carry >> kBigitSize) + (tmp >> kBigitSize) + - (product_high << (32 - kBigitSize)); - } - while (carry != 0) { - EnsureCapacity(used_digits_ + 1); - bigits_[used_digits_] = carry & kBigitMask; - used_digits_++; - carry >>= kBigitSize; - } -} - - -void Bignum::MultiplyByPowerOfTen(int exponent) { - const uint64_t kFive27 = UINT64_2PART_C(0x6765c793, fa10079d); - const uint16_t kFive1 = 5; - const uint16_t kFive2 = kFive1 * 5; - const uint16_t kFive3 = kFive2 * 5; - const uint16_t kFive4 = kFive3 * 5; - const uint16_t kFive5 = kFive4 * 5; - const uint16_t kFive6 = kFive5 * 5; - const uint32_t kFive7 = kFive6 * 5; - const uint32_t kFive8 = kFive7 * 5; - const uint32_t kFive9 = kFive8 * 5; - const uint32_t kFive10 = kFive9 * 5; - const uint32_t kFive11 = kFive10 * 5; - const uint32_t kFive12 = kFive11 * 5; - const uint32_t kFive13 = kFive12 * 5; - const uint32_t kFive1_to_12[] = - { kFive1, kFive2, kFive3, kFive4, kFive5, kFive6, - kFive7, kFive8, kFive9, kFive10, kFive11, kFive12 }; - - ASSERT(exponent >= 0); - if (exponent == 0) return; - if (used_digits_ == 0) return; - - // We shift by exponent at the end just before returning. - int remaining_exponent = exponent; - while (remaining_exponent >= 27) { - MultiplyByUInt64(kFive27); - remaining_exponent -= 27; - } - while (remaining_exponent >= 13) { - MultiplyByUInt32(kFive13); - remaining_exponent -= 13; - } - if (remaining_exponent > 0) { - MultiplyByUInt32(kFive1_to_12[remaining_exponent - 1]); - } - ShiftLeft(exponent); -} - - -void Bignum::Square() { - ASSERT(IsClamped()); - int product_length = 2 * used_digits_; - EnsureCapacity(product_length); - - // Comba multiplication: compute each column separately. - // Example: r = a2a1a0 * b2b1b0. - // r = 1 * a0b0 + - // 10 * (a1b0 + a0b1) + - // 100 * (a2b0 + a1b1 + a0b2) + - // 1000 * (a2b1 + a1b2) + - // 10000 * a2b2 - // - // In the worst case we have to accumulate nb-digits products of digit*digit. - // - // Assert that the additional number of bits in a DoubleChunk are enough to - // sum up used_digits of Bigit*Bigit. - if ((1 << (2 * (kChunkSize - kBigitSize))) <= used_digits_) { - UNIMPLEMENTED(); - } - DoubleChunk accumulator = 0; - // First shift the digits so we don't overwrite them. - int copy_offset = used_digits_; - for (int i = 0; i < used_digits_; ++i) { - bigits_[copy_offset + i] = bigits_[i]; - } - // We have two loops to avoid some 'if's in the loop. - for (int i = 0; i < used_digits_; ++i) { - // Process temporary digit i with power i. - // The sum of the two indices must be equal to i. - int bigit_index1 = i; - int bigit_index2 = 0; - // Sum all of the sub-products. - while (bigit_index1 >= 0) { - Chunk chunk1 = bigits_[copy_offset + bigit_index1]; - Chunk chunk2 = bigits_[copy_offset + bigit_index2]; - accumulator += static_cast<DoubleChunk>(chunk1) * chunk2; - bigit_index1--; - bigit_index2++; - } - bigits_[i] = static_cast<Chunk>(accumulator) & kBigitMask; - accumulator >>= kBigitSize; - } - for (int i = used_digits_; i < product_length; ++i) { - int bigit_index1 = used_digits_ - 1; - int bigit_index2 = i - bigit_index1; - // Invariant: sum of both indices is again equal to i. - // Inner loop runs 0 times on last iteration, emptying accumulator. - while (bigit_index2 < used_digits_) { - Chunk chunk1 = bigits_[copy_offset + bigit_index1]; - Chunk chunk2 = bigits_[copy_offset + bigit_index2]; - accumulator += static_cast<DoubleChunk>(chunk1) * chunk2; - bigit_index1--; - bigit_index2++; - } - // The overwritten bigits_[i] will never be read in further loop iterations, - // because bigit_index1 and bigit_index2 are always greater - // than i - used_digits_. - bigits_[i] = static_cast<Chunk>(accumulator) & kBigitMask; - accumulator >>= kBigitSize; - } - // Since the result was guaranteed to lie inside the number the - // accumulator must be 0 now. - ASSERT(accumulator == 0); - - // Don't forget to update the used_digits and the exponent. - used_digits_ = product_length; - exponent_ *= 2; - Clamp(); -} - - -void Bignum::AssignPowerUInt16(uint16_t base, int power_exponent) { - ASSERT(base != 0); - ASSERT(power_exponent >= 0); - if (power_exponent == 0) { - AssignUInt16(1); - return; - } - Zero(); - int shifts = 0; - // We expect base to be in range 2-32, and most often to be 10. - // It does not make much sense to implement different algorithms for counting - // the bits. - while ((base & 1) == 0) { - base >>= 1; - shifts++; - } - int bit_size = 0; - int tmp_base = base; - while (tmp_base != 0) { - tmp_base >>= 1; - bit_size++; - } - int final_size = bit_size * power_exponent; - // 1 extra bigit for the shifting, and one for rounded final_size. - EnsureCapacity(final_size / kBigitSize + 2); - - // Left to Right exponentiation. - int mask = 1; - while (power_exponent >= mask) mask <<= 1; - - // The mask is now pointing to the bit above the most significant 1-bit of - // power_exponent. - // Get rid of first 1-bit; - mask >>= 2; - uint64_t this_value = base; - - bool delayed_multipliciation = false; - const uint64_t max_32bits = 0xFFFFFFFF; - while (mask != 0 && this_value <= max_32bits) { - this_value = this_value * this_value; - // Verify that there is enough space in this_value to perform the - // multiplication. The first bit_size bits must be 0. - if ((power_exponent & mask) != 0) { - uint64_t base_bits_mask = - ~((static_cast<uint64_t>(1) << (64 - bit_size)) - 1); - bool high_bits_zero = (this_value & base_bits_mask) == 0; - if (high_bits_zero) { - this_value *= base; - } else { - delayed_multipliciation = true; - } - } - mask >>= 1; - } - AssignUInt64(this_value); - if (delayed_multipliciation) { - MultiplyByUInt32(base); - } - - // Now do the same thing as a bignum. - while (mask != 0) { - Square(); - if ((power_exponent & mask) != 0) { - MultiplyByUInt32(base); - } - mask >>= 1; - } - - // And finally add the saved shifts. - ShiftLeft(shifts * power_exponent); -} - - -// Precondition: this/other < 16bit. -uint16_t Bignum::DivideModuloIntBignum(const Bignum& other) { - ASSERT(IsClamped()); - ASSERT(other.IsClamped()); - ASSERT(other.used_digits_ > 0); - - // Easy case: if we have less digits than the divisor than the result is 0. - // Note: this handles the case where this == 0, too. - if (BigitLength() < other.BigitLength()) { - return 0; - } - - Align(other); - - uint16_t result = 0; - - // Start by removing multiples of 'other' until both numbers have the same - // number of digits. - while (BigitLength() > other.BigitLength()) { - // This naive approach is extremely inefficient if the this divided other - // might be big. This function is implemented for doubleToString where - // the result should be small (less than 10). - ASSERT(other.bigits_[other.used_digits_ - 1] >= ((1 << kBigitSize) / 16)); - // Remove the multiples of the first digit. - // Example this = 23 and other equals 9. -> Remove 2 multiples. - result += bigits_[used_digits_ - 1]; - SubtractTimes(other, bigits_[used_digits_ - 1]); - } - - ASSERT(BigitLength() == other.BigitLength()); - - // Both bignums are at the same length now. - // Since other has more than 0 digits we know that the access to - // bigits_[used_digits_ - 1] is safe. - Chunk this_bigit = bigits_[used_digits_ - 1]; - Chunk other_bigit = other.bigits_[other.used_digits_ - 1]; - - if (other.used_digits_ == 1) { - // Shortcut for easy (and common) case. - int quotient = this_bigit / other_bigit; - bigits_[used_digits_ - 1] = this_bigit - other_bigit * quotient; - result += quotient; - Clamp(); - return result; - } - - int division_estimate = this_bigit / (other_bigit + 1); - result += division_estimate; - SubtractTimes(other, division_estimate); - - if (other_bigit * (division_estimate + 1) > this_bigit) { - // No need to even try to subtract. Even if other's remaining digits were 0 - // another subtraction would be too much. - return result; - } - - while (LessEqual(other, *this)) { - SubtractBignum(other); - result++; - } - return result; -} - - -template<typename S> -static int SizeInHexChars(S number) { - ASSERT(number > 0); - int result = 0; - while (number != 0) { - number >>= 4; - result++; - } - return result; -} - - -static char HexCharOfValue(int value) { - ASSERT(0 <= value && value <= 16); - if (value < 10) return value + '0'; - return value - 10 + 'A'; -} - - -bool Bignum::ToHexString(char* buffer, int buffer_size) const { - ASSERT(IsClamped()); - // Each bigit must be printable as separate hex-character. - ASSERT(kBigitSize % 4 == 0); - const int kHexCharsPerBigit = kBigitSize / 4; - - if (used_digits_ == 0) { - if (buffer_size < 2) return false; - buffer[0] = '0'; - buffer[1] = '\0'; - return true; - } - // We add 1 for the terminating '\0' character. - int needed_chars = (BigitLength() - 1) * kHexCharsPerBigit + - SizeInHexChars(bigits_[used_digits_ - 1]) + 1; - if (needed_chars > buffer_size) return false; - int string_index = needed_chars - 1; - buffer[string_index--] = '\0'; - for (int i = 0; i < exponent_; ++i) { - for (int j = 0; j < kHexCharsPerBigit; ++j) { - buffer[string_index--] = '0'; - } - } - for (int i = 0; i < used_digits_ - 1; ++i) { - Chunk current_bigit = bigits_[i]; - for (int j = 0; j < kHexCharsPerBigit; ++j) { - buffer[string_index--] = HexCharOfValue(current_bigit & 0xF); - current_bigit >>= 4; - } - } - // And finally the last bigit. - Chunk most_significant_bigit = bigits_[used_digits_ - 1]; - while (most_significant_bigit != 0) { - buffer[string_index--] = HexCharOfValue(most_significant_bigit & 0xF); - most_significant_bigit >>= 4; - } - return true; -} - - -Bignum::Chunk Bignum::BigitAt(int index) const { - if (index >= BigitLength()) return 0; - if (index < exponent_) return 0; - return bigits_[index - exponent_]; -} - - -int Bignum::Compare(const Bignum& a, const Bignum& b) { - ASSERT(a.IsClamped()); - ASSERT(b.IsClamped()); - int bigit_length_a = a.BigitLength(); - int bigit_length_b = b.BigitLength(); - if (bigit_length_a < bigit_length_b) return -1; - if (bigit_length_a > bigit_length_b) return +1; - for (int i = bigit_length_a - 1; i >= Min(a.exponent_, b.exponent_); --i) { - Chunk bigit_a = a.BigitAt(i); - Chunk bigit_b = b.BigitAt(i); - if (bigit_a < bigit_b) return -1; - if (bigit_a > bigit_b) return +1; - // Otherwise they are equal up to this digit. Try the next digit. - } - return 0; -} - - -int Bignum::PlusCompare(const Bignum& a, const Bignum& b, const Bignum& c) { - ASSERT(a.IsClamped()); - ASSERT(b.IsClamped()); - ASSERT(c.IsClamped()); - if (a.BigitLength() < b.BigitLength()) { - return PlusCompare(b, a, c); - } - if (a.BigitLength() + 1 < c.BigitLength()) return -1; - if (a.BigitLength() > c.BigitLength()) return +1; - // The exponent encodes 0-bigits. So if there are more 0-digits in 'a' than - // 'b' has digits, then the bigit-length of 'a'+'b' must be equal to the one - // of 'a'. - if (a.exponent_ >= b.BigitLength() && a.BigitLength() < c.BigitLength()) { - return -1; - } - - Chunk borrow = 0; - // Starting at min_exponent all digits are == 0. So no need to compare them. - int min_exponent = Min(Min(a.exponent_, b.exponent_), c.exponent_); - for (int i = c.BigitLength() - 1; i >= min_exponent; --i) { - Chunk chunk_a = a.BigitAt(i); - Chunk chunk_b = b.BigitAt(i); - Chunk chunk_c = c.BigitAt(i); - Chunk sum = chunk_a + chunk_b; - if (sum > chunk_c + borrow) { - return +1; - } else { - borrow = chunk_c + borrow - sum; - if (borrow > 1) return -1; - borrow <<= kBigitSize; - } - } - if (borrow == 0) return 0; - return -1; -} - - -void Bignum::Clamp() { - while (used_digits_ > 0 && bigits_[used_digits_ - 1] == 0) { - used_digits_--; - } - if (used_digits_ == 0) { - // Zero. - exponent_ = 0; - } -} - - -bool Bignum::IsClamped() const { - return used_digits_ == 0 || bigits_[used_digits_ - 1] != 0; -} - - -void Bignum::Zero() { - for (int i = 0; i < used_digits_; ++i) { - bigits_[i] = 0; - } - used_digits_ = 0; - exponent_ = 0; -} - - -void Bignum::Align(const Bignum& other) { - if (exponent_ > other.exponent_) { - // If "X" represents a "hidden" digit (by the exponent) then we are in the - // following case (a == this, b == other): - // a: aaaaaaXXXX or a: aaaaaXXX - // b: bbbbbbX b: bbbbbbbbXX - // We replace some of the hidden digits (X) of a with 0 digits. - // a: aaaaaa000X or a: aaaaa0XX - int zero_digits = exponent_ - other.exponent_; - EnsureCapacity(used_digits_ + zero_digits); - for (int i = used_digits_ - 1; i >= 0; --i) { - bigits_[i + zero_digits] = bigits_[i]; - } - for (int i = 0; i < zero_digits; ++i) { - bigits_[i] = 0; - } - used_digits_ += zero_digits; - exponent_ -= zero_digits; - ASSERT(used_digits_ >= 0); - ASSERT(exponent_ >= 0); - } -} - - -void Bignum::BigitsShiftLeft(int shift_amount) { - ASSERT(shift_amount < kBigitSize); - ASSERT(shift_amount >= 0); - Chunk carry = 0; - for (int i = 0; i < used_digits_; ++i) { - Chunk new_carry = bigits_[i] >> (kBigitSize - shift_amount); - bigits_[i] = ((bigits_[i] << shift_amount) + carry) & kBigitMask; - carry = new_carry; - } - if (carry != 0) { - bigits_[used_digits_] = carry; - used_digits_++; - } -} - - -void Bignum::SubtractTimes(const Bignum& other, int factor) { - ASSERT(exponent_ <= other.exponent_); - if (factor < 3) { - for (int i = 0; i < factor; ++i) { - SubtractBignum(other); - } - return; - } - Chunk borrow = 0; - int exponent_diff = other.exponent_ - exponent_; - for (int i = 0; i < other.used_digits_; ++i) { - DoubleChunk product = static_cast<DoubleChunk>(factor) * other.bigits_[i]; - DoubleChunk remove = borrow + product; - Chunk difference = bigits_[i + exponent_diff] - (remove & kBigitMask); - bigits_[i + exponent_diff] = difference & kBigitMask; - borrow = static_cast<Chunk>((difference >> (kChunkSize - 1)) + - (remove >> kBigitSize)); - } - for (int i = other.used_digits_ + exponent_diff; i < used_digits_; ++i) { - if (borrow == 0) return; - Chunk difference = bigits_[i] - borrow; - bigits_[i] = difference & kBigitMask; - borrow = difference >> (kChunkSize - 1); - ++i; - } - Clamp(); -} - - -} // namespace double_conversion
