http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/strtod.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/strtod.cc b/ext/kenlm/util/double-conversion/strtod.cc deleted file mode 100644 index 55b4daa..0000000 --- a/ext/kenlm/util/double-conversion/strtod.cc +++ /dev/null @@ -1,558 +0,0 @@ -// Copyright 2010 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include <cstdarg> -#include <climits> - -#include "strtod.h" -#include "bignum.h" -#include "cached-powers.h" -#include "ieee.h" - -namespace double_conversion { - -// 2^53 = 9007199254740992. -// Any integer with at most 15 decimal digits will hence fit into a double -// (which has a 53bit significand) without loss of precision. -static const int kMaxExactDoubleIntegerDecimalDigits = 15; -// 2^64 = 18446744073709551616 > 10^19 -static const int kMaxUint64DecimalDigits = 19; - -// Max double: 1.7976931348623157 x 10^308 -// Min non-zero double: 4.9406564584124654 x 10^-324 -// Any x >= 10^309 is interpreted as +infinity. -// Any x <= 10^-324 is interpreted as 0. -// Note that 2.5e-324 (despite being smaller than the min double) will be read -// as non-zero (equal to the min non-zero double). 
-static const int kMaxDecimalPower = 309; -static const int kMinDecimalPower = -324; - -// 2^64 = 18446744073709551616 -static const uint64_t kMaxUint64 = UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF); - - -static const double exact_powers_of_ten[] = { - 1.0, // 10^0 - 10.0, - 100.0, - 1000.0, - 10000.0, - 100000.0, - 1000000.0, - 10000000.0, - 100000000.0, - 1000000000.0, - 10000000000.0, // 10^10 - 100000000000.0, - 1000000000000.0, - 10000000000000.0, - 100000000000000.0, - 1000000000000000.0, - 10000000000000000.0, - 100000000000000000.0, - 1000000000000000000.0, - 10000000000000000000.0, - 100000000000000000000.0, // 10^20 - 1000000000000000000000.0, - // 10^22 = 0x21e19e0c9bab2400000 = 0x878678326eac9 * 2^22 - 10000000000000000000000.0 -}; -static const int kExactPowersOfTenSize = ARRAY_SIZE(exact_powers_of_ten); - -// Maximum number of significant digits in the decimal representation. -// In fact the value is 772 (see conversions.cc), but to give us some margin -// we round up to 780. -static const int kMaxSignificantDecimalDigits = 780; - -static Vector<const char> TrimLeadingZeros(Vector<const char> buffer) { - for (int i = 0; i < buffer.length(); i++) { - if (buffer[i] != '0') { - return buffer.SubVector(i, buffer.length()); - } - } - return Vector<const char>(buffer.start(), 0); -} - - -static Vector<const char> TrimTrailingZeros(Vector<const char> buffer) { - for (int i = buffer.length() - 1; i >= 0; --i) { - if (buffer[i] != '0') { - return buffer.SubVector(0, i + 1); - } - } - return Vector<const char>(buffer.start(), 0); -} - - -static void CutToMaxSignificantDigits(Vector<const char> buffer, - int exponent, - char* significant_buffer, - int* significant_exponent) { - for (int i = 0; i < kMaxSignificantDecimalDigits - 1; ++i) { - significant_buffer[i] = buffer[i]; - } - // The input buffer has been trimmed. Therefore the last digit must be - // different from '0'. - ASSERT(buffer[buffer.length() - 1] != '0'); - // Set the last digit to be non-zero. 
This is sufficient to guarantee - // correct rounding. - significant_buffer[kMaxSignificantDecimalDigits - 1] = '1'; - *significant_exponent = - exponent + (buffer.length() - kMaxSignificantDecimalDigits); -} - - -// Trims the buffer and cuts it to at most kMaxSignificantDecimalDigits. -// If possible the input-buffer is reused, but if the buffer needs to be -// modified (due to cutting), then the input needs to be copied into the -// buffer_copy_space. -static void TrimAndCut(Vector<const char> buffer, int exponent, - char* buffer_copy_space, int space_size, - Vector<const char>* trimmed, int* updated_exponent) { - Vector<const char> left_trimmed = TrimLeadingZeros(buffer); - Vector<const char> right_trimmed = TrimTrailingZeros(left_trimmed); - exponent += left_trimmed.length() - right_trimmed.length(); - if (right_trimmed.length() > kMaxSignificantDecimalDigits) { - ASSERT(space_size >= kMaxSignificantDecimalDigits); - CutToMaxSignificantDigits(right_trimmed, exponent, - buffer_copy_space, updated_exponent); - *trimmed = Vector<const char>(buffer_copy_space, - kMaxSignificantDecimalDigits); - } else { - *trimmed = right_trimmed; - *updated_exponent = exponent; - } -} - - -// Reads digits from the buffer and converts them to a uint64. -// Reads in as many digits as fit into a uint64. -// When the string starts with "1844674407370955161" no further digit is read. -// Since 2^64 = 18446744073709551616 it would still be possible read another -// digit if it was less or equal than 6, but this would complicate the code. -static uint64_t ReadUint64(Vector<const char> buffer, - int* number_of_read_digits) { - uint64_t result = 0; - int i = 0; - while (i < buffer.length() && result <= (kMaxUint64 / 10 - 1)) { - int digit = buffer[i++] - '0'; - ASSERT(0 <= digit && digit <= 9); - result = 10 * result + digit; - } - *number_of_read_digits = i; - return result; -} - - -// Reads a DiyFp from the buffer. -// The returned DiyFp is not necessarily normalized. 
-// If remaining_decimals is zero then the returned DiyFp is accurate. -// Otherwise it has been rounded and has error of at most 1/2 ulp. -static void ReadDiyFp(Vector<const char> buffer, - DiyFp* result, - int* remaining_decimals) { - int read_digits; - uint64_t significand = ReadUint64(buffer, &read_digits); - if (buffer.length() == read_digits) { - *result = DiyFp(significand, 0); - *remaining_decimals = 0; - } else { - // Round the significand. - if (buffer[read_digits] >= '5') { - significand++; - } - // Compute the binary exponent. - int exponent = 0; - *result = DiyFp(significand, exponent); - *remaining_decimals = buffer.length() - read_digits; - } -} - - -static bool DoubleStrtod(Vector<const char> trimmed, - int exponent, - double* result) { -#if !defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS) - // On x86 the floating-point stack can be 64 or 80 bits wide. If it is - // 80 bits wide (as is the case on Linux) then double-rounding occurs and the - // result is not accurate. - // We know that Windows32 uses 64 bits and is therefore accurate. - // Note that the ARM simulator is compiled for 32bits. It therefore exhibits - // the same problem. - return false; -#endif - if (trimmed.length() <= kMaxExactDoubleIntegerDecimalDigits) { - int read_digits; - // The trimmed input fits into a double. - // If the 10^exponent (resp. 10^-exponent) fits into a double too then we - // can compute the result-double simply by multiplying (resp. dividing) the - // two numbers. - // This is possible because IEEE guarantees that floating-point operations - // return the best possible approximation. - if (exponent < 0 && -exponent < kExactPowersOfTenSize) { - // 10^-exponent fits into a double. - *result = static_cast<double>(ReadUint64(trimmed, &read_digits)); - ASSERT(read_digits == trimmed.length()); - *result /= exact_powers_of_ten[-exponent]; - return true; - } - if (0 <= exponent && exponent < kExactPowersOfTenSize) { - // 10^exponent fits into a double. 
- *result = static_cast<double>(ReadUint64(trimmed, &read_digits)); - ASSERT(read_digits == trimmed.length()); - *result *= exact_powers_of_ten[exponent]; - return true; - } - int remaining_digits = - kMaxExactDoubleIntegerDecimalDigits - trimmed.length(); - if ((0 <= exponent) && - (exponent - remaining_digits < kExactPowersOfTenSize)) { - // The trimmed string was short and we can multiply it with - // 10^remaining_digits. As a result the remaining exponent now fits - // into a double too. - *result = static_cast<double>(ReadUint64(trimmed, &read_digits)); - ASSERT(read_digits == trimmed.length()); - *result *= exact_powers_of_ten[remaining_digits]; - *result *= exact_powers_of_ten[exponent - remaining_digits]; - return true; - } - } - return false; -} - - -// Returns 10^exponent as an exact DiyFp. -// The given exponent must be in the range [1; kDecimalExponentDistance[. -static DiyFp AdjustmentPowerOfTen(int exponent) { - ASSERT(0 < exponent); - ASSERT(exponent < PowersOfTenCache::kDecimalExponentDistance); - // Simply hardcode the remaining powers for the given decimal exponent - // distance. - ASSERT(PowersOfTenCache::kDecimalExponentDistance == 8); - switch (exponent) { - case 1: return DiyFp(UINT64_2PART_C(0xa0000000, 00000000), -60); - case 2: return DiyFp(UINT64_2PART_C(0xc8000000, 00000000), -57); - case 3: return DiyFp(UINT64_2PART_C(0xfa000000, 00000000), -54); - case 4: return DiyFp(UINT64_2PART_C(0x9c400000, 00000000), -50); - case 5: return DiyFp(UINT64_2PART_C(0xc3500000, 00000000), -47); - case 6: return DiyFp(UINT64_2PART_C(0xf4240000, 00000000), -44); - case 7: return DiyFp(UINT64_2PART_C(0x98968000, 00000000), -40); - default: - UNREACHABLE(); - return DiyFp(0, 0); - } -} - - -// If the function returns true then the result is the correct double. -// Otherwise it is either the correct double or the double that is just below -// the correct double. 
-static bool DiyFpStrtod(Vector<const char> buffer, - int exponent, - double* result) { - DiyFp input; - int remaining_decimals; - ReadDiyFp(buffer, &input, &remaining_decimals); - // Since we may have dropped some digits the input is not accurate. - // If remaining_decimals is different from 0 then the error is at most - // .5 ulp (unit in the last place). - // We don't want to deal with fractions and therefore keep a common - // denominator. - const int kDenominatorLog = 3; - const int kDenominator = 1 << kDenominatorLog; - // Move the remaining decimals into the exponent. - exponent += remaining_decimals; - int error = (remaining_decimals == 0 ? 0 : kDenominator / 2); - - int old_e = input.e(); - input.Normalize(); - error <<= old_e - input.e(); - - ASSERT(exponent <= PowersOfTenCache::kMaxDecimalExponent); - if (exponent < PowersOfTenCache::kMinDecimalExponent) { - *result = 0.0; - return true; - } - DiyFp cached_power; - int cached_decimal_exponent; - PowersOfTenCache::GetCachedPowerForDecimalExponent(exponent, - &cached_power, - &cached_decimal_exponent); - - if (cached_decimal_exponent != exponent) { - int adjustment_exponent = exponent - cached_decimal_exponent; - DiyFp adjustment_power = AdjustmentPowerOfTen(adjustment_exponent); - input.Multiply(adjustment_power); - if (kMaxUint64DecimalDigits - buffer.length() >= adjustment_exponent) { - // The product of input with the adjustment power fits into a 64 bit - // integer. - ASSERT(DiyFp::kSignificandSize == 64); - } else { - // The adjustment power is exact. There is hence only an error of 0.5.
- error += kDenominator / 2; - } - } - - input.Multiply(cached_power); - // The error introduced by a multiplication of a*b equals - // error_a + error_b + error_a*error_b/2^64 + 0.5 - // Substituting a with 'input' and b with 'cached_power' we have - // error_b = 0.5 (all cached powers have an error of less than 0.5 ulp), - // error_ab = 0 or 1 / kDenominator > error_a*error_b/ 2^64 - int error_b = kDenominator / 2; - int error_ab = (error == 0 ? 0 : 1); // We round up to 1. - int fixed_error = kDenominator / 2; - error += error_b + error_ab + fixed_error; - - old_e = input.e(); - input.Normalize(); - error <<= old_e - input.e(); - - // See if the double's significand changes if we add/subtract the error. - int order_of_magnitude = DiyFp::kSignificandSize + input.e(); - int effective_significand_size = - Double::SignificandSizeForOrderOfMagnitude(order_of_magnitude); - int precision_digits_count = - DiyFp::kSignificandSize - effective_significand_size; - if (precision_digits_count + kDenominatorLog >= DiyFp::kSignificandSize) { - // This can only happen for very small denormals. In this case the - // half-way multiplied by the denominator exceeds the range of an uint64. - // Simply shift everything to the right. - int shift_amount = (precision_digits_count + kDenominatorLog) - - DiyFp::kSignificandSize + 1; - input.set_f(input.f() >> shift_amount); - input.set_e(input.e() + shift_amount); - // We add 1 for the lost precision of error, and kDenominator for - // the lost precision of input.f(). - error = (error >> shift_amount) + 1 + kDenominator; - precision_digits_count -= shift_amount; - } - // We use uint64_ts now. This only works if the DiyFp uses uint64_ts too. 
- ASSERT(DiyFp::kSignificandSize == 64); - ASSERT(precision_digits_count < 64); - uint64_t one64 = 1; - uint64_t precision_bits_mask = (one64 << precision_digits_count) - 1; - uint64_t precision_bits = input.f() & precision_bits_mask; - uint64_t half_way = one64 << (precision_digits_count - 1); - precision_bits *= kDenominator; - half_way *= kDenominator; - DiyFp rounded_input(input.f() >> precision_digits_count, - input.e() + precision_digits_count); - if (precision_bits >= half_way + error) { - rounded_input.set_f(rounded_input.f() + 1); - } - // If the last_bits are too close to the half-way case then we are too - // inaccurate and round down. In this case we return false so that we can - // fall back to a more precise algorithm. - - *result = Double(rounded_input).value(); - if (half_way - error < precision_bits && precision_bits < half_way + error) { - // Too imprecise. The caller will have to fall back to a slower version. - // However the returned number is guaranteed to be either the correct - // double, or the next-lower double. - return false; - } else { - return true; - } -} - - -// Returns -// - -1 if buffer*10^exponent < diy_fp. -// - 0 if buffer*10^exponent == diy_fp. -// - +1 if buffer*10^exponent > diy_fp. -// Preconditions: -// buffer.length() + exponent <= kMaxDecimalPower + 1 -// buffer.length() + exponent > kMinDecimalPower -// buffer.length() <= kMaxSignificantDecimalDigits -static int CompareBufferWithDiyFp(Vector<const char> buffer, - int exponent, - DiyFp diy_fp) { - ASSERT(buffer.length() + exponent <= kMaxDecimalPower + 1); - ASSERT(buffer.length() + exponent > kMinDecimalPower); - ASSERT(buffer.length() <= kMaxSignificantDecimalDigits); - // Make sure that the Bignum will be able to hold all our numbers. - // Our Bignum implementation has a separate field for exponents. Shifts will - // consume at most one bigit (< 64 bits). - // log2(10) == 3.3219...
- ASSERT(((kMaxDecimalPower + 1) * 333 / 100) < Bignum::kMaxSignificantBits); - Bignum buffer_bignum; - Bignum diy_fp_bignum; - buffer_bignum.AssignDecimalString(buffer); - diy_fp_bignum.AssignUInt64(diy_fp.f()); - if (exponent >= 0) { - buffer_bignum.MultiplyByPowerOfTen(exponent); - } else { - diy_fp_bignum.MultiplyByPowerOfTen(-exponent); - } - if (diy_fp.e() > 0) { - diy_fp_bignum.ShiftLeft(diy_fp.e()); - } else { - buffer_bignum.ShiftLeft(-diy_fp.e()); - } - return Bignum::Compare(buffer_bignum, diy_fp_bignum); -} - - -// Returns true if the guess is the correct double. -// Returns false, when guess is either correct or the next-lower double. -static bool ComputeGuess(Vector<const char> trimmed, int exponent, - double* guess) { - if (trimmed.length() == 0) { - *guess = 0.0; - return true; - } - if (exponent + trimmed.length() - 1 >= kMaxDecimalPower) { - *guess = Double::Infinity(); - return true; - } - if (exponent + trimmed.length() <= kMinDecimalPower) { - *guess = 0.0; - return true; - } - - if (DoubleStrtod(trimmed, exponent, guess) || - DiyFpStrtod(trimmed, exponent, guess)) { - return true; - } - if (*guess == Double::Infinity()) { - return true; - } - return false; -} - -double Strtod(Vector<const char> buffer, int exponent) { - char copy_buffer[kMaxSignificantDecimalDigits]; - Vector<const char> trimmed; - int updated_exponent; - TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits, - &trimmed, &updated_exponent); - exponent = updated_exponent; - - double guess; - bool is_correct = ComputeGuess(trimmed, exponent, &guess); - if (is_correct) return guess; - - DiyFp upper_boundary = Double(guess).UpperBoundary(); - int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary); - if (comparison < 0) { - return guess; - } else if (comparison > 0) { - return Double(guess).NextDouble(); - } else if ((Double(guess).Significand() & 1) == 0) { - // Round towards even. 
- return guess; - } else { - return Double(guess).NextDouble(); - } -} - -float Strtof(Vector<const char> buffer, int exponent) { - char copy_buffer[kMaxSignificantDecimalDigits]; - Vector<const char> trimmed; - int updated_exponent; - TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits, - &trimmed, &updated_exponent); - exponent = updated_exponent; - - double double_guess; - bool is_correct = ComputeGuess(trimmed, exponent, &double_guess); - - float float_guess = static_cast<float>(double_guess); - if (float_guess == double_guess) { - // This shortcut triggers for integer values. - return float_guess; - } - - // We must catch double-rounding. Say the double has been rounded up, and is - // now a boundary of a float, and rounds up again. This is why we have to - // look at previous too. - // Example (in decimal numbers): - // input: 12349 - // high-precision (4 digits): 1235 - // low-precision (3 digits): - // when read from input: 123 - // when rounded from high precision: 124. - // To do this we simply look at the neighbors of the correct result and see - // if they would round to the same float. If the guess is not correct we have - // to look at four values (since two different doubles could be the correct - // double). - - double double_next = Double(double_guess).NextDouble(); - double double_previous = Double(double_guess).PreviousDouble(); - - float f1 = static_cast<float>(double_previous); -#ifndef NDEBUG - float f2 = float_guess; -#endif - float f3 = static_cast<float>(double_next); - float f4; - if (is_correct) { - f4 = f3; - } else { - double double_next2 = Double(double_next).NextDouble(); - f4 = static_cast<float>(double_next2); - } -#ifndef NDEBUG - ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4); -#endif - - // If the guess doesn't lie near a single-precision boundary we can simply - // return its float-value.
- if (f1 == f4) { - return float_guess; - } - - ASSERT((f1 != f2 && f2 == f3 && f3 == f4) || - (f1 == f2 && f2 != f3 && f3 == f4) || - (f1 == f2 && f2 == f3 && f3 != f4)); - - // guess and next are the two possible candidates (in the same way that - // double_guess was the lower candidate for a double-precision guess). - float guess = f1; - float next = f4; - DiyFp upper_boundary; - if (guess == 0.0f) { - float min_float = 1e-45f; - upper_boundary = Double(static_cast<double>(min_float) / 2).AsDiyFp(); - } else { - upper_boundary = Single(guess).UpperBoundary(); - } - int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary); - if (comparison < 0) { - return guess; - } else if (comparison > 0) { - return next; - } else if ((Single(guess).Significand() & 1) == 0) { - // Round towards even. - return guess; - } else { - return next; - } -} - -} // namespace double_conversion
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/strtod.h ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/strtod.h b/ext/kenlm/util/double-conversion/strtod.h deleted file mode 100644 index ed0293b..0000000 --- a/ext/kenlm/util/double-conversion/strtod.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2010 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef DOUBLE_CONVERSION_STRTOD_H_ -#define DOUBLE_CONVERSION_STRTOD_H_ - -#include "utils.h" - -namespace double_conversion { - -// The buffer must only contain digits in the range [0-9]. It must not -// contain a dot or a sign. It must not start with '0', and must not be empty. -double Strtod(Vector<const char> buffer, int exponent); - -// The buffer must only contain digits in the range [0-9]. It must not -// contain a dot or a sign. It must not start with '0', and must not be empty. -float Strtof(Vector<const char> buffer, int exponent); - -} // namespace double_conversion - -#endif // DOUBLE_CONVERSION_STRTOD_H_ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/utils.h ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/double-conversion/utils.h b/ext/kenlm/util/double-conversion/utils.h deleted file mode 100644 index 9ccb3b6..0000000 --- a/ext/kenlm/util/double-conversion/utils.h +++ /dev/null @@ -1,320 +0,0 @@ -// Copyright 2010 the V8 project authors. All rights reserved. 
-// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#ifndef DOUBLE_CONVERSION_UTILS_H_ -#define DOUBLE_CONVERSION_UTILS_H_ - -#include <stdlib.h> -#include <string.h> - -#include <assert.h> -#ifndef ASSERT -#define ASSERT(condition) (assert(condition)) -#endif -#ifndef UNIMPLEMENTED -#define UNIMPLEMENTED() (abort()) -#endif -#ifndef UNREACHABLE -#define UNREACHABLE() (abort()) -#endif - -// Double operations detection based on target architecture. -// Linux uses a 80bit wide floating point stack on x86. 
This induces double -// rounding, which in turn leads to wrong results. -// An easy way to test if the floating-point operations are correct is to -// evaluate: 89255.0/1e22. If the floating-point stack is 64 bits wide then -// the result is equal to 89255e-22. -// The best way to test this, is to create a division-function and to compare -// the output of the division with the expected result. (Inlining must be -// disabled.) -// On Linux,x86 89255e-22 != Div_double(89255.0/1e22) -#if defined(_M_X64) || defined(__x86_64__) || \ - defined(__ARMEL__) || defined(__avr32__) || \ - defined(__hppa__) || defined(__ia64__) || \ - defined(__mips__) || defined(__powerpc__) || \ - defined(__sparc__) || defined(__sparc) || defined(__s390__) || \ - defined(__SH4__) || defined(__alpha__) || \ - defined(_MIPS_ARCH_MIPS32R2) -#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 -#elif defined(_M_IX86) || defined(__i386__) || defined(__i386) -#if defined(_WIN32) -// Windows uses a 64bit wide floating point stack. -#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 -#else -#undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS -#endif // _WIN32 -#else -#error Target architecture was not detected as supported by Double-Conversion. -#endif - - -#if defined(_WIN32) && !defined(__MINGW32__) - -typedef signed char int8_t; -typedef unsigned char uint8_t; -typedef short int16_t; // NOLINT -typedef unsigned short uint16_t; // NOLINT -typedef int int32_t; -typedef unsigned int uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -// intptr_t and friends are defined in crtdefs.h through stdio.h. - -#else - -#include <stdint.h> - -#endif - -// The following macro works on both 32 and 64-bit platforms. 
-// Usage: instead of writing 0x1234567890123456 -// write UINT64_2PART_C(0x12345678,90123456); -#define UINT64_2PART_C(a, b) (((static_cast<uint64_t>(a) << 32) + 0x##b##u)) - - -// The expression ARRAY_SIZE(a) is a compile-time constant of type -// size_t which represents the number of elements of the given -// array. You should only use ARRAY_SIZE on statically allocated -// arrays. -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(a) \ - ((sizeof(a) / sizeof(*(a))) / \ - static_cast<size_t>(!(sizeof(a) % sizeof(*(a))))) -#endif - -// A macro to disallow the evil copy constructor and operator= functions -// This should be used in the private: declarations for a class -#ifndef DISALLOW_COPY_AND_ASSIGN -#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&); \ - void operator=(const TypeName&) -#endif - -// A macro to disallow all the implicit constructors, namely the -// default constructor, copy constructor and operator= functions. -// -// This should be used in the private: declarations for a class -// that wants to prevent anyone from instantiating it. This is -// especially useful for classes containing only static methods. -#ifndef DISALLOW_IMPLICIT_CONSTRUCTORS -#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ - TypeName(); \ - DISALLOW_COPY_AND_ASSIGN(TypeName) -#endif - -namespace double_conversion { - -static const int kCharSize = sizeof(char); - -// Returns the maximum of the two parameters. -template <typename T> -static T Max(T a, T b) { - return a < b ? b : a; -} - - -// Returns the minimum of the two parameters. -template <typename T> -static T Min(T a, T b) { - return a < b ? a : b; -} - - -inline int StrLength(const char* string) { - size_t length = strlen(string); - ASSERT(length == static_cast<size_t>(static_cast<int>(length))); - return static_cast<int>(length); -} - -// This is a simplified version of V8's Vector class. 
-template <typename T> -class Vector { - public: - Vector() : start_(NULL), length_(0) {} - Vector(T* data, int length) : start_(data), length_(length) { - ASSERT(length == 0 || (length > 0 && data != NULL)); - } - - // Returns a vector using the same backing storage as this one, - // spanning from and including 'from', to but not including 'to'. - Vector<T> SubVector(int from, int to) { - ASSERT(to <= length_); - ASSERT(from < to); - ASSERT(0 <= from); - return Vector<T>(start() + from, to - from); - } - - // Returns the length of the vector. - int length() const { return length_; } - - // Returns whether or not the vector is empty. - bool is_empty() const { return length_ == 0; } - - // Returns the pointer to the start of the data in the vector. - T* start() const { return start_; } - - // Access individual vector elements - checks bounds in debug mode. - T& operator[](int index) const { - ASSERT(0 <= index && index < length_); - return start_[index]; - } - - T& first() { return start_[0]; } - - T& last() { return start_[length_ - 1]; } - - private: - T* start_; - int length_; -}; - - -// Helper class for building result strings in a character buffer. The -// purpose of the class is to use safe operations that checks the -// buffer bounds on all operations in debug mode. -class StringBuilder { - public: - StringBuilder(char* buffer, int size) - : buffer_(buffer, size), position_(0) { } - - ~StringBuilder() { if (!is_finalized()) Finalize(); } - - int size() const { return buffer_.length(); } - - // Get the current position in the builder. - int position() const { - ASSERT(!is_finalized()); - return position_; - } - - // Reset the position. - void Reset() { position_ = 0; } - - // Add a single character to the builder. It is not allowed to add - // 0-characters; use the Finalize() method to terminate the string - // instead. - void AddCharacter(char c) { - // I just extract raw data not a cstr so null is fine. 
- //ASSERT(c != '\0'); - ASSERT(!is_finalized() && position_ < buffer_.length()); - buffer_[position_++] = c; - } - - // Add an entire string to the builder. Uses strlen() internally to - // compute the length of the input string. - void AddString(const char* s) { - AddSubstring(s, StrLength(s)); - } - - // Add the first 'n' characters of the given string 's' to the - // builder. The input string must have enough characters. - void AddSubstring(const char* s, int n) { - ASSERT(!is_finalized() && position_ + n < buffer_.length()); - // I just extract raw data not a cstr so null is fine. - //ASSERT(static_cast<size_t>(n) <= strlen(s)); - memmove(&buffer_[position_], s, n * kCharSize); - position_ += n; - } - - - // Add character padding to the builder. If count is non-positive, - // nothing is added to the builder. - void AddPadding(char c, int count) { - for (int i = 0; i < count; i++) { - AddCharacter(c); - } - } - - // Finalize the string by 0-terminating it and returning the buffer. - char* Finalize() { - ASSERT(!is_finalized() && position_ < buffer_.length()); - buffer_[position_] = '\0'; - // Make sure nobody managed to add a 0-character to the - // buffer while building the string. - // I just extract raw data not a cstr so null is fine. - //ASSERT(strlen(buffer_.start()) == static_cast<size_t>(position_)); - position_ = -1; - ASSERT(is_finalized()); - return buffer_.start(); - } - - private: - Vector<char> buffer_; - int position_; - - bool is_finalized() const { return position_ < 0; } - - DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder); -}; - -// The type-based aliasing rule allows the compiler to assume that pointers of -// different types (for some definition of different) never alias each other. 
-// Thus the following code does not work: -// -// float f = foo(); -// int fbits = *(int*)(&f); -// -// The compiler 'knows' that the int pointer can't refer to f since the types -// don't match, so the compiler may cache f in a register, leaving random data -// in fbits. Using C++ style casts makes no difference, however a pointer to -// char data is assumed to alias any other pointer. This is the 'memcpy -// exception'. -// -// Bit_cast uses the memcpy exception to move the bits from a variable of one -// type of a variable of another type. Of course the end result is likely to -// be implementation dependent. Most compilers (gcc-4.2 and MSVC 2005) -// will completely optimize BitCast away. -// -// There is an additional use for BitCast. -// Recent gccs will warn when they see casts that may result in breakage due to -// the type-based aliasing rule. If you have checked that there is no breakage -// you can use BitCast to cast one pointer type to another. This confuses gcc -// enough that it can no longer see that you have cast one pointer type to -// another thus avoiding the warning. -template <class Dest, class Source> -inline Dest BitCast(const Source& source) { - // Compile time assertion: sizeof(Dest) == sizeof(Source) - // A compile error here means your Dest and Source have different sizes. - typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 
1 : -1] -#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8 - __attribute__((unused)) -#endif - ; - - Dest dest; - memmove(&dest, &source, sizeof(dest)); - return dest; -} - -template <class Dest, class Source> -inline Dest BitCast(Source* source) { - return BitCast<Dest>(reinterpret_cast<uintptr_t>(source)); -} - -} // namespace double_conversion - -#endif // DOUBLE_CONVERSION_UTILS_H_ http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/ersatz_progress.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/ersatz_progress.cc b/ext/kenlm/util/ersatz_progress.cc deleted file mode 100644 index 55c82e7..0000000 --- a/ext/kenlm/util/ersatz_progress.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "util/ersatz_progress.hh" - -#include <algorithm> -#include <ostream> -#include <limits> -#include <string> - -namespace util { - -namespace { const unsigned char kWidth = 100; } - -const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"; - -ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<uint64_t>::max()), complete_(next_), out_(NULL) {} - -ErsatzProgress::~ErsatzProgress() { - if (out_) Finished(); -} - -ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message) - : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) { - if (!out_) { - next_ = std::numeric_limits<uint64_t>::max(); - return; - } - if (!message.empty()) *out_ << message << '\n'; - *out_ << kProgressBanner; -} - -void ErsatzProgress::Milestone() { - if (!out_) { current_ = 0; return; } - if (!complete_) return; - unsigned char stone = std::min(static_cast<uint64_t>(kWidth), 
(current_ * kWidth) / complete_); - - for (; stones_written_ < stone; ++stones_written_) { - (*out_) << '*'; - } - if (stone == kWidth) { - (*out_) << std::endl; - next_ = std::numeric_limits<uint64_t>::max(); - out_ = NULL; - } else { - next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth); - } -} - -} // namespace util http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/ersatz_progress.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/ersatz_progress.hh b/ext/kenlm/util/ersatz_progress.hh deleted file mode 100644 index b47aded..0000000 --- a/ext/kenlm/util/ersatz_progress.hh +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef UTIL_ERSATZ_PROGRESS_H -#define UTIL_ERSATZ_PROGRESS_H - -#include <iostream> -#include <string> -#include <stdint.h> - -// Ersatz version of boost::progress so core language model doesn't depend on -// boost. Also adds option to print nothing. - -namespace util { - -extern const char kProgressBanner[]; - -class ErsatzProgress { - public: - // No output. - ErsatzProgress(); - - // Null means no output. The null value is useful for passing along the ostream pointer from another caller. 
- explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); - - ~ErsatzProgress(); - - ErsatzProgress &operator++() { - if (++current_ >= next_) Milestone(); - return *this; - } - - ErsatzProgress &operator+=(uint64_t amount) { - if ((current_ += amount) >= next_) Milestone(); - return *this; - } - - void Set(uint64_t to) { - if ((current_ = to) >= next_) Milestone(); - } - - void Finished() { - Set(complete_); - } - - private: - void Milestone(); - - uint64_t current_, next_, complete_; - unsigned char stones_written_; - std::ostream *out_; - - // noncopyable - ErsatzProgress(const ErsatzProgress &other); - ErsatzProgress &operator=(const ErsatzProgress &other); -}; - -} // namespace util - -#endif // UTIL_ERSATZ_PROGRESS_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/exception.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/exception.cc b/ext/kenlm/util/exception.cc deleted file mode 100644 index e644d2c..0000000 --- a/ext/kenlm/util/exception.cc +++ /dev/null @@ -1,105 +0,0 @@ -#include "util/exception.hh" - -#ifdef __GXX_RTTI -#include <typeinfo> -#endif - -#include <cerrno> -#include <cstring> - -#if defined(_WIN32) || defined(_WIN64) -#include <windows.h> -#include <io.h> -#endif - -namespace util { - -Exception::Exception() throw() {} -Exception::~Exception() throw() {} - -void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) { - /* The child class might have set some text, but we want this to come first. 
- * Another option would be passing this information to the constructor, but - * then child classes would have to accept constructor arguments and pass - * them down. - */ - std::string old_text; - std::swap(old_text, what_); - StringStream stream(what_); - stream << file << ':' << line; - if (func) stream << " in " << func << " threw "; - if (child_name) { - stream << child_name; - } else { -#ifdef __GXX_RTTI - stream << typeid(this).name(); -#else - stream << "an exception"; -#endif - } - if (condition) { - stream << " because `" << condition << '\''; - } - stream << ".\n"; - stream << old_text; -} - -namespace { - -#ifdef __GNUC__ -const char *HandleStrerror(int ret, const char *buf) __attribute__ ((unused)); -const char *HandleStrerror(const char *ret, const char * /*buf*/) __attribute__ ((unused)); -#endif -// At least one of these functions will not be called. -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunused-function" -#endif -// The XOPEN version. -const char *HandleStrerror(int ret, const char *buf) { - if (!ret) return buf; - return NULL; -} - -// The GNU version. 
-const char *HandleStrerror(const char *ret, const char * /*buf*/) { - return ret; -} -#ifdef __clang__ -#pragma clang diagnostic pop -#endif -} // namespace - -ErrnoException::ErrnoException() throw() : errno_(errno) { - char buf[200]; - buf[0] = 0; -#if defined(sun) || defined(_WIN32) || defined(_WIN64) - const char *add = strerror(errno); -#else - const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); -#endif - - if (add) { - *this << add << ' '; - } -} - -ErrnoException::~ErrnoException() throw() {} - -OverflowException::OverflowException() throw() {} -OverflowException::~OverflowException() throw() {} - -#if defined(_WIN32) || defined(_WIN64) -WindowsException::WindowsException() throw() { - unsigned int last_error = GetLastError(); - char error_msg[256] = ""; - if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, last_error, LANG_NEUTRAL, error_msg, sizeof(error_msg), NULL)) { - *this << "Windows error " << GetLastError() << " while formatting Windows error " << last_error << ". 
"; - } else { - *this << "Windows error " << last_error << ": " << error_msg; - } -} -WindowsException::~WindowsException() throw() {} -#endif - -} // namespace util http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/exception.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/exception.hh b/ext/kenlm/util/exception.hh deleted file mode 100644 index 57d803d..0000000 --- a/ext/kenlm/util/exception.hh +++ /dev/null @@ -1,159 +0,0 @@ -#ifndef UTIL_EXCEPTION_H -#define UTIL_EXCEPTION_H - -#include "util/string_stream.hh" - -#include <exception> -#include <limits> -#include <string> -#include <stdint.h> - -namespace util { - -template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data); - -class Exception : public std::exception { - public: - Exception() throw(); - virtual ~Exception() throw(); - - const char *what() const throw() { return what_.c_str(); } - - // For use by the UTIL_THROW macros. - void SetLocation( - const char *file, - unsigned int line, - const char *func, - const char *child_name, - const char *condition); - - private: - template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data); - - // This helps restrict operator<< defined below. - template <class T> struct ExceptionTag { - typedef T Identity; - }; - - std::string what_; -}; - -/* This implements the normal operator<< for Exception and all its children. - * SFINAE means it only applies to Exception. Think of this as an ersatz - * boost::enable_if. 
- */ -template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) { - StringStream(e.what_) << data; - return e; -} - -#ifdef __GNUC__ -#define UTIL_FUNC_NAME __PRETTY_FUNCTION__ -#else -#ifdef _WIN32 -#define UTIL_FUNC_NAME __FUNCTION__ -#else -#define UTIL_FUNC_NAME NULL -#endif -#endif - -/* Create an instance of Exception, add the message Modify, and throw it. - * Modify is appended to the what() message and can contain << for ostream - * operations. - * - * do .. while kludge to swallow trailing ; character - * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html . - * Arg can be a constructor argument to the exception. - */ -#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \ - Exception UTIL_e Arg; \ - UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \ - UTIL_e << Modify; \ - throw UTIL_e; \ -} while (0) - -#define UTIL_THROW_ARG(Exception, Arg, Modify) \ - UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify) - -#define UTIL_THROW(Exception, Modify) \ - UTIL_THROW_BACKEND(NULL, Exception, , Modify); - -#define UTIL_THROW2(Modify) \ - UTIL_THROW_BACKEND(NULL, util::Exception, , Modify); - -#if __GNUC__ >= 3 -#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0) -#else -#define UTIL_UNLIKELY(x) (x) -#endif - -#if __GNUC__ >= 3 -#define UTIL_LIKELY(x) __builtin_expect (!!(x), 1) -#else -#define UTIL_LIKELY(x) (x) -#endif - -#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \ - if (UTIL_UNLIKELY(Condition)) { \ - UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \ - } \ -} while (0) - -#define UTIL_THROW_IF(Condition, Exception, Modify) \ - UTIL_THROW_IF_ARG(Condition, Exception, , Modify) - -#define UTIL_THROW_IF2(Condition, Modify) \ - UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify) - -// Exception that records errno and adds it to the message. 
-class ErrnoException : public Exception { - public: - ErrnoException() throw(); - - virtual ~ErrnoException() throw(); - - int Error() const throw() { return errno_; } - - private: - int errno_; -}; - -// file wasn't there, or couldn't be open for some reason -class FileOpenException : public Exception { - public: - FileOpenException() throw() {} - ~FileOpenException() throw() {} -}; - -// Utilities for overflow checking. -class OverflowException : public Exception { - public: - OverflowException() throw(); - ~OverflowException() throw(); -}; - -template <unsigned len> inline std::size_t CheckOverflowInternal(uint64_t value) { - UTIL_THROW_IF(value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code."); - return value; -} - -template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { - return value; -} - -inline std::size_t CheckOverflow(uint64_t value) { - return CheckOverflowInternal<sizeof(std::size_t)>(value); -} - -#if defined(_WIN32) || defined(_WIN64) -/* Thrown for Windows specific operations. 
*/ -class WindowsException : public Exception { - public: - WindowsException() throw(); - ~WindowsException() throw(); -}; -#endif - -} // namespace util - -#endif // UTIL_EXCEPTION_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/fake_ostream.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/fake_ostream.hh b/ext/kenlm/util/fake_ostream.hh deleted file mode 100644 index 2f76053..0000000 --- a/ext/kenlm/util/fake_ostream.hh +++ /dev/null @@ -1,111 +0,0 @@ -#ifndef UTIL_FAKE_OSTREAM_H -#define UTIL_FAKE_OSTREAM_H - -#include "util/float_to_string.hh" -#include "util/integer_to_string.hh" -#include "util/string_piece.hh" - -#include <cassert> -#include <limits> - -#include <stdint.h> - -namespace util { - -/* Like std::ostream but without being incredibly slow. - * Supports most of the built-in types except for long double. - * - * The FakeOStream class is intended to be inherited from. The inherting class - * should provide: - * public: - * Derived &flush(); - * Derived &write(const void *data, std::size_t length); - * - * private: or protected: - * friend class FakeOStream; - * char *Ensure(std::size_t amount); - * void AdvanceTo(char *to); - * - * The Ensure function makes enough space for an in-place write and returns - * where to write. The AdvanceTo function happens after the write, saying how - * much was actually written. - * - * Precondition: - * amount <= kToStringMaxBytes for in-place writes. - */ -template <class Derived> class FakeOStream { - public: - FakeOStream() {} - - // This also covers std::string and char* - Derived &operator<<(StringPiece str) { - return C().write(str.data(), str.size()); - } - - // Handle integers by size and signedness. 
- private: - template <class Arg> struct EnableIfKludge { - typedef Derived type; - }; - template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed, bool IsInteger = std::numeric_limits<From>::is_integer> struct Coerce {}; - - template <class From> struct Coerce<From, 2, false, true> { typedef uint16_t To; }; - template <class From> struct Coerce<From, 4, false, true> { typedef uint32_t To; }; - template <class From> struct Coerce<From, 8, false, true> { typedef uint64_t To; }; - - template <class From> struct Coerce<From, 2, true, true> { typedef int16_t To; }; - template <class From> struct Coerce<From, 4, true, true> { typedef int32_t To; }; - template <class From> struct Coerce<From, 8, true, true> { typedef int64_t To; }; - public: - template <class From> typename EnableIfKludge<typename Coerce<From>::To>::type &operator<<(const From value) { - return CallToString(static_cast<typename Coerce<From>::To>(value)); - } - - // Character types that get copied as bytes instead of displayed as integers. - Derived &operator<<(char val) { return put(val); } - Derived &operator<<(signed char val) { return put(static_cast<char>(val)); } - Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); } - - Derived &operator<<(bool val) { return put(val + '0'); } - // enums will fall back to int but are not caught by the template. - Derived &operator<<(int val) { return CallToString(static_cast<typename Coerce<int>::To>(val)); } - - Derived &operator<<(float val) { return CallToString(val); } - Derived &operator<<(double val) { return CallToString(val); } - - // This is here to catch all the other pointer types. - Derived &operator<<(const void *value) { return CallToString(value); } - // This is here because the above line also catches const char*. 
- Derived &operator<<(const char *value) { return *this << StringPiece(value); } - Derived &operator<<(char *value) { return *this << StringPiece(value); } - - Derived &put(char val) { - char *c = C().Ensure(1); - *c = val; - C().AdvanceTo(++c); - return C(); - } - - char widen(char val) const { return val; } - - private: - // References to derived class for convenience. - Derived &C() { - return *static_cast<Derived*>(this); - } - - const Derived &C() const { - return *static_cast<const Derived*>(this); - } - - // This is separate to prevent an infinite loop if the compiler considers - // types the same (i.e. gcc std::size_t and uint64_t or uint32_t). - template <class T> Derived &CallToString(const T value) { - C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf<T>::kBytes))); - return C(); - } -}; - -} // namespace - -#endif // UTIL_FAKE_OSTREAM_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/file.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/file.cc b/ext/kenlm/util/file.cc deleted file mode 100644 index e8976bc..0000000 --- a/ext/kenlm/util/file.cc +++ /dev/null @@ -1,574 +0,0 @@ -#define _LARGEFILE64_SOURCE -#define _FILE_OFFSET_BITS 64 - -#include "util/file.hh" - -#include "util/exception.hh" - -#include <algorithm> -#include <cstdlib> -#include <cstdio> -#include <iostream> -#include <limits> -#include <sstream> - - -#include <cassert> -#include <cerrno> -#include <climits> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <stdint.h> - -#if defined(__MINGW32__) -#include <windows.h> -#include <unistd.h> -#warning "The file functions on MinGW have not been tested for file sizes above 2^31 - 1. 
Please read https://stackoverflow.com/questions/12539488/determine-64-bit-file-size-in-c-on-mingw-32-bit and fix" -#elif defined(_WIN32) || defined(_WIN64) -#include <windows.h> -#include <io.h> -#else -#include <unistd.h> -#endif - -namespace util { - -scoped_fd::~scoped_fd() { - if (fd_ != -1 && close(fd_)) { - std::cerr << "Could not close file " << fd_ << std::endl; - std::abort(); - } -} - -void scoped_FILE_closer::Close(std::FILE *file) { - if (file && std::fclose(file)) { - std::cerr << "Could not close file " << file << std::endl; - std::abort(); - } -} - -// Note that ErrnoException records errno before NameFromFD is called. -FDException::FDException(int fd) throw() : fd_(fd), name_guess_(NameFromFD(fd)) { - *this << "in " << name_guess_ << ' '; -} - -FDException::~FDException() throw() {} - -EndOfFileException::EndOfFileException() throw() { - *this << "End of file"; -} -EndOfFileException::~EndOfFileException() throw() {} - -bool InputFileIsStdin(StringPiece path) { - return path == "-" || path == "/dev/stdin"; -} - -bool OutputFileIsStdout(StringPiece path) { - return path == "-" || path == "/dev/stdout"; -} - -int OpenReadOrThrow(const char *name) { - int ret; -#if defined(_WIN32) || defined(_WIN64) - UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name); -#else - UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name); -#endif - return ret; -} - -int CreateOrThrow(const char *name) { - int ret; -#if defined(_WIN32) || defined(_WIN64) - UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR | _O_BINARY, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); -#else - UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); -#endif - return ret; -} - -uint64_t SizeFile(int fd) { -#if defined __MINGW32__ - struct stat sb; - // Does this handle 
64-bit? - int ret = fstat(fd, &sb); - if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; - return sb.st_size; -#elif defined(_WIN32) || defined(_WIN64) - __int64 ret = _filelengthi64(fd); - return (ret == -1) ? kBadSize : ret; -#else // Not windows. - -#ifdef OS_ANDROID - struct stat64 sb; - int ret = fstat64(fd, &sb); -#else - struct stat sb; - int ret = fstat(fd, &sb); -#endif - if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; - return sb.st_size; -#endif -} - -uint64_t SizeOrThrow(int fd) { - uint64_t ret = SizeFile(fd); - UTIL_THROW_IF_ARG(ret == kBadSize, FDException, (fd), "Failed to size"); - return ret; -} - -void ResizeOrThrow(int fd, uint64_t to) { -#if defined __MINGW32__ - // Does this handle 64-bit? - int ret = ftruncate -#elif defined(_WIN32) || defined(_WIN64) - errno_t ret = _chsize_s -#elif defined(OS_ANDROID) - int ret = ftruncate64 -#else - int ret = ftruncate -#endif - (fd, to); - UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes"); -} - -namespace { -std::size_t GuardLarge(std::size_t size) { - // The following operating systems have broken read/write/pread/pwrite that - // only supports up to 2^31. - // OS X man pages claim to support 64-bit, but Kareem M. Darwish had problems - // building with larger files, so APPLE is also here. -#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) || defined(__MINGW32__) - return size < INT_MAX ? 
size : INT_MAX; -#else - return size; -#endif -} -} - -#if defined(_WIN32) || defined(_WIN64) -namespace { -const std::size_t kMaxDWORD = static_cast<std::size_t>(4294967295UL); -} // namespace -#endif - -std::size_t PartialRead(int fd, void *to, std::size_t amount) { -#if defined(_WIN32) || defined(_WIN64) - DWORD ret; - HANDLE file_handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd)); - DWORD larger_size = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, amount)); - DWORD smaller_size = 28672; // Received reports that 31346 worked but higher values did not. This rounds down to the nearest multiple of 4096, the page size. - if (!ReadFile(file_handle, to, larger_size, &ret, NULL)) - { - DWORD last_error = GetLastError(); - if (last_error != ERROR_NOT_ENOUGH_MEMORY || !ReadFile(file_handle, to, smaller_size, &ret, NULL)) { - UTIL_THROW(WindowsException, "Windows error in ReadFile."); - } - } -#else - errno = 0; - ssize_t ret; - do { - ret = read(fd, to, GuardLarge(amount)); - } while (ret == -1 && errno == EINTR); - UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes"); -#endif - return static_cast<std::size_t>(ret); -} - -void ReadOrThrow(int fd, void *to_void, std::size_t amount) { - uint8_t *to = static_cast<uint8_t*>(to_void); - while (amount) { - std::size_t ret = PartialRead(fd, to, amount); - UTIL_THROW_IF(ret == 0, EndOfFileException, " in " << NameFromFD(fd) << " but there should be " << amount << " more bytes to read."); - amount -= ret; - to += ret; - } -} - -std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { - uint8_t *to = static_cast<uint8_t*>(to_void); - std::size_t remaining = amount; - while (remaining) { - std::size_t ret = PartialRead(fd, to, remaining); - if (!ret) return amount - remaining; - remaining -= ret; - to += ret; - } - return amount; -} - -void WriteOrThrow(int fd, const void *data_void, std::size_t size) { - const uint8_t *data = static_cast<const uint8_t*>(data_void); - while (size) 
{ -#if defined(_WIN32) || defined(_WIN64) - int ret; -#else - ssize_t ret; -#endif - errno = 0; - do { - ret = -#if defined(_WIN32) || defined(_WIN64) - _write -#else - write -#endif - (fd, data, GuardLarge(size)); - } while (ret == -1 && errno == EINTR); - UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes"); - data += ret; - size -= ret; - } -} - -void WriteOrThrow(FILE *to, const void *data, std::size_t size) { - if (!size) return; - UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size); -} - -void ErsatzPRead(int fd, void *to_void, std::size_t size, uint64_t off) { - uint8_t *to = static_cast<uint8_t*>(to_void); - while (size) { -#if defined(_WIN32) || defined(_WIN64) - /* BROKEN: changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() which lmplz does. */ - // size_t might be 64-bit. DWORD is always 32. - DWORD reading = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, size)); - DWORD ret; - OVERLAPPED overlapped; - memset(&overlapped, 0, sizeof(OVERLAPPED)); - overlapped.Offset = static_cast<DWORD>(off); - overlapped.OffsetHigh = static_cast<DWORD>(off >> 32); - UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), WindowsException, "ReadFile failed for offset " << off); -#else - ssize_t ret; - errno = 0; - ret = -#ifdef OS_ANDROID - pread64 -#else - pread -#endif - (fd, to, GuardLarge(size), off); - if (ret <= 0) { - if (ret == -1 && errno == EINTR) continue; - UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd)); - UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off); - } -#endif - size -= ret; - off += ret; - to += ret; - } -} - -void ErsatzPWrite(int fd, const void *from_void, std::size_t size, uint64_t off) { - const uint8_t *from = static_cast<const 
uint8_t*>(from_void); - while(size) { -#if defined(_WIN32) || defined(_WIN64) - /* Changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() */ - // size_t might be 64-bit. DWORD is always 32. - DWORD writing = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, size)); - DWORD ret; - OVERLAPPED overlapped; - memset(&overlapped, 0, sizeof(OVERLAPPED)); - overlapped.Offset = static_cast<DWORD>(off); - overlapped.OffsetHigh = static_cast<DWORD>(off >> 32); - UTIL_THROW_IF(!WriteFile((HANDLE)_get_osfhandle(fd), from, writing, &ret, &overlapped), Exception, "WriteFile failed for offset " << off); -#else - ssize_t ret; - errno = 0; - ret = -#ifdef OS_ANDROID - pwrite64 -#else - pwrite -#endif - (fd, from, GuardLarge(size), off); - if (ret <= 0) { - if (ret == -1 && errno == EINTR) continue; - UTIL_THROW_IF(ret == 0, EndOfFileException, " for writing " << size << " bytes at " << off << " from " << NameFromFD(fd)); - UTIL_THROW_ARG(FDException, (fd), "while writing " << size << " bytes at offset " << off); - } -#endif - size -= ret; - off += ret; - from += ret; - } -} - - -void FSyncOrThrow(int fd) { -// Apparently windows doesn't have fsync? -#if !defined(_WIN32) && !defined(_WIN64) - UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "while syncing"); -#endif -} - -namespace { - -// Static assert for 64-bit off_t size. -#if !defined(_WIN32) && !defined(_WIN64) && !defined(OS_ANDROID) -template <unsigned> struct CheckOffT; -template <> struct CheckOffT<8> { - struct True {}; -}; -// If there's a compiler error on the next line, then off_t isn't 64 bit. And -// that makes me a sad panda. -typedef CheckOffT<sizeof(off_t)>::True IgnoredType; -#endif - -// Can't we all just get along? -void InternalSeek(int fd, int64_t off, int whence) { - if ( -#if defined __MINGW32__ - // Does this handle 64-bit? 
- (off_t)-1 == lseek(fd, off, whence) -#elif defined(_WIN32) || defined(_WIN64) - (__int64)-1 == _lseeki64(fd, off, whence) -#elif defined(OS_ANDROID) - (off64_t)-1 == lseek64(fd, off, whence) -#else - (off_t)-1 == lseek(fd, off, whence) -#endif - ) UTIL_THROW_ARG(FDException, (fd), "while seeking to " << off << " whence " << whence); -} -} // namespace - -void SeekOrThrow(int fd, uint64_t off) { - InternalSeek(fd, off, SEEK_SET); -} - -void AdvanceOrThrow(int fd, int64_t off) { - InternalSeek(fd, off, SEEK_CUR); -} - -void SeekEnd(int fd) { - InternalSeek(fd, 0, SEEK_END); -} - -std::FILE *FDOpenOrThrow(scoped_fd &file) { - std::FILE *ret = fdopen(file.get(), "r+b"); - UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for write"); - file.release(); - return ret; -} - -std::FILE *FDOpenReadOrThrow(scoped_fd &file) { - std::FILE *ret = fdopen(file.get(), "rb"); - UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for read"); - file.release(); - return ret; -} - -// Sigh. Windows temporary file creation is full of race conditions. -#if defined(_WIN32) || defined(_WIN64) -/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright - (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. */ - -/* This has been modified from the original version to rename the function and - * set the Windows temporary flag. */ - -static const char letters[] = -"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; - -/* Generate a temporary file name based on TMPL. TMPL must match the - rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed - does not exist at the time of the call to mkstemp. TMPL is - overwritten with the result. 
*/ -int -mkstemp_and_unlink(char *tmpl) -{ - int len; - char *XXXXXX; - static unsigned long long value; - unsigned long long random_time_bits; - unsigned int count; - int fd = -1; - int save_errno = errno; - - /* A lower bound on the number of temporary files to attempt to - generate. The maximum total number of temporary file names that - can exist for a given template is 62**6. It should never be - necessary to try all these combinations. Instead if a reasonable - number of names is tried (we define reasonable as 62**3) fail to - give the system administrator the chance to remove the problems. */ -#define ATTEMPTS_MIN (62 * 62 * 62) - - /* The number of times to attempt to generate a temporary file. To - conform to POSIX, this must be no smaller than TMP_MAX. */ -#if ATTEMPTS_MIN < TMP_MAX - unsigned int attempts = TMP_MAX; -#else - unsigned int attempts = ATTEMPTS_MIN; -#endif - - len = strlen (tmpl); - if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX")) - { - errno = EINVAL; - return -1; - } - -/* This is where the Xs start. */ - XXXXXX = &tmpl[len - 6]; - - /* Get some more or less random data. */ - { - SYSTEMTIME stNow; - FILETIME ftNow; - - // get system time - GetSystemTime(&stNow); - stNow.wMilliseconds = 500; - if (!SystemTimeToFileTime(&stNow, &ftNow)) - { - errno = -1; - return -1; - } - - random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32) - | (unsigned long long)ftNow.dwLowDateTime); - } - value += random_time_bits ^ (unsigned long long)GetCurrentThreadId (); - - for (count = 0; count < attempts; value += 7777, ++count) - { - unsigned long long v = value; - - /* Fill in the random bits. 
*/ - XXXXXX[0] = letters[v % 62]; - v /= 62; - XXXXXX[1] = letters[v % 62]; - v /= 62; - XXXXXX[2] = letters[v % 62]; - v /= 62; - XXXXXX[3] = letters[v % 62]; - v /= 62; - XXXXXX[4] = letters[v % 62]; - v /= 62; - XXXXXX[5] = letters[v % 62]; - - /* Modified for windows and to unlink */ - // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE); - int flags = _O_RDWR | _O_CREAT | _O_EXCL | _O_BINARY; - flags |= _O_TEMPORARY; - fd = _open (tmpl, flags, _S_IREAD | _S_IWRITE); - if (fd >= 0) - { - errno = save_errno; - return fd; - } - else if (errno != EEXIST) - return -1; - } - - /* We got out of the loop because we ran out of combinations to try. */ - errno = EEXIST; - return -1; -} -#else -int -mkstemp_and_unlink(char *tmpl) { - int ret = mkstemp(tmpl); - if (ret != -1) { - UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting delete " << tmpl); - } - return ret; -} -#endif - -// If it's a directory, add a /. This lets users say -T /tmp without creating -// /tmpAAAAAA -void NormalizeTempPrefix(std::string &base) { - if (base.empty()) return; - if (base[base.size() - 1] == '/') return; - struct stat sb; - // It's fine for it to not exist. - if (-1 == stat(base.c_str(), &sb)) return; - if ( -#if defined(_WIN32) || defined(_WIN64) - sb.st_mode & _S_IFDIR -#else - S_ISDIR(sb.st_mode) -#endif - ) base += '/'; -} - -int MakeTemp(const StringPiece &base) { - std::string name(base.data(), base.size()); - name += "XXXXXX"; - name.push_back(0); - int ret; - UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), ErrnoException, "while making a temporary based on " << base); - return ret; -} - -std::FILE *FMakeTemp(const StringPiece &base) { - util::scoped_fd file(MakeTemp(base)); - return FDOpenOrThrow(file); -} - -int DupOrThrow(int fd) { - int ret = dup(fd); - UTIL_THROW_IF_ARG(ret == -1, FDException, (fd), "in duplicating the file descriptor"); - return ret; -} - -namespace { -// Try to name things but be willing to fail too. 
-bool TryName(int fd, std::string &out) { -#if defined(_WIN32) || defined(_WIN64) - return false; -#else - std::string name("/proc/self/fd/"); - std::ostringstream convert; - convert << fd; - name += convert.str(); - - struct stat sb; - if (-1 == lstat(name.c_str(), &sb)) - return false; - out.resize(sb.st_size + 1); - // lstat gave us a size, but I've seen it grow, possibly due to symlinks on top of symlinks. - while (true) { - ssize_t ret = readlink(name.c_str(), &out[0], out.size()); - if (-1 == ret) - return false; - if ((size_t)ret < out.size()) { - out.resize(ret); - break; - } - // Exponential growth. - out.resize(out.size() * 2); - } - // Don't use the non-file names. - if (!out.empty() && out[0] != '/') - return false; - return true; -#endif -} -} // namespace - -std::string NameFromFD(int fd) { - std::string ret; - if (TryName(fd, ret)) return ret; - switch (fd) { - case 0: return "stdin"; - case 1: return "stdout"; - case 2: return "stderr"; - } - ret = "fd "; - std::ostringstream convert; - convert << fd; - ret += convert.str(); - return ret; -} - -} // namespace util http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/file.hh ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/file.hh b/ext/kenlm/util/file.hh deleted file mode 100644 index f7cb4d6..0000000 --- a/ext/kenlm/util/file.hh +++ /dev/null @@ -1,154 +0,0 @@ -#ifndef UTIL_FILE_H -#define UTIL_FILE_H - -#include "util/exception.hh" -#include "util/scoped.hh" -#include "util/string_piece.hh" - -#include <cstddef> -#include <cstdio> -#include <string> -#include <stdint.h> - -namespace util { - -class scoped_fd { - public: - scoped_fd() : fd_(-1) {} - - explicit scoped_fd(int fd) : fd_(fd) {} - - ~scoped_fd(); - - void reset(int to = -1) { 
- scoped_fd other(fd_); - fd_ = to; - } - - int get() const { return fd_; } - - int operator*() const { return fd_; } - - int release() { - int ret = fd_; - fd_ = -1; - return ret; - } - - private: - int fd_; - - scoped_fd(const scoped_fd &); - scoped_fd &operator=(const scoped_fd &); -}; - -struct scoped_FILE_closer { - static void Close(std::FILE *file); -}; -typedef scoped<std::FILE, scoped_FILE_closer> scoped_FILE; - -/* Thrown for any operation where the fd is known. */ -class FDException : public ErrnoException { - public: - explicit FDException(int fd) throw(); - - virtual ~FDException() throw(); - - // This may no longer be valid if the exception was thrown past open. - int FD() const { return fd_; } - - // Guess from NameFromFD. - const std::string &NameGuess() const { return name_guess_; } - - private: - int fd_; - - std::string name_guess_; -}; - -// End of file reached. -class EndOfFileException : public Exception { - public: - EndOfFileException() throw(); - ~EndOfFileException() throw(); -}; - -// Open for read only. -int OpenReadOrThrow(const char *name); -// Create file if it doesn't exist, truncate if it does. Opened for write. -int CreateOrThrow(const char *name); - -/** Does the given input file path denote standard input? - * - * Returns true if, and only if, path is either "-" or "/dev/stdin". - * - * Opening standard input as a file may need some special treatment for - * portability. There's a convention that a dash ("-") in place of an input - * file path denotes standard input, but opening "/dev/stdin" may need to be - * special as well. - */ -bool InputPathIsStdin(StringPiece path); - -/** Does the given output file path denote standard output? - * - * Returns true if, and only if, path is either "-" or "/dev/stdout". - * - * Opening standard output as a file may need some special treatment for - * portability. 
There's a convention that a dash ("-") in place of an output - * file path denotes standard output, but opening "/dev/stdout" may need to be - * special as well. - */ -bool OutputPathIsStdout(StringPiece path); - -// Return value for SizeFile when it can't size properly. -const uint64_t kBadSize = (uint64_t)-1; -uint64_t SizeFile(int fd); -uint64_t SizeOrThrow(int fd); - -void ResizeOrThrow(int fd, uint64_t to); - -std::size_t PartialRead(int fd, void *to, std::size_t size); -void ReadOrThrow(int fd, void *to, std::size_t size); -std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size); - -void WriteOrThrow(int fd, const void *data_void, std::size_t size); -void WriteOrThrow(FILE *to, const void *data, std::size_t size); - -/* These call pread/pwrite in a loop. However, on Windows they call ReadFile/ - * WriteFile which changes the file pointer. So it's safe to call ErsatzPRead - * and ErsatzPWrite concurrently (or any combination thereof). But it changes - * the file pointer on windows, so it's not safe to call concurrently with - * anything that uses the implicit file pointer e.g. the Read/Write functions - * above. - */ -void ErsatzPRead(int fd, void *to, std::size_t size, uint64_t off); -void ErsatzPWrite(int fd, const void *data_void, std::size_t size, uint64_t off); - -void FSyncOrThrow(int fd); - -// Seeking -void SeekOrThrow(int fd, uint64_t off); -void AdvanceOrThrow(int fd, int64_t off); -void SeekEnd(int fd); - -std::FILE *FDOpenOrThrow(scoped_fd &file); -std::FILE *FDOpenReadOrThrow(scoped_fd &file); - -// Temporary files -// Append a / if base is a directory. -void NormalizeTempPrefix(std::string &base); -int MakeTemp(const StringPiece &prefix); -std::FILE *FMakeTemp(const StringPiece &prefix); - -// dup an fd. -int DupOrThrow(int fd); - -/* Attempt get file name from fd. This won't always work (i.e. on Windows or - * a pipe). The file might have been renamed. It's intended for diagnostics - * and logging only. 
- */ -std::string NameFromFD(int fd); - -} // namespace util - -#endif // UTIL_FILE_H http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/file_piece.cc ---------------------------------------------------------------------- diff --git a/ext/kenlm b/ext/kenlm new file mode 160000 index 0000000..56fdb5c --- /dev/null +++ b/ext/kenlm @@ -0,0 +1 @@ +Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5 diff --git a/ext/kenlm/util/file_piece.cc b/ext/kenlm/util/file_piece.cc deleted file mode 100644 index 0a4d3a9..0000000 --- a/ext/kenlm/util/file_piece.cc +++ /dev/null @@ -1,337 +0,0 @@ -#include "util/file_piece.hh" - -#include "util/double-conversion/double-conversion.h" -#include "util/exception.hh" -#include "util/file.hh" -#include "util/mmap.hh" - -#if defined(_WIN32) || defined(_WIN64) -#include <io.h> -#else -#include <unistd.h> -#endif - -#include <cassert> -#include <cerrno> -#include <cmath> -#include <cstdlib> -#include <iostream> -#include <limits> -#include <string> - -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> - -namespace util { - -ParseNumberException::ParseNumberException(StringPiece value) throw() { - *this << "Could not parse \"" << value << "\" into a "; -} - -// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). 
-const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - -FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : - file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()), - progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) { - Initialize(name, show_progress, min_buffer); -} - -namespace { -std::string NamePossiblyFind(int fd, const char *name) { - if (name) return name; - return NameFromFD(fd); -} -} // namespace - -FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : - file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), - progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) { - Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer); -} - -FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) : - total_size_(kBadSize), page_(SizePage()) { - InitializeNoRead("istream", min_buffer); - - fallback_to_read_ = true; - HugeMalloc(default_map_size_, false, data_); - position_ = data_.begin(); - position_end_ = position_; - - fell_back_.Reset(stream); -} - -FilePiece::~FilePiece() {} - -StringPiece FilePiece::ReadLine(char delim, bool strip_cr) { - std::size_t skip = 0; - while (true) { - for (const char *i = position_ + skip; i < position_end_; ++i) { - if (*i == delim) { - // End of line. 
- // Take 1 byte off the end if it's an unwanted carriage return. - const std::size_t subtract_cr = ( - (strip_cr && i > position_ && *(i - 1) == '\r') ? - 1 : 0); - StringPiece ret(position_, i - position_ - subtract_cr); - position_ = i + 1; - return ret; - } - } - if (at_end_) { - if (position_ == position_end_) { - Shift(); - } - return Consume(position_end_); - } - skip = position_end_ - position_; - Shift(); - } -} - -bool FilePiece::ReadLineOrEOF(StringPiece &to, char delim, bool strip_cr) { - try { - to = ReadLine(delim, strip_cr); - } catch (const util::EndOfFileException &e) { return false; } - return true; -} - -float FilePiece::ReadFloat() { - return ReadNumber<float>(); -} -double FilePiece::ReadDouble() { - return ReadNumber<double>(); -} -long int FilePiece::ReadLong() { - return ReadNumber<long int>(); -} -unsigned long int FilePiece::ReadULong() { - return ReadNumber<unsigned long int>(); -} - -// Factored out so that istream can call this. -void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) { - file_name_ = name; - - default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2); - position_ = NULL; - position_end_ = NULL; - mapped_offset_ = 0; - at_end_ = false; -} - -void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { - InitializeNoRead(name, min_buffer); - - if (total_size_ == kBadSize) { - // So the assertion passes. - fallback_to_read_ = false; - if (show_progress) - *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl; - TransitionToRead(); - } else { - fallback_to_read_ = false; - } - Shift(); - // gzip detect. 
- if ((position_end_ >= position_ + ReadCompressed::kMagicSize) && ReadCompressed::DetectCompressedMagic(position_)) { - if (!fallback_to_read_) { - at_end_ = false; - TransitionToRead(); - } - } -} - -namespace { - -static const double_conversion::StringToDoubleConverter kConverter( - double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK | double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES, - std::numeric_limits<double>::quiet_NaN(), - std::numeric_limits<double>::quiet_NaN(), - "inf", - "NaN"); - -StringPiece FirstToken(StringPiece str) { - const char *i; - for (i = str.data(); i != str.data() + str.size(); ++i) { - if (kSpaces[(unsigned char)*i]) break; - } - return StringPiece(str.data(), i - str.data()); -} - -const char *ParseNumber(StringPiece str, float &out) { - int count; - out = kConverter.StringToFloat(str.data(), str.size(), &count); - UTIL_THROW_IF_ARG(std::isnan(out) && str != "NaN" && str != "nan", ParseNumberException, (FirstToken(str)), "float"); - return str.data() + count; -} -const char *ParseNumber(StringPiece str, double &out) { - int count; - out = kConverter.StringToDouble(str.data(), str.size(), &count); - UTIL_THROW_IF_ARG(std::isnan(out) && str != "NaN" && str != "nan", ParseNumberException, (FirstToken(str)), "double"); - return str.data() + count; -} -const char *ParseNumber(StringPiece str, long int &out) { - char *end; - errno = 0; - out = strtol(str.data(), &end, 10); - UTIL_THROW_IF_ARG(errno || (end == str.data()), ParseNumberException, (FirstToken(str)), "long int"); - return end; -} -const char *ParseNumber(StringPiece str, unsigned long int &out) { - char *end; - errno = 0; - out = strtoul(str.data(), &end, 10); - UTIL_THROW_IF_ARG(errno || (end == str.data()), ParseNumberException, (FirstToken(str)), "unsigned long int"); - return end; -} -} // namespace - -template <class T> T FilePiece::ReadNumber() { - SkipSpaces(); - while (last_space_ < position_) { - if (UTIL_UNLIKELY(at_end_)) { - // Hallucinate 
a null off the end of the file. - std::string buffer(position_, position_end_); - T ret; - // Has to be null-terminated. - const char *begin = buffer.c_str(); - const char *end = ParseNumber(StringPiece(begin, buffer.size()), ret); - position_ += end - begin; - return ret; - } - Shift(); - } - T ret; - position_ = ParseNumber(StringPiece(position_, last_space_ - position_), ret); - return ret; -} - -const char *FilePiece::FindDelimiterOrEOF(const bool *delim) { - std::size_t skip = 0; - while (true) { - for (const char *i = position_ + skip; i < position_end_; ++i) { - if (delim[static_cast<unsigned char>(*i)]) return i; - } - if (at_end_) { - if (position_ == position_end_) Shift(); - return position_end_; - } - skip = position_end_ - position_; - Shift(); - } -} - -void FilePiece::Shift() { - if (at_end_) { - progress_.Finished(); - throw EndOfFileException(); - } - uint64_t desired_begin = position_ - data_.begin() + mapped_offset_; - - if (!fallback_to_read_) MMapShift(desired_begin); - // Notice an mmap failure might set the fallback. - if (fallback_to_read_) ReadShift(); - - for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) { - if (kSpaces[static_cast<unsigned char>(*last_space_)]) break; - } -} - -void FilePiece::MMapShift(uint64_t desired_begin) { - // Use mmap. - uint64_t ignore = desired_begin % page_; - // Duplicate request for Shift means give more data. - if (position_ == data_.begin() + ignore && position_) { - default_map_size_ *= 2; - } - // Local version so that in case of failure it doesn't overwrite the class variable. - uint64_t mapped_offset = desired_begin - ignore; - - uint64_t mapped_size; - if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) { - at_end_ = true; - mapped_size = total_size_ - mapped_offset; - } else { - mapped_size = default_map_size_; - } - - // Forcibly clear the existing mmap first. 
- data_.reset(); - try { - MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_); - } catch (const util::ErrnoException &e) { - if (desired_begin) { - SeekOrThrow(*file_, desired_begin); - } - // The mmap was scheduled to end the file, but now we're going to read it. - at_end_ = false; - TransitionToRead(); - return; - } - mapped_offset_ = mapped_offset; - position_ = data_.begin() + ignore; - position_end_ = data_.begin() + mapped_size; - - progress_.Set(desired_begin); -} - -void FilePiece::TransitionToRead() { - assert(!fallback_to_read_); - fallback_to_read_ = true; - data_.reset(); - HugeMalloc(default_map_size_, false, data_); - position_ = data_.begin(); - position_end_ = position_; - - try { - fell_back_.Reset(file_.release()); - } catch (util::Exception &e) { - e << " in file " << file_name_; - throw; - } -} - -void FilePiece::ReadShift() { - assert(fallback_to_read_); - // Bytes [data_.begin(), position_) have been consumed. - // Bytes [position_, position_end_) have been read into the buffer. - - // Start at the beginning of the buffer if there's nothing useful in it. - if (position_ == position_end_) { - mapped_offset_ += (position_end_ - data_.begin()); - position_ = data_.begin(); - position_end_ = position_; - } - - std::size_t already_read = position_end_ - data_.begin(); - - if (already_read == default_map_size_) { - if (position_ == data_.begin()) { - // Buffer too small. 
- std::size_t valid_length = position_end_ - position_; - default_map_size_ *= 2; - HugeRealloc(default_map_size_, false, data_); - position_ = data_.begin(); - position_end_ = position_ + valid_length; - } else { - std::size_t moving = position_end_ - position_; - memmove(data_.get(), position_, moving); - position_ = data_.begin(); - position_end_ = position_ + moving; - already_read = moving; - } - } - - std::size_t read_return = fell_back_.Read(static_cast<uint8_t*>(data_.get()) + already_read, default_map_size_ - already_read); - progress_.Set(fell_back_.RawAmount()); - - if (read_return == 0) { - at_end_ = true; - } - position_end_ += read_return; -} - -} // namespace util
