Repository: impala Updated Branches: refs/heads/master 1dd4403fd -> 0a901fcb2
Revert "IMPALA-7412: width_bucket() function overflows too easily" IMPALA-7459 has been traced to this commit. This reverts commit a8b32dbafa1f015c8316f205e32bbdce349f2474. Change-Id: I61945c9199818ec98d7fb187e5931c76f7082c1a Reviewed-on: http://gerrit.cloudera.org:8080/11265 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/83bb4aed Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/83bb4aed Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/83bb4aed Branch: refs/heads/master Commit: 83bb4aed71db140ab13b2d727e7e09bb2077ccc6 Parents: 1dd4403 Author: Joe McDonnell <[email protected]> Authored: Fri Aug 17 16:19:14 2018 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Sat Aug 18 03:05:46 2018 +0000 ---------------------------------------------------------------------- be/src/exprs/expr-test.cc | 29 ++++-- be/src/exprs/math-functions-ir.cc | 156 ++++++++++++++++++----------- be/src/util/bit-util.h | 53 +--------- tests/query_test/test_decimal_fuzz.py | 63 +----------- 4 files changed, 125 insertions(+), 176 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/83bb4aed/be/src/exprs/expr-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index 8cc0e9d..d0e3ff3 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -5348,11 +5348,16 @@ TEST_F(ExprTest, MathFunctions) { // Test when min > max TestErrorString("width_bucket(22, 50, 5, 4)", "UDF ERROR: Lower bound cannot be greater than or equal to the upper bound\n"); - // IMPALA-7412: Test max - min should not overflow anymore - TestValue("width_bucket(11, -9, 99999999999999999999999999999999999999, 4000)", - TYPE_BIGINT, 1); - TestValue("width_bucket(1, -99999999999999999999999999999999999999, 9, 40)", - TYPE_BIGINT, 40); + // Test max - min will overflow during width_bucket evaluation + TestErrorString("width_bucket(11, -9, 99999999999999999999999999999999999999, 4000)", + "UDF ERROR: Overflow while evaluating the difference between min_range: -9 and " + "max_range: 99999999999999999999999999999999999999\n"); + // If expr - min overflows during width_bucket evaluation, max - min will also + // overflow. Since we evaluate max - min before evaluating expr - min, we will never + // end up overflowing expr - min. + TestErrorString("width_bucket(1, -99999999999999999999999999999999999999, 9, 40)", + "UDF ERROR: Overflow while evaluating the difference between min_range: " + "-99999999999999999999999999999999999999 and max_range: 9\n"); // Test when dist_from_min * buckets cannot be stored in a int128_t (overflows) // and needs to be stored in a int256_t TestValue("width_bucket(8000000000000000000000000000000000000," @@ -5375,12 +5380,16 @@ TEST_F(ExprTest, MathFunctions) { // max and min value that would require int256_t for evalation TestValue("width_bucket(10000000000000000000000000000000000000, 1," "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2); - // IMPALA-7412: These should not overflow anymore - TestValue("width_bucket(cast(-0.10 as decimal(37,30)), cast(-0.36028797018963968 " + // IMPALA-7242/IMPALA-7243: check for overflow when converting IntVal to DecimalValue + TestErrorString("width_bucket(cast(-0.10 as decimal(37,30)), cast(-0.36028797018963968 " "as decimal(25,25)), cast(9151517.4969773200562764155787276999832" - "as decimal(38,31)), 1328180220)", TYPE_BIGINT, 38); - TestValue("width_bucket(cast(9 as decimal(10,7)), cast(-60000 as decimal(11,6)), " - "cast(10 as decimal(7,5)), 249895273);", TYPE_BIGINT, 249891109); + "as decimal(38,31)), 1328180220)", + "UDF ERROR: Overflow while representing the num_buckets:1328180220 as a " + "DecimalVal\n"); + TestErrorString("width_bucket(cast(9 as decimal(10,7)), cast(-60000 as decimal(11,6)), " + "cast(10 as decimal(7,5)), 249895273);", + "UDF ERROR: Overflow while representing the num_buckets:249895273 as a " + "DecimalVal\n"); // Run twice to test deterministic behavior. for (uint32_t seed : {0, 1234}) { stringstream rand, random; http://git-wip-us.apache.org/repos/asf/impala/blob/83bb4aed/be/src/exprs/math-functions-ir.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/math-functions-ir.cc b/be/src/exprs/math-functions-ir.cc index 7103a10..28c9e1e 100644 --- a/be/src/exprs/math-functions-ir.cc +++ b/be/src/exprs/math-functions-ir.cc @@ -457,32 +457,57 @@ DoubleVal MathFunctions::FmodDouble(FunctionContext* ctx, const DoubleVal& a, // The bucket_number is evaluated using the following formula: // -// bucket_number = dist_from_min * num_buckets / range_size + 1 +// bucket_number = dist_from_min * num_buckets / range_size // where - // dist_from_min = expr - min_range // range_size = max_range - min_range -// num_buckets = number of buckets +// buckets = number of buckets // -// Since expr, min_range, and max_range are all decimals with the same -// byte size, precision, and scale we can interpret them as plain integers -// and still calculate the proper bucket. +// The results of the above subtractions are stored in Decimal16Value to avoid an overflow +// in the following cases: +// case 1: +// T1 is decimal8Value +// When evaluating this particular expression +// dist_from_min = expr - min_range +// If expr is a max positive value which can be represented in decimal8Value and +// min_range < 0 the resulting dist_from_min can be represented in decimal16Val +// without overflowing +// case 2: +// T1 is decimal16Value +// Subtracting a negative min_range from expr can result in an overflow in which +// case the function errors out. There is no decimal32Val to handle this. So +// storing it in decimal16Value. +// case 3: +// T1 is decimal4Value +// We can store the results in a decimal8Value. But this change hard codes to +// store the result in decimal16Val for now to be compatible with the other +// decimal*Vals // -// There are some possibilities of overflowing during the calculation: -// range_size = max_range - min_range -// dist_from_min = expr - min_range -// dist_from_min * num_buckets +// The result of this multiplication dist_from_min * buckets is stored as a int256_t +// if storing it in a int128_t would overflow. // -// For all the above cases we use a bigger integer type provided by the -// BitUtil::DoubleWidth<> metafunction. +// To perform the division, range_size is scaled up. The scale and precision of the +// numerator and denominator are adjusted to be the same. This avoids the need to compute +// the resulting scale and precision. template <class T1> BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx, const T1& expr, const T1& min_range, const T1& max_range, const IntVal& num_buckets) { - auto expr_val = expr.value(); - using ActualType = decltype(expr_val); - auto min_range_val = min_range.value(); - auto max_range_val = max_range.value(); - auto num_buckets_val = static_cast<ActualType>(num_buckets.val); + // FE casts expr, min_range and max_range to be of the scale and precision + int input_scale = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SCALE, 1); + int input_precision = ctx->impl()->GetConstFnAttr( + FunctionContextImpl::ARG_TYPE_PRECISION, 1); + + bool overflow = false; + Decimal16Value range_size = max_range.template Subtract<int128_t>(input_scale, + min_range, input_scale, input_precision, input_scale, false, &overflow); + if (UNLIKELY(overflow)) { + ostringstream error_msg; + error_msg << "Overflow while evaluating the difference between min_range: " << + min_range.value() << " and max_range: " << max_range.value(); + ctx->SetError(error_msg.str().c_str()); + return BigIntVal::null(); + } if (UNLIKELY(num_buckets.val <= 0)) { ostringstream error_msg; @@ -491,58 +516,73 @@ BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx, return BigIntVal::null(); } - if (UNLIKELY(min_range_val >= max_range_val)) { + if (UNLIKELY(min_range >= max_range)) { ctx->SetError("Lower bound cannot be greater than or equal to the upper bound"); return BigIntVal::null(); } - if (expr_val < min_range_val) return 0; - if (expr_val >= max_range_val) { - return BigIntVal(static_cast<int64_t>(num_buckets.val) + 1); + if (UNLIKELY(expr < min_range)) return 0; + + if (UNLIKELY(expr >= max_range)) { + BigIntVal result; + result.val = num_buckets.val; + ++result.val; + return result; } - bool bigger_type_needed = false; - // It is likely that this if stmt can be evaluated during codegen because - // 'max_range' and 'min_range' are almost certainly constant expressions: - if (max_range_val >= 0 && min_range_val < 0) { - if (static_cast<UnsignedType<ActualType>>(max_range_val) + - static_cast<UnsignedType<ActualType>>(abs(min_range_val)) >= - static_cast<UnsignedType<ActualType>>(BitUtil::Max<ActualType>())) { - bigger_type_needed = true; - } + Decimal16Value dist_from_min = expr.template Subtract<int128_t>(input_scale, + min_range, input_scale, input_precision, input_scale, false, &overflow); + DCHECK_EQ(overflow, false); + + Decimal16Value buckets = Decimal16Value::FromInt(input_precision, input_scale, + num_buckets.val, &overflow); + + if (UNLIKELY(overflow)) { + stringstream error_msg; + error_msg << "Overflow while representing the num_buckets:" << num_buckets.val + << " as a DecimalVal"; + ctx->SetError(error_msg.str().c_str()); + return BigIntVal::null(); + } + bool needs_int256 = false; + // Check if dist_from_min * buckets would overflow and if there is a need to + // store the intermediate results in int256_t to avoid an overflows + // Check if scaling up range size overflows and if there is a need to store the + // intermediate results in int256_t to avoid the overflow + if (UNLIKELY(BitUtil::CountLeadingZeros(abs(buckets.value())) + + BitUtil::CountLeadingZeros(abs(dist_from_min.value())) <= 128 || + BitUtil::CountLeadingZeros(range_size.value()) + + detail::MinLeadingZerosAfterScaling(BitUtil::CountLeadingZeros( + range_size.value()), input_scale) <= 128)) { + needs_int256 = true; } - auto MultiplicationOverflows = [](ActualType lhs, ActualType rhs) { - DCHECK(lhs > 0 && rhs > 0); - using ActualType = decltype(lhs); - return BitUtil::CountLeadingZeros(lhs) + BitUtil::CountLeadingZeros(rhs) <= - BitUtil::UnsignedWidth<ActualType>() + 1; - }; - - // It is likely that this can be evaluated during codegen: - bool multiplication_can_overflow = bigger_type_needed || MultiplicationOverflows( - max_range_val - min_range_val, num_buckets_val); - // 'expr_val' is not likely to be a constant expression, so this almost certainly - // needs runtime evaluation if bigger_type_needed is false: - bigger_type_needed = multiplication_can_overflow && MultiplicationOverflows( - expr_val - min_range_val, num_buckets_val); - - auto BucketFunc = [](auto element, auto min_rng, auto max_rng, auto buckets) { - auto range_size = max_rng - min_rng; - auto dist_from_min = element - min_rng; - auto ret = dist_from_min * buckets / range_size; - return BigIntVal(static_cast<int64_t>(ret) + 1); - }; - - if (bigger_type_needed) { - using BiggerType = typename DoubleWidth<ActualType>::type; - - return BucketFunc(static_cast<BiggerType>(expr_val), - static_cast<BiggerType>(min_range_val), static_cast<BiggerType>(max_range_val), - static_cast<BiggerType>(num_buckets.val)); + int128_t result; + if (needs_int256) { + // resulting scale should be 2 * input_scale as per multiplication rules + int256_t x = ConvertToInt256(buckets.value()) * ConvertToInt256( + dist_from_min.value()); + + // Since "range_size" and "x" have different scales, the divison would require + // evaluating the resulting scale. To avoid this we scale up the denominator to + // match the scale of the numerator. + int256_t y = DecimalUtil::MultiplyByScale<int256_t>(ConvertToInt256( + range_size.value()), input_scale, false); + result = ConvertToInt128(x / y, DecimalUtil::MAX_UNSCALED_DECIMAL16, + &overflow); + DCHECK_EQ(overflow, false); } else { - return BucketFunc(expr_val, min_range_val, max_range_val, num_buckets.val); + // resulting scale should be 2 * input_scale as per multiplication rules + int128_t x = buckets.value() * dist_from_min.value(); + + // Since "range_size" and "x" have different scales, the divison would require + // evaluating the resulting scale. To avoid this we scale up the denominator to + // match the scale of the numerator. + int128_t y = DecimalUtil::MultiplyByScale<int128_t>(range_size.value(), + input_scale, false); + result = x / y; // NOLINT: clang-tidy thinks y may equal zero here. } + return (BigIntVal(abs(result) + 1)); } BigIntVal MathFunctions::WidthBucket(FunctionContext* ctx, const DecimalVal& expr, http://git-wip-us.apache.org/repos/asf/impala/blob/83bb4aed/be/src/util/bit-util.h ---------------------------------------------------------------------- diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h index d07dd9d..8a65509 100644 --- a/be/src/util/bit-util.h +++ b/be/src/util/bit-util.h @@ -28,7 +28,8 @@ #include <climits> #include <limits> #include <typeinfo> -#include <type_traits> + +#include <boost/type_traits/make_unsigned.hpp> #include "common/compiler-util.h" #include "gutil/bits.h" @@ -38,40 +39,7 @@ namespace impala { -/// Nested 'type' corresponds to the unsigned version of T. -template <typename T> -struct MakeUnsigned { - using type = std::make_unsigned_t<T>; -}; - -template <> -struct MakeUnsigned<int128_t> { - using type = __uint128_t; -}; - -template <typename T> -using UnsignedType = typename MakeUnsigned<T>::type; - -// Doubles the width of integer types (e.g. int32_t -> int64_t). -// Currently only works with a few signed types. -// Feel free to extend it to other types as well. -template <typename T> -struct DoubleWidth {}; - -template <> -struct DoubleWidth<int32_t> { - using type = int64_t; -}; - -template <> -struct DoubleWidth<int64_t> { - using type = int128_t; -}; - -template <> -struct DoubleWidth<int128_t> { - using type = int256_t; -}; +using boost::make_unsigned; /// Utility class to do standard bit tricks /// TODO: is this in boost or something else like that? @@ -91,17 +59,6 @@ class BitUtil { std::is_same<CVR_REMOVED, __int128>::value ? 127 : -1; } - /// Returns the max value that can be represented in T. - template<typename T, typename CVR_REMOVED = typename std::decay<T>::type, - typename std::enable_if<std::is_integral<CVR_REMOVED> {}|| - std::is_same<CVR_REMOVED, __int128> {}, int>::type = 0> - constexpr static inline CVR_REMOVED Max() { - return std::is_integral<CVR_REMOVED>::value ? - std::numeric_limits<CVR_REMOVED>::max() : - std::is_same<CVR_REMOVED, __int128>::value ? - static_cast<UnsignedType<CVR_REMOVED>>(-1) / 2 : -1; - } - /// Return an integer signifying the sign of the value, returning +1 for /// positive integers (and zero), -1 for negative integers. /// The extra shift is to silence GCC warnings about full width shift on @@ -211,7 +168,7 @@ class BitUtil { template<typename T> static inline int PopcountSigned(T v) { // Converting to same-width unsigned then extending preserves the bit pattern. - return BitUtil::Popcount(static_cast<UnsignedType<T>>(v)); + return BitUtil::Popcount(static_cast<typename make_unsigned<T>::type>(v)); } /// Returns the 'num_bits' least-significant bits of 'v'. @@ -292,7 +249,7 @@ class BitUtil { template <typename T> constexpr static T ShiftRightLogical(T v, int shift) { // Conversion to unsigned ensures most significant bits always filled with 0's - return static_cast<UnsignedType<T>>(v) >> shift; + return static_cast<typename make_unsigned<T>::type>(v) >> shift; } /// Get an specific bit of a numeric type http://git-wip-us.apache.org/repos/asf/impala/blob/83bb4aed/tests/query_test/test_decimal_fuzz.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_decimal_fuzz.py b/tests/query_test/test_decimal_fuzz.py index 1433ec3..a129e33 100644 --- a/tests/query_test/test_decimal_fuzz.py +++ b/tests/query_test/test_decimal_fuzz.py @@ -30,9 +30,6 @@ from tests.common.test_vector import ImpalaTestDimension, ImpalaTestMatrix class TestDecimalFuzz(ImpalaTestSuite): - # Impala's max precision for decimals is 38, so we should have the same in the tests - decimal.getcontext().prec = 38 - @classmethod def get_workload(cls): return 'functional-query' @@ -203,7 +200,7 @@ class TestDecimalFuzz(ImpalaTestSuite): return True return False - def execute_one_decimal_op(self): + def execute_one(self): '''Executes a single query and compares the result to a result that we computed in Python.''' op = random.choice(['+', '-', '*', '/', '%']) @@ -246,60 +243,6 @@ class TestDecimalFuzz(ImpalaTestSuite): expected_result = None assert self.result_equals(expected_result, result) - def test_decimal_ops(self, vector): - for _ in xrange(self.iterations): - self.execute_one_decimal_op() - - def width_bucket(self, val, min_range, max_range, num_buckets): - # Multiplying the values by 10**40 guarantees that the numbers can be converted - # to int without losing information. - val_int = int(decimal.Decimal(val) * 10**40) - min_range_int = int(decimal.Decimal(min_range) * 10**40) - max_range_int = int(decimal.Decimal(max_range) * 10**40) - - if min_range_int >= max_range_int: - return None - if val_int < min_range_int: - return 0 - if val_int > max_range_int: - return num_buckets + 1 - - range_size = max_range_int - min_range_int - dist_from_min = val_int - min_range_int - return (num_buckets * dist_from_min) / range_size + 1 - - def execute_one_width_bucket(self): - val, val_prec, val_scale = self.get_decimal() - min_range, min_range_prec, min_range_scale = self.get_decimal() - max_range, max_range_prec, max_range_scale = self.get_decimal() - num_buckets = random.randint(1, 2147483647) - - query = ('select width_bucket(' - 'cast({val} as decimal({val_prec},{val_scale})), ' - 'cast({min_range} as decimal({min_range_prec},{min_range_scale})), ' - 'cast({max_range} as decimal({max_range_prec},{max_range_scale})), ' - '{num_buckets})') - - query = query.format(val=val, val_prec=val_prec, val_scale=val_scale, - min_range=min_range, min_range_prec=min_range_prec, - min_range_scale=min_range_scale, - max_range=max_range, max_range_prec=max_range_prec, - max_range_scale=max_range_scale, - num_buckets=num_buckets) - - expected_result = self.width_bucket(val, min_range, max_range, num_buckets) - if not expected_result: - return - - try: - result = self.execute_scalar(query, query_options={'decimal_v2': 'true'}) - assert int(result) == expected_result - except ImpalaBeeswaxException as e: - if "You need to wrap the arguments in a CAST" not in str(e): - # Sometimes the decimal inputs are incompatible with each other, so it's ok - # to ignore this error. - raise e - - def test_width_bucket(self, vector): + def test_fuzz(self, vector): for _ in xrange(self.iterations): - self.execute_one_width_bucket() + self.execute_one()
