[1/7] impala git commit: Revert "IMPALA-7412: width_bucket() function overflows too easily"

joemcdonnell Sat, 18 Aug 2018 13:38:34 -0700

Repository: impala
Updated Branches:
  refs/heads/master 1dd4403fd -> 0a901fcb2



Revert "IMPALA-7412: width_bucket() function overflows too easily"

IMPALA-7459 has been traced to this commit.

This reverts commit a8b32dbafa1f015c8316f205e32bbdce349f2474.

Change-Id: I61945c9199818ec98d7fb187e5931c76f7082c1a
Reviewed-on: http://gerrit.cloudera.org:8080/11265
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/83bb4aed
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/83bb4aed
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/83bb4aed

Branch: refs/heads/master
Commit: 83bb4aed71db140ab13b2d727e7e09bb2077ccc6
Parents: 1dd4403
Author: Joe McDonnell <[email protected]>
Authored: Fri Aug 17 16:19:14 2018 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Sat Aug 18 03:05:46 2018 +0000

----------------------------------------------------------------------
 be/src/exprs/expr-test.cc             |  29 ++++--
 be/src/exprs/math-functions-ir.cc     | 156 ++++++++++++++++++-----------
 be/src/util/bit-util.h                |  53 +---------
 tests/query_test/test_decimal_fuzz.py |  63 +-----------
 4 files changed, 125 insertions(+), 176 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/83bb4aed/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 8cc0e9d..d0e3ff3 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -5348,11 +5348,16 @@ TEST_F(ExprTest, MathFunctions) {
   // Test when min > max
   TestErrorString("width_bucket(22, 50, 5, 4)",
       "UDF ERROR: Lower bound cannot be greater than or equal to the upper 
bound\n");
-  // IMPALA-7412: Test max - min should not overflow anymore
-  TestValue("width_bucket(11, -9, 99999999999999999999999999999999999999, 
4000)",
-      TYPE_BIGINT, 1);
-  TestValue("width_bucket(1, -99999999999999999999999999999999999999, 9, 40)",
-      TYPE_BIGINT, 40);
+  // Test max - min will overflow during width_bucket evaluation
+  TestErrorString("width_bucket(11, -9, 
99999999999999999999999999999999999999, 4000)",
+      "UDF ERROR: Overflow while evaluating the difference between min_range: 
-9 and "
+      "max_range: 99999999999999999999999999999999999999\n");
+  // If expr - min overflows during width_bucket evaluation, max - min will 
also
+  // overflow. Since we evaluate max - min before evaluating expr - min, we 
will never
+  // end up overflowing expr - min.
+  TestErrorString("width_bucket(1, -99999999999999999999999999999999999999, 9, 
40)",
+      "UDF ERROR: Overflow while evaluating the difference between min_range: "
+      "-99999999999999999999999999999999999999 and max_range: 9\n");
   // Test when dist_from_min * buckets cannot be stored in a int128_t 
(overflows)
   // and needs to be stored in a int256_t
   TestValue("width_bucket(8000000000000000000000000000000000000,"
@@ -5375,12 +5380,16 @@ TEST_F(ExprTest, MathFunctions) {
   // max and min value that would require int256_t for evalation
   TestValue("width_bucket(10000000000000000000000000000000000000, 1,"
             "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2);
-  // IMPALA-7412: These should not overflow anymore
-  TestValue("width_bucket(cast(-0.10 as decimal(37,30)), 
cast(-0.36028797018963968 "
+  // IMPALA-7242/IMPALA-7243: check for overflow when converting IntVal to 
DecimalValue
+  TestErrorString("width_bucket(cast(-0.10 as decimal(37,30)), 
cast(-0.36028797018963968 "
       "as decimal(25,25)), cast(9151517.4969773200562764155787276999832"
-      "as decimal(38,31)), 1328180220)", TYPE_BIGINT, 38);
-  TestValue("width_bucket(cast(9 as decimal(10,7)), cast(-60000 as 
decimal(11,6)), "
-      "cast(10 as decimal(7,5)), 249895273);", TYPE_BIGINT, 249891109);
+      "as decimal(38,31)), 1328180220)",
+      "UDF ERROR: Overflow while representing the num_buckets:1328180220 as a "
+      "DecimalVal\n");
+  TestErrorString("width_bucket(cast(9 as decimal(10,7)), cast(-60000 as 
decimal(11,6)), "
+      "cast(10 as decimal(7,5)), 249895273);",
+      "UDF ERROR: Overflow while representing the num_buckets:249895273 as a "
+      "DecimalVal\n");
   // Run twice to test deterministic behavior.
   for (uint32_t seed : {0, 1234}) {
     stringstream rand, random;

http://git-wip-us.apache.org/repos/asf/impala/blob/83bb4aed/be/src/exprs/math-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/math-functions-ir.cc 
b/be/src/exprs/math-functions-ir.cc
index 7103a10..28c9e1e 100644
--- a/be/src/exprs/math-functions-ir.cc
+++ b/be/src/exprs/math-functions-ir.cc
@@ -457,32 +457,57 @@ DoubleVal MathFunctions::FmodDouble(FunctionContext* ctx, 
const DoubleVal& a,
 
 // The bucket_number is evaluated using the following formula:
 //
-//   bucket_number = dist_from_min * num_buckets / range_size + 1
+//   bucket_number = dist_from_min * num_buckets / range_size
 //      where -
 //        dist_from_min = expr - min_range
 //        range_size = max_range - min_range
-//        num_buckets = number of buckets
+//        buckets = number of buckets
 //
-// Since expr, min_range, and max_range are all decimals with the same
-// byte size, precision, and scale we can interpret them as plain integers
-// and still calculate the proper bucket.
+// The results of the above subtractions are stored in Decimal16Value to avoid 
an overflow
+// in the following cases:
+//   case 1:
+//      T1 is decimal8Value
+//         When evaluating this particular expression
+//            dist_from_min = expr - min_range
+//         If expr is a max positive value which can be represented in 
decimal8Value and
+//         min_range < 0 the resulting dist_from_min can be represented in 
decimal16Val
+//         without overflowing
+//   case 2:
+//      T1 is decimal16Value
+//         Subtracting a negative min_range from expr can result in an 
overflow in which
+//         case the function errors out. There is no decimal32Val to handle 
this. So
+//         storing it in decimal16Value.
+//   case 3:
+//      T1 is decimal4Value
+//         We can store the results in a decimal8Value. But this change hard 
codes to
+//         store the result in decimal16Val for now to be compatible with the 
other
+//         decimal*Vals
 //
-// There are some possibilities of overflowing during the calculation:
-// range_size = max_range - min_range
-// dist_from_min = expr - min_range
-// dist_from_min * num_buckets
+// The result of this multiplication dist_from_min * buckets is stored as a 
int256_t
+// if storing it in a int128_t would overflow.
 //
-// For all the above cases we use a bigger integer type provided by the
-// BitUtil::DoubleWidth<> metafunction.
+// To perform the division, range_size is scaled up. The scale and precision 
of the
+// numerator and denominator are adjusted to be the same. This avoids the need 
to compute
+// the resulting scale and precision.
 template <class  T1>
 BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx,
     const T1& expr, const T1& min_range,
     const T1& max_range, const IntVal& num_buckets) {
-  auto expr_val = expr.value();
-  using ActualType = decltype(expr_val);
-  auto min_range_val = min_range.value();
-  auto max_range_val = max_range.value();
-  auto num_buckets_val = static_cast<ActualType>(num_buckets.val);
+  // FE casts expr, min_range and max_range to be of the scale and precision
+  int input_scale = 
ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SCALE, 1);
+  int input_precision = ctx->impl()->GetConstFnAttr(
+      FunctionContextImpl::ARG_TYPE_PRECISION, 1);
+
+  bool overflow = false;
+  Decimal16Value range_size = max_range.template 
Subtract<int128_t>(input_scale,
+      min_range, input_scale, input_precision, input_scale, false, &overflow);
+  if (UNLIKELY(overflow)) {
+    ostringstream error_msg;
+    error_msg << "Overflow while evaluating the difference between min_range: 
" <<
+        min_range.value() << " and max_range: " << max_range.value();
+    ctx->SetError(error_msg.str().c_str());
+    return BigIntVal::null();
+  }
 
   if (UNLIKELY(num_buckets.val <= 0)) {
     ostringstream error_msg;
@@ -491,58 +516,73 @@ BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* 
ctx,
     return BigIntVal::null();
   }
 
-  if (UNLIKELY(min_range_val >= max_range_val)) {
+  if (UNLIKELY(min_range >= max_range)) {
     ctx->SetError("Lower bound cannot be greater than or equal to the upper 
bound");
     return BigIntVal::null();
   }
 
-  if (expr_val < min_range_val) return 0;
-  if (expr_val >= max_range_val) {
-    return BigIntVal(static_cast<int64_t>(num_buckets.val) + 1);
+  if (UNLIKELY(expr < min_range)) return 0;
+
+  if (UNLIKELY(expr >= max_range)) {
+    BigIntVal result;
+    result.val = num_buckets.val;
+    ++result.val;
+    return result;
   }
 
-  bool bigger_type_needed = false;
-  // It is likely that this if stmt can be evaluated during codegen because
-  // 'max_range' and 'min_range' are almost certainly constant expressions:
-  if (max_range_val >= 0 && min_range_val < 0) {
-    if (static_cast<UnsignedType<ActualType>>(max_range_val) +
-        static_cast<UnsignedType<ActualType>>(abs(min_range_val)) >=
-        static_cast<UnsignedType<ActualType>>(BitUtil::Max<ActualType>())) {
-      bigger_type_needed = true;
-    }
+  Decimal16Value dist_from_min = expr.template Subtract<int128_t>(input_scale,
+      min_range, input_scale, input_precision, input_scale, false, &overflow);
+  DCHECK_EQ(overflow, false);
+
+  Decimal16Value buckets = Decimal16Value::FromInt(input_precision, 
input_scale,
+      num_buckets.val, &overflow);
+
+  if (UNLIKELY(overflow)) {
+    stringstream error_msg;
+    error_msg << "Overflow while representing the num_buckets:" << 
num_buckets.val
+        << " as a DecimalVal";
+    ctx->SetError(error_msg.str().c_str());
+    return BigIntVal::null();
+  }
+  bool needs_int256 = false;
+  // Check if dist_from_min * buckets would overflow and if there is a need to
+  // store the intermediate results in int256_t to avoid an overflows
+  // Check if scaling up range size overflows and if there is a need to store 
the
+  // intermediate results in int256_t to avoid the overflow
+  if (UNLIKELY(BitUtil::CountLeadingZeros(abs(buckets.value())) +
+      BitUtil::CountLeadingZeros(abs(dist_from_min.value())) <= 128 ||
+      BitUtil::CountLeadingZeros(range_size.value()) +
+      detail::MinLeadingZerosAfterScaling(BitUtil::CountLeadingZeros(
+      range_size.value()), input_scale) <= 128)) {
+    needs_int256 = true;
   }
 
-  auto MultiplicationOverflows = [](ActualType lhs, ActualType rhs) {
-    DCHECK(lhs > 0 && rhs > 0);
-    using ActualType = decltype(lhs);
-    return BitUtil::CountLeadingZeros(lhs) + BitUtil::CountLeadingZeros(rhs) <=
-        BitUtil::UnsignedWidth<ActualType>() + 1;
-  };
-
-  // It is likely that this can be evaluated during codegen:
-  bool multiplication_can_overflow = bigger_type_needed ||  
MultiplicationOverflows(
-      max_range_val - min_range_val, num_buckets_val);
-  // 'expr_val' is not likely to be a constant expression, so this almost 
certainly
-  // needs runtime evaluation if bigger_type_needed is false:
-  bigger_type_needed = multiplication_can_overflow &&  MultiplicationOverflows(
-      expr_val - min_range_val, num_buckets_val);
-
-  auto BucketFunc = [](auto element, auto min_rng, auto max_rng, auto buckets) 
{
-    auto range_size = max_rng - min_rng;
-    auto dist_from_min = element - min_rng;
-    auto ret = dist_from_min * buckets / range_size;
-    return BigIntVal(static_cast<int64_t>(ret) + 1);
-  };
-
-  if (bigger_type_needed) {
-    using BiggerType = typename DoubleWidth<ActualType>::type;
-
-    return BucketFunc(static_cast<BiggerType>(expr_val),
-        static_cast<BiggerType>(min_range_val), 
static_cast<BiggerType>(max_range_val),
-        static_cast<BiggerType>(num_buckets.val));
+  int128_t result;
+  if (needs_int256) {
+    // resulting scale should be 2 * input_scale as per multiplication rules
+    int256_t x = ConvertToInt256(buckets.value()) * ConvertToInt256(
+        dist_from_min.value());
+
+    // Since "range_size" and "x" have different scales, the divison would 
require
+    // evaluating the resulting scale. To avoid this we scale up the 
denominator to
+    // match the scale of the numerator.
+    int256_t y = DecimalUtil::MultiplyByScale<int256_t>(ConvertToInt256(
+        range_size.value()), input_scale, false);
+    result = ConvertToInt128(x / y, DecimalUtil::MAX_UNSCALED_DECIMAL16,
+        &overflow);
+    DCHECK_EQ(overflow, false);
   } else {
-    return BucketFunc(expr_val, min_range_val, max_range_val, num_buckets.val);
+    // resulting scale should be 2 * input_scale as per multiplication rules
+    int128_t x = buckets.value() * dist_from_min.value();
+
+    // Since "range_size" and "x" have different scales, the divison would 
require
+    // evaluating the resulting scale. To avoid this we scale up the 
denominator to
+    // match the scale of the numerator.
+    int128_t y = DecimalUtil::MultiplyByScale<int128_t>(range_size.value(),
+        input_scale, false);
+    result = x / y; // NOLINT: clang-tidy thinks y may equal zero here.
   }
+  return (BigIntVal(abs(result) + 1));
 }
 
 BigIntVal MathFunctions::WidthBucket(FunctionContext* ctx, const DecimalVal& 
expr,

http://git-wip-us.apache.org/repos/asf/impala/blob/83bb4aed/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index d07dd9d..8a65509 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -28,7 +28,8 @@
 #include <climits>
 #include <limits>
 #include <typeinfo>
-#include <type_traits>
+
+#include <boost/type_traits/make_unsigned.hpp>
 
 #include "common/compiler-util.h"
 #include "gutil/bits.h"
@@ -38,40 +39,7 @@
 
 namespace impala {
 
-/// Nested 'type' corresponds to the unsigned version of T.
-template <typename T>
-struct MakeUnsigned {
-  using type = std::make_unsigned_t<T>;
-};
-
-template <>
-struct MakeUnsigned<int128_t> {
-  using type = __uint128_t;
-};
-
-template <typename T>
-using UnsignedType = typename MakeUnsigned<T>::type;
-
-// Doubles the width of integer types (e.g. int32_t -> int64_t).
-// Currently only works with a few signed types.
-// Feel free to extend it to other types as well.
-template <typename T>
-struct DoubleWidth {};
-
-template <>
-struct DoubleWidth<int32_t> {
-  using type = int64_t;
-};
-
-template <>
-struct DoubleWidth<int64_t> {
-  using type = int128_t;
-};
-
-template <>
-struct DoubleWidth<int128_t> {
-  using type = int256_t;
-};
+using boost::make_unsigned;
 
 /// Utility class to do standard bit tricks
 /// TODO: is this in boost or something else like that?
@@ -91,17 +59,6 @@ class BitUtil {
         std::is_same<CVR_REMOVED, __int128>::value ? 127 : -1;
   }
 
-  /// Returns the max value that can be represented in T.
-  template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
-      typename std::enable_if<std::is_integral<CVR_REMOVED> {}||
-                              std::is_same<CVR_REMOVED, __int128> {}, 
int>::type = 0>
-  constexpr static inline CVR_REMOVED Max() {
-    return std::is_integral<CVR_REMOVED>::value ?
-        std::numeric_limits<CVR_REMOVED>::max() :
-        std::is_same<CVR_REMOVED, __int128>::value ?
-            static_cast<UnsignedType<CVR_REMOVED>>(-1) / 2 : -1;
-  }
-
   /// Return an integer signifying the sign of the value, returning +1 for
   /// positive integers (and zero), -1 for negative integers.
   /// The extra shift is to silence GCC warnings about full width shift on
@@ -211,7 +168,7 @@ class BitUtil {
   template<typename T>
   static inline int PopcountSigned(T v) {
     // Converting to same-width unsigned then extending preserves the bit 
pattern.
-    return BitUtil::Popcount(static_cast<UnsignedType<T>>(v));
+    return BitUtil::Popcount(static_cast<typename make_unsigned<T>::type>(v));
   }
 
   /// Returns the 'num_bits' least-significant bits of 'v'.
@@ -292,7 +249,7 @@ class BitUtil {
   template <typename T>
   constexpr static T ShiftRightLogical(T v, int shift) {
     // Conversion to unsigned ensures most significant bits always filled with 
0's
-    return static_cast<UnsignedType<T>>(v) >> shift;
+    return static_cast<typename make_unsigned<T>::type>(v) >> shift;
   }
 
   /// Get an specific bit of a numeric type

http://git-wip-us.apache.org/repos/asf/impala/blob/83bb4aed/tests/query_test/test_decimal_fuzz.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_decimal_fuzz.py 
b/tests/query_test/test_decimal_fuzz.py
index 1433ec3..a129e33 100644
--- a/tests/query_test/test_decimal_fuzz.py
+++ b/tests/query_test/test_decimal_fuzz.py
@@ -30,9 +30,6 @@ from tests.common.test_vector import ImpalaTestDimension, 
ImpalaTestMatrix
 
 class TestDecimalFuzz(ImpalaTestSuite):
 
-  # Impala's max precision for decimals is 38, so we should have the same in 
the tests
-  decimal.getcontext().prec = 38
-
   @classmethod
   def get_workload(cls):
     return 'functional-query'
@@ -203,7 +200,7 @@ class TestDecimalFuzz(ImpalaTestSuite):
         return True
     return False
 
-  def execute_one_decimal_op(self):
+  def execute_one(self):
     '''Executes a single query and compares the result to a result that we 
computed in
     Python.'''
     op = random.choice(['+', '-', '*', '/', '%'])
@@ -246,60 +243,6 @@ class TestDecimalFuzz(ImpalaTestSuite):
         expected_result = None
       assert self.result_equals(expected_result, result)
 
-  def test_decimal_ops(self, vector):
-    for _ in xrange(self.iterations):
-      self.execute_one_decimal_op()
-
-  def width_bucket(self, val, min_range, max_range, num_buckets):
-    # Multiplying the values by 10**40 guarantees that the numbers can be 
converted
-    # to int without losing information.
-    val_int = int(decimal.Decimal(val) * 10**40)
-    min_range_int = int(decimal.Decimal(min_range) * 10**40)
-    max_range_int = int(decimal.Decimal(max_range) * 10**40)
-
-    if min_range_int >= max_range_int:
-      return None
-    if val_int < min_range_int:
-      return 0
-    if val_int > max_range_int:
-      return num_buckets + 1
-
-    range_size = max_range_int - min_range_int
-    dist_from_min = val_int - min_range_int
-    return (num_buckets * dist_from_min) / range_size + 1
-
-  def execute_one_width_bucket(self):
-    val, val_prec, val_scale = self.get_decimal()
-    min_range, min_range_prec, min_range_scale = self.get_decimal()
-    max_range, max_range_prec, max_range_scale = self.get_decimal()
-    num_buckets = random.randint(1, 2147483647)
-
-    query = ('select width_bucket('
-        'cast({val} as decimal({val_prec},{val_scale})), '
-        'cast({min_range} as decimal({min_range_prec},{min_range_scale})), '
-        'cast({max_range} as decimal({max_range_prec},{max_range_scale})), '
-        '{num_buckets})')
-
-    query = query.format(val=val, val_prec=val_prec, val_scale=val_scale,
-        min_range=min_range, min_range_prec=min_range_prec,
-        min_range_scale=min_range_scale,
-        max_range=max_range, max_range_prec=max_range_prec,
-        max_range_scale=max_range_scale,
-        num_buckets=num_buckets)
-
-    expected_result = self.width_bucket(val, min_range, max_range, num_buckets)
-    if not expected_result:
-      return
-
-    try:
-      result = self.execute_scalar(query, query_options={'decimal_v2': 'true'})
-      assert int(result) == expected_result
-    except ImpalaBeeswaxException as e:
-      if "You need to wrap the arguments in a CAST" not in str(e):
-        # Sometimes the decimal inputs are incompatible with each other, so 
it's ok
-        # to ignore this error.
-        raise e
-
-  def test_width_bucket(self, vector):
+  def test_fuzz(self, vector):
     for _ in xrange(self.iterations):
-      self.execute_one_width_bucket()
+      self.execute_one()

[1/7] impala git commit: Revert "IMPALA-7412: width_bucket() function overflows too easily"

Reply via email to