Repository: impala Updated Branches: refs/heads/master 8060f4d50 -> dde930830
IMPALA-4848: Add WIDTH_BUCKET() function Syntax : width_bucket(expr decimal, min_val decimal, max_val decimal, num_buckets int) This function creates equiwidth histograms , where the histogram range is divided into num_buckets buckets having identical sizes. This function returns the bucket in which the expr value would fall. min_val and max_val are the minimum and maximum value of the histogram range respectively. -> This function returns NULL if expr is a NULL. -> This function returns 0 if expr < min_val -> This function returns num_buckets + 1 if expr > max_val E.g. [localhost:21000] > select width_bucket(8, 1, 20, 3); +---------------------------+ | width_bucket(8, 1, 20, 3) | +---------------------------+ | 2 | +---------------------------+ Change-Id: I081bc916b1bef7b929ca161a9aade3b54c6b858f Reviewed-on: http://gerrit.cloudera.org:8080/6023 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/dde93083 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/dde93083 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/dde93083 Branch: refs/heads/master Commit: dde930830b763f58e2f2e73b8b0fe265c4c33f2b Parents: 8060f4d Author: aphadke <[email protected]> Authored: Wed Feb 15 14:09:51 2017 -0800 Committer: Impala Public Jenkins <[email protected]> Committed: Sat Jun 30 04:47:23 2018 +0000 ---------------------------------------------------------------------- be/src/exprs/expr-test.cc | 65 ++++++++ be/src/exprs/math-functions-ir.cc | 156 ++++++++++++++++++- be/src/exprs/math-functions.h | 15 +- be/src/util/string-util.cc | 8 + be/src/util/string-util.h | 4 + common/function-registry/impala_functions.py | 3 +- .../java/org/apache/impala/analysis/Expr.java | 2 +- 7 files changed, 246 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/dde93083/be/src/exprs/expr-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index aac0f9e..999f41a 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -60,6 +60,7 @@ #include "udf/udf-test-harness.h" #include "util/debug-util.h" #include "util/string-parser.h" +#include "util/string-util.h" #include "util/test-info.h" #include "gen-cpp/ImpalaInternalService_types.h" @@ -594,6 +595,17 @@ class ExprTest : public testing::Test { ASSERT_FALSE(status.ok()) << "stmt: " << stmt << "\nunexpected Status::OK."; } + // "Execute 'expr' and check that the returned error ends with 'error_string'" + void TestErrorString(const string& expr, const string& error_string) { + string stmt = "select " + expr; + vector<FieldSchema> result_types; + string result_row; + Status status = executor_->Exec(stmt, &result_types); + status = executor_->FetchResult(&result_row); + ASSERT_FALSE(status.ok()); + ASSERT_TRUE(EndsWith(status.msg().msg(), error_string)); + } + template <typename T> void TestFixedPointComparisons(bool test_boundaries) { int64_t t_min = numeric_limits<T>::min(); int64_t t_max = numeric_limits<T>::max(); @@ -5311,6 +5323,59 @@ TEST_F(ExprTest, MathFunctions) { TestValue("sqrt(2.0)", TYPE_DOUBLE, sqrt(2.0)); TestValue("dsqrt(81.0)", TYPE_DOUBLE, 9); + TestValue("width_bucket(6.3, 2, 17, 2)", TYPE_BIGINT, 1); + TestValue("width_bucket(11, 6, 14, 3)", TYPE_BIGINT, 2); + TestValue("width_bucket(-1, -5, 5, 3)", TYPE_BIGINT, 2); + TestValue("width_bucket(1, -5, 5, 3)", TYPE_BIGINT, 2); + TestValue("width_bucket(3, 5, 20.1, 4)", TYPE_BIGINT, 0); + TestIsNull("width_bucket(NULL, 5, 20.1, 4)", TYPE_BIGINT); + TestIsNull("width_bucket(22, NULL, 20.1, 4)", TYPE_BIGINT); + TestIsNull("width_bucket(22, 5, NULL, 4)", TYPE_BIGINT); + TestIsNull("width_bucket(22, 5, 20.1, NULL)", TYPE_BIGINT); + + TestValue("width_bucket(22, 5, 20.1, 4)", TYPE_BIGINT, 5); + // Test when the result (bucket number) is greater than the max value that can be + // stored in a IntVal + TestValue("width_bucket(22, 5, 20.1, 2147483647)", TYPE_BIGINT, 2147483648); + // Test when min and max of the bucket width range are equal. + TestErrorString("width_bucket(22, 5, 5, 4)", + "UDF ERROR: Lower bound cannot be greater than or equal to the upper bound\n"); + // Test when min > max + TestErrorString("width_bucket(22, 50, 5, 4)", + "UDF ERROR: Lower bound cannot be greater than or equal to the upper bound\n"); + // Test max - min will overflow during width_bucket evaluation + TestErrorString("width_bucket(11, -9, 99999999999999999999999999999999999999, 4000)", + "UDF ERROR: Overflow while evaluating the difference between min_range: -9 and " + "max_range: 99999999999999999999999999999999999999\n"); + // If expr - min overflows during width_bucket evaluation, max - min will also + // overflow. Since we evaluate max - min before evaluating expr - min, we will never + // end up overflowing expr - min. + TestErrorString("width_bucket(1, -99999999999999999999999999999999999999, 9, 40)", + "UDF ERROR: Overflow while evaluating the difference between min_range: " + "-99999999999999999999999999999999999999 and max_range: 9\n"); + // Test when dist_from_min * buckets cannot be stored in a int128_t (overflows) + // and needs to be stored in a int256_t + TestValue("width_bucket(8000000000000000000000000000000000000," + "100000000000000000000000000000000000, 9000000000000000000000000000000000000," + "900000)", TYPE_BIGINT, 798877); + // Test when range_size * GetScaleMultiplier(input_scale) cannot be stored in a + // int128_t (overflows) and needs to be stored in a int256_t + TestValue("width_bucket(100000000, 199999.77777777777777777777777777, 99999999999.99999" + ", 40)", TYPE_BIGINT, 1); + // Test with max values for expr and num_bucket when the width_bucket can be + // evaluated with int128_t. Incrementing one of them will require using int256_t for + // width_bucket evaluation + TestValue("width_bucket(9999999999999999999999999999999999999, 1," + "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2); + // Test with the smallest value of num_bucket for the given combination of expr, + // max and min value that would require int256_t for evalation + TestValue("width_bucket(9999999999999999999999999999999999999, 1," + "99999999999999999999999999999999999999, 16)", TYPE_BIGINT, 2); + // Test with the smallest value of expr for the given combination of num_buckets, + // max and min value that would require int256_t for evalation + TestValue("width_bucket(10000000000000000000000000000000000000, 1," + "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2); + // Run twice to test deterministic behavior. for (uint32_t seed : {0, 1234}) { stringstream rand, random; http://git-wip-us.apache.org/repos/asf/impala/blob/dde93083/be/src/exprs/math-functions-ir.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/math-functions-ir.cc b/be/src/exprs/math-functions-ir.cc index 4f09416..91af3f4 100644 --- a/be/src/exprs/math-functions-ir.cc +++ b/be/src/exprs/math-functions-ir.cc @@ -15,21 +15,22 @@ // specific language governing permissions and limitations // under the License. -#include "exprs/math-functions.h" - #include <iomanip> #include <random> #include <sstream> #include <math.h> - +#include <stdint.h> +#include <string> #include "exprs/anyval-util.h" #include "exprs/scalar-expr.h" #include "exprs/operators.h" +#include "exprs/math-functions.h" #include "util/string-parser.h" +#include "runtime/decimal-value.inline.h" #include "runtime/runtime-state.h" #include "runtime/string-value.inline.h" #include "thirdparty/pcg-cpp-0.98/include/pcg_random.hpp" - +#include "exprs/decimal-operators.h" #include "common/names.h" using std::uppercase; @@ -454,6 +455,153 @@ DoubleVal MathFunctions::FmodDouble(FunctionContext* ctx, const DoubleVal& a, return DoubleVal(fmod(a.val, b.val)); } +// The bucket_number is evaluated using the following formula: +// +// bucket_number = dist_from_min * num_buckets / range_size +// where - +// dist_from_min = expr - min_range +// range_size = max_range - min_range +// buckets = number of buckets +// +// The results of the above subtractions are stored in Decimal16Value to avoid an overflow +// in the following cases: +// case 1: +// T1 is decimal8Value +// When evaluating this particular expression +// dist_from_min = expr - min_range +// If expr is a max positive value which can be represented in decimal8Value and +// min_range < 0 the resulting dist_from_min can be represented in decimal16Val +// without overflowing +// case 2: +// T1 is decimal16Value +// Subtracting a negative min_range from expr can result in an overflow in which +// case the function errors out. There is no decimal32Val to handle this. So +// storing it in decimal16Value. +// case 3: +// T1 is decimal4Value +// We can store the results in a decimal8Value. But this change hard codes to +// store the result in decimal16Val for now to be compatible with the other +// decimal*Vals +// +// The result of this multiplication dist_from_min * buckets is stored as a int256_t +// if storing it in a int128_t would overflow. +// +// To perform the division, range_size is scaled up. The scale and precision of the +// numerator and denominator are adjusted to be the same. This avoids the need to compute +// the resulting scale and precision. +template <class T1> +BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx, + const T1& expr, const T1& min_range, + const T1& max_range, const IntVal& num_buckets) { + // FE casts expr, min_range and max_range to be of the scale and precision + int input_scale = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SCALE, 1); + int input_precision = ctx->impl()->GetConstFnAttr( + FunctionContextImpl::ARG_TYPE_PRECISION, 1); + + bool overflow = false; + Decimal16Value range_size = max_range.template Subtract<int128_t>(input_scale, + min_range, input_scale, input_precision, input_scale, false, &overflow); + if (UNLIKELY(overflow)) { + ostringstream error_msg; + error_msg << "Overflow while evaluating the difference between min_range: " << + min_range.value() << " and max_range: " << max_range.value(); + ctx->SetError(error_msg.str().c_str()); + return BigIntVal::null(); + } + + if (UNLIKELY(num_buckets.val <= 0)) { + ostringstream error_msg; + error_msg << "Number of buckets should be greater than zero:" << num_buckets.val; + ctx->SetError(error_msg.str().c_str()); + return BigIntVal::null(); + } + + if (UNLIKELY(min_range >= max_range)) { + ctx->SetError("Lower bound cannot be greater than or equal to the upper bound"); + return BigIntVal::null(); + } + + if (UNLIKELY(expr < min_range)) return 0; + + if (UNLIKELY(expr >= max_range)) { + BigIntVal result; + result.val = num_buckets.val; + ++result.val; + return result; + } + + Decimal16Value dist_from_min = expr.template Subtract<int128_t>(input_scale, + min_range, input_scale, input_precision, input_scale, false, &overflow); + DCHECK_EQ(overflow, false); + + Decimal16Value buckets = Decimal16Value::FromInt(input_precision, input_scale, + num_buckets.val, &overflow); + + bool needs_int256 = false; + // Check if dist_from_min * buckets would overflow and if there is a need to + // store the intermediate results in int256_t to avoid an overflows + // Check if scaling up range size overflows and if there is a need to store the + // intermediate results in int256_t to avoid the overflow + if (UNLIKELY(BitUtil::CountLeadingZeros(abs(buckets.value())) + + BitUtil::CountLeadingZeros(abs(dist_from_min.value())) <= 128 || + BitUtil::CountLeadingZeros(range_size.value()) + + detail::MinLeadingZerosAfterScaling(BitUtil::CountLeadingZeros( + range_size.value()), input_scale) <= 128)) { + needs_int256 = true; + } + + int128_t result; + if (needs_int256) { + // resulting scale should be 2 * input_scale as per multiplication rules + int256_t x = ConvertToInt256(buckets.value()) * ConvertToInt256( + dist_from_min.value()); + + // Since "range_size" and "x" have different scales, the divison would require + // evaluating the resulting scale. To avoid this we scale up the denominator to + // match the scale of the numerator. + int256_t y = DecimalUtil::MultiplyByScale<int256_t>(ConvertToInt256( + range_size.value()), input_scale, false); + result = ConvertToInt128(x / y, DecimalUtil::MAX_UNSCALED_DECIMAL16, + &overflow); + DCHECK_EQ(overflow, false); + } else { + // resulting scale should be 2 * input_scale as per multiplication rules + int128_t x = buckets.value() * dist_from_min.value(); + + // Since "range_size" and "x" have different scales, the divison would require + // evaluating the resulting scale. To avoid this we scale up the denominator to + // match the scale of the numerator. + int128_t y = DecimalUtil::MultiplyByScale<int128_t>(range_size.value(), + input_scale, false); + result = x / y; // NOLINT: clang-tidy thinks y may equal zero here. + } + return (BigIntVal(abs(result) + 1)); +} + +BigIntVal MathFunctions::WidthBucket(FunctionContext* ctx, const DecimalVal& expr, + const DecimalVal& min_range, const DecimalVal& max_range, + const IntVal& num_buckets) { + if (expr.is_null || min_range.is_null || max_range.is_null || num_buckets.is_null) { + return BigIntVal::null(); + } + + int arg_size_type = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SIZE, 0); + switch (arg_size_type) { + case 4: + return WidthBucketImpl<Decimal4Value> (ctx, Decimal4Value(expr.val4), + Decimal4Value(min_range.val4), Decimal4Value(max_range.val4), num_buckets); + case 8: + return WidthBucketImpl<Decimal8Value> (ctx, Decimal8Value(expr.val8), + Decimal8Value(min_range.val8), Decimal8Value(max_range.val8), num_buckets); + case 16: + return WidthBucketImpl<Decimal16Value>(ctx, Decimal16Value(expr.val16), + Decimal16Value(min_range.val16), Decimal16Value(max_range.val16), num_buckets); + default: + DCHECK(false); + return BigIntVal::null(); + } +} + template <typename T> T MathFunctions::Positive(FunctionContext* ctx, const T& val) { return val; } http://git-wip-us.apache.org/repos/asf/impala/blob/dde93083/be/src/exprs/math-functions.h ---------------------------------------------------------------------- diff --git a/be/src/exprs/math-functions.h b/be/src/exprs/math-functions.h index 5c2e1de..960a80f 100644 --- a/be/src/exprs/math-functions.h +++ b/be/src/exprs/math-functions.h @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. - #ifndef IMPALA_EXPRS_MATH_FUNCTIONS_H #define IMPALA_EXPRS_MATH_FUNCTIONS_H @@ -117,6 +116,11 @@ class MathFunctions { template <bool ISLEAST> static DecimalVal LeastGreatest( FunctionContext*, int num_args, const DecimalVal* args); + + static BigIntVal WidthBucket(FunctionContext* ctx, const DecimalVal& expr, + const DecimalVal& min_range, const DecimalVal& max_range, + const IntVal& num_buckets); + private: static const int32_t MIN_BASE = 2; static const int32_t MAX_BASE = 36; @@ -140,6 +144,15 @@ class MathFunctions { /// Returns false otherwise, indicating some other error condition. static bool HandleParseResult(int8_t dest_base, int64_t* num, StringParser::ParseResult parse_res); + + /// This function creates equiwidth histograms , where the histogram range + /// is divided into num_buckets buckets having identical sizes. This function + /// returns the bucket in which the expr value would fall. min_val and + /// max_val are the minimum and maximum value of the histogram range + /// respectively. + template <typename T1> + static BigIntVal WidthBucketImpl(FunctionContext* ctx,const T1& expr, + const T1& min_range,const T1& max_range, const IntVal& num_buckets); }; } http://git-wip-us.apache.org/repos/asf/impala/blob/dde93083/be/src/util/string-util.cc ---------------------------------------------------------------------- diff --git a/be/src/util/string-util.cc b/be/src/util/string-util.cc index f8e284f..e442391 100644 --- a/be/src/util/string-util.cc +++ b/be/src/util/string-util.cc @@ -65,4 +65,12 @@ bool CommaSeparatedContains(const std::string& cs_list, const std::string& item) return false; } +bool EndsWith(const std::string& full_string, const std::string& end) { + if (full_string.size() >= end.size()) { + return (full_string.compare(full_string.size() - end.size(), end.size(), + end) == 0); + } + return false; +} + } http://git-wip-us.apache.org/repos/asf/impala/blob/dde93083/be/src/util/string-util.h ---------------------------------------------------------------------- diff --git a/be/src/util/string-util.h b/be/src/util/string-util.h index 2811a3c..8db010c 100644 --- a/be/src/util/string-util.h +++ b/be/src/util/string-util.h @@ -40,6 +40,10 @@ Status TruncateUp(const std::string& str, int32_t max_length, std::string* resul /// Return true if the comma-separated string 'cs_list' contains 'item' as one of /// the comma-separated values. bool CommaSeparatedContains(const std::string& cs_list, const std::string& item); + +/// Return true if a given string 'full_str' ends with the characters in the +/// 'end' string +bool EndsWith(const std::string& full_string, const std::string& end); } #endif http://git-wip-us.apache.org/repos/asf/impala/blob/dde93083/common/function-registry/impala_functions.py ---------------------------------------------------------------------- diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py index f86098a..b3bf1d4 100644 --- a/common/function-registry/impala_functions.py +++ b/common/function-registry/impala_functions.py @@ -388,7 +388,8 @@ visible_functions = [ '_ZN6impala13MathFunctions13LeastGreatestILb0EEEN10impala_udf9StringValEPNS2_15FunctionContextEiPKS3_'], [['greatest'], 'DECIMAL', ['DECIMAL', '...'], '_ZN6impala13MathFunctions13LeastGreatestILb0EEEN10impala_udf10DecimalValEPNS2_15FunctionContextEiPKS3_'], - + [['width_bucket'], 'BIGINT', ['DECIMAL', 'DECIMAL', 'DECIMAL', 'INT'], + '_ZN6impala13MathFunctions11WidthBucketEPN10impala_udf15FunctionContextERKNS1_10DecimalValES6_S6_RKNS1_6IntValE'], # Decimal Functions # TODO: oracle has decimal support for transcendental functions (e.g. sin()) to very # high precisions. Do we need them? It's unclear if other databases do the same. http://git-wip-us.apache.org/repos/asf/impala/blob/dde93083/fe/src/main/java/org/apache/impala/analysis/Expr.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java index 5cdcb16..e85a4a4 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Expr.java +++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java @@ -504,12 +504,12 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl ScalarType decimalType = (ScalarType) childType; result = decimalType.getMinResolutionDecimal(); } else { - Preconditions.checkState(childType.isDecimal() || result.isDecimal()); result = Type.getAssignmentCompatibleType( result, childType, false, strictDecimal); } } if (result != null && !result.isNull()) { + result = ((ScalarType)result).getMinResolutionDecimal(); Preconditions.checkState(result.isDecimal() || result.isInvalid()); Preconditions.checkState(!result.isWildcardDecimal()); }
