This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch 3.x in repository https://gitbox.apache.org/repos/asf/impala.git
commit e638bc04a258e123e1a2819bb4b5637ef40076c3 Author: Akos Kovacs <akov...@cloudera.com> AuthorDate: Thu Apr 30 00:18:16 2020 +0200 IMPALA-7833 Audit and fix string builtins for long string handling Some string built-in functions could crash impalad, in case the result was longer than 1 gig max size. Added some overflow checks. Overflow error messages modified not to hard code max size. Testing: * Added some backend tests to cover overflow check * Ran core tests Change-Id: I93a53845f04e61ff446b363c78db1e49cbd5dc49 Reviewed-on: http://gerrit.cloudera.org:8080/15864 Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> (cherry picked from commit 5e72ca546ec3dd83a11de5d1905db50807544c5e) --- be/src/exprs/aggregate-functions-ir.cc | 13 +++-- be/src/exprs/expr-test.cc | 4 +- be/src/exprs/string-functions-ir.cc | 58 ++++++++++++++++---- .../queries/QueryTest/large_strings.test | 61 +++++++++++++++++----- 4 files changed, 109 insertions(+), 27 deletions(-) diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc index 699e80e..fd54b5e 100644 --- a/be/src/exprs/aggregate-functions-ir.cc +++ b/be/src/exprs/aggregate-functions-ir.cc @@ -30,6 +30,7 @@ #include "common/logging.h" #include "exprs/anyval-util.h" #include "exprs/hll-bias.h" +#include "gutil/strings/substitute.h" #include "runtime/date-value.h" #include "runtime/decimal-value.inline.h" #include "runtime/runtime-state.h" @@ -38,6 +39,7 @@ #include "runtime/timestamp-value.inline.h" #include "util/arithmetic-util.h" #include "util/mpfit-util.h" +#include "util/pretty-printer.h" #include "common/names.h" @@ -129,6 +131,9 @@ int64_t HllEstimateBias(int64_t estimate) { namespace impala { +const char* ERROR_CONCATENATED_STRING_MAX_SIZE_REACHED = + "Concatenated string length is larger than allowed limit of $0 character data."; + // This function initializes StringVal 'dst' with a newly allocated buffer of // 'buf_len' bytes. The new buffer will be filled with zero. If allocation fails, // 'dst' will be set to a null string. This allows execution to continue until the @@ -753,8 +758,8 @@ void AggregateFunctions::StringConcatUpdate(FunctionContext* ctx, const StringVa DCHECK(!ctx->impl()->state()->GetQueryStatus().ok()); } } else { - ctx->SetError("Concatenated string length larger than allowed limit of " - "1 GB character data."); + ctx->SetError(Substitute(ERROR_CONCATENATED_STRING_MAX_SIZE_REACHED, + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); } } @@ -785,8 +790,8 @@ void AggregateFunctions::StringConcatMerge(FunctionContext* ctx, DCHECK(!ctx->impl()->state()->GetQueryStatus().ok()); } } else { - ctx->SetError("Concatenated string length larger than allowed limit of " - "1 GB character data."); + ctx->SetError(Substitute(ERROR_CONCATENATED_STRING_MAX_SIZE_REACHED, + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); } } diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index 665bf36..a5534f3 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -4502,9 +4502,9 @@ TEST_P(ExprTest, StringFunctions) { TestIsNull("repeat('ab', NULL)", TYPE_STRING); TestIsNull("repeat(NULL, NULL)", TYPE_STRING); TestErrorString("repeat('x', 1024 * 1024 * 1024 * 10)", "Number of repeats in " - "repeat() call is larger than allowed limit of 1 GB character data.\n"); + "repeat() call is larger than allowed limit of 1.00 GB character data.\n"); TestErrorString("repeat('xx', 1024 * 1024 * 1024)", "repeat() result is larger than " - "allowed limit of 1 GB character data.\n"); + "allowed limit of 1.00 GB character data.\n"); TestValue("ascii('')", TYPE_INT, 0); TestValue("ascii('abcde')", TYPE_INT, 'a'); diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc index 5b777e3..55fe239 100644 --- a/be/src/exprs/string-functions-ir.cc +++ b/be/src/exprs/string-functions-ir.cc @@ -28,10 +28,12 @@ #include "exprs/anyval-util.h" #include "exprs/scalar-expr.h" #include "gutil/strings/charset.h" +#include "gutil/strings/substitute.h" #include "runtime/string-value.inline.h" #include "runtime/tuple-row.h" #include "util/bit-util.h" #include "util/coding-util.h" +#include "util/pretty-printer.h" #include "util/ubsan.h" #include "util/url-parser.h" @@ -43,6 +45,9 @@ using std::bitset; // NOTE: be careful not to use string::append. It is not performant. namespace impala { +const char* ERROR_CHARACTER_LIMIT_EXCEEDED = + "$0 is larger than allowed limit of $1 character data."; + // This behaves identically to the mysql implementation, namely: // - 1-indexed positions // - supported negative positions (count from the end of the string) @@ -84,6 +89,12 @@ StringVal StringFunctions::Right( StringVal StringFunctions::Space(FunctionContext* context, const BigIntVal& len) { if (len.is_null) return StringVal::null(); if (len.val <= 0) return StringVal(); + if (len.val > StringVal::MAX_LENGTH) { + context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED, + "space() result", + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); + return StringVal::null(); + } StringVal result(context, len.val); if (UNLIKELY(result.is_null)) return StringVal::null(); memset(result.ptr, ' ', len.val); @@ -95,8 +106,9 @@ StringVal StringFunctions::Repeat( if (str.is_null || n.is_null) return StringVal::null(); if (str.len == 0 || n.val <= 0) return StringVal(); if (n.val > StringVal::MAX_LENGTH) { - context->SetError("Number of repeats in repeat() call is larger than allowed limit " - "of 1 GB character data."); + context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED, + "Number of repeats in repeat() call", + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); return StringVal::null(); } static_assert(numeric_limits<int64_t>::max() / numeric_limits<int>::max() @@ -104,8 +116,9 @@ StringVal StringFunctions::Repeat( "multiplying StringVal::len with positive int fits in int64_t"); int64_t out_len = str.len * n.val; if (out_len > StringVal::MAX_LENGTH) { - context->SetError( - "repeat() result is larger than allowed limit of 1 GB character data."); + context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED, + "repeat() result", + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); return StringVal::null(); } StringVal result(context, static_cast<int>(out_len)); @@ -125,7 +138,12 @@ StringVal StringFunctions::Lpad(FunctionContext* context, const StringVal& str, // TODO: Hive seems to go into an infinite loop if pad.len == 0, // so we should pay attention to Hive's future solution to be compatible. if (len.val <= str.len || pad.len == 0) return StringVal(str.ptr, len.val); - + if (len.val > StringVal::MAX_LENGTH) { + context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED, + "lpad() result", + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); + return StringVal::null(); + } StringVal result(context, len.val); if (UNLIKELY(result.is_null)) return StringVal::null(); int padded_prefix_len = len.val - str.len; @@ -153,6 +171,12 @@ StringVal StringFunctions::Rpad(FunctionContext* context, const StringVal& str, if (len.val <= str.len || pad.len == 0) { return StringVal(str.ptr, len.val); } + if (len.val > StringVal::MAX_LENGTH) { + context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED, + "rpad() result", + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); + return StringVal::null(); + } StringVal result(context, len.val); if (UNLIKELY(result.is_null)) return StringVal::null(); @@ -351,8 +375,9 @@ StringVal StringFunctions::Replace(FunctionContext* context, const StringVal& st if (UNLIKELY(space_needed > buffer_space)) { // Check to see if we can allocate a large enough buffer. if (space_needed > StringVal::MAX_LENGTH) { - context->SetError( - "String length larger than allowed limit of 1 GB character data."); + context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED, + "replace() result", + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); return StringVal::null(); } // Double the buffer size whenever it fills up to amortise cost of resizing. @@ -865,11 +890,19 @@ StringVal StringFunctions::Concat( if (num_children == 1) return strs[0]; // Loop once to compute the final size and reserve space. - int32_t total_size = 0; + int64_t total_size = 0; for (int32_t i = 0; i < num_children; ++i) { if (strs[i].is_null) return StringVal::null(); total_size += strs[i].len; } + + if (total_size > StringVal::MAX_LENGTH) { + context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED, + "Concatenated string length", + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); + return StringVal::null(); + } + // If total_size is zero, directly returns empty string if (total_size <= 0) return StringVal(); @@ -895,7 +928,7 @@ StringVal StringFunctions::ConcatWs(FunctionContext* context, const StringVal& s // count. int32_t valid_num_children = 0; int32_t valid_start_index = -1; - int32_t total_size = 0; + int64_t total_size = 0; for (int32_t i = 0; i < num_children; ++i) { if (strs[i].is_null) continue; @@ -911,6 +944,13 @@ StringVal StringFunctions::ConcatWs(FunctionContext* context, const StringVal& s valid_num_children++; } + if (total_size > StringVal::MAX_LENGTH) { + context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED, + "Concatenated string length", + PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str()); + return StringVal::null(); + } + // If all data are invalid, or data size is zero, return empty string. if (valid_start_index < 0 || total_size <= 0) { return StringVal(); diff --git a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test index 848c8a1..817ea23 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test +++ b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test @@ -54,7 +54,7 @@ select l_comment from tpch_parquet.lineitem union all select l_comment from tpch_parquet.lineitem union all select l_comment from tpch_parquet.lineitem) a ---- CATCH -Concatenated string length larger than allowed limit of 1 GB character data +Concatenated string length is larger than allowed limit of 1.00 GB character data ===== ---- QUERY # IMPALA-2620: Allocation by UDF/UDA need to take into account of memory limit. @@ -122,7 +122,7 @@ select length(concat_ws(',', s, s, s, s)) from ( select l_comment from tpch.lineitem) t1 ) t2 ---- CATCH -String length larger than allowed limit of 1 GB character data +Concatenated string length is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; @@ -132,7 +132,7 @@ select length(repeat(s, 10)) from ( select l_comment from tpch.lineitem) t1 ) t2 ---- CATCH -repeat() result is larger than allowed limit of 1 GB character data +repeat() result is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; @@ -142,7 +142,7 @@ select length(lpad(s, 1073741830, '!')) from ( select l_comment from tpch.lineitem) t1 ) t2 ---- CATCH -String length larger than allowed limit of 1 GB character data +lpad() result is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; @@ -152,14 +152,14 @@ select length(rpad(s, 1073741830, '~')) from ( select l_comment from tpch.lineitem) t1 ) t2 ---- CATCH -String length larger than allowed limit of 1 GB character data +rpad() result is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; set mem_limit=4gb; select space(1073741830); ---- CATCH -String length larger than allowed limit of 1 GB character data +space() result is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; @@ -189,19 +189,19 @@ select replace(x, '+', '000') from (select (replace(s, ' ', '++++++++')) x from select l_comment from tpch.lineitem) t1 ) t2) t3; ---- CATCH -String length larger than allowed limit of 1 GB character data +replace() result is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; select trunc(timestamp_col, space(1073741830)) from functional.alltypes ---- CATCH -String length larger than allowed limit of 1 GB character data +space() result is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; select extract(timestamp_col, space(1073741830)) from functional.alltypes ---- CATCH -String length larger than allowed limit of 1 GB character data +space() result is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; @@ -211,7 +211,7 @@ select length(madlib_encode_vector(concat_ws(',', s, s, s, s))) from ( select l_comment from tpch.lineitem) t1 ) t2 ---- CATCH -String length larger than allowed limit of 1 GB character data +Concatenated string length is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; @@ -221,7 +221,7 @@ select length(madlib_decode_vector(concat_ws(',', s, s, s, s))) from ( select l_comment from tpch.lineitem) t1 ) t2 ---- CATCH -String length larger than allowed limit of 1 GB character data +Concatenated string length is larger than allowed limit of 1.00 GB character data ===== ---- QUERY set mem_limit=4gb; @@ -244,5 +244,42 @@ INT,INT set mem_limit=4gb; select repeat('the quick brown fox', 1024 * 1024 * 100) ---- CATCH -repeat() result is larger than allowed limit of 1 GB character data +repeat() result is larger than allowed limit of 1.00 GB character data +===== +---- QUERY +set mem_limit=4gb; +select lpad(s, 1024 * 1024 * 1024 * 5, ' ') from ( + select group_concat(l_comment, "!") s from ( + select l_comment from tpch.lineitem union all + select l_comment from tpch.lineitem) t1 + ) t2 +---- CATCH +lpad() result is larger than allowed limit of 1.00 GB character data +===== +---- QUERY +set mem_limit=4gb; +select rpad(s, 1024 * 1024 * 1024 * 5, ' ') from ( + select group_concat(l_comment, "!") s from ( + select l_comment from tpch.lineitem union all + select l_comment from tpch.lineitem) t1 + ) t2 +---- CATCH +rpad() result is larger than allowed limit of 1.00 GB character data +===== +---- QUERY +set mem_limit=4gb; +select space(1024 * 1024 * 1024 * 5); +---- CATCH +space() result is larger than allowed limit of 1.00 GB character data +===== +---- QUERY +set mem_limit=8gb; +select concat(lpad(s, 1073741820 , ' '),lpad(s, 1073741820 , ' '),lpad(s, 1073741820 , ' '),lpad(s, 1073741820 , ' '),lpad(s, 1073741820 , ' ')) from ( + select group_concat(l_comment, "!") s from ( + select l_comment from tpch.lineitem union all + select l_comment from tpch.lineitem) t1 + ) t2 +---- CATCH +Concatenated string length is larger than allowed limit of 1.00 GB character data ===== +