This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch 3.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit e638bc04a258e123e1a2819bb4b5637ef40076c3
Author: Akos Kovacs <akov...@cloudera.com>
AuthorDate: Thu Apr 30 00:18:16 2020 +0200

    IMPALA-7833 Audit and fix string builtins for long string handling
    
    Some string built-in functions could crash impalad,
    in case the result was longer than 1 gig max size.
    Added some overflow checks.
    Overflow error messages modified not to hard code max size.
    
    Testing:
    * Added some backend tests to cover overflow check
    * Ran core tests
    
    Change-Id: I93a53845f04e61ff446b363c78db1e49cbd5dc49
    Reviewed-on: http://gerrit.cloudera.org:8080/15864
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    (cherry picked from commit 5e72ca546ec3dd83a11de5d1905db50807544c5e)
---
 be/src/exprs/aggregate-functions-ir.cc             | 13 +++--
 be/src/exprs/expr-test.cc                          |  4 +-
 be/src/exprs/string-functions-ir.cc                | 58 ++++++++++++++++----
 .../queries/QueryTest/large_strings.test           | 61 +++++++++++++++++-----
 4 files changed, 109 insertions(+), 27 deletions(-)

diff --git a/be/src/exprs/aggregate-functions-ir.cc 
b/be/src/exprs/aggregate-functions-ir.cc
index 699e80e..fd54b5e 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -30,6 +30,7 @@
 #include "common/logging.h"
 #include "exprs/anyval-util.h"
 #include "exprs/hll-bias.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/date-value.h"
 #include "runtime/decimal-value.inline.h"
 #include "runtime/runtime-state.h"
@@ -38,6 +39,7 @@
 #include "runtime/timestamp-value.inline.h"
 #include "util/arithmetic-util.h"
 #include "util/mpfit-util.h"
+#include "util/pretty-printer.h"
 
 #include "common/names.h"
 
@@ -129,6 +131,9 @@ int64_t HllEstimateBias(int64_t estimate) {
 
 namespace impala {
 
+const char* ERROR_CONCATENATED_STRING_MAX_SIZE_REACHED =
+  "Concatenated string length is larger than allowed limit of $0 character 
data.";
+
 // This function initializes StringVal 'dst' with a newly allocated buffer of
 // 'buf_len' bytes. The new buffer will be filled with zero. If allocation 
fails,
 // 'dst' will be set to a null string. This allows execution to continue until 
the
@@ -753,8 +758,8 @@ void 
AggregateFunctions::StringConcatUpdate(FunctionContext* ctx, const StringVa
       DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());
     }
   } else {
-    ctx->SetError("Concatenated string length larger than allowed limit of "
-        "1 GB character data.");
+    ctx->SetError(Substitute(ERROR_CONCATENATED_STRING_MAX_SIZE_REACHED,
+      PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
   }
 }
 
@@ -785,8 +790,8 @@ void AggregateFunctions::StringConcatMerge(FunctionContext* 
ctx,
       DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());
     }
   } else {
-    ctx->SetError("Concatenated string length larger than allowed limit of "
-        "1 GB character data.");
+    ctx->SetError(Substitute(ERROR_CONCATENATED_STRING_MAX_SIZE_REACHED,
+      PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
   }
 }
 
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 665bf36..a5534f3 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -4502,9 +4502,9 @@ TEST_P(ExprTest, StringFunctions) {
   TestIsNull("repeat('ab', NULL)", TYPE_STRING);
   TestIsNull("repeat(NULL, NULL)", TYPE_STRING);
   TestErrorString("repeat('x', 1024 * 1024 * 1024 * 10)", "Number of repeats 
in "
-      "repeat() call is larger than allowed limit of 1 GB character data.\n");
+      "repeat() call is larger than allowed limit of 1.00 GB character 
data.\n");
   TestErrorString("repeat('xx', 1024 * 1024 * 1024)", "repeat() result is 
larger than "
-      "allowed limit of 1 GB character data.\n");
+      "allowed limit of 1.00 GB character data.\n");
 
   TestValue("ascii('')", TYPE_INT, 0);
   TestValue("ascii('abcde')", TYPE_INT, 'a');
diff --git a/be/src/exprs/string-functions-ir.cc 
b/be/src/exprs/string-functions-ir.cc
index 5b777e3..55fe239 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -28,10 +28,12 @@
 #include "exprs/anyval-util.h"
 #include "exprs/scalar-expr.h"
 #include "gutil/strings/charset.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/string-value.inline.h"
 #include "runtime/tuple-row.h"
 #include "util/bit-util.h"
 #include "util/coding-util.h"
+#include "util/pretty-printer.h"
 #include "util/ubsan.h"
 #include "util/url-parser.h"
 
@@ -43,6 +45,9 @@ using std::bitset;
 // NOTE: be careful not to use string::append.  It is not performant.
 namespace impala {
 
+const char* ERROR_CHARACTER_LIMIT_EXCEEDED =
+  "$0 is larger than allowed limit of $1 character data.";
+
 // This behaves identically to the mysql implementation, namely:
 //  - 1-indexed positions
 //  - supported negative positions (count from the end of the string)
@@ -84,6 +89,12 @@ StringVal StringFunctions::Right(
 StringVal StringFunctions::Space(FunctionContext* context, const BigIntVal& 
len) {
   if (len.is_null) return StringVal::null();
   if (len.val <= 0) return StringVal();
+  if (len.val > StringVal::MAX_LENGTH) {
+    context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
+         "space() result",
+         PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
+    return StringVal::null();
+  }
   StringVal result(context, len.val);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   memset(result.ptr, ' ', len.val);
@@ -95,8 +106,9 @@ StringVal StringFunctions::Repeat(
   if (str.is_null || n.is_null) return StringVal::null();
   if (str.len == 0 || n.val <= 0) return StringVal();
   if (n.val > StringVal::MAX_LENGTH) {
-    context->SetError("Number of repeats in repeat() call is larger than 
allowed limit "
-        "of 1 GB character data.");
+    context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
+        "Number of repeats in repeat() call",
+        PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
     return StringVal::null();
   }
   static_assert(numeric_limits<int64_t>::max() / numeric_limits<int>::max()
@@ -104,8 +116,9 @@ StringVal StringFunctions::Repeat(
       "multiplying StringVal::len with positive int fits in int64_t");
   int64_t out_len = str.len * n.val;
   if (out_len > StringVal::MAX_LENGTH) {
-    context->SetError(
-        "repeat() result is larger than allowed limit of 1 GB character 
data.");
+    context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
+        "repeat() result",
+        PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
     return StringVal::null();
   }
   StringVal result(context, static_cast<int>(out_len));
@@ -125,7 +138,12 @@ StringVal StringFunctions::Lpad(FunctionContext* context, 
const StringVal& str,
   // TODO: Hive seems to go into an infinite loop if pad.len == 0,
   // so we should pay attention to Hive's future solution to be compatible.
   if (len.val <= str.len || pad.len == 0) return StringVal(str.ptr, len.val);
-
+  if (len.val > StringVal::MAX_LENGTH) {
+    context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
+        "lpad() result",
+        PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
+    return StringVal::null();
+  }
   StringVal result(context, len.val);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   int padded_prefix_len = len.val - str.len;
@@ -153,6 +171,12 @@ StringVal StringFunctions::Rpad(FunctionContext* context, 
const StringVal& str,
   if (len.val <= str.len || pad.len == 0) {
     return StringVal(str.ptr, len.val);
   }
+  if (len.val > StringVal::MAX_LENGTH) {
+    context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
+        "rpad() result",
+        PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
+    return StringVal::null();
+  }
 
   StringVal result(context, len.val);
   if (UNLIKELY(result.is_null)) return StringVal::null();
@@ -351,8 +375,9 @@ StringVal StringFunctions::Replace(FunctionContext* 
context, const StringVal& st
       if (UNLIKELY(space_needed > buffer_space)) {
         // Check to see if we can allocate a large enough buffer.
         if (space_needed > StringVal::MAX_LENGTH) {
-          context->SetError(
-              "String length larger than allowed limit of 1 GB character 
data.");
+          context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
+              "replace() result",
+              PrettyPrinter::Print(StringVal::MAX_LENGTH, 
TUnit::BYTES)).c_str());
           return StringVal::null();
         }
         // Double the buffer size whenever it fills up to amortise cost of 
resizing.
@@ -865,11 +890,19 @@ StringVal StringFunctions::Concat(
   if (num_children == 1) return strs[0];
 
   // Loop once to compute the final size and reserve space.
-  int32_t total_size = 0;
+  int64_t total_size = 0;
   for (int32_t i = 0; i < num_children; ++i) {
     if (strs[i].is_null) return StringVal::null();
     total_size += strs[i].len;
   }
+
+  if (total_size > StringVal::MAX_LENGTH) {
+    context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
+         "Concatenated string length",
+         PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
+    return StringVal::null();
+  }
+
   // If total_size is zero, directly returns empty string
   if (total_size <= 0) return StringVal();
 
@@ -895,7 +928,7 @@ StringVal StringFunctions::ConcatWs(FunctionContext* 
context, const StringVal& s
   // count.
   int32_t valid_num_children = 0;
   int32_t valid_start_index = -1;
-  int32_t total_size = 0;
+  int64_t total_size = 0;
   for (int32_t i = 0; i < num_children; ++i) {
     if (strs[i].is_null) continue;
 
@@ -911,6 +944,13 @@ StringVal StringFunctions::ConcatWs(FunctionContext* 
context, const StringVal& s
     valid_num_children++;
   }
 
+  if (total_size > StringVal::MAX_LENGTH) {
+    context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
+         "Concatenated string length",
+         PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
+    return StringVal::null();
+  }
+
   // If all data are invalid, or data size is zero, return empty string.
   if (valid_start_index < 0 || total_size <= 0) {
     return StringVal();
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test 
b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
index 848c8a1..817ea23 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
@@ -54,7 +54,7 @@ select l_comment from tpch_parquet.lineitem union all
 select l_comment from tpch_parquet.lineitem union all
 select l_comment from tpch_parquet.lineitem) a
 ---- CATCH
-Concatenated string length larger than allowed limit of 1 GB character data
+Concatenated string length is larger than allowed limit of 1.00 GB character 
data
 =====
 ---- QUERY
 # IMPALA-2620: Allocation by UDF/UDA need to take into account of memory limit.
@@ -122,7 +122,7 @@ select length(concat_ws(',', s, s, s, s)) from (
     select l_comment from tpch.lineitem) t1
   ) t2
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+Concatenated string length is larger than allowed limit of 1.00 GB character 
data
 =====
 ---- QUERY
 set mem_limit=4gb;
@@ -132,7 +132,7 @@ select length(repeat(s, 10)) from (
     select l_comment from tpch.lineitem) t1
   ) t2
 ---- CATCH
-repeat() result is larger than allowed limit of 1 GB character data
+repeat() result is larger than allowed limit of 1.00 GB character data
 =====
 ---- QUERY
 set mem_limit=4gb;
@@ -142,7 +142,7 @@ select length(lpad(s, 1073741830, '!')) from (
     select l_comment from tpch.lineitem) t1
   ) t2
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+lpad() result is larger than allowed limit of 1.00 GB character data
 =====
 ---- QUERY
 set mem_limit=4gb;
@@ -152,14 +152,14 @@ select length(rpad(s, 1073741830, '~')) from (
     select l_comment from tpch.lineitem) t1
   ) t2
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+rpad() result is larger than allowed limit of 1.00 GB character data
 =====
 ---- QUERY
 set mem_limit=4gb;
 set mem_limit=4gb;
 select space(1073741830);
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+space() result is larger than allowed limit of 1.00 GB character data
 =====
 ---- QUERY
 set mem_limit=4gb;
@@ -189,19 +189,19 @@ select replace(x, '+', '000') from (select (replace(s, ' 
', '++++++++')) x from
     select l_comment from tpch.lineitem) t1
   ) t2) t3;
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+replace() result is larger than allowed limit of 1.00 GB character data
 =====
 ---- QUERY
 set mem_limit=4gb;
 select trunc(timestamp_col, space(1073741830)) from functional.alltypes
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+space() result is larger than allowed limit of 1.00 GB character data
 =====
 ---- QUERY
 set mem_limit=4gb;
 select extract(timestamp_col, space(1073741830)) from functional.alltypes
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+space() result is larger than allowed limit of 1.00 GB character data
 =====
 ---- QUERY
 set mem_limit=4gb;
@@ -211,7 +211,7 @@ select length(madlib_encode_vector(concat_ws(',', s, s, s, 
s))) from (
     select l_comment from tpch.lineitem) t1
   ) t2
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+Concatenated string length is larger than allowed limit of 1.00 GB character 
data
 =====
 ---- QUERY
 set mem_limit=4gb;
@@ -221,7 +221,7 @@ select length(madlib_decode_vector(concat_ws(',', s, s, s, 
s))) from (
     select l_comment from tpch.lineitem) t1
   ) t2
 ---- CATCH
-String length larger than allowed limit of 1 GB character data
+Concatenated string length is larger than allowed limit of 1.00 GB character 
data
 =====
 ---- QUERY
 set mem_limit=4gb;
@@ -244,5 +244,42 @@ INT,INT
 set mem_limit=4gb;
 select repeat('the quick brown fox', 1024 * 1024 * 100)
 ---- CATCH
-repeat() result is larger than allowed limit of 1 GB character data
+repeat() result is larger than allowed limit of 1.00 GB character data
+=====
+---- QUERY
+set mem_limit=4gb;
+select lpad(s, 1024 * 1024 * 1024 * 5, ' ') from (
+  select group_concat(l_comment, "!") s from (
+    select l_comment from tpch.lineitem union all
+    select l_comment from tpch.lineitem) t1
+  ) t2
+---- CATCH
+lpad() result is larger than allowed limit of 1.00 GB character data
+=====
+---- QUERY
+set mem_limit=4gb;
+select rpad(s, 1024 * 1024 * 1024 * 5, ' ') from (
+  select group_concat(l_comment, "!") s from (
+    select l_comment from tpch.lineitem union all
+    select l_comment from tpch.lineitem) t1
+  ) t2
+---- CATCH
+rpad() result is larger than allowed limit of 1.00 GB character data
+=====
+---- QUERY
+set mem_limit=4gb;
+select space(1024 * 1024 * 1024 * 5);
+---- CATCH
+space() result is larger than allowed limit of 1.00 GB character data
+=====
+---- QUERY
+set mem_limit=8gb;
+select concat(lpad(s, 1073741820 , ' '),lpad(s, 1073741820 , ' '),lpad(s, 
1073741820 , ' '),lpad(s, 1073741820 , ' '),lpad(s, 1073741820 , ' ')) from (
+  select group_concat(l_comment, "!") s from (
+    select l_comment from tpch.lineitem union all
+    select l_comment from tpch.lineitem) t1
+  ) t2
+---- CATCH
+Concatenated string length is larger than allowed limit of 1.00 GB character 
data
 =====
+

Reply via email to