Repository: incubator-impala
Updated Branches:
  refs/heads/master 894bb7785 -> 26eaa2660


IMPALA-4729: Implement REPLACE()

This turned out to be slightly non-trivial as REPLACE is already a
keyword, and thus the parser needs to be tweaked to allow this,
since function names act as bare identifiers.

It was difficult to get this to match performance of regexp_replace.
For expanding patterns, the fact that regexp_replace copies the
expansion inline means that it may in fact win on large strings
with sparse matches that are > dcache size apart.  Let's leave
optimizing that for later.

Testing: Added a full test for maximum size strings and got most
of the boundary conditions I could identify.  Manually ran queries
on TPC-H dataset in impala to verify both performance and correctness.
Added large string and exprs.test test clauses and ran the tests to
verify they work as expected.

Change-Id: I1780a7d8fee6d0db9dad148217fb6eb10f773329
Reviewed-on: http://gerrit.cloudera.org:8080/5776
Reviewed-by: Dan Hecht <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0715a303
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0715a303
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0715a303

Branch: refs/heads/master
Commit: 0715a303ea0db01c108ece3d7692da158ada5c20
Parents: 894bb77
Author: Zach Amsden <[email protected]>
Authored: Tue Jan 24 03:04:38 2017 +0000
Committer: Impala Public Jenkins <[email protected]>
Committed: Wed Feb 15 01:33:23 2017 +0000

----------------------------------------------------------------------
 be/src/exprs/expr-test.cc                       | 116 ++++++++++++++
 be/src/exprs/string-functions-ir.cc             | 160 +++++++++++++++++++
 be/src/exprs/string-functions.h                 |   4 +
 be/src/gutil/bits.h                             |   3 -
 be/src/udf/udf-internal.h                       |   5 +
 be/src/udf/udf-test-harness.cc                  |   5 +-
 be/src/udf/udf-test-harness.h                   |   4 +-
 be/src/udf/udf.cc                               |  41 ++++-
 be/src/udf/udf.h                                |  14 ++
 common/function-registry/impala_functions.py    |   3 +
 fe/src/main/cup/sql-parser.cup                  |   4 +-
 .../org/apache/impala/analysis/ParserTest.java  |   8 +-
 .../queries/QueryTest/exprs.test                |  27 ++++
 .../queries/QueryTest/large_strings.test        |  18 +++
 14 files changed, 400 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index 52e74c5..5f3d1d0 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -36,12 +36,14 @@
 #include "exprs/like-predicate.h"
 #include "exprs/literal.h"
 #include "exprs/null-literal.h"
+#include "exprs/string-functions.h"
 #include "exprs/timestamp-functions.h"
 #include "gen-cpp/Exprs_types.h"
 #include "gen-cpp/hive_metastore_types.h"
 #include "rpc/thrift-client.h"
 #include "rpc/thrift-server.h"
 #include "runtime/runtime-state.h"
+#include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
 #include "runtime/raw-value.inline.h"
 #include "runtime/string-value.h"
@@ -50,6 +52,7 @@
 #include "service/impala-server.h"
 #include "testutil/impalad-query-executor.h"
 #include "testutil/in-process-servers.h"
+#include "udf/udf-test-harness.h"
 #include "util/debug-util.h"
 #include "util/string-parser.h"
 #include "util/test-info.h"
@@ -2085,6 +2088,119 @@ TEST_F(ExprTest, StringFunctions) {
     TestIsNull(length_aliases[i] + "(NULL)", TYPE_INT);
   }
 
+  TestStringValue("replace('aaaaaa', 'a', 'b')", "bbbbbb");
+  TestStringValue("replace('aaaaaa', 'aa', 'b')", "bbb");
+  TestStringValue("replace('aaaaaaa', 'aa', 'b')", "bbba");
+  TestStringValue("replace('zzzaaaaaaqqq', 'a', 'b')", "zzzbbbbbbqqq");
+  TestStringValue("replace('zzzaaaaaaaqqq', 'aa', 'b')", "zzzbbbaqqq");
+  TestStringValue("replace('aaaaaaaaaaaaaaaaaaaa', 'a', 'b')", 
"bbbbbbbbbbbbbbbbbbbb");
+  TestStringValue("replace('bobobobobo', 'bobo', 'a')", "aabo");
+  TestStringValue("replace('abc', 'abcd', 'a')", "abc");
+  TestStringValue("replace('aaaaaa', '', 'b')", "aaaaaa");
+  TestStringValue("replace('abc', 'abc', '')", "");
+  TestStringValue("replace('abcdefg', 'z', '')", "abcdefg");
+  TestStringValue("replace('', 'zoltan', 'sultan')", "");
+  TestStringValue("replace('strstrstr', 'str', 'strstr')", 
"strstrstrstrstrstr");
+  TestStringValue("replace('aaaaaa', 'a', '')", "");
+  TestIsNull("replace(NULL, 'foo', 'bar')", TYPE_STRING);
+  TestIsNull("replace('zomg', 'foo', NULL)", TYPE_STRING);
+  TestIsNull("replace('abc', NULL, 'a')", TYPE_STRING);
+
+  // Do some tests with huge strings
+  const int huge_size = 500000;
+  std::string huge_str;
+  huge_str.reserve(huge_size);
+  huge_str.append(huge_size, 'A');
+
+  std::string huge_space;
+  huge_space.reserve(huge_size);
+  huge_space.append(huge_size, ' ');
+
+  std::string huger_str;
+  huger_str.reserve(3 * huge_size);
+  huger_str.append(3 * huge_size, 'A');
+
+  TestStringValue("replace('" + huge_str + "', 'A', ' ')", huge_space);
+  TestStringValue("replace('" + huge_str + "', 'A', '')", "");
+
+  TestStringValue("replace('" + huge_str + "', 'A', 'AAA')", huger_str);
+  TestStringValue("replace('" + huger_str + "', 'AAA', 'A')", huge_str);
+
+  auto* giga_buf = new std::array<uint8_t, StringVal::MAX_LENGTH>;
+  giga_buf->fill('A');
+  (*giga_buf)[0] = 'Z';
+  (*giga_buf)[10] = 'Z';
+  (*giga_buf)[100] = 'Z';
+  (*giga_buf)[1000] = 'Z';
+  (*giga_buf)[StringVal::MAX_LENGTH-1] = 'Z';
+
+  // Hack up a function context so we can call Replace functions directly.
+  MemTracker m;
+  MemPool pool(&m);
+  FunctionContext::TypeDesc str_desc;
+  str_desc.type = FunctionContext::Type::TYPE_STRING;
+  std::vector<FunctionContext::TypeDesc> v(3, str_desc);
+  auto context = UdfTestHarness::CreateTestContext(str_desc, v, nullptr, 
&pool);
+
+  StringVal giga(static_cast<uint8_t*>(giga_buf->data()), 
StringVal::MAX_LENGTH);
+  StringVal a("A");
+  StringVal z("Z");
+  StringVal aaa("aaa");
+
+  // Replace z's with a's on giga
+  auto r1 = StringFunctions::Replace(context, giga, z, a);
+  EXPECT_EQ(r1.ptr[0], 'A');
+  EXPECT_EQ(r1.ptr[10], 'A');
+  EXPECT_EQ(r1.ptr[100], 'A');
+  EXPECT_EQ(r1.ptr[1000], 'A');
+  EXPECT_EQ(r1.ptr[StringVal::MAX_LENGTH-1], 'A');
+
+  // Entire string match is legal
+  auto r2 = StringFunctions::Replace(context, giga, giga, a);
+  EXPECT_EQ(r2, a);
+
+  // So is replacing giga with itself
+  auto r3 = StringFunctions::Replace(context, giga, giga, giga);
+  EXPECT_EQ(r3.ptr[0], 'Z');
+  EXPECT_EQ(r3.ptr[10], 'Z');
+  EXPECT_EQ(r3.ptr[100], 'Z');
+  EXPECT_EQ(r3.ptr[1000], 'Z');
+  EXPECT_EQ(r3.ptr[StringVal::MAX_LENGTH-1], 'Z');
+
+  // Expect expansion to fail as soon as possible; test with unallocated 
string space
+  // This tests overflowing the first allocation.
+  auto* short_buf = new std::array<uint8_t, 4096>;
+  short_buf->fill('A');
+  (*short_buf)[1000] = 'Z';
+  StringVal bam(static_cast<uint8_t*>(short_buf->data()), 
StringVal::MAX_LENGTH);
+  auto r4 = StringFunctions::Replace(context, bam, z, aaa);
+  EXPECT_TRUE(r4.is_null);
+
+  // Similar test for second overflow.  This tests overflowing on 
re-allocation.
+  (*short_buf)[4095] = 'Z';
+  StringVal bam2(static_cast<uint8_t*>(short_buf->data()), 
StringVal::MAX_LENGTH-2);
+  auto r5 = StringFunctions::Replace(context, bam2, z, aaa);
+  EXPECT_TRUE(r5.is_null);
+
+  // Finally, test expanding to exactly MAX_LENGTH
+  // There are 4 Zs in giga4 (not including the trailing one, as we truncate 
that)
+  StringVal giga4(static_cast<uint8_t*>(giga_buf->data()), 
StringVal::MAX_LENGTH-8);
+  auto r6 = StringFunctions::Replace(context, giga4, z, aaa);
+  EXPECT_EQ(strncmp((char*)&r6.ptr[0], "aaaA", 4), 0);
+  EXPECT_EQ(r6.len, 1 << 30);
+
+  // Finally, an expansion in the last string position
+  (*giga_buf)[StringVal::MAX_LENGTH-11] = 'Z';
+  StringVal giga5(static_cast<uint8_t*>(giga_buf->data()), 
StringVal::MAX_LENGTH-10);
+  auto r7 = StringFunctions::Replace(context, giga5, z, aaa);
+  EXPECT_EQ(r7.len, 1 << 30);
+  EXPECT_EQ(strncmp((char*)&r7.ptr[StringVal::MAX_LENGTH-4], "Aaaa", 4), 0);
+
+  UdfTestHarness::CloseContext(context);
+  delete giga_buf;
+  delete short_buf;
+  pool.FreeAll();
+
   TestStringValue("reverse('abcdefg')", "gfedcba");
   TestStringValue("reverse('')", "");
   TestIsNull("reverse(NULL)", TYPE_STRING);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/exprs/string-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions-ir.cc 
b/be/src/exprs/string-functions-ir.cc
index 54efbb3..dcef9cf 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -23,10 +23,13 @@
 #include <re2/stringpiece.h>
 #include <bitset>
 
+#include <boost/static_assert.hpp>
+
 #include "exprs/anyval-util.h"
 #include "exprs/expr.h"
 #include "runtime/string-value.inline.h"
 #include "runtime/tuple-row.h"
+#include "util/bit-util.h"
 #include "util/coding-util.h"
 #include "util/url-parser.h"
 
@@ -204,6 +207,163 @@ StringVal StringFunctions::InitCap(FunctionContext* 
context, const StringVal& st
   return result;
 }
 
+struct ReplaceContext {
+  ReplaceContext(StringVal *pattern_in) {
+    pattern = StringValue::FromStringVal(*pattern_in);
+    search = StringSearch(&pattern);
+  }
+  StringValue pattern;
+  StringSearch search;
+};
+
+void StringFunctions::ReplacePrepare(FunctionContext* context,
+    FunctionContext::FunctionStateScope scope) {
+  if (scope != FunctionContext::FRAGMENT_LOCAL) return;
+  if (!context->IsArgConstant(1)) return;
+  DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING);
+  StringVal* pattern = 
reinterpret_cast<StringVal*>(context->GetConstantArg(1));
+  if (pattern->is_null || pattern->len == 0) return;
+
+  struct ReplaceContext* replace = context->Allocate<ReplaceContext>();
+  if (replace != nullptr) {
+    new(replace) ReplaceContext(pattern);
+    context->SetFunctionState(scope, replace);
+  }
+}
+
+void StringFunctions::ReplaceClose(FunctionContext* context,
+    FunctionContext::FunctionStateScope scope) {
+  if (scope != FunctionContext::FRAGMENT_LOCAL) return;
+  ReplaceContext* rptr = reinterpret_cast<ReplaceContext*>
+      (context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
+  if (rptr != nullptr) context->Free(reinterpret_cast<uint8_t*>(rptr));
+}
+
+StringVal StringFunctions::Replace(FunctionContext* context, const StringVal& 
str,
+    const StringVal& pattern, const StringVal& replace) {
+  DCHECK_LE(str.len, StringVal::MAX_LENGTH);
+  DCHECK_LE(pattern.len, StringVal::MAX_LENGTH);
+  DCHECK_LE(replace.len, StringVal::MAX_LENGTH);
+  if (str.is_null || pattern.is_null || replace.is_null) return 
StringVal::null();
+  if (pattern.len == 0 || pattern.len > str.len) return str;
+
+  // StringSearch keeps a pointer to the StringValue object, so it must remain
+  // in scope if used.
+  StringSearch search;
+  StringValue needle;
+  const StringSearch *search_ptr;
+  const ReplaceContext* rptr = reinterpret_cast<ReplaceContext*>
+      (context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
+  if (UNLIKELY(rptr == nullptr)) {
+    needle = StringValue::FromStringVal(pattern);
+    search = StringSearch(&needle);
+    search_ptr = &search;
+  } else {
+    search_ptr = &rptr->search;
+  }
+
+  const StringValue haystack = StringValue::FromStringVal(str);
+  int64_t match_pos = search_ptr->Search(&haystack);
+
+  // No match?  Skip everything.
+  if (match_pos < 0) return str;
+
+  DCHECK_GT(pattern.len, 0);
+  DCHECK_GE(haystack.len, pattern.len);
+  int buffer_space;
+  const int delta = replace.len - pattern.len;
+  // MAX_LENGTH is unsigned, so convert back to int to do correctly signed 
compare
+  DCHECK_LE(delta, static_cast<int>(StringVal::MAX_LENGTH) - 1);
+  if ((delta > 0 && delta < 128) && haystack.len <= 128) {
+    // Quick estimate for potential matches - this heuristic is needed to win
+    // over regexp_replace on expanding patterns.  128 is arbitrarily chosen so
+    // we can't massively over-estimate the buffer size.
+    int matches_possible = 0;
+    char c = pattern.ptr[0];
+    for (int i = 0; i <= haystack.len - pattern.len; ++i) {
+      if (haystack.ptr[i] == c) ++matches_possible;
+    }
+    buffer_space = haystack.len + matches_possible * delta;
+  } else {
+    // Note - cannot overflow because pattern.len is at least one
+    static_assert(StringVal::MAX_LENGTH - 1 + StringVal::MAX_LENGTH <=
+        std::numeric_limits<decltype(buffer_space)>::max(),
+        "Buffer space computation can overflow");
+    buffer_space = haystack.len + delta;
+  }
+
+  StringVal result(context, buffer_space);
+  // If the result went over MAX_LENGTH, we can get a null result back
+  if (UNLIKELY(result.is_null)) return result;
+
+  uint8_t* ptr = result.ptr;
+  int consumed = 0;
+  while (match_pos + pattern.len <= haystack.len) {
+    // Copy in original string
+    const int unmatched_bytes = match_pos - consumed;
+    memcpy(ptr, &haystack.ptr[consumed], unmatched_bytes);
+    DCHECK_LE(ptr - result.ptr + unmatched_bytes, buffer_space);
+    ptr += unmatched_bytes;
+
+    // Copy in replacement - always safe since we always leave room for one 
more replace
+    DCHECK_LE(ptr - result.ptr + replace.len, buffer_space);
+    memcpy(ptr, replace.ptr, replace.len);
+    ptr += replace.len;
+
+    // Don't want to re-match within already replaced pattern
+    match_pos += pattern.len;
+    consumed = match_pos;
+
+    StringValue haystack_substring = haystack.Substring(match_pos);
+    int match_pos_in_substring = search_ptr->Search(&haystack_substring);
+    if (match_pos_in_substring < 0) break;
+
+    match_pos += match_pos_in_substring;
+
+    // If we had an enlarging pattern, we may need more space
+    if (delta > 0) {
+      const int bytes_produced = ptr - result.ptr;
+      const int bytes_remaining = haystack.len - consumed;
+      DCHECK_LE(bytes_produced, StringVal::MAX_LENGTH);
+      DCHECK_LE(bytes_remaining, StringVal::MAX_LENGTH - 1);
+      // Note: by above, cannot overflow
+      const int min_output = bytes_produced + bytes_remaining;
+      DCHECK_LE(min_output, StringVal::MAX_LENGTH);
+      // Also no overflow: min_output <= MAX_LENGTH and delta <= MAX_LENGTH - 1
+      const int64_t space_needed = min_output + delta;
+      if (UNLIKELY(space_needed > buffer_space)) {
+        // Double at smaller sizes, but don't grow more than a megabyte a
+        // time at larger sizes.  Reasoning: let the allocator do its job
+        // and don't depend on policy here.
+        static_assert(StringVal::MAX_LENGTH % (1 << 20) == 0,
+            "Math requires StringVal::MAX_LENGTH to be a multiple of 1MB");
+        // Must compute next power of two using 64-bit math to avoid signed 
overflow
+        // The following DCHECK was supposed to be a static assertion, but 
C++11 is
+        // broken and doesn't declare std::min or std::max to be constexpr.  
Fix this
+        // when eventually the minimum supported standard is raised to at 
least C++14
+        DCHECK_EQ(static_cast<int>(std::min<int64_t>(
+            BitUtil::RoundUpToPowerOfTwo(StringVal::MAX_LENGTH+1),
+            StringVal::MAX_LENGTH + (1 << 20))),
+            StringVal::MAX_LENGTH + (1 << 20));
+        buffer_space = static_cast<int>(std::min<int64_t>(
+            BitUtil::RoundUpToPowerOfTwo(space_needed),
+            space_needed + (1 << 20)));
+        if (UNLIKELY(!result.Resize(context, buffer_space))) return 
StringVal::null();
+        // Don't forget to move the pointer
+        ptr = result.ptr + bytes_produced;
+      }
+    }
+  }
+
+  // Copy in remainder and re-adjust size
+  const int bytes_remaining = haystack.len - consumed;
+  result.len = ptr - result.ptr + bytes_remaining;
+  DCHECK_LE(result.len, buffer_space);
+  memcpy(ptr, &haystack.ptr[consumed], bytes_remaining);
+
+  return result;
+}
+
 StringVal StringFunctions::Reverse(FunctionContext* context, const StringVal& 
str) {
   if (str.is_null) return StringVal::null();
   StringVal result(context, str.len);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/exprs/string-functions.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index fb3a900..3dad5f6 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -53,6 +53,10 @@ class StringFunctions {
   static StringVal Lower(FunctionContext*, const StringVal& str);
   static StringVal Upper(FunctionContext*, const StringVal& str);
   static StringVal InitCap(FunctionContext*, const StringVal& str);
+  static void ReplacePrepare(FunctionContext*, 
FunctionContext::FunctionStateScope);
+  static void ReplaceClose(FunctionContext*, 
FunctionContext::FunctionStateScope);
+  static StringVal Replace(FunctionContext*, const StringVal& str,
+      const StringVal& pattern, const StringVal& replace);
   static StringVal Reverse(FunctionContext*, const StringVal& str);
   static StringVal Translate(FunctionContext*, const StringVal& str, const 
StringVal& src,
       const StringVal& dst);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/gutil/bits.h
----------------------------------------------------------------------
diff --git a/be/src/gutil/bits.h b/be/src/gutil/bits.h
index 579d954..bce7f69 100644
--- a/be/src/gutil/bits.h
+++ b/be/src/gutil/bits.h
@@ -4,9 +4,6 @@
 
 #include "gutil/basictypes.h"
 #include "gutil/integral_types.h"
-#include <glog/logging.h>
-#include "gutil/logging-inl.h"
-#include "gutil/macros.h"
 
 #ifndef _BITS_H_
 #define _BITS_H_

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf-internal.h
----------------------------------------------------------------------
diff --git a/be/src/udf/udf-internal.h b/be/src/udf/udf-internal.h
index 907bb22..dc8b0b5 100644
--- a/be/src/udf/udf-internal.h
+++ b/be/src/udf/udf-internal.h
@@ -92,6 +92,11 @@ class FunctionContextImpl {
   /// TODO: free them at the batch level and save some copies?
   uint8_t* AllocateLocal(int64_t byte_size) noexcept;
 
+  /// Resize a local allocation.
+  /// If the new allocation causes the memory limit to be exceeded, the error 
will be set
+  /// in this object causing the query to fail.
+  uint8_t* ReallocateLocal(uint8_t* ptr, int64_t byte_size) noexcept;
+
   /// Frees all allocations returned by AllocateLocal().
   void FreeLocalAllocations() noexcept;
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf-test-harness.cc
----------------------------------------------------------------------
diff --git a/be/src/udf/udf-test-harness.cc b/be/src/udf/udf-test-harness.cc
index d34b5ee..46cbbe0 100644
--- a/be/src/udf/udf-test-harness.cc
+++ b/be/src/udf/udf-test-harness.cc
@@ -28,8 +28,9 @@ using namespace impala;
 
 FunctionContext* UdfTestHarness::CreateTestContext(
     const FunctionContext::TypeDesc& return_type,
-    const vector<FunctionContext::TypeDesc>& arg_types, RuntimeState* state) {
-  return FunctionContextImpl::CreateContext(state, NULL, return_type, 
arg_types, 0, true);
+    const vector<FunctionContext::TypeDesc>& arg_types, RuntimeState* state,
+    MemPool* pool) {
+  return FunctionContextImpl::CreateContext(state, pool, return_type, 
arg_types, 0, true);
 }
 
 void UdfTestHarness::SetConstantArgs(

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf-test-harness.h
----------------------------------------------------------------------
diff --git a/be/src/udf/udf-test-harness.h b/be/src/udf/udf-test-harness.h
index 668a44d..f730d98 100644
--- a/be/src/udf/udf-test-harness.h
+++ b/be/src/udf/udf-test-harness.h
@@ -28,6 +28,8 @@
 #include "udf/udf.h"
 #include "udf/udf-debug.h"
 
+namespace impala { class MemPool; }
+
 namespace impala_udf {
 
 /// Utility class to help test UDFs.
@@ -38,7 +40,7 @@ class UdfTestHarness {
   /// for calling delete on it. This context has additional debugging 
validation enabled.
   static FunctionContext* CreateTestContext(const FunctionContext::TypeDesc& 
return_type,
       const std::vector<FunctionContext::TypeDesc>& arg_types,
-      impala::RuntimeState* state = nullptr);
+      impala::RuntimeState* state = nullptr, impala::MemPool* pool = nullptr);
 
   /// Use with test contexts to test use of IsArgConstant() and 
GetConstantArg().
   /// constant_args should contain an AnyVal* for each argument of the UDF not 
including

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf.cc
----------------------------------------------------------------------
diff --git a/be/src/udf/udf.cc b/be/src/udf/udf.cc
index c2a5270..a013638 100644
--- a/be/src/udf/udf.cc
+++ b/be/src/udf/udf.cc
@@ -17,6 +17,7 @@
 
 #include "udf/udf.h"
 
+#include <algorithm>
 #include <iostream>
 #include <sstream>
 #include <assert.h>
@@ -449,6 +450,29 @@ uint8_t* FunctionContextImpl::AllocateLocal(int64_t 
byte_size) noexcept {
   return buffer;
 }
 
+uint8_t* FunctionContextImpl::ReallocateLocal(uint8_t* ptr, int64_t byte_size) 
noexcept {
+  assert(!closed_);
+  uint8_t* new_ptr  = pool_->Reallocate(ptr, byte_size);
+  if (UNLIKELY(!CheckAllocResult("FunctionContextImpl::ReallocateLocal",
+      new_ptr, byte_size))) {
+    return NULL;
+  }
+  if (new_ptr != ptr) {
+    auto v = std::find(local_allocations_.rbegin(), local_allocations_.rend(), 
ptr);
+    assert(v != local_allocations_.rend());
+    // Avoid perf issue; move to end of local allocations on any reallocation 
and
+    // always start the search from there.
+    if (v != local_allocations_.rbegin()) {
+      *v = *local_allocations_.rbegin();
+    }
+    *local_allocations_.rbegin() = new_ptr;
+  }
+  VLOG_ROW << "Reallocate Local: FunctionContext=" << context_
+           << " ptr=" << reinterpret_cast<void*>(ptr) << " size=" << byte_size
+           << " result=" << reinterpret_cast<void*>(new_ptr);
+  return new_ptr;
+}
+
 void FunctionContextImpl::FreeLocalAllocations() noexcept {
   assert(!closed_);
   if (VLOG_ROW_IS_ON) {
@@ -479,7 +503,6 @@ void FunctionContextImpl::SetNonConstantArgs(
 // expr-ir.cc. This could probably use further investigation.
 StringVal::StringVal(FunctionContext* context, int len) noexcept : len(len), 
ptr(NULL) {
   if (UNLIKELY(len > StringVal::MAX_LENGTH)) {
-    std::cout << "MAX_LENGTH, Trying to allocate " << len;
     context->SetError("String length larger than allowed limit of "
                       "1 GB character data.");
     len = 0;
@@ -504,6 +527,22 @@ StringVal StringVal::CopyFrom(FunctionContext* ctx, const 
uint8_t* buf, size_t l
   return result;
 }
 
+bool StringVal::Resize(FunctionContext* ctx, int new_len) noexcept {
+  if (UNLIKELY(new_len > StringVal::MAX_LENGTH)) {
+    ctx->SetError("String length larger than allowed limit of 1 GB character 
data.");
+    len = 0;
+    is_null = true;
+    return false;
+  }
+  auto* new_ptr = ctx->impl()->ReallocateLocal(ptr, new_len);
+  if (new_ptr != nullptr) {
+    ptr = new_ptr;
+    len = new_len;
+    return true;
+  }
+  return false;
+}
+
 // TODO: why doesn't libudasample.so build if this in udf-ir.cc?
 const FunctionContext::TypeDesc* FunctionContext::GetArgType(int arg_idx) 
const {
   if (arg_idx < 0 || arg_idx >= impl_->arg_types_.size()) return NULL;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf.h
----------------------------------------------------------------------
diff --git a/be/src/udf/udf.h b/be/src/udf/udf.h
index 4fdca2d..d4d6602 100644
--- a/be/src/udf/udf.h
+++ b/be/src/udf/udf.h
@@ -582,8 +582,22 @@ struct StringVal : public AnyVal {
   /// If the memory allocation fails, e.g. because the intermediate value 
would be too
   /// large, the constructor will construct a NULL string and set an error on 
the function
   /// context.
+  ///
+  /// The memory backing this StringVal is a local allocation, and so doesn't 
need
+  /// to be explicitly freed.
   StringVal(FunctionContext* context, int len) noexcept;
 
+  /// Reallocate a StringVal that is backed by a local allocation so that it as
+  /// at least as large as len.  May shrink or / expand the string.  If the
+  /// string is expanded, the content of the new space is undefined.
+  ///
+  /// If the resize fails, the original StringVal remains in place.  Callers 
do not
+  /// otherwise need to be concerned with backing storage, which is allocated 
from a
+  /// local allocation.
+  ///
+  /// Returns true on success, false on failure.
+  bool Resize(FunctionContext* context, int len) noexcept;
+
   /// Will create a new StringVal with the given dimension and copy the data 
from the
   /// parameters. In case of an error will return a NULL string and set an 
error on the
   /// function context.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/common/function-registry/impala_functions.py
----------------------------------------------------------------------
diff --git a/common/function-registry/impala_functions.py 
b/common/function-registry/impala_functions.py
index 2141a35..fbe6719 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -423,6 +423,9 @@ visible_functions = [
   [['lower', 'lcase'], 'STRING', ['STRING'], 'impala::StringFunctions::Lower'],
   [['upper', 'ucase'], 'STRING', ['STRING'], 'impala::StringFunctions::Upper'],
   [['initcap'], 'STRING', ['STRING'], 'impala::StringFunctions::InitCap'],
+  [['replace'], 'STRING', ['STRING', 'STRING', 'STRING'], 
'impala::StringFunctions::Replace',
+   
'_ZN6impala15StringFunctions14ReplacePrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   
'_ZN6impala15StringFunctions12ReplaceCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
   [['reverse'], 'STRING', ['STRING'], 'impala::StringFunctions::Reverse'],
   [['translate'], 'STRING', ['STRING', 'STRING', 'STRING'],
    'impala::StringFunctions::Translate'],

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/fe/src/main/cup/sql-parser.cup
----------------------------------------------------------------------
diff --git a/fe/src/main/cup/sql-parser.cup b/fe/src/main/cup/sql-parser.cup
index adef592..867850d 100644
--- a/fe/src/main/cup/sql-parser.cup
+++ b/fe/src/main/cup/sql-parser.cup
@@ -2633,9 +2633,11 @@ non_pred_expr ::=
   {: RESULT = e; :}
   | analytic_expr:e
   {: RESULT = e; :}
-  /* Since "IF", "TRUNCATE" are keywords, need to special case these functions 
*/
+  /* Since "IF", "REPLACE", "TRUNCATE" are keywords, need to special case 
these functions */
   | KW_IF LPAREN expr_list:exprs RPAREN
   {: RESULT = new FunctionCallExpr("if", exprs); :}
+  | KW_REPLACE LPAREN expr_list:exprs RPAREN
+  {: RESULT = new FunctionCallExpr("replace", exprs); :}
   | KW_TRUNCATE LPAREN expr_list:exprs RPAREN
   {: RESULT = new FunctionCallExpr("truncate", exprs); :}
   | cast_expr:c

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/fe/src/test/java/org/apache/impala/analysis/ParserTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/ParserTest.java 
b/fe/src/test/java/org/apache/impala/analysis/ParserTest.java
index 14cd036..d0176d2 100644
--- a/fe/src/test/java/org/apache/impala/analysis/ParserTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/ParserTest.java
@@ -3027,7 +3027,7 @@ public class ParserTest extends FrontendTestBase {
         "       ^\n" +
         "Encountered: FROM\n" +
         "Expected: ALL, CASE, CAST, DEFAULT, DISTINCT, EXISTS, " +
-        "FALSE, IF, INTERVAL, NOT, NULL, " +
+        "FALSE, IF, INTERVAL, NOT, NULL, REPLACE, " +
         "STRAIGHT_JOIN, TRUNCATE, TRUE, IDENTIFIER\n");
 
     // missing from
@@ -3054,7 +3054,7 @@ public class ParserTest extends FrontendTestBase {
         "                           ^\n" +
         "Encountered: EOF\n" +
         "Expected: CASE, CAST, DEFAULT, EXISTS, FALSE, " +
-        "IF, INTERVAL, NOT, NULL, TRUNCATE, TRUE, IDENTIFIER\n");
+        "IF, INTERVAL, NOT, NULL, REPLACE, TRUNCATE, TRUE, IDENTIFIER\n");
 
     // missing predicate in where clause (group by)
     ParserError("select c, b, c from t where group by a, b",
@@ -3063,7 +3063,7 @@ public class ParserTest extends FrontendTestBase {
         "                            ^\n" +
         "Encountered: GROUP\n" +
         "Expected: CASE, CAST, DEFAULT, EXISTS, FALSE, " +
-        "IF, INTERVAL, NOT, NULL, TRUNCATE, TRUE, IDENTIFIER\n");
+        "IF, INTERVAL, NOT, NULL, REPLACE, TRUNCATE, TRUE, IDENTIFIER\n");
 
     // unmatched string literal starting with "
     ParserError("select c, \"b, c from t",
@@ -3124,7 +3124,7 @@ public class ParserTest extends FrontendTestBase {
         "                             ^\n" +
         "Encountered: COMMA\n" +
         "Expected: CASE, CAST, DEFAULT, EXISTS, FALSE, " +
-        "IF, INTERVAL, NOT, NULL, TRUNCATE, TRUE, IDENTIFIER\n");
+        "IF, INTERVAL, NOT, NULL, REPLACE, TRUNCATE, TRUE, IDENTIFIER\n");
 
     // Parsing identifiers that have different names printed as EXPECTED
     ParserError("DROP DATA SRC foo",

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/testdata/workloads/functional-query/queries/QueryTest/exprs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test 
b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index d611884..fa3d225 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -1931,6 +1931,33 @@ CAUSED BY: InternalException: Could not compile regexp 
pattern: *
 Error: no argument for repetition operator: *
 ====
 ---- QUERY
+select sum(length(replace(y, x, 'bbbbbbbbbbb')))
+from (select cast(round(float_col) AS STRING) as x, string_col as y
+      from functional.alltypes) v;
+---- TYPES
+BIGINT
+---- RESULTS
+43800
+====
+---- QUERY
+select sum(length(replace(y, '0', x)))
+from (select cast(round(float_col) AS STRING) as x, string_col as y
+      from functional.alltypes) v;
+---- TYPES
+BIGINT
+---- RESULTS
+7300
+====
+---- QUERY
+select sum(length(concat(replace(y, '0', x), replace(y, '0', x))))
+from (select cast(round(float_col) AS STRING) as x, string_col as y
+      from functional.alltypes) v;
+---- TYPES
+BIGINT
+---- RESULTS
+14600
+====
+---- QUERY
 # Test for factorial operator
 select distinct int_col, int_col! from functional.alltypes order by 1
 ---- RESULTS

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test 
b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
index 5c90869..4419953 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test
@@ -164,6 +164,24 @@ select length(regexp_replace(s, '.', '++++++++')) from (
 String length larger than allowed limit of 1 GB character data
 =====
 ---- QUERY
+select length(replace(s, ' ', '++++++++')) from (
+  select group_concat(l_comment, "!") s from (
+    select l_comment from tpch.lineitem union all
+    select l_comment from tpch.lineitem) t1
+  ) t2
+---- RESULTS
+625718301
+=====
+---- QUERY
+select replace(x, '+', '000') from (select (replace(s, ' ', '++++++++')) x 
from (
+  select group_concat(l_comment, "!") s from (
+    select l_comment from tpch.lineitem union all
+    select l_comment from tpch.lineitem) t1
+  ) t2) t3;
+---- CATCH
+String length larger than allowed limit of 1 GB character data
+=====
+---- QUERY
 select trunc(timestamp_col, space(1073741830)) from functional.alltypes
 ---- CATCH
 String length larger than allowed limit of 1 GB character data

Reply via email to