Repository: incubator-impala Updated Branches: refs/heads/master 894bb7785 -> 26eaa2660
IMPALA-4729: Implement REPLACE() This turned out to be slightly non-trivial as REPLACE is already a keyword, and thus the parser needs to be tweaked to allow this, since function names act as bare identifiers. It was difficult to get this to match performance of regexp_replace. For expanding patterns, the fact that regexp_replace copies the expansion inline means that it may in fact win on large strings with sparse matches that are > dcache size apart. Let's leave optimizing that for later. Testing: Added a full test for maximum size strings and got most of the boundary conditions I could identify. Manually ran queries on TPC-H dataset in impala to verify both performance and correctness. Added large string and exprs.test test clauses and ran the tests to verify they work as expected. Change-Id: I1780a7d8fee6d0db9dad148217fb6eb10f773329 Reviewed-on: http://gerrit.cloudera.org:8080/5776 Reviewed-by: Dan Hecht <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0715a303 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0715a303 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0715a303 Branch: refs/heads/master Commit: 0715a303ea0db01c108ece3d7692da158ada5c20 Parents: 894bb77 Author: Zach Amsden <[email protected]> Authored: Tue Jan 24 03:04:38 2017 +0000 Committer: Impala Public Jenkins <[email protected]> Committed: Wed Feb 15 01:33:23 2017 +0000 ---------------------------------------------------------------------- be/src/exprs/expr-test.cc | 116 ++++++++++++++ be/src/exprs/string-functions-ir.cc | 160 +++++++++++++++++++ be/src/exprs/string-functions.h | 4 + be/src/gutil/bits.h | 3 - be/src/udf/udf-internal.h | 5 + be/src/udf/udf-test-harness.cc | 5 +- be/src/udf/udf-test-harness.h | 4 +- be/src/udf/udf.cc | 41 ++++- be/src/udf/udf.h | 14 ++ common/function-registry/impala_functions.py | 3 + fe/src/main/cup/sql-parser.cup | 4 +- .../org/apache/impala/analysis/ParserTest.java | 8 +- .../queries/QueryTest/exprs.test | 27 ++++ .../queries/QueryTest/large_strings.test | 18 +++ 14 files changed, 400 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/exprs/expr-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index 52e74c5..5f3d1d0 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -36,12 +36,14 @@ #include "exprs/like-predicate.h" #include "exprs/literal.h" #include "exprs/null-literal.h" +#include "exprs/string-functions.h" #include "exprs/timestamp-functions.h" #include "gen-cpp/Exprs_types.h" #include "gen-cpp/hive_metastore_types.h" #include "rpc/thrift-client.h" #include "rpc/thrift-server.h" #include "runtime/runtime-state.h" +#include "runtime/mem-pool.h" #include "runtime/mem-tracker.h" #include "runtime/raw-value.inline.h" #include "runtime/string-value.h" @@ -50,6 +52,7 @@ #include "service/impala-server.h" #include "testutil/impalad-query-executor.h" #include "testutil/in-process-servers.h" +#include "udf/udf-test-harness.h" #include "util/debug-util.h" #include "util/string-parser.h" #include "util/test-info.h" @@ -2085,6 +2088,119 @@ TEST_F(ExprTest, StringFunctions) { TestIsNull(length_aliases[i] + "(NULL)", TYPE_INT); } + TestStringValue("replace('aaaaaa', 'a', 'b')", "bbbbbb"); + TestStringValue("replace('aaaaaa', 'aa', 'b')", "bbb"); + TestStringValue("replace('aaaaaaa', 'aa', 'b')", "bbba"); + TestStringValue("replace('zzzaaaaaaqqq', 'a', 'b')", "zzzbbbbbbqqq"); + TestStringValue("replace('zzzaaaaaaaqqq', 'aa', 'b')", "zzzbbbaqqq"); + TestStringValue("replace('aaaaaaaaaaaaaaaaaaaa', 'a', 'b')", "bbbbbbbbbbbbbbbbbbbb"); + TestStringValue("replace('bobobobobo', 'bobo', 'a')", "aabo"); + TestStringValue("replace('abc', 'abcd', 'a')", "abc"); + TestStringValue("replace('aaaaaa', '', 'b')", "aaaaaa"); + TestStringValue("replace('abc', 'abc', '')", ""); + TestStringValue("replace('abcdefg', 'z', '')", "abcdefg"); + TestStringValue("replace('', 'zoltan', 'sultan')", ""); + TestStringValue("replace('strstrstr', 'str', 'strstr')", "strstrstrstrstrstr"); + TestStringValue("replace('aaaaaa', 'a', '')", ""); + TestIsNull("replace(NULL, 'foo', 'bar')", TYPE_STRING); + TestIsNull("replace('zomg', 'foo', NULL)", TYPE_STRING); + TestIsNull("replace('abc', NULL, 'a')", TYPE_STRING); + + // Do some tests with huge strings + const int huge_size = 500000; + std::string huge_str; + huge_str.reserve(huge_size); + huge_str.append(huge_size, 'A'); + + std::string huge_space; + huge_space.reserve(huge_size); + huge_space.append(huge_size, ' '); + + std::string huger_str; + huger_str.reserve(3 * huge_size); + huger_str.append(3 * huge_size, 'A'); + + TestStringValue("replace('" + huge_str + "', 'A', ' ')", huge_space); + TestStringValue("replace('" + huge_str + "', 'A', '')", ""); + + TestStringValue("replace('" + huge_str + "', 'A', 'AAA')", huger_str); + TestStringValue("replace('" + huger_str + "', 'AAA', 'A')", huge_str); + + auto* giga_buf = new std::array<uint8_t, StringVal::MAX_LENGTH>; + giga_buf->fill('A'); + (*giga_buf)[0] = 'Z'; + (*giga_buf)[10] = 'Z'; + (*giga_buf)[100] = 'Z'; + (*giga_buf)[1000] = 'Z'; + (*giga_buf)[StringVal::MAX_LENGTH-1] = 'Z'; + + // Hack up a function context so we can call Replace functions directly. + MemTracker m; + MemPool pool(&m); + FunctionContext::TypeDesc str_desc; + str_desc.type = FunctionContext::Type::TYPE_STRING; + std::vector<FunctionContext::TypeDesc> v(3, str_desc); + auto context = UdfTestHarness::CreateTestContext(str_desc, v, nullptr, &pool); + + StringVal giga(static_cast<uint8_t*>(giga_buf->data()), StringVal::MAX_LENGTH); + StringVal a("A"); + StringVal z("Z"); + StringVal aaa("aaa"); + + // Replace z's with a's on giga + auto r1 = StringFunctions::Replace(context, giga, z, a); + EXPECT_EQ(r1.ptr[0], 'A'); + EXPECT_EQ(r1.ptr[10], 'A'); + EXPECT_EQ(r1.ptr[100], 'A'); + EXPECT_EQ(r1.ptr[1000], 'A'); + EXPECT_EQ(r1.ptr[StringVal::MAX_LENGTH-1], 'A'); + + // Entire string match is legal + auto r2 = StringFunctions::Replace(context, giga, giga, a); + EXPECT_EQ(r2, a); + + // So is replacing giga with itself + auto r3 = StringFunctions::Replace(context, giga, giga, giga); + EXPECT_EQ(r3.ptr[0], 'Z'); + EXPECT_EQ(r3.ptr[10], 'Z'); + EXPECT_EQ(r3.ptr[100], 'Z'); + EXPECT_EQ(r3.ptr[1000], 'Z'); + EXPECT_EQ(r3.ptr[StringVal::MAX_LENGTH-1], 'Z'); + + // Expect expansion to fail as soon as possible; test with unallocated string space + // This tests overflowing the first allocation. + auto* short_buf = new std::array<uint8_t, 4096>; + short_buf->fill('A'); + (*short_buf)[1000] = 'Z'; + StringVal bam(static_cast<uint8_t*>(short_buf->data()), StringVal::MAX_LENGTH); + auto r4 = StringFunctions::Replace(context, bam, z, aaa); + EXPECT_TRUE(r4.is_null); + + // Similar test for second overflow. This tests overflowing on re-allocation. + (*short_buf)[4095] = 'Z'; + StringVal bam2(static_cast<uint8_t*>(short_buf->data()), StringVal::MAX_LENGTH-2); + auto r5 = StringFunctions::Replace(context, bam2, z, aaa); + EXPECT_TRUE(r5.is_null); + + // Finally, test expanding to exactly MAX_LENGTH + // There are 4 Zs in giga4 (not including the trailing one, as we truncate that) + StringVal giga4(static_cast<uint8_t*>(giga_buf->data()), StringVal::MAX_LENGTH-8); + auto r6 = StringFunctions::Replace(context, giga4, z, aaa); + EXPECT_EQ(strncmp((char*)&r6.ptr[0], "aaaA", 4), 0); + EXPECT_EQ(r6.len, 1 << 30); + + // Finally, an expansion in the last string position + (*giga_buf)[StringVal::MAX_LENGTH-11] = 'Z'; + StringVal giga5(static_cast<uint8_t*>(giga_buf->data()), StringVal::MAX_LENGTH-10); + auto r7 = StringFunctions::Replace(context, giga5, z, aaa); + EXPECT_EQ(r7.len, 1 << 30); + EXPECT_EQ(strncmp((char*)&r7.ptr[StringVal::MAX_LENGTH-4], "Aaaa", 4), 0); + + UdfTestHarness::CloseContext(context); + delete giga_buf; + delete short_buf; + pool.FreeAll(); + TestStringValue("reverse('abcdefg')", "gfedcba"); TestStringValue("reverse('')", ""); TestIsNull("reverse(NULL)", TYPE_STRING); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/exprs/string-functions-ir.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc index 54efbb3..dcef9cf 100644 --- a/be/src/exprs/string-functions-ir.cc +++ b/be/src/exprs/string-functions-ir.cc @@ -23,10 +23,13 @@ #include <re2/stringpiece.h> #include <bitset> +#include <boost/static_assert.hpp> + #include "exprs/anyval-util.h" #include "exprs/expr.h" #include "runtime/string-value.inline.h" #include "runtime/tuple-row.h" +#include "util/bit-util.h" #include "util/coding-util.h" #include "util/url-parser.h" @@ -204,6 +207,163 @@ StringVal StringFunctions::InitCap(FunctionContext* context, const StringVal& st return result; } +struct ReplaceContext { + ReplaceContext(StringVal *pattern_in) { + pattern = StringValue::FromStringVal(*pattern_in); + search = StringSearch(&pattern); + } + StringValue pattern; + StringSearch search; +}; + +void StringFunctions::ReplacePrepare(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + if (scope != FunctionContext::FRAGMENT_LOCAL) return; + if (!context->IsArgConstant(1)) return; + DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING); + StringVal* pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1)); + if (pattern->is_null || pattern->len == 0) return; + + struct ReplaceContext* replace = context->Allocate<ReplaceContext>(); + if (replace != nullptr) { + new(replace) ReplaceContext(pattern); + context->SetFunctionState(scope, replace); + } +} + +void StringFunctions::ReplaceClose(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + if (scope != FunctionContext::FRAGMENT_LOCAL) return; + ReplaceContext* rptr = reinterpret_cast<ReplaceContext*> + (context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL)); + if (rptr != nullptr) context->Free(reinterpret_cast<uint8_t*>(rptr)); +} + +StringVal StringFunctions::Replace(FunctionContext* context, const StringVal& str, + const StringVal& pattern, const StringVal& replace) { + DCHECK_LE(str.len, StringVal::MAX_LENGTH); + DCHECK_LE(pattern.len, StringVal::MAX_LENGTH); + DCHECK_LE(replace.len, StringVal::MAX_LENGTH); + if (str.is_null || pattern.is_null || replace.is_null) return StringVal::null(); + if (pattern.len == 0 || pattern.len > str.len) return str; + + // StringSearch keeps a pointer to the StringValue object, so it must remain + // in scope if used. + StringSearch search; + StringValue needle; + const StringSearch *search_ptr; + const ReplaceContext* rptr = reinterpret_cast<ReplaceContext*> + (context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL)); + if (UNLIKELY(rptr == nullptr)) { + needle = StringValue::FromStringVal(pattern); + search = StringSearch(&needle); + search_ptr = &search; + } else { + search_ptr = &rptr->search; + } + + const StringValue haystack = StringValue::FromStringVal(str); + int64_t match_pos = search_ptr->Search(&haystack); + + // No match? Skip everything. + if (match_pos < 0) return str; + + DCHECK_GT(pattern.len, 0); + DCHECK_GE(haystack.len, pattern.len); + int buffer_space; + const int delta = replace.len - pattern.len; + // MAX_LENGTH is unsigned, so convert back to int to do correctly signed compare + DCHECK_LE(delta, static_cast<int>(StringVal::MAX_LENGTH) - 1); + if ((delta > 0 && delta < 128) && haystack.len <= 128) { + // Quick estimate for potential matches - this heuristic is needed to win + // over regexp_replace on expanding patterns. 128 is arbitrarily chosen so + // we can't massively over-estimate the buffer size. + int matches_possible = 0; + char c = pattern.ptr[0]; + for (int i = 0; i <= haystack.len - pattern.len; ++i) { + if (haystack.ptr[i] == c) ++matches_possible; + } + buffer_space = haystack.len + matches_possible * delta; + } else { + // Note - cannot overflow because pattern.len is at least one + static_assert(StringVal::MAX_LENGTH - 1 + StringVal::MAX_LENGTH <= + std::numeric_limits<decltype(buffer_space)>::max(), + "Buffer space computation can overflow"); + buffer_space = haystack.len + delta; + } + + StringVal result(context, buffer_space); + // If the result went over MAX_LENGTH, we can get a null result back + if (UNLIKELY(result.is_null)) return result; + + uint8_t* ptr = result.ptr; + int consumed = 0; + while (match_pos + pattern.len <= haystack.len) { + // Copy in original string + const int unmatched_bytes = match_pos - consumed; + memcpy(ptr, &haystack.ptr[consumed], unmatched_bytes); + DCHECK_LE(ptr - result.ptr + unmatched_bytes, buffer_space); + ptr += unmatched_bytes; + + // Copy in replacement - always safe since we always leave room for one more replace + DCHECK_LE(ptr - result.ptr + replace.len, buffer_space); + memcpy(ptr, replace.ptr, replace.len); + ptr += replace.len; + + // Don't want to re-match within already replaced pattern + match_pos += pattern.len; + consumed = match_pos; + + StringValue haystack_substring = haystack.Substring(match_pos); + int match_pos_in_substring = search_ptr->Search(&haystack_substring); + if (match_pos_in_substring < 0) break; + + match_pos += match_pos_in_substring; + + // If we had an enlarging pattern, we may need more space + if (delta > 0) { + const int bytes_produced = ptr - result.ptr; + const int bytes_remaining = haystack.len - consumed; + DCHECK_LE(bytes_produced, StringVal::MAX_LENGTH); + DCHECK_LE(bytes_remaining, StringVal::MAX_LENGTH - 1); + // Note: by above, cannot overflow + const int min_output = bytes_produced + bytes_remaining; + DCHECK_LE(min_output, StringVal::MAX_LENGTH); + // Also no overflow: min_output <= MAX_LENGTH and delta <= MAX_LENGTH - 1 + const int64_t space_needed = min_output + delta; + if (UNLIKELY(space_needed > buffer_space)) { + // Double at smaller sizes, but don't grow more than a megabyte a + // time at larger sizes. Reasoning: let the allocator do its job + // and don't depend on policy here. + static_assert(StringVal::MAX_LENGTH % (1 << 20) == 0, + "Math requires StringVal::MAX_LENGTH to be a multiple of 1MB"); + // Must compute next power of two using 64-bit math to avoid signed overflow + // The following DCHECK was supposed to be a static assertion, but C++11 is + // broken and doesn't declare std::min or std::max to be constexpr. Fix this + // when eventually the minimum supported standard is raised to at least C++14 + DCHECK_EQ(static_cast<int>(std::min<int64_t>( + BitUtil::RoundUpToPowerOfTwo(StringVal::MAX_LENGTH+1), + StringVal::MAX_LENGTH + (1 << 20))), + StringVal::MAX_LENGTH + (1 << 20)); + buffer_space = static_cast<int>(std::min<int64_t>( + BitUtil::RoundUpToPowerOfTwo(space_needed), + space_needed + (1 << 20))); + if (UNLIKELY(!result.Resize(context, buffer_space))) return StringVal::null(); + // Don't forget to move the pointer + ptr = result.ptr + bytes_produced; + } + } + } + + // Copy in remainder and re-adjust size + const int bytes_remaining = haystack.len - consumed; + result.len = ptr - result.ptr + bytes_remaining; + DCHECK_LE(result.len, buffer_space); + memcpy(ptr, &haystack.ptr[consumed], bytes_remaining); + + return result; +} + StringVal StringFunctions::Reverse(FunctionContext* context, const StringVal& str) { if (str.is_null) return StringVal::null(); StringVal result(context, str.len); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/exprs/string-functions.h ---------------------------------------------------------------------- diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h index fb3a900..3dad5f6 100644 --- a/be/src/exprs/string-functions.h +++ b/be/src/exprs/string-functions.h @@ -53,6 +53,10 @@ class StringFunctions { static StringVal Lower(FunctionContext*, const StringVal& str); static StringVal Upper(FunctionContext*, const StringVal& str); static StringVal InitCap(FunctionContext*, const StringVal& str); + static void ReplacePrepare(FunctionContext*, FunctionContext::FunctionStateScope); + static void ReplaceClose(FunctionContext*, FunctionContext::FunctionStateScope); + static StringVal Replace(FunctionContext*, const StringVal& str, + const StringVal& pattern, const StringVal& replace); static StringVal Reverse(FunctionContext*, const StringVal& str); static StringVal Translate(FunctionContext*, const StringVal& str, const StringVal& src, const StringVal& dst); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/gutil/bits.h ---------------------------------------------------------------------- diff --git a/be/src/gutil/bits.h b/be/src/gutil/bits.h index 579d954..bce7f69 100644 --- a/be/src/gutil/bits.h +++ b/be/src/gutil/bits.h @@ -4,9 +4,6 @@ #include "gutil/basictypes.h" #include "gutil/integral_types.h" -#include <glog/logging.h> -#include "gutil/logging-inl.h" -#include "gutil/macros.h" #ifndef _BITS_H_ #define _BITS_H_ http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf-internal.h ---------------------------------------------------------------------- diff --git a/be/src/udf/udf-internal.h b/be/src/udf/udf-internal.h index 907bb22..dc8b0b5 100644 --- a/be/src/udf/udf-internal.h +++ b/be/src/udf/udf-internal.h @@ -92,6 +92,11 @@ class FunctionContextImpl { /// TODO: free them at the batch level and save some copies? uint8_t* AllocateLocal(int64_t byte_size) noexcept; + /// Resize a local allocation. + /// If the new allocation causes the memory limit to be exceeded, the error will be set + /// in this object causing the query to fail. + uint8_t* ReallocateLocal(uint8_t* ptr, int64_t byte_size) noexcept; + /// Frees all allocations returned by AllocateLocal(). void FreeLocalAllocations() noexcept; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf-test-harness.cc ---------------------------------------------------------------------- diff --git a/be/src/udf/udf-test-harness.cc b/be/src/udf/udf-test-harness.cc index d34b5ee..46cbbe0 100644 --- a/be/src/udf/udf-test-harness.cc +++ b/be/src/udf/udf-test-harness.cc @@ -28,8 +28,9 @@ using namespace impala; FunctionContext* UdfTestHarness::CreateTestContext( const FunctionContext::TypeDesc& return_type, - const vector<FunctionContext::TypeDesc>& arg_types, RuntimeState* state) { - return FunctionContextImpl::CreateContext(state, NULL, return_type, arg_types, 0, true); + const vector<FunctionContext::TypeDesc>& arg_types, RuntimeState* state, + MemPool* pool) { + return FunctionContextImpl::CreateContext(state, pool, return_type, arg_types, 0, true); } void UdfTestHarness::SetConstantArgs( http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf-test-harness.h ---------------------------------------------------------------------- diff --git a/be/src/udf/udf-test-harness.h b/be/src/udf/udf-test-harness.h index 668a44d..f730d98 100644 --- a/be/src/udf/udf-test-harness.h +++ b/be/src/udf/udf-test-harness.h @@ -28,6 +28,8 @@ #include "udf/udf.h" #include "udf/udf-debug.h" +namespace impala { class MemPool; } + namespace impala_udf { /// Utility class to help test UDFs. @@ -38,7 +40,7 @@ class UdfTestHarness { /// for calling delete on it. This context has additional debugging validation enabled. static FunctionContext* CreateTestContext(const FunctionContext::TypeDesc& return_type, const std::vector<FunctionContext::TypeDesc>& arg_types, - impala::RuntimeState* state = nullptr); + impala::RuntimeState* state = nullptr, impala::MemPool* pool = nullptr); /// Use with test contexts to test use of IsArgConstant() and GetConstantArg(). /// constant_args should contain an AnyVal* for each argument of the UDF not including http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf.cc ---------------------------------------------------------------------- diff --git a/be/src/udf/udf.cc b/be/src/udf/udf.cc index c2a5270..a013638 100644 --- a/be/src/udf/udf.cc +++ b/be/src/udf/udf.cc @@ -17,6 +17,7 @@ #include "udf/udf.h" +#include <algorithm> #include <iostream> #include <sstream> #include <assert.h> @@ -449,6 +450,29 @@ uint8_t* FunctionContextImpl::AllocateLocal(int64_t byte_size) noexcept { return buffer; } +uint8_t* FunctionContextImpl::ReallocateLocal(uint8_t* ptr, int64_t byte_size) noexcept { + assert(!closed_); + uint8_t* new_ptr = pool_->Reallocate(ptr, byte_size); + if (UNLIKELY(!CheckAllocResult("FunctionContextImpl::ReallocateLocal", + new_ptr, byte_size))) { + return NULL; + } + if (new_ptr != ptr) { + auto v = std::find(local_allocations_.rbegin(), local_allocations_.rend(), ptr); + assert(v != local_allocations_.rend()); + // Avoid perf issue; move to end of local allocations on any reallocation and + // always start the search from there. + if (v != local_allocations_.rbegin()) { + *v = *local_allocations_.rbegin(); + } + *local_allocations_.rbegin() = new_ptr; + } + VLOG_ROW << "Reallocate Local: FunctionContext=" << context_ + << " ptr=" << reinterpret_cast<void*>(ptr) << " size=" << byte_size + << " result=" << reinterpret_cast<void*>(new_ptr); + return new_ptr; +} + void FunctionContextImpl::FreeLocalAllocations() noexcept { assert(!closed_); if (VLOG_ROW_IS_ON) { @@ -479,7 +503,6 @@ void FunctionContextImpl::SetNonConstantArgs( // expr-ir.cc. This could probably use further investigation. StringVal::StringVal(FunctionContext* context, int len) noexcept : len(len), ptr(NULL) { if (UNLIKELY(len > StringVal::MAX_LENGTH)) { - std::cout << "MAX_LENGTH, Trying to allocate " << len; context->SetError("String length larger than allowed limit of " "1 GB character data."); len = 0; @@ -504,6 +527,22 @@ StringVal StringVal::CopyFrom(FunctionContext* ctx, const uint8_t* buf, size_t l return result; } +bool StringVal::Resize(FunctionContext* ctx, int new_len) noexcept { + if (UNLIKELY(new_len > StringVal::MAX_LENGTH)) { + ctx->SetError("String length larger than allowed limit of 1 GB character data."); + len = 0; + is_null = true; + return false; + } + auto* new_ptr = ctx->impl()->ReallocateLocal(ptr, new_len); + if (new_ptr != nullptr) { + ptr = new_ptr; + len = new_len; + return true; + } + return false; +} + // TODO: why doesn't libudasample.so build if this in udf-ir.cc? const FunctionContext::TypeDesc* FunctionContext::GetArgType(int arg_idx) const { if (arg_idx < 0 || arg_idx >= impl_->arg_types_.size()) return NULL; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/be/src/udf/udf.h ---------------------------------------------------------------------- diff --git a/be/src/udf/udf.h b/be/src/udf/udf.h index 4fdca2d..d4d6602 100644 --- a/be/src/udf/udf.h +++ b/be/src/udf/udf.h @@ -582,8 +582,22 @@ struct StringVal : public AnyVal { /// If the memory allocation fails, e.g. because the intermediate value would be too /// large, the constructor will construct a NULL string and set an error on the function /// context. + /// + /// The memory backing this StringVal is a local allocation, and so doesn't need + /// to be explicitly freed. StringVal(FunctionContext* context, int len) noexcept; + /// Reallocate a StringVal that is backed by a local allocation so that it as + /// at least as large as len. May shrink or / expand the string. If the + /// string is expanded, the content of the new space is undefined. + /// + /// If the resize fails, the original StringVal remains in place. Callers do not + /// otherwise need to be concerned with backing storage, which is allocated from a + /// local allocation. + /// + /// Returns true on success, false on failure. + bool Resize(FunctionContext* context, int len) noexcept; + /// Will create a new StringVal with the given dimension and copy the data from the /// parameters. In case of an error will return a NULL string and set an error on the /// function context. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/common/function-registry/impala_functions.py ---------------------------------------------------------------------- diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py index 2141a35..fbe6719 100644 --- a/common/function-registry/impala_functions.py +++ b/common/function-registry/impala_functions.py @@ -423,6 +423,9 @@ visible_functions = [ [['lower', 'lcase'], 'STRING', ['STRING'], 'impala::StringFunctions::Lower'], [['upper', 'ucase'], 'STRING', ['STRING'], 'impala::StringFunctions::Upper'], [['initcap'], 'STRING', ['STRING'], 'impala::StringFunctions::InitCap'], + [['replace'], 'STRING', ['STRING', 'STRING', 'STRING'], 'impala::StringFunctions::Replace', + '_ZN6impala15StringFunctions14ReplacePrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE', + '_ZN6impala15StringFunctions12ReplaceCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'], [['reverse'], 'STRING', ['STRING'], 'impala::StringFunctions::Reverse'], [['translate'], 'STRING', ['STRING', 'STRING', 'STRING'], 'impala::StringFunctions::Translate'], http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/fe/src/main/cup/sql-parser.cup ---------------------------------------------------------------------- diff --git a/fe/src/main/cup/sql-parser.cup b/fe/src/main/cup/sql-parser.cup index adef592..867850d 100644 --- a/fe/src/main/cup/sql-parser.cup +++ b/fe/src/main/cup/sql-parser.cup @@ -2633,9 +2633,11 @@ non_pred_expr ::= {: RESULT = e; :} | analytic_expr:e {: RESULT = e; :} - /* Since "IF", "TRUNCATE" are keywords, need to special case these functions */ + /* Since "IF", "REPLACE", "TRUNCATE" are keywords, need to special case these functions */ | KW_IF LPAREN expr_list:exprs RPAREN {: RESULT = new FunctionCallExpr("if", exprs); :} + | KW_REPLACE LPAREN expr_list:exprs RPAREN + {: RESULT = new FunctionCallExpr("replace", exprs); :} | KW_TRUNCATE LPAREN expr_list:exprs RPAREN {: RESULT = new FunctionCallExpr("truncate", exprs); :} | cast_expr:c http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/fe/src/test/java/org/apache/impala/analysis/ParserTest.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/org/apache/impala/analysis/ParserTest.java b/fe/src/test/java/org/apache/impala/analysis/ParserTest.java index 14cd036..d0176d2 100644 --- a/fe/src/test/java/org/apache/impala/analysis/ParserTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/ParserTest.java @@ -3027,7 +3027,7 @@ public class ParserTest extends FrontendTestBase { " ^\n" + "Encountered: FROM\n" + "Expected: ALL, CASE, CAST, DEFAULT, DISTINCT, EXISTS, " + - "FALSE, IF, INTERVAL, NOT, NULL, " + + "FALSE, IF, INTERVAL, NOT, NULL, REPLACE, " + "STRAIGHT_JOIN, TRUNCATE, TRUE, IDENTIFIER\n"); // missing from @@ -3054,7 +3054,7 @@ public class ParserTest extends FrontendTestBase { " ^\n" + "Encountered: EOF\n" + "Expected: CASE, CAST, DEFAULT, EXISTS, FALSE, " + - "IF, INTERVAL, NOT, NULL, TRUNCATE, TRUE, IDENTIFIER\n"); + "IF, INTERVAL, NOT, NULL, REPLACE, TRUNCATE, TRUE, IDENTIFIER\n"); // missing predicate in where clause (group by) ParserError("select c, b, c from t where group by a, b", @@ -3063,7 +3063,7 @@ public class ParserTest extends FrontendTestBase { " ^\n" + "Encountered: GROUP\n" + "Expected: CASE, CAST, DEFAULT, EXISTS, FALSE, " + - "IF, INTERVAL, NOT, NULL, TRUNCATE, TRUE, IDENTIFIER\n"); + "IF, INTERVAL, NOT, NULL, REPLACE, TRUNCATE, TRUE, IDENTIFIER\n"); // unmatched string literal starting with " ParserError("select c, \"b, c from t", @@ -3124,7 +3124,7 @@ public class ParserTest extends FrontendTestBase { " ^\n" + "Encountered: COMMA\n" + "Expected: CASE, CAST, DEFAULT, EXISTS, FALSE, " + - "IF, INTERVAL, NOT, NULL, TRUNCATE, TRUE, IDENTIFIER\n"); + "IF, INTERVAL, NOT, NULL, REPLACE, TRUNCATE, TRUE, IDENTIFIER\n"); // Parsing identifiers that have different names printed as EXPECTED ParserError("DROP DATA SRC foo", http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/testdata/workloads/functional-query/queries/QueryTest/exprs.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test index d611884..fa3d225 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test +++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test @@ -1931,6 +1931,33 @@ CAUSED BY: InternalException: Could not compile regexp pattern: * Error: no argument for repetition operator: * ==== ---- QUERY +select sum(length(replace(y, x, 'bbbbbbbbbbb'))) +from (select cast(round(float_col) AS STRING) as x, string_col as y + from functional.alltypes) v; +---- TYPES +BIGINT +---- RESULTS +43800 +==== +---- QUERY +select sum(length(replace(y, '0', x))) +from (select cast(round(float_col) AS STRING) as x, string_col as y + from functional.alltypes) v; +---- TYPES +BIGINT +---- RESULTS +7300 +==== +---- QUERY +select sum(length(concat(replace(y, '0', x), replace(y, '0', x)))) +from (select cast(round(float_col) AS STRING) as x, string_col as y + from functional.alltypes) v; +---- TYPES +BIGINT +---- RESULTS +14600 +==== +---- QUERY # Test for factorial operator select distinct int_col, int_col! from functional.alltypes order by 1 ---- RESULTS http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0715a303/testdata/workloads/functional-query/queries/QueryTest/large_strings.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test index 5c90869..4419953 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/large_strings.test +++ b/testdata/workloads/functional-query/queries/QueryTest/large_strings.test @@ -164,6 +164,24 @@ select length(regexp_replace(s, '.', '++++++++')) from ( String length larger than allowed limit of 1 GB character data ===== ---- QUERY +select length(replace(s, ' ', '++++++++')) from ( + select group_concat(l_comment, "!") s from ( + select l_comment from tpch.lineitem union all + select l_comment from tpch.lineitem) t1 + ) t2 +---- RESULTS +625718301 +===== +---- QUERY +select replace(x, '+', '000') from (select (replace(s, ' ', '++++++++')) x from ( + select group_concat(l_comment, "!") s from ( + select l_comment from tpch.lineitem union all + select l_comment from tpch.lineitem) t1 + ) t2) t3; +---- CATCH +String length larger than allowed limit of 1 GB character data +===== +---- QUERY select trunc(timestamp_col, space(1073741830)) from functional.alltypes ---- CATCH String length larger than allowed limit of 1 GB character data
