Repository: incubator-impala Updated Branches: refs/heads/master c1da1409b -> c7b7c3ece
Remove some code in like-predicate-ir.cc from cross-compilation like-predicate-ir.cc contains a lot of code which won't be called and inlined by other IR functions. (e.g. the prepare functions). To reduce the size of the bitcode module, this change removes these functions from cross compilation and moves them into like-predicate.cc instead. Change-Id: I1279622ff97af0e1bedcfd9aafdb875b01e38c7c Reviewed-on: http://gerrit.cloudera.org:8080/3483 Reviewed-by: Michael Ho <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/c7b7c3ec Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/c7b7c3ec Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/c7b7c3ec Branch: refs/heads/master Commit: c7b7c3ece2eb28db7ad76397c433ef582aeac7f8 Parents: c1da140 Author: Michael Ho <[email protected]> Authored: Thu Jun 23 22:32:18 2016 -0700 Committer: Tim Armstrong <[email protected]> Committed: Thu Jul 7 18:41:45 2016 -0700 ---------------------------------------------------------------------- be/src/exprs/CMakeLists.txt | 1 + be/src/exprs/like-predicate-ir.cc | 390 +------------------------------ be/src/exprs/like-predicate.cc | 414 +++++++++++++++++++++++++++++++++ be/src/exprs/like-predicate.h | 9 +- 4 files changed, 424 insertions(+), 390 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c7b7c3ec/be/src/exprs/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/be/src/exprs/CMakeLists.txt b/be/src/exprs/CMakeLists.txt index 452113a..747add3 100644 --- a/be/src/exprs/CMakeLists.txt +++ b/be/src/exprs/CMakeLists.txt @@ -39,6 +39,7 @@ add_library(Exprs in-predicate-ir.cc is-not-empty-predicate.cc is-null-predicate-ir.cc + like-predicate.cc like-predicate-ir.cc literal.cc 
math-functions-ir.cc http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c7b7c3ec/be/src/exprs/like-predicate-ir.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/like-predicate-ir.cc b/be/src/exprs/like-predicate-ir.cc index 516866a..2d9c28b 100644 --- a/be/src/exprs/like-predicate-ir.cc +++ b/be/src/exprs/like-predicate-ir.cc @@ -14,103 +14,9 @@ #include "exprs/like-predicate.h" -#include <string.h> -#include <re2/re2.h> -#include <re2/stringpiece.h> -#include <sstream> - -#include "gutil/strings/substitute.h" -#include "runtime/string-value.inline.h" -#include "string-functions.h" -#include "common/names.h" - using namespace impala_udf; -using namespace re2; namespace impala { -// A regex to match any regex pattern is equivalent to a substring search. -static const RE2 SUBSTRING_RE( - "(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*"); - -// A regex to match any regex pattern which is equivalent to matching a constant string -// at the end of the string values. -static const RE2 ENDS_WITH_RE( - "(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$"); - -// A regex to match any regex pattern which is equivalent to matching a constant string -// at the end of the string values. -static const RE2 STARTS_WITH_RE( - "\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*"); - -// A regex to match any regex pattern which is equivalent to a constant string match. 
-static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$"); - -LikePredicate::LikePredicate(const TExprNode& node) - : Predicate(node) { -} - -LikePredicate::~LikePredicate() { -} - -void LikePredicate::LikePrepare(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - LikePrepareInternal(context, scope, true); -} - -void LikePredicate::ILikePrepare(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - LikePrepareInternal(context, scope, false); -} - -// TODO: make class StringValue and StringSearch accept a case-sensitive flag and -// switch back to using the cheaper Constant<>() functions. -void LikePredicate::LikePrepareInternal(FunctionContext* context, - FunctionContext::FunctionStateScope scope, bool case_sensitive) { - if (scope != FunctionContext::THREAD_LOCAL) return; - LikePredicateState* state = new LikePredicateState(); - state->function_ = LikeFn; - context->SetFunctionState(scope, state); - if (context->IsArgConstant(1)) { - StringVal pattern_val = *reinterpret_cast<StringVal*>(context->GetConstantArg(1)); - if (pattern_val.is_null) return; - StringValue pattern = StringValue::FromStringVal(pattern_val); - re2::RE2 substring_re("(?:%+)([^%_]*)(?:%+)"); - re2::RE2 ends_with_re("(?:%+)([^%_]*)"); - re2::RE2 starts_with_re("([^%_]*)(?:%+)"); - re2::RE2 equals_re("([^%_]*)"); - string pattern_str(pattern.ptr, pattern.len); - string search_string; - if (case_sensitive && RE2::FullMatch(pattern_str, substring_re, &search_string)) { - state->SetSearchString(search_string); - state->function_ = ConstantSubstringFn; - } else if (case_sensitive && - RE2::FullMatch(pattern_str, starts_with_re, &search_string)) { - state->SetSearchString(search_string); - state->function_ = ConstantStartsWithFn; - } else if (case_sensitive && - RE2::FullMatch(pattern_str, ends_with_re, &search_string)) { - state->SetSearchString(search_string); - state->function_ = ConstantEndsWithFn; - } else if (case_sensitive 
&& - RE2::FullMatch(pattern_str, equals_re, &search_string)) { - state->SetSearchString(search_string); - state->function_ = ConstantEqualsFn; - } else { - string re_pattern; - ConvertLikePattern(context, - *reinterpret_cast<StringVal*>(context->GetConstantArg(1)), &re_pattern); - RE2::Options opts; - opts.set_never_nl(false); - opts.set_dot_nl(true); - opts.set_case_sensitive(case_sensitive); - state->regex_.reset(new RE2(re_pattern, opts)); - if (!state->regex_->ok()) { - context->SetError( - strings::Substitute("Invalid regex: $0", pattern_val.ptr).c_str()); - } - } - } -} BooleanVal LikePredicate::Like(FunctionContext* context, const StringVal& val, const StringVal& pattern) { @@ -119,71 +25,6 @@ BooleanVal LikePredicate::Like(FunctionContext* context, const StringVal& val, return (state->function_)(context, val, pattern); } -void LikePredicate::LikeClose(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - if (scope == FunctionContext::THREAD_LOCAL) { - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - delete state; - } -} - -void LikePredicate::RegexPrepare(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - RegexPrepareInternal(context, scope, true); -} - -void LikePredicate::IRegexPrepare(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - RegexPrepareInternal(context, scope, false); -} - -void LikePredicate::RegexPrepareInternal(FunctionContext* context, - FunctionContext::FunctionStateScope scope, bool case_sensitive) { - if (scope != FunctionContext::THREAD_LOCAL) return; - LikePredicateState* state = new LikePredicateState(); - context->SetFunctionState(scope, state); - state->function_ = RegexFn; - if (context->IsArgConstant(1)) { - StringVal* pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1)); - if (pattern->is_null) return; - string pattern_str(reinterpret_cast<const char*>(pattern->ptr), 
pattern->len); - string search_string; - // The following four conditionals check if the pattern is a constant string, - // starts with a constant string and is followed by any number of wildcard characters, - // ends with a constant string and is preceded by any number of wildcard characters or - // has a constant substring surrounded on both sides by any number of wildcard - // characters. In any of these conditions, we can search for the pattern more - // efficiently by using our own string match functions rather than regex matching. - if (case_sensitive && RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) { - state->SetSearchString(search_string); - state->function_ = ConstantEqualsFn; - } else if (case_sensitive && - RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) { - state->SetSearchString(search_string); - state->function_ = ConstantStartsWithFn; - } else if (case_sensitive && - RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) { - state->SetSearchString(search_string); - state->function_ = ConstantEndsWithFn; - } else if (case_sensitive && - RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) { - state->SetSearchString(search_string); - state->function_ = ConstantSubstringFn; - } else { - RE2::Options opts; - opts.set_case_sensitive(case_sensitive); - state->regex_.reset(new RE2(pattern_str, opts)); - if (!state->regex_->ok()) { - stringstream error; - error << "Invalid regex expression" << pattern->ptr; - context->SetError(error.str().c_str()); - } - state->function_ = ConstantRegexFnPartial; - } - } -} - BooleanVal LikePredicate::Regex(FunctionContext* context, const StringVal& val, const StringVal& pattern) { LikePredicateState* state = reinterpret_cast<LikePredicateState*>( @@ -191,238 +32,9 @@ BooleanVal LikePredicate::Regex(FunctionContext* context, const StringVal& val, return (state->function_)(context, val, pattern); } -// This prepare function is used only when 3 parameters are passed to the regexp_like() -// 
function. For the 2 parameter version, the RegexPrepare() function is used to prepare. -void LikePredicate::RegexpLikePrepare(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - if (scope != FunctionContext::THREAD_LOCAL) return; - LikePredicateState* state = new LikePredicateState(); - context->SetFunctionState(scope, state); - // If both the pattern and the match parameter are constant, we pre-compile the - // regular expression once here. Otherwise, the RE is compiled per row in RegexpLike() - if (context->IsArgConstant(1) && context->IsArgConstant(2)) { - StringVal* pattern; - pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1)); - if (pattern->is_null) return; - StringVal* match_parameter = reinterpret_cast<StringVal*>(context->GetConstantArg(2)); - stringstream error; - if (match_parameter->is_null) { - error << "NULL match parameter"; - context->SetError(error.str().c_str()); - return; - } - RE2::Options opts; - string error_str; - if (!StringFunctions::SetRE2Options(*match_parameter, &error_str, &opts)) { - context->SetError(error_str.c_str()); - return; - } - string pattern_str(reinterpret_cast<const char*>(pattern->ptr), pattern->len); - state->regex_.reset(new RE2(pattern_str, opts)); - if (!state->regex_->ok()) { - error << "Invalid regex expression" << pattern->ptr; - context->SetError(error.str().c_str()); - } - } -} - -// This is used only for the 3 parameter version of regexp_like(). The 2 parameter -// version calls Regex() directly. BooleanVal LikePredicate::RegexpLike(FunctionContext* context, const StringVal& val, const StringVal& pattern, const StringVal& match_parameter) { - if (val.is_null || pattern.is_null) return BooleanVal::null(); - // If either the pattern or the third optional match parameter are not constant, we - // have to recompile the RE for every row. 
- if (!context->IsArgConstant(2) || !context->IsArgConstant(1)) { - if (match_parameter.is_null) return BooleanVal::null(); - RE2::Options opts; - string error_str; - if (!StringFunctions::SetRE2Options(match_parameter, &error_str, &opts)) { - context->SetError(error_str.c_str()); - return BooleanVal(false); - } - string re_pattern(reinterpret_cast<const char*>(pattern.ptr), pattern.len); - re2::RE2 re(re_pattern, opts); - if (re.ok()) { - return RE2::PartialMatch(re2::StringPiece( - reinterpret_cast<const char*>(val.ptr), val.len), re); - } else { - context->SetError( - strings::Substitute("Invalid regex: $0", pattern.ptr).c_str()); - return BooleanVal(false); - } - } - return ConstantRegexFnPartial(context, val, pattern); -} - -void LikePredicate::RegexClose(FunctionContext* context, - FunctionContext::FunctionStateScope scope) { - if (scope == FunctionContext::THREAD_LOCAL) { - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - delete state; - } -} - -BooleanVal LikePredicate::RegexFn(FunctionContext* context, const StringVal& val, - const StringVal& pattern) { - return RegexMatch(context, val, pattern, false); -} - -BooleanVal LikePredicate::LikeFn(FunctionContext* context, const StringVal& val, - const StringVal& pattern) { - return RegexMatch(context, val, pattern, true); -} - -BooleanVal LikePredicate::ConstantSubstringFn(FunctionContext* context, - const StringVal& val, const StringVal& pattern) { - if (val.is_null) return BooleanVal::null(); - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - if (state->search_string_sv_.len == 0) return BooleanVal(true); - StringValue pattern_value = StringValue::FromStringVal(val); - return BooleanVal(state->substring_pattern_.Search(&pattern_value) != -1); -} - -BooleanVal LikePredicate::ConstantStartsWithFn(FunctionContext* context, - const StringVal& val, const 
StringVal& pattern) { - if (val.is_null) return BooleanVal::null(); - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - if (val.len < state->search_string_sv_.len) { - return BooleanVal(false); - } else { - StringValue v = - StringValue(reinterpret_cast<char*>(val.ptr), state->search_string_sv_.len); - return BooleanVal(state->search_string_sv_.Eq((v))); - } -} - -BooleanVal LikePredicate::ConstantEndsWithFn(FunctionContext* context, - const StringVal& val, const StringVal& pattern) { - if (val.is_null) return BooleanVal::null(); - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - if (val.len < state->search_string_sv_.len) { - return BooleanVal(false); - } else { - char* ptr = - reinterpret_cast<char*>(val.ptr) + val.len - state->search_string_sv_.len; - int len = state->search_string_sv_.len; - StringValue v = StringValue(ptr, len); - return BooleanVal(state->search_string_sv_.Eq(v)); - } -} - -BooleanVal LikePredicate::ConstantEqualsFn(FunctionContext* context, const StringVal& val, - const StringVal& pattern) { - if (val.is_null) return BooleanVal::null(); - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - return BooleanVal(state->search_string_sv_.Eq(StringValue::FromStringVal(val))); -} - -BooleanVal LikePredicate::ConstantRegexFnPartial(FunctionContext* context, - const StringVal& val, const StringVal& pattern) { - if (val.is_null) return BooleanVal::null(); - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - re2::StringPiece operand_sp(reinterpret_cast<const char*>(val.ptr), val.len); - return RE2::PartialMatch(operand_sp, *state->regex_); -} - -BooleanVal LikePredicate::ConstantRegexFn(FunctionContext* context, - const StringVal& val, const 
StringVal& pattern) { - if (val.is_null) return BooleanVal::null(); - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - re2::StringPiece operand_sp(reinterpret_cast<const char*>(val.ptr), val.len); - return RE2::FullMatch(operand_sp, *state->regex_); -} - -BooleanVal LikePredicate::RegexMatch(FunctionContext* context, - const StringVal& operand_value, const StringVal& pattern_value, - bool is_like_pattern) { - if (operand_value.is_null || pattern_value.is_null) return BooleanVal::null(); - if (context->IsArgConstant(1)) { - LikePredicateState* state = reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - if (is_like_pattern) { - return RE2::FullMatch(re2::StringPiece(reinterpret_cast<const char*>( - operand_value.ptr), operand_value.len), *state->regex_.get()); - } else { - return RE2::PartialMatch(re2::StringPiece(reinterpret_cast<const char*>( - operand_value.ptr), operand_value.len), *state->regex_.get()); - } - } else { - string re_pattern; - RE2::Options opts; - if (is_like_pattern) { - ConvertLikePattern(context, pattern_value, &re_pattern); - opts.set_never_nl(false); - opts.set_dot_nl(true); - } else { - re_pattern = - string(reinterpret_cast<const char*>(pattern_value.ptr), pattern_value.len); - } - re2::RE2 re(re_pattern, opts); - if (re.ok()) { - if (is_like_pattern) { - return RE2::FullMatch(re2::StringPiece( - reinterpret_cast<const char*>(operand_value.ptr), operand_value.len), re); - } else { - return RE2::PartialMatch(re2::StringPiece( - reinterpret_cast<const char*>(operand_value.ptr), operand_value.len), re); - } - } else { - context->SetError( - strings::Substitute("Invalid regex: $0", pattern_value.ptr).c_str()); - return BooleanVal(false); - } - } -} - -void LikePredicate::ConvertLikePattern(FunctionContext* context, const StringVal& pattern, - string* re_pattern) { - re_pattern->clear(); - LikePredicateState* state = 
reinterpret_cast<LikePredicateState*>( - context->GetFunctionState(FunctionContext::THREAD_LOCAL)); - bool is_escaped = false; - for (int i = 0; i < pattern.len; ++i) { - if (!is_escaped && pattern.ptr[i] == '%') { - re_pattern->append(".*"); - } else if (!is_escaped && pattern.ptr[i] == '_') { - re_pattern->append("."); - // check for escape char before checking for regex special chars, they might overlap - } else if (!is_escaped && pattern.ptr[i] == state->escape_char_) { - is_escaped = true; - } else if ( - pattern.ptr[i] == '.' - || pattern.ptr[i] == '[' - || pattern.ptr[i] == ']' - || pattern.ptr[i] == '{' - || pattern.ptr[i] == '}' - || pattern.ptr[i] == '(' - || pattern.ptr[i] == ')' - || pattern.ptr[i] == '\\' - || pattern.ptr[i] == '*' - || pattern.ptr[i] == '+' - || pattern.ptr[i] == '?' - || pattern.ptr[i] == '|' - || pattern.ptr[i] == '^' - || pattern.ptr[i] == '$' - ) { - // escape all regex special characters; see list at - // http://www.boost.org/doc/libs/1_47_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html - re_pattern->append("\\"); - re_pattern->append(1, pattern.ptr[i]); - is_escaped = false; - } else { - // regular character or escaped special character - re_pattern->append(1, pattern.ptr[i]); - is_escaped = false; - } - } + return RegexpLikeInternal(context, val, pattern, match_parameter); } } // namespace impala http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c7b7c3ec/be/src/exprs/like-predicate.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/like-predicate.cc b/be/src/exprs/like-predicate.cc new file mode 100644 index 0000000..fd73089 --- /dev/null +++ b/be/src/exprs/like-predicate.cc @@ -0,0 +1,414 @@ +// Copyright 2012 Cloudera Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "exprs/like-predicate.h" + +#include <string.h> +#include <re2/re2.h> +#include <re2/stringpiece.h> +#include <sstream> + +#include "gutil/strings/substitute.h" +#include "runtime/string-value.inline.h" +#include "string-functions.h" +#include "common/names.h" + +using namespace impala_udf; +using namespace re2; + +namespace impala { +// A regex to match any regex pattern which is equivalent to a substring search. +static const RE2 SUBSTRING_RE( + "(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*"); + +// A regex to match any regex pattern which is equivalent to matching a constant string +// at the end of the string values. +static const RE2 ENDS_WITH_RE( + "(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$"); + +// A regex to match any regex pattern which is equivalent to matching a constant string +// at the start of the string values. +static const RE2 STARTS_WITH_RE( + "\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*"); + +// A regex to match any regex pattern which is equivalent to a constant string match. 
+static const RE2 EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$"); + +LikePredicate::LikePredicate(const TExprNode& node) + : Predicate(node) { +} + +LikePredicate::~LikePredicate() { +} + +void LikePredicate::LikePrepare(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + LikePrepareInternal(context, scope, true); +} + +void LikePredicate::ILikePrepare(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + LikePrepareInternal(context, scope, false); +} + +// TODO: make class StringValue and StringSearch accept a case-sensitive flag and +// switch back to using the cheaper Constant<>() functions. +void LikePredicate::LikePrepareInternal(FunctionContext* context, + FunctionContext::FunctionStateScope scope, bool case_sensitive) { + if (scope != FunctionContext::THREAD_LOCAL) return; + LikePredicateState* state = new LikePredicateState(); + state->function_ = LikeFn; + context->SetFunctionState(scope, state); + if (context->IsArgConstant(1)) { + StringVal pattern_val = *reinterpret_cast<StringVal*>(context->GetConstantArg(1)); + if (pattern_val.is_null) return; + StringValue pattern = StringValue::FromStringVal(pattern_val); + re2::RE2 substring_re("(?:%+)([^%_]*)(?:%+)"); + re2::RE2 ends_with_re("(?:%+)([^%_]*)"); + re2::RE2 starts_with_re("([^%_]*)(?:%+)"); + re2::RE2 equals_re("([^%_]*)"); + string pattern_str(pattern.ptr, pattern.len); + string search_string; + if (case_sensitive && RE2::FullMatch(pattern_str, substring_re, &search_string)) { + state->SetSearchString(search_string); + state->function_ = ConstantSubstringFn; + } else if (case_sensitive && + RE2::FullMatch(pattern_str, starts_with_re, &search_string)) { + state->SetSearchString(search_string); + state->function_ = ConstantStartsWithFn; + } else if (case_sensitive && + RE2::FullMatch(pattern_str, ends_with_re, &search_string)) { + state->SetSearchString(search_string); + state->function_ = ConstantEndsWithFn; + } else if (case_sensitive 
&& + RE2::FullMatch(pattern_str, equals_re, &search_string)) { + state->SetSearchString(search_string); + state->function_ = ConstantEqualsFn; + } else { + string re_pattern; + ConvertLikePattern(context, + *reinterpret_cast<StringVal*>(context->GetConstantArg(1)), &re_pattern); + RE2::Options opts; + opts.set_never_nl(false); + opts.set_dot_nl(true); + opts.set_case_sensitive(case_sensitive); + state->regex_.reset(new RE2(re_pattern, opts)); + if (!state->regex_->ok()) { + context->SetError( + strings::Substitute("Invalid regex: $0", pattern_val.ptr).c_str()); + } + } + } +} + +void LikePredicate::LikeClose(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + if (scope == FunctionContext::THREAD_LOCAL) { + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + delete state; + } +} + +void LikePredicate::RegexPrepare(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + RegexPrepareInternal(context, scope, true); +} + +void LikePredicate::IRegexPrepare(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + RegexPrepareInternal(context, scope, false); +} + +void LikePredicate::RegexPrepareInternal(FunctionContext* context, + FunctionContext::FunctionStateScope scope, bool case_sensitive) { + if (scope != FunctionContext::THREAD_LOCAL) return; + LikePredicateState* state = new LikePredicateState(); + context->SetFunctionState(scope, state); + state->function_ = RegexFn; + if (context->IsArgConstant(1)) { + StringVal* pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1)); + if (pattern->is_null) return; + string pattern_str(reinterpret_cast<const char*>(pattern->ptr), pattern->len); + string search_string; + // The following four conditionals check if the pattern is a constant string, + // starts with a constant string and is followed by any number of wildcard characters, + // ends with a constant string and is preceded 
by any number of wildcard characters or + // has a constant substring surrounded on both sides by any number of wildcard + // characters. In any of these conditions, we can search for the pattern more + // efficiently by using our own string match functions rather than regex matching. + if (case_sensitive && RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) { + state->SetSearchString(search_string); + state->function_ = ConstantEqualsFn; + } else if (case_sensitive && + RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) { + state->SetSearchString(search_string); + state->function_ = ConstantStartsWithFn; + } else if (case_sensitive && + RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) { + state->SetSearchString(search_string); + state->function_ = ConstantEndsWithFn; + } else if (case_sensitive && + RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) { + state->SetSearchString(search_string); + state->function_ = ConstantSubstringFn; + } else { + RE2::Options opts; + opts.set_case_sensitive(case_sensitive); + state->regex_.reset(new RE2(pattern_str, opts)); + if (!state->regex_->ok()) { + stringstream error; + error << "Invalid regex expression" << pattern->ptr; + context->SetError(error.str().c_str()); + } + state->function_ = ConstantRegexFnPartial; + } + } +} + +// This prepare function is used only when 3 parameters are passed to the regexp_like() +// function. For the 2 parameter version, the RegexPrepare() function is used to prepare. +void LikePredicate::RegexpLikePrepare(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + if (scope != FunctionContext::THREAD_LOCAL) return; + LikePredicateState* state = new LikePredicateState(); + context->SetFunctionState(scope, state); + // If both the pattern and the match parameter are constant, we pre-compile the + // regular expression once here. 
Otherwise, the RE is compiled per row in RegexpLike() + if (context->IsArgConstant(1) && context->IsArgConstant(2)) { + StringVal* pattern; + pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1)); + if (pattern->is_null) return; + StringVal* match_parameter = reinterpret_cast<StringVal*>(context->GetConstantArg(2)); + stringstream error; + if (match_parameter->is_null) { + error << "NULL match parameter"; + context->SetError(error.str().c_str()); + return; + } + RE2::Options opts; + string error_str; + if (!StringFunctions::SetRE2Options(*match_parameter, &error_str, &opts)) { + context->SetError(error_str.c_str()); + return; + } + string pattern_str(reinterpret_cast<const char*>(pattern->ptr), pattern->len); + state->regex_.reset(new RE2(pattern_str, opts)); + if (!state->regex_->ok()) { + error << "Invalid regex expression" << pattern->ptr; + context->SetError(error.str().c_str()); + } + } +} + +// This is used only for the 3 parameter version of regexp_like(). The 2 parameter +// version calls Regex() directly. +BooleanVal LikePredicate::RegexpLikeInternal(FunctionContext* context, + const StringVal& val, const StringVal& pattern, const StringVal& match_parameter) { + if (val.is_null || pattern.is_null) return BooleanVal::null(); + // If either the pattern or the third optional match parameter are not constant, we + // have to recompile the RE for every row. 
+ if (!context->IsArgConstant(2) || !context->IsArgConstant(1)) { + if (match_parameter.is_null) return BooleanVal::null(); + RE2::Options opts; + string error_str; + if (!StringFunctions::SetRE2Options(match_parameter, &error_str, &opts)) { + context->SetError(error_str.c_str()); + return BooleanVal(false); + } + string re_pattern(reinterpret_cast<const char*>(pattern.ptr), pattern.len); + re2::RE2 re(re_pattern, opts); + if (re.ok()) { + return RE2::PartialMatch(re2::StringPiece( + reinterpret_cast<const char*>(val.ptr), val.len), re); + } else { + context->SetError( + strings::Substitute("Invalid regex: $0", pattern.ptr).c_str()); + return BooleanVal(false); + } + } + return ConstantRegexFnPartial(context, val, pattern); +} + +void LikePredicate::RegexClose(FunctionContext* context, + FunctionContext::FunctionStateScope scope) { + if (scope == FunctionContext::THREAD_LOCAL) { + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + delete state; + } +} + +BooleanVal LikePredicate::RegexFn(FunctionContext* context, const StringVal& val, + const StringVal& pattern) { + return RegexMatch(context, val, pattern, false); +} + +BooleanVal LikePredicate::LikeFn(FunctionContext* context, const StringVal& val, + const StringVal& pattern) { + return RegexMatch(context, val, pattern, true); +} + +BooleanVal LikePredicate::ConstantSubstringFn(FunctionContext* context, + const StringVal& val, const StringVal& pattern) { + if (val.is_null) return BooleanVal::null(); + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + if (state->search_string_sv_.len == 0) return BooleanVal(true); + StringValue pattern_value = StringValue::FromStringVal(val); + return BooleanVal(state->substring_pattern_.Search(&pattern_value) != -1); +} + +BooleanVal LikePredicate::ConstantStartsWithFn(FunctionContext* context, + const StringVal& val, const 
StringVal& pattern) { + if (val.is_null) return BooleanVal::null(); + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + if (val.len < state->search_string_sv_.len) { + return BooleanVal(false); + } else { + StringValue v = + StringValue(reinterpret_cast<char*>(val.ptr), state->search_string_sv_.len); + return BooleanVal(state->search_string_sv_.Eq((v))); + } +} + +BooleanVal LikePredicate::ConstantEndsWithFn(FunctionContext* context, + const StringVal& val, const StringVal& pattern) { + if (val.is_null) return BooleanVal::null(); + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + if (val.len < state->search_string_sv_.len) { + return BooleanVal(false); + } else { + char* ptr = + reinterpret_cast<char*>(val.ptr) + val.len - state->search_string_sv_.len; + int len = state->search_string_sv_.len; + StringValue v = StringValue(ptr, len); + return BooleanVal(state->search_string_sv_.Eq(v)); + } +} + +BooleanVal LikePredicate::ConstantEqualsFn(FunctionContext* context, const StringVal& val, + const StringVal& pattern) { + if (val.is_null) return BooleanVal::null(); + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + return BooleanVal(state->search_string_sv_.Eq(StringValue::FromStringVal(val))); +} + +BooleanVal LikePredicate::ConstantRegexFnPartial(FunctionContext* context, + const StringVal& val, const StringVal& pattern) { + if (val.is_null) return BooleanVal::null(); + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + re2::StringPiece operand_sp(reinterpret_cast<const char*>(val.ptr), val.len); + return RE2::PartialMatch(operand_sp, *state->regex_); +} + +BooleanVal LikePredicate::ConstantRegexFn(FunctionContext* context, + const StringVal& val, const 
StringVal& pattern) { + if (val.is_null) return BooleanVal::null(); + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + re2::StringPiece operand_sp(reinterpret_cast<const char*>(val.ptr), val.len); + return RE2::FullMatch(operand_sp, *state->regex_); +} + +BooleanVal LikePredicate::RegexMatch(FunctionContext* context, + const StringVal& operand_value, const StringVal& pattern_value, + bool is_like_pattern) { + if (operand_value.is_null || pattern_value.is_null) return BooleanVal::null(); + if (context->IsArgConstant(1)) { + LikePredicateState* state = reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + if (is_like_pattern) { + return RE2::FullMatch(re2::StringPiece(reinterpret_cast<const char*>( + operand_value.ptr), operand_value.len), *state->regex_.get()); + } else { + return RE2::PartialMatch(re2::StringPiece(reinterpret_cast<const char*>( + operand_value.ptr), operand_value.len), *state->regex_.get()); + } + } else { + string re_pattern; + RE2::Options opts; + if (is_like_pattern) { + ConvertLikePattern(context, pattern_value, &re_pattern); + opts.set_never_nl(false); + opts.set_dot_nl(true); + } else { + re_pattern = + string(reinterpret_cast<const char*>(pattern_value.ptr), pattern_value.len); + } + re2::RE2 re(re_pattern, opts); + if (re.ok()) { + if (is_like_pattern) { + return RE2::FullMatch(re2::StringPiece( + reinterpret_cast<const char*>(operand_value.ptr), operand_value.len), re); + } else { + return RE2::PartialMatch(re2::StringPiece( + reinterpret_cast<const char*>(operand_value.ptr), operand_value.len), re); + } + } else { + context->SetError( + strings::Substitute("Invalid regex: $0", pattern_value.ptr).c_str()); + return BooleanVal(false); + } + } +} + +void LikePredicate::ConvertLikePattern(FunctionContext* context, const StringVal& pattern, + string* re_pattern) { + re_pattern->clear(); + LikePredicateState* state = 
reinterpret_cast<LikePredicateState*>( + context->GetFunctionState(FunctionContext::THREAD_LOCAL)); + bool is_escaped = false; + for (int i = 0; i < pattern.len; ++i) { + if (!is_escaped && pattern.ptr[i] == '%') { + re_pattern->append(".*"); + } else if (!is_escaped && pattern.ptr[i] == '_') { + re_pattern->append("."); + // check for escape char before checking for regex special chars, they might overlap + } else if (!is_escaped && pattern.ptr[i] == state->escape_char_) { + is_escaped = true; + } else if ( + pattern.ptr[i] == '.' + || pattern.ptr[i] == '[' + || pattern.ptr[i] == ']' + || pattern.ptr[i] == '{' + || pattern.ptr[i] == '}' + || pattern.ptr[i] == '(' + || pattern.ptr[i] == ')' + || pattern.ptr[i] == '\\' + || pattern.ptr[i] == '*' + || pattern.ptr[i] == '+' + || pattern.ptr[i] == '?' + || pattern.ptr[i] == '|' + || pattern.ptr[i] == '^' + || pattern.ptr[i] == '$' + ) { + // escape all regex special characters; see list at + // http://www.boost.org/doc/libs/1_47_0/libs/regex/doc/html/boost_regex/syntax/basic_extended.html + re_pattern->append("\\"); + re_pattern->append(1, pattern.ptr[i]); + is_escaped = false; + } else { + // regular character or escaped special character + re_pattern->append(1, pattern.ptr[i]); + is_escaped = false; + } + } +} + +} // namespace impala http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c7b7c3ec/be/src/exprs/like-predicate.h ---------------------------------------------------------------------- diff --git a/be/src/exprs/like-predicate.h b/be/src/exprs/like-predicate.h index 6c907f2..90c78ec 100644 --- a/be/src/exprs/like-predicate.h +++ b/be/src/exprs/like-predicate.h @@ -117,11 +117,18 @@ class LikePredicate: public Predicate { static void RegexpLikePrepare(impala_udf::FunctionContext* context, impala_udf::FunctionContext::FunctionStateScope scope); - /// Handles regexp_like() when 3 parameters are passed to it + /// The cross-compiled wrapper to call RegexpLikeInternal() which is not cross-compiled. 
static impala_udf::BooleanVal RegexpLike(impala_udf::FunctionContext* context, const impala_udf::StringVal& val, const impala_udf::StringVal& pattern, const impala_udf::StringVal& match_parameter); + /// Handles regexp_like() when 3 parameters are passed to it. This is intentionally + /// not cross-compiled as there is no performance benefit in doing so and it will + /// consume extra codegen time. + static impala_udf::BooleanVal RegexpLikeInternal(impala_udf::FunctionContext* context, + const impala_udf::StringVal& val, const impala_udf::StringVal& pattern, + const impala_udf::StringVal& match_parameter); + static void RegexClose(impala_udf::FunctionContext*, impala_udf::FunctionContext::FunctionStateScope scope);
