IMPALA-3282: Adds regexp_escape built-in function

Escapes the following special characters in RE2 library:
.\+*?[^]$(){}=!<>|:-

Testing:
Add some unit tests into ExprTest.StringRegexpFunctions
Add some E2E tests into exprs.test

Change-Id: I84c3e0ded26f6eb20794c38b75be9b25cd111e4b
Reviewed-on: http://gerrit.cloudera.org:8080/8900
Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/9c08ca2d
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/9c08ca2d
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/9c08ca2d

Branch: refs/heads/2.x
Commit: 9c08ca2dd43c086e74c24a5f3e79f83fd9c5fecc
Parents: 530fa27
Author: Jinchul <jinc...@gmail.com>
Authored: Tue Dec 19 11:29:16 2017 +0900
Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org>
Committed: Fri Feb 2 01:10:15 2018 +0000

----------------------------------------------------------------------
 be/src/exprs/expr-test.cc                       | 40 +++++++++++++++++++-
 be/src/exprs/string-functions-ir.cc             | 23 +++++++++++
 be/src/exprs/string-functions.h                 |  1 +
 common/function-registry/impala_functions.py    |  1 +
 .../queries/QueryTest/exprs.test                | 28 ++++++++++++++
 5 files changed, 91 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index c6e81f1..5d63f2d 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -4244,10 +4244,46 @@ TEST_F(ExprTest, StringRegexpFunctions) {
   TestIsNull("regexp_match_count(NULL, '.*')", TYPE_INT);
   TestIsNull("regexp_match_count('a123', NULL)", TYPE_INT);
   TestIsNull("regexp_match_count(NULL, NULL)", TYPE_INT);
+
+  TestIsNull("regexp_escape(NULL)", TYPE_STRING);
+  TestStringValue("regexp_escape('')", "");
+  // Test special character escape
+  // .\+*?[^]$(){}=!<>|:-
+  TestStringValue("regexp_escape('Hello.world')", R"(Hello\.world)");
+  TestStringValue(R"(regexp_escape('Hello\\world'))", R"(Hello\\world)");
+  TestStringValue("regexp_escape('Hello+world')", R"(Hello\+world)");
+  TestStringValue("regexp_escape('Hello*world')", R"(Hello\*world)");
+  TestStringValue("regexp_escape('Hello?world')", R"(Hello\?world)");
+  TestStringValue("regexp_escape('Hello[world')", R"(Hello\[world)");
+  TestStringValue("regexp_escape('Hello^world')", R"(Hello\^world)");
+  TestStringValue("regexp_escape('Hello]world')", R"(Hello\]world)");
+  TestStringValue("regexp_escape('Hello$world')", R"(Hello\$world)");
+  TestStringValue("regexp_escape('Hello(world')", R"(Hello\(world)");
+  TestStringValue("regexp_escape('Hello)world')", R"(Hello\)world)");
+  TestStringValue("regexp_escape('Hello{world')", R"(Hello\{world)");
+  TestStringValue("regexp_escape('Hello}world')", R"(Hello\}world)");
+  TestStringValue("regexp_escape('Hello=world')", R"(Hello\=world)");
+  TestStringValue("regexp_escape('Hello!world')", R"(Hello\!world)");
+  TestStringValue("regexp_escape('Hello<world')", R"(Hello\<world)");
+  TestStringValue("regexp_escape('Hello>world')", R"(Hello\>world)");
+  TestStringValue("regexp_escape('Hello|world')", R"(Hello\|world)");
+  TestStringValue("regexp_escape('Hello:world')", R"(Hello\:world)");
+  TestStringValue("regexp_escape('Hello-world')", R"(Hello\-world)");
+  // Mixed case
+  TestStringValue(R"(regexp_escape('a.b\\c+d*e?f[g]h$i(j)k{l}m=n!o<p>q|r:s-t'))",
+      R"(a\.b\\c\+d\*e\?f\[g\]h\$i\(j\)k\{l\}m\=n\!o\<p\>q\|r\:s\-t)");
+  // Mixed case with other regexp_* functions
+  TestStringValue(R"(regexp_extract(regexp_escape('Hello\\world'),)"
+      R"('([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)', 0))", R"(Hello\\world)");
+  TestStringValue(R"(regexp_extract(regexp_escape('Hello\\world'),)"
+      R"('([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)', 1))", "Hello");
+  TestStringValue(R"(regexp_extract(regexp_escape('Hello\\world'),)"
+      R"('([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)', 2))", R"(\\)");
+  TestStringValue(R"(regexp_extract(regexp_escape('Hello\\world'),)"
+      R"('([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)', 3))", "world");
 }
 
-TEST_F(ExprTest, StringParseUrlFunction) {
-  // TODO: For now, our parse_url my not behave exactly like Hive
+TEST_F(ExprTest, StringParseUrlFunction) { // TODO: For now, our parse_url my not behave exactly like Hive
   // when given malformed URLs.
   // If necessary, we can closely follow Java's URL implementation
   // to behave exactly like Hive.

http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/be/src/exprs/string-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index 50378bd..50c37b2 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -26,6 +26,7 @@
 
 #include "exprs/anyval-util.h"
 #include "exprs/scalar-expr.h"
+#include "gutil/strings/charset.h"
 #include "runtime/string-value.inline.h"
 #include "runtime/tuple-row.h"
 #include "util/bit-util.h"
@@ -670,6 +671,28 @@ void StringFunctions::RegexpClose(
   context->SetFunctionState(scope, nullptr);
 }
 
+StringVal StringFunctions::RegexpEscape(FunctionContext* context, const StringVal& str) {
+  if (str.is_null) return StringVal::null();
+  if (str.len == 0) return str;
+
+  static const strings::CharSet REGEX_ESCAPE_CHARACTERS(".\\+*?[^]$(){}=!<>|:-");
+  const uint8_t* const start_ptr = str.ptr;
+  const uint8_t* const end_ptr = start_ptr + str.len;
+  StringVal result(context, str.len * 2);
+  if (UNLIKELY(result.is_null)) return StringVal::null();
+  uint8_t* dest_ptr = result.ptr;
+  for (const uint8_t* c = start_ptr; c < end_ptr; ++c) {
+    if (REGEX_ESCAPE_CHARACTERS.Test(*c)) {
+      *dest_ptr++ = '\\';
+    }
+    *dest_ptr++ = *c;
+  }
+  result.len = dest_ptr - result.ptr;
+  DCHECK_GE(result.len, str.len);
+
+  return result;
+}
+
 StringVal StringFunctions::RegexpExtract(FunctionContext* context, const StringVal& str,
     const StringVal& pattern, const BigIntVal& index) {
   if (str.is_null || pattern.is_null || index.is_null) return StringVal::null();

http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/be/src/exprs/string-functions.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index 91ad2cc..45876f8 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -117,6 +117,7 @@ class StringFunctions {
       re2::RE2::Options* opts);
   static void RegexpPrepare(FunctionContext*, FunctionContext::FunctionStateScope);
   static void RegexpClose(FunctionContext*, FunctionContext::FunctionStateScope);
+  static StringVal RegexpEscape(FunctionContext*, const StringVal& str);
   static StringVal RegexpExtract(FunctionContext*, const StringVal& str,
       const StringVal& pattern, const BigIntVal& index);
   static StringVal RegexpReplace(FunctionContext*, const StringVal& str,

http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/common/function-registry/impala_functions.py
----------------------------------------------------------------------
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index b78062b..8174abb 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -462,6 +462,7 @@ visible_functions = [
   [['locate'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Locate'],
   [['locate'], 'INT', ['STRING', 'STRING', 'BIGINT'],
    'impala::StringFunctions::LocatePos'],
+  [['regexp_escape'], 'STRING', ['STRING'], 'impala::StringFunctions::RegexpEscape'],
   [['regexp_extract'], 'STRING', ['STRING', 'STRING', 'BIGINT'],
   'impala::StringFunctions::RegexpExtract',
   '_ZN6impala15StringFunctions13RegexpPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',

http://git-wip-us.apache.org/repos/asf/impala/blob/9c08ca2d/testdata/workloads/functional-query/queries/QueryTest/exprs.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
index a15f3b5..4d8b193 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -2452,6 +2452,34 @@ select regexp_match_count(tmp.str, tmp.pattern, tmp.start_pos, tmp.params) from
 Illegal match parameter x
 ====
 ---- QUERY
+select regexp_escape(tmp.str) from (values
+('a.b\\c+d*e?f[g]h$i(j)k{l}m=n!o<p>q|r:s-t' as str)) as tmp
+---- RESULTS
+'a\\.b\\\\c\\+d\\*e\\?f\\[g\\]h\\$i\\(j\\)k\\{l\\}m\\=n\\!o\\<p\\>q\\|r\\:s\\-t'
+---- TYPES
+string
+====
+---- QUERY
+select regexp_extract(regexp_escape(tmp.str),
+tmp.pattern, tmp.index) from (values
+('Hello\\world' as str, '([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)' as pattern, 2 as index)
+) as tmp
+---- RESULTS
+'\\\\'
+---- TYPES
+string
+====
+---- QUERY
+select regexp_extract(regexp_escape(tmp.str),
+tmp.pattern, tmp.index) from (values
+('Hello\\world' as str, '([[:alpha:]]+)(\\\\\\\\)([[:alpha:]]+)' as pattern, 3 as index)
+) as tmp
+---- RESULTS
+'world'
+---- TYPES
+string
+====
+---- QUERY
 # IMPALA-2147: IS [NOT] DISTINCT FROM and "<=>"
 select NULL <=> NULL
 ---- RESULTS

Reply via email to