This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 42bb263fe83 [fix](regex) fix wrong escape of function LIKE (#30557)
42bb263fe83 is described below

commit 42bb263fe837d576b24c6d6dad6978c46f864bc5
Author: zclllyybb <[email protected]>
AuthorDate: Thu Feb 8 13:08:26 2024 +0800

    [fix](regex) fix wrong escape of function LIKE (#30557)
    
    fix wrong escape of function LIKE
---
 be/src/vec/functions/like.cpp                      | 64 ++++++++++++----------
 docs/sidebars.json                                 |  1 +
 .../conditional_functions/test_query_like.out      | 18 ++++++
 .../conditional_functions/test_query_like.out      | 18 ++++++
 .../conditional_functions/test_query_like.groovy   |  7 +++
 .../conditional_functions/test_query_like.groovy   |  7 +++
 6 files changed, 85 insertions(+), 30 deletions(-)

diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 7f4ad718819..fd7eccbf97a 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -21,7 +21,6 @@
 #include <hs/hs_compile.h>
 #include <re2/stringpiece.h>
 
-#include <algorithm>
 #include <cstddef>
 #include <ostream>
 #include <utility>
@@ -39,26 +38,25 @@
 
 namespace doris::vectorized {
 // A regex to match any regex pattern which is equivalent to a substring search.
-static const RE2 SUBSTRING_RE(
-        
"(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
+static const RE2 
SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
 
 // A regex to match any regex pattern which is equivalent to matching a 
constant string
 // at the end of the string values.
-static const RE2 
ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
+static const RE2 
ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
 
 // A regex to match any regex pattern which is equivalent to matching a 
constant string
 // at the start of the string values.
-static const RE2 
STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
+static const RE2 
STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
 
 // A regex to match any regex pattern which is equivalent to a constant string 
match.
-static const RE2 
EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
+static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
 // A regex to match .*
-static const RE2 ALLPASS_RE("(\\\\.\\*)+");
+static const RE2 ALLPASS_RE(R"((\\.\*)+)");
 
 // Like patterns
-static const re2::RE2 
LIKE_SUBSTRING_RE("(?:%+)(((\\\\_)|([^%_\\\\]))+)(?:%+)");
+static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
 static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
-static const re2::RE2 
LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_\\\\]))+)(?:%+)");
+static const re2::RE2 
LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
 static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
 static const re2::RE2 LIKE_ALLPASS_RE("%+");
 
@@ -200,7 +198,7 @@ Status 
FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const
             return Status::RuntimeError(fmt::format("hyperscan error: {}", 
ret));
         }
     } else { // fallback to re2
-        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), 
*state->regex.get());
+        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), 
*state->regex);
     }
 
     return Status::OK();
@@ -241,8 +239,8 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState* 
state, const ColumnS
     } else { // fallback to re2
         for (size_t i = 0; i < sz; i++) {
             const auto& str_ref = val.get_data_at(i);
-            *(result.data() + i) = 
RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size),
-                                                     *state->regex.get());
+            *(result.data() + i) =
+                    RE2::PartialMatch(re2::StringPiece(str_ref.data, 
str_ref.size), *state->regex);
         }
     }
 
@@ -447,14 +445,25 @@ void FunctionLike::convert_like_pattern(LikeSearchState* 
state, const std::strin
     }
 
     // add ^ to pattern head to match line head
-    if (pattern.size() > 0 && pattern[0] != '%') {
+    if (!pattern.empty() && pattern[0] != '%') {
         re_pattern->append("^");
     }
 
     bool is_escaped = false;
-    for (size_t i = 0; i < pattern.size(); ++i) {
-        if (!is_escaped) {
-            switch (pattern[i]) {
+    // except for % and _, every char should keep its literal meaning.
+    for (char i : pattern) {
+        if (is_escaped) { // last is \, this should be escape
+            if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i 
== '}' || i == '-' ||
+                i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || i 
== ':' || i == '^' ||
+                i == '.' || i == '$' || i == '?') {
+                re_pattern->append(1, '\\');
+            } else if (i != '%' && i != '_') {
+                re_pattern->append(2, '\\');
+            }
+            re_pattern->append(1, i);
+            is_escaped = false;
+        } else {
+            switch (i) {
             case '%':
                 re_pattern->append(".*");
                 break;
@@ -462,28 +471,23 @@ void FunctionLike::convert_like_pattern(LikeSearchState* 
state, const std::strin
                 re_pattern->append(".");
                 break;
             default:
-                is_escaped = pattern[i] == state->escape_char;
+                is_escaped = i == state->escape_char;
                 if (!is_escaped) {
-                    re_pattern->append(1, pattern[i]);
+                    // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, 
/, :, ^, ., $, ?
+                    if (i == '[' || i == ']' || i == '(' || i == ')' || i == 
'{' || i == '}' ||
+                        i == '-' || i == '*' || i == '+' || i == '\\' || i == 
'|' || i == '/' ||
+                        i == ':' || i == '^' || i == '.' || i == '$' || i == 
'?') {
+                        re_pattern->append(1, '\\');
+                    }
+                    re_pattern->append(1, i);
                 }
                 break;
             }
-        } else {
-            if (pattern[i] == '.' || pattern[i] == '[' || pattern[i] == ']' || 
pattern[i] == '{' ||
-                pattern[i] == '}' || pattern[i] == '(' || pattern[i] == ')' || 
pattern[i] == '\\' ||
-                pattern[i] == '*' || pattern[i] == '+' || pattern[i] == '?' || 
pattern[i] == '|' ||
-                pattern[i] == '^' || pattern[i] == '$') {
-                re_pattern->append("\\");
-            } else if (pattern[i] != '%' && pattern[i] != '_') {
-                re_pattern->append("\\\\");
-            }
-            re_pattern->append(1, pattern[i]);
-            is_escaped = false;
         }
     }
 
     // add $ to pattern tail to match line tail
-    if (pattern.size() > 0 && re_pattern->back() != '*') {
+    if (!pattern.empty() && re_pattern->back() != '*') {
         re_pattern->append("$");
     }
 }
diff --git a/docs/sidebars.json b/docs/sidebars.json
index da4c1020a11..bc08c6bc23d 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -182,6 +182,7 @@
                 "advanced/using-hll",
                 "advanced/variables",
                 "advanced/time-zone",
+                "advanced/sql-mode",
                 "advanced/small-file-mgr",
                 "advanced/cold-hot-separation",
                 "advanced/compute-node",
diff --git 
a/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out
 
b/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out
index 5a9e10ed6ca..05417f338d2 100644
--- 
a/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out
+++ 
b/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out
@@ -77,3 +77,21 @@ true false
 -- !like24 --
 false  true
 
+-- !escape1 --
+true
+
+-- !escape2 --
+false
+
+-- !escape3 --
+false
+
+-- !escape4 --
+true
+
+-- !escape5 --
+true
+
+-- !escape6 --
+true
+
diff --git 
a/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out
 
b/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out
index 5a9e10ed6ca..05417f338d2 100644
--- 
a/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out
+++ 
b/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out
@@ -77,3 +77,21 @@ true false
 -- !like24 --
 false  true
 
+-- !escape1 --
+true
+
+-- !escape2 --
+false
+
+-- !escape3 --
+false
+
+-- !escape4 --
+true
+
+-- !escape5 --
+true
+
+-- !escape6 --
+true
+
diff --git 
a/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy
 
b/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy
index 5f1701778b7..c345ee36cf4 100644
--- 
a/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy
+++ 
b/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy
@@ -47,4 +47,11 @@ suite("test_query_like", "query,p0") {
     qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" 
"""
     qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" 
"""
     qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like 
"abcd\\_%1" """
+
+    qt_escape1 """select 
'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE 
'%facebook_10008_T1+T2%' """
+    qt_escape2 """select '!z23]' like '_[z]%' """
+    qt_escape3 """select '[123]' like '%[1.*]%' """
+    qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
+    qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
+    qt_escape6 """select '1dd' like '%_\\d\\d%' """
 }
diff --git 
a/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy
 
b/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy
index b15a5383d79..9ebb300ee5e 100644
--- 
a/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy
@@ -45,4 +45,11 @@ suite("test_query_like", "query,p0") {
     qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1" 
"""
     qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_" 
"""
     qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like 
"abcd\\_%1" """
+
+    qt_escape1 """select 
'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE 
'%facebook_10008_T1+T2%' """
+    qt_escape2 """select '!z23]' like '_[z]%' """
+    qt_escape3 """select '[123]' like '%[1.*]%' """
+    qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
+    qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
+    qt_escape6 """select '1dd' like '%_\\d\\d%' """
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to