This is an automated email from the ASF dual-hosted git repository.
panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 42bb263fe83 [fix](regex) fix wrong escape of function LIKE (#30557)
42bb263fe83 is described below
commit 42bb263fe837d576b24c6d6dad6978c46f864bc5
Author: zclllyybb <[email protected]>
AuthorDate: Thu Feb 8 13:08:26 2024 +0800
[fix](regex) fix wrong escape of function LIKE (#30557)
fix wrong escape of function LIKE
---
be/src/vec/functions/like.cpp | 64 ++++++++++++----------
docs/sidebars.json | 1 +
.../conditional_functions/test_query_like.out | 18 ++++++
.../conditional_functions/test_query_like.out | 18 ++++++
.../conditional_functions/test_query_like.groovy | 7 +++
.../conditional_functions/test_query_like.groovy | 7 +++
6 files changed, 85 insertions(+), 30 deletions(-)
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 7f4ad718819..fd7eccbf97a 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -21,7 +21,6 @@
#include <hs/hs_compile.h>
#include <re2/stringpiece.h>
-#include <algorithm>
#include <cstddef>
#include <ostream>
#include <utility>
@@ -39,26 +38,25 @@
namespace doris::vectorized {
// A regex to match any regex pattern which is equivalent to a substring search.
-static const RE2 SUBSTRING_RE(
-
"(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
+static const RE2
SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
// A regex to match any regex pattern which is equivalent to matching a
constant string
// at the end of the string values.
-static const RE2
ENDS_WITH_RE("(?:\\.\\*)*([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
+static const RE2
ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
// A regex to match any regex pattern which is equivalent to matching a
constant string
// at the start of the string values.
-static const RE2
STARTS_WITH_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)(?:\\.\\*)*");
+static const RE2
STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
// A regex to match any regex pattern which is equivalent to a constant string
match.
-static const RE2
EQUALS_RE("\\^([^\\.\\^\\{\\[\\(\\|\\)\\]\\}\\+\\*\\?\\$\\\\]*)\\$");
+static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
// A regex to match .*
-static const RE2 ALLPASS_RE("(\\\\.\\*)+");
+static const RE2 ALLPASS_RE(R"((\\.\*)+)");
// Like patterns
-static const re2::RE2
LIKE_SUBSTRING_RE("(?:%+)(((\\\\_)|([^%_\\\\]))+)(?:%+)");
+static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
-static const re2::RE2
LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_\\\\]))+)(?:%+)");
+static const re2::RE2
LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
static const re2::RE2 LIKE_ALLPASS_RE("%+");
@@ -200,7 +198,7 @@ Status
FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const
return Status::RuntimeError(fmt::format("hyperscan error: {}",
ret));
}
} else { // fallback to re2
- *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size),
*state->regex.get());
+ *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size),
*state->regex);
}
return Status::OK();
@@ -241,8 +239,8 @@ Status FunctionLikeBase::constant_regex_fn(LikeSearchState*
state, const ColumnS
} else { // fallback to re2
for (size_t i = 0; i < sz; i++) {
const auto& str_ref = val.get_data_at(i);
- *(result.data() + i) =
RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size),
- *state->regex.get());
+ *(result.data() + i) =
+ RE2::PartialMatch(re2::StringPiece(str_ref.data,
str_ref.size), *state->regex);
}
}
@@ -447,14 +445,25 @@ void FunctionLike::convert_like_pattern(LikeSearchState*
state, const std::strin
}
// add ^ to pattern head to match line head
- if (pattern.size() > 0 && pattern[0] != '%') {
+ if (!pattern.empty() && pattern[0] != '%') {
re_pattern->append("^");
}
bool is_escaped = false;
- for (size_t i = 0; i < pattern.size(); ++i) {
- if (!is_escaped) {
- switch (pattern[i]) {
+ // except for % and _, every char should keep its literal meaning.
+ for (char i : pattern) {
+ if (is_escaped) { // last is \, this should be escape
+ if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i
== '}' || i == '-' ||
+ i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || i
== ':' || i == '^' ||
+ i == '.' || i == '$' || i == '?') {
+ re_pattern->append(1, '\\');
+ } else if (i != '%' && i != '_') {
+ re_pattern->append(2, '\\');
+ }
+ re_pattern->append(1, i);
+ is_escaped = false;
+ } else {
+ switch (i) {
case '%':
re_pattern->append(".*");
break;
@@ -462,28 +471,23 @@ void FunctionLike::convert_like_pattern(LikeSearchState*
state, const std::strin
re_pattern->append(".");
break;
default:
- is_escaped = pattern[i] == state->escape_char;
+ is_escaped = i == state->escape_char;
if (!is_escaped) {
- re_pattern->append(1, pattern[i]);
+ // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |,
/, :, ^, ., $, ?
+ if (i == '[' || i == ']' || i == '(' || i == ')' || i ==
'{' || i == '}' ||
+ i == '-' || i == '*' || i == '+' || i == '\\' || i ==
'|' || i == '/' ||
+ i == ':' || i == '^' || i == '.' || i == '$' || i ==
'?') {
+ re_pattern->append(1, '\\');
+ }
+ re_pattern->append(1, i);
}
break;
}
- } else {
- if (pattern[i] == '.' || pattern[i] == '[' || pattern[i] == ']' ||
pattern[i] == '{' ||
- pattern[i] == '}' || pattern[i] == '(' || pattern[i] == ')' ||
pattern[i] == '\\' ||
- pattern[i] == '*' || pattern[i] == '+' || pattern[i] == '?' ||
pattern[i] == '|' ||
- pattern[i] == '^' || pattern[i] == '$') {
- re_pattern->append("\\");
- } else if (pattern[i] != '%' && pattern[i] != '_') {
- re_pattern->append("\\\\");
- }
- re_pattern->append(1, pattern[i]);
- is_escaped = false;
}
}
// add $ to pattern tail to match line tail
- if (pattern.size() > 0 && re_pattern->back() != '*') {
+ if (!pattern.empty() && re_pattern->back() != '*') {
re_pattern->append("$");
}
}
diff --git a/docs/sidebars.json b/docs/sidebars.json
index da4c1020a11..bc08c6bc23d 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -182,6 +182,7 @@
"advanced/using-hll",
"advanced/variables",
"advanced/time-zone",
+ "advanced/sql-mode",
"advanced/small-file-mgr",
"advanced/cold-hot-separation",
"advanced/compute-node",
diff --git
a/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out
b/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out
index 5a9e10ed6ca..05417f338d2 100644
---
a/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out
+++
b/regression-test/data/nereids_p0/sql_functions/conditional_functions/test_query_like.out
@@ -77,3 +77,21 @@ true false
-- !like24 --
false true
+-- !escape1 --
+true
+
+-- !escape2 --
+false
+
+-- !escape3 --
+false
+
+-- !escape4 --
+true
+
+-- !escape5 --
+true
+
+-- !escape6 --
+true
+
diff --git
a/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out
b/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out
index 5a9e10ed6ca..05417f338d2 100644
---
a/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out
+++
b/regression-test/data/query_p0/sql_functions/conditional_functions/test_query_like.out
@@ -77,3 +77,21 @@ true false
-- !like24 --
false true
+-- !escape1 --
+true
+
+-- !escape2 --
+false
+
+-- !escape3 --
+false
+
+-- !escape4 --
+true
+
+-- !escape5 --
+true
+
+-- !escape6 --
+true
+
diff --git
a/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy
b/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy
index 5f1701778b7..c345ee36cf4 100644
---
a/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy
+++
b/regression-test/suites/nereids_p0/sql_functions/conditional_functions/test_query_like.groovy
@@ -47,4 +47,11 @@ suite("test_query_like", "query,p0") {
qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1"
"""
qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_"
"""
qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like
"abcd\\_%1" """
+
+ qt_escape1 """select
'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE
'%facebook_10008_T1+T2%' """
+ qt_escape2 """select '!z23]' like '_[z]%' """
+ qt_escape3 """select '[123]' like '%[1.*]%' """
+ qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
+ qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
+ qt_escape6 """select '1dd' like '%_\\d\\d%' """
}
diff --git
a/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy
b/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy
index b15a5383d79..9ebb300ee5e 100644
---
a/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy
+++
b/regression-test/suites/query_p0/sql_functions/conditional_functions/test_query_like.groovy
@@ -45,4 +45,11 @@ suite("test_query_like", "query,p0") {
qt_like22 """select "abcd%%1" like "abcd__1", "abcd%%1" not like "abcd__1"
"""
qt_like23 """select "abcd%%1" like "abcd_%_", "abcd%%1" not like "abcd_%_"
"""
qt_like24 """select "abcd%%1" like "abcd\\_%1", "abcd%%1" not like
"abcd\\_%1" """
+
+ qt_escape1 """select
'facebook_10008_T1+T2-ALL_AAA-VO_LowestCost_20230830_HSJ' LIKE
'%facebook_10008_T1+T2%' """
+ qt_escape2 """select '!z23]' like '_[z]%' """
+ qt_escape3 """select '[123]' like '%[1.*]%' """
+ qt_escape4 """select '1\\b\\b' like '%_\\b\\b%' """
+ qt_escape5 """select '1\\d\\d' like '%_\\d\\d%' """
+ qt_escape6 """select '1dd' like '%_\\d\\d%' """
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]