This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 15d0c1ea7a1 [fix](inverted index) fix tokenize function wrong result when params with space separator #32671 (#36290)
15d0c1ea7a1 is described below
commit 15d0c1ea7a1100c9e7ee3eec321e05086553279f
Author: airborne12 <[email protected]>
AuthorDate: Sun Jun 16 09:32:20 2024 +0800
[fix](inverted index) fix tokenize function wrong result when params with space separator #32671 (#36290)
---
be/src/vec/functions/function_tokenize.cpp | 2 +-
regression-test/data/inverted_index_p0/test_tokenize.out | 8 ++++++++
regression-test/suites/inverted_index_p0/test_tokenize.groovy | 2 ++
3 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
index 5f362bb1323..17e52fb29b3 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -38,7 +38,7 @@ namespace doris::vectorized {
Status parse(const std::string& str, std::map<std::string, std::string>& result) {
std::regex pattern(
- R"delimiter((?:'([^']*)'|"([^"]*)"|([^,]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^,]*)))delimiter");
+ R"delimiter((?:'([^']*)'|"([^"]*)"|([^,
]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^, ]*)))delimiter");
std::smatch matches;
std::string::const_iterator searchStart(str.cbegin());
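
For readers following the change above: the pattern's unquoted alternative used to be [^,]*, which greedily swallows the blank (and the surrounding quotes) that follow ", " in a parameter string, so a key such as parser_mode came back as " 'parser_mode'" and the option was silently ignored. Below is a minimal standalone sketch, not the exact Doris code, that mirrors the capture-group layout of the fixed pattern; the names parse_params and main and the key/value selection logic are illustrative assumptions, since only the lines shown in the hunk appear in this mail.

#include <iostream>
#include <map>
#include <regex>
#include <string>

// Hypothetical helper mirroring the capture-group layout of the pattern above:
// groups 1-3 hold the key (single-quoted, double-quoted, or bare),
// groups 4-6 hold the value.
static void parse_params(const std::string& str,
                         std::map<std::string, std::string>& result) {
    std::regex pattern(
            R"delimiter((?:'([^']*)'|"([^"]*)"|([^, ]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^, ]*)))delimiter");
    std::smatch matches;
    std::string::const_iterator searchStart(str.cbegin());
    while (std::regex_search(searchStart, str.cend(), matches, pattern)) {
        std::string key = matches[1].matched   ? matches[1].str()
                          : matches[2].matched ? matches[2].str()
                                               : matches[3].str();
        std::string value = matches[4].matched   ? matches[4].str()
                            : matches[5].matched ? matches[5].str()
                                                 : matches[6].str();
        if (!key.empty()) {
            result[key] = value;
        }
        searchStart = matches.suffix().first;
    }
}

int main() {
    std::map<std::string, std::string> params;
    // Note the space after the comma -- the case this patch fixes.
    parse_params("'parser'='chinese', 'parser_mode'='fine_grained'", params);
    for (const auto& [k, v] : params) {
        std::cout << k << " -> " << v << "\n";
    }
    // With the fixed "[^, ]*" class both keys are clean:
    //   parser -> chinese
    //   parser_mode -> fine_grained
    // With the old "[^,]*" class the second key is parsed as " 'parser_mode'".
    return 0;
}
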
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out
index a3984ca9105..d92e4b6585a 100644
--- a/regression-test/data/inverted_index_p0/test_tokenize.out
+++ b/regression-test/data/inverted_index_p0/test_tokenize.out
@@ -19,6 +19,12 @@
["人民", "可以", "得到", "更多", "实惠"]
["陕西", "陕西省", "西安", "西安市", "高新", "高新区", "新区", "创业", "业大", "大厦", "A", "座", "我",
"手机", "手机号", "手机号码", "机号码", "号码", "12345678901", "邮箱", "12345678", "qq", "com",
"ip", "information", "created", "automatically"]
+-- !sql --
+["我", "来到", "北京", "清华", "清华大学", "华大", "大学"]
+["我爱你", "中国"]
+["人民", "可以", "得到", "更多", "实惠"]
+["陕西", "陕西省", "西安", "西安市", "高新", "高新区", "新区", "创业", "业大", "大厦", "A", "座", "我",
"手机", "手机号", "手机号码", "机号码", "号码", "12345678901", "邮箱", "12345678", "qq", "com",
"ip", "information", "created", "automatically"]
+
-- !tokenize_sql --
["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]
@@ -28,3 +34,5 @@
-- !tokenize_sql --
["华", "夏", "智", "胜", "新", "税", "股", "票", "a"]
+-- !tokenize_sql --
+["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index 2fd825d934f..bd7f3473066 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -91,9 +91,11 @@ suite("test_tokenize"){
sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3,
'人民可以得到更多实惠'), (4,
'陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是[email protected],,ip是1.1.1.1,this
information is created automatically.');"
qt_sql "SELECT TOKENIZE(c,
\"'parser'='chinese','parser_mode'='fine_grained'\") FROM $indexTblName3";
+ qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese',
'parser_mode'='fine_grained'\") FROM $indexTblName3";
qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0
test:abc=bcd','"parser"="unicode","char_filter_type" =
"char_replace","char_filter_pattern" = "._=:,","char_filter_replacement" = "
"');"""
qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="unicode"');"""
qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A',
'"parser"="unicode","stopwords" = "none"');"""
+ qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0
test:abc=bcd', '"parser"="unicode","char_filter_type" = "char_replace",
"char_filter_pattern" = "._=:,", "char_filter_replacement" = " "');"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]