This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 15d0c1ea7a1 [fix](inverted index) fix tokenize function wrong result 
when params with space separator #32671 (#36290)
15d0c1ea7a1 is described below

commit 15d0c1ea7a1100c9e7ee3eec321e05086553279f
Author: airborne12 <[email protected]>
AuthorDate: Sun Jun 16 09:32:20 2024 +0800

    [fix](inverted index) fix tokenize function wrong result when params with 
space separator #32671 (#36290)
---
 be/src/vec/functions/function_tokenize.cpp                    | 2 +-
 regression-test/data/inverted_index_p0/test_tokenize.out      | 8 ++++++++
 regression-test/suites/inverted_index_p0/test_tokenize.groovy | 2 ++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/be/src/vec/functions/function_tokenize.cpp 
b/be/src/vec/functions/function_tokenize.cpp
index 5f362bb1323..17e52fb29b3 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -38,7 +38,7 @@ namespace doris::vectorized {
 
 Status parse(const std::string& str, std::map<std::string, std::string>& 
result) {
     std::regex pattern(
-            
R"delimiter((?:'([^']*)'|"([^"]*)"|([^,]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^,]*)))delimiter");
+            R"delimiter((?:'([^']*)'|"([^"]*)"|([^, 
]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^, ]*)))delimiter");
     std::smatch matches;
 
     std::string::const_iterator searchStart(str.cbegin());
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out 
b/regression-test/data/inverted_index_p0/test_tokenize.out
index a3984ca9105..d92e4b6585a 100644
--- a/regression-test/data/inverted_index_p0/test_tokenize.out
+++ b/regression-test/data/inverted_index_p0/test_tokenize.out
@@ -19,6 +19,12 @@
 ["人民", "可以", "得到", "更多", "实惠"]
 ["陕西", "陕西省", "西安", "西安市", "高新", "高新区", "新区", "创业", "业大", "大厦", "A", "座", "我", 
"手机", "手机号", "手机号码", "机号码", "号码", "12345678901", "邮箱", "12345678", "qq", "com", 
"ip", "information", "created", "automatically"]
 
+-- !sql --
+["我", "来到", "北京", "清华", "清华大学", "华大", "大学"]
+["我爱你", "中国"]
+["人民", "可以", "得到", "更多", "实惠"]
+["陕西", "陕西省", "西安", "西安市", "高新", "高新区", "新区", "创业", "业大", "大厦", "A", "座", "我", 
"手机", "手机号", "手机号码", "机号码", "号码", "12345678901", "邮箱", "12345678", "qq", "com", 
"ip", "information", "created", "automatically"]
+
 -- !tokenize_sql --
 ["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]
 
@@ -28,3 +34,5 @@
 -- !tokenize_sql --
 ["华", "夏", "智", "胜", "新", "税", "股", "票", "a"]
 
+-- !tokenize_sql --
+["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy 
b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index 2fd825d934f..bd7f3473066 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -91,9 +91,11 @@ suite("test_tokenize"){
 
     sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, 
'人民可以得到更多实惠'), (4, 
'陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是[email protected],,ip是1.1.1.1,this 
information is created automatically.');"
     qt_sql "SELECT TOKENIZE(c, 
\"'parser'='chinese','parser_mode'='fine_grained'\") FROM $indexTblName3";
+    qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese', 
'parser_mode'='fine_grained'\") FROM $indexTblName3";
 
     qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0 
test:abc=bcd','"parser"="unicode","char_filter_type" = 
"char_replace","char_filter_pattern" = "._=:,","char_filter_replacement" = " 
"');"""
 
     qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="unicode"');"""
     qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', 
'"parser"="unicode","stopwords" = "none"');"""
+    qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0 
test:abc=bcd', '"parser"="unicode","char_filter_type" = "char_replace", 
"char_filter_pattern" = "._=:,", "char_filter_replacement" = " "');"""
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to