This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 0560eda8003 branch-4.0: [fix](custom analyzer) Fix basic and icu
tokenizer can not be customized #60506 (#60736)
0560eda8003 is described below
commit 0560eda8003e7b200899bd5078b7d1250b125837
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Feb 14 10:53:26 2026 +0800
branch-4.0: [fix](custom analyzer) Fix basic and icu tokenizer can not be
customized #60506 (#60736)
Cherry-picked from #60506
Co-authored-by: hoshinojyunn
<[email protected]>
---
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 6 ++
.../analyzer/test_custom_analyzer.out | 9 +++
.../analyzer/test_custom_analyzer.groovy | 73 +++++++++++++++++++++-
3 files changed, 87 insertions(+), 1 deletion(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
index e84cbe3ffab..1a3e2a433d1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
@@ -342,6 +342,12 @@ public class IndexPolicyMgr implements Writable,
GsonPostProcessable {
case "pinyin":
validator = new PinyinTokenizerValidator();
break;
+ case "icu":
+ validator = new ICUTokenizerValidator();
+ break;
+ case "basic":
+ validator = new BasicTokenizerValidator();
+ break;
default:
Set<String> userFacingTypes =
IndexPolicy.BUILTIN_TOKENIZERS.stream()
.filter(t -> !t.equals("empty"))
diff --git
a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
index 102b797498c..afcdd9685e5 100644
--- a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
+++ b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
@@ -35,6 +35,15 @@
-- !tokenize_sql --
[{\n "token": "让"\n }, {\n "token": "我们"\n }, {\n
"token": "说"\n }, {\n "token": "hello"\n }, {\n "token":
"そして"\n }, {\n "token": "世界"\n }, {\n "token": "と"\n },
{\n "token": "つ"\n }, {\n "token": "な"\n }, {\n
"token": "が"\n }, {\n "token": "ろう"\n }]
+-- !tokenize_basic_1 --
+[{\n "token": "hello"\n }, {\n "token": "world"\n }, {\n
"token": "test"\n }]
+
+-- !tokenize_basic_2 --
+[{\n "token": "hello"\n }, {\n "token": "_"\n }, {\n
"token": "world"\n }, {\n "token": "test"\n }]
+
+-- !tokenize_basic_3 --
+[{\n "token": "hello"\n }, {\n "token": "_"\n }, {\n
"token": "world"\n }, {\n "token": "-"\n }, {\n "token":
"test"\n }]
+
-- !tokenize_pinyin1 --
[{\n "token": "ldh"\n }]
diff --git
a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
index ab559717bc3..0610fc366bf 100644
---
a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
+++
b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
@@ -246,6 +246,71 @@ suite("test_custom_analyzer", "p0") {
);
"""
+ // Test basic tokenizer with different extra_chars settings
+ // 1. basic tokenizer without extra_chars (default)
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS basic_tokenizer_no_extra
+ PROPERTIES
+ (
+ "type" = "basic"
+ );
+ """
+
+ // 2. basic tokenizer with extra_chars = "_"
+ sql """
+        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS basic_tokenizer_underscore
+ PROPERTIES
+ (
+ "type" = "basic",
+ "extra_chars" = "_"
+ );
+ """
+
+ // 3. basic tokenizer with extra_chars = "_-"
+ sql """
+        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS basic_tokenizer_underscore_dash
+ PROPERTIES
+ (
+ "type" = "basic",
+ "extra_chars" = "_-"
+ );
+ """
+
+ // Create analyzers for each tokenizer
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS basic_analyzer_no_extra
+ PROPERTIES
+ (
+ "tokenizer" = "basic_tokenizer_no_extra",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS basic_analyzer_underscore
+ PROPERTIES
+ (
+ "tokenizer" = "basic_tokenizer_underscore",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS basic_analyzer_underscore_dash
+ PROPERTIES
+ (
+ "tokenizer" = "basic_tokenizer_underscore_dash",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS icu_tokenizer_no_extra
+ PROPERTIES
+ (
+ "type" = "icu"
+ );
+ """
    // Wait for all analyzers to be ready - increased timeout due to many objects
sql """ select sleep(15) """
@@ -261,7 +326,13 @@ suite("test_custom_analyzer", "p0") {
qt_tokenize_sql """ select tokenize("1080º Avalanche",
'"analyzer"="lowercase_delimited"'); """
qt_tokenize_sql """ select tokenize("GET /images/hm_bg.jpg HTTP/1.0",
'"analyzer"="basic_analyzer"'); """
qt_tokenize_sql """ select tokenize("让我们说「Hello」そして世界とつながろう!",
'"analyzer"="icu_analyzer"'); """
-
+
+ // Test basic tokenizer with extra_chars settings
+ // Test input: "hello_world-test" - contains underscore and dash
+ qt_tokenize_basic_1 """ select tokenize("hello_world-test",
'"analyzer"="basic_analyzer_no_extra"'); """
+ qt_tokenize_basic_2 """ select tokenize("hello_world-test",
'"analyzer"="basic_analyzer_underscore"'); """
+ qt_tokenize_basic_3 """ select tokenize("hello_world-test",
'"analyzer"="basic_analyzer_underscore_dash"'); """
+
// Test pinyin tokenize functions - different analyzers
qt_tokenize_pinyin1 """ select tokenize("刘德华",
'"analyzer"="pinyin_analyzer"'); """
qt_tokenize_pinyin2 """ select tokenize("张学友",
'"analyzer"="pinyin_analyzer"'); """
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]