This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 0560eda8003 branch-4.0: [fix](custom analyzer) Fix basic and icu
tokenizer can not be customized #60506 (#60736)
0560eda8003 is described below
commit 0560eda8003e7b200899bd5078b7d1250b125837
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Feb 14 10:53:26 2026 +0800
branch-4.0: [fix](custom analyzer) Fix basic and icu tokenizer can not be
customized #60506 (#60736)
Cherry-picked from #60506
Co-authored-by: hoshinojyunn
<[email protected]>
---
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 6 ++
.../analyzer/test_custom_analyzer.out | 9 +++
.../analyzer/test_custom_analyzer.groovy | 73 +++++++++++++++++++++-
3 files changed, 87 insertions(+), 1 deletion(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
index e84cbe3ffab..1a3e2a433d1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
@@ -342,6 +342,12 @@ public class IndexPolicyMgr implements Writable,
GsonPostProcessable {
case "pinyin":
validator = new PinyinTokenizerValidator();
break;
+ case "icu":
+ validator = new ICUTokenizerValidator();
+ break;
+ case "basic":
+ validator = new BasicTokenizerValidator();
+ break;
default:
Set<String> userFacingTypes =
IndexPolicy.BUILTIN_TOKENIZERS.stream()
.filter(t -> !t.equals("empty"))
diff --git
a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
index 102b797498c..afcdd9685e5 100644
--- a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
+++ b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
@@ -35,6 +35,15 @@
-- !tokenize_sql --
[{\n "token": "让"\n }, {\n "token": "我们"\n }, {\n
"token": "说"\n }, {\n "token": "hello"\n }, {\n "token":
"そして"\n }, {\n "token": "世界"\n }, {\n "token": "と"\n },
{\n "token": "つ"\n }, {\n "token": "な"\n }, {\n
"token": "が"\n }, {\n "token": "ろう"\n }]
+-- !tokenize_basic_1 --
+[{\n "token": "hello"\n }, {\n "token": "world"\n }, {\n
"token": "test"\n }]
+
+-- !tokenize_basic_2 --
+[{\n "token": "hello"\n }, {\n "token": "_"\n }, {\n
"token": "world"\n }, {\n "token": "test"\n }]
+
+-- !tokenize_basic_3 --
+[{\n "token": "hello"\n }, {\n "token": "_"\n }, {\n
"token": "world"\n }, {\n "token": "-"\n }, {\n "token":
"test"\n }]
+
-- !tokenize_pinyin1 --
[{\n "token": "ldh"\n }]
diff --git
a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
index ab559717bc3..0610fc366bf 100644
---
a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
+++
b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
@@ -246,6 +246,71 @@ suite("test_custom_analyzer", "p0") {
);
"""
+ // Test basic tokenizer with different extra_chars settings
+ // 1. basic tokenizer without extra_chars (default)
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS basic_tokenizer_no_extra
+ PROPERTIES
+ (
+ "type" = "basic"
+ );
+ """
+
+ // 2. basic tokenizer with extra_chars = "_"
+ sql """
+        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS basic_tokenizer_underscore
+ PROPERTIES
+ (
+ "type" = "basic",
+ "extra_chars" = "_"
+ );
+ """
+
+ // 3. basic tokenizer with extra_chars = "_-"
+ sql """
+        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS basic_tokenizer_underscore_dash
+ PROPERTIES
+ (
+ "type" = "basic",
+ "extra_chars" = "_-"
+ );
+ """
+
+ // Create analyzers for each tokenizer
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS basic_analyzer_no_extra
+ PROPERTIES
+ (
+ "tokenizer" = "basic_tokenizer_no_extra",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS basic_analyzer_underscore
+ PROPERTIES
+ (
+ "tokenizer" = "basic_tokenizer_underscore",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS basic_analyzer_underscore_dash
+ PROPERTIES
+ (
+ "tokenizer" = "basic_tokenizer_underscore_dash",
+ "token_filter" = "lowercase"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS icu_tokenizer_no_extra
+ PROPERTIES
+ (
+ "type" = "icu"
+ );
+ """
    // Wait for all analyzers to be ready - increased timeout due to many objects
sql """ select sleep(15) """
@@ -261,7 +326,13 @@ suite("test_custom_analyzer", "p0") {
qt_tokenize_sql """ select tokenize("1080º Avalanche",
'"analyzer"="lowercase_delimited"'); """
qt_tokenize_sql """ select tokenize("GET /images/hm_bg.jpg HTTP/1.0",
'"analyzer"="basic_analyzer"'); """
qt_tokenize_sql """ select tokenize("让我们说「Hello」そして世界とつながろう!",
'"analyzer"="icu_analyzer"'); """
-
+
+ // Test basic tokenizer with extra_chars settings
+ // Test input: "hello_world-test" - contains underscore and dash
+ qt_tokenize_basic_1 """ select tokenize("hello_world-test",
'"analyzer"="basic_analyzer_no_extra"'); """
+ qt_tokenize_basic_2 """ select tokenize("hello_world-test",
'"analyzer"="basic_analyzer_underscore"'); """
+ qt_tokenize_basic_3 """ select tokenize("hello_world-test",
'"analyzer"="basic_analyzer_underscore_dash"'); """
+
// Test pinyin tokenize functions - different analyzers
qt_tokenize_pinyin1 """ select tokenize("刘德华",
'"analyzer"="pinyin_analyzer"'); """
qt_tokenize_pinyin2 """ select tokenize("张学友",
'"analyzer"="pinyin_analyzer"'); """
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]