(doris) branch branch-3.1 updated: branch-3.1: [fix](inverted index) Refine char_group tokenizer validation #55126 (#55191)

morrysnow Sun, 24 Aug 2025 20:08:48 -0700

This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 888fc37a86f branch-3.1: [fix](inverted index) Refine char_group tokenizer validation #55126 (#55191)
888fc37a86f is described below

commit 888fc37a86f78ac4efebb9ed253e2a8d891dcf9a
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Mon Aug 25 11:08:33 2025 +0800

    branch-3.1: [fix](inverted index) Refine char_group tokenizer validation #55126 (#55191)
    
    Cherry-picked from #55126
    
    Co-authored-by: zzzxl <[email protected]>
---
 .../indexpolicy/CharGroupTokenizerValidator.java   |   2 +-
 .../org/apache/doris/indexpolicy/IndexPolicy.java  |   2 +-
 .../analyzer/test_char_group_tokenizer.out         | Bin 999 -> 1238 bytes
 .../analyzer/test_char_group_tokenizer.groovy      |  22 +++++++++++++++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
 
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
index 389c576365e..7c94267705d 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
@@ -60,7 +60,7 @@ public class CharGroupTokenizerValidator extends 
BasePolicyValidator {
             if (raw == null || raw.trim().isEmpty()) {
                 throw new DdlException("tokenize_on_chars cannot be empty if 
specified");
             }
-            String[] items = raw.split("\\s*,\\s*");
+            String[] items = raw.trim().split("(?<=\\])\\s*,\\s*(?=\\[)");
             for (String item : items) {
                 String trimmed = item.trim();
                 if (!trimmed.startsWith("[") || !trimmed.endsWith("]")) {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java 
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
index 9cb190c7266..74fa1c7f8a8 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
@@ -58,7 +58,7 @@ public class IndexPolicy implements Writable, 
GsonPostProcessable {
     public static final String PROP_TOKEN_FILTER = "token_filter";
 
     public static final Set<String> BUILTIN_TOKENIZERS = ImmutableSet.of(
-            "ngram", "edge_ngram", "keyword", "standard");
+            "ngram", "edge_ngram", "keyword", "standard", "char_group");
 
     public static final Set<String> BUILTIN_TOKEN_FILTERS = ImmutableSet.of(
             "asciifolding", "word_delimiter", "lowercase");
diff --git 
a/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out 
b/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out
index 29f00892a5c..7b7441ab622 100644
Binary files 
a/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out 
and 
b/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out 
differ
diff --git 
a/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
 
b/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
index 4eaa52664c5..d65bc7d9781 100644
--- 
a/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
+++ 
b/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
@@ -74,6 +74,23 @@ suite("test_char_group_tokenizer", "p0") {
         );
     """
 
+    // 4) Comma-separated splitting
+    sql """
+        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS 
char_group_comma_tokenizer
+        PROPERTIES
+        (
+            "type" = "char_group",
+            "tokenize_on_chars" = "[,]"
+        );
+    """
+    sql """
+        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS char_group_comma_analyzer
+        PROPERTIES
+        (
+            "tokenizer" = "char_group_comma_tokenizer"
+        );
+    """
+
     // Wait for analyzers to be ready
     sql """ select sleep(10) """
 
@@ -89,6 +106,10 @@ suite("test_char_group_tokenizer", "p0") {
     qt_tokenize_sql """ select tokenize("hello-world_test", 
'"analyzer"="char_group_custom_analyzer"'); """
     qt_tokenize_sql """ select tokenize("hello\nworld\ttest\rend", 
'"analyzer"="char_group_custom_analyzer"'); """
 
+    // Comma-separated
+    qt_tokenize_sql """ select tokenize("a,b,,c", 
'"analyzer"="char_group_comma_analyzer"'); """
+    qt_tokenize_sql """ select tokenize("a, b , c", 
'"analyzer"="char_group_comma_analyzer"'); """
+
     // Create a table to validate integration with inverted index + analyzer
     sql "DROP TABLE IF EXISTS ${tbl}"
     sql """
@@ -125,6 +146,7 @@ suite("test_char_group_tokenizer", "p0") {
         sql "drop inverted index analyzer char_group_ws_punct_analyzer"
         sql "drop inverted index analyzer char_group_cjk_analyzer"
         sql "drop inverted index analyzer char_group_custom_analyzer"
+        sql "drop inverted index analyzer char_group_comma_analyzer"
     } catch (SQLException e) {
         // It may be used by index; ignore
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch branch-3.1 updated: branch-3.1: [fix](inverted index) Refine char_group tokenizer validation #55126 (#55191)

Reply via email to