This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 888fc37a86f branch-3.1: [fix](inverted index) Refine char_group tokenizer validation #55126 (#55191)
888fc37a86f is described below
commit 888fc37a86f78ac4efebb9ed253e2a8d891dcf9a
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Mon Aug 25 11:08:33 2025 +0800
branch-3.1: [fix](inverted index) Refine char_group tokenizer validation #55126 (#55191)
Cherry-picked from #55126
Co-authored-by: zzzxl <[email protected]>
---
.../indexpolicy/CharGroupTokenizerValidator.java | 2 +-
.../org/apache/doris/indexpolicy/IndexPolicy.java | 2 +-
.../analyzer/test_char_group_tokenizer.out | Bin 999 -> 1238 bytes
.../analyzer/test_char_group_tokenizer.groovy | 22 +++++++++++++++++++++
4 files changed, 24 insertions(+), 2 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
index 389c576365e..7c94267705d 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
@@ -60,7 +60,7 @@ public class CharGroupTokenizerValidator extends BasePolicyValidator {
if (raw == null || raw.trim().isEmpty()) {
throw new DdlException("tokenize_on_chars cannot be empty if specified");
}
- String[] items = raw.split("\\s*,\\s*");
+ String[] items = raw.trim().split("(?<=\\])\\s*,\\s*(?=\\[)");
for (String item : items) {
String trimmed = item.trim();
if (!trimmed.startsWith("[") || !trimmed.endsWith("]")) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
index 9cb190c7266..74fa1c7f8a8 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicy.java
@@ -58,7 +58,7 @@ public class IndexPolicy implements Writable, GsonPostProcessable {
public static final String PROP_TOKEN_FILTER = "token_filter";
public static final Set<String> BUILTIN_TOKENIZERS = ImmutableSet.of(
- "ngram", "edge_ngram", "keyword", "standard");
+ "ngram", "edge_ngram", "keyword", "standard", "char_group");
public static final Set<String> BUILTIN_TOKEN_FILTERS = ImmutableSet.of(
"asciifolding", "word_delimiter", "lowercase");
diff --git a/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out b/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out
index 29f00892a5c..7b7441ab622 100644
Binary files a/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out and b/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out differ
diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
index 4eaa52664c5..d65bc7d9781 100644
--- a/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
+++ b/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
@@ -74,6 +74,23 @@ suite("test_char_group_tokenizer", "p0") {
);
"""
+ // 4) Comma-separated splitting
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS char_group_comma_tokenizer
+ PROPERTIES
+ (
+ "type" = "char_group",
+ "tokenize_on_chars" = "[,]"
+ );
+ """
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS char_group_comma_analyzer
+ PROPERTIES
+ (
+ "tokenizer" = "char_group_comma_tokenizer"
+ );
+ """
+
// Wait for analyzers to be ready
sql """ select sleep(10) """
@@ -89,6 +106,10 @@ suite("test_char_group_tokenizer", "p0") {
qt_tokenize_sql """ select tokenize("hello-world_test", '"analyzer"="char_group_custom_analyzer"'); """
qt_tokenize_sql """ select tokenize("hello\nworld\ttest\rend", '"analyzer"="char_group_custom_analyzer"'); """
+ // Comma-separated
+ qt_tokenize_sql """ select tokenize("a,b,,c", '"analyzer"="char_group_comma_analyzer"'); """
+ qt_tokenize_sql """ select tokenize("a, b , c", '"analyzer"="char_group_comma_analyzer"'); """
+
// Create a table to validate integration with inverted index + analyzer
sql "DROP TABLE IF EXISTS ${tbl}"
sql """
@@ -125,6 +146,7 @@ suite("test_char_group_tokenizer", "p0") {
sql "drop inverted index analyzer char_group_ws_punct_analyzer"
sql "drop inverted index analyzer char_group_cjk_analyzer"
sql "drop inverted index analyzer char_group_custom_analyzer"
+ sql "drop inverted index analyzer char_group_comma_analyzer"
} catch (SQLException e) {
// It may be used by index; ignore
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]