This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris-website.git
The following commit(s) were added to refs/heads/master by this push:
new b0ce888d923 [doc](inverted index) add pinyin tokenizer and filter
(#3067)
b0ce888d923 is described below
commit b0ce888d92389c0c07d64c297349d1e6faf68e83
Author: Ryan19929 <[email protected]>
AuthorDate: Mon Nov 10 10:51:22 2025 +0800
[doc](inverted index) add pinyin tokenizer and filter (#3067)
## Versions
- [X] dev
- [X] 4.x
- [ ] 3.x
- [ ] 2.1
## Languages
- [X] Chinese
- [X] English
## Docs Checklist
- [X] Checked by AI
- [ ] Test Cases Built
---
docs/ai/text-search/custom-analyzer.md | 95 ++++++++++++++++++++++
.../version-4.x/ai/text-search/custom-analyzer.md | 95 ++++++++++++++++++++++
.../version-4.x/ai/text-search/custom-analyzer.md | 95 ++++++++++++++++++++++
3 files changed, 285 insertions(+)
diff --git a/docs/ai/text-search/custom-analyzer.md
b/docs/ai/text-search/custom-analyzer.md
index 06b9ba97e3a..9351d7d11c9 100644
--- a/docs/ai/text-search/custom-analyzer.md
+++ b/docs/ai/text-search/custom-analyzer.md
@@ -47,6 +47,21 @@ Available tokenizers:
- **char_group**: Tokenizes on specified characters
- **basic**: Simple English, numbers, Chinese, Unicode tokenizer
- **icu**: International text segmentation supporting all languages
+- **pinyin**: Chinese pinyin conversion tokenizer for Chinese text search
+ - `keep_first_letter`: When enabled, emits a term made up of the first letter
of each Chinese character's pinyin. For example, `刘德华` becomes `ldh`. Default: true
+ - `keep_separate_first_letter`: When enabled, emits the first letter of
each Chinese character as a separate term. For example, `刘德华` becomes `l`,`d`,`h`.
Default: false. Note: Single-letter terms occur very frequently, so enabling
this may make query results too fuzzy
+ - `limit_first_letter_length`: Sets the maximum length of the first letter
result. Default: 16
+ - `keep_full_pinyin`: When enabled, preserves the full Pinyin of each
Chinese character. For example, `刘德华` becomes [`liu`,`de`,`hua`]. Default: true
+ - `keep_joined_full_pinyin`: When enabled, joins the full Pinyin of each
Chinese character. For example, `刘德华` becomes [`liudehua`]. Default: false
+ - `keep_none_chinese`: Keeps non-Chinese letters or numbers in the result.
Default: true
+ - `keep_none_chinese_together`: Keeps non-Chinese letters together. Default:
true. For example, `DJ音乐家` becomes `DJ`,`yin`,`yue`,`jia`. When set to false,
`DJ音乐家` becomes `D`,`J`,`yin`,`yue`,`jia`. Note: `keep_none_chinese` should be
enabled first
+ - `keep_none_chinese_in_first_letter`: Keeps non-Chinese letters and digits in
the first-letter term. For example, `刘德华AT2016` becomes `ldhat2016`. Default: true
+ - `keep_none_chinese_in_joined_full_pinyin`: Keeps non-Chinese letters in
joined full Pinyin. For example, `刘德华2016` becomes `liudehua2016`. Default:
false
+ - `none_chinese_pinyin_tokenize`: Splits a run of non-Chinese letters into
separate pinyin terms when the letters form valid pinyin. Default: true. For example,
`liudehuaalibaba13zhuanghan` becomes
`liu`,`de`,`hua`,`a`,`li`,`ba`,`ba`,`13`,`zhuang`,`han`. Note:
`keep_none_chinese` and `keep_none_chinese_together` should be enabled first
+ - `keep_original`: When enabled, keeps the original input as well. Default:
false
+ - `lowercase`: Lowercases non-Chinese letters. Default: true
+ - `trim_whitespace`: Default: true
+ - `remove_duplicated_term`: When enabled, removes duplicated terms to save
index space. For example, `de的` becomes `de`. Default: false. Note:
Enabling this may affect position-related queries
#### 3. Creating a token_filter
@@ -61,6 +76,7 @@ Available token filters:
- **word_delimiter**: Splits tokens at non-alphanumeric characters
- **ascii_folding**: Converts non-ASCII characters to ASCII equivalents
- **lowercase**: Converts tokens to lowercase
+- **pinyin**: Converts Chinese characters to pinyin after tokenization. For
parameter details, refer to the **pinyin** tokenizer above.
#### 4. Creating an analyzer
@@ -181,3 +197,82 @@ PROPERTIES
"token_filter" = "asciifolding, lowercase"
);
```
+
+### Example 4: Chinese Pinyin Search
+
+Using pinyin tokenizer for Chinese name and text search - supports full
pinyin, first letter abbreviations, and mixed Chinese-English text.
+
+#### Using Pinyin Tokenizer
+
+```sql
+-- Create pinyin tokenizer with multiple output formats
+CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS pinyin_tokenizer
+PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "keep_joined_full_pinyin" = "true",
+ "keep_original" = "true",
+ "keep_none_chinese" = "true",
+ "lowercase" = "true",
+ "remove_duplicated_term" = "true"
+);
+
+CREATE INVERTED INDEX ANALYZER IF NOT EXISTS pinyin_analyzer
+PROPERTIES (
+ "tokenizer" = "pinyin_tokenizer"
+);
+
+CREATE TABLE contacts (
+ id BIGINT NOT NULL AUTO_INCREMENT(1),
+ name TEXT NULL,
+ INDEX idx_name (name) USING INVERTED PROPERTIES("analyzer" =
"pinyin_analyzer", "support_phrase" = "true")
+) ENGINE=OLAP
+DUPLICATE KEY(id)
+DISTRIBUTED BY RANDOM BUCKETS 1
+PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+
+INSERT INTO contacts VALUES (1, "刘德华"), (2, "张学友"), (3, "郭富城");
+
+SELECT * FROM contacts WHERE name MATCH '刘德华';
+SELECT * FROM contacts WHERE name MATCH 'liudehua';
+SELECT * FROM contacts WHERE name MATCH 'liu';
+SELECT * FROM contacts WHERE name MATCH 'ldh';
+```
+
+#### Using Pinyin Filter
+
+```sql
+-- Create pinyin filter to apply after keyword tokenizer
+CREATE INVERTED INDEX TOKEN_FILTER IF NOT EXISTS pinyin_filter
+PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "keep_original" = "true",
+ "lowercase" = "true"
+);
+
+CREATE INVERTED INDEX ANALYZER IF NOT EXISTS keyword_pinyin
+PROPERTIES (
+ "tokenizer" = "keyword",
+ "token_filter" = "pinyin_filter"
+);
+
+CREATE TABLE stars (
+ id BIGINT NOT NULL AUTO_INCREMENT(1),
+ name TEXT NULL,
+ INDEX idx_name (name) USING INVERTED PROPERTIES("analyzer" =
"keyword_pinyin")
+) ENGINE=OLAP
+DUPLICATE KEY(id)
+DISTRIBUTED BY RANDOM BUCKETS 1
+PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+
+INSERT INTO stars VALUES (1, "刘德华"), (2, "张学友"), (3, "刘德华ABC");
+
+-- Supports multiple search modes:
+SELECT * FROM stars WHERE name MATCH '刘德华';
+SELECT * FROM stars WHERE name MATCH 'liu';
+SELECT * FROM stars WHERE name MATCH 'ldh';
+SELECT * FROM stars WHERE name MATCH 'zxy';
+```
\ No newline at end of file
diff --git
a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/ai/text-search/custom-analyzer.md
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/ai/text-search/custom-analyzer.md
index 69b5749dd19..f46a86db8ec 100644
---
a/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/ai/text-search/custom-analyzer.md
+++
b/i18n/zh-CN/docusaurus-plugin-content-docs/version-4.x/ai/text-search/custom-analyzer.md
@@ -54,6 +54,21 @@ PROPERTIES (
- `basic`:简单英文/数字/中文/Unicode 分词
- `extra_chars`:额外分割的 ASCII 字符(如 `[]().`)
- `icu`:ICU 国际化分词,支持多语言复杂脚本
+- `pinyin`:拼音分词器,用于中文拼音搜索
+ - `keep_first_letter`:启用时,输出由每个汉字拼音首字母组成的词元。例如,`刘德华` 变为 `ldh`。默认值:true
+ - `keep_separate_first_letter`:启用时,将每个汉字的首字母分别保留。例如,`刘德华` 变为
`l`,`d`,`h`。默认值:false。注意:由于词频的原因,这可能会增加查询的模糊性
+ - `limit_first_letter_length`:设置首字母结果的最大长度。默认值:16
+ - `keep_full_pinyin`:启用时,保留每个汉字的完整拼音。例如,`刘德华` 变为 [`liu`,`de`,`hua`]。默认值:true
+ - `keep_joined_full_pinyin`:启用时,连接每个汉字的完整拼音。例如,`刘德华` 变为
[`liudehua`]。默认值:false
+ - `keep_none_chinese`:在结果中保留非中文字母或数字。默认值:true
+ - `keep_none_chinese_together`:将非中文字母保持在一起。默认值:true。例如,`DJ音乐家` 变为
`DJ`,`yin`,`yue`,`jia`。当设置为 false 时,`DJ音乐家` 变为
`D`,`J`,`yin`,`yue`,`jia`。注意:需要先启用 `keep_none_chinese`
+ - `keep_none_chinese_in_first_letter`:在首字母中保留非中文字母。例如,`刘德华AT2016` 变为
`ldhat2016`。默认值:true
+ - `keep_none_chinese_in_joined_full_pinyin`:在连接的完整拼音中保留非中文字母。例如,`刘德华2016` 变为
`liudehua2016`。默认值:false
+ -
`none_chinese_pinyin_tokenize`:如果非中文字母是拼音,则将其拆分为单独的拼音词元。默认值:true。例如,`liudehuaalibaba13zhuanghan`
变为 `liu`,`de`,`hua`,`a`,`li`,`ba`,`ba`,`13`,`zhuang`,`han`。注意:需要先启用
`keep_none_chinese` 和 `keep_none_chinese_together`
+ - `keep_original`:启用时,同时保留原始输入。默认值:false
+ - `lowercase`:将非中文字母转换为小写。默认值:true
+ - `trim_whitespace`:默认值:true
+ - `remove_duplicated_term`:启用时,删除重复的词元以节省索引空间。例如,`de的` 变为
`de`。默认值:false。注意:可能会影响位置相关的查询
#### 3. token_filter(词元过滤器)
@@ -81,6 +96,7 @@ PROPERTIES (
- `type_table`:自定义字符类型映射(如 `[+ => ALPHA, - => ALPHA]`),类型含
`ALPHA`、`ALPHANUM`、`DIGIT`、`LOWER`、`SUBWORD_DELIM`、`UPPER`
- `ascii_folding`:将非 ASCII 字符映射为等效 ASCII
- `lowercase`:将 token 文本转为小写
+- `pinyin`:在分词后将中文字符转换为拼音的过滤器。参数详情请参考上文的 **pinyin** 分词器。
#### 4. analyzer(分析器)
@@ -279,3 +295,82 @@ select tokenize('hÉllo World',
'"analyzer"="keyword_lowercase"');
{"token":"hello world"}
]
```
+
+### 示例4:中文拼音搜索
+
+使用拼音分词器进行中文姓名和文本搜索 - 支持全拼、首字母缩写和中英文混合文本。
+
+#### 使用拼音分词器
+
+```sql
+-- 创建支持多种输出格式的拼音分词器
+CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS pinyin_tokenizer
+PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "keep_joined_full_pinyin" = "true",
+ "keep_original" = "true",
+ "keep_none_chinese" = "true",
+ "lowercase" = "true",
+ "remove_duplicated_term" = "true"
+);
+
+CREATE INVERTED INDEX ANALYZER IF NOT EXISTS pinyin_analyzer
+PROPERTIES (
+ "tokenizer" = "pinyin_tokenizer"
+);
+
+CREATE TABLE contacts (
+ id BIGINT NOT NULL AUTO_INCREMENT(1),
+ name TEXT NULL,
+ INDEX idx_name (name) USING INVERTED PROPERTIES("analyzer" =
"pinyin_analyzer", "support_phrase" = "true")
+) ENGINE=OLAP
+DUPLICATE KEY(id)
+DISTRIBUTED BY RANDOM BUCKETS 1
+PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+
+INSERT INTO contacts VALUES (1, "刘德华"), (2, "张学友"), (3, "郭富城");
+
+SELECT * FROM contacts WHERE name MATCH '刘德华';
+SELECT * FROM contacts WHERE name MATCH 'liudehua';
+SELECT * FROM contacts WHERE name MATCH 'liu';
+SELECT * FROM contacts WHERE name MATCH 'ldh';
+```
+
+#### 使用拼音过滤器
+
+```sql
+-- 创建拼音过滤器,应用于 keyword 分词器之后
+CREATE INVERTED INDEX TOKEN_FILTER IF NOT EXISTS pinyin_filter
+PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "keep_original" = "true",
+ "lowercase" = "true"
+);
+
+CREATE INVERTED INDEX ANALYZER IF NOT EXISTS keyword_pinyin
+PROPERTIES (
+ "tokenizer" = "keyword",
+ "token_filter" = "pinyin_filter"
+);
+
+CREATE TABLE stars (
+ id BIGINT NOT NULL AUTO_INCREMENT(1),
+ name TEXT NULL,
+ INDEX idx_name (name) USING INVERTED PROPERTIES("analyzer" =
"keyword_pinyin")
+) ENGINE=OLAP
+DUPLICATE KEY(id)
+DISTRIBUTED BY RANDOM BUCKETS 1
+PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+
+INSERT INTO stars VALUES (1, "刘德华"), (2, "张学友"), (3, "刘德华ABC");
+
+-- 支持多种搜索模式:
+SELECT * FROM stars WHERE name MATCH '刘德华';
+SELECT * FROM stars WHERE name MATCH 'liu';
+SELECT * FROM stars WHERE name MATCH 'ldh';
+SELECT * FROM stars WHERE name MATCH 'zxy';
+```
diff --git a/versioned_docs/version-4.x/ai/text-search/custom-analyzer.md
b/versioned_docs/version-4.x/ai/text-search/custom-analyzer.md
index 06b9ba97e3a..9351d7d11c9 100644
--- a/versioned_docs/version-4.x/ai/text-search/custom-analyzer.md
+++ b/versioned_docs/version-4.x/ai/text-search/custom-analyzer.md
@@ -47,6 +47,21 @@ Available tokenizers:
- **char_group**: Tokenizes on specified characters
- **basic**: Simple English, numbers, Chinese, Unicode tokenizer
- **icu**: International text segmentation supporting all languages
+- **pinyin**: Chinese pinyin conversion tokenizer for Chinese text search
+ - `keep_first_letter`: When enabled, emits a term made up of the first letter
of each Chinese character's pinyin. For example, `刘德华` becomes `ldh`. Default: true
+ - `keep_separate_first_letter`: When enabled, emits the first letter of
each Chinese character as a separate term. For example, `刘德华` becomes `l`,`d`,`h`.
Default: false. Note: Single-letter terms occur very frequently, so enabling
this may make query results too fuzzy
+ - `limit_first_letter_length`: Sets the maximum length of the first letter
result. Default: 16
+ - `keep_full_pinyin`: When enabled, preserves the full Pinyin of each
Chinese character. For example, `刘德华` becomes [`liu`,`de`,`hua`]. Default: true
+ - `keep_joined_full_pinyin`: When enabled, joins the full Pinyin of each
Chinese character. For example, `刘德华` becomes [`liudehua`]. Default: false
+ - `keep_none_chinese`: Keeps non-Chinese letters or numbers in the result.
Default: true
+ - `keep_none_chinese_together`: Keeps non-Chinese letters together. Default:
true. For example, `DJ音乐家` becomes `DJ`,`yin`,`yue`,`jia`. When set to false,
`DJ音乐家` becomes `D`,`J`,`yin`,`yue`,`jia`. Note: `keep_none_chinese` should be
enabled first
+ - `keep_none_chinese_in_first_letter`: Keeps non-Chinese letters and digits in
the first-letter term. For example, `刘德华AT2016` becomes `ldhat2016`. Default: true
+ - `keep_none_chinese_in_joined_full_pinyin`: Keeps non-Chinese letters in
joined full Pinyin. For example, `刘德华2016` becomes `liudehua2016`. Default:
false
+ - `none_chinese_pinyin_tokenize`: Splits a run of non-Chinese letters into
separate pinyin terms when the letters form valid pinyin. Default: true. For example,
`liudehuaalibaba13zhuanghan` becomes
`liu`,`de`,`hua`,`a`,`li`,`ba`,`ba`,`13`,`zhuang`,`han`. Note:
`keep_none_chinese` and `keep_none_chinese_together` should be enabled first
+ - `keep_original`: When enabled, keeps the original input as well. Default:
false
+ - `lowercase`: Lowercases non-Chinese letters. Default: true
+ - `trim_whitespace`: Default: true
+ - `remove_duplicated_term`: When enabled, removes duplicated terms to save
index space. For example, `de的` becomes `de`. Default: false. Note:
Enabling this may affect position-related queries
#### 3. Creating a token_filter
@@ -61,6 +76,7 @@ Available token filters:
- **word_delimiter**: Splits tokens at non-alphanumeric characters
- **ascii_folding**: Converts non-ASCII characters to ASCII equivalents
- **lowercase**: Converts tokens to lowercase
+- **pinyin**: Converts Chinese characters to pinyin after tokenization. For
parameter details, refer to the **pinyin** tokenizer above.
#### 4. Creating an analyzer
@@ -181,3 +197,82 @@ PROPERTIES
"token_filter" = "asciifolding, lowercase"
);
```
+
+### Example 4: Chinese Pinyin Search
+
+Using pinyin tokenizer for Chinese name and text search - supports full
pinyin, first letter abbreviations, and mixed Chinese-English text.
+
+#### Using Pinyin Tokenizer
+
+```sql
+-- Create pinyin tokenizer with multiple output formats
+CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS pinyin_tokenizer
+PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "keep_joined_full_pinyin" = "true",
+ "keep_original" = "true",
+ "keep_none_chinese" = "true",
+ "lowercase" = "true",
+ "remove_duplicated_term" = "true"
+);
+
+CREATE INVERTED INDEX ANALYZER IF NOT EXISTS pinyin_analyzer
+PROPERTIES (
+ "tokenizer" = "pinyin_tokenizer"
+);
+
+CREATE TABLE contacts (
+ id BIGINT NOT NULL AUTO_INCREMENT(1),
+ name TEXT NULL,
+ INDEX idx_name (name) USING INVERTED PROPERTIES("analyzer" =
"pinyin_analyzer", "support_phrase" = "true")
+) ENGINE=OLAP
+DUPLICATE KEY(id)
+DISTRIBUTED BY RANDOM BUCKETS 1
+PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+
+INSERT INTO contacts VALUES (1, "刘德华"), (2, "张学友"), (3, "郭富城");
+
+SELECT * FROM contacts WHERE name MATCH '刘德华';
+SELECT * FROM contacts WHERE name MATCH 'liudehua';
+SELECT * FROM contacts WHERE name MATCH 'liu';
+SELECT * FROM contacts WHERE name MATCH 'ldh';
+```
+
+#### Using Pinyin Filter
+
+```sql
+-- Create pinyin filter to apply after keyword tokenizer
+CREATE INVERTED INDEX TOKEN_FILTER IF NOT EXISTS pinyin_filter
+PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "keep_original" = "true",
+ "lowercase" = "true"
+);
+
+CREATE INVERTED INDEX ANALYZER IF NOT EXISTS keyword_pinyin
+PROPERTIES (
+ "tokenizer" = "keyword",
+ "token_filter" = "pinyin_filter"
+);
+
+CREATE TABLE stars (
+ id BIGINT NOT NULL AUTO_INCREMENT(1),
+ name TEXT NULL,
+ INDEX idx_name (name) USING INVERTED PROPERTIES("analyzer" =
"keyword_pinyin")
+) ENGINE=OLAP
+DUPLICATE KEY(id)
+DISTRIBUTED BY RANDOM BUCKETS 1
+PROPERTIES ("replication_allocation" = "tag.location.default: 1");
+
+INSERT INTO stars VALUES (1, "刘德华"), (2, "张学友"), (3, "刘德华ABC");
+
+-- Supports multiple search modes:
+SELECT * FROM stars WHERE name MATCH '刘德华';
+SELECT * FROM stars WHERE name MATCH 'liu';
+SELECT * FROM stars WHERE name MATCH 'ldh';
+SELECT * FROM stars WHERE name MATCH 'zxy';
+```
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]