This is an automated email from the ASF dual-hosted git repository.
airborne pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
from 7631f2b8de5 [feature](partition prune) Add variable
skip_prune_predicate to skip prune predicate after partition prune (#57688)
add 94fe6aae389 [feature](inverted index) add custom analyzer support with
pinyin tokenizer and pinyin filter (#57097)
No new revisions were added by this update.
Summary of changes:
.licenserc.yaml | 1 +
be/CMakeLists.txt | 5 +-
be/dict/pinyin/pinyin.txt | 20902 +
be/dict/pinyin/pinyin_alphabet.dict | 442 +
be/dict/pinyin/polyphone.txt | 462490 ++++++++++++++++++
.../inverted_index/analysis_factory_mgr.cpp | 6 +
.../inverted_index/token_filter/pinyin_filter.cpp | 441 +
.../inverted_index/token_filter/pinyin_filter.h | 105 +
.../token_filter/pinyin_filter_factory.cpp | 76 +
.../token_filter/pinyin_filter_factory.h | 41 +
.../tokenizer/pinyin/chinese_util.cpp | 42 +
.../inverted_index/tokenizer/pinyin/chinese_util.h | 34 +
.../tokenizer/pinyin/pinyin_alphabet_tokenizer.cpp | 224 +
.../tokenizer/pinyin/pinyin_alphabet_tokenizer.h | 49 +
.../tokenizer/pinyin/pinyin_config.h | 46 +
.../tokenizer/pinyin/pinyin_format.cpp | 70 +
.../tokenizer/pinyin/pinyin_format.h | 78 +
.../tokenizer/pinyin/pinyin_formatter.cpp | 214 +
.../tokenizer/pinyin/pinyin_formatter.h | 38 +
.../tokenizer/pinyin/pinyin_tokenizer.cpp | 339 +
.../tokenizer/pinyin/pinyin_tokenizer.h | 84 +
.../tokenizer/pinyin/pinyin_tokenizer_factory.cpp | 43 +
.../tokenizer/pinyin/pinyin_tokenizer_factory.h | 39 +
.../tokenizer/pinyin/pinyin_util.cpp | 292 +
.../inverted_index/tokenizer/pinyin/pinyin_util.h | 55 +
.../inverted_index/tokenizer/pinyin/rune.h | 43 +
.../tokenizer/pinyin/smart_forest.cpp | 195 +
.../inverted_index/tokenizer/pinyin/smart_forest.h | 143 +
.../tokenizer/pinyin/smart_get_word.cpp | 219 +
.../tokenizer/pinyin/smart_get_word.h | 95 +
.../inverted_index/tokenizer/pinyin/term_item.h | 73 +
.../token_filter/pinyin_filter_test.cpp | 409 +
.../tokenizer/pinyin_alphabet_tokenizer_test.cpp | 248 +
.../tokenizer/pinyin_analysis_test.cpp | 1372 +
.../inverted_index/tokenizer/pinyin_util_test.cpp | 685 +
.../inverted_index/tokenizer/smart_forest_test.cpp | 204 +
.../tokenizer/smart_get_word_test.cpp | 361 +
.../org/apache/doris/indexpolicy/IndexPolicy.java | 4 +-
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 6 +
.../indexpolicy/PinyinTokenFilterValidator.java | 159 +
.../indexpolicy/PinyinTokenizerValidator.java | 157 +
.../analyzer/test_custom_analyzer.out | 174 +
.../analyzer/test_custom_analyzer.groovy | 347 +-
43 files changed, 491046 insertions(+), 4 deletions(-)
create mode 100644 be/dict/pinyin/pinyin.txt
create mode 100644 be/dict/pinyin/pinyin_alphabet.dict
create mode 100644 be/dict/pinyin/polyphone.txt
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/chinese_util.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/chinese_util.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_alphabet_tokenizer.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_alphabet_tokenizer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_config.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_format.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_format.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer_factory.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer_factory.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_util.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_util.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/rune.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/smart_forest.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/smart_forest.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/smart_get_word.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/smart_get_word.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/term_item.h
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_alphabet_tokenizer_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/smart_forest_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/smart_get_word_test.cpp
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java
create mode 100644
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org