This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a change to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
from 324f78700d6 branch-3.1: [opt](kerberos) opt hdfs kerberos logic #47299 #47826 #48655 (#52193)
add 3b1547d2cf6 branch-3.1: [feature](tokenizer) add icu, basic, ik tokenizer (#52134)
No new revisions were added by this update.
Summary of changes:
.licenserc.yaml | 1 +
be/CMakeLists.txt | 9 +
be/dict/icu/uax29/Default.txt | 152 +
be/dict/icu/uax29/MyanmarSyllable.txt | 35 +
be/dict/ik/extra_main.dic | 398716 ++++++++++++++++++
be/dict/ik/extra_single_word.dic | 12638 +
be/dict/ik/extra_single_word_full.dic | 12638 +
be/dict/ik/extra_single_word_low_freq.dic | 2714 +
be/dict/ik/extra_stopword.dic | 31 +
be/dict/ik/main.dic | 275909 ++++++++++++
be/dict/ik/preposition.dic | 25 +
be/dict/ik/quantifier.dic | 316 +
be/dict/ik/stopword.dic | 33 +
be/dict/ik/suffix.dic | 37 +
be/dict/ik/surname.dic | 131 +
be/src/clucene | 2 +-
be/src/olap/inverted_index_parser.cpp | 16 +
be/src/olap/inverted_index_parser.h | 8 +
.../inverted_index/analyzer/analyzer.cpp | 18 +
.../inverted_index/analyzer/basic/basic_analyzer.h | 56 +
.../analyzer/basic/basic_tokenizer.cpp | 108 +
.../analyzer/basic/basic_tokenizer.h | 48 +
.../analyzer/icu/break_iterator_wrapper.cpp | 107 +
.../analyzer/icu/break_iterator_wrapper.h | 54 +
.../analyzer/icu/composite_break_iterator.cpp | 83 +
.../analyzer/icu/composite_break_iterator.h | 58 +
.../analyzer/icu/default_icu_tokenizer_config.cpp | 128 +
.../analyzer/icu/default_icu_tokenizer_config.h | 44 +
.../inverted_index/analyzer/icu/icu_analyzer.h | 61 +
.../inverted_index/analyzer/icu/icu_common.h | 48 +
.../inverted_index/analyzer/icu/icu_tokenizer.cpp | 82 +
.../inverted_index/analyzer/icu/icu_tokenizer.h | 50 +
.../analyzer/icu/icu_tokenizer_config.h | 37 +
.../analyzer/icu/script_iterator.cpp | 121 +
.../inverted_index/analyzer/icu/script_iterator.h | 64 +
.../inverted_index/analyzer/ik/IKAnalyzer.h | 69 +
.../inverted_index/analyzer/ik/IKTokenizer.cpp | 66 +
.../inverted_index/analyzer/ik/IKTokenizer.h | 50 +
.../inverted_index/analyzer/ik/cfg/Configuration.h | 87 +
.../analyzer/ik/core/AnalyzeContext.cpp | 296 +
.../analyzer/ik/core/AnalyzeContext.h | 125 +
.../analyzer/ik/core/CJKSegmenter.cpp | 82 +
.../inverted_index/analyzer/ik/core/CJKSegmenter.h | 43 +
.../analyzer/ik/core/CN_QuantifierSegmenter.cpp | 164 +
.../analyzer/ik/core/CN_QuantifierSegmenter.h | 51 +
.../analyzer/ik/core/CharacterUtil.cpp | 137 +
.../analyzer/ik/core/CharacterUtil.h | 88 +
.../analyzer/ik/core/IKArbitrator.cpp | 106 +
.../inverted_index/analyzer/ik/core/IKArbitrator.h | 49 +
.../analyzer/ik/core/IKSegmenter.cpp | 80 +
.../inverted_index/analyzer/ik/core/IKSegmenter.h | 50 +
.../inverted_index/analyzer/ik/core/ISegmenter.h | 37 +
.../analyzer/ik/core/LetterSegmenter.cpp | 200 +
.../analyzer/ik/core/LetterSegmenter.h | 58 +
.../inverted_index/analyzer/ik/core/Lexeme.cpp | 72 +
.../inverted_index/analyzer/ik/core/Lexeme.h | 105 +
.../inverted_index/analyzer/ik/core/LexemePath.cpp | 166 +
.../inverted_index/analyzer/ik/core/LexemePath.h | 60 +
.../analyzer/ik/core/QuickSortSet.cpp | 143 +
.../inverted_index/analyzer/ik/core/QuickSortSet.h | 92 +
.../analyzer/ik/core/SurrogatePairSegmenter.cpp | 37 +
.../analyzer/ik/core/SurrogatePairSegmenter.h | 39 +
.../inverted_index/analyzer/ik/dic/DictSegment.cpp | 167 +
.../inverted_index/analyzer/ik/dic/DictSegment.h | 64 +
.../inverted_index/analyzer/ik/dic/Dictionary.cpp | 174 +
.../inverted_index/analyzer/ik/dic/Dictionary.h | 168 +
.../inverted_index/analyzer/ik/dic/Hit.h | 69 +
.../inverted_index/analyzer/ik/util/IKMemoryPool.h | 101 +
be/src/vec/functions/function_tokenize.cpp | 4 +-
.../inverted_index/analyzer/icu_analyzer_test.cpp | 575 +
.../inverted_index/analyzer/ik_anayzer_test.cpp | 708 +
.../analyzer/simple_analyzer_test.cpp | 147 +
.../apache/doris/analysis/InvertedIndexUtil.java | 40 +-
.../analyzer/test_basic_analyzer.out | Bin 0 -> 245 bytes
.../analyzer/test_icu_analyzer.out | Bin 0 -> 371 bytes
.../analyzer/test_ik_analyzer.out | Bin 0 -> 461 bytes
.../data/inverted_index_p0/test_tokenize.out | Bin 2012 -> 3771 bytes
.../analyzer/test_basic_analyzer.groovy | 51 +
.../analyzer/test_icu_analyzer.groovy | 51 +
.../analyzer/test_ik_analyzer.groovy | 90 +
.../inverted_index_p0/test_properties.groovy | 2 +-
.../suites/inverted_index_p0/test_tokenize.groovy | 19 +
82 files changed, 709545 insertions(+), 15 deletions(-)
create mode 100644 be/dict/icu/uax29/Default.txt
create mode 100644 be/dict/icu/uax29/MyanmarSyllable.txt
create mode 100644 be/dict/ik/extra_main.dic
create mode 100644 be/dict/ik/extra_single_word.dic
create mode 100644 be/dict/ik/extra_single_word_full.dic
create mode 100644 be/dict/ik/extra_single_word_low_freq.dic
create mode 100644 be/dict/ik/extra_stopword.dic
create mode 100644 be/dict/ik/main.dic
create mode 100644 be/dict/ik/preposition.dic
create mode 100644 be/dict/ik/quantifier.dic
create mode 100644 be/dict/ik/stopword.dic
create mode 100644 be/dict/ik/suffix.dic
create mode 100644 be/dict/ik/surname.dic
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKAnalyzer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CJKSegmenter.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CJKSegmenter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKArbitrator.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKArbitrator.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/ISegmenter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/Lexeme.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/Lexeme.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LexemePath.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LexemePath.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/QuickSortSet.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/QuickSortSet.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/DictSegment.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/DictSegment.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/Dictionary.cpp
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/Dictionary.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/Hit.h
create mode 100644
be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/util/IKMemoryPool.h
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
create mode 100644
be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp
create mode 100644
regression-test/data/inverted_index_p0/analyzer/test_basic_analyzer.out
create mode 100644
regression-test/data/inverted_index_p0/analyzer/test_icu_analyzer.out
create mode 100644
regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out
create mode 100644
regression-test/suites/inverted_index_p0/analyzer/test_basic_analyzer.groovy
create mode 100644
regression-test/suites/inverted_index_p0/analyzer/test_icu_analyzer.groovy
create mode 100644
regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org