This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a change to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git


    from 324f78700d6 branch-3.1: [opt](kerberos) opt hdfs kerberos logic #47299 #47826 #48655 (#52193)
     add 3b1547d2cf6 branch-3.1: [feature](tokenizer) add icu, basic, ik tokenizer (#52134)

No new revisions were added by this update.

Summary of changes:
 .licenserc.yaml                                    |      1 +
 be/CMakeLists.txt                                  |      9 +
 be/dict/icu/uax29/Default.txt                      |    152 +
 be/dict/icu/uax29/MyanmarSyllable.txt              |     35 +
 be/dict/ik/extra_main.dic                          | 398716 ++++++++++++++++++
 be/dict/ik/extra_single_word.dic                   |  12638 +
 be/dict/ik/extra_single_word_full.dic              |  12638 +
 be/dict/ik/extra_single_word_low_freq.dic          |   2714 +
 be/dict/ik/extra_stopword.dic                      |     31 +
 be/dict/ik/main.dic                                | 275909 ++++++++++++
 be/dict/ik/preposition.dic                         |     25 +
 be/dict/ik/quantifier.dic                          |    316 +
 be/dict/ik/stopword.dic                            |     33 +
 be/dict/ik/suffix.dic                              |     37 +
 be/dict/ik/surname.dic                             |    131 +
 be/src/clucene                                     |      2 +-
 be/src/olap/inverted_index_parser.cpp              |     16 +
 be/src/olap/inverted_index_parser.h                |      8 +
 .../inverted_index/analyzer/analyzer.cpp           |     18 +
 .../inverted_index/analyzer/basic/basic_analyzer.h |     56 +
 .../analyzer/basic/basic_tokenizer.cpp             |    108 +
 .../analyzer/basic/basic_tokenizer.h               |     48 +
 .../analyzer/icu/break_iterator_wrapper.cpp        |    107 +
 .../analyzer/icu/break_iterator_wrapper.h          |     54 +
 .../analyzer/icu/composite_break_iterator.cpp      |     83 +
 .../analyzer/icu/composite_break_iterator.h        |     58 +
 .../analyzer/icu/default_icu_tokenizer_config.cpp  |    128 +
 .../analyzer/icu/default_icu_tokenizer_config.h    |     44 +
 .../inverted_index/analyzer/icu/icu_analyzer.h     |     61 +
 .../inverted_index/analyzer/icu/icu_common.h       |     48 +
 .../inverted_index/analyzer/icu/icu_tokenizer.cpp  |     82 +
 .../inverted_index/analyzer/icu/icu_tokenizer.h    |     50 +
 .../analyzer/icu/icu_tokenizer_config.h            |     37 +
 .../analyzer/icu/script_iterator.cpp               |    121 +
 .../inverted_index/analyzer/icu/script_iterator.h  |     64 +
 .../inverted_index/analyzer/ik/IKAnalyzer.h        |     69 +
 .../inverted_index/analyzer/ik/IKTokenizer.cpp     |     66 +
 .../inverted_index/analyzer/ik/IKTokenizer.h       |     50 +
 .../inverted_index/analyzer/ik/cfg/Configuration.h |     87 +
 .../analyzer/ik/core/AnalyzeContext.cpp            |    296 +
 .../analyzer/ik/core/AnalyzeContext.h              |    125 +
 .../analyzer/ik/core/CJKSegmenter.cpp              |     82 +
 .../inverted_index/analyzer/ik/core/CJKSegmenter.h |     43 +
 .../analyzer/ik/core/CN_QuantifierSegmenter.cpp    |    164 +
 .../analyzer/ik/core/CN_QuantifierSegmenter.h      |     51 +
 .../analyzer/ik/core/CharacterUtil.cpp             |    137 +
 .../analyzer/ik/core/CharacterUtil.h               |     88 +
 .../analyzer/ik/core/IKArbitrator.cpp              |    106 +
 .../inverted_index/analyzer/ik/core/IKArbitrator.h |     49 +
 .../analyzer/ik/core/IKSegmenter.cpp               |     80 +
 .../inverted_index/analyzer/ik/core/IKSegmenter.h  |     50 +
 .../inverted_index/analyzer/ik/core/ISegmenter.h   |     37 +
 .../analyzer/ik/core/LetterSegmenter.cpp           |    200 +
 .../analyzer/ik/core/LetterSegmenter.h             |     58 +
 .../inverted_index/analyzer/ik/core/Lexeme.cpp     |     72 +
 .../inverted_index/analyzer/ik/core/Lexeme.h       |    105 +
 .../inverted_index/analyzer/ik/core/LexemePath.cpp |    166 +
 .../inverted_index/analyzer/ik/core/LexemePath.h   |     60 +
 .../analyzer/ik/core/QuickSortSet.cpp              |    143 +
 .../inverted_index/analyzer/ik/core/QuickSortSet.h |     92 +
 .../analyzer/ik/core/SurrogatePairSegmenter.cpp    |     37 +
 .../analyzer/ik/core/SurrogatePairSegmenter.h      |     39 +
 .../inverted_index/analyzer/ik/dic/DictSegment.cpp |    167 +
 .../inverted_index/analyzer/ik/dic/DictSegment.h   |     64 +
 .../inverted_index/analyzer/ik/dic/Dictionary.cpp  |    174 +
 .../inverted_index/analyzer/ik/dic/Dictionary.h    |    168 +
 .../inverted_index/analyzer/ik/dic/Hit.h           |     69 +
 .../inverted_index/analyzer/ik/util/IKMemoryPool.h |    101 +
 be/src/vec/functions/function_tokenize.cpp         |      4 +-
 .../inverted_index/analyzer/icu_analyzer_test.cpp  |    575 +
 .../inverted_index/analyzer/ik_anayzer_test.cpp    |    708 +
 .../analyzer/simple_analyzer_test.cpp              |    147 +
 .../apache/doris/analysis/InvertedIndexUtil.java   |     40 +-
 .../analyzer/test_basic_analyzer.out               |    Bin 0 -> 245 bytes
 .../analyzer/test_icu_analyzer.out                 |    Bin 0 -> 371 bytes
 .../analyzer/test_ik_analyzer.out                  |    Bin 0 -> 461 bytes
 .../data/inverted_index_p0/test_tokenize.out       |    Bin 2012 -> 3771 bytes
 .../analyzer/test_basic_analyzer.groovy            |     51 +
 .../analyzer/test_icu_analyzer.groovy              |     51 +
 .../analyzer/test_ik_analyzer.groovy               |     90 +
 .../inverted_index_p0/test_properties.groovy       |      2 +-
 .../suites/inverted_index_p0/test_tokenize.groovy  |     19 +
 82 files changed, 709545 insertions(+), 15 deletions(-)
 create mode 100644 be/dict/icu/uax29/Default.txt
 create mode 100644 be/dict/icu/uax29/MyanmarSyllable.txt
 create mode 100644 be/dict/ik/extra_main.dic
 create mode 100644 be/dict/ik/extra_single_word.dic
 create mode 100644 be/dict/ik/extra_single_word_full.dic
 create mode 100644 be/dict/ik/extra_single_word_low_freq.dic
 create mode 100644 be/dict/ik/extra_stopword.dic
 create mode 100644 be/dict/ik/main.dic
 create mode 100644 be/dict/ik/preposition.dic
 create mode 100644 be/dict/ik/quantifier.dic
 create mode 100644 be/dict/ik/stopword.dic
 create mode 100644 be/dict/ik/suffix.dic
 create mode 100644 be/dict/ik/surname.dic
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKAnalyzer.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/IKTokenizer.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/cfg/Configuration.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/AnalyzeContext.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CJKSegmenter.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CJKSegmenter.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CN_QuantifierSegmenter.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/CharacterUtil.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKArbitrator.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKArbitrator.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/IKSegmenter.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/ISegmenter.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LetterSegmenter.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/Lexeme.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/Lexeme.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LexemePath.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/LexemePath.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/QuickSortSet.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/QuickSortSet.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/core/SurrogatePairSegmenter.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/DictSegment.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/DictSegment.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/Dictionary.cpp
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/Dictionary.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/dic/Hit.h
 create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/analyzer/ik/util/IKMemoryPool.h
 create mode 100644 be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp
 create mode 100644 be/test/olap/rowset/segment_v2/inverted_index/analyzer/ik_anayzer_test.cpp
 create mode 100644 be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp
 create mode 100644 regression-test/data/inverted_index_p0/analyzer/test_basic_analyzer.out
 create mode 100644 regression-test/data/inverted_index_p0/analyzer/test_icu_analyzer.out
 create mode 100644 regression-test/data/inverted_index_p0/analyzer/test_ik_analyzer.out
 create mode 100644 regression-test/suites/inverted_index_p0/analyzer/test_basic_analyzer.groovy
 create mode 100644 regression-test/suites/inverted_index_p0/analyzer/test_icu_analyzer.groovy
 create mode 100644 regression-test/suites/inverted_index_p0/analyzer/test_ik_analyzer.groovy


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to