This is an automated email from the ASF dual-hosted git repository.

airborne pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


    from c4ff082d9c3 [fix](search) fix mow support for search function (#56927)
     add d1b0a19c083 [opt](inverted index) add custom analyzer support with char_filter, basic and icu tokenizer (#56243)

No new revisions were added by this update.

Summary of changes:
 be/src/olap/inverted_index_parser.cpp              |   1 -
 be/src/olap/inverted_index_parser.h                |   1 +
 .../inverted_index/analysis_factory_mgr.cpp        |  12 +
 .../inverted_index/analyzer/analyzer.cpp           |  23 +-
 .../segment_v2/inverted_index/analyzer/analyzer.h  |   5 +-
 .../inverted_index/analyzer/basic/basic_analyzer.h |  47 ++-
 .../inverted_index/analyzer/custom_analyzer.cpp    |  51 ++--
 .../inverted_index/analyzer/custom_analyzer.h      |  28 +-
 .../analyzer/custom_analyzer_config.cpp            |  10 +
 .../analyzer/custom_analyzer_config.h              |   4 +
 .../inverted_index/analyzer/icu/icu_analyzer.h     |  49 ++-
 .../tokenizer.h => char_filter/char_filter.h}      |  41 ++-
 .../char_filter/char_filter_factory.h              |  24 +-
 .../char_filter/char_replace_char_filter.cpp       |  26 +-
 .../char_filter/char_replace_char_filter.h         |  16 +-
 .../char_filter/char_replace_char_filter_factory.h |  75 +++++
 .../rowset/segment_v2/inverted_index/setting.h     |  61 +++-
 .../inverted_index/token_filter/token_filter.h     |   2 -
 .../token_filter/word_delimiter_filter_factory.h   |   2 +
 .../segment_v2/inverted_index/token_stream.h       |  37 ++-
 .../basic/basic_tokenizer.cpp                      |  39 ++-
 .../basic/basic_tokenizer.h                        |  30 +-
 .../tokenizer/basic/basic_tokenizer_factory.h      |  50 ++++
 .../icu/break_iterator_wrapper.cpp                 |   4 +-
 .../icu/break_iterator_wrapper.h                   |   4 +-
 .../icu/composite_break_iterator.cpp               |   4 +-
 .../icu/composite_break_iterator.h                 |   4 +-
 .../icu/default_icu_tokenizer_config.cpp           |   4 +-
 .../icu/default_icu_tokenizer_config.h             |   4 +-
 .../{analyzer => tokenizer}/icu/icu_common.h       |   4 +-
 .../{analyzer => tokenizer}/icu/icu_tokenizer.cpp  |  18 +-
 .../{analyzer => tokenizer}/icu/icu_tokenizer.h    |  13 +-
 .../icu/icu_tokenizer_config.h                     |   4 +-
 .../icu/icu_tokenizer_factory.h}                   |  21 +-
 .../icu/script_iterator.cpp                        |   4 +-
 .../{analyzer => tokenizer}/icu/script_iterator.h  |   4 +-
 .../tokenizer/standard/standard_tokenizer_impl.h   |   7 +-
 .../inverted_index/tokenizer/tokenizer.h           |   9 +-
 .../{token_filter/token_filter.h => util/reader.h} |  17 +-
 .../rowset/segment_v2/inverted_index_writer.cpp    |  17 +-
 .../olap/rowset/segment_v2/inverted_index_writer.h |   8 +-
 be/src/runtime/index_policy/index_policy_mgr.cpp   |  84 ++++--
 be/src/runtime/index_policy/index_policy_mgr.h     |   7 +
 be/src/vec/functions/function_tokenize.cpp         |   4 +-
 be/src/vec/functions/match.cpp                     |   6 +-
 .../inverted_index/analyzer/icu_analyzer_test.cpp  |  59 +++-
 .../analyzer/simple_analyzer_test.cpp              |   8 +-
 .../ananlyzer/custom_analyzer_test.cpp             | 328 +++++++++++++--------
 .../char_filter/char_filter_test.cpp               |  76 +++++
 .../char_replace_char_filter_factory_test.cpp      | 193 ++++++++++++
 .../inverted_index/query_v2/boolean_query_test.cpp |   3 +-
 .../segment_v2/inverted_index/setting_test.cpp     |  95 +++++-
 .../ascii_folding_filter_factory_test.cpp          |   6 +-
 .../lower_case_filter_factory_test.cpp             |   6 +-
 .../token_filter/word_delimiter_filter_test.cpp    |   6 +-
 .../tokenizer/basic_tokenizer_factory_test.cpp     | 189 ++++++++++++
 .../char_group_tokenizer_factory_test.cpp          |   6 +-
 .../tokenizer/edge_ngram_tokenizer_test.cpp        |   6 +-
 .../tokenizer/icu_tokenizer_factory_test.cpp       | 212 +++++++++++++
 .../tokenizer/keyword_analyzer_test.cpp            |   6 +-
 .../tokenizer/ngram_tokenizer_test.cpp             |   6 +-
 .../tokenizer/standard_tokenizer_factory_test.cpp  |   6 +-
 .../segment_v2/inverted_index/util/reader_test.cpp |  92 ++++++
 .../antlr4/org/apache/doris/nereids/DorisLexer.g4  |   1 +
 .../antlr4/org/apache/doris/nereids/DorisParser.g4 |   4 +
 .../doris/indexpolicy/BasicTokenizerValidator.java |  54 ++++
 .../CharReplaceCharFilterValidator.java            |  63 ++++
 ...icyTypeEnum.java => ICUTokenizerValidator.java} |  38 ++-
 .../org/apache/doris/indexpolicy/IndexPolicy.java  |   6 +-
 .../apache/doris/indexpolicy/IndexPolicyMgr.java   |  56 +++-
 .../doris/indexpolicy/IndexPolicyTypeEnum.java     |   3 +-
 .../doris/nereids/parser/LogicalPlanBuilder.java   |  28 ++
 .../apache/doris/nereids/trees/plans/PlanType.java |   3 +
 .../commands/CreateIndexCharFilterCommand.java     |  75 +++++
 .../plans/commands/DropIndexCharFilterCommand.java |  62 ++++
 .../plans/commands/ShowIndexCharFilterCommand.java |  60 ++++
 .../trees/plans/visitor/CommandVisitor.java        |  18 ++
 gensrc/thrift/AgentService.thrift                  |   3 +-
 .../analyzer/test_custom_analyzer.out              |   6 +
 .../analyzer/test_custom_analyzer1.out             |   6 +
 .../analyzer/test_custom_analyzer.groovy           |  26 +-
 .../analyzer/test_custom_analyzer1.groovy          |  69 ++++-
 82 files changed, 2273 insertions(+), 496 deletions(-)
 copy be/src/olap/rowset/segment_v2/inverted_index/{tokenizer/tokenizer.h => 
char_filter/char_filter.h} (52%)
 create mode 100644 
be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/basic/basic_tokenizer.cpp (73%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/basic/basic_tokenizer.h (65%)
 create mode 100644 
be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/break_iterator_wrapper.cpp (97%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/break_iterator_wrapper.h (94%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/composite_break_iterator.cpp (97%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/composite_break_iterator.h (94%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/default_icu_tokenizer_config.cpp (98%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/default_icu_tokenizer_config.h (94%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/icu_common.h (93%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/icu_tokenizer.cpp (84%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/icu_tokenizer.h (82%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/icu_tokenizer_config.h (93%)
 copy be/src/olap/rowset/segment_v2/inverted_index/{token_filter/token_filter.h 
=> tokenizer/icu/icu_tokenizer_factory.h} (61%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/script_iterator.cpp (97%)
 rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => 
tokenizer}/icu/script_iterator.h (95%)
 copy be/src/olap/rowset/segment_v2/inverted_index/{token_filter/token_filter.h 
=> util/reader.h} (62%)
 create mode 100644 
be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_test.cpp
 create mode 100644 
be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory_test.cpp
 create mode 100644 
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/basic_tokenizer_factory_test.cpp
 create mode 100644 
be/test/olap/rowset/segment_v2/inverted_index/tokenizer/icu_tokenizer_factory_test.cpp
 create mode 100644 
be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp
 create mode 100644 
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/BasicTokenizerValidator.java
 create mode 100644 
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharReplaceCharFilterValidator.java
 copy 
fe/fe-core/src/main/java/org/apache/doris/indexpolicy/{IndexPolicyTypeEnum.java 
=> ICUTokenizerValidator.java} (56%)
 create mode 100644 
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexCharFilterCommand.java
 create mode 100644 
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexCharFilterCommand.java
 create mode 100644 
fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexCharFilterCommand.java


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to