This is an automated email from the ASF dual-hosted git repository.
airborne pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
from c4ff082d9c3 [fix](search) fix mow support for search function (#56927)
add d1b0a19c083 [opt](inverted index) add custom analyzer support with char_filter, basic and icu tokenizer (#56243)
No new revisions were added by this update.
Summary of changes:
be/src/olap/inverted_index_parser.cpp | 1 -
be/src/olap/inverted_index_parser.h | 1 +
.../inverted_index/analysis_factory_mgr.cpp | 12 +
.../inverted_index/analyzer/analyzer.cpp | 23 +-
.../segment_v2/inverted_index/analyzer/analyzer.h | 5 +-
.../inverted_index/analyzer/basic/basic_analyzer.h | 47 ++-
.../inverted_index/analyzer/custom_analyzer.cpp | 51 ++--
.../inverted_index/analyzer/custom_analyzer.h | 28 +-
.../analyzer/custom_analyzer_config.cpp | 10 +
.../analyzer/custom_analyzer_config.h | 4 +
.../inverted_index/analyzer/icu/icu_analyzer.h | 49 ++-
.../tokenizer.h => char_filter/char_filter.h} | 41 ++-
.../char_filter/char_filter_factory.h | 24 +-
.../char_filter/char_replace_char_filter.cpp | 26 +-
.../char_filter/char_replace_char_filter.h | 16 +-
.../char_filter/char_replace_char_filter_factory.h | 75 +++++
.../rowset/segment_v2/inverted_index/setting.h | 61 +++-
.../inverted_index/token_filter/token_filter.h | 2 -
.../token_filter/word_delimiter_filter_factory.h | 2 +
.../segment_v2/inverted_index/token_stream.h | 37 ++-
.../basic/basic_tokenizer.cpp | 39 ++-
.../basic/basic_tokenizer.h | 30 +-
.../tokenizer/basic/basic_tokenizer_factory.h | 50 ++++
.../icu/break_iterator_wrapper.cpp | 4 +-
.../icu/break_iterator_wrapper.h | 4 +-
.../icu/composite_break_iterator.cpp | 4 +-
.../icu/composite_break_iterator.h | 4 +-
.../icu/default_icu_tokenizer_config.cpp | 4 +-
.../icu/default_icu_tokenizer_config.h | 4 +-
.../{analyzer => tokenizer}/icu/icu_common.h | 4 +-
.../{analyzer => tokenizer}/icu/icu_tokenizer.cpp | 18 +-
.../{analyzer => tokenizer}/icu/icu_tokenizer.h | 13 +-
.../icu/icu_tokenizer_config.h | 4 +-
.../icu/icu_tokenizer_factory.h} | 21 +-
.../icu/script_iterator.cpp | 4 +-
.../{analyzer => tokenizer}/icu/script_iterator.h | 4 +-
.../tokenizer/standard/standard_tokenizer_impl.h | 7 +-
.../inverted_index/tokenizer/tokenizer.h | 9 +-
.../{token_filter/token_filter.h => util/reader.h} | 17 +-
.../rowset/segment_v2/inverted_index_writer.cpp | 17 +-
.../olap/rowset/segment_v2/inverted_index_writer.h | 8 +-
be/src/runtime/index_policy/index_policy_mgr.cpp | 84 ++++--
be/src/runtime/index_policy/index_policy_mgr.h | 7 +
be/src/vec/functions/function_tokenize.cpp | 4 +-
be/src/vec/functions/match.cpp | 6 +-
.../inverted_index/analyzer/icu_analyzer_test.cpp | 59 +++-
.../analyzer/simple_analyzer_test.cpp | 8 +-
.../ananlyzer/custom_analyzer_test.cpp | 328 +++++++++++++--------
.../char_filter/char_filter_test.cpp | 76 +++++
.../char_replace_char_filter_factory_test.cpp | 193 ++++++++++++
.../inverted_index/query_v2/boolean_query_test.cpp | 3 +-
.../segment_v2/inverted_index/setting_test.cpp | 95 +++++-
.../ascii_folding_filter_factory_test.cpp | 6 +-
.../lower_case_filter_factory_test.cpp | 6 +-
.../token_filter/word_delimiter_filter_test.cpp | 6 +-
.../tokenizer/basic_tokenizer_factory_test.cpp | 189 ++++++++++++
.../char_group_tokenizer_factory_test.cpp | 6 +-
.../tokenizer/edge_ngram_tokenizer_test.cpp | 6 +-
.../tokenizer/icu_tokenizer_factory_test.cpp | 212 +++++++++++++
.../tokenizer/keyword_analyzer_test.cpp | 6 +-
.../tokenizer/ngram_tokenizer_test.cpp | 6 +-
.../tokenizer/standard_tokenizer_factory_test.cpp | 6 +-
.../segment_v2/inverted_index/util/reader_test.cpp | 92 ++++++
.../antlr4/org/apache/doris/nereids/DorisLexer.g4 | 1 +
.../antlr4/org/apache/doris/nereids/DorisParser.g4 | 4 +
.../doris/indexpolicy/BasicTokenizerValidator.java | 54 ++++
.../CharReplaceCharFilterValidator.java | 63 ++++
...icyTypeEnum.java => ICUTokenizerValidator.java} | 38 ++-
.../org/apache/doris/indexpolicy/IndexPolicy.java | 6 +-
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 56 +++-
.../doris/indexpolicy/IndexPolicyTypeEnum.java | 3 +-
.../doris/nereids/parser/LogicalPlanBuilder.java | 28 ++
.../apache/doris/nereids/trees/plans/PlanType.java | 3 +
.../commands/CreateIndexCharFilterCommand.java | 75 +++++
.../plans/commands/DropIndexCharFilterCommand.java | 62 ++++
.../plans/commands/ShowIndexCharFilterCommand.java | 60 ++++
.../trees/plans/visitor/CommandVisitor.java | 18 ++
gensrc/thrift/AgentService.thrift | 3 +-
.../analyzer/test_custom_analyzer.out | 6 +
.../analyzer/test_custom_analyzer1.out | 6 +
.../analyzer/test_custom_analyzer.groovy | 26 +-
.../analyzer/test_custom_analyzer1.groovy | 69 ++++-
82 files changed, 2273 insertions(+), 496 deletions(-)
copy be/src/olap/rowset/segment_v2/inverted_index/{tokenizer/tokenizer.h => char_filter/char_filter.h} (52%)
create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/basic/basic_tokenizer.cpp (73%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/basic/basic_tokenizer.h (65%)
create mode 100644 be/src/olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/break_iterator_wrapper.cpp (97%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/break_iterator_wrapper.h (94%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/composite_break_iterator.cpp (97%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/composite_break_iterator.h (94%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/default_icu_tokenizer_config.cpp (98%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/default_icu_tokenizer_config.h (94%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/icu_common.h (93%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/icu_tokenizer.cpp (84%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/icu_tokenizer.h (82%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/icu_tokenizer_config.h (93%)
copy be/src/olap/rowset/segment_v2/inverted_index/{token_filter/token_filter.h => tokenizer/icu/icu_tokenizer_factory.h} (61%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/script_iterator.cpp (97%)
rename be/src/olap/rowset/segment_v2/inverted_index/{analyzer => tokenizer}/icu/script_iterator.h (95%)
copy be/src/olap/rowset/segment_v2/inverted_index/{token_filter/token_filter.h => util/reader.h} (62%)
create mode 100644 be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_filter_test.cpp
create mode 100644 be/test/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory_test.cpp
create mode 100644 be/test/olap/rowset/segment_v2/inverted_index/tokenizer/basic_tokenizer_factory_test.cpp
create mode 100644 be/test/olap/rowset/segment_v2/inverted_index/tokenizer/icu_tokenizer_factory_test.cpp
create mode 100644 be/test/olap/rowset/segment_v2/inverted_index/util/reader_test.cpp
create mode 100644 fe/fe-core/src/main/java/org/apache/doris/indexpolicy/BasicTokenizerValidator.java
create mode 100644 fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharReplaceCharFilterValidator.java
copy fe/fe-core/src/main/java/org/apache/doris/indexpolicy/{IndexPolicyTypeEnum.java => ICUTokenizerValidator.java} (56%)
create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/CreateIndexCharFilterCommand.java
create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/DropIndexCharFilterCommand.java
create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/ShowIndexCharFilterCommand.java
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]