This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 192cb8a012d413bb5c6a0ebe7847c63d946f8651 Author: zzzxl <[email protected]> AuthorDate: Thu Sep 14 11:42:47 2023 +0800 [fix](invert index) fix query use char filter (#24268) --- be/src/olap/inverted_index_parser.h | 4 +- .../rowset/segment_v2/inverted_index_reader.cpp | 14 +++++- be/src/vec/exprs/vmatch_predicate.cpp | 1 + be/src/vec/functions/function_tokenize.cpp | 2 + .../apache/doris/analysis/InvertedIndexUtil.java | 42 +++++++++++++++++ .../org/apache/doris/analysis/MatchPredicate.java | 11 ++++- .../main/java/org/apache/doris/catalog/Index.java | 4 ++ .../glue/translator/ExpressionTranslator.java | 7 ++- gensrc/thrift/Exprs.thrift | 1 + .../char_filter/test_char_replace.out | 54 ++++++++++++++++++++++ .../char_filter/test_char_replace.groovy | 27 +++++++++-- 11 files changed, 160 insertions(+), 7 deletions(-) diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index 307c78e635..df4f0769f9 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -32,13 +32,15 @@ enum class InvertedIndexParserType { PARSER_UNICODE = 5, }; +using CharFilterMap = std::map<std::string, std::string>; + struct InvertedIndexCtx { InvertedIndexParserType parser_type; std::string parser_mode; + CharFilterMap char_filter_map; }; using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>; -using CharFilterMap = std::map<std::string, std::string>; const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode"; const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained"; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 3d5801ecb7..cef35a9f51 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -51,8 +51,10 @@ #include "common/config.h" #include "common/logging.h" #include "io/fs/file_system.h" +#include 
"olap/inverted_index_parser.h" #include "olap/key_coder.h" #include "olap/olap_common.h" +#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" #include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h" #include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_compound_directory.h" @@ -124,7 +126,15 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result( // default analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>(); } - reader.reset(new lucene::util::SStringReader<char>(value.data(), value.size(), false)); + reader.reset(new lucene::util::SStringReader<char>()); + CharFilterMap& char_filter_map = inverted_index_ctx->char_filter_map; + if (!char_filter_map.empty()) { + reader.reset(CharFilterFactory::create( + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], reader.release(), + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN], + char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT])); + } + reader->init(value.data(), value.size(), false); std::wstring field_ws = std::wstring(field_name.begin(), field_name.end()); std::unique_ptr<lucene::analysis::TokenStream> token_stream( @@ -232,6 +242,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run get_parser_string_from_properties(_index_meta.properties())); inverted_index_ctx->parser_mode = get_parser_mode_string_from_properties(_index_meta.properties()); + inverted_index_ctx->char_filter_map = + get_parser_char_filter_map_from_properties(_index_meta.properties()); try { std::vector<std::wstring> analyse_result = get_analyse_result(column_name, search_str, query_type, inverted_index_ctx.get()); diff --git a/be/src/vec/exprs/vmatch_predicate.cpp b/be/src/vec/exprs/vmatch_predicate.cpp index 3eee50c974..f6ba52705f 100644 --- a/be/src/vec/exprs/vmatch_predicate.cpp +++ b/be/src/vec/exprs/vmatch_predicate.cpp @@ -49,6 +49,7 @@ 
VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) { _inverted_index_ctx->parser_type = get_inverted_index_parser_type_from_string(node.match_predicate.parser_type); _inverted_index_ctx->parser_mode = node.match_predicate.parser_mode; + _inverted_index_ctx->char_filter_map = node.match_predicate.char_filter_map; } Status VMatchPredicate::prepare(RuntimeState* state, const RowDescriptor& desc, diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index c7764bcf49..72a400a58d 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -156,6 +156,8 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block inverted_index_ctx.parser_type = get_inverted_index_parser_type_from_string( get_parser_string_from_properties(properties)); inverted_index_ctx.parser_mode = get_parser_mode_string_from_properties(properties); + inverted_index_ctx.char_filter_map = + get_parser_char_filter_map_from_properties(properties); _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets, dest_nested_null_map); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 84c99bfa73..5fe3d47dfa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -20,6 +20,7 @@ package org.apache.doris.analysis; import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.common.AnalysisException; +import java.util.HashMap; import java.util.Map; public class InvertedIndexUtil { @@ -36,6 +37,12 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained"; public static String INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained"; + public static String 
INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type"; + public static String INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern"; + public static String INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement"; + + public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace"; + public static String getInvertedIndexParser(Map<String, String> properties) { String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY); // default is "none" if not set @@ -48,6 +55,41 @@ public class InvertedIndexUtil { return mode != null ? mode : INVERTED_INDEX_PARSER_FINE_GRANULARITY; } + public static Map<String, String> getInvertedIndexCharFilter(Map<String, String> properties) { + if (properties == null) { + return new HashMap<>(); + } + + if (!properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE)) { + return new HashMap<>(); + } + String type = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE); + + Map<String, String> charFilterMap = new HashMap<>(); + if (type.equals(INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE)) { + // type + charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE); + + // pattern + if (!properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN)) { + return new HashMap<>(); + } + String pattern = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN); + charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, pattern); + + // replacement + String replacement = " "; + if (properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT)) { + replacement = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT); + } + charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, replacement); + } else { + return new HashMap<>(); + } + + return charFilterMap; + } + public static void checkInvertedIndexParser(String indexColName, PrimitiveType colType, Map<String, String> properties) throws AnalysisException { 
String parser = null; diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java index 8311a183e2..1057961452 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java @@ -37,6 +37,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.List; +import java.util.Map; import java.util.Objects; /** @@ -152,6 +153,7 @@ public class MatchPredicate extends Predicate { private final Operator op; private String invertedIndexParser; private String invertedIndexParserMode; + private Map<String, String> invertedIndexCharFilter; public MatchPredicate(Operator op, Expr e1, Expr e2) { super(); @@ -179,13 +181,15 @@ public class MatchPredicate extends Predicate { op = other.op; invertedIndexParser = other.invertedIndexParser; invertedIndexParserMode = other.invertedIndexParserMode; + invertedIndexCharFilter = other.invertedIndexCharFilter; } /** * use for Nereids ONLY */ public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType, - NullableMode nullableMode, String invertedIndexParser, String invertedIndexParserMode) { + NullableMode nullableMode, String invertedIndexParser, String invertedIndexParserMode, + Map<String, String> invertedIndexCharFilter) { this(op, e1, e2); if (invertedIndexParser != null) { this.invertedIndexParser = invertedIndexParser; @@ -193,6 +197,9 @@ public class MatchPredicate extends Predicate { if (invertedIndexParserMode != null) { this.invertedIndexParserMode = invertedIndexParserMode; } + if (invertedIndexCharFilter != null) { + this.invertedIndexCharFilter = invertedIndexCharFilter; + } fn = new Function(new FunctionName(op.name), Lists.newArrayList(e1.getType(), e2.getType()), retType, false, true, nullableMode); } @@ -224,6 +231,7 @@ public class MatchPredicate extends Predicate { msg.node_type = 
TExprNodeType.MATCH_PRED; msg.setOpcode(op.getOpcode()); msg.match_predicate = new TMatchPredicate(invertedIndexParser, invertedIndexParserMode); + msg.match_predicate.setCharFilterMap(invertedIndexCharFilter); } @Override @@ -278,6 +286,7 @@ public class MatchPredicate extends Predicate { if (slotRef.getColumnName().equals(columns.get(0))) { invertedIndexParser = index.getInvertedIndexParser(); invertedIndexParserMode = index.getInvertedIndexParserMode(); + invertedIndexCharFilter = index.getInvertedIndexCharFilter(); break; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java index e743087d26..e2235868a1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java @@ -135,6 +135,10 @@ public class Index implements Writable { return InvertedIndexUtil.getInvertedIndexParserMode(properties); } + public Map<String, String> getInvertedIndexCharFilter() { + return InvertedIndexUtil.getInvertedIndexCharFilter(properties); + } + public String getComment() { return comment; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java index e4777a32df..445817f9e6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java @@ -96,7 +96,9 @@ import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; @@ -188,6 +190,7 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra public Expr visitMatch(Match match, 
PlanTranslatorContext context) { String invertedIndexParser = InvertedIndexUtil.INVERTED_INDEX_PARSER_UNKNOWN; String invertedIndexParserMode = InvertedIndexUtil.INVERTED_INDEX_PARSER_FINE_GRANULARITY; + Map<String, String> invertedIndexCharFilter = new HashMap<>(); SlotRef left = (SlotRef) match.left().accept(this, context); OlapTable olapTbl = Optional.ofNullable(getOlapTableFromSlotDesc(left.getDesc())) .orElse(getOlapTableDirectly(left)); @@ -204,6 +207,7 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra if (columns != null && !columns.isEmpty() && left.getColumnName().equals(columns.get(0))) { invertedIndexParser = index.getInvertedIndexParser(); invertedIndexParserMode = index.getInvertedIndexParserMode(); + invertedIndexCharFilter = index.getInvertedIndexCharFilter(); break; } } @@ -217,7 +221,8 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra match.getDataType().toCatalogDataType(), NullableMode.DEPEND_ON_ARGUMENT, invertedIndexParser, - invertedIndexParserMode); + invertedIndexParserMode, + invertedIndexCharFilter); } @Override diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift index e211ded8ce..e102babca2 100644 --- a/gensrc/thrift/Exprs.thrift +++ b/gensrc/thrift/Exprs.thrift @@ -142,6 +142,7 @@ struct TLikePredicate { struct TMatchPredicate { 1: required string parser_type; 2: required string parser_mode; + 3: optional map<string, string> char_filter_map; } struct TLiteralPredicate { diff --git a/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out index 3cef00b125..8020e1c877 100644 --- a/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out +++ b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out @@ -44,3 +44,57 @@ -- !sql -- 10 +-- !sql -- +0 + +-- !sql -- +0 + +-- !sql -- +0 + +-- !sql -- +0 + +-- !sql -- +0 + +-- !sql -- +0 + +-- 
!sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + +-- !sql -- +10 + diff --git a/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy index c8916517f0..556a91c5df 100644 --- a/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy +++ b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy @@ -34,9 +34,9 @@ suite("test_char_replace") { `a` text NULL, `b` string NULL, `c` string NULL, - INDEX a_idx(`a`) USING INVERTED PROPERTIES("parser" = "unicode") COMMENT '', - INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = "unicode", "char_filter_type" = "char_replace", "char_filter_pattern" = "._", "char_filter_replacement" = " ") COMMENT '', - INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser" = "unicode", "char_filter_type" = "char_replace", "char_filter_pattern" = "._") COMMENT '' + INDEX a_idx(`a`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true") COMMENT '', + INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true", "char_filter_type" = "char_replace", "char_filter_pattern" = "._", "char_filter_replacement" = " ") COMMENT '', + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true", "char_filter_type" = "char_replace", "char_filter_pattern" = "._") COMMENT '' ) ENGINE=OLAP DUPLICATE KEY(`id`) COMMENT 'OLAP' @@ -79,4 +79,25 @@ suite("test_char_replace") { qt_sql "SELECT count() FROM ${indexTblName} where c match 'jpg'"; qt_sql "SELECT count() FROM ${indexTblName} where c match '1'"; qt_sql "SELECT count() FROM ${indexTblName} where c match '0'"; + + qt_sql "SELECT count() FROM ${indexTblName} where a match_any 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where a 
match_all 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where a match_phrase 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where a match_any 'hm bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where a match_all 'hm bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where a match_phrase 'hm bg'"; + + qt_sql "SELECT count() FROM ${indexTblName} where b match_any 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match_all 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match_phrase 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match_any 'hm bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match_all 'hm bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where b match_phrase 'hm bg'"; + + qt_sql "SELECT count() FROM ${indexTblName} where c match_any 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match_all 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match_phrase 'hm_bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match_any 'hm bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match_all 'hm bg'"; + qt_sql "SELECT count() FROM ${indexTblName} where c match_phrase 'hm bg'"; } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
