This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new ed108d48fa [fix](invert index) fix query use char filter (#24268)
ed108d48fa is described below
commit ed108d48fae90cb1e9cea17a21b8e0673e73fb21
Author: zzzxl <[email protected]>
AuthorDate: Thu Sep 14 11:42:47 2023 +0800
[fix](invert index) fix query use char filter (#24268)
---
be/src/olap/inverted_index_parser.h | 4 +-
.../rowset/segment_v2/inverted_index_reader.cpp | 14 +++++-
be/src/vec/exprs/vmatch_predicate.cpp | 1 +
be/src/vec/functions/function_tokenize.cpp | 2 +
.../apache/doris/analysis/InvertedIndexUtil.java | 42 +++++++++++++++++
.../org/apache/doris/analysis/MatchPredicate.java | 11 ++++-
.../main/java/org/apache/doris/catalog/Index.java | 4 ++
.../glue/translator/ExpressionTranslator.java | 7 ++-
gensrc/thrift/Exprs.thrift | 1 +
.../char_filter/test_char_replace.out | 54 ++++++++++++++++++++++
.../char_filter/test_char_replace.groovy | 27 +++++++++--
11 files changed, 160 insertions(+), 7 deletions(-)
diff --git a/be/src/olap/inverted_index_parser.h
b/be/src/olap/inverted_index_parser.h
index 307c78e635..df4f0769f9 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -32,13 +32,15 @@ enum class InvertedIndexParserType {
PARSER_UNICODE = 5,
};
+using CharFilterMap = std::map<std::string, std::string>;
+
struct InvertedIndexCtx {
InvertedIndexParserType parser_type;
std::string parser_mode;
+ CharFilterMap char_filter_map;
};
using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
-using CharFilterMap = std::map<std::string, std::string>;
const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 3d5801ecb7..cef35a9f51 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -51,8 +51,10 @@
#include "common/config.h"
#include "common/logging.h"
#include "io/fs/file_system.h"
+#include "olap/inverted_index_parser.h"
#include "olap/key_coder.h"
#include "olap/olap_common.h"
+#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
#include "olap/rowset/segment_v2/inverted_index_cache.h"
#include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
@@ -124,7 +126,15 @@ std::vector<std::wstring>
InvertedIndexReader::get_analyse_result(
// default
analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
}
- reader.reset(new lucene::util::SStringReader<char>(value.data(),
value.size(), false));
+ reader.reset(new lucene::util::SStringReader<char>());
+ CharFilterMap& char_filter_map = inverted_index_ctx->char_filter_map;
+ if (!char_filter_map.empty()) {
+ reader.reset(CharFilterFactory::create(
+ char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE],
reader.release(),
+ char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
+ char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
+ }
+ reader->init(value.data(), value.size(), false);
std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
@@ -232,6 +242,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
get_parser_string_from_properties(_index_meta.properties()));
inverted_index_ctx->parser_mode =
get_parser_mode_string_from_properties(_index_meta.properties());
+ inverted_index_ctx->char_filter_map =
+ get_parser_char_filter_map_from_properties(_index_meta.properties());
try {
std::vector<std::wstring> analyse_result =
get_analyse_result(column_name, search_str, query_type,
inverted_index_ctx.get());
diff --git a/be/src/vec/exprs/vmatch_predicate.cpp
b/be/src/vec/exprs/vmatch_predicate.cpp
index 3eee50c974..f6ba52705f 100644
--- a/be/src/vec/exprs/vmatch_predicate.cpp
+++ b/be/src/vec/exprs/vmatch_predicate.cpp
@@ -49,6 +49,7 @@ VMatchPredicate::VMatchPredicate(const TExprNode& node) :
VExpr(node) {
_inverted_index_ctx->parser_type =
get_inverted_index_parser_type_from_string(node.match_predicate.parser_type);
_inverted_index_ctx->parser_mode = node.match_predicate.parser_mode;
+ _inverted_index_ctx->char_filter_map =
node.match_predicate.char_filter_map;
}
Status VMatchPredicate::prepare(RuntimeState* state, const RowDescriptor& desc,
diff --git a/be/src/vec/functions/function_tokenize.cpp
b/be/src/vec/functions/function_tokenize.cpp
index c7764bcf49..72a400a58d 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -156,6 +156,8 @@ Status FunctionTokenize::execute_impl(FunctionContext*
/*context*/, Block& block
inverted_index_ctx.parser_type =
get_inverted_index_parser_type_from_string(
get_parser_string_from_properties(properties));
inverted_index_ctx.parser_mode =
get_parser_mode_string_from_properties(properties);
+ inverted_index_ctx.char_filter_map =
+ get_parser_char_filter_map_from_properties(properties);
_do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column,
dest_offsets,
dest_nested_null_map);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index 84c99bfa73..5fe3d47dfa 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -20,6 +20,7 @@ package org.apache.doris.analysis;
import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.common.AnalysisException;
+import java.util.HashMap;
import java.util.Map;
public class InvertedIndexUtil {
@@ -36,6 +37,12 @@ public class InvertedIndexUtil {
public static String INVERTED_INDEX_PARSER_FINE_GRANULARITY =
"fine_grained";
public static String INVERTED_INDEX_PARSER_COARSE_GRANULARITY =
"coarse_grained";
+ public static String INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE =
"char_filter_type";
+ public static String INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN =
"char_filter_pattern";
+ public static String INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT =
"char_filter_replacement";
+
+ public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE =
"char_replace";
+
public static String getInvertedIndexParser(Map<String, String>
properties) {
String parser = properties == null ? null :
properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
@@ -48,6 +55,41 @@ public class InvertedIndexUtil {
return mode != null ? mode : INVERTED_INDEX_PARSER_FINE_GRANULARITY;
}
+ public static Map<String, String> getInvertedIndexCharFilter(Map<String,
String> properties) {
+ if (properties == null) {
+ return new HashMap<>();
+ }
+
+ if (!properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE)) {
+ return new HashMap<>();
+ }
+ String type = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
+
+ Map<String, String> charFilterMap = new HashMap<>();
+ if (type.equals(INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE)) {
+ // type
+ charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE,
INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE);
+
+ // pattern
+ if
(!properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN)) {
+ return new HashMap<>();
+ }
+ String pattern =
properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
+ charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN,
pattern);
+
+ // placement
+ String replacement = " ";
+ if
(properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT)) {
+ replacement =
properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
+ }
+ charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
replacement);
+ } else {
+ return new HashMap<>();
+ }
+
+ return charFilterMap;
+ }
+
public static void checkInvertedIndexParser(String indexColName,
PrimitiveType colType,
Map<String, String> properties) throws AnalysisException {
String parser = null;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
index 8311a183e2..1057961452 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
@@ -37,6 +37,7 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.List;
+import java.util.Map;
import java.util.Objects;
/**
@@ -152,6 +153,7 @@ public class MatchPredicate extends Predicate {
private final Operator op;
private String invertedIndexParser;
private String invertedIndexParserMode;
+ private Map<String, String> invertedIndexCharFilter;
public MatchPredicate(Operator op, Expr e1, Expr e2) {
super();
@@ -179,13 +181,15 @@ public class MatchPredicate extends Predicate {
op = other.op;
invertedIndexParser = other.invertedIndexParser;
invertedIndexParserMode = other.invertedIndexParserMode;
+ invertedIndexCharFilter = other.invertedIndexCharFilter;
}
/**
* use for Nereids ONLY
*/
public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType,
- NullableMode nullableMode, String invertedIndexParser, String
invertedIndexParserMode) {
+ NullableMode nullableMode, String invertedIndexParser, String
invertedIndexParserMode,
+ Map<String, String> invertedIndexCharFilter) {
this(op, e1, e2);
if (invertedIndexParser != null) {
this.invertedIndexParser = invertedIndexParser;
@@ -193,6 +197,9 @@ public class MatchPredicate extends Predicate {
if (invertedIndexParserMode != null) {
this.invertedIndexParserMode = invertedIndexParserMode;
}
+ if (invertedIndexParserMode != null) {
+ this.invertedIndexCharFilter = invertedIndexCharFilter;
+ }
fn = new Function(new FunctionName(op.name),
Lists.newArrayList(e1.getType(), e2.getType()), retType,
false, true, nullableMode);
}
@@ -224,6 +231,7 @@ public class MatchPredicate extends Predicate {
msg.node_type = TExprNodeType.MATCH_PRED;
msg.setOpcode(op.getOpcode());
msg.match_predicate = new TMatchPredicate(invertedIndexParser,
invertedIndexParserMode);
+ msg.match_predicate.setCharFilterMap(invertedIndexCharFilter);
}
@Override
@@ -278,6 +286,7 @@ public class MatchPredicate extends Predicate {
if (slotRef.getColumnName().equals(columns.get(0))) {
invertedIndexParser =
index.getInvertedIndexParser();
invertedIndexParserMode =
index.getInvertedIndexParserMode();
+ invertedIndexCharFilter =
index.getInvertedIndexCharFilter();
break;
}
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
index e743087d26..e2235868a1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
@@ -135,6 +135,10 @@ public class Index implements Writable {
return InvertedIndexUtil.getInvertedIndexParserMode(properties);
}
+ public Map<String, String> getInvertedIndexCharFilter() {
+ return InvertedIndexUtil.getInvertedIndexCharFilter(properties);
+ }
+
public String getComment() {
return comment;
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
index d6e9911145..1f34e8c6d3 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
@@ -102,7 +102,9 @@ import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
@@ -194,6 +196,7 @@ public class ExpressionTranslator extends
DefaultExpressionVisitor<Expr, PlanTra
public Expr visitMatch(Match match, PlanTranslatorContext context) {
String invertedIndexParser =
InvertedIndexUtil.INVERTED_INDEX_PARSER_UNKNOWN;
String invertedIndexParserMode =
InvertedIndexUtil.INVERTED_INDEX_PARSER_FINE_GRANULARITY;
+ Map<String, String> invertedIndexCharFilter = new HashMap<>();
SlotRef left = (SlotRef) match.left().accept(this, context);
OlapTable olapTbl =
Optional.ofNullable(getOlapTableFromSlotDesc(left.getDesc()))
.orElse(getOlapTableDirectly(left));
@@ -210,6 +213,7 @@ public class ExpressionTranslator extends
DefaultExpressionVisitor<Expr, PlanTra
if (columns != null && !columns.isEmpty() &&
left.getColumnName().equals(columns.get(0))) {
invertedIndexParser = index.getInvertedIndexParser();
invertedIndexParserMode =
index.getInvertedIndexParserMode();
+ invertedIndexCharFilter =
index.getInvertedIndexCharFilter();
break;
}
}
@@ -223,7 +227,8 @@ public class ExpressionTranslator extends
DefaultExpressionVisitor<Expr, PlanTra
match.getDataType().toCatalogDataType(),
NullableMode.DEPEND_ON_ARGUMENT,
invertedIndexParser,
- invertedIndexParserMode);
+ invertedIndexParserMode,
+ invertedIndexCharFilter);
}
@Override
diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift
index e211ded8ce..e102babca2 100644
--- a/gensrc/thrift/Exprs.thrift
+++ b/gensrc/thrift/Exprs.thrift
@@ -142,6 +142,7 @@ struct TLikePredicate {
struct TMatchPredicate {
1: required string parser_type;
2: required string parser_mode;
+ 3: optional map<string, string> char_filter_map;
}
struct TLiteralPredicate {
diff --git
a/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
index 3cef00b125..8020e1c877 100644
--- a/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
+++ b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
@@ -44,3 +44,57 @@
-- !sql --
10
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
diff --git
a/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
index c8916517f0..556a91c5df 100644
---
a/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
+++
b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
@@ -34,9 +34,9 @@ suite("test_char_replace") {
`a` text NULL,
`b` string NULL,
`c` string NULL,
- INDEX a_idx(`a`) USING INVERTED PROPERTIES("parser" = "unicode")
COMMENT '',
- INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" =
"unicode", "char_filter_type" = "char_replace", "char_filter_pattern" = "._",
"char_filter_replacement" = " ") COMMENT '',
- INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser" = "unicode",
"char_filter_type" = "char_replace", "char_filter_pattern" = "._") COMMENT ''
+ INDEX a_idx(`a`) USING INVERTED PROPERTIES("parser" = "unicode",
"support_phrase" = "true") COMMENT '',
+ INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" =
"unicode", "support_phrase" = "true", "char_filter_type" = "char_replace",
"char_filter_pattern" = "._", "char_filter_replacement" = " ") COMMENT '',
+ INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser" = "unicode",
"support_phrase" = "true", "char_filter_type" = "char_replace",
"char_filter_pattern" = "._") COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`id`)
COMMENT 'OLAP'
@@ -79,4 +79,25 @@ suite("test_char_replace") {
qt_sql "SELECT count() FROM ${indexTblName} where c match 'jpg'";
qt_sql "SELECT count() FROM ${indexTblName} where c match '1'";
qt_sql "SELECT count() FROM ${indexTblName} where c match '0'";
+
+ qt_sql "SELECT count() FROM ${indexTblName} where a match_any 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where a match_all 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where a match_phrase 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where a match_any 'hm bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where a match_all 'hm bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where a match_phrase 'hm bg'";
+
+ qt_sql "SELECT count() FROM ${indexTblName} where b match_any 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where b match_all 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where b match_phrase 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where b match_any 'hm bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where b match_all 'hm bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where b match_phrase 'hm bg'";
+
+ qt_sql "SELECT count() FROM ${indexTblName} where c match_any 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where c match_all 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where c match_phrase 'hm_bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where c match_any 'hm bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where c match_all 'hm bg'";
+ qt_sql "SELECT count() FROM ${indexTblName} where c match_phrase 'hm bg'";
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]