This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new ed108d48fa [fix](invert index) fix query use char filter (#24268)
ed108d48fa is described below

commit ed108d48fae90cb1e9cea17a21b8e0673e73fb21
Author: zzzxl <[email protected]>
AuthorDate: Thu Sep 14 11:42:47 2023 +0800

    [fix](invert index) fix query use char filter (#24268)
---
 be/src/olap/inverted_index_parser.h                |  4 +-
 .../rowset/segment_v2/inverted_index_reader.cpp    | 14 +++++-
 be/src/vec/exprs/vmatch_predicate.cpp              |  1 +
 be/src/vec/functions/function_tokenize.cpp         |  2 +
 .../apache/doris/analysis/InvertedIndexUtil.java   | 42 +++++++++++++++++
 .../org/apache/doris/analysis/MatchPredicate.java  | 11 ++++-
 .../main/java/org/apache/doris/catalog/Index.java  |  4 ++
 .../glue/translator/ExpressionTranslator.java      |  7 ++-
 gensrc/thrift/Exprs.thrift                         |  1 +
 .../char_filter/test_char_replace.out              | 54 ++++++++++++++++++++++
 .../char_filter/test_char_replace.groovy           | 27 +++++++++--
 11 files changed, 160 insertions(+), 7 deletions(-)

diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h
index 307c78e635..df4f0769f9 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -32,13 +32,15 @@ enum class InvertedIndexParserType {
     PARSER_UNICODE = 5,
 };
 
+using CharFilterMap = std::map<std::string, std::string>;
+
 struct InvertedIndexCtx {
     InvertedIndexParserType parser_type;
     std::string parser_mode;
+    CharFilterMap char_filter_map;
 };
 
 using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
-using CharFilterMap = std::map<std::string, std::string>;
 
 const std::string INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
 const std::string INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 3d5801ecb7..cef35a9f51 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -51,8 +51,10 @@
 #include "common/config.h"
 #include "common/logging.h"
 #include "io/fs/file_system.h"
+#include "olap/inverted_index_parser.h"
 #include "olap/key_coder.h"
 #include "olap/olap_common.h"
+#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
 #include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h"
 #include "olap/rowset/segment_v2/inverted_index_cache.h"
 #include "olap/rowset/segment_v2/inverted_index_compound_directory.h"
@@ -124,7 +126,15 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
         // default
         analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>();
     }
-    reader.reset(new lucene::util::SStringReader<char>(value.data(), value.size(), false));
+    reader.reset(new lucene::util::SStringReader<char>());
+    CharFilterMap& char_filter_map = inverted_index_ctx->char_filter_map;
+    if (!char_filter_map.empty()) {
+        reader.reset(CharFilterFactory::create(
+                char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], reader.release(),
+                char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
+                char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
+    }
+    reader->init(value.data(), value.size(), false);
 
     std::wstring field_ws = std::wstring(field_name.begin(), field_name.end());
     std::unique_ptr<lucene::analysis::TokenStream> token_stream(
@@ -232,6 +242,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run
             get_parser_string_from_properties(_index_meta.properties()));
     inverted_index_ctx->parser_mode =
             get_parser_mode_string_from_properties(_index_meta.properties());
+    inverted_index_ctx->char_filter_map =
+            get_parser_char_filter_map_from_properties(_index_meta.properties());
     try {
         std::vector<std::wstring> analyse_result =
                 get_analyse_result(column_name, search_str, query_type, inverted_index_ctx.get());
diff --git a/be/src/vec/exprs/vmatch_predicate.cpp b/be/src/vec/exprs/vmatch_predicate.cpp
index 3eee50c974..f6ba52705f 100644
--- a/be/src/vec/exprs/vmatch_predicate.cpp
+++ b/be/src/vec/exprs/vmatch_predicate.cpp
@@ -49,6 +49,7 @@ VMatchPredicate::VMatchPredicate(const TExprNode& node) : VExpr(node) {
     _inverted_index_ctx->parser_type =
             get_inverted_index_parser_type_from_string(node.match_predicate.parser_type);
     _inverted_index_ctx->parser_mode = node.match_predicate.parser_mode;
+    _inverted_index_ctx->char_filter_map = node.match_predicate.char_filter_map;
 }
 
 Status VMatchPredicate::prepare(RuntimeState* state, const RowDescriptor& desc,
diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
index c7764bcf49..72a400a58d 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -156,6 +156,8 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
             inverted_index_ctx.parser_type = get_inverted_index_parser_type_from_string(
                     get_parser_string_from_properties(properties));
             inverted_index_ctx.parser_mode = get_parser_mode_string_from_properties(properties);
+            inverted_index_ctx.char_filter_map =
+                    get_parser_char_filter_map_from_properties(properties);
             _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets,
                          dest_nested_null_map);
 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index 84c99bfa73..5fe3d47dfa 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -20,6 +20,7 @@ package org.apache.doris.analysis;
 import org.apache.doris.catalog.PrimitiveType;
 import org.apache.doris.common.AnalysisException;
 
+import java.util.HashMap;
 import java.util.Map;
 
 public class InvertedIndexUtil {
@@ -36,6 +37,12 @@ public class InvertedIndexUtil {
     public static String INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
     public static String INVERTED_INDEX_PARSER_COARSE_GRANULARITY = "coarse_grained";
 
+    public static String INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
+    public static String INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
+    public static String INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";
+
+    public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace";
+
     public static String getInvertedIndexParser(Map<String, String> properties) {
         String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
         // default is "none" if not set
@@ -48,6 +55,41 @@ public class InvertedIndexUtil {
         return mode != null ? mode : INVERTED_INDEX_PARSER_FINE_GRANULARITY;
     }
 
+    public static Map<String, String> getInvertedIndexCharFilter(Map<String, String> properties) {
+        if (properties == null) {
+            return new HashMap<>();
+        }
+
+        if (!properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE)) {
+            return new HashMap<>();
+        }
+        String type = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
+
+        Map<String, String> charFilterMap = new HashMap<>();
+        if (type.equals(INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE)) {
+            // type
+            charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE, INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE);
+
+            // pattern
+            if (!properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN)) {
+                return new HashMap<>();
+            }
+            String pattern = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
+            charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, pattern);
+
+            // placement
+            String replacement = " ";
+            if (properties.containsKey(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT)) {
+                replacement = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
+            }
+            charFilterMap.put(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, replacement);
+        } else {
+            return new HashMap<>();
+        }
+
+        return charFilterMap;
+    }
+
     public static void checkInvertedIndexParser(String indexColName, PrimitiveType colType,
             Map<String, String> properties) throws AnalysisException {
         String parser = null;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
index 8311a183e2..1057961452 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/MatchPredicate.java
@@ -37,6 +37,7 @@ import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
 import java.util.List;
+import java.util.Map;
 import java.util.Objects;
 
 /**
@@ -152,6 +153,7 @@ public class MatchPredicate extends Predicate {
     private final Operator op;
     private String invertedIndexParser;
     private String invertedIndexParserMode;
+    private Map<String, String> invertedIndexCharFilter;
 
     public MatchPredicate(Operator op, Expr e1, Expr e2) {
         super();
@@ -179,13 +181,15 @@ public class MatchPredicate extends Predicate {
         op = other.op;
         invertedIndexParser = other.invertedIndexParser;
         invertedIndexParserMode = other.invertedIndexParserMode;
+        invertedIndexCharFilter = other.invertedIndexCharFilter;
     }
 
     /**
      * use for Nereids ONLY
      */
     public MatchPredicate(Operator op, Expr e1, Expr e2, Type retType,
-            NullableMode nullableMode, String invertedIndexParser, String invertedIndexParserMode) {
+            NullableMode nullableMode, String invertedIndexParser, String invertedIndexParserMode,
+            Map<String, String> invertedIndexCharFilter) {
         this(op, e1, e2);
         if (invertedIndexParser != null) {
             this.invertedIndexParser = invertedIndexParser;
@@ -193,6 +197,9 @@ public class MatchPredicate extends Predicate {
         if (invertedIndexParserMode != null) {
             this.invertedIndexParserMode = invertedIndexParserMode;
         }
+        if (invertedIndexParserMode != null) {
+            this.invertedIndexCharFilter = invertedIndexCharFilter;
+        }
         fn = new Function(new FunctionName(op.name), Lists.newArrayList(e1.getType(), e2.getType()), retType,
                 false, true, nullableMode);
     }
@@ -224,6 +231,7 @@ public class MatchPredicate extends Predicate {
         msg.node_type = TExprNodeType.MATCH_PRED;
         msg.setOpcode(op.getOpcode());
         msg.match_predicate = new TMatchPredicate(invertedIndexParser, invertedIndexParserMode);
+        msg.match_predicate.setCharFilterMap(invertedIndexCharFilter);
     }
 
     @Override
@@ -278,6 +286,7 @@ public class MatchPredicate extends Predicate {
                         if (slotRef.getColumnName().equals(columns.get(0))) {
                             invertedIndexParser = index.getInvertedIndexParser();
                             invertedIndexParserMode = index.getInvertedIndexParserMode();
+                            invertedIndexCharFilter = index.getInvertedIndexCharFilter();
                             break;
                         }
                     }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
index e743087d26..e2235868a1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
@@ -135,6 +135,10 @@ public class Index implements Writable {
         return InvertedIndexUtil.getInvertedIndexParserMode(properties);
     }
 
+    public Map<String, String> getInvertedIndexCharFilter() {
+        return InvertedIndexUtil.getInvertedIndexCharFilter(properties);
+    }
+
     public String getComment() {
         return comment;
     }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
index d6e9911145..1f34e8c6d3 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/ExpressionTranslator.java
@@ -102,7 +102,9 @@ import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableList;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Optional;
 import java.util.stream.Collectors;
 
@@ -194,6 +196,7 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra
     public Expr visitMatch(Match match, PlanTranslatorContext context) {
         String invertedIndexParser = InvertedIndexUtil.INVERTED_INDEX_PARSER_UNKNOWN;
         String invertedIndexParserMode = InvertedIndexUtil.INVERTED_INDEX_PARSER_FINE_GRANULARITY;
+        Map<String, String> invertedIndexCharFilter = new HashMap<>();
         SlotRef left = (SlotRef) match.left().accept(this, context);
         OlapTable olapTbl = Optional.ofNullable(getOlapTableFromSlotDesc(left.getDesc()))
                                     .orElse(getOlapTableDirectly(left));
@@ -210,6 +213,7 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra
                     if (columns != null && !columns.isEmpty() && left.getColumnName().equals(columns.get(0))) {
                         invertedIndexParser = index.getInvertedIndexParser();
                         invertedIndexParserMode = index.getInvertedIndexParserMode();
+                        invertedIndexCharFilter = index.getInvertedIndexCharFilter();
                         break;
                     }
                 }
@@ -223,7 +227,8 @@ public class ExpressionTranslator extends DefaultExpressionVisitor<Expr, PlanTra
             match.getDataType().toCatalogDataType(),
             NullableMode.DEPEND_ON_ARGUMENT,
             invertedIndexParser,
-            invertedIndexParserMode);
+            invertedIndexParserMode,
+            invertedIndexCharFilter);
     }
 
     @Override
diff --git a/gensrc/thrift/Exprs.thrift b/gensrc/thrift/Exprs.thrift
index e211ded8ce..e102babca2 100644
--- a/gensrc/thrift/Exprs.thrift
+++ b/gensrc/thrift/Exprs.thrift
@@ -142,6 +142,7 @@ struct TLikePredicate {
 struct TMatchPredicate {
   1: required string parser_type;
   2: required string parser_mode;
+  3: optional map<string, string> char_filter_map;
 }
 
 struct TLiteralPredicate {
diff --git a/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
index 3cef00b125..8020e1c877 100644
--- a/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
+++ b/regression-test/data/inverted_index_p0/char_filter/test_char_replace.out
@@ -44,3 +44,57 @@
 -- !sql --
 10
 
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+0
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
+-- !sql --
+10
+
diff --git a/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
index c8916517f0..556a91c5df 100644
--- a/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
+++ b/regression-test/suites/inverted_index_p0/char_filter/test_char_replace.groovy
@@ -34,9 +34,9 @@ suite("test_char_replace") {
                `a` text NULL,
         `b` string NULL,
         `c` string NULL,
-        INDEX a_idx(`a`) USING INVERTED PROPERTIES("parser" = "unicode") COMMENT '',
-               INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = "unicode", "char_filter_type" = "char_replace", "char_filter_pattern" = "._", "char_filter_replacement" = " ") COMMENT '',
-        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser" = "unicode", "char_filter_type" = "char_replace", "char_filter_pattern" = "._") COMMENT ''
+        INDEX a_idx(`a`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true") COMMENT '',
+               INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true", "char_filter_type" = "char_replace", "char_filter_pattern" = "._", "char_filter_replacement" = " ") COMMENT '',
+        INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true", "char_filter_type" = "char_replace", "char_filter_pattern" = "._") COMMENT ''
        ) ENGINE=OLAP
        DUPLICATE KEY(`id`)
        COMMENT 'OLAP'
@@ -79,4 +79,25 @@ suite("test_char_replace") {
     qt_sql "SELECT count() FROM ${indexTblName} where c match 'jpg'";
     qt_sql "SELECT count() FROM ${indexTblName} where c match '1'";
     qt_sql "SELECT count() FROM ${indexTblName} where c match '0'";
+
+    qt_sql "SELECT count() FROM ${indexTblName} where a match_any 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match_all 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match_phrase 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match_any 'hm bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match_all 'hm bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where a match_phrase 'hm bg'";
+
+    qt_sql "SELECT count() FROM ${indexTblName} where b match_any 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match_all 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match_phrase 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match_any 'hm bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match_all 'hm bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where b match_phrase 'hm bg'";
+
+    qt_sql "SELECT count() FROM ${indexTblName} where c match_any 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match_all 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match_phrase 'hm_bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match_any 'hm bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match_all 'hm bg'";
+    qt_sql "SELECT count() FROM ${indexTblName} where c match_phrase 'hm bg'";
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to