(doris) branch master updated: [opt](inverted index) Optimize sequential phrase query logic (#41432)

jianliangqi Wed, 09 Oct 2024 23:18:11 -0700

This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new f313f1f8c1d [opt](inverted index) Optimize sequential phrase query 
logic (#41432)
f313f1f8c1d is described below

commit f313f1f8c1d9ea18626a39c8c472bceccab27b9a
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Thu Oct 10 14:17:57 2024 +0800

    [opt](inverted index) Optimize sequential phrase query logic (#41432)
    
    1. Set enable_phrase_query_sequential_opt = true to optimize conjunction
    matching in sequential (ordered) phrase queries.
    2. For example, match_phrase "赵丽颖 中国 ~20+" ensures that the tokens of
    "赵丽颖" appear consecutively and the tokens of "中国" also appear
    consecutively, while maintaining the semantics of sequential phrase
    queries.
---
 .../inverted_index/analyzer/analyzer.cpp           | 17 +++++
 .../segment_v2/inverted_index/analyzer/analyzer.h  |  6 ++
 .../inverted_index/query/phrase_query.cpp          | 87 +++++++++++++++-------
 .../segment_v2/inverted_index/query/phrase_query.h | 14 +++-
 .../rowset/segment_v2/inverted_index/query/query.h | 10 +++
 .../rowset/segment_v2/inverted_index_reader.cpp    | 23 ++----
 .../inverted_index/query/phrase_query_test.cpp     | 83 +++++++++++++++++++++
 .../java/org/apache/doris/qe/SessionVariable.java  |  9 +++
 gensrc/thrift/PaloInternalService.thrift           |  2 +
 .../test_index_match_phrase_ordered.out            |  6 ++
 .../test_index_match_phrase_ordered.groovy         | 64 +++++++++++++++-
 11 files changed, 273 insertions(+), 48 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index 8ad1abb322f..94ba8fce0bc 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -115,4 +115,21 @@ std::vector<std::string> 
InvertedIndexAnalyzer::get_analyse_result(
     return analyse_result;
 }
 
+std::vector<std::string> InvertedIndexAnalyzer::get_analyse_result(
+        const std::string& search_str, const std::string& field_name,
+        InvertedIndexQueryType query_type, const std::map<std::string, 
std::string>& properties) {
+    InvertedIndexCtxSPtr inverted_index_ctx = 
std::make_shared<InvertedIndexCtx>(
+            get_inverted_index_parser_type_from_string(
+                    get_parser_string_from_properties(properties)),
+            get_parser_mode_string_from_properties(properties),
+            get_parser_char_filter_map_from_properties(properties),
+            get_parser_lowercase_from_properties(properties),
+            get_parser_stopwords_from_properties(properties));
+    auto analyzer = create_analyzer(inverted_index_ctx.get());
+    inverted_index_ctx->analyzer = analyzer.get();
+    auto reader = create_reader(inverted_index_ctx->char_filter_map);
+    reader->init(search_str.data(), search_str.size(), true);
+    return get_analyse_result(reader.get(), analyzer.get(), field_name, 
query_type);
+}
+
 } // namespace doris::segment_v2::inverted_index
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
index ad5d71a5364..6f369d504b2 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h
@@ -32,6 +32,7 @@ class Analyzer;
 } // namespace lucene
 
 namespace doris::segment_v2::inverted_index {
+
 class InvertedIndexAnalyzer {
 public:
     static std::unique_ptr<lucene::util::Reader> create_reader(CharFilterMap& 
char_filter_map);
@@ -44,5 +45,10 @@ public:
                                                        const std::string& 
field_name,
                                                        InvertedIndexQueryType 
query_type,
                                                        bool drop_duplicates = 
true);
+
+    static std::vector<std::string> get_analyse_result(
+            const std::string& search_str, const std::string& field_name,
+            InvertedIndexQueryType query_type,
+            const std::map<std::string, std::string>& properties);
 };
 } // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
index 0ca2dce94e3..9a3ecc68f89 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
@@ -17,8 +17,13 @@
 
 #include "phrase_query.h"
 
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/split.hpp>
 #include <charconv>
 
+#include "CLucene/index/Terms.h"
+#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
+
 namespace doris::segment_v2 {
 
 template <typename Derived>
@@ -141,19 +146,21 @@ void PhraseQuery::add(const InvertedIndexQueryInfo& 
query_info) {
 
     _slop = query_info.slop;
     if (_slop == 0 || query_info.ordered) {
+        if (query_info.ordered) {
+            _additional_terms = query_info.additional_terms;
+        }
         // Logic for no slop query and ordered phrase query
         add(query_info.field_name, query_info.terms);
     } else {
         // Simple slop query follows the default phrase query algorithm
-        auto query = std::make_unique<CL_NS(search)::PhraseQuery>();
+        _phrase_query = std::make_unique<CL_NS(search)::PhraseQuery>();
         for (const auto& term : query_info.terms) {
             std::wstring ws_term = StringUtil::string_to_wstring(term);
             auto* t = _CLNEW 
lucene::index::Term(query_info.field_name.c_str(), ws_term.c_str());
-            query->add(t);
+            _phrase_query->add(t);
             _CLDECDELETE(t);
         }
-        query->setSlop(_slop);
-        _matcher = std::move(query);
+        _phrase_query->setSlop(_slop);
     }
 }
 
@@ -173,13 +180,16 @@ void PhraseQuery::add(const std::wstring& field_name, 
const std::vector<std::str
     }
 
     std::vector<TermIterator> iterators;
-    auto ensureTermPosition = [this, &iterators, &field_name](const 
std::string& term) {
+    auto ensureTermPosition = [this, &iterators, &field_name](const 
std::string& term,
+                                                              bool 
is_save_iter = true) {
         std::wstring ws_term = StringUtil::string_to_wstring(term);
         Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str());
         _terms.push_back(t);
         TermPositions* term_pos = _searcher->getReader()->termPositions(t);
         _term_docs.push_back(term_pos);
-        iterators.emplace_back(term_pos);
+        if (is_save_iter) {
+            iterators.emplace_back(term_pos);
+        }
         return term_pos;
     };
 
@@ -190,16 +200,29 @@ void PhraseQuery::add(const std::wstring& field_name, 
const std::vector<std::str
             auto* term_pos = ensureTermPosition(term);
             matcher._postings.emplace_back(term_pos, i);
         }
-        _matcher = matcher;
+        _matchers.emplace_back(matcher);
     } else {
-        OrderedSloppyPhraseMatcher matcher;
-        for (size_t i = 0; i < terms.size(); i++) {
-            const auto& term = terms[i];
-            auto* term_pos = ensureTermPosition(term);
-            matcher._postings.emplace_back(term_pos, i);
+        {
+            OrderedSloppyPhraseMatcher single_matcher;
+            for (size_t i = 0; i < terms.size(); i++) {
+                const auto& term = terms[i];
+                auto* term_pos = ensureTermPosition(term);
+                single_matcher._postings.emplace_back(term_pos, i);
+            }
+            single_matcher._allowed_slop = _slop;
+            _matchers.emplace_back(single_matcher);
+        }
+        {
+            for (auto& terms : _additional_terms) {
+                ExactPhraseMatcher single_matcher;
+                for (size_t i = 0; i < terms.size(); i++) {
+                    const auto& term = terms[i];
+                    auto* term_pos = ensureTermPosition(term, false);
+                    single_matcher._postings.emplace_back(term_pos, i);
+                }
+                _matchers.emplace_back(std::move(single_matcher));
+            }
         }
-        matcher._allowed_slop = _slop;
-        _matcher = matcher;
     }
 
     std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, 
const TermIterator& b) {
@@ -214,9 +237,9 @@ void PhraseQuery::add(const std::wstring& field_name, const 
std::vector<std::str
 }
 
 void PhraseQuery::search(roaring::Roaring& roaring) {
-    if (std::holds_alternative<PhraseQueryPtr>(_matcher)) {
+    if (_phrase_query) {
         _searcher->_search(
-                std::get<PhraseQueryPtr>(_matcher).get(),
+                _phrase_query.get(),
                 [&roaring](const int32_t docid, const float_t /*score*/) { 
roaring.add(docid); });
     } else {
         if (_lead1.isEmpty()) {
@@ -288,17 +311,9 @@ int32_t PhraseQuery::do_next(int32_t doc) {
 }
 
 bool PhraseQuery::matches(int32_t doc) {
-    return std::visit(
-            [&doc](auto&& m) -> bool {
-                using T = std::decay_t<decltype(m)>;
-                if constexpr (std::is_same_v<T, PhraseQueryPtr>) {
-                    _CLTHROWA(CL_ERR_IllegalArgument,
-                              "PhraseQueryPtr does not support matches 
function");
-                } else {
-                    return m.matches(doc);
-                }
-            },
-            _matcher);
+    return std::ranges::all_of(_matchers, [&doc](auto&& matcher) {
+        return std::visit([&doc](auto&& m) -> bool { return m.matches(doc); }, 
matcher);
+    });
 }
 
 void PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& 
query_info) {
@@ -343,6 +358,24 @@ void PhraseQuery::parser_slop(std::string& query, 
InvertedIndexQueryInfo& query_
     }
 }
 
+void PhraseQuery::parser_info(std::string& query, const std::string& 
field_name,
+                              InvertedIndexQueryType query_type,
+                              const std::map<std::string, std::string>& 
properties,
+                              InvertedIndexQueryInfo& query_info, bool 
sequential_opt) {
+    parser_slop(query, query_info);
+    query_info.terms = 
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+            query, field_name, query_type, properties);
+    if (sequential_opt && query_info.ordered) {
+        std::vector<std::string> t_querys;
+        boost::split(t_querys, query, boost::algorithm::is_any_of(" "));
+        for (auto& t_query : t_querys) {
+            auto terms = 
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+                    t_query, field_name, query_type, properties);
+            query_info.additional_terms.emplace_back(std::move(terms));
+        }
+    }
+}
+
 template class PhraseMatcherBase<ExactPhraseMatcher>;
 template class PhraseMatcherBase<OrderedSloppyPhraseMatcher>;
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
index 253ba782b78..35a479ff7f9 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
@@ -24,6 +24,8 @@
 
 #include <variant>
 
+#include "olap/rowset/segment_v2/inverted_index_query_type.h"
+
 CL_NS_USE(index)
 CL_NS_USE(search)
 
@@ -76,11 +78,11 @@ private:
     int32_t _match_width = -1;
 };
 
-using PhraseQueryPtr = std::unique_ptr<CL_NS(search)::PhraseQuery>;
 // ExactPhraseMatcher: x match_phrase 'aaa bbb'
 // PhraseQueryPtr: x match_phrase 'aaa bbb ~2', support slop
 // OrderedSloppyPhraseMatcher: x match_phrase 'aaa bbb ~2+', ensuring that the 
words appear in the specified order.
-using Matcher = std::variant<ExactPhraseMatcher, OrderedSloppyPhraseMatcher, 
PhraseQueryPtr>;
+using PhraseQueryPtr = std::unique_ptr<CL_NS(search)::PhraseQuery>;
+using Matcher = std::variant<ExactPhraseMatcher, OrderedSloppyPhraseMatcher>;
 
 class PhraseQuery : public Query {
 public:
@@ -103,6 +105,10 @@ private:
 
 public:
     static void parser_slop(std::string& query, InvertedIndexQueryInfo& 
query_info);
+    static void parser_info(std::string& query, const std::string& field_name,
+                            InvertedIndexQueryType query_type,
+                            const std::map<std::string, std::string>& 
properties,
+                            InvertedIndexQueryInfo& query_info, bool 
sequential_opt);
 
 private:
     std::shared_ptr<lucene::search::IndexSearcher> _searcher;
@@ -117,7 +123,9 @@ private:
     std::vector<TermDocs*> _term_docs;
 
     int32_t _slop = 0;
-    Matcher _matcher;
+    std::vector<std::vector<std::string>> _additional_terms;
+    PhraseQueryPtr _phrase_query = nullptr;
+    std::vector<Matcher> _matchers;
 };
 
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h
index cef7fd51f72..c295765ec63 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h
@@ -38,8 +38,18 @@ namespace doris::segment_v2 {
 struct InvertedIndexQueryInfo {
     std::wstring field_name;
     std::vector<std::string> terms;
+    std::vector<std::vector<std::string>> additional_terms;
     int32_t slop = 0;
     bool ordered = false;
+
+    std::string to_string() {
+        std::string s;
+        s += std::to_string(terms.size()) + ", ";
+        s += std::to_string(additional_terms.size()) + ", ";
+        s += std::to_string(slop) + ", ";
+        s += std::to_string(ordered);
+        return s;
+    }
 };
 
 class Query {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 7b8504322d2..b7cfe7dfaff 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -266,24 +266,13 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
             query_info.terms.emplace_back(search_str);
         } else {
             if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
-                PhraseQuery::parser_slop(search_str, query_info);
+                PhraseQuery::parser_info(
+                        search_str, column_name, query_type, 
_index_meta.properties(), query_info,
+                        
runtime_state->query_options().enable_phrase_query_sequential_opt);
+            } else {
+                query_info.terms = 
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+                        search_str, column_name, query_type, 
_index_meta.properties());
             }
-
-            InvertedIndexCtxSPtr inverted_index_ctx = 
std::make_shared<InvertedIndexCtx>(
-                    get_inverted_index_parser_type_from_string(
-                            
get_parser_string_from_properties(_index_meta.properties())),
-                    
get_parser_mode_string_from_properties(_index_meta.properties()),
-                    
get_parser_char_filter_map_from_properties(_index_meta.properties()),
-                    
get_parser_lowercase_from_properties(_index_meta.properties()),
-                    
get_parser_stopwords_from_properties(_index_meta.properties()));
-            auto analyzer = 
inverted_index::InvertedIndexAnalyzer::create_analyzer(
-                    inverted_index_ctx.get());
-            inverted_index_ctx->analyzer = analyzer.get();
-            auto reader = inverted_index::InvertedIndexAnalyzer::create_reader(
-                    inverted_index_ctx->char_filter_map);
-            reader->init(search_str.data(), search_str.size(), true);
-            query_info.terms = 
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
-                    reader.get(), analyzer.get(), column_name, query_type);
         }
         if (query_info.terms.empty()) {
             auto msg = fmt::format(
diff --git 
a/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp 
b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
new file mode 100644
index 00000000000..f3fb9763c9b
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/inverted_index/query/phrase_query_test.cpp
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
+
+#include <gtest/gtest.h>
+
+#include "io/fs/local_file_system.h"
+
+namespace doris::segment_v2 {
+
+class PhraseQueryTest : public testing::Test {
+public:
+    const std::string kTestDir = "./ut_dir/phrase_query_test";
+
+    void SetUp() override {
+        auto st = io::global_local_filesystem()->delete_directory(kTestDir);
+        ASSERT_TRUE(st.ok()) << st;
+        st = io::global_local_filesystem()->create_directory(kTestDir);
+        ASSERT_TRUE(st.ok()) << st;
+    }
+    void TearDown() override {
+        
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok());
+    }
+
+    PhraseQueryTest() = default;
+    ~PhraseQueryTest() override = default;
+};
+
+TEST_F(PhraseQueryTest, test_parser_info) {
+    std::map<std::string, std::string> properties;
+    properties.insert({"parser", "english"});
+    properties.insert({"support_phrase", "true"});
+    properties.insert({"lower_case", "true"});
+
+    auto parser_info = [&properties](std::string& search_str, 
InvertedIndexQueryInfo& query_info,
+                                     bool sequential_opt) {
+        PhraseQuery::parser_info(search_str, "name", 
InvertedIndexQueryType::MATCH_REGEXP_QUERY,
+                                 properties, query_info, sequential_opt);
+    };
+
+    auto parser = [&parser_info](std::string search_str, std::string res1, 
size_t res2,
+                                 int32_t res3, bool res4, size_t res5) {
+        InvertedIndexQueryInfo query_info;
+        parser_info(search_str, query_info, true);
+        EXPECT_EQ(search_str, res1);
+        EXPECT_EQ(query_info.terms.size(), res2);
+        EXPECT_EQ(query_info.slop, res3);
+        EXPECT_EQ(query_info.ordered, res4);
+        EXPECT_EQ(query_info.additional_terms.size(), res5);
+        std::cout << "--- 1 ---: " << query_info.to_string() << std::endl;
+    };
+
+    // "english/history off.gif ~20+" sequential_opt = true
+    parser("", "", 0, 0, false, 0);
+    parser("english", "english", 1, 0, false, 0);
+    parser("english/history", "english/history", 2, 0, false, 0);
+    parser("english/history off", "english/history off", 3, 0, false, 0);
+    parser("english/history off.gif", "english/history off.gif", 4, 0, false, 
0);
+    parser("english/history off.gif ", "english/history off.gif ", 4, 0, 
false, 0);
+    parser("english/history off.gif ~", "english/history off.gif ~", 4, 0, 
false, 0);
+    parser("english/history off.gif ~2", "english/history off.gif", 4, 2, 
false, 0);
+    parser("english/history off.gif ~20", "english/history off.gif", 4, 20, 
false, 0);
+    parser("english/history off.gif ~20+", "english/history off.gif", 4, 20, 
true, 2);
+    parser("english/history off.gif ~20+ ", "english/history off.gif ~20+ ", 
5, 0, false, 0);
+    parser("english/history off.gif ~20+x", "english/history off.gif ~20+x", 
6, 0, false, 0);
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 638ea712ce9..60d1b914c95 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -663,6 +663,8 @@ public class SessionVariable implements Serializable, 
Writable {
     public static final String ADAPTIVE_PIPELINE_TASK_SERIAL_READ_ON_LIMIT =
                                     
"adaptive_pipeline_task_serial_read_on_limit";
 
+    public static final String ENABLE_PHRASE_QUERY_SEQUENYIAL_OPT = 
"enable_phrase_query_sequential_opt";
+
     /**
      * If set false, user couldn't submit analyze SQL and FE won't allocate 
any related resources.
      */
@@ -2169,6 +2171,12 @@ public class SessionVariable implements Serializable, 
Writable {
     })
     public int adaptivePipelineTaskSerialReadOnLimit = 10000;
 
+    @VariableMgr.VarAttr(name = ENABLE_PHRASE_QUERY_SEQUENYIAL_OPT, 
needForward = true, description = {
+        "开启顺序短语查询对连词的优化",
+        "enable optimization for conjunctions in sequential phrase queries"
+    })
+    public boolean enablePhraseQuerySequentialOpt = true;
+
     public void setEnableEsParallelScroll(boolean enableESParallelScroll) {
         this.enableESParallelScroll = enableESParallelScroll;
     }
@@ -3770,6 +3778,7 @@ public class SessionVariable implements Serializable, 
Writable {
         
tResult.setEnableAdaptivePipelineTaskSerialReadOnLimit(enableAdaptivePipelineTaskSerialReadOnLimit);
         
tResult.setAdaptivePipelineTaskSerialReadOnLimit(adaptivePipelineTaskSerialReadOnLimit);
         tResult.setInListValueCountThreshold(inListValueCountThreshold);
+        
tResult.setEnablePhraseQuerySequentialOpt(enablePhraseQuerySequentialOpt);
         return tResult;
     }
 
diff --git a/gensrc/thrift/PaloInternalService.thrift 
b/gensrc/thrift/PaloInternalService.thrift
index 332b6c6e425..b560059819f 100644
--- a/gensrc/thrift/PaloInternalService.thrift
+++ b/gensrc/thrift/PaloInternalService.thrift
@@ -349,6 +349,8 @@ struct TQueryOptions {
 
   135: optional bool enable_parallel_outfile = false;
 
+  136: optional bool enable_phrase_query_sequential_opt = true;
+
   // For cloud, to control if the content would be written into file cache
   // In write path, to control if the content would be written into file cache.
   // In read path, read from file cache or remote storage when execute query.
diff --git 
a/regression-test/data/inverted_index_p0/test_index_match_phrase_ordered.out 
b/regression-test/data/inverted_index_p0/test_index_match_phrase_ordered.out
index d1e04ececd5..8636b6af5ac 100644
--- a/regression-test/data/inverted_index_p0/test_index_match_phrase_ordered.out
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_ordered.out
@@ -65,3 +65,9 @@
 -- !sql --
 7
 
+-- !sql --
+25
+
+-- !sql --
+87
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_ordered.groovy
 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_ordered.groovy
index a65811d4f65..0f563835e86 100644
--- 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_ordered.groovy
+++ 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_ordered.groovy
@@ -17,9 +17,11 @@
 
 
 suite("test_index_match_phrase_ordered", "nonConcurrent"){
-    def indexTbName1 = "test_index_match_phrase_ordered"
+    def indexTbName1 = "test_index_match_phrase_ordered_1"
+    def indexTbName2 = "test_index_match_phrase_ordered_2"
 
     sql "DROP TABLE IF EXISTS ${indexTbName1}"
+    sql "DROP TABLE IF EXISTS ${indexTbName2}"
 
     sql """
       CREATE TABLE ${indexTbName1} (
@@ -35,6 +37,61 @@ suite("test_index_match_phrase_ordered", "nonConcurrent"){
       );
     """
 
+    sql """
+      CREATE TABLE ${indexTbName2} (
+        `@timestamp` int(11) NULL COMMENT "",
+        `clientip` varchar(20) NULL COMMENT "",
+        `request` text NULL COMMENT "",
+        `status` int(11) NULL COMMENT "",
+        `size` int(11) NULL COMMENT "",
+        INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "disable_auto_compaction" = "true"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, 
file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 
'true' ->
+        
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+            
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // if declared a check callback, the default check condition will 
ignore.
+            // So you must check all condition
+            check { result, exception, startTime, endTime ->
+                       if (ignore_failure && expected_succ_rows < 0) { return }
+                    if (exception != null) {
+                        throw exception
+                    }
+                    log.info("Stream load result: ${result}".toString())
+                    def json = parseJson(result)
+                    assertEquals("success", json.Status.toLowerCase())
+                    if (expected_succ_rows >= 0) {
+                        assertEquals(json.NumberLoadedRows, expected_succ_rows)
+                    } else {
+                        assertEquals(json.NumberTotalRows, 
json.NumberLoadedRows + json.NumberUnselectedRows)
+                        assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes 
> 0)
+                }
+            }
+        }
+    }
+
     sql """ INSERT INTO ${indexTbName1} VALUES (1, "the quick brown fox jumped 
over the lazy dog"); """
     sql """ INSERT INTO ${indexTbName1} VALUES (2, "the quick brown fox jumped 
over the lazy dog over"); """
     sql """ INSERT INTO ${indexTbName1} VALUES (3, "the quick brown fox jumped 
over the lazy dog jumped"); """
@@ -48,6 +105,8 @@ suite("test_index_match_phrase_ordered", "nonConcurrent"){
     sql """ INSERT INTO ${indexTbName1} VALUES (11, "quick brown fox jumped 
over the lazy dog quick"); """
 
     try {
+        load_httplogs_data.call(indexTbName2, 
'test_index_match_phrase_ordered_2', 'true', 'json', 'documents-1000.json')
+
         sql "sync"
         sql """ set enable_common_expr_pushdown = true; """
         GetDebugPoint().enableDebugPointForAllBEs("VMatchPredicate.execute")
@@ -83,6 +142,9 @@ suite("test_index_match_phrase_ordered", "nonConcurrent"){
 
         qt_sql """ select count() from ${indexTbName1} where b match_phrase 
'the quick ~6'; """
         qt_sql """ select count() from ${indexTbName1} where b match_phrase 
'the quick ~6+'; """
+
+        qt_sql """ select count() from ${indexTbName2} where request 
match_phrase 'english/history off.gif ~20+'; """
+        qt_sql """ select count() from ${indexTbName2} where request 
match_phrase 'english/images off.gif ~20+'; """
     } finally {
         GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris) branch master updated: [opt](inverted index) Optimize sequential phrase query logic (#41432)

Reply via email to