This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new bc7278e48ce [fix](inverted index) fix match_phrase_edge query result
error (#38327)
bc7278e48ce is described below
commit bc7278e48ce77e3a8394f71e2d916a1845768d18
Author: zzzxl <[email protected]>
AuthorDate: Mon Aug 5 11:16:02 2024 +0800
[fix](inverted index) fix match_phrase_edge query result error (#38327)
1. The result of a match_phrase_edge query for a single word is incorrect.
---
.../inverted_index/query/phrase_edge_query.cpp | 26 +++++++++++++---------
.../inverted_index/query/phrase_edge_query.h | 1 +
.../test_index_match_phrase_edge.out | 12 ++++++++++
.../test_index_match_phrase_edge.groovy | 11 +++++++++
4 files changed, 40 insertions(+), 10 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
index 428dc05e6f6..ec1b5bdd9e4 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
@@ -31,7 +31,9 @@ namespace doris::segment_v2 {
PhraseEdgeQuery::PhraseEdgeQuery(const
std::shared_ptr<lucene::search::IndexSearcher>& searcher,
const TQueryOptions& query_options)
- : _searcher(searcher),
_query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()) {}
+ : _searcher(searcher),
+ _query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()),
+ _max_expansions(query_options.inverted_index_max_expansions) {}
void PhraseEdgeQuery::add(const std::wstring& field_name, const
std::vector<std::string>& terms) {
if (terms.empty()) {
@@ -50,9 +52,9 @@ void PhraseEdgeQuery::search(roaring::Roaring& roaring) {
}
void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) {
- size_t count = 0;
+ bool first = true;
std::wstring sub_term = StringUtil::string_to_wstring(_terms[0]);
- find_words([this, &count, &sub_term, &roaring](Term* term) {
+ find_words([this, &first, &sub_term, &roaring](Term* term) {
std::wstring_view ws_term(term->text(), term->textLength());
if (ws_term.find(sub_term) == std::wstring::npos) {
return;
@@ -70,12 +72,12 @@ void PhraseEdgeQuery::search_one_term(roaring::Roaring&
roaring) {
}
_CLDELETE(term_doc);
- if (count) {
+ if (!first) {
roaring.swap(result);
+ first = false;
} else {
roaring |= result;
}
- count++;
});
}
@@ -86,15 +88,19 @@ void PhraseEdgeQuery::search_multi_term(roaring::Roaring&
roaring) {
std::vector<CL_NS(index)::Term*> suffix_terms;
std::vector<CL_NS(index)::Term*> prefix_terms;
- find_words([&suffix_term, &suffix_terms, &prefix_term,
&prefix_terms](Term* term) {
+ find_words([this, &suffix_term, &suffix_terms, &prefix_term,
&prefix_terms](Term* term) {
std::wstring_view ws_term(term->text(), term->textLength());
- if (ws_term.ends_with(suffix_term)) {
- suffix_terms.push_back(_CL_POINTER(term));
+ if (_max_expansions == 0 || suffix_terms.size() < _max_expansions) {
+ if (ws_term.ends_with(suffix_term)) {
+ suffix_terms.push_back(_CL_POINTER(term));
+ }
}
- if (ws_term.starts_with(prefix_term)) {
- prefix_terms.push_back(_CL_POINTER(term));
+ if (_max_expansions == 0 || prefix_terms.size() < _max_expansions) {
+ if (ws_term.starts_with(prefix_term)) {
+ prefix_terms.push_back(_CL_POINTER(term));
+ }
}
});
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
index 823f46285b1..5daf382e0d0 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
@@ -52,6 +52,7 @@ private:
std::wstring _field_name;
std::vector<std::string> _terms;
std::unique_ptr<CL_NS(search)::MultiPhraseQuery> _query;
+ int32_t _max_expansions = 50;
};
} // namespace doris::segment_v2
\ No newline at end of file
diff --git
a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
index e650f9b39b2..8accc202576 100644
--- a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
@@ -29,3 +29,15 @@
-- !sql --
10 nav_tickets_off.gif 习惯于生活中很多 nav tickets off gif 虚假 nav tickets off
gif 美化的人来说
+-- !sql --
+2
+
+-- !sql --
+4
+
+-- !sql --
+11
+
+-- !sql --
+6
+
diff --git
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
index e05f6bb1ec9..8d4ab3d2320 100644
---
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
+++
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
@@ -48,6 +48,12 @@ suite("test_index_match_phrase_edge", "p0"){
sql """ INSERT INTO ${indexTbName1} VALUES (9, "hm_bg.jpg", "前几日 hm bg jpg
在别处 hm bg jpg 购得"); """
sql """ INSERT INTO ${indexTbName1} VALUES (10, "nav_tickets_off.gif",
"习惯于生活中很多 nav tickets off gif 虚假 nav tickets off gif 美化的人来说"); """
+ sql """ INSERT INTO ${indexTbName1} VALUES (11, "40.135.0.0", "GET
/images/hm_bg.jpg HTTP/1.0"); """
+ sql """ INSERT INTO ${indexTbName1} VALUES (12, "232.0.0.0", "GET
/images/hm_bg.jpg HTTP/1.0"); """
+ sql """ INSERT INTO ${indexTbName1} VALUES (13, "26.1.0.0", "GET
/images/hm_bg.jpg HTTP/1.0"); """
+ sql """ INSERT INTO ${indexTbName1} VALUES (14, "247.37.0.0", "GET
/french/splash_inet.html HTTP/1.0"); """
+ sql """ INSERT INTO ${indexTbName1} VALUES (15, "247.37.0.0", "GET
/images/hm_nbg.jpg HTTP/1.0"); """
+
try {
sql "sync"
@@ -63,6 +69,11 @@ suite("test_index_match_phrase_edge", "p0"){
qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ue
off gif 家长 na'; """
qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'if
虚假 na'; """
+ qt_sql """ select count() from ${indexTbName1} where b
match_phrase_edge '1'; """
+ qt_sql """ select count() from ${indexTbName1} where b
match_phrase_edge '3'; """
+ qt_sql """ select count() from ${indexTbName1} where c
match_phrase_edge 'n'; """
+ qt_sql """ select count() from ${indexTbName1} where c
match_phrase_edge 'b'; """
+
} finally {
//try_sql("DROP TABLE IF EXISTS ${testTable}")
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]