xiaokang commented on code in PR #36356:
URL: https://github.com/apache/doris/pull/36356#discussion_r1667911598
##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -323,7 +323,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics*
stats, RuntimeState* run
query_info.terms.emplace_back(search_str);
} else {
if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
- RETURN_IF_ERROR(PhraseQuery::parser_slop(search_str,
query_info));
+ PhraseQuery::parser_slop(search_str, query_info);
Review Comment:
Why not return status now?
##########
be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h:
##########
@@ -22,11 +22,62 @@
#include "CLucene/search/PhraseQuery.h"
// clang-format on
+#include <variant>
+
CL_NS_USE(index)
CL_NS_USE(search)
namespace doris::segment_v2 {
+class PostingsAndPosition {
Review Comment:
delete old private class PostingsAndPosition
##########
be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp:
##########
@@ -73,14 +172,33 @@ void PhraseQuery::add(const std::wstring& field_name,
const std::vector<std::str
}
Review Comment:
Is there no _matcher when terms.size() == 1?
##########
be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h:
##########
@@ -22,11 +22,62 @@
#include "CLucene/search/PhraseQuery.h"
// clang-format on
+#include <variant>
+
CL_NS_USE(index)
CL_NS_USE(search)
namespace doris::segment_v2 {
+class PostingsAndPosition {
+public:
+ PostingsAndPosition(const TermPositionIterator& postings, int32_t offset)
+ : _postings(postings), _offset(offset) {}
+
+ TermPositionIterator _postings;
+ int32_t _offset = 0;
+ int32_t _freq = 0;
+ int32_t _upTo = 0;
+ int32_t _pos = 0;
+};
+
+template <typename Derived>
+class PhraseMatcherBase {
+public:
+ bool matches(int32_t doc);
Review Comment:
Add comments for all interfaces.
##########
be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h:
##########
@@ -54,12 +105,10 @@ class PhraseQuery : public Query {
void search_by_skiplist(roaring::Roaring& roaring);
int32_t do_next(int32_t doc);
Review Comment:
Add comments describing the semantics of these interfaces.
##########
be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h:
##########
@@ -22,11 +22,62 @@
#include "CLucene/search/PhraseQuery.h"
// clang-format on
+#include <variant>
+
CL_NS_USE(index)
CL_NS_USE(search)
namespace doris::segment_v2 {
+class PostingsAndPosition {
+public:
+ PostingsAndPosition(const TermPositionIterator& postings, int32_t offset)
+ : _postings(postings), _offset(offset) {}
+
+ TermPositionIterator _postings;
+ int32_t _offset = 0;
+ int32_t _freq = 0;
+ int32_t _upTo = 0;
+ int32_t _pos = 0;
+};
+
+template <typename Derived>
+class PhraseMatcherBase {
+public:
+ bool matches(int32_t doc);
+
+private:
+ void reset(int32_t doc);
+
+protected:
+ bool advance_position(PostingsAndPosition& posting, int32_t target);
+
+public:
+ std::vector<PostingsAndPosition> _postings;
+};
+
+class ExactPhraseMatcher : public PhraseMatcherBase<ExactPhraseMatcher> {
+public:
+ bool next_match();
+};
+
+class OrderedSloppyPhraseMatcher : public
PhraseMatcherBase<OrderedSloppyPhraseMatcher> {
+public:
+ bool next_match();
+
+private:
+ bool stretchToOrder(PostingsAndPosition* prev_posting);
+
+public:
+ int32_t _allowed_slop = 0;
+
+private:
+ int32_t _match_width = -1;
+};
+
+using PhraseQueryPtr = std::unique_ptr<CL_NS(search)::PhraseQuery>;
+using Matcher = std::variant<ExactPhraseMatcher, OrderedSloppyPhraseMatcher,
PhraseQueryPtr>;
Review Comment:
Add a comment to explain the usage of the different matchers.
##########
be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp:
##########
@@ -44,16 +140,19 @@ void PhraseQuery::add(const InvertedIndexQueryInfo&
query_info) {
}
_slop = query_info.slop;
- if (_slop <= 0) {
+
+ if (_slop == 0 || query_info.ordered) {
Review Comment:
Add comments to explain the different branches.
##########
be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp:
##########
@@ -240,17 +311,38 @@ Status PhraseQuery::parser_slop(std::string& query,
InvertedIndexQueryInfo& quer
if (tilde_pos < query.size() - 1 && query[tilde_pos] == '~') {
size_t slop_pos = tilde_pos + 1;
std::string_view slop_str(query.data() + slop_pos, query.size() -
slop_pos);
- if (is_digits(slop_str)) {
- auto result = std::from_chars(slop_str.begin(),
slop_str.end(), query_info.slop);
- if (result.ec != std::errc()) {
- return
Status::Error<doris::ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
- "PhraseQuery parser failed: {}", query);
+ do {
+ if (slop_str.empty()) {
+ break;
}
- query = query.substr(0, last_space_pos);
- }
+
+ bool ordered = false;
+ if (slop_str.size() == 1) {
+ if (!std::isdigit(slop_str[0])) {
+ break;
+ }
+ } else {
+ if (slop_str.back() == '+') {
+ ordered = true;
+ slop_str.remove_suffix(1);
+ }
+ }
+
+ if (is_digits(slop_str)) {
+ auto result =
+ std::from_chars(slop_str.begin(), slop_str.end(),
query_info.slop);
+ if (result.ec != std::errc()) {
+ break;
Review Comment:
Why not return Status::Error now?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]