This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new b013c669da3 [fix](inverted index) implementation of match function
without index (#36916)
b013c669da3 is described below
commit b013c669da3ae48d1bbd5c246a006628a92b482b
Author: zzzxl <[email protected]>
AuthorDate: Fri Jul 12 10:17:48 2024 +0800
[fix](inverted index) implementation of match function without index
(#36916)
## Proposed changes
pick from #36471
---
be/src/vec/functions/match.cpp | 150 +++++++++++++++++++++++++++++++++++++++++
be/src/vec/functions/match.h | 10 +--
2 files changed, 152 insertions(+), 8 deletions(-)
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index 15ec3432940..d5ced67a75f 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -17,6 +17,8 @@
#include "vec/functions/match.h"
+#include <hs/hs.h>
+
#include "runtime/query_context.h"
#include "runtime/runtime_state.h"
#include "util/debug_points.h"
@@ -326,6 +328,154 @@ Status FunctionMatchPhrase::execute_match(const
std::string& column_name,
return Status::OK();
}
+// Evaluates MATCH_PHRASE_PREFIX row by row, without consulting an inverted index.
+//
+// The query string is analysed into tokens; every row is analysed into tokens as well
+// (via analyse_data_token). A row matches when it contains a run of consecutive tokens
+// in which every query token except the last is equal to the corresponding row token,
+// and the row token aligned with the last query token starts with that query token.
+//
+// Parameters:
+//   column_name        - forwarded to the query analyser and to analyse_data_token.
+//   match_query_str    - raw query text handed to the analyser.
+//   input_rows_count   - number of rows to evaluate.
+//   string_col         - column holding the row values, forwarded to analyse_data_token.
+//   inverted_index_ctx - supplies the parser type and the analyser.
+//   array_offsets      - forwarded to analyse_data_token together with a running offset.
+//   result             - result[i] is set to true for each matching row; entries of
+//                        non-matching rows are left untouched.
+//
+// Returns Status::OK() on completion (also when the query yields no tokens), or
+// INVERTED_INDEX_NOT_SUPPORTED when the debug point
+// "match.invert_index_not_support_execute_match" is active.
+Status FunctionMatchPhrasePrefix::execute_match(
+        const std::string& column_name, const std::string& match_query_str, size_t input_rows_count,
+        const ColumnString* string_col, InvertedIndexCtx* inverted_index_ctx,
+        const ColumnArray::Offsets64* array_offsets, ColumnUInt8::Container& result) {
+    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "FunctionMatchPhrasePrefix not support execute_match");
+    })
+
+    doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
+    if (inverted_index_ctx) {
+        parser_type = inverted_index_ctx->parser_type;
+    }
+    VLOG_DEBUG << "begin to run FunctionMatchPhrasePrefix::execute_match, parser_type: "
+               << inverted_index_parser_type_to_string(parser_type);
+
+    // NOTE(review): inverted_index_ctx is treated as nullable above but is dereferenced
+    // unconditionally below (->analyzer). Confirm whether callers can pass nullptr here.
+    auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
+                                                                        match_query_str);
+    std::vector<std::string> query_tokens;
+    doris::segment_v2::InvertedIndexReader::get_analyse_result(
+            query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
+            doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY);
+
+    if (query_tokens.empty()) {
+        VLOG_DEBUG << fmt::format(
+                "token parser result is empty for query, "
+                "please check your query: '{}' and index parser: '{}'",
+                match_query_str, inverted_index_parser_type_to_string(parser_type));
+        return Status::OK();
+    }
+
+    const size_t last_query_pos = query_tokens.size() - 1;
+
+    int32_t current_src_array_offset = 0;
+    for (size_t i = 0; i < input_rows_count; i++) {
+        // Always analyse the row, even if it ends up skipped: the call also receives the
+        // running array offset.
+        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
+                                              array_offsets, current_src_array_offset);
+
+        // A row with fewer tokens than the query cannot contain the phrase. Skipping it
+        // also prevents the unsigned subtraction below from wrapping around, which would
+        // send the window loop far past the end of data_tokens.
+        if (data_tokens.size() < query_tokens.size()) {
+            continue;
+        }
+        const size_t window_count = data_tokens.size() - query_tokens.size() + 1;
+
+        for (size_t j = 0; j < window_count; j++) {
+            // Cheap pre-filter: with several query tokens the first one must match exactly;
+            // with a single query token every position is a candidate for the prefix test.
+            if (data_tokens[j] == query_tokens[0] || query_tokens.size() == 1) {
+                bool match = true;
+                for (size_t k = 0; k < query_tokens.size(); k++) {
+                    const std::string& data_token = data_tokens[j + k];
+                    const std::string& query_token = query_tokens[k];
+                    if (k == last_query_pos) {
+                        // Last query token: the row token only has to start with it.
+                        if (data_token.compare(0, query_token.size(), query_token) != 0) {
+                            match = false;
+                            break;
+                        }
+                    } else {
+                        if (data_token != query_token) {
+                            match = false;
+                            break;
+                        }
+                    }
+                }
+                if (match) {
+                    result[i] = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    return Status::OK();
+}
+
+
+// Evaluates MATCH_REGEXP row by row, without consulting an inverted index.
+//
+// The query string is compiled as a Hyperscan pattern in block mode. Every row is
+// analysed into tokens (via analyse_data_token) and each token is scanned against the
+// pattern; the row matches as soon as one token reports a match.
+//
+// Parameters:
+//   column_name        - forwarded to analyse_data_token.
+//   match_query_str    - the regular expression; an empty string matches nothing.
+//   input_rows_count   - number of rows to evaluate.
+//   string_col         - column holding the row values, forwarded to analyse_data_token.
+//   inverted_index_ctx - supplies the parser type (may be null for that purpose) and is
+//                        forwarded to analyse_data_token.
+//   array_offsets      - forwarded to analyse_data_token together with a running offset.
+//   result             - result[i] is set to true for each matching row; entries of
+//                        non-matching rows are left untouched.
+//
+// Returns Status::OK() on completion, INVERTED_INDEX_INVALID_PARAMETERS when the pattern
+// does not compile or scratch space cannot be allocated, or INVERTED_INDEX_NOT_SUPPORTED
+// when the debug point "match.invert_index_not_support_execute_match" is active.
+Status FunctionMatchRegexp::execute_match(const std::string& column_name,
+                                          const std::string& match_query_str,
+                                          size_t input_rows_count, const ColumnString* string_col,
+                                          InvertedIndexCtx* inverted_index_ctx,
+                                          const ColumnArray::Offsets64* array_offsets,
+                                          ColumnUInt8::Container& result) {
+    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "FunctionMatchRegexp not support execute_match");
+    })
+
+    doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
+    if (inverted_index_ctx) {
+        parser_type = inverted_index_ctx->parser_type;
+    }
+    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
+               << inverted_index_parser_type_to_string(parser_type);
+
+    if (match_query_str.empty()) {
+        VLOG_DEBUG << fmt::format(
+                "token parser result is empty for query, "
+                "please check your query: '{}' and index parser: '{}'",
+                match_query_str, inverted_index_parser_type_to_string(parser_type));
+        return Status::OK();
+    }
+
+    const std::string& pattern = match_query_str;
+
+    hs_database_t* database = nullptr;
+    hs_compile_error_t* compile_err = nullptr;
+    hs_scratch_t* scratch = nullptr;
+
+    if (hs_compile(pattern.c_str(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
+                   HS_MODE_BLOCK, nullptr, &database, &compile_err) != HS_SUCCESS) {
+        // Build the status text BEFORE releasing compile_err: its message must not be
+        // read once hs_free_compile_error() has been called.
+        const std::string error_message =
+                std::string("hyperscan compilation failed:") + compile_err->message;
+        LOG(ERROR) << "hyperscan compilation failed: " << compile_err->message;
+        hs_free_compile_error(compile_err);
+        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(error_message);
+    }
+
+    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan could not allocate scratch space.";
+        hs_free_database(database);
+        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+                "hyperscan could not allocate scratch space.");
+    }
+
+    // Match callback: records that a match occurred in the bool passed as context and
+    // returns 0 so that hs_scan keeps reporting HS_SUCCESS.
+    auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
+                       unsigned int flags, void* context) -> int {
+        *static_cast<bool*>(context) = true;
+        return 0;
+    };
+
+    try {
+        int32_t current_src_array_offset = 0;
+        for (size_t i = 0; i < input_rows_count; i++) {
+            std::vector<std::string> data_tokens =
+                    analyse_data_token(column_name, inverted_index_ctx, string_col, i,
+                                       array_offsets, current_src_array_offset);
+
+            for (const auto& input : data_tokens) {
+                bool is_match = false;
+                if (hs_scan(database, input.data(), input.size(), 0, scratch, on_match,
+                            static_cast<void*>(&is_match)) != HS_SUCCESS) {
+                    // Best effort: give up on this row but keep processing the others.
+                    LOG(ERROR) << "hyperscan match failed: " << input;
+                    break;
+                }
+
+                if (is_match) {
+                    result[i] = true;
+                    break;
+                }
+            }
+        }
+    }
+    // Release the Hyperscan resources on both the normal and the exceptional path.
+    _CLFINALLY({
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    })
+
+    return Status::OK();
+}
+
+
void register_function_match(SimpleFunctionFactory& factory) {
factory.register_function<FunctionMatchAny>();
factory.register_function<FunctionMatchAll>();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index db8cca17ec0..d2db27813cf 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -139,10 +139,7 @@ public:
size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
const ColumnArray::Offsets64* array_offsets,
- ColumnUInt8::Container& result) override {
- return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
- "FunctionMatchPhrasePrefix not support execute_match");
- }
+ ColumnUInt8::Container& result) override;
};
class FunctionMatchRegexp : public FunctionMatchBase {
@@ -156,10 +153,7 @@ public:
size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
const ColumnArray::Offsets64* array_offsets,
- ColumnUInt8::Container& result) override {
- return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
- "FunctionMatchRegexp not support execute_match");
- }
+ ColumnUInt8::Container& result) override;
};
} // namespace doris::vectorized
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]