This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 8056274dd9e [fix] Implementing match_phrase_edge without index query
method (#41658)
8056274dd9e is described below
commit 8056274dd9e531bf476a2a4a260330bb100b83d1
Author: zzzxl <[email protected]>
AuthorDate: Wed Oct 16 14:19:15 2024 +0800
[fix] Implementing match_phrase_edge without index query method (#41658)
1. Supports match_phrase_edge query without creating an inverted index.
---
be/src/vec/functions/match.cpp | 66 +++++++++++++++
be/src/vec/functions/match.h | 5 +-
.../test_index_match_phrase_edge.out | 24 ++++++
.../test_index_match_phrase_edge.groovy | 98 ++++++++++++++++++++++
4 files changed, 189 insertions(+), 4 deletions(-)
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index de46cf008d5..bbdabe3c506 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -506,6 +506,72 @@ Status FunctionMatchRegexp::execute_match(FunctionContext*
context, const std::s
return Status::OK();
}
+// Evaluates match_phrase_edge row by row on tokenized column data, so the
+// query also works on columns that have no inverted index.
+// A row matches when some run of consecutive data tokens lines up with the
+// query tokens:
+//   - one query token: a data token contains it as a substring;
+//   - several query tokens: the first data token ends with the first query
+//     token, the last data token starts with the last query token, and every
+//     token in between is equal.
+// Only matching rows are written (result[i] = true). An empty query token
+// list is not an error: it is logged at debug level and no row is marked.
+Status FunctionMatchPhraseEdge::execute_match(
+        FunctionContext* context, const std::string& column_name,
+        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
+        InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets,
+        ColumnUInt8::Container& result) const {
+    RETURN_IF_ERROR(check(context, name));
+
+    const std::vector<std::string> query_tokens =
+            analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
+    if (query_tokens.empty()) {
+        VLOG_DEBUG << fmt::format(
+                "token parser result is empty for query, "
+                "please check your query: '{}' and index parser: '{}'",
+                match_query_str,
+                inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
+        return Status::OK();
+    }
+
+    // Checks the window of query_tokens.size() data tokens starting at `begin`.
+    const auto window_matches = [&](const auto& data_tokens, size_t begin) {
+        if (query_tokens.size() == 1) {
+            return data_tokens[begin].find(query_tokens[0]) != std::string::npos;
+        }
+        for (size_t k = 0; k < query_tokens.size(); k++) {
+            const std::string& data_token = data_tokens[begin + k];
+            const std::string& query_token = query_tokens[k];
+            const bool token_ok = k == 0 ? data_token.ends_with(query_token)
+                                  : k == query_tokens.size() - 1 ? data_token.starts_with(query_token)
+                                                                 : data_token == query_token;
+            if (!token_ok) {
+                return false;
+            }
+        }
+        return true;
+    };
+
+    int32_t current_src_array_offset = 0;
+    for (size_t i = 0; i < input_rows_count; i++) {
+        const auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
+                                                    array_offsets, current_src_array_offset);
+        // Sizes are unsigned: compare before subtracting so a short row cannot wrap around.
+        if (data_tokens.size() < query_tokens.size()) {
+            continue;
+        }
+        const size_t window_count = data_tokens.size() - query_tokens.size() + 1;
+        for (size_t begin = 0; begin < window_count; begin++) {
+            if (window_matches(data_tokens, begin)) {
+                result[i] = true;
+                break;
+            }
+        }
+    }
+
+    return Status::OK();
+}
+
void register_function_match(SimpleFunctionFactory& factory) {
factory.register_function<FunctionMatchAny>();
factory.register_function<FunctionMatchAll>();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index a4cea93852a..477ab0a3409 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -180,10 +180,7 @@ public:
const std::string& match_query_str, size_t
input_rows_count,
const ColumnString* string_col, InvertedIndexCtx*
inverted_index_ctx,
const ColumnArray::Offsets64* array_offsets,
- ColumnUInt8::Container& result) const override {
- return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
- "FunctionMatchPhraseEdge not support execute_match");
- }
+ ColumnUInt8::Container& result) const override;
};
} // namespace doris::vectorized
diff --git
a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
index 8accc202576..71714c41b3b 100644
--- a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
@@ -41,3 +41,27 @@
-- !sql --
6
+-- !sql --
+0
+
+-- !sql --
+874
+
+-- !sql --
+150
+
+-- !sql --
+20
+
+-- !sql --
+0
+
+-- !sql --
+874
+
+-- !sql --
+150
+
+-- !sql --
+20
+
diff --git
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
index b7fe5664556..147291eb77b 100644
---
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
+++
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
@@ -79,4 +79,102 @@ suite("test_index_match_phrase_edge", "nonConcurrent"){
} finally {
GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
}
+
+ def indexTbName2 = "test_index_match_phrase_edge2"
+ def indexTbName3 = "test_index_match_phrase_edge3"
+ // Start from a clean state: remove tables with these names if they already exist.
+ sql "DROP TABLE IF EXISTS ${indexTbName2}"
+ sql "DROP TABLE IF EXISTS ${indexTbName3}"
+ // indexTbName2 declares an inverted index (english parser, support_phrase) on `request`.
+ sql """
+ CREATE TABLE ${indexTbName2} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT "",
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+ // indexTbName3 has the same columns and properties but declares no index.
+ sql """
+ CREATE TABLE ${indexTbName3} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` varchar(20) NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` int(11) NULL COMMENT "",
+ `size` int(11) NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ def load_httplogs_data = {table_name, label, read_flag, format_flag,
file_name, ignore_failure=false,
+ expected_succ_rows = -1, load_to_single_tablet =
'true' ->
+ // Stream-loads file_name into table_name and asserts on the reported load result.
+ // NOTE(review): load_to_single_tablet is accepted but never used in this closure.
+ streamLoad {
+ table "${table_name}"
+
+ // HTTP request header params; the label gets a random UUID suffix on every call
+ set 'label', label + "_" + UUID.randomUUID().toString()
+ set 'read_json_by_line', read_flag
+ set 'format', format_flag
+ file file_name // import json file
+ time 10000 // limit inflight 10s
+ if (expected_succ_rows >= 0) {
+ set 'max_filter_ratio', '1'
+ }
+
+ // Declaring a check callback disables the default result checks,
+ // so every condition has to be verified here.
+ check { result, exception, startTime, endTime ->
+ if (ignore_failure && expected_succ_rows < 0) { return } // failures tolerated and no row count to verify
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ if (expected_succ_rows >= 0) {
+ assertEquals(json.NumberLoadedRows, expected_succ_rows)
+ } else {
+ assertEquals(json.NumberTotalRows,
json.NumberLoadedRows + json.NumberUnselectedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes
> 0)
+ }
+ }
+ }
+ }
+
+ try {
+ load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json',
'documents-1000.json')
+ load_httplogs_data.call(indexTbName3, indexTbName3, 'true', 'json',
'documents-1000.json')
+ // Both tables are loaded from the same data file, so their query results should agree.
+ sql "sync"
+ sql """ set enable_common_expr_pushdown = true; """
+ // Indexed table: queries run with the VMatchPredicate.execute debug point enabled (NOTE(review): its effect is defined in the BE).
+ GetDebugPoint().enableDebugPointForAllBEs("VMatchPredicate.execute")
+ qt_sql """ select count() from ${indexTbName2} where request
match_phrase_edge ''; """
+ qt_sql """ select count() from ${indexTbName2} where request
match_phrase_edge 'age'; """
+ qt_sql """ select count() from ${indexTbName2} where request
match_phrase_edge 'es/na'; """
+ qt_sql """ select count() from ${indexTbName2} where request
match_phrase_edge 'ets/images/ti'; """
+ GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
+ // Table without an index: the same four queries, with the debug point disabled.
+ qt_sql """ select count() from ${indexTbName3} where request
match_phrase_edge ''; """
+ qt_sql """ select count() from ${indexTbName3} where request
match_phrase_edge 'age'; """
+ qt_sql """ select count() from ${indexTbName3} where request
match_phrase_edge 'es/na'; """
+ qt_sql """ select count() from ${indexTbName3} where request
match_phrase_edge 'ets/images/ti'; """
+ } finally {
+ GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute") // runs even if a query above failed
+ }
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]