This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0c8fd93da3c [fix](inverted index) fix multi match result error (#38931)
0c8fd93da3c is described below
commit 0c8fd93da3cd3e8d077a99b8a1517fdb1bb16f50
Author: zzzxl <[email protected]>
AuthorDate: Wed Aug 7 21:49:04 2024 +0800
[fix](inverted index) fix multi match result error (#38931)
1. multi_match result merging logic is incorrect
---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 14 +--
be/src/olap/rowset/segment_v2/segment_iterator.h | 2 +-
be/src/vec/exprs/vectorized_fn_call.cpp | 5 +-
be/src/vec/exprs/vectorized_fn_call.h | 3 +-
be/src/vec/exprs/vexpr.h | 3 +-
be/src/vec/functions/function.h | 10 +-
be/src/vec/functions/function_multi_match.cpp | 33 +++---
be/src/vec/functions/function_multi_match.h | 4 +-
.../inverted_index_p0/test_index_multi_match.out | 25 +++++
.../test_index_multi_match.groovy | 124 +++++++++++++++++++++
10 files changed, 187 insertions(+), 36 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 0473ff128fc..db27bc45405 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1476,13 +1476,12 @@ Status
SegmentIterator::_init_inverted_index_iterators() {
}
Status SegmentIterator::_init_inverted_index_iterators(ColumnId cid) {
+ std::lock_guard lock(_idx_init_lock);
if (_inverted_index_iterators[cid] == nullptr) {
- return _init_single_inverted_index_iterator.call([&] {
- return _segment->new_inverted_index_iterator(
- _opts.tablet_schema->column(cid),
-
_segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
- _opts, &_inverted_index_iterators[cid]);
- });
+ return _segment->new_inverted_index_iterator(
+ _opts.tablet_schema->column(cid),
+
_segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)),
+ _opts, &_inverted_index_iterators[cid]);
}
return Status::OK();
}
@@ -3066,9 +3065,8 @@ Status SegmentIterator::execute_func_expr(const
vectorized::VExprSPtr& expr,
params._unique_id = _schema->unique_id(slot_expr->column_id());
params._column_name =
_opts.tablet_schema->column(params._column_id).name();
params._segment_iterator = this;
- params.result = result;
- return expr->eval_inverted_index(expr_ctx.get(), params);
+ return expr->eval_inverted_index(expr_ctx.get(), params, result);
}
} // namespace segment_v2
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index f163376d95f..8056036bc98 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -532,7 +532,7 @@ private:
std::unordered_map<int, std::unordered_map<std::string, bool>>
_column_predicate_inverted_index_status;
- DorisCallOnce<Status> _init_single_inverted_index_iterator;
+ std::mutex _idx_init_lock;
};
} // namespace segment_v2
diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp
b/be/src/vec/exprs/vectorized_fn_call.cpp
index 7be01469db7..591d72d6f26 100644
--- a/be/src/vec/exprs/vectorized_fn_call.cpp
+++ b/be/src/vec/exprs/vectorized_fn_call.cpp
@@ -263,8 +263,9 @@ bool VectorizedFnCall::can_fast_execute() const {
}
Status VectorizedFnCall::eval_inverted_index(VExprContext* context,
- segment_v2::FuncExprParams&
params) {
- return
_function->eval_inverted_index(context->fn_context(_fn_context_index), params);
+ segment_v2::FuncExprParams&
params,
+
std::shared_ptr<roaring::Roaring>& result) {
+ return
_function->eval_inverted_index(context->fn_context(_fn_context_index), params,
result);
}
bool VectorizedFnCall::equals(const VExpr& other) {
diff --git a/be/src/vec/exprs/vectorized_fn_call.h
b/be/src/vec/exprs/vectorized_fn_call.h
index 0fa41b88522..02d843b1795 100644
--- a/be/src/vec/exprs/vectorized_fn_call.h
+++ b/be/src/vec/exprs/vectorized_fn_call.h
@@ -75,7 +75,8 @@ public:
bool can_push_down_to_index() const override;
bool can_fast_execute() const override;
- Status eval_inverted_index(VExprContext* context,
segment_v2::FuncExprParams& params) override;
+ Status eval_inverted_index(VExprContext* context,
segment_v2::FuncExprParams& params,
+ std::shared_ptr<roaring::Roaring>& result)
override;
bool equals(const VExpr& other) override;
protected:
diff --git a/be/src/vec/exprs/vexpr.h b/be/src/vec/exprs/vexpr.h
index ee2a221866a..4b13f635f78 100644
--- a/be/src/vec/exprs/vexpr.h
+++ b/be/src/vec/exprs/vexpr.h
@@ -241,7 +241,8 @@ public:
virtual bool can_push_down_to_index() const { return false; }
virtual bool can_fast_execute() const { return false; }
- virtual Status eval_inverted_index(VExprContext* context,
segment_v2::FuncExprParams& params) {
+ virtual Status eval_inverted_index(VExprContext* context,
segment_v2::FuncExprParams& params,
+ std::shared_ptr<roaring::Roaring>&
result) {
return Status::NotSupported("Not supported
execute_with_inverted_index");
}
virtual bool equals(const VExpr& other);
diff --git a/be/src/vec/functions/function.h b/be/src/vec/functions/function.h
index f4243875d64..149b1f6e9d5 100644
--- a/be/src/vec/functions/function.h
+++ b/be/src/vec/functions/function.h
@@ -235,8 +235,8 @@ public:
}
virtual bool can_push_down_to_index() const { return false; }
- virtual Status eval_inverted_index(FunctionContext* context,
- segment_v2::FuncExprParams& params) {
+ virtual Status eval_inverted_index(FunctionContext* context,
segment_v2::FuncExprParams& params,
+ std::shared_ptr<roaring::Roaring>&
result) {
return Status::NotSupported("eval_inverted_index is not supported in
function: ",
get_name());
}
@@ -543,9 +543,9 @@ public:
}
bool can_push_down_to_index() const override { return
function->can_push_down_to_index(); }
- Status eval_inverted_index(FunctionContext* context,
- segment_v2::FuncExprParams& params) override {
- return function->eval_inverted_index(context, params);
+ Status eval_inverted_index(FunctionContext* context,
segment_v2::FuncExprParams& params,
+ std::shared_ptr<roaring::Roaring>& result)
override {
+ return function->eval_inverted_index(context, params, result);
}
private:
diff --git a/be/src/vec/functions/function_multi_match.cpp
b/be/src/vec/functions/function_multi_match.cpp
index d4ff5c10dca..ba7fa887f19 100644
--- a/be/src/vec/functions/function_multi_match.cpp
+++ b/be/src/vec/functions/function_multi_match.cpp
@@ -75,7 +75,11 @@ Status FunctionMultiMatch::open(FunctionContext* context,
field_names_str.end());
std::vector<std::string> field_names;
boost::split(field_names, field_names_str,
boost::algorithm::is_any_of(","));
- state->fields.insert(field_names.begin(), field_names.end());
+ for (const auto& field_name : field_names) {
+ if (!field_name.empty()) {
+ state->fields.insert(field_name);
+ }
+ }
} break;
case 2:
state->type = const_data.to_string();
@@ -93,7 +97,8 @@ Status FunctionMultiMatch::open(FunctionContext* context,
}
Status FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
- segment_v2::FuncExprParams&
params) {
+ segment_v2::FuncExprParams&
params,
+
std::shared_ptr<roaring::Roaring>& result) {
auto* match_param = reinterpret_cast<MatchParam*>(
context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
if (match_param == nullptr) {
@@ -106,7 +111,6 @@ Status
FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
const auto& tablet_schema = opts.tablet_schema;
std::vector<ColumnId> columns_ids;
-
for (const auto& column_name : match_param->fields) {
auto cid = tablet_schema->field_index(column_name);
if (cid < 0) {
@@ -148,14 +152,14 @@ Status
FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
auto* cache = InvertedIndexQueryCache::instance();
InvertedIndexQueryCacheHandle cache_handler;
if (cache->lookup(cache_key, &cache_handler)) {
- params.result = cache_handler.get_bitmap();
+ result = cache_handler.get_bitmap();
return Status::OK();
}
// search
- bool first = true;
for (const auto& column_name : match_param->fields) {
auto cid = tablet_schema->field_index(column_name);
+ const auto& column = *DORIS_TRY(tablet_schema->column(column_name));
auto& index_iterator =
segment_iterator->inverted_index_iterators()[cid];
if (!index_iterator) {
@@ -163,19 +167,16 @@ Status
FunctionMultiMatch::eval_inverted_index(FunctionContext* context,
}
const auto& index_reader = index_iterator->reader();
- auto result = std::make_shared<roaring::Roaring>();
- RETURN_IF_ERROR(index_reader->query(opts.stats, opts.runtime_state,
column_name,
- match_param->query.data(),
query_type, result));
- if (first) {
- (*params.result).swap(*result);
- first = false;
- } else {
- (*params.result) |= (*result);
- }
+ auto single_result = std::make_shared<roaring::Roaring>();
+ StringRef query_value(match_param->query.data());
+ RETURN_IF_ERROR(index_reader->query(opts.stats, opts.runtime_state,
+
std::to_string(column.unique_id()), &query_value,
+ query_type, single_result));
+ (*result) |= (*single_result);
}
- params.result->runOptimize();
- cache->insert(cache_key, params.result, &cache_handler);
+ result->runOptimize();
+ cache->insert(cache_key, result, &cache_handler);
return Status::OK();
}
diff --git a/be/src/vec/functions/function_multi_match.h
b/be/src/vec/functions/function_multi_match.h
index 55e13778ab7..b7d2bd3c30e 100644
--- a/be/src/vec/functions/function_multi_match.h
+++ b/be/src/vec/functions/function_multi_match.h
@@ -63,8 +63,8 @@ public:
bool can_push_down_to_index() const override { return true; }
- Status eval_inverted_index(FunctionContext* context,
- segment_v2::FuncExprParams& params) override;
+ Status eval_inverted_index(FunctionContext* context,
segment_v2::FuncExprParams& params,
+ std::shared_ptr<roaring::Roaring>& result)
override;
};
} // namespace doris::vectorized
diff --git a/regression-test/data/inverted_index_p0/test_index_multi_match.out
b/regression-test/data/inverted_index_p0/test_index_multi_match.out
new file mode 100644
index 00000000000..0a2ed2730b4
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_multi_match.out
@@ -0,0 +1,25 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+178
+
+-- !sql --
+180
+
+-- !sql --
+859
+
+-- !sql --
+44
+
+-- !sql --
+178
+
+-- !sql --
+180
+
+-- !sql --
+859
+
+-- !sql --
+44
+
diff --git
a/regression-test/suites/inverted_index_p0/test_index_multi_match.groovy
b/regression-test/suites/inverted_index_p0/test_index_multi_match.groovy
new file mode 100644
index 00000000000..f08dd984a67
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_multi_match.groovy
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_multi_match", "p0"){
+ def indexTbName1 = "test_index_multi_match_1"
+ def indexTbName2 = "test_index_multi_match_2"
+
+ sql "DROP TABLE IF EXISTS ${indexTbName1}"
+ sql "DROP TABLE IF EXISTS ${indexTbName2}"
+
+ sql """
+ CREATE TABLE ${indexTbName1} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` text NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` text NULL COMMENT "",
+ `size` text NULL COMMENT "",
+ INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT '',
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT '',
+ INDEX status_idx (`status`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT '',
+ INDEX size_idx (`size`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ );
+ """
+
+ sql """
+ CREATE TABLE ${indexTbName2} (
+ `@timestamp` int(11) NULL COMMENT "",
+ `clientip` text NULL COMMENT "",
+ `request` text NULL COMMENT "",
+ `status` text NULL COMMENT "",
+ `size` text NULL COMMENT "",
+ INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT '',
+ INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT '',
+ INDEX status_idx (`status`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT '',
+ INDEX size_idx (`size`) USING INVERTED PROPERTIES("parser" =
"english", "support_phrase" = "true") COMMENT ''
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`@timestamp`)
+ COMMENT "OLAP"
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "disable_auto_compaction" = "true"
+ );
+ """
+
+ def load_httplogs_data = {table_name, label, read_flag, format_flag,
file_name, ignore_failure=false,
+ expected_succ_rows = -1, load_to_single_tablet =
'true' ->
+
+ // load the json data
+ streamLoad {
+ table "${table_name}"
+
+ // set http request header params
+ set 'label', label + "_" + UUID.randomUUID().toString()
+ set 'read_json_by_line', read_flag
+ set 'format', format_flag
+ file file_name // import json file
+ time 10000 // limit inflight 10s
+ if (expected_succ_rows >= 0) {
+ set 'max_filter_ratio', '1'
+ }
+
+ // if declared a check callback, the default check condition will
ignore.
+ // So you must check all condition
+ check { result, exception, startTime, endTime ->
+ if (ignore_failure && expected_succ_rows < 0) { return }
+ if (exception != null) {
+ throw exception
+ }
+ log.info("Stream load result: ${result}".toString())
+ def json = parseJson(result)
+ assertEquals("success", json.Status.toLowerCase())
+ if (expected_succ_rows >= 0) {
+ assertEquals(json.NumberLoadedRows, expected_succ_rows)
+ } else {
+ assertEquals(json.NumberTotalRows,
json.NumberLoadedRows + json.NumberUnselectedRows)
+ assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes
> 0)
+ }
+ }
+ }
+ }
+
+ try {
+ load_httplogs_data.call(indexTbName1, 'test_index_multi_match_1',
'true', 'json', 'documents-1000.json')
+ load_httplogs_data.call(indexTbName2, 'test_index_multi_match_2',
'true', 'json', 'documents-1000.json')
+
+ sql "sync"
+
+ qt_sql """ select count() from ${indexTbName1} where (clientip
match_phrase_prefix '2'); """
+ qt_sql """ select count() from ${indexTbName1} where (clientip
match_phrase_prefix '2' or request match_phrase_prefix '2'); """
+ qt_sql """ select count() from ${indexTbName1} where (clientip
match_phrase_prefix '2' or request match_phrase_prefix '2' or status
match_phrase_prefix '2' or size match_phrase_prefix '2'); """
+ qt_sql """ select count() from ${indexTbName1} where (clientip
match_phrase_prefix 'a' or request match_phrase_prefix 'a' or status
match_phrase_prefix 'a' or size match_phrase_prefix 'a'); """
+
+ qt_sql """ select count() from ${indexTbName2} where
multi_match(clientip, '', 'phrase_prefix', '2'); """
+ qt_sql """ select count() from ${indexTbName2} where
multi_match(clientip, 'request', 'phrase_prefix', '2'); """
+ qt_sql """ select count() from ${indexTbName2} where
multi_match(clientip, 'request, status, size', 'phrase_prefix', '2'); """
+ qt_sql """ select count() from ${indexTbName2} where
multi_match(clientip, 'request, status, size', 'phrase_prefix', 'a'); """
+
+ } finally {
+ //try_sql("DROP TABLE IF EXISTS ${testTable}")
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]