This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 2ddf97a1a38 [fix](ann-index) Fix ANN range search state leakage and
incorrect slot index tracking. (#63666)
2ddf97a1a38 is described below
commit 2ddf97a1a38707268e7ca4e267c8c6b293ddbdf4
Author: Qi Chen <[email protected]>
AuthorDate: Fri May 29 10:55:29 2026 +0800
[fix](ann-index) Fix ANN range search state leakage and incorrect slot
index tracking. (#63666)
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
ANN range search execution state was stored on shared VExpr roots.
VExprContext clones share the root expression, so a segment that
executed an ANN range search could leak that state into another segment
without an ANN index, causing the common expression to be incorrectly
removed there. ANN range search also mixed schema column indexes with
storage column ids when updating the common expression index status, so
remapped schemas failed to mark the source slot expression as evaluated.
This patch returns the ANN execution state through the current evaluation
call, stores the ANN root bitmap in the current segment's IndexContext,
and updates the slot index status by source column index.
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/exprs/vectorized_fn_call.cpp | 18 +-
be/src/exprs/vectorized_fn_call.h | 2 +-
be/src/exprs/vexpr.cpp | 12 +-
be/src/exprs/vexpr.h | 22 +--
be/src/exprs/vexpr_context.cpp | 37 +++-
be/src/exprs/vexpr_context.h | 2 +-
be/src/exprs/virtual_slot_ref.cpp | 6 +-
be/src/exprs/virtual_slot_ref.h | 2 +-
be/src/storage/segment/segment_iterator.cpp | 14 +-
.../storage/index/ann/ann_range_search_test.cpp | 213 ++++++++++++++++++++-
.../ann_range_search_pushdown_regression.groovy | 141 ++++++++++++++
...ge_search_source_index_status_regression.groovy | 84 ++++++++
12 files changed, 495 insertions(+), 58 deletions(-)
diff --git a/be/src/exprs/vectorized_fn_call.cpp
b/be/src/exprs/vectorized_fn_call.cpp
index 2d39cad9450..d6858c612d0 100644
--- a/be/src/exprs/vectorized_fn_call.cpp
+++ b/be/src/exprs/vectorized_fn_call.cpp
@@ -557,7 +557,8 @@ Status VectorizedFnCall::evaluate_ann_range_search(
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
- bool enable_result_cache) {
+ bool enable_result_cache, AnnRangeSearchEvaluationResult&
evaluation_result) {
+ evaluation_result = {};
if (range_search_runtime.is_ann_range_search == false) {
return Status::OK();
}
@@ -566,8 +567,8 @@ Status VectorizedFnCall::evaluate_ann_range_search(
range_search_runtime.to_string());
size_t origin_num = row_bitmap.cardinality();
- int idx_in_block = static_cast<int>(range_search_runtime.src_col_idx);
- DCHECK(idx_in_block < idx_to_cid.size())
+ const auto idx_in_block = range_search_runtime.src_col_idx;
+ DCHECK_LT(idx_in_block, idx_to_cid.size())
<< "idx_in_block: " << idx_in_block << ", idx_to_cid.size(): " <<
idx_to_cid.size();
ColumnId src_col_cid = idx_to_cid[idx_in_block];
@@ -649,6 +650,7 @@ Status VectorizedFnCall::evaluate_ann_range_search(
row_bitmap = *result.roaring;
// Process virtual column
+ bool dist_fulfilled = false;
if (range_search_runtime.dst_col_idx >= 0) {
// Prepare materialization if we can use result from index.
// Typical situation: range search and operator is LE or LT.
@@ -672,7 +674,7 @@ Status VectorizedFnCall::evaluate_ann_range_search(
}
virtual_column_iterator->prepare_materialization(std::move(distance_col),
std::move(result.row_ids));
- _virtual_column_is_fulfilled = true;
+ dist_fulfilled = true;
} else {
// Whether the ANN index should have produced distance depends on
metric and operator:
// - L2: distance is produced for LE/LT; not produced for GE/GT
@@ -686,17 +688,17 @@ Status VectorizedFnCall::evaluate_ann_range_search(
// If we expected distance but didn't get it, assert in debug to
catch logic errors.
DCHECK(!should_have_distance) << "Expected distance from ANN index
but got none";
#endif
- _virtual_column_is_fulfilled = false;
}
} else {
// Dest is not virtual column.
- _virtual_column_is_fulfilled = true;
+ dist_fulfilled = true;
}
- _has_been_executed = true;
+ evaluation_result.executed = true;
+ evaluation_result.dist_fulfilled = dist_fulfilled;
VLOG_DEBUG << fmt::format(
"Ann range search filtered {} rows, origin {} rows, virtual column
is full-filled: {}",
- origin_num - row_bitmap.cardinality(), origin_num,
_virtual_column_is_fulfilled);
+ origin_num - row_bitmap.cardinality(), origin_num, dist_fulfilled);
ann_index_stats = *stats;
return Status::OK();
diff --git a/be/src/exprs/vectorized_fn_call.h
b/be/src/exprs/vectorized_fn_call.h
index 64661206e63..4d8a1011169 100644
--- a/be/src/exprs/vectorized_fn_call.h
+++ b/be/src/exprs/vectorized_fn_call.h
@@ -92,7 +92,7 @@ public:
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
- bool enable_result_cache) override;
+ bool enable_result_cache, AnnRangeSearchEvaluationResult& result)
override;
void prepare_ann_range_search(const doris::VectorSearchUserParams& params,
segment_v2::AnnRangeSearchRuntime& runtime,
diff --git a/be/src/exprs/vexpr.cpp b/be/src/exprs/vexpr.cpp
index 61157a5dd00..61e3effbdd1 100644
--- a/be/src/exprs/vexpr.cpp
+++ b/be/src/exprs/vexpr.cpp
@@ -1038,7 +1038,9 @@ Status VExpr::evaluate_ann_range_search(
const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
index_iterators,
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
- roaring::Roaring& row_bitmap, AnnIndexStats& ann_index_stats, bool
enable_result_cache) {
+ roaring::Roaring& row_bitmap, AnnIndexStats& ann_index_stats, bool
enable_result_cache,
+ AnnRangeSearchEvaluationResult& result) {
+ result = {};
return Status::OK();
}
@@ -1056,14 +1058,6 @@ void VExpr::prepare_ann_range_search(const
doris::VectorSearchUserParams& params
}
}
-bool VExpr::ann_range_search_executedd() {
- return _has_been_executed;
-}
-
-bool VExpr::ann_dist_is_fulfilled() const {
- return _virtual_column_is_fulfilled;
-}
-
Status VExpr::execute_filter(VExprContext* context, const Block* block,
uint8_t* __restrict result_filter_data, size_t
rows, bool accept_null,
bool* can_filter_all) const {
diff --git a/be/src/exprs/vexpr.h b/be/src/exprs/vexpr.h
index f99b89af924..1576686fcc7 100644
--- a/be/src/exprs/vexpr.h
+++ b/be/src/exprs/vexpr.h
@@ -79,6 +79,15 @@ struct AnnRangeSearchRuntime;
using Selector = IColumn::Selector;
+struct AnnRangeSearchEvaluationResult {
+ // Indicates whether the expr row_bitmap has been updated.
+ bool executed = false;
+ // Indicates whether the virtual column is fulfilled.
+ // NOTE, if there is no virtual column in the expr tree, and expr
+ // is evaluated by ann index, this flag is still true.
+ bool dist_fulfilled = false;
+};
+
class VExpr {
public:
// resize inserted param column to make sure column size equal to
block.rows() and return param column index
@@ -346,7 +355,7 @@ public:
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
- bool enable_result_cache);
+ bool enable_result_cache, AnnRangeSearchEvaluationResult& result);
// Prepare the runtime for ANN range search.
// AnnRangeSearchRuntime is used to store the runtime information of ann
range search.
@@ -356,10 +365,6 @@ public:
segment_v2::AnnRangeSearchRuntime&
range_search_runtime,
bool& suitable_for_ann_index);
- bool ann_range_search_executedd();
-
- bool ann_dist_is_fulfilled() const;
-
virtual uint64_t get_digest(uint64_t seed) const;
protected:
@@ -442,13 +447,6 @@ protected:
// ensuring uniqueness during index traversal
uint32_t _index_unique_id = 0;
bool _enable_inverted_index_query = true;
-
- // Indicates whether the expr row_bitmap has been updated.
- bool _has_been_executed = false;
- // Indicates whether the virtual column is fulfilled.
- // NOTE, if there is no virtual column in the expr tree, and expr
- // is evaluated by ann index, this flag is still true.
- bool _virtual_column_is_fulfilled = false;
};
// NOLINTBEGIN(readability-function-size)
diff --git a/be/src/exprs/vexpr_context.cpp b/be/src/exprs/vexpr_context.cpp
index c8ca10ab048..1bdbff89ae0 100644
--- a/be/src/exprs/vexpr_context.cpp
+++ b/be/src/exprs/vexpr_context.cpp
@@ -29,6 +29,7 @@
#include "core/block/columns_with_type_and_name.h"
#include "core/column/column.h"
#include "core/column/column_const.h"
+#include "exec/common/util.hpp"
#include "exprs/function_context.h"
#include "exprs/vexpr.h"
#include "runtime/runtime_state.h"
@@ -440,41 +441,57 @@ Status VExprContext::evaluate_ann_range_search(
const std::unordered_map<VExprContext*, std::unordered_map<ColumnId,
VExpr*>>&
common_expr_to_slotref_map,
roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
- bool enable_result_cache) {
+ bool enable_result_cache, bool* ann_range_search_executed) {
+ if (ann_range_search_executed != nullptr) {
+ *ann_range_search_executed = false;
+ }
if (_root == nullptr) {
return Status::OK();
}
+ AnnRangeSearchEvaluationResult evaluation_result;
RETURN_IF_ERROR(_root->evaluate_ann_range_search(
_ann_range_search_runtime, cid_to_index_iterators, idx_to_cid,
column_iterators,
- row_bitmap, ann_index_stats, enable_result_cache));
+ row_bitmap, ann_index_stats, enable_result_cache,
evaluation_result));
- if (!_root->ann_range_search_executedd()) {
+ if (!evaluation_result.executed) {
return Status::OK();
}
+ if (ann_range_search_executed != nullptr) {
+ *ann_range_search_executed = true;
+ }
+
+ DCHECK(_index_context != nullptr);
+ _index_context->set_index_result_for_expr(
+ _root.get(),
+
segment_v2::InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(row_bitmap),
+
std::make_shared<roaring::Roaring>()));
- if (!_root->ann_dist_is_fulfilled()) {
+ if (!evaluation_result.dist_fulfilled) {
// Do not perform index scan in this case.
return Status::OK();
}
- auto src_col_idx = _ann_range_search_runtime.src_col_idx;
+ DCHECK_LT(_ann_range_search_runtime.src_col_idx, idx_to_cid.size());
+ const auto src_col_idx =
cast_set<int>(_ann_range_search_runtime.src_col_idx);
+ const auto src_col_key =
cast_set<ColumnId>(_ann_range_search_runtime.src_col_idx);
auto slot_ref_map_it = common_expr_to_slotref_map.find(this);
if (slot_ref_map_it == common_expr_to_slotref_map.end()) {
return Status::OK();
}
auto& slot_ref_map = slot_ref_map_it->second;
- ColumnId cid = idx_to_cid[src_col_idx];
- if (slot_ref_map.find(cid) == slot_ref_map.end()) {
+ auto slot_ref_it = slot_ref_map.find(src_col_key);
+ if (slot_ref_it == slot_ref_map.end()) {
return Status::OK();
}
- const VExpr* slot_ref_expr_addr = slot_ref_map.find(cid)->second;
- _index_context->set_true_for_index_status(slot_ref_expr_addr,
idx_to_cid[cid]);
+ const VExpr* slot_ref_expr_addr = slot_ref_it->second;
+ _index_context->set_true_for_index_status(slot_ref_expr_addr, src_col_idx);
VLOG_DEBUG << fmt::format(
"Evaluate ann range search for expr {}, src_col_idx {}, cid {},
row_bitmap "
"cardinality {}",
- _root->debug_string(), src_col_idx, cid, row_bitmap.cardinality());
+ _root->debug_string(), src_col_idx,
idx_to_cid[_ann_range_search_runtime.src_col_idx],
+ row_bitmap.cardinality());
return Status::OK();
}
diff --git a/be/src/exprs/vexpr_context.h b/be/src/exprs/vexpr_context.h
index cca3d4c32b0..84900c02463 100644
--- a/be/src/exprs/vexpr_context.h
+++ b/be/src/exprs/vexpr_context.h
@@ -389,7 +389,7 @@ public:
const std::unordered_map<VExprContext*,
std::unordered_map<ColumnId, VExpr*>>&
common_expr_to_slotref_map,
roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
- bool enable_result_cache);
+ bool enable_result_cache, bool* ann_range_search_executed);
uint64_t get_digest(uint64_t seed) const;
diff --git a/be/src/exprs/virtual_slot_ref.cpp
b/be/src/exprs/virtual_slot_ref.cpp
index 1ed60244a09..5367df1f31d 100644
--- a/be/src/exprs/virtual_slot_ref.cpp
+++ b/be/src/exprs/virtual_slot_ref.cpp
@@ -235,11 +235,9 @@ Status VirtualSlotRef::evaluate_ann_range_search(
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
- bool enable_result_cache) {
+ bool enable_result_cache, AnnRangeSearchEvaluationResult& result) {
return _virtual_column_expr->evaluate_ann_range_search(
range_search_runtime, cid_to_index_iterators, idx_to_cid,
column_iterators, row_bitmap,
- ann_index_stats, enable_result_cache);
-
- return Status::OK();
+ ann_index_stats, enable_result_cache, result);
}
} // namespace doris
diff --git a/be/src/exprs/virtual_slot_ref.h b/be/src/exprs/virtual_slot_ref.h
index 721203f7de9..8f1e14b4b86 100644
--- a/be/src/exprs/virtual_slot_ref.h
+++ b/be/src/exprs/virtual_slot_ref.h
@@ -108,7 +108,7 @@ public:
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
- bool enable_result_cache) override;
+ bool enable_result_cache, AnnRangeSearchEvaluationResult& result)
override;
#ifdef BE_TEST
// Test-only setter methods for unit testing
diff --git a/be/src/storage/segment/segment_iterator.cpp
b/be/src/storage/segment/segment_iterator.cpp
index 0d979ac9fa1..ef2076713a3 100644
--- a/be/src/storage/segment/segment_iterator.cpp
+++ b/be/src/storage/segment/segment_iterator.cpp
@@ -1366,10 +1366,14 @@ Status SegmentIterator::_apply_index_expr() {
for (const auto& expr_ctx : _common_expr_ctxs_push_down) {
segment_v2::AnnIndexStats ann_index_stats;
size_t origin_rows = _row_bitmap.cardinality();
+ bool ann_range_search_executed = false;
RETURN_IF_ERROR(expr_ctx->evaluate_ann_range_search(
_index_iterators, _schema->column_ids(), _column_iterators,
_common_expr_to_slotref_map, _row_bitmap, ann_index_stats,
- enable_ann_index_result_cache));
+ enable_ann_index_result_cache, &ann_range_search_executed));
+ if (ann_range_search_executed) {
+ _opts.stats->ann_index_range_search_cnt++;
+ }
_opts.stats->rows_ann_index_range_filtered += (origin_rows -
_row_bitmap.cardinality());
_opts.stats->ann_index_load_ns +=
ann_index_stats.load_index_costs_ns.value();
_opts.stats->ann_index_range_search_ns +=
ann_index_stats.search_costs_ns.value();
@@ -1386,14 +1390,6 @@ Status SegmentIterator::_apply_index_expr() {
_opts.stats->ann_index_range_cache_hits +=
ann_index_stats.range_cache_hits.value();
}
- for (auto it = _common_expr_ctxs_push_down.begin(); it !=
_common_expr_ctxs_push_down.end();) {
- if ((*it)->root()->ann_range_search_executedd()) {
- _opts.stats->ann_index_range_search_cnt++;
- it = _common_expr_ctxs_push_down.erase(it);
- } else {
- ++it;
- }
- }
return Status::OK();
}
diff --git a/be/test/storage/index/ann/ann_range_search_test.cpp
b/be/test/storage/index/ann/ann_range_search_test.cpp
index 572783ebecb..a3395f0fc2d 100644
--- a/be/test/storage/index/ann/ann_range_search_test.cpp
+++ b/be/test/storage/index/ann/ann_range_search_test.cpp
@@ -23,7 +23,9 @@
#include <cassert>
#include <cstdint>
#include <iostream>
+#include <map>
#include <memory>
+#include <unordered_map>
#include <vector>
#include "common/object_pool.h"
@@ -54,6 +56,18 @@ std::string ann_range_search_thrift =
std::string thrift_table_desc =
R"xxx({"1":{"lst":["rec",7,{"1":{"i32":0},"2":{"i32":0},"3":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5},"5":{"i32":0}}}}]},"3":{"i64":-1}}},"4":{"i32":-1},"5":{"i32":-1},"6":{"i32":0},"7":{"i32":-1},"8":{"str":"id"},"9":{"i32":0},"10":{"tf":1},"11":{"i32":0},"12":{"tf":1},"13":{"tf":1},"14":{"tf":0},"17":{"i32":5}},{"1":{"i32":1},"2":{"i32":0},"3":{"rec":{"1":{"lst":["rec",2,{"1":{"i32":1},"4":{"tf":1},"5":{"lst":["tf",1,1]}},{"1":{"i32":0},"2":{"rec":{"1
[...]
+static std::shared_ptr<IndexExecContext> create_index_context(
+ const std::vector<ColumnId>& col_ids,
+ const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
index_iterators,
+ std::vector<IndexFieldNameAndTypePair>& storage_name_and_type,
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>&
+ common_expr_index_status) {
+ segment_v2::ColumnIteratorOptions column_iter_opts;
+ return std::make_shared<IndexExecContext>(col_ids, index_iterators,
storage_name_and_type,
+ common_expr_index_status,
nullptr, nullptr,
+ column_iter_opts);
+}
+
TEST_F(VectorSearchTest, TestPrepareAnnRangeSearch) {
TExpr texpr = read_from_json<TExpr>(ann_range_search_thrift);
// std::cout << "range_search thrift:\n" <<
apache::thrift::ThriftDebugString(texpr) << std::endl;
@@ -167,11 +181,18 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch) {
segment_v2::AnnIndexStats stats;
std::unordered_map<VExprContext*, std::unordered_map<ColumnId, VExpr*>>
common_expr_to_slotref_map;
+ std::vector<IndexFieldNameAndTypePair> storage_name_and_type(4);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
common_expr_index_status;
+ range_search_ctx->set_index_context(create_index_context(
+ idx_to_cid, cid_to_index_iterators, storage_name_and_type,
common_expr_index_status));
+ bool ann_range_search_executed = false;
ASSERT_TRUE(range_search_ctx
->evaluate_ann_range_search(cid_to_index_iterators,
idx_to_cid,
column_iterators,
common_expr_to_slotref_map,
- row_bitmap, stats, false)
+ row_bitmap, stats, false,
+ &ann_range_search_executed)
.ok());
+ EXPECT_TRUE(ann_range_search_executed);
doris::segment_v2::VirtualColumnIterator* virtual_column_iter =
dynamic_cast<doris::segment_v2::VirtualColumnIterator*>(column_iterators[3].get());
@@ -264,11 +285,18 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) {
segment_v2::AnnIndexStats stats;
std::unordered_map<VExprContext*, std::unordered_map<ColumnId, VExpr*>>
common_expr_to_slotref_map;
+ std::vector<IndexFieldNameAndTypePair> storage_name_and_type(4);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
common_expr_index_status;
+ range_search_ctx->set_index_context(create_index_context(
+ idx_to_cid, cid_to_index_iterators, storage_name_and_type,
common_expr_index_status));
+ bool ann_range_search_executed = false;
ASSERT_TRUE(range_search_ctx
->evaluate_ann_range_search(cid_to_index_iterators,
idx_to_cid,
column_iterators,
common_expr_to_slotref_map,
- row_bitmap, stats, false)
+ row_bitmap, stats, false,
+ &ann_range_search_executed)
.ok());
+ EXPECT_TRUE(ann_range_search_executed);
doris::segment_v2::VirtualColumnIterator* virtual_column_iter =
dynamic_cast<doris::segment_v2::VirtualColumnIterator*>(column_iterators[3].get());
@@ -285,6 +313,185 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) {
EXPECT_EQ(get_row_id_to_idx.size(), 10);
}
+TEST_F(VectorSearchTest,
TestEvaluateAnnRangeSearchStateDoesNotLeakAcrossClones) {
+ TExpr texpr = read_from_json<TExpr>(ann_range_search_thrift);
+ TDescriptorTable table1 =
read_from_json<TDescriptorTable>(thrift_table_desc);
+ std::unique_ptr<doris::ObjectPool> pool =
std::make_unique<doris::ObjectPool>();
+ auto desc_tbl = std::make_unique<DescriptorTbl>();
+ DescriptorTbl* desc_tbl_ptr = desc_tbl.get();
+ ASSERT_TRUE(DescriptorTbl::create(pool.get(), table1,
&(desc_tbl_ptr)).ok());
+ RowDescriptor row_desc = RowDescriptor(*desc_tbl_ptr, {0});
+ std::unique_ptr<doris::RuntimeState> state =
std::make_unique<doris::RuntimeState>();
+ state->set_desc_tbl(desc_tbl_ptr);
+
+ VExprContextSPtr range_search_ctx;
+ ASSERT_TRUE(VExpr::create_expr_tree(texpr, range_search_ctx).ok());
+ ASSERT_TRUE(range_search_ctx->prepare(state.get(), row_desc).ok());
+ ASSERT_TRUE(range_search_ctx->open(state.get()).ok());
+ doris::VectorSearchUserParams user_params;
+ range_search_ctx->prepare_ann_range_search(user_params);
+
+ VExprContextSPtr segment_with_ann_ctx;
+ ASSERT_TRUE(range_search_ctx->clone(state.get(),
segment_with_ann_ctx).ok());
+ VExprContextSPtr segment_without_ann_ctx;
+ ASSERT_TRUE(range_search_ctx->clone(state.get(),
segment_without_ann_ctx).ok());
+ ASSERT_EQ(segment_with_ann_ctx->root().get(),
segment_without_ann_ctx->root().get());
+
+ std::vector<ColumnId> idx_to_cid = {0, 1, 2, 3};
+ std::vector<std::unique_ptr<segment_v2::IndexIterator>>
ann_index_iterators(4);
+ ann_index_iterators[1] =
std::make_unique<doris::vector_search_utils::MockAnnIndexIterator>();
+ auto* mock_ann_index_iter =
dynamic_cast<doris::vector_search_utils::MockAnnIndexIterator*>(
+ ann_index_iterators[1].get());
+ std::map<std::string, std::string> properties;
+ properties["index_type"] = "hnsw";
+ properties["metric_type"] = "l2_distance";
+ properties["dim"] = "8";
+ auto pair = vector_search_utils::create_tmp_ann_index_reader(properties);
+ mock_ann_index_iter->_ann_reader = pair.second;
+
+ std::vector<std::unique_ptr<segment_v2::ColumnIterator>>
ann_column_iterators(4);
+ ann_column_iterators[3] =
std::make_unique<doris::segment_v2::VirtualColumnIterator>();
+ std::vector<IndexFieldNameAndTypePair> ann_storage_name_and_type(4);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
ann_index_status;
+ segment_with_ann_ctx->set_index_context(create_index_context(
+ idx_to_cid, ann_index_iterators, ann_storage_name_and_type,
ann_index_status));
+
+ EXPECT_CALL(*mock_ann_index_iter, range_search(testing::_, testing::_,
testing::_, testing::_))
+ .WillOnce(testing::Invoke([](const
doris::segment_v2::AnnRangeSearchParams& params,
+ const doris::VectorSearchUserParams&
custom_params,
+
doris::segment_v2::AnnRangeSearchResult* result,
+ doris::segment_v2::AnnIndexStats*
stats) {
+ result->roaring = std::make_shared<roaring::Roaring>();
+ result->roaring->add(1);
+ result->roaring->add(3);
+ result->row_ids = nullptr;
+ result->distance = nullptr;
+ return Status::OK();
+ }));
+
+ roaring::Roaring ann_row_bitmap;
+ segment_v2::AnnIndexStats ann_stats;
+ std::unordered_map<VExprContext*, std::unordered_map<ColumnId, VExpr*>>
+ common_expr_to_slotref_map;
+ bool ann_range_search_executed = false;
+ ASSERT_TRUE(segment_with_ann_ctx
+ ->evaluate_ann_range_search(ann_index_iterators,
idx_to_cid,
+ ann_column_iterators,
+
common_expr_to_slotref_map, ann_row_bitmap,
+ ann_stats, false,
&ann_range_search_executed)
+ .ok());
+ EXPECT_TRUE(ann_range_search_executed);
+ const auto* ann_result =
segment_with_ann_ctx->get_index_context()->get_index_result_for_expr(
+ segment_with_ann_ctx->root().get());
+ ASSERT_NE(ann_result, nullptr);
+ ASSERT_NE(ann_result->get_data_bitmap(), nullptr);
+ EXPECT_EQ(ann_result->get_data_bitmap()->cardinality(), 2);
+
+ std::vector<std::unique_ptr<segment_v2::IndexIterator>>
no_ann_index_iterators(4);
+ std::vector<std::unique_ptr<segment_v2::ColumnIterator>>
no_ann_column_iterators(4);
+ no_ann_column_iterators[3] =
std::make_unique<doris::segment_v2::VirtualColumnIterator>();
+ std::vector<IndexFieldNameAndTypePair> no_ann_storage_name_and_type(4);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
no_ann_index_status;
+ segment_without_ann_ctx->set_index_context(create_index_context(
+ idx_to_cid, no_ann_index_iterators, no_ann_storage_name_and_type,
no_ann_index_status));
+
+ roaring::Roaring no_ann_row_bitmap;
+ segment_v2::AnnIndexStats no_ann_stats;
+ bool no_ann_range_search_executed = true;
+ ASSERT_TRUE(segment_without_ann_ctx
+ ->evaluate_ann_range_search(
+ no_ann_index_iterators, idx_to_cid,
no_ann_column_iterators,
+ common_expr_to_slotref_map, no_ann_row_bitmap,
no_ann_stats, false,
+ &no_ann_range_search_executed)
+ .ok());
+ EXPECT_FALSE(no_ann_range_search_executed);
+
EXPECT_FALSE(segment_without_ann_ctx->get_index_context()->has_index_result_for_expr(
+ segment_without_ann_ctx->root().get()));
+}
+
+TEST_F(VectorSearchTest,
TestEvaluateAnnRangeSearchUsesSourceColumnIndexForSlotMap) {
+ TExpr texpr = read_from_json<TExpr>(ann_range_search_thrift);
+ TExprNode& opNode = texpr.nodes[0];
+ opNode.opcode = TExprOpcode::LT;
+ opNode.fn.name.function_name = doris::NameLess::name;
+ TDescriptorTable table1 =
read_from_json<TDescriptorTable>(thrift_table_desc);
+ std::unique_ptr<doris::ObjectPool> pool =
std::make_unique<doris::ObjectPool>();
+ auto desc_tbl = std::make_unique<DescriptorTbl>();
+ DescriptorTbl* desc_tbl_ptr = desc_tbl.get();
+ ASSERT_TRUE(DescriptorTbl::create(pool.get(), table1,
&(desc_tbl_ptr)).ok());
+ RowDescriptor row_desc = RowDescriptor(*desc_tbl_ptr, {0});
+ std::unique_ptr<doris::RuntimeState> state =
std::make_unique<doris::RuntimeState>();
+ state->set_desc_tbl(desc_tbl_ptr);
+
+ VExprContextSPtr range_search_ctx;
+ ASSERT_TRUE(VExpr::create_expr_tree(texpr, range_search_ctx).ok());
+ ASSERT_TRUE(range_search_ctx->prepare(state.get(), row_desc).ok());
+ ASSERT_TRUE(range_search_ctx->open(state.get()).ok());
+ doris::VectorSearchUserParams user_params;
+ range_search_ctx->prepare_ann_range_search(user_params);
+ ASSERT_EQ(range_search_ctx->_ann_range_search_runtime.src_col_idx, 1);
+ ASSERT_EQ(range_search_ctx->_ann_range_search_runtime.dst_col_idx, 3);
+
+ std::vector<ColumnId> idx_to_cid = {0, 5, 6, 7};
+ std::vector<std::unique_ptr<segment_v2::IndexIterator>>
cid_to_index_iterators(8);
+ cid_to_index_iterators[5] =
+
std::make_unique<doris::vector_search_utils::MockAnnIndexIterator>();
+ auto* mock_ann_index_iter =
dynamic_cast<doris::vector_search_utils::MockAnnIndexIterator*>(
+ cid_to_index_iterators[5].get());
+ std::map<std::string, std::string> properties;
+ properties["index_type"] = "hnsw";
+ properties["metric_type"] = "l2_distance";
+ properties["dim"] = "8";
+ auto pair = vector_search_utils::create_tmp_ann_index_reader(properties);
+ mock_ann_index_iter->_ann_reader = pair.second;
+
+ std::vector<std::unique_ptr<segment_v2::ColumnIterator>>
column_iterators(8);
+ column_iterators[7] =
std::make_unique<doris::segment_v2::VirtualColumnIterator>();
+ std::vector<IndexFieldNameAndTypePair> storage_name_and_type(8);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
common_expr_index_status;
+ common_expr_index_status[5][range_search_ctx->root().get()] = false;
+ range_search_ctx->set_index_context(create_index_context(
+ idx_to_cid, cid_to_index_iterators, storage_name_and_type,
common_expr_index_status));
+
+ std::unordered_map<VExprContext*, std::unordered_map<ColumnId, VExpr*>>
+ common_expr_to_slotref_map;
+ common_expr_to_slotref_map[range_search_ctx.get()][1] =
range_search_ctx->root().get();
+
+ EXPECT_CALL(*mock_ann_index_iter, range_search(testing::_, testing::_,
testing::_, testing::_))
+ .WillOnce(testing::Invoke([](const
doris::segment_v2::AnnRangeSearchParams& params,
+ const doris::VectorSearchUserParams&
custom_params,
+
doris::segment_v2::AnnRangeSearchResult* result,
+ doris::segment_v2::AnnIndexStats*
stats) {
+ constexpr size_t num_results = 3;
+ result->roaring = std::make_shared<roaring::Roaring>();
+ result->row_ids = std::make_shared<std::vector<uint64_t>>();
+ result->distance = std::shared_ptr<float[]>(new
float[num_results]);
+ for (size_t i = 0; i < num_results; ++i) {
+ result->roaring->add(i * 2);
+ result->row_ids->push_back(i * 2);
+ result->distance[i] = static_cast<float>(i);
+ }
+ return Status::OK();
+ }));
+
+ roaring::Roaring row_bitmap;
+ segment_v2::AnnIndexStats stats;
+ bool ann_range_search_executed = false;
+ ASSERT_TRUE(range_search_ctx
+ ->evaluate_ann_range_search(cid_to_index_iterators,
idx_to_cid,
+ column_iterators,
common_expr_to_slotref_map,
+ row_bitmap, stats, false,
+ &ann_range_search_executed)
+ .ok());
+ EXPECT_TRUE(ann_range_search_executed);
+ EXPECT_TRUE(common_expr_index_status[5][range_search_ctx->root().get()]);
+ const auto* result =
range_search_ctx->get_index_context()->get_index_result_for_expr(
+ range_search_ctx->root().get());
+ ASSERT_NE(result, nullptr);
+ ASSERT_NE(result->get_data_bitmap(), nullptr);
+ EXPECT_EQ(result->get_data_bitmap()->cardinality(), 3);
+}
+
TEST_F(VectorSearchTest, TestRangeSearchRuntimeInfoToString) {
// Test default constructor
doris::segment_v2::AnnRangeSearchRuntime runtime_info;
@@ -665,7 +872,7 @@ TEST_F(VectorSearchTest,
TestEvaluateAnnRangeSearch_DimensionMismatch) {
auto st = range_search_ctx->evaluate_ann_range_search(
cid_to_index_iterators, idx_to_cid, column_iterators,
common_expr_to_slotref_map,
- row_bitmap, stats, false);
+ row_bitmap, stats, false, nullptr);
EXPECT_FALSE(st.ok());
EXPECT_TRUE(st.is<doris::ErrorCode::INVALID_ARGUMENT>());
}
diff --git
a/regression-test/suites/ann_index_p0/ann_range_search_pushdown_regression.groovy
b/regression-test/suites/ann_index_p0/ann_range_search_pushdown_regression.groovy
new file mode 100644
index 00000000000..c3122b9d1e6
--- /dev/null
+++
b/regression-test/suites/ann_index_p0/ann_range_search_pushdown_regression.groovy
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import groovy.json.JsonSlurper
+
+// Returns the raw response body of the FE query-profile listing endpoint
+// (GET /rest/v1/query_profile). The FE address and credentials come from
+// context.config; the request carries an HTTP Basic Authorization header and
+// a null password is sent as the empty string.
+def getProfileList = {
+    def request = new URL("http://" + context.config.feHttpAddress + "/rest/v1/query_profile").openConnection()
+    request.setRequestMethod("GET")
+
+    // Build "user:password" and Base64-encode its UTF-8 bytes for Basic auth.
+    def secret = context.config.feHttpPassword
+    if (secret == null) {
+        secret = ""
+    }
+    def credentials = context.config.feHttpUser + ":" + secret
+    def encoding = Base64.getEncoder().encodeToString(credentials.getBytes("UTF-8"))
+
+    request.setRequestProperty("Authorization", "Basic ${encoding}")
+    return request.getInputStream().getText()
+}
+
+// Returns the text profile of the query identified by `id`
+// (GET /api/profile/text/?query_id=<id>). The FE address and credentials come
+// from context.config; the request carries an HTTP Basic Authorization header
+// and a null password is sent as the empty string.
+def getProfile = { id ->
+    def request = new URL("http://" + context.config.feHttpAddress + "/api/profile/text/?query_id=$id").openConnection()
+    request.setRequestMethod("GET")
+
+    // Build "user:password" and Base64-encode its UTF-8 bytes for Basic auth.
+    def secret = context.config.feHttpPassword
+    if (secret == null) {
+        secret = ""
+    }
+    def credentials = context.config.feHttpUser + ":" + secret
+    def encoding = Base64.getEncoder().encodeToString(credentials.getBytes("UTF-8"))
+
+    request.setRequestProperty("Authorization", "Basic ${encoding}")
+    return request.getInputStream().getText()
+}
+
+// Scans `profileText` line by line and returns, as a String, the first numeric
+// value (integer or decimal) that follows "<counterName>:" on a line.
+// Returns null when no line carries such a value.
+def extractCounterValue = { String profileText, String counterName ->
+    def marker = counterName + ":"
+    // The counter name is quoted so regex metacharacters in it match literally.
+    def valueRegex = /${java.util.regex.Pattern.quote(counterName)}:\s*([0-9]+(?:\.[0-9]+)?)/
+
+    for (def profileLine : profileText.split("\n")) {
+        // Cheap substring test first; only candidate lines reach the regex.
+        if (!profileLine.contains(marker)) {
+            continue
+        }
+        def matcher = (profileLine =~ valueRegex)
+        if (matcher.find()) {
+            return matcher.group(1)
+        }
+    }
+    return null
+}
+
+// Regression suite: runs an ANN range-search query against a table loaded by
+// three separate INSERTs, then checks both the returned ids and the
+// AnnIndexRangeSearchCnt counter in the query profile.
+suite("ann_range_search_pushdown_regression", "nonConcurrent") {
+ // Returns the text profile of the statement whose SQL contains `token`.
+ // The profile list is polled at most 10 times, sleeping 300 ms after each
+ // miss; the suite fails if no matching profile id was found.
+ def getProfileWithToken = { token ->
+ String profileId = ""
+ int attempts = 0
+ while (attempts < 10 && (profileId == null || profileId == "")) {
+ List profileData = new
JsonSlurper().parseText(getProfileList()).data.rows
+ for (def profileItem in profileData) {
+ if (profileItem["Sql Statement"].toString().contains(token)) {
+ profileId = profileItem["Profile ID"].toString()
+ break
+ }
+ }
+ if (profileId == null || profileId == "") {
+ Thread.sleep(300)
+ }
+ attempts++
+ }
+ assertTrue(profileId != null && profileId != "")
+ // NOTE(review): fixed 800 ms pause before fetching the profile text;
+ // presumably gives the FE time to finish populating it. Confirm.
+ Thread.sleep(800)
+ return getProfile(profileId).toString()
+ }
+
+ // Session setup: reset all variables, then set the options this case relies
+ // on. Profiling is enabled because the counter is read from the profile below.
+ sql "unset variable all;"
+ sql "set enable_common_expr_pushdown=true;"
+ sql "set experimental_enable_virtual_slot_for_cse=true;"
+ sql "set enable_no_need_read_data_opt=true;"
+ sql "set enable_profile=true;"
+ sql "set profile_level=2;"
+ sql "set parallel_pipeline_task_num=1;"
+ sql "set enable_sql_cache=false;"
+ sql "set enable_condition_cache=false;"
+
+ // Case 1: one rowset has an IVF ANN index, while the surrounding small
+ // rowsets skip ANN index building because they have fewer rows than nlist.
+ // This creates a single scan with mixed indexed/non-indexed segments. The
+ // ANN range search execution state must be per segment instead of stored on
+ // the shared expression root.
+ sql "drop table if exists ann_range_mixed_segment_index"
+ sql """
+ create table ann_range_mixed_segment_index (
+ id int not null,
+ embedding array<float> not null,
+ index idx_embedding(`embedding`) using ann properties(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "dim"="3",
+ "nlist"="2"
+ )
+ ) duplicate key(id)
+ distributed by hash(id) buckets 1
+ properties(
+ "replication_num"="1",
+ "disable_auto_compaction"="true"
+ );
+ """
+
+ sql "set ivf_nprobe=2;"
+ // Three separate loads of 1, 4 and 1 rows. The two single-row loads hold
+ // vectors far from the query point and have fewer rows than nlist (2).
+ sql """
+ insert into ann_range_mixed_segment_index values
+ (100, [10.0, 10.0, 10.0]);
+ """
+ sql """
+ insert into ann_range_mixed_segment_index values
+ (1, [0.0, 0.0, 0.0]),
+ (2, [0.1, 0.0, 0.0]),
+ (3, [0.2, 0.0, 0.0]),
+ (4, [0.3, 0.0, 0.0]);
+ """
+ sql """
+ insert into ann_range_mixed_segment_index values
+ (101, [10.0, 10.0, 10.0]);
+ """
+
+ // A random UUID is placed in the select list so that the statement text is
+ // unique and its profile can be located by getProfileWithToken.
+ def tokenMixed = UUID.randomUUID().toString()
+ def mixedRows = sql """
+ select id, "${tokenMixed}"
+ from ann_range_mixed_segment_index
+ where l2_distance_approximate(embedding, [0.0, 0.0, 0.0]) < 1.0
+ order by id;
+ """
+ // Only the four rows of the middle load lie within distance 1.0 of the origin.
+ assertEquals([1, 2, 3, 4], mixedRows.collect { it[0] })
+
+ // extractCounterValue returns the counter as a String, hence the "1" literal.
+ def mixedProfile = getProfileWithToken(tokenMixed)
+ def rangeSearchCnt = extractCounterValue(mixedProfile,
"AnnIndexRangeSearchCnt")
+ logger.info("Mixed indexed/non-indexed segment
AnnIndexRangeSearchCnt=${rangeSearchCnt}")
+ assertEquals("1", rangeSearchCnt)
+
+}
diff --git
a/regression-test/suites/ann_index_p0/ann_range_search_source_index_status_regression.groovy
b/regression-test/suites/ann_index_p0/ann_range_search_source_index_status_regression.groovy
new file mode 100644
index 00000000000..f4a881e75ce
--- /dev/null
+++
b/regression-test/suites/ann_index_p0/ann_range_search_source_index_status_regression.groovy
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression suite: runs the same ANN range-search predicate twice against a
+// five-column table, once selecting only `id` while a BE debug point is active
+// and once selecting `embedding` as well, and checks the returned ids each time.
+suite("ann_range_search_source_index_status_regression", "nonConcurrent") {
+ // Session setup: reset all variables, then set the options this case relies on.
+ sql "unset variable all;"
+ sql "set enable_common_expr_pushdown=true;"
+ sql "set experimental_enable_virtual_slot_for_cse=true;"
+ sql "set enable_no_need_read_data_opt=true;"
+ sql "set parallel_pipeline_task_num=1;"
+ sql "set enable_sql_cache=false;"
+ sql "set enable_condition_cache=false;"
+
+ // The source column's index in the scan block differs from its storage
+ // ColumnId. ANN range search should mark the common expression status by
+ // scan-block source column index, so the embedding column can be skipped
+ // when it is only used by the satisfied ANN predicate.
+ sql "drop table if exists ann_range_source_index_status"
+ // `embedding` is declared fourth, after id, pad_int and pad_text.
+ sql """
+ create table ann_range_source_index_status (
+ id int not null,
+ pad_int int not null,
+ pad_text string not null,
+ embedding array<float> not null,
+ value int not null,
+ index idx_embedding(`embedding`) using ann properties(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="3"
+ )
+ ) duplicate key(id)
+ distributed by hash(id) buckets 1
+ properties("replication_num"="1");
+ """
+
+ // Ten rows in a single load; every vector lies within distance 1.0 of the origin.
+ sql """
+ insert into ann_range_source_index_status values
+ (1, 10, 'a', [0.0, 0.0, 0.0], 100),
+ (2, 20, 'b', [0.1, 0.0, 0.0], 200),
+ (3, 30, 'c', [0.2, 0.0, 0.0], 300),
+ (4, 40, 'd', [0.3, 0.0, 0.0], 400),
+ (5, 50, 'e', [0.4, 0.0, 0.0], 500),
+ (6, 60, 'f', [0.5, 0.0, 0.0], 600),
+ (7, 70, 'g', [0.6, 0.0, 0.0], 700),
+ (8, 80, 'h', [0.7, 0.0, 0.0], 800),
+ (9, 90, 'i', [0.8, 0.0, 0.0], 900),
+ (10, 100, 'j', [0.9, 0.0, 0.0], 1000);
+ """
+
+ // First query selects only `id`, with the debug point enabled on all BEs for
+ // column "embedding"; the finally block disables it even if the query fails.
+ // NOTE(review): presumably the debug point makes the BE fail when the
+ // embedding column is actually read. Confirm against segment_iterator.
+ try {
+ GetDebugPoint().enableDebugPointForAllBEs(
+ "segment_iterator._read_columns_by_index", [column_name:
"embedding"])
+ def indexOnlyRows = sql """
+ select id
+ from ann_range_source_index_status
+ where l2_distance_approximate(embedding, [0.0, 0.0, 0.0]) < 1.0
+ order by id;
+ """
+ assertEquals([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], indexOnlyRows.collect {
it[0] })
+ } finally {
+
GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+ }
+
+ // Second query also projects `embedding` and runs with the debug point
+ // disabled; the same ten ids are expected.
+ def readEmbeddingRows = sql """
+ select id, embedding
+ from ann_range_source_index_status
+ where l2_distance_approximate(embedding, [0.0, 0.0, 0.0]) < 1.0
+ order by id;
+ """
+ assertEquals([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], readEmbeddingRows.collect {
it[0] })
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]