This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch vector-index-dev
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/vector-index-dev by this push:
new 9500b46a8d0 [fix] VirtualColumnIterator should do scatter to input
column in its prepare function. (#51299)
9500b46a8d0 is described below
commit 9500b46a8d0c798edd0cec41bf65be7ae1649126
Author: zhiqiang <[email protected]>
AuthorDate: Tue May 27 21:37:14 2025 +0800
[fix] VirtualColumnIterator should do scatter to input column in its
prepare function. (#51299)
---
be/src/olap/rowset/segment_v2/ann_index_writer.cpp | 2 +-
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 5 +-
.../rowset/segment_v2/virtual_column_iterator.cpp | 67 +++++++++++++++----
be/src/vec/exprs/vann_topn_predicate.cpp | 6 +-
be/src/vec/exprs/vectorized_fn_call.cpp | 12 ++--
.../vec/functions/array/function_array_distance.h | 5 +-
.../olap/vector_search/ann_range_search_test.cpp | 8 +--
.../vector_search/virtual_column_iterator_test.cpp | 76 ++++++++++++++++++++++
8 files changed, 151 insertions(+), 30 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
index 0f8c6cd1d7c..ffc3b9bb1fd 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
@@ -59,7 +59,7 @@ Status AnnIndexColumnWriter::init() {
_vector_index = nullptr;
const auto& properties = _index_meta->properties();
const std::string index_type = get_or_default(properties, INDEX_TYPE,
"hnsw");
- const std::string metric_type = get_or_default(properties, METRIC_TYPE,
"l2");
+ const std::string metric_type = get_or_default(properties, METRIC_TYPE,
"l2_distance");
const std::string quantilizer = get_or_default(properties, QUANTILIZER,
"flat");
FaissBuildParameter builderParameter;
std::shared_ptr<FaissVectorIndex> faiss_index =
std::make_shared<FaissVectorIndex>();
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 3075af6122c..d77c72d8ffb 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1882,9 +1882,10 @@ Status SegmentIterator::_read_columns_by_index(uint32_t
nrows_read_limit, uint32
SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_ns);
nrows_read = _range_iter->read_batch_rowids(_block_rowids.data(),
nrows_read_limit);
- LOG_INFO("nrows_read from range iterator: {}", nrows_read);
bool is_continuous = (nrows_read > 1) &&
(_block_rowids[nrows_read - 1] - _block_rowids[0] ==
nrows_read - 1);
+ LOG_INFO("nrows_read from range iterator: {}, is_continus {}", nrows_read,
is_continuous);
+
std::vector<ColumnId> predicate_column_ids_and_virtual_columns;
predicate_column_ids_and_virtual_columns.reserve(_cols_read_by_column_predicate.size()
+
_virtual_column_exprs.size());
@@ -2922,7 +2923,6 @@ bool SegmentIterator::_can_opt_topn_reads() {
Status SegmentIterator::_materialization_of_virtual_column(vectorized::Block*
block) {
size_t prev_block_columns = block->columns();
- LOG_INFO("Materialize all {} virtual columns",
_virtual_column_exprs.size());
for (const auto& cid_and_expr : _virtual_column_exprs) {
auto cid = cid_and_expr.first;
auto column_expr = cid_and_expr.second;
@@ -2947,7 +2947,6 @@ Status
SegmentIterator::_materialization_of_virtual_column(vectorized::Block* bl
// During execution of expr, some columns may be added to the end of the
block.
// Remove them to keep consistent with current block.
block->erase_tail(prev_block_columns);
- LOG_INFO("Materialize all {} virtual columns end.",
_virtual_column_exprs.size());
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp
b/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp
index 82cb4631cd3..73285504624 100644
--- a/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp
@@ -20,6 +20,7 @@
#include <cstring>
#include <memory>
+#include "gutil/integral_types.h"
#include "vec/columns/column.h"
#include "vec/columns/column_nothing.h"
@@ -36,16 +37,50 @@ Status VirtualColumnIterator::init(const
ColumnIteratorOptions& opts) {
void VirtualColumnIterator::prepare_materialization(vectorized::IColumn::Ptr
column,
std::unique_ptr<std::vector<uint64_t>> labels) {
- _materialized_column_ptr = column;
+ DCHECK(labels->size() == column->size()) << "labels size: " <<
labels->size()
+ << ", materialized column size: "
<< column->size();
+ // 1. Sort the labels.
+ // column: [100, 101, 102, 99, 50, 49]
+ // labels: [5, 4, 1, 10, 7, 2]
+ const std::vector<uint64_t>& labels_ref = *labels;
+ const size_t n = labels_ref.size();
+ LOG_INFO("Input labels {}", fmt::join(labels_ref, ", "));
+ std::vector<size_t> order(n);
+ // global_row_id_to_idx:
+ // {5:0, 4:1, 1:2, 10:3, 7:4, 2:5}
+ std::map<size_t, size_t> global_row_id_to_idx;
+ for (size_t i = 0; i < n; ++i) {
+ order[i] = labels_ref[i];
+ global_row_id_to_idx[labels_ref[i]] = i;
+ }
+
+ // order (after sorting): [1, 2, 4, 5, 7, 10]
+ std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { return a <
b; });
+ LOG_INFO("Sorted order {}", fmt::join(order, ", "));
+ // 2. scatter column
+ auto scattered_column = column->clone_empty();
+ // We need a mapping from global row id to local index in the materialized
column.
_row_id_to_idx.clear();
- DCHECK(labels->size() == _materialized_column_ptr->size())
- << "labels size: " << labels->size()
- << ", materialized column size: " <<
_materialized_column_ptr->size();
- _size = _materialized_column_ptr->size();
- for (size_t i = 0; i < _size; ++i) {
- _row_id_to_idx[(*labels)[i]] = i;
+ for (size_t i = 0; i < n; ++i) {
+ size_t global_idx = order[i];
+ size_t original_col_idx = global_row_id_to_idx[global_idx];
+ _row_id_to_idx[global_idx] = i;
+ scattered_column->insert_from(*column, original_col_idx);
}
+ // After scatter:
+ // scattered_column: [102, 49, 101, 100, 50, 99]
+ // _row_id_to_idx: {1:0, 2:1, 4:2, 5:3, 7:4, 10:5}
+ _materialized_column_ptr = std::move(scattered_column);
+
+ _size = n;
+
+ std::string msg;
+ for (const auto& pair : _row_id_to_idx) {
+ msg += fmt::format("{}: {}, ", pair.first, pair.second);
+ }
+
+ LOG_INFO("virtual column iterator, row_idx_to_idx:\n{}", msg);
_filter = doris::vectorized::IColumn::Filter(_size, 0);
}
@@ -86,10 +121,15 @@ Status VirtualColumnIterator::next_batch(size_t* n,
vectorized::MutableColumnPtr
return Status::InternalError("Current ordinal {} not found in
row_id_to_idx map",
_current_ordinal);
}
- size_t start = _row_id_to_idx[_current_ordinal];
+
// Update dst column
- dst = _materialized_column_ptr->clone_empty();
- dst->insert_range_from(*_materialized_column_ptr, start, rows_num_to_read);
+ if (vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst)) {
+ LOG_INFO("Dst is nothing column, create new mutable column");
+ dst = _materialized_column_ptr->clone_resized(rows_num_to_read);
+ } else {
+ size_t start = _row_id_to_idx[_current_ordinal];
+ dst->insert_range_from(*_materialized_column_ptr, start,
rows_num_to_read);
+ }
LOG_INFO("Virtual column iterators, next_batch, rows reads: {}, dst size:
{}", rows_num_to_read,
dst->size());
@@ -114,7 +154,12 @@ Status VirtualColumnIterator::read_by_rowids(const
rowid_t* rowids, const size_t
// Apply filter to materialized column
doris::vectorized::IColumn::Ptr res_col =
_materialized_column_ptr->filter(_filter, 0);
// Update dst column
- dst = res_col->assume_mutable();
+ if (vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst)) {
+ LOG_INFO("Dst is nothing column, create new mutable column");
+ dst = res_col->assume_mutable();
+ } else {
+ dst->insert_range_from(*res_col, 0, res_col->size());
+ }
LOG_INFO("Virtual column iterators, read_by_rowids, rowids size: {}, dst
size: {}", count,
dst->size());
diff --git a/be/src/vec/exprs/vann_topn_predicate.cpp
b/be/src/vec/exprs/vann_topn_predicate.cpp
index 30352b09782..1f8a9bd7047 100644
--- a/be/src/vec/exprs/vann_topn_predicate.cpp
+++ b/be/src/vec/exprs/vann_topn_predicate.cpp
@@ -158,12 +158,12 @@ Status AnnTopNDescriptor::evaluate_vector_ann_search(
DCHECK(ann_query_params.row_ids != nullptr);
result_column = ColumnFloat64::create();
- ColumnFloat64* result_column_float =
assert_cast<ColumnFloat64*>(result_column.get());
+ ColumnFloat64* result_column_double =
assert_cast<ColumnFloat64*>(result_column.get());
size_t num_results = ann_query_params.distance->size();
- result_column_float->resize(num_results);
+ result_column_double->resize(num_results);
for (size_t i = 0; i < num_results; ++i) {
- result_column_float->get_data()[i] = (*ann_query_params.distance)[i];
+ result_column_double->get_data()[i] = (*ann_query_params.distance)[i];
}
row_ids = std::move(ann_query_params.row_ids);
return Status::OK();
diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp
b/be/src/vec/exprs/vectorized_fn_call.cpp
index 9ed006195c3..c6fa1a45d55 100644
--- a/be/src/vec/exprs/vectorized_fn_call.cpp
+++ b/be/src/vec/exprs/vectorized_fn_call.cpp
@@ -525,11 +525,13 @@ Status VectorizedFnCall::evaluate_ann_range_search(
DCHECK(virtual_column_iterator != nullptr);
// Now convert distance to column
size_t size = result.roaring->cardinality();
- // TODO: need to consider nullable column.
- auto distance_col = ColumnFloat32::create();
-
-
distance_col->insert_many_raw_data(reinterpret_cast<char*>(result.distance.get()),
- size);
+ auto distance_col = ColumnFloat64::create(size);
+ // float* -> double*: each element must be converted individually
+ const float* src = reinterpret_cast<const
float*>(result.distance.get());
+ double* dst = distance_col->get_data().data();
+ for (size_t i = 0; i < size; ++i) {
+ dst[i] = static_cast<double>(src[i]);
+ }
virtual_column_iterator->prepare_materialization(std::move(distance_col),
std::move(result.row_ids));
} else {
diff --git a/be/src/vec/functions/array/function_array_distance.h
b/be/src/vec/functions/array/function_array_distance.h
index fcb7a067a07..8a2d533fad0 100644
--- a/be/src/vec/functions/array/function_array_distance.h
+++ b/be/src/vec/functions/array/function_array_distance.h
@@ -96,9 +96,8 @@ public:
Status execute_impl(FunctionContext* context, Block& block, const
ColumnNumbers& arguments,
uint32_t result, size_t input_rows_count) const
override {
- LOG_INFO("Function {} is executed with {} rows, stack {}", get_name(),
input_rows_count,
- doris::get_stack_trace());
-
+ // LOG_INFO("Function {} is executed with {} rows, stack {}",
get_name(), input_rows_count,
+ // doris::get_stack_trace());
const auto& arg1 = block.get_by_position(arguments[0]);
const auto& arg2 = block.get_by_position(arguments[1]);
if (!_check_input_type(arg1.type) || !_check_input_type(arg2.type)) {
diff --git a/be/test/olap/vector_search/ann_range_search_test.cpp
b/be/test/olap/vector_search/ann_range_search_test.cpp
index f5599526b38..475dd2dad17 100644
--- a/be/test/olap/vector_search/ann_range_search_test.cpp
+++ b/be/test/olap/vector_search/ann_range_search_test.cpp
@@ -1028,13 +1028,13 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) {
dynamic_cast<doris::segment_v2::VirtualColumnIterator*>(column_iterators[3].get());
vectorized::IColumn::Ptr column =
virtual_column_iter->get_materialized_column();
- const vectorized::ColumnFloat32* float_column =
- check_and_get_column<const
vectorized::ColumnFloat32>(column.get());
+ const vectorized::ColumnFloat64* double_column =
+ check_and_get_column<const
vectorized::ColumnFloat64>(column.get());
const vectorized::ColumnNothing* nothing_column =
check_and_get_column<const
vectorized::ColumnNothing>(column.get());
- ASSERT_NE(float_column, nullptr);
+ ASSERT_NE(double_column, nullptr);
ASSERT_EQ(nothing_column, nullptr);
- EXPECT_EQ(float_column->size(), 10);
+ EXPECT_EQ(double_column->size(), 10);
EXPECT_EQ(row_bitmap.cardinality(), 10);
const auto& get_row_id_to_idx = virtual_column_iter->get_row_id_to_idx();
diff --git a/be/test/olap/vector_search/virtual_column_iterator_test.cpp
b/be/test/olap/vector_search/virtual_column_iterator_test.cpp
index d860a7a7ad8..0c3f4a73169 100644
--- a/be/test/olap/vector_search/virtual_column_iterator_test.cpp
+++ b/be/test/olap/vector_search/virtual_column_iterator_test.cpp
@@ -20,6 +20,7 @@
#include <gtest/gtest.h>
#include "vec/columns/column.h"
+#include "vec/columns/column_nothing.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/core/types.h"
@@ -344,4 +345,79 @@ TEST_F(VectorSearchTest, NextBatchTest1) {
}
}
+TEST_F(VectorSearchTest, TestPrepare1) {
+ VirtualColumnIterator iterator;
+
+ // Create a materialized int32_t column with values [10, 20, 30, 40, 50]
+ auto int_column = vectorized::ColumnVector<int32_t>::create();
+ int_column->insert(10);
+ int_column->insert(20);
+ int_column->insert(30);
+ int_column->insert(40);
+ int_column->insert(50);
+ auto labels = std::make_unique<std::vector<uint64_t>>();
+ labels->push_back(100);
+ labels->push_back(11);
+ labels->push_back(33);
+ labels->push_back(22);
+ labels->push_back(55);
+ // Set the materialized column
+ iterator.prepare_materialization(std::move(int_column), std::move(labels));
+
+ // Verify row_id_to_idx mapping
+ const auto& row_id_to_idx = iterator.get_row_id_to_idx();
+ ASSERT_EQ(row_id_to_idx.size(), 5);
+ ASSERT_EQ(row_id_to_idx.find(11)->second, 0);
+ ASSERT_EQ(row_id_to_idx.find(22)->second, 1);
+ ASSERT_EQ(row_id_to_idx.find(33)->second, 2);
+ ASSERT_EQ(row_id_to_idx.find(55)->second, 3);
+ ASSERT_EQ(row_id_to_idx.find(100)->second, 4);
+
+ auto materialization_col = iterator.get_materialized_column();
+ auto int_col_m =
+ assert_cast<const
vectorized::ColumnVector<int32_t>*>(materialization_col.get());
+ ASSERT_EQ(int_col_m->get_data()[0], 20);
+ ASSERT_EQ(int_col_m->get_data()[1], 40);
+ ASSERT_EQ(int_col_m->get_data()[2], 30);
+ ASSERT_EQ(int_col_m->get_data()[3], 50);
+ ASSERT_EQ(int_col_m->get_data()[4], 10);
+}
+
+TEST_F(VectorSearchTest, TestColumnNothing) {
+ VirtualColumnIterator iterator;
+
+ // Create a materialized int32_t column with values [10, 20, 30, 40, 50]
+ auto int_column = vectorized::ColumnVector<int32_t>::create();
+ int_column->insert(10);
+ int_column->insert(20);
+ int_column->insert(30);
+ int_column->insert(40);
+ int_column->insert(50);
+ auto labels = std::make_unique<std::vector<uint64_t>>();
+ labels->push_back(100);
+ labels->push_back(11);
+ labels->push_back(33);
+ labels->push_back(22);
+ labels->push_back(55);
+ // Set the materialized column
+ iterator.prepare_materialization(std::move(int_column), std::move(labels));
+
+ // Create destination column
+ vectorized::MutableColumnPtr dst = vectorized::ColumnNothing::create(0);
+
+ // Read by rowids; the ColumnNothing dst should be replaced by a column with 3 rows
+ rowid_t rowids[] = {11, 22, 33};
+ size_t count = sizeof(rowids) / sizeof(rowids[0]);
+ Status status = iterator.read_by_rowids(rowids, count, dst);
+ ASSERT_TRUE(status.ok());
+ auto tmp_nothing =
vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst);
+ ASSERT_TRUE(tmp_nothing == nullptr);
+ auto tmp_col_i32 =
vectorized::check_and_get_column<vectorized::ColumnVector<int32_t>>(
+ *iterator.get_materialized_column());
+ ASSERT_TRUE(tmp_col_i32 != nullptr);
+ ASSERT_EQ(dst->size(), 3);
+ ASSERT_EQ(tmp_col_i32->get_data()[0], 20);
+ ASSERT_EQ(tmp_col_i32->get_data()[1], 40);
+ ASSERT_EQ(tmp_col_i32->get_data()[2], 30);
+}
} // namespace doris::vectorized
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]