This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch vector-index-dev
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/vector-index-dev by this push:
new 13413b20f07 fix conflict with global lazy materialization (#52093)
13413b20f07 is described below
commit 13413b20f07a463a2f349dcc0fee68d5c91fb04a
Author: zhiqiang <[email protected]>
AuthorDate: Sat Jun 21 13:49:05 2025 +0800
fix conflict with global lazy materialization (#52093)
解决跟 TOP N 全局延迟物化实现上的冲突以及一些DDL的增强。
解决冲突:
1. 在 finalizeForNereids 中为每个虚拟列绑定一个 Column,这个 Column 的 unique id 从
INT32_MAX - 1 开始递减
2. 虚拟列绑定的 Column 的 NAME 必须以 `__DORIS_VIRTUAL_COL__` 开头
3. 类型需要与 VirtualSlot 的输出类型一致
4. 上述绑定必须在 finalizeForNereids 阶段而不是 toThrift 的时候做,不然无法处理
DescriptorTable,并且只有这样才能在 explain 的时候能够看到真正的 unique id
5. `toThrift`阶段把虚拟列的 Column 添加到 ColumnDesc 里面
BE 上删除了虚拟列计算 cid 的一些特殊逻辑,按照普通列去处理。
---
be/src/common/consts.h | 1 +
be/src/olap/olap_common.h | 9 ++
be/src/olap/rowset/beta_rowset_reader.cpp | 2 +
be/src/olap/rowset/segment_v2/ann_index_reader.cpp | 6 +-
be/src/olap/rowset/segment_v2/ann_index_writer.cpp | 17 ++-
be/src/olap/rowset/segment_v2/column_writer.cpp | 18 ++-
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 154 ++++++++++-----------
be/src/olap/rowset/segment_v2/segment_iterator.h | 3 +
be/src/olap/schema.h | 27 ++--
be/src/olap/tablet_schema.cpp | 19 ++-
be/src/olap/tablet_schema.h | 3 +
be/src/pipeline/exec/operator.h | 6 +-
be/src/pipeline/pipeline_fragment_context.cpp | 2 +-
be/src/runtime/descriptors.cpp | 5 +-
be/src/util/doris_metrics.cpp | 3 +
be/src/util/doris_metrics.h | 2 +
be/src/vec/exec/scan/olap_scanner.cpp | 52 +++----
be/src/vec/exec/scan/scanner_context.cpp | 4 -
be/src/vec/exprs/ann_topn_runtime.cpp | 11 +-
be/src/vec/exprs/virtual_slot_ref.cpp | 3 +-
.../olap/vector_search/ann_index_writer_test.cpp | 0
.../olap/vector_search/faiss_vector_index_test.cpp | 8 +-
.../apache/doris/alter/SchemaChangeHandler.java | 3 +
.../trees/plans/commands/info/BuildIndexOp.java | 5 +
.../trees/plans/commands/info/CreateIndexOp.java | 25 ++++
.../org/apache/doris/planner/OlapScanNode.java | 25 ++++
.../ann_index_p0/insert_with_invalid_array.out | Bin 0 -> 119 bytes
.../create_ann_index_test.groovy | 0
.../create_tbl_with_ann_index_test.groovy | 0
.../ann_index_p0/insert_with_invalid_array.groovy | 50 +++++++
30 files changed, 297 insertions(+), 166 deletions(-)
diff --git a/be/src/common/consts.h b/be/src/common/consts.h
index 548d5a771a2..920f8b8eba5 100644
--- a/be/src/common/consts.h
+++ b/be/src/common/consts.h
@@ -31,6 +31,7 @@ const std::string GLOBAL_ROWID_COL =
"__DORIS_GLOBAL_ROWID_COL__";
const std::string ROW_STORE_COL = "__DORIS_ROW_STORE_COL__";
const std::string DYNAMIC_COLUMN_NAME = "__DORIS_DYNAMIC_COL__";
const std::string PARTIAL_UPDATE_AUTO_INC_COL =
"__PARTIAL_UPDATE_AUTO_INC_COLUMN__";
+const std::string VIRTUAL_COLUMN_PREFIX = "__DORIS_VIRTUAL_COL__";
/// The maximum precision representable by a 4-byte decimal (Decimal4Value)
constexpr int MAX_DECIMAL32_PRECISION = 9;
diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h
index bf8149ace60..777b34d4bf7 100644
--- a/be/src/olap/olap_common.h
+++ b/be/src/olap/olap_common.h
@@ -375,6 +375,12 @@ struct OlapReaderStatistics {
int64_t inverted_index_downgrade_count = 0;
InvertedIndexStatistics inverted_index_stats;
+ int64_t ann_index_downgrade_count = 0;
+ int64_t ann_index_load_ns = 0;
+ int64_t ann_index_query_ns = 0;
+ int64_t rows_ann_index_topn_filtered = 0;
+ int64_t rows_ann_index_range_filtered = 0;
+
int64_t output_index_result_column_timer = 0;
// number of segment filtered by column stat when creating seg iterator
int64_t filtered_segment_number = 0;
@@ -411,6 +417,9 @@ struct OlapReaderStatistics {
int64_t segment_create_column_readers_timer_ns = 0;
int64_t segment_load_index_timer_ns = 0;
+
+ int64_t segment_load_ann_index_timer_ns = 0;
+ int64_t segment_query_ann_index_timer_ns = 0;
};
using ColumnId = uint32_t;
diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp
b/be/src/olap/rowset/beta_rowset_reader.cpp
index 8c48572227e..4265a9b1255 100644
--- a/be/src/olap/rowset/beta_rowset_reader.cpp
+++ b/be/src/olap/rowset/beta_rowset_reader.cpp
@@ -141,6 +141,8 @@ Status
BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context
}
}
VLOG_NOTICE << "read columns size: " << read_columns.size();
+ LOG_INFO("Tablet columns {}, read columns size: {}",
+ _read_context->tablet_schema->num_columns(), read_columns.size());
_input_schema =
std::make_shared<Schema>(_read_context->tablet_schema->columns(), read_columns);
if (_read_context->predicates != nullptr) {
_read_options.column_predicates.insert(_read_options.column_predicates.end(),
diff --git a/be/src/olap/rowset/segment_v2/ann_index_reader.cpp
b/be/src/olap/rowset/segment_v2/ann_index_reader.cpp
index b08528b2b78..d83c4fc23c9 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index_reader.cpp
@@ -28,6 +28,7 @@
#include "olap/rowset/segment_v2/inverted_index_compound_reader.h"
#include "runtime/runtime_state.h"
#include "util/once.h"
+#include "util/runtime_profile.h"
#include "vector/faiss_vector_index.h"
#include "vector/vector_index.h"
@@ -76,8 +77,11 @@ Status AnnIndexReader::load_index(io::IOContext* io_ctx) {
compound_dir.error().to_string());
}
_vector_index = std::make_unique<FaissVectorIndex>();
+ {
+ // SCOPED_TIMER()
+ RETURN_IF_ERROR(_vector_index->load(compound_dir->get()));
+ }
- RETURN_IF_ERROR(_vector_index->load(compound_dir->get()));
return Status::OK();
});
}
diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
index 1995b94b395..d4853a01ac7 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
@@ -85,25 +85,24 @@ void AnnIndexColumnWriter::close_on_error() {}
Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void*
value_ptr,
const uint8_t* null_map, const
uint8_t* offsets_ptr,
- size_t count) {
+ size_t num_rows) {
// TODO: Performance optimization
- if (count == 0) {
+ if (num_rows == 0) {
return Status::OK();
}
const auto* offsets = reinterpret_cast<const size_t*>(offsets_ptr);
const size_t dim = _vector_index->get_dimension();
- for (int i = 1; i < count; ++i) {
- auto array_elem_size = offsets[i] - offsets[i - 1];
+ for (size_t i = 0; i < num_rows; ++i) {
+ auto array_elem_size = offsets[i + 1] - offsets[i];
if (array_elem_size != dim) {
- return Status::InvalidArgument(
- "Ann index only support array with {} dimension, but get
{}", dim,
- array_elem_size);
+ return Status::InvalidArgument("Ann index expect array with {}
dim, got {}.", dim,
+ array_elem_size);
}
}
const float* p = reinterpret_cast<const float*>(value_ptr);
- RETURN_IF_ERROR(_vector_index->add(count, p));
+ RETURN_IF_ERROR(_vector_index->add(num_rows, p));
return Status::OK();
}
@@ -122,7 +121,7 @@ Status AnnIndexColumnWriter::add_array_nulls(const uint8_t*
null_map, size_t row
}
int64_t AnnIndexColumnWriter::size() const {
- return 0; // TODO: 获取倒排索引的内存大小
+ return 0;
}
Status AnnIndexColumnWriter::finish() {
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp
b/be/src/olap/rowset/segment_v2/column_writer.cpp
index 2220924ffb4..f32ace99834 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -561,6 +561,12 @@ Status
ScalarColumnWriter::_internal_append_data_in_current_page(const uint8_t*
return Status::OK();
}
+Status ScalarColumnWriter::append_data_in_current_page(const uint8_t** data,
size_t* num_written) {
+ RETURN_IF_ERROR(append_data_in_current_page(*data, num_written));
+ *data += get_field()->size() * (*num_written);
+ return Status::OK();
+}
+
uint64_t ScalarColumnWriter::estimate_buffer_size() {
uint64_t size = _data_size;
size += _page_builder->size();
@@ -929,11 +935,15 @@ Status ArrayColumnWriter::append_data(const uint8_t**
ptr, size_t num_rows) {
const uint8_t* offsets_ptr = (const uint8_t*)offset_data;
auto data = *(data_ptr + 2);
auto nested_null_map = *(data_ptr + 3);
- LOG_INFO("ArrayColumnWriter, element_cnt{}", element_cnt);
+ LOG_INFO(
+ "ArrayColumnWriter, element_cnt: {}, num_rows {},
need_inverted_index {}, "
+ "need_ann_index {}",
+ element_cnt, num_rows, _opts.need_inverted_index,
_opts.need_ann_index);
if (element_cnt > 0) {
RETURN_IF_ERROR(_item_writer->append(reinterpret_cast<const
uint8_t*>(nested_null_map),
reinterpret_cast<const
void*>(data), element_cnt));
}
+
if (_opts.need_inverted_index) {
auto* writer = dynamic_cast<ScalarColumnWriter*>(_item_writer.get());
// Only support scalar as nested type
@@ -964,12 +974,6 @@ Status ArrayColumnWriter::append_data(const uint8_t** ptr,
size_t num_rows) {
return Status::OK();
}
-Status ScalarColumnWriter::append_data_in_current_page(const uint8_t** data,
size_t* num_written) {
- RETURN_IF_ERROR(append_data_in_current_page(*data, num_written));
- *data += get_field()->size() * (*num_written);
- return Status::OK();
-}
-
uint64_t ArrayColumnWriter::estimate_buffer_size() {
return _offset_writer->estimate_buffer_size() +
(is_nullable() ? _null_writer->estimate_buffer_size() : 0) +
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index dc9187eda33..ca191769af0 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -673,22 +673,21 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
pre_size, rows_of_semgnet);
return Status::OK();
}
- const size_t dst_col_idx = _ann_topn_runtime->get_dest_column_idx();
vectorized::IColumn::MutablePtr result_column;
std::unique_ptr<std::vector<uint64_t>> result_row_ids;
RETURN_IF_ERROR(_ann_topn_runtime->evaluate_vector_ann_search(ann_index_iterator,
_row_bitmap,
result_column, result_row_ids));
- // TODO: 处理 nullable
LOG_INFO("Ann topn filtered {} - {} = {} rows", pre_size,
_row_bitmap.cardinality(),
pre_size - _row_bitmap.cardinality());
- ColumnIterator* column_iter =
- _column_iterators[_schema->num_columns() + dst_col_idx -
_schema->num_column_ids()]
- .get();
+ _opts.stats->rows_ann_index_topn_filtered += (pre_size -
_row_bitmap.cardinality());
+ const size_t dst_col_idx = _ann_topn_runtime->get_dest_column_idx();
+ ColumnIterator* column_iter =
_column_iterators[_schema->column_id(dst_col_idx)].get();
DCHECK(column_iter != nullptr);
VirtualColumnIterator* virtual_column_iter =
dynamic_cast<VirtualColumnIterator*>(column_iter);
DCHECK(virtual_column_iter != nullptr);
LOG_INFO("Virtual column iterator, column_idx {}, is materialized with {}
rows", dst_col_idx,
result_row_ids->size());
+ // reference count of result_column should be 1, so move will not issue
any data copy.
virtual_column_iter->prepare_materialization(std::move(result_column),
std::move(result_row_ids));
@@ -958,19 +957,11 @@ Status SegmentIterator::_apply_index_expr() {
}
// Apply ann range search
- std::vector<ColumnId> idx_to_cids;
- idx_to_cids.resize(_schema->num_columns() + _virtual_column_exprs.size());
- for (int i = 0; i < _schema->num_columns(); ++i) {
- idx_to_cids[i] = _schema->column_id(i);
- }
-
- for (const auto pair : _vir_cid_to_idx_in_block) {
- idx_to_cids[pair.second] = pair.first;
- }
-
for (const auto& expr_ctx : _common_expr_ctxs_push_down) {
- RETURN_IF_ERROR(expr_ctx->evaluate_ann_range_search(_index_iterators,
idx_to_cids,
+ size_t origin_rows = _row_bitmap.cardinality();
+ RETURN_IF_ERROR(expr_ctx->evaluate_ann_range_search(_index_iterators,
_schema->column_ids(),
_column_iterators,
_row_bitmap));
+ _opts.stats->rows_ann_index_range_filtered += (origin_rows -
_row_bitmap.cardinality());
}
for (auto it = _common_expr_ctxs_push_down.begin(); it !=
_common_expr_ctxs_push_down.end();) {
@@ -1214,6 +1205,13 @@ Status SegmentIterator::_init_return_column_iterators() {
continue;
}
+ if
(_schema->column(cid)->name().starts_with(BeConsts::VIRTUAL_COLUMN_PREFIX)) {
+ LOG_INFO("Virtual column iterator for column {}, cid: {}",
_schema->column(cid)->name(),
+ cid);
+ _column_iterators[cid] = std::make_unique<VirtualColumnIterator>();
+ continue;
+ }
+
std::set<ColumnId> del_cond_id_set;
_opts.delete_condition_predicates->get_all_column_ids(del_cond_id_set);
std::vector<bool> tmp_is_pred_column;
@@ -1243,12 +1241,17 @@ Status SegmentIterator::_init_return_column_iterators()
{
}
}
+#ifndef NDEBUG
for (auto pair : _vir_cid_to_idx_in_block) {
ColumnId vir_col_cid = pair.first;
- LOG_INFO("create virtual column iterator for column id: {}",
vir_col_cid);
- _column_iterators[vir_col_cid] =
std::make_unique<VirtualColumnIterator>();
- }
-
+ DCHECK(_column_iterators[vir_col_cid] != nullptr)
+ << "Virtual column iterator for " << vir_col_cid << " should
not be null";
+ ColumnIterator* column_iter = _column_iterators[vir_col_cid].get();
+ DCHECK(dynamic_cast<VirtualColumnIterator*>(column_iter) != nullptr)
+ << "Virtual column iterator for " << vir_col_cid
+ << " should be VirtualColumnIterator";
+ }
+#endif
return Status::OK();
}
@@ -1907,19 +1910,7 @@ Status SegmentIterator::_read_columns_by_index(uint32_t
nrows_read_limit, uint32
(_block_rowids[nrows_read - 1] - _block_rowids[0] ==
nrows_read - 1);
LOG_INFO("nrows_read from range iterator: {}, is_continus {}", nrows_read,
is_continuous);
- std::vector<ColumnId> predicate_column_ids_and_virtual_columns;
-
predicate_column_ids_and_virtual_columns.reserve(_cols_read_by_column_predicate.size()
+
-
_virtual_column_exprs.size());
-
predicate_column_ids_and_virtual_columns.insert(predicate_column_ids_and_virtual_columns.end(),
-
_cols_read_by_column_predicate.begin(),
-
_cols_read_by_column_predicate.end());
-
- for (const auto& entry : _virtual_column_exprs) {
- // virtual column id is not in _predicate_column_ids
- predicate_column_ids_and_virtual_columns.push_back(entry.first);
- }
-
- for (auto cid : predicate_column_ids_and_virtual_columns) {
+ for (auto cid : _cols_read_by_column_predicate) {
auto& column = _current_return_columns[cid];
if (!_virtual_column_exprs.contains(cid)) {
if (_no_need_read_key_data(cid, column, nrows_read)) {
@@ -1930,7 +1921,6 @@ Status SegmentIterator::_read_columns_by_index(uint32_t
nrows_read_limit, uint32
LOG_INFO("Column {} is pruned. No need to read data.", cid);
continue;
}
-
DBUG_EXECUTE_IF("segment_iterator._read_columns_by_index", {
auto col_name = _opts.tablet_schema->column(cid).name();
auto debug_col_name =
@@ -1946,7 +1936,7 @@ Status SegmentIterator::_read_columns_by_index(uint32_t
nrows_read_limit, uint32
}
})
}
-
+ LOG_INFO("Read column {}, nrows_read: {}", cid, nrows_read);
if (is_continuous) {
size_t rows_read = nrows_read;
_opts.stats->predicate_column_read_seek_num += 1;
@@ -2183,7 +2173,6 @@ Status SegmentIterator::next_batch(vectorized::Block*
block) {
for (const auto& pair : _vir_cid_to_idx_in_block) {
size_t idx = pair.second;
auto type = idx_to_datatype.find(idx)->second;
-
block->replace_by_position(idx, type->create_column());
}
return res;
@@ -2285,7 +2274,7 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
if (_lazy_materialization_read || _opts.record_rowids ||
_is_need_expr_eval) {
_block_rowids.resize(_opts.block_row_max);
}
- _current_return_columns.resize(_schema->columns().size() +
_virtual_column_exprs.size());
+ _current_return_columns.resize(_schema->columns().size());
_converted_column_ids.resize(_schema->columns().size(), 0);
if (_char_type_idx.empty() && _char_type_idx_no_0.empty()) {
_is_char_type.resize(_schema->columns().size(), false);
@@ -2363,23 +2352,6 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
_current_batch_rows_read = 0;
RETURN_IF_ERROR(_read_columns_by_index(nrows_read_limit,
_current_batch_rows_read));
- // 把从索引物化得到的虚拟列放到 block 中
- for (const auto pair : _vir_cid_to_idx_in_block) {
- ColumnId cid = pair.first;
- size_t position = pair.second;
- block->replace_by_position(position,
std::move(_current_return_columns[cid]));
- bool is_nothing = false;
- if (vectorized::check_and_get_column<vectorized::ColumnNothing>(
- block->get_by_position(position).column.get())) {
- is_nothing = true;
- }
-
- LOG_INFO(
- "SegmentIterator next block replace virtual column, cid {},
position {}, still "
- "nothing {}",
- cid, position, is_nothing);
- }
-
if (std::find(_cols_read_by_column_predicate.begin(),
_cols_read_by_column_predicate.end(),
_schema->version_col_idx()) !=
_cols_read_by_column_predicate.end()) {
_replace_version_col(_current_batch_rows_read);
@@ -2391,7 +2363,7 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
if (_current_batch_rows_read == 0) {
// Convert all columns in _current_return_columns to schema column
RETURN_IF_ERROR(_convert_to_expected_type(_schema->column_ids()));
- for (int i = 0; i < block->columns() -
_vir_cid_to_idx_in_block.size(); i++) {
+ for (int i = 0; i < block->columns(); i++) {
auto cid = _schema->column_id(i);
// todo(wb) abstract make column where
if (!_is_pred_column[cid]) {
@@ -2399,12 +2371,6 @@ Status
SegmentIterator::_next_batch_internal(vectorized::Block* block) {
}
}
- for (auto& pair : _vir_cid_to_idx_in_block) {
- auto cid = pair.first;
- auto loc = pair.second;
- block->replace_by_position(loc,
std::move(_current_return_columns[cid]));
- }
-
block->clear_column_data();
// clear and release iterators memory footprint in advance
_clear_iterators();
@@ -2907,28 +2873,30 @@ bool SegmentIterator::_can_opt_topn_reads() {
}
void SegmentIterator::_init_virtual_columns(vectorized::Block* block) {
- const size_t num_virtual_columns = _virtual_column_exprs.size();
- if (block->columns() < _schema->num_column_ids() + num_virtual_columns) {
- std::vector<size_t> vir_col_idx;
- for (const auto& pair : _vir_cid_to_idx_in_block) {
- vir_col_idx.push_back(pair.second);
- }
- std::sort(vir_col_idx.begin(), vir_col_idx.end());
- for (size_t i = 0; i < num_virtual_columns; ++i) {
- auto iter = _opts.vir_col_idx_to_type.find(vir_col_idx[i]);
- DCHECK(iter != _opts.vir_col_idx_to_type.end());
- // Name of virtual currently is not used, so we just use a dummy
name.
- block->insert({vectorized::ColumnNothing::create(0), iter->second,
- fmt::format("VIRTUAL_COLUMN_{}", i)});
- }
- } else {
- // Before get next batch. make sure all virtual columns has type
ColumnNothing.
- std::vector<size_t> vir_col_pos;
- for (const auto& pair : _vir_cid_to_idx_in_block) {
- vir_col_pos.push_back(pair.second);
- block->replace_by_position(pair.second,
vectorized::ColumnNothing::create(0));
- }
- }
+ // const size_t num_virtual_columns = _virtual_column_exprs.size();
+ // if (block->columns() < _schema->num_column_ids() + num_virtual_columns)
{
+ // std::vector<size_t> vir_col_idx;
+ // for (const auto& pair : _vir_cid_to_idx_in_block) {
+ // vir_col_idx.push_back(pair.second);
+ // }
+ // std::sort(vir_col_idx.begin(), vir_col_idx.end());
+ // for (size_t i = 0; i < num_virtual_columns; ++i) {
+ // auto iter = _opts.vir_col_idx_to_type.find(vir_col_idx[i]);
+ // DCHECK(iter != _opts.vir_col_idx_to_type.end());
+ // // Name of virtual currently is not used, so we just use a
dummy name.
+ // block->insert({vectorized::ColumnNothing::create(0),
iter->second,
+ // fmt::format("VIRTUAL_COLUMN_{}", i)});
+ // }
+ // } else {
+ // Before getting the next batch, make sure all virtual columns have type ColumnNothing.
+ for (const auto& pair : _vir_cid_to_idx_in_block) {
+ auto& col_with_type_and_name = block->get_by_position(pair.second);
+ col_with_type_and_name.column = vectorized::ColumnNothing::create(0);
+ col_with_type_and_name.type = _opts.vir_col_idx_to_type[pair.second];
+ LOG_INFO("Virtual column is reset, cid {}, idx_in_block {}, type {}",
pair.first,
+ pair.second, col_with_type_and_name.type->get_name());
+ }
+ // }
}
Status SegmentIterator::_materialization_of_virtual_column(vectorized::Block*
block) {
@@ -2937,6 +2905,26 @@ Status
SegmentIterator::_materialization_of_virtual_column(vectorized::Block* bl
auto cid = cid_and_expr.first;
auto column_expr = cid_and_expr.second;
size_t idx_in_block = _vir_cid_to_idx_in_block[cid];
+ if (block->columns() <= idx_in_block) {
+ LOG_ERROR("Block columns: {}, virtual column idx {}",
block->columns(), idx_in_block);
+ return Status::InternalError(
+ "Virtual column index {} is out of range, block columns
{}, "
+ "virtual columns size {}, virtual column expr {}",
+ idx_in_block, block->columns(),
_vir_cid_to_idx_in_block.size(),
+ column_expr->root()->debug_string());
+
+ } else if (block->get_by_position(idx_in_block).column.get() ==
nullptr) {
+ LOG_ERROR(
+ "Virtual column idx {} is null, block columns {}, virtual
columns size {}, "
+ "virtual column expr {}",
+ idx_in_block, block->columns(),
_vir_cid_to_idx_in_block.size(),
+ column_expr->root()->debug_string());
+ return Status::InternalError(
+ "Virtual column index {} is null, block columns {},
virtual columns size {}, "
+ "virtual column expr {}",
+ idx_in_block, block->columns(),
_vir_cid_to_idx_in_block.size(),
+ column_expr->root()->debug_string());
+ }
if (vectorized::check_and_get_column<const vectorized::ColumnNothing>(
block->get_by_position(idx_in_block).column.get())) {
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index a638605d18a..31bfb1f75f2 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -378,6 +378,9 @@ private:
// storage type schema related to _schema, since column in segment may be
different with type in _schema
std::vector<vectorized::IndexFieldNameAndTypePair> _storage_name_and_type;
// vector idx -> column iterarator
+ // Should not use a vector here:
+ // if a tablet has 10000 columns, the size of _column_iterators will be 10000,
+ // even though only 10 columns are involved in the query.
std::vector<std::unique_ptr<ColumnIterator>> _column_iterators;
std::vector<std::unique_ptr<BitmapIndexIterator>> _bitmap_index_iterators;
std::vector<std::unique_ptr<IndexIterator>> _index_iterators;
diff --git a/be/src/olap/schema.h b/be/src/olap/schema.h
index 8cd775ada66..2ad43d52081 100644
--- a/be/src/olap/schema.h
+++ b/be/src/olap/schema.h
@@ -85,7 +85,7 @@ public:
}
// All the columns of one table may exist in the columns param, but
col_ids is only a subset.
- // arg 1 columns is the columns of the table
+ // arg 1 columns is the columns of the tablet
// arg 2 col_ids is the columns to read
Schema(const std::vector<TabletColumnPtr>& columns, const
std::vector<ColumnId>& col_ids) {
size_t num_key_columns = 0;
@@ -109,18 +109,6 @@ public:
_init(columns, col_ids, num_key_columns);
}
- // Only for UT
- Schema(const std::vector<TabletColumnPtr>& columns, size_t
num_key_columns) {
- std::vector<ColumnId> col_ids(columns.size());
- _unique_ids.resize(columns.size());
- for (uint32_t cid = 0; cid < columns.size(); ++cid) {
- col_ids[cid] = cid;
- _unique_ids[cid] = columns[cid]->unique_id();
- }
-
- _init(columns, col_ids, num_key_columns);
- }
-
Schema(const std::vector<const Field*>& cols, size_t num_key_columns) {
std::vector<ColumnId> col_ids(cols.size());
_unique_ids.resize(cols.size());
@@ -138,6 +126,19 @@ public:
_init(cols, col_ids, num_key_columns);
}
+#ifdef BE_TEST
+ Schema(const std::vector<TabletColumnPtr>& columns, size_t
num_key_columns) {
+ std::vector<ColumnId> col_ids(columns.size());
+ _unique_ids.resize(columns.size());
+ for (uint32_t cid = 0; cid < columns.size(); ++cid) {
+ col_ids[cid] = cid;
+ _unique_ids[cid] = columns[cid]->unique_id();
+ }
+
+ _init(columns, col_ids, num_key_columns);
+ }
+#endif
+
Schema(const Schema&);
Schema& operator=(const Schema& other);
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index 1169939f6af..4a4f3e68129 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -45,6 +45,7 @@
#include "tablet_meta.h"
#include "vec/aggregate_functions/aggregate_function_simple_factory.h"
#include "vec/aggregate_functions/aggregate_function_state_union.h"
+#include "vec/columns/column_nothing.h"
#include "vec/common/hex.h"
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
@@ -910,6 +911,8 @@ void TabletSchema::append_column(TabletColumn column,
ColumnType col_type) {
_version_col_idx = _num_columns;
} else if (UNLIKELY(column.name() == SKIP_BITMAP_COL)) {
_skip_bitmap_col_idx = _num_columns;
+ } else if
(UNLIKELY(column.name().starts_with(BeConsts::VIRTUAL_COLUMN_PREFIX))) {
+ _vir_col_idx_to_unique_id[_num_columns] = column.unique_id();
}
_field_uniqueid_to_index[column.unique_id()] = _num_columns;
_cols.push_back(std::make_shared<TabletColumn>(std::move(column)));
@@ -1545,13 +1548,21 @@ vectorized::Block TabletSchema::create_block(
const std::unordered_set<uint32_t>* tablet_columns_need_convert_null)
const {
vectorized::Block block;
for (int i = 0; i < return_columns.size(); ++i) {
- const auto& col = *_cols[return_columns[i]];
+ const ColumnId cid = return_columns[i];
+ const auto& col = *_cols[cid];
bool is_nullable = (tablet_columns_need_convert_null != nullptr &&
-
tablet_columns_need_convert_null->find(return_columns[i]) !=
+ tablet_columns_need_convert_null->find(cid) !=
tablet_columns_need_convert_null->end());
auto data_type =
vectorized::DataTypeFactory::instance().create_data_type(col, is_nullable);
- auto column = data_type->create_column();
- block.insert({std::move(column), data_type, col.name()});
+ if (_vir_col_idx_to_unique_id.contains(cid)) {
+ block.insert({vectorized::ColumnNothing::create(0), data_type,
col.name()});
+ LOG_INFO(
+ "Create block from tablet schema, column cid {} is virtual
column, col_name: "
+ "{}, col_unique_id: {}, type {}",
+ cid, col.name(), col.unique_id(), data_type->get_name());
+ } else {
+ block.insert({data_type->create_column(), data_type, col.name()});
+ }
}
return block;
}
diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h
index b2b2653bd31..6ae2f7cfa81 100644
--- a/be/src/olap/tablet_schema.h
+++ b/be/src/olap/tablet_schema.h
@@ -24,6 +24,7 @@
#include <parallel_hashmap/phmap.h>
#include <algorithm>
+#include <cstdint>
#include <map>
#include <memory>
#include <string>
@@ -650,6 +651,8 @@ private:
// ATTN: For compability reason empty cids means all columns of tablet
schema are encoded to row column
std::vector<int32_t> _row_store_column_unique_ids;
bool _enable_variant_flatten_nested = false;
+
+ std::map<size_t, int32_t> _vir_col_idx_to_unique_id;
};
bool operator==(const TabletSchema& a, const TabletSchema& b);
diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h
index 559226d21a6..cac6fffde79 100644
--- a/be/src/pipeline/exec/operator.h
+++ b/be/src/pipeline/exec/operator.h
@@ -792,10 +792,10 @@ public:
_tuple_ids(tnode.row_tuples),
_row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples),
_resource_profile(tnode.resource_profile),
- _limit(tnode.limit) {
+ _limit(tnode.limit) {
if (tnode.__isset.output_tuple_id) {
- LOG_INFO("Operator {}, node_id {}, output_tuple_id {}",
- this->_op_name, tnode.node_id, tnode.output_tuple_id);
+ LOG_INFO("Operator {}, node_id {}, output_tuple_id {}",
this->_op_name, tnode.node_id,
+ tnode.output_tuple_id);
_output_row_descriptor.reset(new RowDescriptor(descs,
{tnode.output_tuple_id}, {true}));
}
if (!tnode.intermediate_output_tuple_id_list.empty()) {
diff --git a/be/src/pipeline/pipeline_fragment_context.cpp
b/be/src/pipeline/pipeline_fragment_context.cpp
index c31f5ddde78..727fc486f12 100644
--- a/be/src/pipeline/pipeline_fragment_context.cpp
+++ b/be/src/pipeline/pipeline_fragment_context.cpp
@@ -110,6 +110,7 @@
#include "runtime/thread_context.h"
#include "runtime_filter/runtime_filter_mgr.h"
#include "service/backend_options.h"
+#include "thrift/protocol/TDebugProtocol.h"
#include "util/container_util.hpp"
#include "util/debug_util.h"
#include "util/uid_util.h"
@@ -117,7 +118,6 @@
#include "vec/common/sort/topn_sorter.h"
#include "vec/runtime/vdata_stream_mgr.h"
#include "vec/spill/spill_stream.h"
-#include "thrift/protocol/TDebugProtocol.h"
namespace doris::pipeline {
#include "common/compile_check_begin.h"
diff --git a/be/src/runtime/descriptors.cpp b/be/src/runtime/descriptors.cpp
index 4470b7492b5..9fbbc9e343b 100644
--- a/be/src/runtime/descriptors.cpp
+++ b/be/src/runtime/descriptors.cpp
@@ -163,8 +163,9 @@ PrimitiveType SlotDescriptor::col_type() const {
std::string SlotDescriptor::debug_string() const {
const bool is_virtual = this->get_virtual_column_expr() != nullptr;
return fmt::format(
- "SlotDescriptor(id={}, type={}, col_pos={}, col_name={},
nullable={}, is_virtual={})",
- _id, _type->get_name(), _col_pos, _col_name, is_nullable(),
is_virtual);
+ "SlotDescriptor(id={}, type={}, col_name={}, col_unique_id={}, "
+ "is_virtual={})",
+ _id, _type->get_name(), _col_name, _col_unique_id, is_virtual);
}
TableDescriptor::TableDescriptor(const TTableDescriptor& tdesc)
diff --git a/be/src/util/doris_metrics.cpp b/be/src/util/doris_metrics.cpp
index edc375c49c4..3d5a8880008 100644
--- a/be/src/util/doris_metrics.cpp
+++ b/be/src/util/doris_metrics.cpp
@@ -237,6 +237,9 @@
DEFINE_GAUGE_CORE_METRIC_PROTOTYPE_2ARG(runtime_filter_consumer_timeout_num, Met
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(get_remote_tablet_slow_time_ms,
MetricUnit::MILLISECONDS);
DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(get_remote_tablet_slow_cnt,
MetricUnit::NOUNIT);
+DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(ann_index_load_costs_ms,
MetricUnit::MILLISECONDS);
+DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(ann_index_load_cnt, MetricUnit::NOUNIT);
+
const std::string DorisMetrics::_s_registry_name = "doris_be";
const std::string DorisMetrics::_s_hook_name = "doris_metrics";
diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h
index 1e2dd26e5a6..da065ae113e 100644
--- a/be/src/util/doris_metrics.h
+++ b/be/src/util/doris_metrics.h
@@ -246,6 +246,8 @@ public:
IntCounter* scanner_ctx_cnt = nullptr;
IntCounter* scanner_cnt = nullptr;
IntCounter* scanner_task_cnt = nullptr;
+ IntCounter* ann_index_load_costs_ms = nullptr;
+ IntCounter* ann_index_load_cnt = nullptr;
IntGauge* runtime_filter_consumer_num = nullptr;
IntGauge* runtime_filter_consumer_ready_num = nullptr;
diff --git a/be/src/vec/exec/scan/olap_scanner.cpp
b/be/src/vec/exec/scan/olap_scanner.cpp
index dc077197539..328b8a7026f 100644
--- a/be/src/vec/exec/scan/olap_scanner.cpp
+++ b/be/src/vec/exec/scan/olap_scanner.cpp
@@ -22,14 +22,13 @@
#include <gen_cpp/Types_types.h>
#include <glog/logging.h>
#include <stdlib.h>
+#include <thrift/protocol/TDebugProtocol.h>
#include <algorithm>
-#include <array>
#include <atomic>
#include <iterator>
#include <ostream>
#include <set>
-#include <shared_mutex>
#include "cloud/cloud_storage_engine.h"
#include "cloud/cloud_tablet_hotspot.h"
@@ -45,14 +44,9 @@
#include "olap/inverted_index_profile.h"
#include "olap/olap_common.h"
#include "olap/olap_tuple.h"
-#include "olap/rowset/rowset.h"
-#include "olap/rowset/rowset_meta.h"
#include "olap/schema_cache.h"
#include "olap/storage_engine.h"
-#include "olap/tablet_manager.h"
-#include "olap/tablet_meta.h"
#include "olap/tablet_schema.h"
-#include "olap/tablet_schema_cache.h"
#include "pipeline/exec/olap_scan_operator.h"
#include "runtime/descriptors.h"
#include "runtime/exec_env.h"
@@ -60,7 +54,6 @@
#include "service/backend_options.h"
#include "util/doris_metrics.h"
#include "util/runtime_profile.h"
-#include "vec/columns/column_nothing.h"
#include "vec/common/schema_util.h"
#include "vec/core/block.h"
#include "vec/exec/scan/scan_node.h"
@@ -172,7 +165,7 @@ Status OlapScanner::init() {
TOlapScanNode& olap_scan_node = local_state->olap_scan_node();
if (olap_scan_node.__isset.schema_version &&
olap_scan_node.__isset.columns_desc &&
!olap_scan_node.columns_desc.empty() &&
- olap_scan_node.columns_desc[0].col_unique_id >= 0 &&
+ olap_scan_node.columns_desc[0].col_unique_id >= 0 && // Why check
first column?
tablet->tablet_schema()->num_variant_columns() == 0) {
schema_key =
SchemaCache::get_schema_key(tablet->tablet_id(),
olap_scan_node.columns_desc,
@@ -192,6 +185,7 @@ Status OlapScanner::init() {
// so we have to use schema from a query plan witch FE puts
it in query plans.
tablet_schema->clear_columns();
for (const auto& column_desc : olap_scan_node.columns_desc) {
+ LOG_INFO("Column desc\n{}",
apache::thrift::ThriftDebugString(column_desc));
tablet_schema->append_column(TabletColumn(column_desc));
}
if (olap_scan_node.__isset.schema_version) {
@@ -490,7 +484,7 @@ Status OlapScanner::_init_variant_columns() {
}
Status OlapScanner::_init_return_columns() {
- size_t virtual_column_index = 0;
+#ifndef NDEBUG
std::vector<std::string> debug_strings;
for (const auto* slot : _output_tuple_desc->slots()) {
debug_strings.push_back(slot->debug_string());
@@ -498,6 +492,7 @@ Status OlapScanner::_init_return_columns() {
LOG_INFO("OlapScanner init return columns, output tuple slots:\n{}",
fmt::join(debug_strings, ",\n"));
+#endif
for (auto* slot : _output_tuple_desc->slots()) {
if (!slot->is_materialized()) {
@@ -507,25 +502,7 @@ Status OlapScanner::_init_return_columns() {
// variant column using path to index a column
int32_t index = 0;
auto& tablet_schema = _tablet_reader_params.tablet_schema;
- if (slot->get_virtual_column_expr()) {
- // 如果这个列是一个虚拟列,那么给它一个特殊的 cid
- // 这个 cid 是在 tablet_schema 中不存在的
- size_t virtual_column_cid = tablet_schema->num_columns() +
virtual_column_index;
-
- // 这两个 map 都会向下传递到 segment iterator
- _virtual_column_exprs[virtual_column_cid] =
_slot_id_to_virtual_column_expr[slot->id()];
- size_t idx_in_block = _slot_id_to_index_in_block[slot->id()];
- _vir_cid_to_idx_in_block[virtual_column_cid] = idx_in_block;
- _vir_col_idx_to_type[idx_in_block] =
_slot_id_to_col_type[slot->id()];
-
- virtual_column_index++;
-
- LOG_INFO("Add virtual column, slot id: {}, cid {}, column index:
{}, type: {}",
- slot->id(), virtual_column_cid,
_vir_cid_to_idx_in_block[virtual_column_cid],
- _vir_col_idx_to_type[idx_in_block]->get_name());
- // Virtual column is not included in columns in read-schema.
- continue;
- } else if (slot->type()->get_primitive_type() ==
PrimitiveType::TYPE_VARIANT) {
+ if (slot->type()->get_primitive_type() == PrimitiveType::TYPE_VARIANT)
{
index = tablet_schema->field_index(PathInData(
tablet_schema->column_by_uid(slot->col_unique_id()).name_lower_case(),
slot->column_paths()));
@@ -539,7 +516,22 @@ Status OlapScanner::_init_return_columns() {
"field name is invalid. field={}, field_name_to_index={},
col_unique_id={}",
slot->col_name(), tablet_schema->get_all_field_names(),
slot->col_unique_id());
}
- // _return_columns 中只保留 normal_columns
+
+ if (slot->get_virtual_column_expr()) {
+ ColumnId virtual_column_cid = index;
+ _virtual_column_exprs[virtual_column_cid] =
_slot_id_to_virtual_column_expr[slot->id()];
+ size_t idx_in_block = _slot_id_to_index_in_block[slot->id()];
+ _vir_cid_to_idx_in_block[virtual_column_cid] = idx_in_block;
+ _vir_col_idx_to_type[idx_in_block] =
_slot_id_to_col_type[slot->id()];
+
+ LOG_INFO("Virtual column, slot id: {}, cid {}, column index: {},
type: {}", slot->id(),
+ virtual_column_cid,
_vir_cid_to_idx_in_block[virtual_column_cid],
+ _vir_col_idx_to_type[idx_in_block]->get_name());
+ }
+
+ // _return_columns will contain:
+ // 1. normal columns.
+ // 2. __DORIS_GLOBAL_ROWID_COL__ column.
_return_columns.push_back(index);
if (slot->is_nullable() &&
!tablet_schema->column(index).is_nullable()) {
_tablet_columns_convert_to_null_set.emplace(index);
diff --git a/be/src/vec/exec/scan/scanner_context.cpp
b/be/src/vec/exec/scan/scanner_context.cpp
index 5c4a3ca1f81..8f370d350bf 100644
--- a/be/src/vec/exec/scan/scanner_context.cpp
+++ b/be/src/vec/exec/scan/scanner_context.cpp
@@ -231,10 +231,6 @@ vectorized::BlockUPtr ScannerContext::get_free_block(bool
force) {
_newly_create_free_blocks_num->update(1);
block = vectorized::Block::create_unique(_output_tuple_desc->slots(),
0,
true /*ignore invalid
slots*/);
- std::vector<size_t> col_sizes;
- for (const auto& c : block->get_columns()) {
- col_sizes.push_back(c->size());
- }
}
return block;
}
diff --git a/be/src/vec/exprs/ann_topn_runtime.cpp
b/be/src/vec/exprs/ann_topn_runtime.cpp
index 8b229f0f859..3a550c0d6f2 100644
--- a/be/src/vec/exprs/ann_topn_runtime.cpp
+++ b/be/src/vec/exprs/ann_topn_runtime.cpp
@@ -20,6 +20,7 @@
#include <cstdint>
#include <memory>
#include <string>
+#include <utility>
#include "common/logging.h"
#include "olap/rowset/segment_v2/ann_index/ann_search_params.h"
@@ -161,14 +162,16 @@ Status
AnnTopNRuntime::evaluate_vector_ann_search(segment_v2::IndexIterator* ann
DCHECK(ann_query_params.distance != nullptr);
DCHECK(ann_query_params.row_ids != nullptr);
- result_column = ColumnFloat64::create();
- ColumnFloat64* result_column_double =
assert_cast<ColumnFloat64*>(result_column.get());
-
size_t num_results = ann_query_params.distance->size();
- result_column_double->resize(num_results);
+ auto result_column_double = ColumnFloat64::create(num_results);
+ auto result_null_map = ColumnUInt8::create(num_results, 0);
+
for (size_t i = 0; i < num_results; ++i) {
result_column_double->get_data()[i] = (*ann_query_params.distance)[i];
}
+
+ result_column =
+ ColumnNullable::create(std::move(result_column_double),
std::move(result_null_map));
row_ids = std::move(ann_query_params.row_ids);
return Status::OK();
}
diff --git a/be/src/vec/exprs/virtual_slot_ref.cpp
b/be/src/vec/exprs/virtual_slot_ref.cpp
index 1fb6b23df4f..1eabc52cf1a 100644
--- a/be/src/vec/exprs/virtual_slot_ref.cpp
+++ b/be/src/vec/exprs/virtual_slot_ref.cpp
@@ -86,7 +86,8 @@ Status VirtualSlotRef::prepare(doris::RuntimeState* state,
const doris::RowDescr
_column_id = desc.get_column_id(_slot_id,
context->force_materialize_slot());
if (_column_id < 0) {
return Status::Error<ErrorCode::INTERNAL_ERROR>(
- "VirtualSlotRef {} has invalid slot id:
{}.\nslot_desc:\n{},\ndesc:\n{},\ndesc_tbl:\n{}",
+ "VirtualSlotRef {} has invalid slot id: "
+ "{}.\nslot_desc:\n{},\ndesc:\n{},\ndesc_tbl:\n{}",
*_column_name, _slot_id, slot_desc->debug_string(),
desc.debug_string(),
state->desc_tbl().debug_string());
}
diff --git a/be/test/olap/vector_search/ann_index_writer_test.cpp
b/be/test/olap/vector_search/ann_index_writer_test.cpp
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/be/test/olap/vector_search/faiss_vector_index_test.cpp
b/be/test/olap/vector_search/faiss_vector_index_test.cpp
index 72bc3aefb74..1526e1800ce 100644
--- a/be/test/olap/vector_search/faiss_vector_index_test.cpp
+++ b/be/test/olap/vector_search/faiss_vector_index_test.cpp
@@ -349,10 +349,10 @@ TEST_F(VectorSearchTest, CompRangeSearch) {
doris_index.get(), native_index.get(), vectors);
std::vector<float> query_vec = vectors.front();
- float radius = 0;
-
- radius = doris::vector_search_utils::get_radius_from_matrix(
- query_vec.data(), params.d, vectors, 0.4f, metric);
+ float radius = 0;
+
+ radius =
doris::vector_search_utils::get_radius_from_matrix(query_vec.data(), params.d,
+
vectors, 0.4f, metric);
HNSWSearchParameters hnsw_params;
hnsw_params.ef_search = 16;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java
b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java
index cd7141682ab..b54570b18dc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java
@@ -2833,6 +2833,9 @@ public class SchemaChangeHandler extends AlterHandler {
}
if (indexDef.isAnnIndex()) {
+ if (olapTable.getKeysType() != KeysType.DUP_KEYS) {
+ throw new AnalysisException("ANN index can only be built on
table with DUP_KEYS");
+ }
AnnIndexPropertiesChecker.checkProperties(indexDef.getProperties());
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
index 36439874760..295093688ae 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
@@ -24,6 +24,7 @@ import org.apache.doris.analysis.IndexDef;
import org.apache.doris.catalog.DatabaseIf;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.Index;
+import org.apache.doris.catalog.KeysType;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Table;
import org.apache.doris.catalog.TableIf;
@@ -126,6 +127,10 @@ public class BuildIndexOp extends AlterTableOp {
+ " is not partitioned, cannot build index with
partitions.");
}
}
+ if (indexDef.getIndexType() == IndexDef.IndexType.ANN
+ && ((OlapTable) table).getKeysType() != KeysType.DUP_KEYS) {
+ throw new AnalysisException("ANN index can only be built on DUP
KEYS tables");
+ }
indexDef.validate();
this.index = existedIdx.clone();
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateIndexOp.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateIndexOp.java
index 7d7ce029d13..f0b4999b3cb 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateIndexOp.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateIndexOp.java
@@ -20,7 +20,14 @@ package org.apache.doris.nereids.trees.plans.commands.info;
import org.apache.doris.alter.AlterOpType;
import org.apache.doris.analysis.AlterTableClause;
import org.apache.doris.analysis.CreateIndexClause;
+import org.apache.doris.analysis.IndexDef;
+import org.apache.doris.catalog.DatabaseIf;
+import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.Index;
+import org.apache.doris.catalog.KeysType;
+import org.apache.doris.catalog.OlapTable;
+import org.apache.doris.catalog.Table;
+import org.apache.doris.catalog.TableIf;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.UserException;
import org.apache.doris.qe.ConnectContext;
@@ -79,6 +86,24 @@ public class CreateIndexOp extends AlterTableOp {
if (tableName != null) {
tableName.analyze(ctx);
}
+ DatabaseIf<Table> db =
Env.getCurrentEnv().getCatalogMgr().getInternalCatalog()
+ .getDb(tableName.getDb()).orElse(null);
+ if (db == null) {
+ throw new AnalysisException("Database[" + tableName.getDb() + "]
is not exist");
+ }
+
+ TableIf table = db.getTable(tableName.getTbl()).orElse(null);
+ if (table == null) {
+ throw new AnalysisException("Table[" + tableName.getTbl() + "] is
not exist");
+ }
+ if (!(table instanceof OlapTable)) {
+ throw new AnalysisException("Only olap table support create
index");
+ }
+ if (indexDef.getIndexType() == IndexDef.IndexType.ANN
+ && ((OlapTable) table).getKeysType() != KeysType.DUP_KEYS) {
+ throw new AnalysisException("ANN index can only be built on DUP
KEYS tables");
+ }
+
indexDef.validate();
index = indexDef.translateToCatalogStyle();
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java
b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java
index 628b9e83a00..843df2f8e24 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/OlapScanNode.java
@@ -1567,6 +1567,16 @@ public class OlapScanNode extends ScanNode {
columnsDesc.add(tColumn);
}
}
+
+ // Add virtual column to ColumnsDesc so that backend could
+ // get correct table_schema.
+ for (SlotDescriptor slot : desc.getSlots()) {
+ if (slot.getVirtualColumn() != null) {
+ TColumn tColumn = slot.getColumn().toThrift();
+ columnsDesc.add(tColumn);
+ }
+ }
+
for (Index index : olapTable.getIndexes()) {
TOlapTableIndex tIndex =
index.toThrift(index.getColumnUniqueIds(olapTable.getBaseSchema()));
indexDesc.add(tIndex);
@@ -1901,6 +1911,21 @@ public class OlapScanNode extends ScanNode {
public void finalizeForNereids() {
computeNumNodes();
computeStatsForNereids();
+ // Update SlotDescriptor before construction of thrift message.
+ int virtual_column_idx = 0;
+ for (SlotDescriptor slot : desc.getSlots()) {
+ if (slot.getVirtualColumn() != null) {
+ virtual_column_idx++;
+ Column column = new Column();
+ // Set the name of virtual column to be unique.
+ column.setName("__DORIS_VIRTUAL_COL__" + virtual_column_idx);
+ // Just make sure the unique id is not conflict with other
columns.
+ column.setUniqueId(Integer.MAX_VALUE - virtual_column_idx);
+ column.setType(slot.getType());
+ column.setIsAllowNull(slot.getIsNullable());
+ slot.setColumn(column);
+ }
+ }
}
private void computeStatsForNereids() {
diff --git a/regression-test/data/ann_index_p0/insert_with_invalid_array.out
b/regression-test/data/ann_index_p0/insert_with_invalid_array.out
new file mode 100644
index 00000000000..194b6b4b553
Binary files /dev/null and
b/regression-test/data/ann_index_p0/insert_with_invalid_array.out differ
diff --git
a/regression-test/suites/ddl_p0/ann_index/create_ann_index_test.groovy
b/regression-test/suites/ann_index_p0/create_ann_index_test.groovy
similarity index 100%
rename from regression-test/suites/ddl_p0/ann_index/create_ann_index_test.groovy
rename to regression-test/suites/ann_index_p0/create_ann_index_test.groovy
diff --git
a/regression-test/suites/ddl_p0/ann_index/create_tbl_with_ann_index_test.groovy
b/regression-test/suites/ann_index_p0/create_tbl_with_ann_index_test.groovy
similarity index 100%
rename from
regression-test/suites/ddl_p0/ann_index/create_tbl_with_ann_index_test.groovy
rename to
regression-test/suites/ann_index_p0/create_tbl_with_ann_index_test.groovy
diff --git
a/regression-test/suites/ann_index_p0/insert_with_invalid_array.groovy
b/regression-test/suites/ann_index_p0/insert_with_invalid_array.groovy
new file mode 100644
index 00000000000..8f8da5192e9
--- /dev/null
+++ b/regression-test/suites/ann_index_p0/insert_with_invalid_array.groovy
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("insert_with_invalid_array") {
+ sql "drop table if exists insert_with_invalid_array"
+ test {
+ sql """
+ CREATE TABLE insert_with_invalid_array (
+ id INT NOT NULL COMMENT "",
+ vec ARRAY<FLOAT> NOT NULL COMMENT "",
+ INDEX ann_idx (vec) USING ANN PROPERTIES(
+ "index_type" = "hnsw",
+ "metric_type" = "l2_distance",
+ "dim" = "3"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id) COMMENT "OLAP"
+ DISTRIBUTED BY HASH(id) BUCKETS AUTO
+ PROPERTIES (
+ "replication_num" = "1"
+ );
+ """
+ }
+
+ sql "insert into insert_with_invalid_array values (1, [1.0, 2.0, 3.0])"
+
+ qt_sql "select * from insert_with_invalid_array order by id"
+
+ // Insert with invalid array
+ test {
+ sql """
+ INSERT INTO insert_with_invalid_array VALUES (1, [1.0])
+ """
+ exception "[INVALID_ARGUMENT]"
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]