This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 4d49ccb1cc8 branch-4.0: [feat](Ann Index)Support create index and
build index for ANN #55586 (#58651)
4d49ccb1cc8 is described below
commit 4d49ccb1cc8c71af49ea6bcb99a8f5b378ba5630
Author: Jack <[email protected]>
AuthorDate: Wed Dec 3 17:42:03 2025 +0800
branch-4.0: [feat](Ann Index)Support create index and build index for ANN
#55586 (#58651)
cherry pick from #55586
---
be/src/olap/rowset/segment_v2/column_writer.cpp | 8 +-
be/src/olap/rowset/segment_v2/column_writer.h | 2 +-
be/src/olap/rowset/segment_v2/index_writer.cpp | 114 ++++---
be/src/olap/rowset/segment_v2/segment_iterator.cpp | 42 ++-
be/src/olap/rowset/segment_v2/segment_iterator.h | 5 +-
be/src/olap/tablet_schema.cpp | 4 +-
be/src/olap/tablet_schema.h | 2 +
be/src/olap/task/index_builder.cpp | 166 ++++++---
be/src/olap/task/index_builder.h | 2 +-
be/test/olap/index_builder_test.cpp | 369 ++++++++++++++++++++-
.../olap/vector_search/ann_index_writer_test.cpp | 2 +
.../apache/doris/alter/SchemaChangeHandler.java | 6 +-
.../main/java/org/apache/doris/catalog/Index.java | 9 +-
.../trees/plans/commands/info/BuildIndexOp.java | 14 +-
.../trees/plans/commands/info/CreateIndexOp.java | 6 -
.../ann_index_p0/build_ann_index_test.groovy | 112 +++++++
.../ann_index_p0/create_ann_index_test.groovy | 23 +-
17 files changed, 731 insertions(+), 155 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp
b/be/src/olap/rowset/segment_v2/column_writer.cpp
index 461a5046e71..2707525e26a 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -919,7 +919,7 @@ Status ArrayColumnWriter::init() {
if (_opts.need_inverted_index) {
auto* writer = dynamic_cast<ScalarColumnWriter*>(_item_writer.get());
if (writer != nullptr) {
- RETURN_IF_ERROR(IndexColumnWriter::create(get_field(),
&_inverted_index_builder,
+ RETURN_IF_ERROR(IndexColumnWriter::create(get_field(),
&_inverted_index_writer,
_opts.index_file_writer,
_opts.inverted_indexes[0]));
}
@@ -937,7 +937,7 @@ Status ArrayColumnWriter::init() {
Status ArrayColumnWriter::write_inverted_index() {
if (_opts.need_inverted_index) {
- return _inverted_index_builder->finish();
+ return _inverted_index_writer->finish();
}
return Status::OK();
}
@@ -969,7 +969,7 @@ Status ArrayColumnWriter::append_data(const uint8_t** ptr,
size_t num_rows) {
// now only support nested type is scala
if (writer != nullptr) {
//NOTE: use array field name as index field, but item_writer size
should be used when moving item_data_ptr
- RETURN_IF_ERROR(_inverted_index_builder->add_array_values(
+ RETURN_IF_ERROR(_inverted_index_writer->add_array_values(
_item_writer->get_field()->size(), reinterpret_cast<const
void*>(data),
reinterpret_cast<const uint8_t*>(nested_null_map),
offsets_ptr, num_rows));
}
@@ -1005,7 +1005,7 @@ Status ArrayColumnWriter::append_nullable(const uint8_t*
null_map, const uint8_t
RETURN_IF_ERROR(append_data(ptr, num_rows));
if (is_nullable()) {
if (_opts.need_inverted_index) {
- RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(null_map,
num_rows));
+ RETURN_IF_ERROR(_inverted_index_writer->add_array_nulls(null_map,
num_rows));
}
RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows));
}
diff --git a/be/src/olap/rowset/segment_v2/column_writer.h
b/be/src/olap/rowset/segment_v2/column_writer.h
index 4f42d6bb750..0315b6bb4af 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.h
+++ b/be/src/olap/rowset/segment_v2/column_writer.h
@@ -426,7 +426,7 @@ private:
std::unique_ptr<OffsetColumnWriter> _offset_writer;
std::unique_ptr<ScalarColumnWriter> _null_writer;
std::unique_ptr<ColumnWriter> _item_writer;
- std::unique_ptr<IndexColumnWriter> _inverted_index_builder;
+ std::unique_ptr<IndexColumnWriter> _inverted_index_writer;
std::unique_ptr<AnnIndexColumnWriter> _ann_index_writer;
ColumnWriterOptions _opts;
};
diff --git a/be/src/olap/rowset/segment_v2/index_writer.cpp
b/be/src/olap/rowset/segment_v2/index_writer.cpp
index d5cf7f11b03..70c51b7be13 100644
--- a/be/src/olap/rowset/segment_v2/index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/index_writer.cpp
@@ -17,6 +17,7 @@
#include "common/exception.h"
#include "olap/field.h"
+#include "olap/rowset/segment_v2/ann_index/ann_index_writer.h"
#include "olap/rowset/segment_v2/inverted_index_writer.h"
namespace doris::segment_v2 {
@@ -40,10 +41,11 @@ bool IndexColumnWriter::check_support_inverted_index(const
TabletColumn& column)
}
bool IndexColumnWriter::check_support_ann_index(const TabletColumn& column) {
- // bellow types are not supported in inverted index for extracted columns
+ // only array columns are supported by the ann index
return column.is_array_type();
}
+// create index writer
Status IndexColumnWriter::create(const Field* field,
std::unique_ptr<IndexColumnWriter>* res,
IndexFileWriter* index_file_writer,
const TabletIndex* index_meta) {
@@ -62,64 +64,78 @@ Status IndexColumnWriter::create(const Field* field,
std::unique_ptr<IndexColumn
field_name = std::to_string(field->unique_id());
}
}
- bool single_field = true;
- if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
- const auto* array_typeinfo = dynamic_cast<const
ArrayTypeInfo*>(typeinfo);
-
DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_array_typeinfo_is_nullptr",
- { array_typeinfo = nullptr; })
- if (array_typeinfo != nullptr) {
- typeinfo = array_typeinfo->item_type_info();
- type = typeinfo->type();
- single_field = false;
- } else {
- return Status::NotSupported("unsupported array type for inverted
index: " +
- std::to_string(int(type)));
+
+ if (index_meta->is_inverted_index()) {
+ bool single_field = true;
+ if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
+ const auto* array_typeinfo = dynamic_cast<const
ArrayTypeInfo*>(typeinfo);
+
DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_array_typeinfo_is_nullptr",
+ { array_typeinfo = nullptr; })
+ if (array_typeinfo != nullptr) {
+ typeinfo = array_typeinfo->item_type_info();
+ type = typeinfo->type();
+ single_field = false;
+ } else {
+ return Status::NotSupported("unsupported array type for
inverted index: " +
+ std::to_string(int(type)));
+ }
}
- }
-
DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_unsupported_type_for_inverted_index",
- { type = FieldType::OLAP_FIELD_TYPE_JSONB; })
- switch (type) {
+
DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_unsupported_type_for_inverted_index",
+ { type = FieldType::OLAP_FIELD_TYPE_JSONB; })
+ switch (type) {
#define M(TYPE)
\
case TYPE:
\
*res = std::make_unique<InvertedIndexColumnWriter<TYPE>>(field_name,
index_file_writer, \
index_meta,
single_field); \
break;
- M(FieldType::OLAP_FIELD_TYPE_TINYINT)
- M(FieldType::OLAP_FIELD_TYPE_SMALLINT)
- M(FieldType::OLAP_FIELD_TYPE_INT)
- M(FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT)
- M(FieldType::OLAP_FIELD_TYPE_BIGINT)
- M(FieldType::OLAP_FIELD_TYPE_LARGEINT)
- M(FieldType::OLAP_FIELD_TYPE_CHAR)
- M(FieldType::OLAP_FIELD_TYPE_VARCHAR)
- M(FieldType::OLAP_FIELD_TYPE_STRING)
- M(FieldType::OLAP_FIELD_TYPE_DATE)
- M(FieldType::OLAP_FIELD_TYPE_DATETIME)
- M(FieldType::OLAP_FIELD_TYPE_DECIMAL)
- M(FieldType::OLAP_FIELD_TYPE_DATEV2)
- M(FieldType::OLAP_FIELD_TYPE_DATETIMEV2)
- M(FieldType::OLAP_FIELD_TYPE_DECIMAL32)
- M(FieldType::OLAP_FIELD_TYPE_DECIMAL64)
- M(FieldType::OLAP_FIELD_TYPE_DECIMAL128I)
- M(FieldType::OLAP_FIELD_TYPE_DECIMAL256)
- M(FieldType::OLAP_FIELD_TYPE_BOOL)
- M(FieldType::OLAP_FIELD_TYPE_IPV4)
- M(FieldType::OLAP_FIELD_TYPE_IPV6)
- M(FieldType::OLAP_FIELD_TYPE_FLOAT)
- M(FieldType::OLAP_FIELD_TYPE_DOUBLE)
+ M(FieldType::OLAP_FIELD_TYPE_TINYINT)
+ M(FieldType::OLAP_FIELD_TYPE_SMALLINT)
+ M(FieldType::OLAP_FIELD_TYPE_INT)
+ M(FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT)
+ M(FieldType::OLAP_FIELD_TYPE_BIGINT)
+ M(FieldType::OLAP_FIELD_TYPE_LARGEINT)
+ M(FieldType::OLAP_FIELD_TYPE_CHAR)
+ M(FieldType::OLAP_FIELD_TYPE_VARCHAR)
+ M(FieldType::OLAP_FIELD_TYPE_STRING)
+ M(FieldType::OLAP_FIELD_TYPE_DATE)
+ M(FieldType::OLAP_FIELD_TYPE_DATETIME)
+ M(FieldType::OLAP_FIELD_TYPE_DECIMAL)
+ M(FieldType::OLAP_FIELD_TYPE_DATEV2)
+ M(FieldType::OLAP_FIELD_TYPE_DATETIMEV2)
+ M(FieldType::OLAP_FIELD_TYPE_DECIMAL32)
+ M(FieldType::OLAP_FIELD_TYPE_DECIMAL64)
+ M(FieldType::OLAP_FIELD_TYPE_DECIMAL128I)
+ M(FieldType::OLAP_FIELD_TYPE_DECIMAL256)
+ M(FieldType::OLAP_FIELD_TYPE_BOOL)
+ M(FieldType::OLAP_FIELD_TYPE_IPV4)
+ M(FieldType::OLAP_FIELD_TYPE_IPV6)
+ M(FieldType::OLAP_FIELD_TYPE_FLOAT)
+ M(FieldType::OLAP_FIELD_TYPE_DOUBLE)
#undef M
- default:
- return Status::NotSupported("unsupported type for inverted index: " +
- std::to_string(int(type)));
- }
- if (*res != nullptr) {
- auto st = (*res)->init();
- if (!st.ok()) {
- (*res)->close_on_error();
- return st;
+ default:
+ return Status::NotSupported("unsupported type for inverted index:
" +
+ std::to_string(int(type)));
+ }
+ if (*res != nullptr) {
+ auto st = (*res)->init();
+ if (!st.ok()) {
+ (*res)->close_on_error();
+ return st;
+ }
+ }
+ } else if (index_meta->is_ann_index()) {
+ DCHECK(type == FieldType::OLAP_FIELD_TYPE_ARRAY);
+ *res = std ::make_unique<AnnIndexColumnWriter>(index_file_writer,
index_meta);
+ if (*res != nullptr) {
+ auto st = (*res)->init();
+ if (!st.ok()) {
+ (*res)->close_on_error();
+ return st;
+ }
}
}
+
return Status::OK();
}
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index d0301c7677f..1dc1101d6df 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -683,6 +683,13 @@ Status
SegmentIterator::_get_row_ranges_by_column_conditions() {
return Status::OK();
}
+bool SegmentIterator::_column_has_ann_index(int32_t cid) {
+ bool has_ann_index = _index_iterators[cid] != nullptr &&
+
_index_iterators[cid]->get_reader(AnnIndexReaderType::ANN);
+
+ return has_ann_index;
+}
+
Status SegmentIterator::_apply_ann_topn_predicate() {
if (_ann_topn_runtime == nullptr) {
return Status::OK();
@@ -692,7 +699,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
size_t src_col_idx = _ann_topn_runtime->get_src_column_idx();
ColumnId src_cid = _schema->column_id(src_col_idx);
IndexIterator* ann_index_iterator = _index_iterators[src_cid].get();
- bool has_ann_index = ann_index_iterator != nullptr;
+ bool has_ann_index = _column_has_ann_index(src_cid);
bool has_common_expr_push_down = !_common_expr_ctxs_push_down.empty();
bool has_column_predicate = std::any_of(_is_pred_column.begin(),
_is_pred_column.end(),
[](bool is_pred) { return is_pred;
});
@@ -1368,6 +1375,13 @@ Status SegmentIterator::_init_bitmap_index_iterators() {
return Status::OK();
}
for (auto cid : _schema->column_ids()) {
+ const auto& col = _opts.tablet_schema->column(cid);
+ int col_uid = col.unique_id() >= 0 ? col.unique_id() :
col.parent_unique_id();
+ // The column is not in this segment
+ if (!_segment->_tablet_schema->has_column_unique_id(col_uid)) {
+ continue;
+ }
+
if (_bitmap_index_iterators[cid] == nullptr) {
RETURN_IF_ERROR(_segment->new_bitmap_index_iterator(
_opts.tablet_schema->column(cid), _opts,
&_bitmap_index_iterators[cid]));
@@ -1430,14 +1444,14 @@ Status SegmentIterator::_init_index_iterators() {
for (auto cid : _schema->column_ids()) {
if (_index_iterators[cid] == nullptr) {
const auto& column = _opts.tablet_schema->column(cid);
- int32_t col_unique_id =
- column.is_extracted_column() ? column.parent_unique_id() :
column.unique_id();
- RETURN_IF_ERROR(_segment->new_index_iterator(
- column,
- _segment->_tablet_schema->ann_index(col_unique_id,
column.suffix_path()), _opts,
- &_index_iterators[cid]));
- if (_index_iterators[cid] != nullptr) {
- _index_iterators[cid]->set_context(_index_query_context);
+ const auto* index_meta =
_segment->_tablet_schema->ann_index(column);
+ if (index_meta) {
+ RETURN_IF_ERROR(_segment->new_index_iterator(column,
index_meta, _opts,
+
&_index_iterators[cid]));
+
+ if (_index_iterators[cid] != nullptr) {
+ _index_iterators[cid]->set_context(_index_query_context);
+ }
}
}
}
@@ -1631,22 +1645,22 @@ Status SegmentIterator::_seek_columns(const
std::vector<ColumnId>& column_ids, r
* This is an estimate, if we want more precise cost, statistics collection
is necessary(this is a todo).
* In short, when returned non-pred columns contains string/hll/bitmap, we
using Lazy Materialization.
* Otherwise, we disable it.
- *
+ *
* When Lazy Materialization enable, we need to read column at least two
times.
* First time to read Pred col, second time to read non-pred.
* Here's an interesting question to research, whether read Pred col once is
the best plan.
* (why not read Pred col twice or more?)
*
* When Lazy Materialization disable, we just need to read once.
- *
- *
+ *
+ *
* 2 Whether the predicate type can be evaluate in a fast way(using SIMD to
eval pred)
* Such as integer type and float type, they can be eval fast.
* But for BloomFilter/string/date, they eval slow.
* If a type can be eval fast, we use vectorization to eval it.
* Otherwise, we use short-circuit to eval it.
- *
- *
+ *
+ *
*/
// todo(wb) need a UT here
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index 7f765a5f4b4..92925a75ba7 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -174,7 +174,6 @@ private:
[[nodiscard]] Status _init_bitmap_index_iterators();
[[nodiscard]] Status _init_index_iterators();
- Status _apply_ann_topn_predicate();
// calculate row ranges that fall into requested key ranges using short
key index
[[nodiscard]] Status _get_row_ranges_by_keys();
[[nodiscard]] Status _prepare_seek(const StorageReadOptions::KeyRange&
key_range);
@@ -192,13 +191,17 @@ private:
// calculate row ranges that satisfy requested column conditions using
various column index
[[nodiscard]] Status _get_row_ranges_by_column_conditions();
[[nodiscard]] Status _get_row_ranges_from_conditions(RowRanges*
condition_row_ranges);
+
[[nodiscard]] Status _apply_bitmap_index();
[[nodiscard]] Status _apply_inverted_index();
[[nodiscard]] Status _apply_inverted_index_on_column_predicate(
ColumnPredicate* pred, std::vector<ColumnPredicate*>&
remaining_predicates,
bool* continue_apply);
+ [[nodiscard]] Status _apply_ann_topn_predicate();
[[nodiscard]] Status _apply_index_expr();
+
bool _column_has_fulltext_index(int32_t cid);
+ bool _column_has_ann_index(int32_t cid);
bool _downgrade_without_index(Status res, bool need_remaining = false);
inline bool _inverted_index_not_support_pred_type(const PredicateType&
type);
bool _is_literal_node(const TExprNodeType::type& node_type);
diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp
index 079cf9278ff..b3331cdfb18 100644
--- a/be/src/olap/tablet_schema.cpp
+++ b/be/src/olap/tablet_schema.cpp
@@ -1544,7 +1544,8 @@ void TabletSchema::update_tablet_columns(const
TabletSchema& tablet_schema,
bool TabletSchema::has_inverted_index_with_index_id(int64_t index_id) const {
for (size_t i = 0; i < _indexes.size(); i++) {
- if (_indexes[i]->index_type() == IndexType::INVERTED &&
+ if ((_indexes[i]->index_type() == IndexType::INVERTED ||
+ _indexes[i]->index_type() == IndexType::ANN) &&
_indexes[i]->index_id() == index_id) {
return true;
}
@@ -1645,7 +1646,6 @@ const TabletIndex* TabletSchema::ann_index(int32_t
col_unique_id,
}
const TabletIndex* TabletSchema::ann_index(const TabletColumn& col) const {
- // Some columns(Float, Double, JSONB ...) from the variant do not support
inverted index
if (!segment_v2::IndexColumnWriter::check_support_ann_index(col)) {
return nullptr;
}
diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h
index b839231464b..429e102c9a2 100644
--- a/be/src/olap/tablet_schema.h
+++ b/be/src/olap/tablet_schema.h
@@ -341,6 +341,8 @@ public:
bool is_inverted_index() const { return _index_type ==
IndexType::INVERTED; }
+ bool is_ann_index() const { return _index_type == IndexType::ANN; }
+
void remove_parser_and_analyzer() {
_properties.erase(INVERTED_INDEX_PARSER_KEY);
_properties.erase(INVERTED_INDEX_PARSER_KEY_ALIAS);
diff --git a/be/src/olap/task/index_builder.cpp
b/be/src/olap/task/index_builder.cpp
index 9dd418f4ace..ecfbd2e007b 100644
--- a/be/src/olap/task/index_builder.cpp
+++ b/be/src/olap/task/index_builder.cpp
@@ -19,6 +19,7 @@
#include <mutex>
+#include "common/logging.h"
#include "common/status.h"
#include "olap/olap_define.h"
#include "olap/rowset/beta_rowset.h"
@@ -50,7 +51,7 @@ IndexBuilder::IndexBuilder(StorageEngine& engine,
TabletSharedPtr tablet,
IndexBuilder::~IndexBuilder() {
_olap_data_convertor.reset();
- _inverted_index_builders.clear();
+ _index_column_writers.clear();
}
Status IndexBuilder::init() {
@@ -113,12 +114,9 @@ Status IndexBuilder::update_inverted_index_info() {
}
}
auto column = output_rs_tablet_schema->column(column_idx);
+
+ // inverted index
auto index_metas =
output_rs_tablet_schema->inverted_indexs(column);
- if (index_metas.empty()) {
- LOG(ERROR) << "failed to find column: " << column_name
- << " index_id: " << t_inverted_index.index_id;
- continue;
- }
for (const auto& index_meta : index_metas) {
if
(output_rs_tablet_schema->get_inverted_index_storage_format() ==
InvertedIndexStorageFormatPB::V1) {
@@ -143,7 +141,20 @@ Status IndexBuilder::update_inverted_index_info() {
// remove dropped index_meta from output rowset tablet
schema
output_rs_tablet_schema->remove_index(index_meta->index_id());
}
+
+ // ann index
+ const auto* ann_index =
output_rs_tablet_schema->ann_index(column);
+ if (!ann_index) {
+ continue;
+ }
+
DCHECK(output_rs_tablet_schema->get_inverted_index_storage_format() !=
+ InvertedIndexStorageFormatPB::V1);
+ _dropped_inverted_indexes.push_back(*ann_index);
+ // ATTN: DO NOT REMOVE INDEX AFTER OUTPUT_ROWSET_WRITER
CREATED.
+ // remove dropped index_meta from output rowset tablet schema
+ output_rs_tablet_schema->remove_index(ann_index->index_id());
}
+
DBUG_EXECUTE_IF("index_builder.update_inverted_index_info.drop_index", {
auto indexes_count =
DebugPoints::instance()->get_debug_param_or_default<int32_t>(
"index_builder.update_inverted_index_info.drop_index",
"indexes_count", 0);
@@ -172,6 +183,8 @@ Status IndexBuilder::update_inverted_index_info() {
continue;
}
const TabletColumn& col =
output_rs_tablet_schema->column_by_uid(column_uid);
+
+ // inverted index
auto exist_indexs =
output_rs_tablet_schema->inverted_indexs(col);
for (const auto& exist_index : exist_indexs) {
if (exist_index->index_id() != index.index_id()) {
@@ -188,6 +201,21 @@ Status IndexBuilder::update_inverted_index_info() {
}
}
}
+
+ // ann index
+ const auto* exist_index =
output_rs_tablet_schema->ann_index(col);
+ if (exist_index && exist_index->index_id() !=
index.index_id()) {
+ if (exist_index->is_same_except_id(&index)) {
+ LOG(WARNING) << fmt::format(
+ "column: {} has a exist ann index, but the
index id not "
+ "equal request's index id, , exist index id:
{}, request's index "
+ "id: {}, remove exist index in new
output_rs_tablet_schema",
+ column_uid, exist_index->index_id(),
index.index_id());
+ without_index_uids.insert(exist_index->index_id());
+
output_rs_tablet_schema->remove_index(exist_index->index_id());
+ }
+ }
+
output_rs_tablet_schema->append_index(std::move(index));
}
}
@@ -357,7 +385,7 @@ Status
IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
LOG(INFO) << "all row nums. source_rows=" <<
output_rowset_meta->num_rows();
return Status::OK();
} else {
- // create inverted index writer
+ // create inverted or ann index writer
const auto& fs = output_rowset_meta->fs();
auto output_rowset_schema = output_rowset_meta->tablet_schema();
size_t inverted_index_size = 0;
@@ -402,8 +430,10 @@ Status
IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
fs, index_path_prefix,
output_rowset_meta->rowset_id().to_string(),
seg_ptr->id(),
output_rowset_schema->get_inverted_index_storage_format());
}
- // create inverted index writer
+ // create inverted index writer, or ann index writer
for (auto inverted_index : _alter_inverted_indexes) {
+ DCHECK(inverted_index.index_type == TIndexType::INVERTED ||
+ inverted_index.index_type == TIndexType::ANN);
DCHECK_EQ(inverted_index.columns.size(), 1);
auto index_id = inverted_index.index_id;
auto column_name = inverted_index.columns[0];
@@ -425,44 +455,79 @@ Status
IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
// variant column is not support for building index
auto is_support_inverted_index =
IndexColumnWriter::check_support_inverted_index(column);
+ auto is_support_ann_index =
IndexColumnWriter::check_support_ann_index(column);
DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_support_inverted_index",
{ is_support_inverted_index = false; })
- if (!is_support_inverted_index) {
+ if (!is_support_inverted_index && !is_support_ann_index) {
continue;
}
DCHECK(output_rowset_schema->has_inverted_index_with_index_id(index_id));
_olap_data_convertor->add_column_data_convertor(column);
return_columns.emplace_back(column_idx);
std::unique_ptr<Field> field(FieldFactory::create(column));
- auto index_metas =
output_rowset_schema->inverted_indexs(column);
- for (const auto& index_meta : index_metas) {
- if (index_meta->index_id() != index_id) {
- continue;
- }
- std::unique_ptr<segment_v2::IndexColumnWriter>
inverted_index_builder;
- try {
- RETURN_IF_ERROR(segment_v2::IndexColumnWriter::create(
- field.get(), &inverted_index_builder,
index_file_writer.get(),
- index_meta));
- DBUG_EXECUTE_IF(
-
"IndexBuilder::handle_single_rowset_index_column_writer_create_"
- "error",
- {
- _CLTHROWA(CL_ERR_IO,
- "debug point: "
-
"handle_single_rowset_index_column_writer_create_"
- "error");
- })
- } catch (const std::exception& e) {
- return
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
- "CLuceneError occured: {}", e.what());
+
+ if (inverted_index.index_type == TIndexType::INVERTED) {
+ // inverted index
+ auto index_metas =
output_rowset_schema->inverted_indexs(column);
+ for (const auto& index_meta : index_metas) {
+ if (index_meta->index_id() != index_id) {
+ continue;
+ }
+ std::unique_ptr<segment_v2::IndexColumnWriter>
inverted_index_builder;
+ try {
+
RETURN_IF_ERROR(segment_v2::IndexColumnWriter::create(
+ field.get(), &inverted_index_builder,
index_file_writer.get(),
+ index_meta));
+ DBUG_EXECUTE_IF(
+
"IndexBuilder::handle_single_rowset_index_column_writer_create_"
+ "error",
+ {
+ _CLTHROWA(CL_ERR_IO,
+ "debug point: "
+
"handle_single_rowset_index_column_writer_create_"
+ "error");
+ })
+ } catch (const std::exception& e) {
+ return
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
+ "CLuceneError occured: {}", e.what());
+ }
+
+ if (inverted_index_builder) {
+ auto writer_sign = std::make_pair(seg_ptr->id(),
index_id);
+ _index_column_writers.insert(
+ std::make_pair(writer_sign,
std::move(inverted_index_builder)));
+
inverted_index_writer_signs.emplace_back(writer_sign);
+ }
}
+ } else if (inverted_index.index_type == TIndexType::ANN) {
+ // ann index
+ const auto* index_meta =
output_rowset_schema->ann_index(column);
+ if (index_meta && index_meta->index_id() == index_id) {
+ std::unique_ptr<segment_v2::IndexColumnWriter>
index_writer;
+ try {
+
RETURN_IF_ERROR(segment_v2::IndexColumnWriter::create(
+ field.get(), &index_writer,
index_file_writer.get(),
+ index_meta));
+ DBUG_EXECUTE_IF(
+
"IndexBuilder::handle_single_rowset_index_column_writer_create_"
+ "error",
+ {
+ _CLTHROWA(CL_ERR_IO,
+ "debug point: "
+
"handle_single_rowset_index_column_writer_create_"
+ "error");
+ })
+ } catch (const std::exception& e) {
+ return
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
+ "CLuceneError occured: {}", e.what());
+ }
- if (inverted_index_builder) {
- auto writer_sign = std::make_pair(seg_ptr->id(),
index_id);
- _inverted_index_builders.insert(
- std::make_pair(writer_sign,
std::move(inverted_index_builder)));
- inverted_index_writer_signs.emplace_back(writer_sign);
+ if (index_writer) {
+ auto writer_sign = std::make_pair(seg_ptr->id(),
index_id);
+ _index_column_writers.insert(
+ std::make_pair(writer_sign,
std::move(index_writer)));
+
inverted_index_writer_signs.emplace_back(writer_sign);
+ }
}
}
}
@@ -510,7 +575,7 @@ Status
IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
return status;
}
- // write inverted index data
+ // write inverted index data, or ann index data
status = _write_inverted_index_data(output_rowset_schema,
iter->data_id(),
block.get());
DBUG_EXECUTE_IF(
@@ -529,8 +594,8 @@ Status
IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
// finish write inverted index, flush data to compound file
for (auto& writer_sign : inverted_index_writer_signs) {
try {
- if (_inverted_index_builders[writer_sign]) {
-
RETURN_IF_ERROR(_inverted_index_builders[writer_sign]->finish());
+ if (_index_column_writers[writer_sign]) {
+
RETURN_IF_ERROR(_index_column_writers[writer_sign]->finish());
}
DBUG_EXECUTE_IF("IndexBuilder::handle_single_rowset_index_build_finish_error", {
_CLTHROWA(CL_ERR_IO,
@@ -556,7 +621,7 @@ Status
IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
}
inverted_index_size +=
index_file_writer->get_index_file_total_size();
}
- _inverted_index_builders.clear();
+ _index_column_writers.clear();
_index_file_writers.clear();
output_rowset_meta->set_data_disk_size(output_rowset_meta->data_disk_size());
output_rowset_meta->set_total_disk_size(output_rowset_meta->total_disk_size() +
@@ -571,7 +636,7 @@ Status
IndexBuilder::handle_single_rowset(RowsetMetaSharedPtr output_rowset_meta
Status IndexBuilder::_write_inverted_index_data(TabletSchemaSPtr
tablet_schema, int64_t segment_idx,
vectorized::Block* block) {
- VLOG_DEBUG << "begin to write inverted index";
+ VLOG_DEBUG << "begin to write inverted/ann index";
// converter block data
_olap_data_convertor->set_source_content(block, 0, block->rows());
for (auto i = 0; i < _alter_inverted_indexes.size(); ++i) {
@@ -634,14 +699,14 @@ Status IndexBuilder::_add_nullable(const std::string&
column_name,
try {
auto data = *(data_ptr + 2);
auto nested_null_map = *(data_ptr + 3);
-
RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_array_values(
+
RETURN_IF_ERROR(_index_column_writers[index_writer_sign]->add_array_values(
field->get_sub_field(0)->size(), reinterpret_cast<const
void*>(data),
reinterpret_cast<const uint8_t*>(nested_null_map),
offsets_ptr, num_rows));
DBUG_EXECUTE_IF("IndexBuilder::_add_nullable_add_array_values_error", {
_CLTHROWA(CL_ERR_IO, "debug point:
_add_nullable_add_array_values_error");
})
-
RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_array_nulls(null_map,
-
num_rows));
+ RETURN_IF_ERROR(
+
_index_column_writers[index_writer_sign]->add_array_nulls(null_map, num_rows));
} catch (const std::exception& e) {
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"CLuceneError occured: {}", e.what());
@@ -665,11 +730,11 @@ Status IndexBuilder::_add_nullable(const std::string&
column_name,
do {
auto step = next_run_step();
if (null_map[offset]) {
-
RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_nulls(
+
RETURN_IF_ERROR(_index_column_writers[index_writer_sign]->add_nulls(
static_cast<uint32_t>(step)));
} else {
-
RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_values(
- column_name, *ptr, step));
+
RETURN_IF_ERROR(_index_column_writers[index_writer_sign]->add_values(column_name,
+
*ptr, step));
}
*ptr += field->size() * step;
offset += step;
@@ -699,13 +764,13 @@ Status IndexBuilder::_add_data(const std::string&
column_name,
if (element_cnt > 0) {
auto data = *(data_ptr + 2);
auto nested_null_map = *(data_ptr + 3);
-
RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_array_values(
+
RETURN_IF_ERROR(_index_column_writers[index_writer_sign]->add_array_values(
field->get_sub_field(0)->size(),
reinterpret_cast<const void*>(data),
reinterpret_cast<const uint8_t*>(nested_null_map),
offsets_ptr, num_rows));
}
} else {
-
RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_values(
- column_name, *ptr, num_rows));
+
RETURN_IF_ERROR(_index_column_writers[index_writer_sign]->add_values(column_name,
*ptr,
+
num_rows));
}
DBUG_EXECUTE_IF("IndexBuilder::_add_data_throw_exception",
{ _CLTHROWA(CL_ERR_IO, "debug point:
_add_data_throw_exception"); })
@@ -785,6 +850,7 @@ Status IndexBuilder::do_build_inverted_index() {
_tablet->tablet_id());
}
+ DCHECK(!_alter_index_ids.empty());
_input_rowsets =
_tablet->pick_candidate_rowsets_to_build_inverted_index(_alter_index_ids,
_is_drop_op);
if (_input_rowsets.empty()) {
diff --git a/be/src/olap/task/index_builder.h b/be/src/olap/task/index_builder.h
index 478e6557b93..d87d88c5e76 100644
--- a/be/src/olap/task/index_builder.h
+++ b/be/src/olap/task/index_builder.h
@@ -83,7 +83,7 @@ private:
std::unique_ptr<vectorized::OlapBlockDataConvertor> _olap_data_convertor;
// "<segment_id, index_id>" -> IndexColumnWriter
std::unordered_map<std::pair<int64_t, int64_t>,
std::unique_ptr<segment_v2::IndexColumnWriter>>
- _inverted_index_builders;
+ _index_column_writers;
std::unordered_map<int64_t, std::unique_ptr<IndexFileWriter>>
_index_file_writers;
// <rowset_id, segment_id>
std::unordered_map<std::pair<std::string, int64_t>,
std::unique_ptr<IndexFileReader>>
diff --git a/be/test/olap/index_builder_test.cpp
b/be/test/olap/index_builder_test.cpp
index afe3a4b8e4d..b0af40ff1f0 100644
--- a/be/test/olap/index_builder_test.cpp
+++ b/be/test/olap/index_builder_test.cpp
@@ -110,6 +110,31 @@ protected:
tablet_schema->append_column(column_2);
}
+ TabletSchemaSPtr create_ann_tablet_schema() {
+ TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
+ TabletSchemaPB tablet_schema_pb;
+ tablet_schema_pb.set_keys_type(DUP_KEYS);
+ tablet_schema->init_from_pb(tablet_schema_pb);
+ // Set basic properties of TabletSchema directly
+ tablet_schema->_inverted_index_storage_format =
InvertedIndexStorageFormatPB::V2;
+
+ TabletColumn array_column;
+ array_column.set_name("arr1");
+ array_column.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY);
+ array_column.set_unique_id(1);
+ array_column.set_length(0);
+ array_column.set_index_length(0);
+ array_column.set_is_nullable(false);
+
+ TabletColumn child_column;
+ child_column.set_name("arr_sub_float");
+ child_column.set_type(FieldType::OLAP_FIELD_TYPE_FLOAT);
+ child_column.set_length(INT_MAX);
+ array_column.add_sub_column(child_column);
+ tablet_schema->append_column(array_column);
+ return tablet_schema;
+ }
+
TabletMetaSharedPtr create_tablet_meta() {
TabletMetaPB tablet_meta_pb;
tablet_meta_pb.set_table_id(1);
@@ -177,7 +202,7 @@ TEST_F(IndexBuilderTest, BasicBuildTest) {
EXPECT_EQ(builder._alter_index_ids.size(), 1);
}
-TEST_F(IndexBuilderTest, DropIndexTest) {
+TEST_F(IndexBuilderTest, DropInvertedIndexTest) {
// 0. prepare tablet path
auto tablet_path = _absolute_dir + "/" + std::to_string(15676);
_tablet->_tablet_path = tablet_path;
@@ -253,6 +278,7 @@ TEST_F(IndexBuilderTest, DropIndexTest) {
// 7. Prepare index for dropping
TOlapTableIndex drop_index;
+ drop_index.index_type = TIndexType::INVERTED;
drop_index.index_id = 1;
drop_index.columns.emplace_back("k1");
_alter_indexes.push_back(drop_index);
@@ -312,7 +338,168 @@ TEST_F(IndexBuilderTest, DropIndexTest) {
//EXPECT_FALSE(tablet_schema->has_inverted_index_with_index_id(1));
}
-TEST_F(IndexBuilderTest, BuildIndexAfterWritingDataTest) {
+TEST_F(IndexBuilderTest, DropAnnIndexTest) { // Writes a rowset whose schema carries an ANN index on arr1, then drops that index through IndexBuilder and checks the files on disk.
+ // Prepare a clean tablet directory (15676)
+ auto tablet_path = _absolute_dir + "/" + std::to_string(15676);
+ _tablet->_tablet_path = tablet_path;
+
ASSERT_TRUE(io::global_local_filesystem()->delete_directory(tablet_path).ok());
+
ASSERT_TRUE(io::global_local_filesystem()->create_directory(tablet_path).ok());
+
+ RowsetSharedPtr rowset;
+
+ // Create test ANN index properties (dim=4 matches the 4-element arrays written below)
+ std::map<std::string, std::string> properties;
+ properties["index_type"] = "hnsw";
+ properties["metric_type"] = "l2_distance";
+ properties["dim"] = "4";
+ properties["max_degree"] = "16";
+
+ // First add an initial index to the schema (for arr1 column)
+ TabletIndex initial_index;
+ initial_index._index_id = 1;
+ initial_index._index_name = "arr1_index";
+ initial_index._index_type = IndexType::ANN;
+ initial_index._col_unique_ids.push_back(1); // unique_id for arr1
+ initial_index._properties = properties;
+
+ _tablet_schema = create_ann_tablet_schema();
+ _tablet_schema->append_index(std::move(initial_index));
+
+ // Create a rowset writer context
+ RowsetWriterContext writer_context;
+ writer_context.rowset_id.init(15676);
+ writer_context.tablet_id = 15676;
+ writer_context.tablet_schema_hash = 567997577;
+ writer_context.partition_id = 10;
+ writer_context.rowset_type = BETA_ROWSET;
+ writer_context.tablet_path = tablet_path;
+ writer_context.rowset_state = VISIBLE;
+ writer_context.tablet_schema = _tablet_schema;
+ writer_context.version.first = 10;
+ writer_context.version.second = 10;
+
+
ASSERT_TRUE(io::global_local_filesystem()->create_directory(writer_context.tablet_path).ok());
+
+ // Create a rowset writer
+ auto res = RowsetFactory::create_rowset_writer(*_engine_ref,
writer_context, false);
+ ASSERT_TRUE(res.has_value()) << res.error();
+ auto rowset_writer = std::move(res).value();
+
+ // Write data to the rowset
+ {
+ vectorized::DataTypePtr inner_float =
std::make_shared<vectorized::DataTypeFloat32>();
+ vectorized::DataTypePtr array_type =
+ std::make_shared<vectorized::DataTypeArray>(inner_float);
+
+ // create a MutableColumnPtr
+ vectorized::MutableColumnPtr col = array_type->create_column();
+ // row0: [1, 2, 3, 4]
+ {
+ vectorized::Array arr;
+ arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(1.0F));
+ arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(2.0F));
+ arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(3.0F));
+ arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(4.0F));
+ col->insert(vectorized::Field::create_field<TYPE_ARRAY>(arr));
+ }
+ // row1: [5, 6, 7, 8]
+ {
+ vectorized::Array arr;
+ arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(5.0F));
+ arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(6.0F));
+ arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(7.0F));
+ arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(8.0F));
+ col->insert(vectorized::Field::create_field<TYPE_ARRAY>(arr));
+ }
+ // wrap the constructed column into a ColumnWithTypeAndName
+ vectorized::ColumnPtr column_array = std::move(col);
+ vectorized::ColumnWithTypeAndName type_and_name(column_array,
array_type, "arr1");
+
+ // construct Block (containing only this column), with 2 rows
+ vectorized::Block block;
+ block.insert(type_and_name);
+
+ // Add the block to the rowset
+ Status s = rowset_writer->add_block(&block);
+ ASSERT_TRUE(s.ok()) << s.to_string();
+
+ // Flush the writer
+ s = rowset_writer->flush();
+ ASSERT_TRUE(s.ok()) << s.to_string();
+
+ // Build the rowset
+ ASSERT_TRUE(rowset_writer->build(rowset).ok());
+
+ // Add the rowset to the tablet
+ ASSERT_TRUE(_tablet->add_rowset(rowset).ok());
+ }
+
+ // Verify index exists before dropping
+ EXPECT_TRUE(_tablet_schema->has_ann_index());
+ EXPECT_TRUE(_tablet_schema->has_inverted_index_with_index_id(1)); // NOTE(review): inverted-index-named lookup applied to an ANN index; presumably it matches on index id only - confirm
+
+ // Prepare index for dropping (same id/name/column as the index appended to the schema above)
+ TOlapTableIndex drop_index;
+ drop_index.index_type = TIndexType::type::ANN;
+ drop_index.index_id = 1;
+ drop_index.index_name = "arr1_index";
+ drop_index.columns.emplace_back("arr1");
+ _alter_indexes.clear();
+ _alter_indexes.push_back(drop_index);
+
+ // Create IndexBuilder with drop operation
+ IndexBuilder builder(ExecEnv::GetInstance()->storage_engine().to_local(),
_tablet, _columns,
+ _alter_indexes, true);
+
+ // Initialize and verify
+ auto status = builder.init();
+ EXPECT_TRUE(status.ok()) << status.to_string();
+ EXPECT_EQ(builder._alter_index_ids.size(), 1);
+
+ // Execute drop operation
+ status = builder.do_build_inverted_index();
+ EXPECT_TRUE(status.ok()) << status.to_string();
+
+ // Verify the result on disk
+ // The tablet directory itself must still exist after the drop
+ bool exists = false;
+ EXPECT_TRUE(io::global_local_filesystem()->exists(tablet_path,
&exists).ok());
+ EXPECT_TRUE(exists);
+
+ // Count .idx/.dat files in the tablet directory: "15676_0.*" are the pre-drop files, the long-id "..._0.*" names are the post-drop ones
+ std::vector<io::FileInfo> files;
+ bool dir_exists = false;
+ EXPECT_TRUE(io::global_local_filesystem()->list(tablet_path, true, &files,
&dir_exists).ok());
+ EXPECT_TRUE(dir_exists);
+ int new_idx_file_count = 0;
+ int new_dat_file_count = 0;
+ int old_idx_file_count = 0;
+ int old_dat_file_count = 0;
+ for (const auto& file : files) {
+ std::string filename = file.file_name;
+ if (filename.find("15676_0.idx") != std::string::npos) {
+ old_idx_file_count++;
+ }
+ if (filename.find("15676_0.dat") != std::string::npos) {
+ old_dat_file_count++;
+ }
+ if
(filename.find("020000000000000100000000000000000000000000000000_0.idx") !=
+ std::string::npos) {
+ new_idx_file_count++;
+ }
+ if
(filename.find("020000000000000100000000000000000000000000000000_0.dat") !=
+ std::string::npos) {
+ new_dat_file_count++;
+ }
+ }
+ // Pre-drop files are still present; the post-drop output has its .dat file but no .idx file
+ EXPECT_EQ(old_idx_file_count, 1) << "Tablet path should have 1 .idx file
before drop";
+ EXPECT_EQ(old_dat_file_count, 1) << "Tablet path should have 1 .dat file
before drop";
+ EXPECT_EQ(new_idx_file_count, 0) << "Tablet path should have no .idx file
after drop";
+ EXPECT_EQ(new_dat_file_count, 1) << "Tablet path should have 1 .dat file
after drop";
+}
+
+TEST_F(IndexBuilderTest, BuildInvertedIndexAfterWritingDataTest) {
// 0. prepare tablet path
auto tablet_path = _absolute_dir + "/" + std::to_string(14673);
_tablet->_tablet_path = tablet_path;
@@ -459,6 +646,184 @@ TEST_F(IndexBuilderTest, BuildIndexAfterWritingDataTest) {
//EXPECT_TRUE(tablet_schema->has_inverted_index_with_index_id(2));
}
+TEST_F(IndexBuilderTest, BuildAnnIndexAfterWritingDataTest) { // Writes 100 rows with a schema that has no index, then builds an ANN index on arr1 through IndexBuilder and checks the files on disk.
+ // 0. Prepare a clean tablet directory (14686)
+ auto tablet_path = _absolute_dir + "/" + std::to_string(14686);
+
ASSERT_TRUE(io::global_local_filesystem()->delete_directory(tablet_path).ok());
+
ASSERT_TRUE(io::global_local_filesystem()->create_directory(tablet_path).ok());
+
+ // 1. Prepare data for writing
+ RowsetSharedPtr rowset;
+ const int num_rows = 100;
+
+ // 2. Use ANN schema with array<float> column
+ auto ann_schema = create_ann_tablet_schema();
+
+ // 3. Update schema in tablet meta
+ TabletMetaPB tablet_meta_pb;
+ _tablet_meta->to_meta_pb(&tablet_meta_pb, false);
+
+ TabletSchemaPB ann_schema_pb;
+ ann_schema->to_schema_pb(&ann_schema_pb);
+ tablet_meta_pb.mutable_schema()->CopyFrom(ann_schema_pb);
+
+ _tablet_meta->init_from_pb(tablet_meta_pb);
+
+ // 4. Reinitialize tablet to use new schema
+ _tablet = std::make_shared<Tablet>(*_engine_ref, _tablet_meta,
_data_dir.get());
+ _tablet->_tablet_path = tablet_path;
+ ASSERT_TRUE(_tablet->init().ok());
+
+ _tablet_schema = ann_schema;
+
+ // 5. Create a rowset writer context (the rowset is written under 15686, not under the tablet path 14686)
+ RowsetWriterContext writer_context;
+ writer_context.rowset_id.init(15686);
+ writer_context.tablet_id = 15686;
+ writer_context.tablet_schema_hash = 567997577;
+ writer_context.partition_id = 10;
+ writer_context.rowset_type = BETA_ROWSET;
+ writer_context.tablet_path = _absolute_dir + "/" + std::to_string(15686);
+ writer_context.rowset_state = VISIBLE;
+ writer_context.tablet_schema = _tablet_schema;
+ writer_context.version.first = 10;
+ writer_context.version.second = 10;
+
+
ASSERT_TRUE(io::global_local_filesystem()->create_directory(writer_context.tablet_path).ok());
+
+ // 6. Create a rowset writer
+ auto res = RowsetFactory::create_rowset_writer(*_engine_ref,
writer_context, false);
+ ASSERT_TRUE(res.has_value()) << res.error();
+ auto rowset_writer = std::move(res).value();
+
+ // 7. Write data to the rowset with float arrays
+ {
+ vectorized::DataTypePtr inner_float =
std::make_shared<vectorized::DataTypeFloat32>();
+ vectorized::DataTypePtr array_type =
+ std::make_shared<vectorized::DataTypeArray>(inner_float);
+
+ // create a MutableColumnPtr
+ vectorized::MutableColumnPtr col = array_type->create_column();
+
+ // Add data for each row - arrays of 4 floats (matching dim=4 in
properties)
+ for (int i = 0; i < num_rows; ++i) {
+ vectorized::Array arr;
+ // Create 4-dimensional float vectors: row i holds (i, i+1, i+2, i+3), each taken modulo 10
+
arr.push_back(vectorized::Field::create_field<TYPE_FLOAT>(static_cast<float>(i
% 10)));
+ arr.push_back(
+
vectorized::Field::create_field<TYPE_FLOAT>(static_cast<float>((i + 1) % 10)));
+ arr.push_back(
+
vectorized::Field::create_field<TYPE_FLOAT>(static_cast<float>((i + 2) % 10)));
+ arr.push_back(
+
vectorized::Field::create_field<TYPE_FLOAT>(static_cast<float>((i + 3) % 10)));
+ col->insert(vectorized::Field::create_field<TYPE_ARRAY>(arr));
+ }
+
+ // wrap the constructed column into a ColumnWithTypeAndName
+ vectorized::ColumnPtr column_array = std::move(col);
+ vectorized::ColumnWithTypeAndName type_and_name(column_array,
array_type, "arr1");
+
+ // construct Block (containing only this column), with num_rows rows
+ vectorized::Block block;
+ block.insert(type_and_name);
+
+ // Add the block to the rowset
+ Status s = rowset_writer->add_block(&block);
+ ASSERT_TRUE(s.ok()) << s.to_string();
+
+ // Flush the writer
+ s = rowset_writer->flush();
+ ASSERT_TRUE(s.ok()) << s.to_string();
+
+ // Build the rowset
+ ASSERT_TRUE(rowset_writer->build(rowset).ok());
+
+ // Add the rowset to the tablet
+ ASSERT_TRUE(_tablet->add_rowset(rowset).ok());
+ }
+
+ // 8. Prepare ANN index for building
+ std::map<std::string, std::string> properties;
+ properties["index_type"] = "hnsw";
+ properties["metric_type"] = "l2_distance";
+ properties["dim"] = "4";
+ properties["max_degree"] = "16";
+
+ TOlapTableIndex ann_index;
+ ann_index.__set_index_id(1);
+ ann_index.__set_columns({"arr1"});
+ ann_index.__set_index_name("arr1_ann_index");
+ ann_index.__set_index_type(TIndexType::ANN);
+ // NOTE: do not assign the field directly as in the commented-out line below; that does not set the __isset.properties flag
+ // ann_index.properties = properties;
+ ann_index.__set_properties(properties);
+ _alter_indexes.push_back(ann_index);
+
+ // 9. Create IndexBuilder
+ IndexBuilder builder(ExecEnv::GetInstance()->storage_engine().to_local(),
_tablet, _columns,
+ _alter_indexes, false);
+
+ // 10. Initialize and verify
+ auto status = builder.init();
+ EXPECT_TRUE(status.ok()) << status.to_string();
+ EXPECT_EQ(builder._alter_index_ids.size(), 1);
+
+ // 11. Build ANN index
+ status = builder.do_build_inverted_index();
+ EXPECT_TRUE(status.ok()) << status.to_string();
+
+ // 12. Check paths and files ("old" = directory the rowset was written to, "new" = the tablet path)
+ auto old_tablet_path = _absolute_dir + "/" + std::to_string(15686);
+ auto new_tablet_path = _absolute_dir + "/" + std::to_string(14686);
+ bool old_exists = false;
+ bool new_exists = false;
+ EXPECT_TRUE(io::global_local_filesystem()->exists(old_tablet_path,
&old_exists).ok());
+ EXPECT_TRUE(old_exists);
+ EXPECT_TRUE(io::global_local_filesystem()->exists(new_tablet_path,
&new_exists).ok());
+ EXPECT_TRUE(new_exists);
+
+ // Old directory: expect the original .dat file and no index file
+ std::vector<io::FileInfo> old_files;
+ bool old_dir_exists = false;
+ EXPECT_TRUE(io::global_local_filesystem()
+ ->list(old_tablet_path, true, &old_files,
&old_dir_exists)
+ .ok());
+ EXPECT_TRUE(old_dir_exists);
+ int idx_file_count = 0;
+ int dat_file_count = 0;
+ for (const auto& file : old_files) {
+ std::string filename = file.file_name;
+ if (filename.find(".idx") != std::string::npos) {
+ idx_file_count++;
+ }
+ if (filename.find(".dat") != std::string::npos) {
+ dat_file_count++;
+ }
+ }
+ EXPECT_EQ(idx_file_count, 0) << "Old directory should contain exactly 0
.idx file";
+ EXPECT_EQ(dat_file_count, 1) << "Old directory should contain exactly 1
.dat file";
+
+ std::vector<io::FileInfo> new_files; // new directory: expect one .dat file and one .idx file after the build
+ bool new_dir_exists = false;
+ EXPECT_TRUE(io::global_local_filesystem()
+ ->list(new_tablet_path, true, &new_files,
&new_dir_exists)
+ .ok());
+ EXPECT_TRUE(new_dir_exists);
+ int new_idx_file_count = 0;
+ int new_dat_file_count = 0;
+ for (const auto& file : new_files) {
+ std::string filename = file.file_name;
+ if (filename.find(".idx") != std::string::npos) {
+ new_idx_file_count++;
+ }
+ if (filename.find(".dat") != std::string::npos) {
+ new_dat_file_count++;
+ }
+ }
+ EXPECT_EQ(new_idx_file_count, 1) << "New directory should contain exactly
1 .idx files";
+ EXPECT_EQ(new_dat_file_count, 1) << "New directory should contain exactly
1 .dat file";
+}
+
TEST_F(IndexBuilderTest, AddIndexWhenOneExistsTest) {
// 0. prepare tablet path
auto tablet_path = _absolute_dir + "/" + std::to_string(14675);
diff --git a/be/test/olap/vector_search/ann_index_writer_test.cpp
b/be/test/olap/vector_search/ann_index_writer_test.cpp
index ad4b6881c42..6a6af3b761f 100644
--- a/be/test/olap/vector_search/ann_index_writer_test.cpp
+++ b/be/test/olap/vector_search/ann_index_writer_test.cpp
@@ -89,6 +89,7 @@ protected:
// Create tablet index
_tablet_index = std::make_unique<TabletIndex>();
+ _tablet_index->_index_type = IndexType::ANN;
_tablet_index->_properties = _properties;
_tablet_index->_index_id = 1;
_tablet_index->_index_name = "test_ann_index";
@@ -416,6 +417,7 @@ TEST_F(AnnIndexWriterTest, TestInvalidMetricType) {
properties["metric_type"] = "invalid_metric";
auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_index_type = IndexType::ANN;
tablet_index->_properties = properties;
tablet_index->_index_id = 1;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java
b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java
index b8af31e5049..ae6c399c318 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java
@@ -2116,7 +2116,7 @@ public class SchemaChangeHandler extends AlterHandler {
lightSchemaChange = false;
// ngram_bf index can do light_schema_change in both local
and cloud mode
- // inverted index can only do light_schema_change in local
mode
+ // inverted index and ann index can only do
light_schema_change in local mode
if
(index.isLightAddIndexSupported(enableAddIndexForNewData)) {
alterIndexes.add(index);
isDropIndex = false;
@@ -2887,7 +2887,7 @@ public class SchemaChangeHandler extends AlterHandler {
public void addIndexChangeJob(IndexChangeJob indexChangeJob) {
indexChangeJobs.put(indexChangeJob.getJobId(), indexChangeJob);
runnableIndexChangeJob.put(indexChangeJob.getJobId(), indexChangeJob);
- LOG.info("add inverted index job {}", indexChangeJob.getJobId());
+ LOG.info("add inverted/ann index change job {}",
indexChangeJob.getJobId());
}
private void clearFinishedOrCancelledSchemaChangeJobV2() {
@@ -2906,7 +2906,7 @@ public class SchemaChangeHandler extends AlterHandler {
IndexChangeJob indexChangeJob = iterator.next().getValue();
if (indexChangeJob.isExpire()) {
iterator.remove();
- LOG.info("remove expired inverted index job {}. finish at {}",
+ LOG.info("remove expired inverted/ann index change job {}.
finish at {}",
indexChangeJob.getJobId(),
TimeUtils.longToTimeString(indexChangeJob.getFinishedTimeMs()));
}
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
index 44200ea5a59..3c86e5bc454 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Index.java
@@ -18,7 +18,6 @@
package org.apache.doris.catalog;
import org.apache.doris.analysis.IndexDef;
-import org.apache.doris.analysis.IndexDef.IndexType;
import org.apache.doris.analysis.InvertedIndexUtil;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.Config;
@@ -189,12 +188,14 @@ public class Index implements Writable {
// Whether the index can be changed in light mode
public boolean isLightIndexChangeSupported() {
- return indexType == IndexDef.IndexType.INVERTED || indexType ==
IndexType.NGRAM_BF;
+ return indexType == IndexDef.IndexType.INVERTED
+ || indexType == IndexDef.IndexType.NGRAM_BF
+ || indexType == IndexDef.IndexType.ANN;
}
// Whether the index can be added in light mode
// cloud mode supports light add for ngram_bf index and non-tokenized
inverted index (parser="none")
- // local mode supports light add for both inverted index and ngram_bf index
+ // local mode supports light add for inverted index, ann index and
ngram_bf index
// the rest of the index types do not support light add
public boolean isLightAddIndexSupported(boolean enableAddIndexForNewData) {
if (Config.isCloudMode()) {
@@ -206,7 +207,7 @@ public class Index implements Writable {
return false;
}
return (indexType == IndexDef.IndexType.NGRAM_BF &&
enableAddIndexForNewData)
- || (indexType == IndexDef.IndexType.INVERTED);
+ || (indexType == IndexDef.IndexType.INVERTED) || (indexType ==
IndexDef.IndexType.ANN);
}
public String getInvertedIndexCustomAnalyzer() {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
index 8b83f690125..d34c24feb63 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/BuildIndexOp.java
@@ -142,7 +142,15 @@ public class BuildIndexOp extends AlterTableOp {
throw new AnalysisException(indexType + " index is not needed to
build.");
}
- indexDef = new IndexDefinition(indexName, partitionNamesInfo,
indexType);
+ if (indexType == IndexDef.IndexType.ANN) {
+ List<String> columns = existedIdx.getColumns();
+ Map<String, String> properties = existedIdx.getProperties();
+ String comment = existedIdx.getComment();
+ indexDef = new IndexDefinition(indexName, false, columns, "ANN",
properties, comment);
+ } else {
+ indexDef = new IndexDefinition(indexName, partitionNamesInfo,
indexType);
+ }
+
if (!table.isPartitionedTable()) {
List<String> specifiedPartitions = indexDef.getPartitionNames();
if (!specifiedPartitions.isEmpty()) {
@@ -150,10 +158,6 @@ public class BuildIndexOp extends AlterTableOp {
+ " is not partitioned, cannot build index with
partitions.");
}
}
- if (indexDef.getIndexType() == IndexDef.IndexType.ANN) {
- throw new AnalysisException(
- "ANN index can only be created during table creation, not
through BUILD INDEX.");
- }
indexDef.validate();
this.index = existedIdx.clone();
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateIndexOp.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateIndexOp.java
index 332859b536d..7e41b6f5e18 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateIndexOp.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/commands/info/CreateIndexOp.java
@@ -20,7 +20,6 @@ package org.apache.doris.nereids.trees.plans.commands.info;
import org.apache.doris.alter.AlterOpType;
import org.apache.doris.analysis.AlterTableClause;
import org.apache.doris.analysis.CreateIndexClause;
-import org.apache.doris.analysis.IndexDef;
import org.apache.doris.catalog.Index;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.UserException;
@@ -79,11 +78,6 @@ public class CreateIndexOp extends AlterTableOp {
tableName.analyze(ctx);
}
- if (indexDef.getIndexType() == IndexDef.IndexType.ANN) {
- throw new AnalysisException(
- "ANN index can only be created during table creation, not
through CREATE INDEX.");
- }
-
indexDef.validate();
index = indexDef.translateToCatalogStyle();
}
diff --git a/regression-test/suites/ann_index_p0/build_ann_index_test.groovy
b/regression-test/suites/ann_index_p0/build_ann_index_test.groovy
new file mode 100644
index 00000000000..e8de0d3d2d1
--- /dev/null
+++ b/regression-test/suites/ann_index_p0/build_ann_index_test.groovy
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("build_ann_index_test") {
+ if (isCloudMode()) {
+ return // TODO enable this case after enable light index in cloud mode
+ }
+
+ // prepare test table
+ def timeout = 30000
+ def delta_time = 1000
+ def alter_res = "null"
+ def useTime = 0
+
+ def wait_for_latest_op_on_table_finish = { tableName, opTimeout -> // polls the latest ALTER TABLE COLUMN job of tableName every delta_time ms, for at most opTimeout ms; fails if FINISHED is never seen
+ for(int t = delta_time; t <= opTimeout; t += delta_time){
+ alter_res = sql """SHOW ALTER TABLE COLUMN WHERE TableName = "${tableName}" ORDER BY CreateTime DESC LIMIT 1;"""
+ alter_res = alter_res.toString()
+ if(alter_res.contains("FINISHED")) {
+ sleep(3000) // wait change table state to normal
+ logger.info(tableName + " latest alter job finished, detail: " + alter_res)
+ break
+ }
+ useTime = t
+ sleep(delta_time)
+ }
+ assertTrue(alter_res.toString().contains("FINISHED"), "wait_for_latest_op_on_table_finish timeout") // the former `useTime <= opTimeout` check could never fail: useTime only ever holds a loop value t <= opTimeout
+ }
+
+ def wait_for_last_build_index_on_table_finish = { tableName, opTimeout -> // polls SHOW BUILD INDEX every delta_time ms, for at most opTimeout ms; returns "SKIPPED" when no job is listed, otherwise the last job's terminal state
+ for(int t = delta_time; t <= opTimeout; t += delta_time){
+ alter_res = sql """SHOW BUILD INDEX WHERE TableName = "${tableName}" ORDER BY JobId """
+
+ if (alter_res.size() == 0) {
+ logger.info(tableName + " last index job finished")
+ return "SKIPPED"
+ }
+ if (alter_res.size() > 0) {
+ def last_job_state = alter_res[alter_res.size()-1][7]; // last row (highest JobId), column 7 is compared against the job states below
+ if (last_job_state == "FINISHED" || last_job_state == "CANCELLED") {
+ sleep(3000) // wait change table state to normal
+ logger.info(tableName + " last index job finished, state: " + last_job_state + ", detail: " + alter_res)
+ return last_job_state;
+ }
+ }
+ useTime = t
+ sleep(delta_time)
+ }
+ logger.info("wait_for_last_build_index_on_table_finish debug: " + alter_res)
+ assertTrue(false, "wait_for_last_build_index_on_table_finish timeout") // only reached when no terminal state was seen in time; the former `useTime <= opTimeout` check could never fail
+ return "wait_timeout" // not reached once the assertion above fails; kept so the closure's set of return values is unchanged
+ }
+
+ sql "set enable_common_expr_pushdown=true;"
+ sql "drop table if exists table_build_ann_index_test;"
+ def tableName = "table_build_ann_index_test"
+
+ // case 1: create table -- insert data -- create index -- build index
+ sql """
+ CREATE TABLE `table_build_ann_index_test` (
+ `id` int NOT NULL COMMENT "",
+ `embedding` array<float> NOT NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`) COMMENT "OLAP"
+ DISTRIBUTED BY HASH(`id`) BUCKETS 2
+ PROPERTIES (
+ "replication_num" = "1"
+ );
+ """
+
+ sql """
+ INSERT INTO table_build_ann_index_test (id, embedding) VALUES
+ (0, [39.906116, 10.495334, 54.08394, 88.67262, 55.243687, 10.162686,
36.335983, 38.684258]),
+ (1, [62.759315, 97.15586, 25.832521, 39.604908, 88.76715, 72.64085,
9.688437, 17.721428]),
+ (2, [15.447449, 59.7771, 65.54516, 12.973712, 99.685135, 72.080734,
85.71118, 99.35976]),
+ (3, [72.26747, 46.42257, 32.368374, 80.50209, 5.777631, 98.803314,
7.0915947, 68.62693]),
+ (4, [22.098177, 74.10027, 63.634556, 4.710955, 12.405106, 79.39356,
63.014366, 68.67834]),
+ (5, [27.53003, 72.1106, 50.891026, 38.459953, 68.30715, 20.610682,
94.806274, 45.181377]),
+ (6, [77.73215, 64.42907, 71.50025, 43.85641, 94.42648, 50.04773,
65.12575, 68.58207]),
+ (7, [2.1537063, 82.667885, 16.171143, 71.126656, 5.335274, 40.286068,
11.943586, 3.69409]),
+ (8, [54.435013, 56.800594, 59.335514, 55.829235, 85.46627, 33.388138,
11.076194, 20.480877]),
+ (9, [76.197945, 60.623528, 84.229805, 31.652937, 71.82595, 48.04684,
71.29212, 30.282396]);
+ """
+
+ // CREATE INDEX
+ sql """
+ CREATE INDEX idx_test_ann ON table_build_ann_index_test(`embedding`) USING
ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="8"
+ );
+ """
+ wait_for_latest_op_on_table_finish(tableName, timeout)
+
+ // BUILD INDEX: the job must end in FINISHED; the wait helper can also return CANCELLED, SKIPPED or wait_timeout, which previously went unnoticed
+ sql "BUILD INDEX idx_test_ann ON table_build_ann_index_test;"
+ assertTrue(wait_for_last_build_index_on_table_finish(tableName, timeout) == "FINISHED", "BUILD INDEX idx_test_ann did not finish successfully")
+}
diff --git a/regression-test/suites/ann_index_p0/create_ann_index_test.groovy
b/regression-test/suites/ann_index_p0/create_ann_index_test.groovy
index e2fd6f75a25..2ceaf61f6c7 100644
--- a/regression-test/suites/ann_index_p0/create_ann_index_test.groovy
+++ b/regression-test/suites/ann_index_p0/create_ann_index_test.groovy
@@ -17,7 +17,6 @@
suite("create_ann_index_test") {
sql "set enable_common_expr_pushdown=true;"
- // Test that CREATE INDEX for ANN is not supported
sql "drop table if exists tbl_not_null"
sql """
CREATE TABLE `tbl_not_null` (
@@ -31,16 +30,14 @@ suite("create_ann_index_test") {
);
"""
- test {
- sql """
- CREATE INDEX idx_test_ann ON tbl_not_null(`embedding`) USING ANN
PROPERTIES(
- "index_type"="hnsw",
- "metric_type"="l2_distance",
- "dim"="1"
- );
- """
- exception "ANN index can only be created during table creation, not
through CREATE INDEX"
- }
+ // Test that CREATE INDEX for ANN is supported
+ sql """
+ CREATE INDEX idx_test_ann ON tbl_not_null(`embedding`) USING ANN
PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="1"
+ );
+ """
// Test cases for creating tables with ANN indexes
@@ -343,7 +340,7 @@ suite("create_ann_index_test") {
"""
sql "drop table if exists tbl_efconstruction"
-
+
test {
sql """
CREATE TABLE tbl_efconstruction (
@@ -389,4 +386,4 @@ suite("create_ann_index_test") {
exception "ANN index is not supported in index format V1"
}
-}
\ No newline at end of file
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]