This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 28a82a4e10a [opt](Ann) Cancel index building if input rows is less
than the min_train_rows (#60358)
28a82a4e10a is described below
commit 28a82a4e10ad7e3c91b6e3442b93f76296734c1d
Author: zhiqiang <[email protected]>
AuthorDate: Wed Mar 25 10:18:42 2026 +0800
[opt](Ann) Cancel index building if input rows is less than the
min_train_rows (#60358)
Before this change, when the amount of data available to train the index was
less than the required minimum, an import or compaction could fail, which
severely impacted the user experience. Now, in such cases, the index writer
automatically determines whether training and index generation are needed.
When the amount of data is completely insufficient, index construction is
skipped, and queries fall back to brute-force computation for that segment.
The minimum number of training rows (min_train_rows) is calculated as follows:
1. IVF requires at least nlist rows.
2. PQ requires at least 2^pq_nbits * 100 rows.
The maximum of the two is taken as the required minimum number of rows.
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/exec/operator/olap_scan_operator.cpp | 2 +
be/src/exec/operator/olap_scan_operator.h | 2 +
be/src/exec/scan/olap_scanner.cpp | 2 +
be/src/exprs/vectorized_fn_call.cpp | 16 +-
be/src/storage/index/ann/ann_index.h | 11 +
be/src/storage/index/ann/ann_index_iterator.cpp | 11 +
be/src/storage/index/ann/ann_index_iterator.h | 4 +
be/src/storage/index/ann/ann_index_reader.cpp | 34 +-
be/src/storage/index/ann/ann_index_reader.h | 4 +
be/src/storage/index/ann/ann_index_writer.cpp | 52 ++-
be/src/storage/index/ann/ann_index_writer.h | 3 +
be/src/storage/index/ann/ann_search_params.h | 10 +-
be/src/storage/index/ann/ann_topn_runtime.cpp | 7 +-
be/src/storage/index/ann/ann_topn_runtime.h | 7 +-
be/src/storage/index/ann/faiss_ann_index.cpp | 26 ++
be/src/storage/index/ann/faiss_ann_index.h | 10 +
be/src/storage/index/index_file_writer.cpp | 3 +-
be/src/storage/olap_common.h | 1 +
be/src/storage/segment/segment_iterator.cpp | 41 +-
.../storage/index/ann/ann_index_writer_test.cpp | 430 +++++++++++++++++++++
.../data/ann_index_p0/product_quantization.out | 3 +
.../data/ann_index_p0/quantizer_min_train_rows.out | 32 ++
.../data/ann_index_p0/small_segment.out | 19 +
.../suites/ann_index_p0/ivf_index_test.groovy | 15 +-
.../ann_index_p0/product_quantization.groovy | 6 +-
.../ann_index_p0/quantizer_min_train_rows.groovy | 300 ++++++++++++++
.../suites/ann_index_p0/small_segment.groovy | 90 +++++
27 files changed, 1088 insertions(+), 53 deletions(-)
diff --git a/be/src/exec/operator/olap_scan_operator.cpp
b/be/src/exec/operator/olap_scan_operator.cpp
index 083d1326f9e..d94d9598dab 100644
--- a/be/src/exec/operator/olap_scan_operator.cpp
+++ b/be/src/exec/operator/olap_scan_operator.cpp
@@ -363,6 +363,8 @@ Status OlapScanLocalState::_init_profile() {
_ann_range_result_convert_costs =
ADD_CHILD_TIMER(_segment_profile,
"AnnIndexRangeResultConvertCosts",
"AnnIndexRangeResultPostProcessCosts");
+ _ann_fallback_brute_force_cnt =
+ ADD_COUNTER(_segment_profile, "AnnIndexFallbackBruteForceCnt",
TUnit::UNIT);
_variant_scan_sparse_column_timer = ADD_TIMER(_segment_profile,
"VariantScanSparseColumnTimer");
_variant_scan_sparse_column_bytes =
ADD_COUNTER(_segment_profile, "VariantScanSparseColumnBytes",
TUnit::BYTES);
diff --git a/be/src/exec/operator/olap_scan_operator.h
b/be/src/exec/operator/olap_scan_operator.h
index 97143c6c601..c03c317c73f 100644
--- a/be/src/exec/operator/olap_scan_operator.h
+++ b/be/src/exec/operator/olap_scan_operator.h
@@ -257,6 +257,8 @@ private:
RuntimeProfile::Counter* _ann_range_engine_convert_costs = nullptr;
RuntimeProfile::Counter* _ann_range_result_convert_costs = nullptr;
+ RuntimeProfile::Counter* _ann_fallback_brute_force_cnt = nullptr;
+
RuntimeProfile::Counter* _output_index_result_column_timer = nullptr;
// number of segment filtered by column stat when creating seg iterator
diff --git a/be/src/exec/scan/olap_scanner.cpp
b/be/src/exec/scan/olap_scanner.cpp
index 8757617a2cc..b1f6f8531b9 100644
--- a/be/src/exec/scan/olap_scanner.cpp
+++ b/be/src/exec/scan/olap_scanner.cpp
@@ -946,6 +946,8 @@ void OlapScanner::_collect_profile_before_close() {
COUNTER_UPDATE(local_state->_ann_topn_result_convert_costs,
stats.ann_index_topn_result_process_ns);
+ COUNTER_UPDATE(local_state->_ann_fallback_brute_force_cnt,
stats.ann_fall_back_brute_force_cnt);
+
// Overhead counter removed; precise instrumentation is reported via
engine_prepare above.
}
diff --git a/be/src/exprs/vectorized_fn_call.cpp
b/be/src/exprs/vectorized_fn_call.cpp
index 2194200601d..0a9165ed55e 100644
--- a/be/src/exprs/vectorized_fn_call.cpp
+++ b/be/src/exprs/vectorized_fn_call.cpp
@@ -610,13 +610,27 @@ Status VectorizedFnCall::evaluate_ann_range_search(
range_search_runtime.dim, index_dim);
}
+ auto stats = std::make_unique<segment_v2::AnnIndexStats>();
+ // Track load index timing
+ {
+ SCOPED_TIMER(&(stats->load_index_costs_ns));
+ if (!ann_index_iterator->try_load_index()) {
+ VLOG_DEBUG << "ANN range search skipped: "
+ << fmt::format("Failed to load ANN index for column cid
{}", src_col_cid);
+ ann_index_stats.fall_back_brute_force_cnt += 1;
+ return Status::OK();
+ }
+ double load_costs_ms =
static_cast<double>(stats->load_index_costs_ns.value()) / 1000000.0;
+ DorisMetrics::instance()->ann_index_load_costs_ms->increment(
+ static_cast<int64_t>(load_costs_ms));
+ }
+
AnnRangeSearchParams params =
range_search_runtime.to_range_search_params();
params.roaring = &row_bitmap;
DCHECK(params.roaring != nullptr);
DCHECK(params.query_value != nullptr);
segment_v2::AnnRangeSearchResult result;
- auto stats = std::make_unique<segment_v2::AnnIndexStats>();
RETURN_IF_ERROR(ann_index_iterator->range_search(params,
range_search_runtime.user_params,
&result, stats.get()));
diff --git a/be/src/storage/index/ann/ann_index.h
b/be/src/storage/index/ann/ann_index.h
index 07ac7065719..da83fb4c584 100644
--- a/be/src/storage/index/ann/ann_index.h
+++ b/be/src/storage/index/ann/ann_index.h
@@ -89,6 +89,17 @@ public:
*/
virtual doris::Status add(Int64 n, const float* x) = 0;
+ /**
+ * @brief Returns the minimum number of rows required for training the
index.
+ *
+ * Some index types (like IVF) require a minimum number of training points.
+ * For example, IVF requires at least 'nlist' training points.
+ * HNSW does not require any minimum and returns 0.
+ *
+ * @return Minimum number of rows required for training
+ */
+ virtual Int64 get_min_train_rows() const { return 0; }
+
/** Return approximate nearest neighbors of a query vector.
* The result is stored in the result object.
* @param query_vec input vector, size d
diff --git a/be/src/storage/index/ann/ann_index_iterator.cpp
b/be/src/storage/index/ann/ann_index_iterator.cpp
index ff9eb760a35..fc622f69857 100644
--- a/be/src/storage/index/ann/ann_index_iterator.cpp
+++ b/be/src/storage/index/ann/ann_index_iterator.cpp
@@ -27,6 +27,17 @@ AnnIndexIterator::AnnIndexIterator(const IndexReaderPtr&
reader) : IndexIterator
_ann_reader = std::dynamic_pointer_cast<AnnIndexReader>(reader);
}
+bool AnnIndexIterator::try_load_index() {
+ if (_ann_reader == nullptr) {
+ LOG(WARNING) << "AnnIndexIterator::try_load_index: _ann_reader is
null";
+ return false;
+ }
+
+ // _context may be unset in some test scenarios; pass nullptr IOContext in
that case.
+ io::IOContext* io_ctx = (_context != nullptr) ? _context->io_ctx : nullptr;
+ return _ann_reader->try_load_index(io_ctx);
+}
+
Status AnnIndexIterator::read_from_index(const IndexParam& param) {
auto* a_param = std::get<segment_v2::AnnTopNParam*>(param);
if (a_param == nullptr) {
diff --git a/be/src/storage/index/ann/ann_index_iterator.h
b/be/src/storage/index/ann/ann_index_iterator.h
index 4b63d71dd72..d0e27a719e8 100644
--- a/be/src/storage/index/ann/ann_index_iterator.h
+++ b/be/src/storage/index/ann/ann_index_iterator.h
@@ -44,6 +44,10 @@ public:
Result<bool> has_null() override { return true; }
+ // Try to load index, return true if successful, false if failed
+ // This method should be called before read_from_index or range_search
+ bool try_load_index();
+
MOCK_FUNCTION Status range_search(const AnnRangeSearchParams& params,
const VectorSearchUserParams&
custom_params,
AnnRangeSearchResult* result,
AnnIndexStats* stats);
diff --git a/be/src/storage/index/ann/ann_index_reader.cpp
b/be/src/storage/index/ann/ann_index_reader.cpp
index 2833d27bf6c..5fb94677f40 100644
--- a/be/src/storage/index/ann/ann_index_reader.cpp
+++ b/be/src/storage/index/ann/ann_index_reader.cpp
@@ -74,6 +74,7 @@ Status AnnIndexReader::load_index(io::IOContext* io_ctx) {
DorisMetrics::instance()->ann_index_load_cnt->increment(1);
try {
+ // An exception will be thrown if loading fails
RETURN_IF_ERROR(
_index_file_reader->init(config::inverted_index_read_buffer_size, io_ctx));
Result<std::unique_ptr<DorisCompoundReader, DirectoryDeleter>>
compound_dir;
@@ -87,6 +88,7 @@ Status AnnIndexReader::load_index(io::IOContext* io_ctx) {
_vector_index->set_type(_index_type);
RETURN_IF_ERROR(_vector_index->load(compound_dir->get()));
} catch (CLuceneError& err) {
+ LOG_ERROR("Failed to load ann index: {}", err.what());
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"CLuceneError occur when open ann idx file, error msg:
{}", err.what());
}
@@ -94,16 +96,22 @@ Status AnnIndexReader::load_index(io::IOContext* io_ctx) {
});
}
-Status AnnIndexReader::query(io::IOContext* io_ctx, AnnTopNParam* param,
AnnIndexStats* stats) {
+bool AnnIndexReader::try_load_index(io::IOContext* io_ctx) {
#ifndef BE_TEST
- {
- SCOPED_TIMER(&(stats->load_index_costs_ns));
- RETURN_IF_ERROR(load_index(io_ctx));
- double load_costs_ms =
static_cast<double>(stats->load_index_costs_ns.value()) / 1000.0;
- DorisMetrics::instance()->ann_index_load_costs_ms->increment(
- static_cast<int64_t>(load_costs_ms));
+ Status st = load_index(io_ctx);
+ if (!st.ok()) {
+ LOG_WARNING("Failed to load ann index, will fallback to brute force
search: {}",
+ st.to_string());
+ return false;
}
#endif
+ return true;
+}
+
+Status AnnIndexReader::query(io::IOContext* io_ctx, AnnTopNParam* param,
AnnIndexStats* stats) {
+ // Index should be loaded before calling query
+ DCHECK(_vector_index != nullptr);
+
{
DorisMetrics::instance()->ann_index_search_cnt->increment(1);
SCOPED_TIMER(&(stats->search_costs_ns));
@@ -162,16 +170,10 @@ Status AnnIndexReader::range_search(const
AnnRangeSearchParams& params,
const VectorSearchUserParams&
custom_params,
segment_v2::AnnRangeSearchResult* result,
segment_v2::AnnIndexStats* stats,
io::IOContext* io_ctx) {
+ // Index should be loaded before calling range_search
+ DCHECK(_vector_index != nullptr);
+
DCHECK(stats != nullptr);
-#ifndef BE_TEST
- {
- SCOPED_TIMER(&(stats->load_index_costs_ns));
- RETURN_IF_ERROR(load_index(io_ctx));
- double load_costs_ms =
static_cast<double>(stats->load_index_costs_ns.value()) / 1000.0;
- DorisMetrics::instance()->ann_index_load_costs_ms->increment(
- static_cast<int64_t>(load_costs_ms));
- }
-#endif
{
DorisMetrics::instance()->ann_index_search_cnt->increment(1);
SCOPED_TIMER(&(stats->search_costs_ns));
diff --git a/be/src/storage/index/ann/ann_index_reader.h
b/be/src/storage/index/ann/ann_index_reader.h
index 06f864afdd1..7ab677fc279 100644
--- a/be/src/storage/index/ann/ann_index_reader.h
+++ b/be/src/storage/index/ann/ann_index_reader.h
@@ -45,6 +45,10 @@ public:
Status load_index(io::IOContext* io_ctx);
+ // Try to load index, return true if successful, false if failed
+ // This method is used to check if index can be loaded before query
+ bool try_load_index(io::IOContext* io_ctx);
+
Status query(io::IOContext* io_ctx, AnnTopNParam* param, AnnIndexStats*
stats);
Status range_search(const AnnRangeSearchParams& params,
diff --git a/be/src/storage/index/ann/ann_index_writer.cpp
b/be/src/storage/index/ann/ann_index_writer.cpp
index a406454515b..7690ee764ab 100644
--- a/be/src/storage/index/ann/ann_index_writer.cpp
+++ b/be/src/storage/index/ann/ann_index_writer.cpp
@@ -127,6 +127,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t
field_size, const void* val
RETURN_IF_ERROR(
_vector_index->add(AnnIndexColumnWriter::chunk_size(),
_float_array.data()));
_float_array.clear();
+ _need_save_index = true;
}
}
@@ -151,16 +152,55 @@ int64_t AnnIndexColumnWriter::size() const {
}
Status AnnIndexColumnWriter::finish() {
+ Int64 min_train_rows = _vector_index->get_min_train_rows();
+
+ // Check if we have enough rows to train the index
// train/add the remaining data
- if (!_float_array.empty()) {
+ if (_float_array.empty()) {
+ if (_need_save_index) {
+ return _vector_index->save(_dir.get());
+ } else {
+ // No data was added at all. This can happen if the segment has 0
rows
+ // or all rows were filtered out. We need to delete the directory
entry
+ // to avoid writing an empty/invalid index file.
+ LOG_INFO("No data to train/add for ANN index. Skipping index
building.");
+ return _index_file_writer->delete_index(_index_meta);
+ }
+ } else {
DCHECK(_float_array.size() % _vector_index->get_dimension() == 0);
+
Int64 num_rows = _float_array.size() / _vector_index->get_dimension();
- RETURN_IF_ERROR(_vector_index->train(num_rows, _float_array.data()));
- RETURN_IF_ERROR(_vector_index->add(num_rows, _float_array.data()));
- _float_array.clear();
- }
- return _vector_index->save(_dir.get());
+ if (num_rows >= min_train_rows) {
+ RETURN_IF_ERROR(_vector_index->train(num_rows,
_float_array.data()));
+ RETURN_IF_ERROR(_vector_index->add(num_rows, _float_array.data()));
+ _float_array.clear();
+ return _vector_index->save(_dir.get());
+ } else {
+ // It happens to have not enough data to train.
+ // If we have data to add before, we still need to save the index.
+ if (_need_save_index) {
+ // For IVF indexes, adding remaining vectors without training
is acceptable
+ // because the quantizer was already trained on previous
batches. These vectors
+ // are simply added to the nearest clusters without retraining.
+ RETURN_IF_ERROR(_vector_index->add(num_rows,
_float_array.data()));
+ _float_array.clear();
+ return _vector_index->save(_dir.get());
+ } else {
+ // Not enough data to train and no data added before.
+ // Means this is a very small segment, we can skip the index
building.
+ // We need to delete the directory entry from
index_file_writer to avoid
+ // writing an empty/invalid index file which causes
"IndexInput read past EOF" error.
+ LOG_INFO(
+ "Remaining data size {} is less than minimum {} rows
required for ANN "
+ "index "
+ "training. Skipping index building for this segment.",
+ num_rows, min_train_rows);
+ _float_array.clear();
+ return _index_file_writer->delete_index(_index_meta);
+ }
+ }
+ }
}
#include "common/compile_check_end.h"
} // namespace doris::segment_v2
diff --git a/be/src/storage/index/ann/ann_index_writer.h
b/be/src/storage/index/ann/ann_index_writer.h
index 9c6fe9cf978..d8524b37485 100644
--- a/be/src/storage/index/ann/ann_index_writer.h
+++ b/be/src/storage/index/ann/ann_index_writer.h
@@ -79,10 +79,13 @@ private:
// VectorIndex should be weak shared by AnnIndexWriter and
VectorIndexReader
// This should be a weak_ptr
std::shared_ptr<VectorIndex> _vector_index;
+ // _float_array is used to buffer the float data before training/adding to
vector index
+ // if we dont do this, the performance(recall) will be very poor when
adding small number of vectors one by one
PODArray<float> _float_array;
IndexFileWriter* _index_file_writer;
const TabletIndex* _index_meta;
std::shared_ptr<DorisFSDirectory> _dir;
+ bool _need_save_index = false;
};
#include "common/compile_check_end.h"
} // namespace doris::segment_v2
diff --git a/be/src/storage/index/ann/ann_search_params.h
b/be/src/storage/index/ann/ann_search_params.h
index 06bae2742bd..37ba6f875e4 100644
--- a/be/src/storage/index/ann/ann_search_params.h
+++ b/be/src/storage/index/ann/ann_search_params.h
@@ -49,7 +49,8 @@ struct AnnIndexStats {
engine_search_ns(TUnit::TIME_NS, 0),
result_process_costs_ns(TUnit::TIME_NS, 0),
engine_convert_ns(TUnit::TIME_NS, 0),
- engine_prepare_ns(TUnit::TIME_NS, 0) {}
+ engine_prepare_ns(TUnit::TIME_NS, 0),
+ fall_back_brute_force_cnt(0) {}
AnnIndexStats(const AnnIndexStats& other)
: search_costs_ns(TUnit::TIME_NS, other.search_costs_ns.value()),
@@ -57,7 +58,8 @@ struct AnnIndexStats {
engine_search_ns(TUnit::TIME_NS, other.engine_search_ns.value()),
result_process_costs_ns(TUnit::TIME_NS,
other.result_process_costs_ns.value()),
engine_convert_ns(TUnit::TIME_NS,
other.engine_convert_ns.value()),
- engine_prepare_ns(TUnit::TIME_NS,
other.engine_prepare_ns.value()) {}
+ engine_prepare_ns(TUnit::TIME_NS,
other.engine_prepare_ns.value()),
+ fall_back_brute_force_cnt(other.fall_back_brute_force_cnt) {}
AnnIndexStats& operator=(const AnnIndexStats& other) {
if (this != &other) {
@@ -67,6 +69,7 @@ struct AnnIndexStats {
result_process_costs_ns.set(other.result_process_costs_ns.value());
engine_convert_ns.set(other.engine_convert_ns.value());
engine_prepare_ns.set(other.engine_prepare_ns.value());
+ fall_back_brute_force_cnt = other.fall_back_brute_force_cnt;
}
return *this;
}
@@ -77,7 +80,8 @@ struct AnnIndexStats {
RuntimeProfile::Counter result_process_costs_ns; // time cost of
processing search results
RuntimeProfile::Counter engine_convert_ns; // time cost of
engine-side conversions
RuntimeProfile::Counter
- engine_prepare_ns; // time cost before engine search (allocations,
setup)
+ engine_prepare_ns; // time cost before engine search
(allocations, setup)
+ int64_t fall_back_brute_force_cnt; // fallback count when ANN range search
is bypassed
};
struct AnnTopNParam {
diff --git a/be/src/storage/index/ann/ann_topn_runtime.cpp
b/be/src/storage/index/ann/ann_topn_runtime.cpp
index 8b63fe8ad08..488dff5f6a3 100644
--- a/be/src/storage/index/ann/ann_topn_runtime.cpp
+++ b/be/src/storage/index/ann/ann_topn_runtime.cpp
@@ -191,15 +191,12 @@ Status AnnTopNRuntime::prepare(RuntimeState* state, const
RowDescriptor& row_des
return Status::OK();
}
-Status AnnTopNRuntime::evaluate_vector_ann_search(segment_v2::IndexIterator*
ann_index_iterator,
+Status
AnnTopNRuntime::evaluate_vector_ann_search(segment_v2::AnnIndexIterator*
ann_index_iterator,
roaring::Roaring* roaring,
size_t rows_of_segment,
IColumn::MutablePtr&
result_column,
std::unique_ptr<std::vector<uint64_t>>& row_ids,
segment_v2::AnnIndexStats&
ann_index_stats) {
DCHECK(ann_index_iterator != nullptr);
- segment_v2::AnnIndexIterator* ann_index_iterator_casted =
- dynamic_cast<segment_v2::AnnIndexIterator*>(ann_index_iterator);
- DCHECK(ann_index_iterator_casted != nullptr);
DCHECK(_order_by_expr_ctx != nullptr);
DCHECK(_order_by_expr_ctx->root() != nullptr);
size_t query_array_size = _query_array->size();
@@ -209,7 +206,7 @@ Status
AnnTopNRuntime::evaluate_vector_ann_search(segment_v2::IndexIterator* ann
// TODO:(zhiqiang) Maybe we can move this dimension check to prepare phase.
- auto index_reader =
ann_index_iterator_casted->get_reader(AnnIndexReaderType::ANN);
+ auto index_reader =
ann_index_iterator->get_reader(AnnIndexReaderType::ANN);
auto ann_index_reader =
std::dynamic_pointer_cast<AnnIndexReader>(index_reader);
DCHECK(ann_index_reader != nullptr);
if (ann_index_reader->get_dimension() != query_array_size) {
diff --git a/be/src/storage/index/ann/ann_topn_runtime.h
b/be/src/storage/index/ann/ann_topn_runtime.h
index 6d40a32b349..59cb444b517 100644
--- a/be/src/storage/index/ann/ann_topn_runtime.h
+++ b/be/src/storage/index/ann/ann_topn_runtime.h
@@ -49,6 +49,7 @@
namespace doris::segment_v2 {
#include "common/compile_check_begin.h"
struct AnnIndexStats;
+class AnnIndexIterator;
Result<IColumn::Ptr> extract_query_vector(std::shared_ptr<VExpr> arg_expr);
@@ -67,7 +68,7 @@ Result<IColumn::Ptr>
extract_query_vector(std::shared_ptr<VExpr> arg_expr);
* - Thread-safe execution in parallel query contexts
*
* Typical usage in SQL:
- * SELECT * FROM table ORDER BY l2_distance(vec_column, [1,2,3]) LIMIT 10;
+ * SELECT * FROM table ORDER BY l2_distance_approximate(vec_column, [1,2,3])
LIMIT 10;
*/
class AnnTopNRuntime {
ENABLE_FACTORY_CREATOR(AnnTopNRuntime);
@@ -116,7 +117,7 @@ public:
* @param ann_index_stats Statistics collector for performance monitoring
* @return Status indicating success or failure
*/
- Status evaluate_vector_ann_search(segment_v2::IndexIterator*
ann_index_iterator,
+ Status evaluate_vector_ann_search(segment_v2::AnnIndexIterator*
ann_index_iterator,
roaring::Roaring* row_bitmap, size_t
rows_of_segment,
IColumn::MutablePtr& result_column,
std::unique_ptr<std::vector<uint64_t>>&
row_ids,
@@ -167,4 +168,4 @@ private:
doris::VectorSearchUserParams _user_params; ///< User-defined search
parameters
};
#include "common/compile_check_end.h"
-} // namespace doris::segment_v2
\ No newline at end of file
+} // namespace doris::segment_v2
diff --git a/be/src/storage/index/ann/faiss_ann_index.cpp
b/be/src/storage/index/ann/faiss_ann_index.cpp
index 70a2ed01bc9..448dcee5749 100644
--- a/be/src/storage/index/ann/faiss_ann_index.cpp
+++ b/be/src/storage/index/ann/faiss_ann_index.cpp
@@ -296,6 +296,32 @@ doris::Status FaissVectorIndex::add(Int64 n, const float*
vec) {
return doris::Status::OK();
}
+Int64 FaissVectorIndex::get_min_train_rows() const {
+ // For IVF indexes, the minimum number of training points should be at
least
+ // equal to the number of clusters (nlist). FAISS requires this for
k-means clustering.
+ Int64 ivf_min = 0;
+ if (_params.index_type == FaissBuildParameter::IndexType::IVF) {
+ ivf_min = _params.ivf_nlist;
+ }
+
+ // Calculate minimum training rows required by the quantizer
+ Int64 quantizer_min = 0;
+ if (_params.quantizer == FaissBuildParameter::Quantizer::PQ) {
+ // For PQ, FAISS uses ksub = 2^pq_nbits and recommends ksub * 100
training vectors.
+ // This threshold depends on pq_nbits only (independent of pq_m).
+ // See code from contrib/faiss/faiss/impl/ProductQuantizer.cpp::65
+ quantizer_min = (1LL << _params.pq_nbits) * 100;
+ } else if (_params.quantizer == FaissBuildParameter::Quantizer::SQ4 ||
+ _params.quantizer == FaissBuildParameter::Quantizer::SQ8) {
+ // For SQ, minimal training requirement as scalar quantization is
simpler
+ quantizer_min = 1;
+ }
+ // For FLAT, no minimum training data required
+
+ // Return the maximum of IVF and quantizer requirements
+ return std::max(ivf_min, quantizer_min);
+}
+
void FaissVectorIndex::build(const FaissBuildParameter& params) {
_params = params;
_dimension = params.dim;
diff --git a/be/src/storage/index/ann/faiss_ann_index.h
b/be/src/storage/index/ann/faiss_ann_index.h
index 2a0e5aecda6..5560c6fc2d6 100644
--- a/be/src/storage/index/ann/faiss_ann_index.h
+++ b/be/src/storage/index/ann/faiss_ann_index.h
@@ -208,6 +208,16 @@ public:
*/
doris::Status add(Int64 n, const float* vec) override;
+ /**
+ * @brief Returns the minimum number of rows required for training the
index.
+ *
+ * For IVF index types, this returns ivf_nlist (the number of clusters).
+ * For HNSW, this returns 0 as it doesn't require minimum training data.
+ *
+ * @return Minimum number of rows required for training
+ */
+ Int64 get_min_train_rows() const override;
+
/**
* @brief Sets the build parameters for the index.
*
diff --git a/be/src/storage/index/index_file_writer.cpp
b/be/src/storage/index/index_file_writer.cpp
index ec3a8b40a69..9acfadf4172 100644
--- a/be/src/storage/index/index_file_writer.cpp
+++ b/be/src/storage/index/index_file_writer.cpp
@@ -241,8 +241,7 @@ Status IndexFileWriter::finish_close() {
if (_idx_v2_writer != nullptr && _idx_v2_writer->state() !=
io::FileWriter::State::CLOSED) {
RETURN_IF_ERROR(_idx_v2_writer->close(false));
}
- LOG_INFO("IndexFileWriter finish_close, enable_write_index_searcher_cache:
{}",
- config::enable_write_index_searcher_cache);
+
Status st = Status::OK();
if (config::enable_write_index_searcher_cache) {
st = add_into_searcher_cache();
diff --git a/be/src/storage/olap_common.h b/be/src/storage/olap_common.h
index ac8634e16ab..d1df2059c75 100644
--- a/be/src/storage/olap_common.h
+++ b/be/src/storage/olap_common.h
@@ -398,6 +398,7 @@ struct OlapReaderStatistics {
int64_t ann_range_result_convert_ns = 0; // time spent processing range
results
int64_t ann_range_engine_convert_ns = 0; // time spent on FAISS-side
conversions (Range)
int64_t rows_ann_index_range_filtered = 0;
+ int64_t ann_fall_back_brute_force_cnt = 0;
int64_t output_index_result_column_timer = 0;
// number of segment filtered by column stat when creating seg iterator
diff --git a/be/src/storage/segment/segment_iterator.cpp
b/be/src/storage/segment/segment_iterator.cpp
index d6612071efc..cc0a117ce33 100644
--- a/be/src/storage/segment/segment_iterator.cpp
+++ b/be/src/storage/segment/segment_iterator.cpp
@@ -76,6 +76,7 @@
#include "storage/field.h"
#include "storage/id_manager.h"
#include "storage/index/ann/ann_index.h"
+#include "storage/index/ann/ann_index_iterator.h"
#include "storage/index/ann/ann_index_reader.h"
#include "storage/index/ann/ann_topn_runtime.h"
#include "storage/index/index_file_reader.h"
@@ -857,6 +858,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
has_ann_index, has_common_expr_push_down,
has_column_predicate);
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
@@ -870,6 +872,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
"Asc topn for inner product can not be evaluated by ann
index");
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
} else {
@@ -877,6 +880,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
VLOG_DEBUG << fmt::format("Desc topn for l2/cosine can not be
evaluated by ann index");
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
}
@@ -889,6 +893,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
metric_to_string(ann_index_reader->get_metric_type()));
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
@@ -902,14 +907,41 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
pre_size, rows_of_segment);
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
IColumn::MutablePtr result_column;
std::unique_ptr<std::vector<uint64_t>> result_row_ids;
segment_v2::AnnIndexStats ann_index_stats;
-
RETURN_IF_ERROR(_ann_topn_runtime->evaluate_vector_ann_search(ann_index_iterator,
&_row_bitmap,
-
rows_of_segment, result_column,
-
result_row_ids, ann_index_stats));
+
+ // Try to load ANN index before search
+ auto ann_index_iterator_casted =
+ dynamic_cast<segment_v2::AnnIndexIterator*>(ann_index_iterator);
+ if (ann_index_iterator_casted == nullptr) {
+ VLOG_DEBUG << "Failed to cast index iterator to AnnIndexIterator,
fallback to brute force";
+ _need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
+ return Status::OK();
+ }
+
+ // Track load index timing
+ {
+ SCOPED_TIMER(&(ann_index_stats.load_index_costs_ns));
+ if (!ann_index_iterator_casted->try_load_index()) {
+ VLOG_DEBUG << "Failed to load ANN index, fallback to brute force
search";
+ _need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
+ return Status::OK();
+ }
+ double load_costs_ms =
+
static_cast<double>(ann_index_stats.load_index_costs_ns.value()) / 1000000.0;
+ DorisMetrics::instance()->ann_index_load_costs_ms->increment(
+ static_cast<int64_t>(load_costs_ms));
+ }
+
+ RETURN_IF_ERROR(_ann_topn_runtime->evaluate_vector_ann_search(
+ ann_index_iterator_casted, &_row_bitmap, rows_of_segment,
result_column, result_row_ids,
+ ann_index_stats));
VLOG_DEBUG << fmt::format("Ann topn filtered {} - {} = {} rows", pre_size,
_row_bitmap.cardinality(), pre_size -
_row_bitmap.cardinality());
@@ -1163,8 +1195,8 @@ Status SegmentIterator::_apply_index_expr() {
}
// Apply ann range search
- segment_v2::AnnIndexStats ann_index_stats;
for (const auto& expr_ctx : _common_expr_ctxs_push_down) {
+ segment_v2::AnnIndexStats ann_index_stats;
size_t origin_rows = _row_bitmap.cardinality();
RETURN_IF_ERROR(expr_ctx->evaluate_ann_range_search(
_index_iterators, _schema->column_ids(), _column_iterators,
@@ -1176,6 +1208,7 @@ Status SegmentIterator::_apply_index_expr() {
_opts.stats->ann_range_result_convert_ns +=
ann_index_stats.result_process_costs_ns.value();
_opts.stats->ann_range_engine_convert_ns +=
ann_index_stats.engine_convert_ns.value();
_opts.stats->ann_range_pre_process_ns +=
ann_index_stats.engine_prepare_ns.value();
+ _opts.stats->ann_fall_back_brute_force_cnt +=
ann_index_stats.fall_back_brute_force_cnt;
}
for (auto it = _common_expr_ctxs_push_down.begin(); it !=
_common_expr_ctxs_push_down.end();) {
diff --git a/be/test/storage/index/ann/ann_index_writer_test.cpp
b/be/test/storage/index/ann/ann_index_writer_test.cpp
index dfb8089ee28..9035c79320c 100644
--- a/be/test/storage/index/ann/ann_index_writer_test.cpp
+++ b/be/test/storage/index/ann/ann_index_writer_test.cpp
@@ -53,6 +53,7 @@ public:
(override));
MOCK_METHOD(doris::Status, save, (lucene::store::Directory * dir),
(override));
MOCK_METHOD(doris::Status, load, (lucene::store::Directory * dir),
(override));
+ MOCK_METHOD(Int64, get_min_train_rows, (), (const, override));
};
class TestAnnIndexColumnWriter : public AnnIndexColumnWriter {
@@ -61,6 +62,7 @@ public:
: AnnIndexColumnWriter(index_file_writer, index_meta) {}
void set_vector_index(std::shared_ptr<VectorIndex> index) { _vector_index
= index; }
+ void set_need_save_index(bool value) { _need_save_index = value; }
};
class AnnIndexWriterTest : public ::testing::Test {
@@ -654,4 +656,432 @@ TEST_F(AnnIndexWriterTest, TestAddMoreThanChunkSizeIVF) {
EXPECT_TRUE(status.ok());
}
+TEST_F(AnnIndexWriterTest, TestSkipTrainWhenRemainderLessThanNlist) {
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto properties = _properties;
+ properties["index_type"] = "ivf";
+ properties["nlist"] = "5"; // Set nlist to 5
+ properties["quantizer"] = "flat";
+
+ auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_properties = properties;
+ tablet_index->_index_id = 1;
+
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // CHUNK_SIZE = 10, nlist = 5
+ // Add 12 rows: first 10 will be trained/added in one batch, remaining 2 <
5
+ // Since we have trained data before (_need_save_index = true), we should
add the remaining 2 rows and save
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(5));
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(1)
+ .WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(2,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add 12 rows total
+ {
+ const size_t num_rows = 10;
+ std::vector<float> vectors(10 * 4);
+ for (size_t i = 0; i < 10 * 4; ++i) {
+ vectors[i] = static_cast<float>(i);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * 4);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Add 2 more rows
+ {
+ const size_t num_rows = 2;
+ std::vector<float> vectors = {
+ 40.0f, 41.0f, 42.0f, 43.0f, // Row 10
+ 44.0f, 45.0f, 46.0f, 47.0f // Row 11
+ };
+ std::vector<size_t> offsets = {0, 4, 8};
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestLargeDataVolumeWithRemainderSkip) {
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto properties = _properties;
+ properties["index_type"] = "ivf";
+ properties["nlist"] = "3"; // Set nlist to 3
+ properties["quantizer"] = "flat";
+
+ auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_properties = properties;
+ tablet_index->_index_id = 1;
+
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // CHUNK_SIZE = 10, nlist = 3
+ // Add 23 rows: 2 full chunks of 10, remaining 3 == nlist, so train
remaining
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(3));
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(2)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10, testing::_))
+ .Times(2)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, train(3,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(3,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add 3 batches: 10 + 10 + 3 = 23 rows
+ for (int batch = 0; batch < 2; ++batch) {
+ const size_t num_rows = 10;
+ std::vector<float> vectors(10 * 4);
+ for (size_t i = 0; i < 10 * 4; ++i) {
+ vectors[i] = static_cast<float>(batch * 40 + i);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * 4);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Add remaining 3 rows
+ {
+ const size_t num_rows = 3;
+ std::vector<float> vectors = {
+ 80.0f, 81.0f, 82.0f, 83.0f, // Row 20
+ 84.0f, 85.0f, 86.0f, 87.0f, // Row 21
+ 88.0f, 89.0f, 90.0f, 91.0f // Row 22
+ };
+ std::vector<size_t> offsets = {0, 4, 8, 12};
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestLargeDataVolumeSkipRemainder) {
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto properties = _properties;
+ properties["index_type"] = "ivf";
+ properties["nlist"] = "4"; // Set nlist to 4
+ properties["quantizer"] = "flat";
+
+ auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_properties = properties;
+ tablet_index->_index_id = 1;
+
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // CHUNK_SIZE = 10, nlist = 4
+ // Add 22 rows: 2 full chunks of 10, remaining 2 < 4
+ // Since we have trained data before (_need_save_index = true), we should
add the remaining 2 rows and save
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(4));
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(2)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10, testing::_))
+ .Times(2)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(2,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add 2 batches of 10 rows
+ for (int batch = 0; batch < 2; ++batch) {
+ const size_t num_rows = 10;
+ std::vector<float> vectors(10 * 4);
+ for (size_t i = 0; i < 10 * 4; ++i) {
+ vectors[i] = static_cast<float>(batch * 40 + i);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * 4);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Add remaining 2 rows
+ {
+ const size_t num_rows = 2;
+ std::vector<float> vectors = {
+ 80.0f, 81.0f, 82.0f, 83.0f, // Row 20
+ 84.0f, 85.0f, 86.0f, 87.0f // Row 21
+ };
+ std::vector<size_t> offsets = {0, 4, 8};
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanNlist) {
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto properties = _properties;
+ properties["index_type"] = "ivf";
+ properties["nlist"] = "5"; // Set nlist to 5
+ properties["quantizer"] = "flat";
+
+ auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_properties = properties;
+ tablet_index->_index_id = 1;
+
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+ writer->set_need_save_index(false); // No previous training, so should
skip entirely
+
+ // Add only 3 rows, which is less than nlist (5)
+ // Since no data was trained before (_need_save_index = false), we should
skip index building entirely
+ // No train, add, or save should be called
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(5));
+ EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0);
+ EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0);
+ EXPECT_CALL(*mock_index, save(testing::_)).Times(0);
+
+ const size_t dim = 4;
+
+ // Add 3 rows
+ {
+ const size_t num_rows = 3;
+ std::vector<float> vectors = {
+ 1.0f, 2.0f, 3.0f, 4.0f, // Row 0
+ 5.0f, 6.0f, 7.0f, 8.0f, // Row 1
+ 9.0f, 10.0f, 11.0f, 12.0f // Row 2
+ };
+ std::vector<size_t> offsets = {0, 4, 8, 12};
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestPQMinTrainRows) {
+ // Test writer behavior under a large mocked min_train_rows threshold.
+
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
_tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // Set up expectations: mock a very large min_train_rows threshold.
+ // Since we only provide 1000 vectors, which is less than 131072, training
will happen in batches
+ // and finish() will still save the index, since batches were trained
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(131072));
+ // 1000 vectors will be processed in 100 batches of 10 vectors each
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(100)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10, testing::_))
+ .Times(100)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ // Since we have trained data in batches, the index will be saved even
though total data is insufficient
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add only 1000 rows, which is less than the required 131072
+ {
+ const size_t num_rows = 1000;
+ std::vector<float> vectors(num_rows * dim);
+ for (size_t i = 0; i < num_rows * dim; ++i) {
+ vectors[i] = static_cast<float>(i % 100);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * dim);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Finish should still save the index because training happened in batches
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestSQMinTrainRows) {
+ // Test that SQ quantizer requires sufficient training data
+ // SQ requires at least nlist * 2 = 10 * 2 = 20 training vectors
+
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
_tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // Set up expectations: SQ should require at least 20 training vectors
+ // Since we only provide 15 vectors, training happens in batches and
finish() will still save the index
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(20));
+ // 15 vectors will be processed in 1 batch of 10 vectors and remaining 5
vectors
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(1)
+ .WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(5,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ // Since we have trained data, the index will be saved even though total
data is insufficient
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add only 15 rows, which is less than the required 20
+ {
+ const size_t num_rows = 15;
+ std::vector<float> vectors(num_rows * dim);
+ for (size_t i = 0; i < num_rows * dim; ++i) {
+ vectors[i] = static_cast<float>(i % 100);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * dim);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Finish should still save the index because training happened in batches
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestPQWithSufficientData) {
+ // Test that PQ works when sufficient training data is provided
+
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
_tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // Mock min_train_rows to 131072 and provide exactly that amount.
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(131072));
+ // Since we provide exactly 131072 vectors, they will be trained and added
in chunks
+ // Each chunk is 10 vectors, so we expect 13107 train calls and 13107 add
calls for full chunks
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(13107)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10, testing::_))
+ .Times(13107)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ // The remaining 2 vectors will be added without training since
min_train_rows > 2
+ EXPECT_CALL(*mock_index, add(2,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add exactly 131072 rows
+ {
+ const size_t num_rows = 131072;
+ std::vector<float> vectors(num_rows * dim);
+ for (size_t i = 0; i < num_rows * dim; ++i) {
+ vectors[i] = static_cast<float>(i % 100);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * dim);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Finish should successfully build the index
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
} // namespace doris::segment_v2
diff --git a/regression-test/data/ann_index_p0/product_quantization.out
b/regression-test/data/ann_index_p0/product_quantization.out
index 91be506ee85..2621845b8d4 100644
--- a/regression-test/data/ann_index_p0/product_quantization.out
+++ b/regression-test/data/ann_index_p0/product_quantization.out
@@ -5,3 +5,6 @@
3 [3, 4, 5, 6]
4 [4, 5, 6, 7]
+-- !sql --
+1 [1, 2, 3, 4]
+
diff --git a/regression-test/data/ann_index_p0/quantizer_min_train_rows.out
b/regression-test/data/ann_index_p0/quantizer_min_train_rows.out
new file mode 100644
index 00000000000..b49f93c9ee8
--- /dev/null
+++ b/regression-test/data/ann_index_p0/quantizer_min_train_rows.out
@@ -0,0 +1,32 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+1 [1, 2, 3, 4]
+2 [2, 3, 4, 5]
+3 [3, 4, 5, 6]
+
+-- !sql --
+1 [1, 2, 3, 4]
+2 [2, 3, 4, 5]
+3 [3, 4, 5, 6]
+
+-- !sql --
+400
+
+-- !sql --
+12
+
+-- !sql --
+1 [1, 2, 3, 4]
+2 [2, 3, 4, 5]
+3 [3, 4, 5, 6]
+
+-- !sql --
+1 [1, 2, 3, 4]
+2 [2, 3, 4, 5]
+3 [3, 4, 5, 6]
+
+-- !sql --
+400
+
+-- !sql --
+12
diff --git a/regression-test/data/ann_index_p0/small_segment.out
b/regression-test/data/ann_index_p0/small_segment.out
new file mode 100644
index 00000000000..e6c48002cd2
--- /dev/null
+++ b/regression-test/data/ann_index_p0/small_segment.out
@@ -0,0 +1,19 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+1 [1, 2, 3]
+2 [0.5, 2.1, 2.9]
+3 [10, 10, 10]
+
+-- !sql --
+1 [1, 2, 3]
+2 [0.5, 2.1, 2.9]
+
+-- !sql --
+1 [1, 2, 3]
+2 [0.5, 2.1, 2.9]
+3 [10, 10, 10]
+
+-- !sql --
+1 [1, 2, 3]
+2 [0.5, 2.1, 2.9]
+
diff --git a/regression-test/suites/ann_index_p0/ivf_index_test.groovy
b/regression-test/suites/ann_index_p0/ivf_index_test.groovy
index 231e728068a..767cad27fcd 100644
--- a/regression-test/suites/ann_index_p0/ivf_index_test.groovy
+++ b/regression-test/suites/ann_index_p0/ivf_index_test.groovy
@@ -84,15 +84,12 @@ suite ("ivf_index_test") {
DISTRIBUTED BY HASH(id) BUCKETS 1
PROPERTIES ("replication_num" = "1");
"""
- test {
- // not enough training points
- sql """
- INSERT INTO tbl_ann_l2 VALUES
- (1, [1.0, 2.0, 3.0]),
- (2, [0.5, 2.1, 2.9]);
- """
- exception """exception occurred during training"""
- }
+ // Not enough training points: should not throw exception anymore, just
skip index building.
+ sql """
+ INSERT INTO tbl_ann_l2 VALUES
+ (1, [1.0, 2.0, 3.0]),
+ (2, [0.5, 2.1, 2.9]);
+ """
sql "drop table if exists tbl_ann_ip"
sql """
diff --git a/regression-test/suites/ann_index_p0/product_quantization.groovy
b/regression-test/suites/ann_index_p0/product_quantization.groovy
index f9e5abd501b..8a99364a913 100644
--- a/regression-test/suites/ann_index_p0/product_quantization.groovy
+++ b/regression-test/suites/ann_index_p0/product_quantization.groovy
@@ -44,10 +44,8 @@ suite("product_quantization") {
duplicate key(id)
distributed by hash(id) buckets 1
properties('replication_num' = '1');"""
- test {
- sql """insert into product_quantization values (1, [1.0, 2.0, 3.0,
4.0])"""
- exception """exception occurred during training"""
- }
+ sql """insert into product_quantization values (1, [1.0, 2.0, 3.0, 4.0])"""
+ qt_sql """select * from product_quantization order by id"""
sql """drop table if exists product_quantization"""
test {
diff --git
a/regression-test/suites/ann_index_p0/quantizer_min_train_rows.groovy
b/regression-test/suites/ann_index_p0/quantizer_min_train_rows.groovy
new file mode 100644
index 00000000000..15999e5445c
--- /dev/null
+++ b/regression-test/suites/ann_index_p0/quantizer_min_train_rows.groovy
@@ -0,0 +1,300 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("quantizer_min_train_rows") {
+ sql "set enable_common_expr_pushdown=true;"
+
+ // Test PQ quantizer minimum training rows requirement
+ // PQ min_train_rows formula is (1 << pq_nbits) * 100.
+ // For pq_nbits=8, it requires 25600 training vectors.
+ sql "drop table if exists tbl_pq_insufficient_data"
+ sql """
+ CREATE TABLE tbl_pq_insufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="10",
+ "dim"="4",
+ "quantizer"="pq",
+ "pq_m"="2",
+ "pq_nbits"="8"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than required (25600), this should not throw an
exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_pq_insufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_pq_insufficient_data order by
l2_distance_approximate(embedding, [1.0,2.0,3.0,4.0]) limit 100;"
+
+ // Test SQ quantizer minimum training rows requirement
+ // SQ requires 20 training vectors
+ sql "drop table if exists tbl_sq_insufficient_data"
+ sql """
+ CREATE TABLE tbl_sq_insufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="10",
+ "dim"="4",
+ "quantizer"="sq8"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than required (20), this should not throw an exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_sq_insufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_sq_insufficient_data order by
l2_distance_approximate(embedding, [1.0,2.0,3.0,4.0]) limit 100;"
+
+ // Test PQ with sufficient data - should build index successfully
+ sql "drop table if exists tbl_pq_sufficient_data"
+ sql """
+ CREATE TABLE tbl_pq_sufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="10",
+ "dim"="4",
+ "quantizer"="pq",
+ "pq_m"="2",
+ "pq_nbits"="2"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // PQ with pq_nbits=2 requires (1 << 2) * 100 = 400 training vectors.
+ // Insert exactly 400 rows to meet the requirement.
+ def insert_data = []
+ for (int i = 1; i <= 400; i++) {
+ insert_data.add("(${i}, [${i % 10}.0, ${(i + 1) % 10}.0, ${(i + 2) %
10}.0, ${(i + 3) % 10}.0])")
+ }
+ sql "INSERT INTO tbl_pq_sufficient_data VALUES ${insert_data.join(', ')};"
+
+ // Verify data is inserted successfully
+ qt_sql "select count(*) from tbl_pq_sufficient_data;"
+
+ // Test SQ with sufficient data - should build index successfully
+ sql "drop table if exists tbl_sq_sufficient_data"
+ sql """
+ CREATE TABLE tbl_sq_sufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="5",
+ "dim"="4",
+ "quantizer"="sq4"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // SQ requires 20 training vectors
+ // Insert 12 rows -- NOTE(review): fewer than the 20 cited above; confirm
this meets the minimum for this quantizer
+ sql """
+ INSERT INTO tbl_sq_sufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]),
+ (4, [4.0, 5.0, 6.0, 7.0]),
+ (5, [5.0, 6.0, 7.0, 8.0]),
+ (6, [6.0, 7.0, 8.0, 9.0]),
+ (7, [7.0, 8.0, 9.0, 10.0]),
+ (8, [8.0, 9.0, 10.0, 11.0]),
+ (9, [9.0, 10.0, 11.0, 12.0]),
+ (10, [10.0, 11.0, 12.0, 13.0]),
+ (11, [11.0, 12.0, 13.0, 14.0]),
+ (12, [12.0, 13.0, 14.0, 15.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select count(*) from tbl_sq_sufficient_data;"
+
+ // Test HNSW PQ quantizer minimum training rows requirement
+ sql "drop table if exists tbl_hnsw_pq_insufficient_data"
+ sql """
+ CREATE TABLE tbl_hnsw_pq_insufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="4",
+ "quantizer"="pq",
+ "pq_m"="2",
+ "pq_nbits"="8",
+ "max_degree"="16",
+ "ef_construction"="200"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than required (25600), this should not throw an
exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_hnsw_pq_insufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_hnsw_pq_insufficient_data order by id;"
+
+ // Test HNSW SQ quantizer minimum training rows requirement
+ sql "drop table if exists tbl_hnsw_sq_insufficient_data"
+ sql """
+ CREATE TABLE tbl_hnsw_sq_insufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="4",
+ "quantizer"="sq8",
+ "max_degree"="16",
+ "ef_construction"="200"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than required (20), this should not throw an exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_hnsw_sq_insufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_hnsw_sq_insufficient_data order by id;"
+
+ // Test HNSW with sufficient data - should build index successfully
+ sql "drop table if exists tbl_hnsw_pq_sufficient_data"
+ sql """
+ CREATE TABLE tbl_hnsw_pq_sufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="4",
+ "quantizer"="pq",
+ "pq_m"="2",
+ "pq_nbits"="2",
+ "max_degree"="16",
+ "ef_construction"="200"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // PQ with pq_nbits=2 requires (1 << 2) * 100 = 400 training vectors.
+ // Insert exactly 400 rows to meet the requirement.
+ def insert_data_hnsw_pq = []
+ for (int i = 1; i <= 400; i++) {
+ insert_data_hnsw_pq.add("(${i}, [${i % 10}.0, ${(i + 1) % 10}.0, ${(i
+ 2) % 10}.0, ${(i + 3) % 10}.0])")
+ }
+ sql "INSERT INTO tbl_hnsw_pq_sufficient_data VALUES
${insert_data_hnsw_pq.join(', ')};"
+
+ // Verify data is inserted successfully
+ qt_sql "select count(*) from tbl_hnsw_pq_sufficient_data;"
+
+ // Test HNSW SQ with sufficient data - should build index successfully
+ sql "drop table if exists tbl_hnsw_sq_sufficient_data"
+ sql """
+ CREATE TABLE tbl_hnsw_sq_sufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="4",
+ "quantizer"="sq4",
+ "max_degree"="16",
+ "ef_construction"="200"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // SQ requires 20 training vectors
+ // Insert 12 rows -- NOTE(review): fewer than the 20 cited above; confirm
this meets the minimum for this quantizer
+ sql """
+ INSERT INTO tbl_hnsw_sq_sufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]),
+ (4, [4.0, 5.0, 6.0, 7.0]),
+ (5, [5.0, 6.0, 7.0, 8.0]),
+ (6, [6.0, 7.0, 8.0, 9.0]),
+ (7, [7.0, 8.0, 9.0, 10.0]),
+ (8, [8.0, 9.0, 10.0, 11.0]),
+ (9, [9.0, 10.0, 11.0, 12.0]),
+ (10, [10.0, 11.0, 12.0, 13.0]),
+ (11, [11.0, 12.0, 13.0, 14.0]),
+ (12, [12.0, 13.0, 14.0, 15.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select count(*) from tbl_hnsw_sq_sufficient_data;"
+
+}
diff --git a/regression-test/suites/ann_index_p0/small_segment.groovy
b/regression-test/suites/ann_index_p0/small_segment.groovy
new file mode 100644
index 00000000000..cb40ba951f2
--- /dev/null
+++ b/regression-test/suites/ann_index_p0/small_segment.groovy
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("small_segment") {
+ sql "set enable_common_expr_pushdown=true;"
+
+ // Test that ANN index is not built when segment size is smaller than
required training rows
+ sql "drop table if exists tbl_small_segment"
+ sql """
+ CREATE TABLE tbl_small_segment (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="10",
+ "dim"="3"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than nlist (10), this should not throw an exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_small_segment VALUES
+ (1, [1.0, 2.0, 3.0]),
+ (2, [0.5, 2.1, 2.9]),
+ (3, [10.0, 10.0, 10.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_small_segment order by id;"
+
+ // Test range search with IVF (should fall back to brute force since index
is not built)
+ qt_sql "select * from tbl_small_segment where
l2_distance_approximate(embedding, [1.0,2.0,3.0]) < 5.0 order by id;"
+
+ // Test with HNSW index as well - HNSW should work even with small segments
+ sql "drop table if exists tbl_small_segment_hnsw"
+ sql """
+ CREATE TABLE tbl_small_segment_hnsw (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="3",
+ "quantizer"="pq",
+ "pq_m"="1",
+ "pq_nbits"="2"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert small amount of data - HNSW should work
+ sql """
+ INSERT INTO tbl_small_segment_hnsw VALUES
+ (1, [1.0, 2.0, 3.0]),
+ (2, [0.5, 2.1, 2.9]),
+ (3, [10.0, 10.0, 10.0]);
+ """
+
+ qt_sql "select * from tbl_small_segment_hnsw order by id;"
+
+ // Test approximate search with HNSW (should work)
+ sql "select id, l2_distance_approximate(embedding, [1.0,2.0,3.0]) as dist
from tbl_small_segment_hnsw order by dist limit 2;"
+
+ // Test range search with HNSW (should work)
+ qt_sql "select * from tbl_small_segment_hnsw where
l2_distance_approximate(embedding, [1.0,2.0,3.0]) < 5.0 order by id;"
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]