This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 28a82a4e10a [opt](Ann) Cancel index building if input rows is less
than the min_train_rows (#60358)
28a82a4e10a is described below
commit 28a82a4e10ad7e3c91b6e3442b93f76296734c1d
Author: zhiqiang <[email protected]>
AuthorDate: Wed Mar 25 10:18:42 2026 +0800
[opt](Ann) Cancel index building if input rows is less than the
min_train_rows (#60358)
Before this change, when the amount of data available to train the index was
less than the required minimum, an import or compaction could fail, which
severely impacted the user experience. Now, in such cases, the index writer
automatically determines whether training and index generation are needed.
When the amount of data is completely insufficient, index construction is
skipped, and queries fall back to brute-force computation for that segment.
The minimum number of training rows (min_train_rows) is calculated as follows:
1. IVF requires at least nlist rows.
2. PQ requires at least 2^pq_nbits * 100 rows.
The maximum of the two is taken as the required minimum number of rows.
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/exec/operator/olap_scan_operator.cpp | 2 +
be/src/exec/operator/olap_scan_operator.h | 2 +
be/src/exec/scan/olap_scanner.cpp | 2 +
be/src/exprs/vectorized_fn_call.cpp | 16 +-
be/src/storage/index/ann/ann_index.h | 11 +
be/src/storage/index/ann/ann_index_iterator.cpp | 11 +
be/src/storage/index/ann/ann_index_iterator.h | 4 +
be/src/storage/index/ann/ann_index_reader.cpp | 34 +-
be/src/storage/index/ann/ann_index_reader.h | 4 +
be/src/storage/index/ann/ann_index_writer.cpp | 52 ++-
be/src/storage/index/ann/ann_index_writer.h | 3 +
be/src/storage/index/ann/ann_search_params.h | 10 +-
be/src/storage/index/ann/ann_topn_runtime.cpp | 7 +-
be/src/storage/index/ann/ann_topn_runtime.h | 7 +-
be/src/storage/index/ann/faiss_ann_index.cpp | 26 ++
be/src/storage/index/ann/faiss_ann_index.h | 10 +
be/src/storage/index/index_file_writer.cpp | 3 +-
be/src/storage/olap_common.h | 1 +
be/src/storage/segment/segment_iterator.cpp | 41 +-
.../storage/index/ann/ann_index_writer_test.cpp | 430 +++++++++++++++++++++
.../data/ann_index_p0/product_quantization.out | 3 +
.../data/ann_index_p0/quantizer_min_train_rows.out | 32 ++
.../data/ann_index_p0/small_segment.out | 19 +
.../suites/ann_index_p0/ivf_index_test.groovy | 15 +-
.../ann_index_p0/product_quantization.groovy | 6 +-
.../ann_index_p0/quantizer_min_train_rows.groovy | 300 ++++++++++++++
.../suites/ann_index_p0/small_segment.groovy | 90 +++++
27 files changed, 1088 insertions(+), 53 deletions(-)
diff --git a/be/src/exec/operator/olap_scan_operator.cpp
b/be/src/exec/operator/olap_scan_operator.cpp
index 083d1326f9e..d94d9598dab 100644
--- a/be/src/exec/operator/olap_scan_operator.cpp
+++ b/be/src/exec/operator/olap_scan_operator.cpp
@@ -363,6 +363,8 @@ Status OlapScanLocalState::_init_profile() {
_ann_range_result_convert_costs =
ADD_CHILD_TIMER(_segment_profile,
"AnnIndexRangeResultConvertCosts",
"AnnIndexRangeResultPostProcessCosts");
+ _ann_fallback_brute_force_cnt =
+ ADD_COUNTER(_segment_profile, "AnnIndexFallbackBruteForceCnt",
TUnit::UNIT);
_variant_scan_sparse_column_timer = ADD_TIMER(_segment_profile,
"VariantScanSparseColumnTimer");
_variant_scan_sparse_column_bytes =
ADD_COUNTER(_segment_profile, "VariantScanSparseColumnBytes",
TUnit::BYTES);
diff --git a/be/src/exec/operator/olap_scan_operator.h
b/be/src/exec/operator/olap_scan_operator.h
index 97143c6c601..c03c317c73f 100644
--- a/be/src/exec/operator/olap_scan_operator.h
+++ b/be/src/exec/operator/olap_scan_operator.h
@@ -257,6 +257,8 @@ private:
RuntimeProfile::Counter* _ann_range_engine_convert_costs = nullptr;
RuntimeProfile::Counter* _ann_range_result_convert_costs = nullptr;
+ RuntimeProfile::Counter* _ann_fallback_brute_force_cnt = nullptr;
+
RuntimeProfile::Counter* _output_index_result_column_timer = nullptr;
// number of segment filtered by column stat when creating seg iterator
diff --git a/be/src/exec/scan/olap_scanner.cpp
b/be/src/exec/scan/olap_scanner.cpp
index 8757617a2cc..b1f6f8531b9 100644
--- a/be/src/exec/scan/olap_scanner.cpp
+++ b/be/src/exec/scan/olap_scanner.cpp
@@ -946,6 +946,8 @@ void OlapScanner::_collect_profile_before_close() {
COUNTER_UPDATE(local_state->_ann_topn_result_convert_costs,
stats.ann_index_topn_result_process_ns);
+ COUNTER_UPDATE(local_state->_ann_fallback_brute_force_cnt,
stats.ann_fall_back_brute_force_cnt);
+
// Overhead counter removed; precise instrumentation is reported via
engine_prepare above.
}
diff --git a/be/src/exprs/vectorized_fn_call.cpp
b/be/src/exprs/vectorized_fn_call.cpp
index 2194200601d..0a9165ed55e 100644
--- a/be/src/exprs/vectorized_fn_call.cpp
+++ b/be/src/exprs/vectorized_fn_call.cpp
@@ -610,13 +610,27 @@ Status VectorizedFnCall::evaluate_ann_range_search(
range_search_runtime.dim, index_dim);
}
+ auto stats = std::make_unique<segment_v2::AnnIndexStats>();
+ // Track load index timing
+ {
+ SCOPED_TIMER(&(stats->load_index_costs_ns));
+ if (!ann_index_iterator->try_load_index()) {
+ VLOG_DEBUG << "ANN range search skipped: "
+ << fmt::format("Failed to load ANN index for column cid
{}", src_col_cid);
+ ann_index_stats.fall_back_brute_force_cnt += 1;
+ return Status::OK();
+ }
+ double load_costs_ms =
static_cast<double>(stats->load_index_costs_ns.value()) / 1000000.0;
+ DorisMetrics::instance()->ann_index_load_costs_ms->increment(
+ static_cast<int64_t>(load_costs_ms));
+ }
+
AnnRangeSearchParams params =
range_search_runtime.to_range_search_params();
params.roaring = &row_bitmap;
DCHECK(params.roaring != nullptr);
DCHECK(params.query_value != nullptr);
segment_v2::AnnRangeSearchResult result;
- auto stats = std::make_unique<segment_v2::AnnIndexStats>();
RETURN_IF_ERROR(ann_index_iterator->range_search(params,
range_search_runtime.user_params,
&result, stats.get()));
diff --git a/be/src/storage/index/ann/ann_index.h
b/be/src/storage/index/ann/ann_index.h
index 07ac7065719..da83fb4c584 100644
--- a/be/src/storage/index/ann/ann_index.h
+++ b/be/src/storage/index/ann/ann_index.h
@@ -89,6 +89,17 @@ public:
*/
virtual doris::Status add(Int64 n, const float* x) = 0;
+ /**
+ * @brief Returns the minimum number of rows required for training the
index.
+ *
+ * Some index types (like IVF) require a minimum number of training points.
+ * For example, IVF requires at least 'nlist' training points.
+ * HNSW does not require any minimum and returns 0.
+ *
+ * @return Minimum number of rows required for training
+ */
+ virtual Int64 get_min_train_rows() const { return 0; }
+
/** Return approximate nearest neighbors of a query vector.
* The result is stored in the result object.
* @param query_vec input vector, size d
diff --git a/be/src/storage/index/ann/ann_index_iterator.cpp
b/be/src/storage/index/ann/ann_index_iterator.cpp
index ff9eb760a35..fc622f69857 100644
--- a/be/src/storage/index/ann/ann_index_iterator.cpp
+++ b/be/src/storage/index/ann/ann_index_iterator.cpp
@@ -27,6 +27,17 @@ AnnIndexIterator::AnnIndexIterator(const IndexReaderPtr&
reader) : IndexIterator
_ann_reader = std::dynamic_pointer_cast<AnnIndexReader>(reader);
}
+bool AnnIndexIterator::try_load_index() {
+ if (_ann_reader == nullptr) {
+ LOG(WARNING) << "AnnIndexIterator::try_load_index: _ann_reader is
null";
+ return false;
+ }
+
+ // _context may be unset in some test scenarios; pass nullptr IOContext in
that case.
+ io::IOContext* io_ctx = (_context != nullptr) ? _context->io_ctx : nullptr;
+ return _ann_reader->try_load_index(io_ctx);
+}
+
Status AnnIndexIterator::read_from_index(const IndexParam& param) {
auto* a_param = std::get<segment_v2::AnnTopNParam*>(param);
if (a_param == nullptr) {
diff --git a/be/src/storage/index/ann/ann_index_iterator.h
b/be/src/storage/index/ann/ann_index_iterator.h
index 4b63d71dd72..d0e27a719e8 100644
--- a/be/src/storage/index/ann/ann_index_iterator.h
+++ b/be/src/storage/index/ann/ann_index_iterator.h
@@ -44,6 +44,10 @@ public:
Result<bool> has_null() override { return true; }
+ // Try to load index, return true if successful, false if failed
+ // This method should be called before read_from_index or range_search
+ bool try_load_index();
+
MOCK_FUNCTION Status range_search(const AnnRangeSearchParams& params,
const VectorSearchUserParams&
custom_params,
AnnRangeSearchResult* result,
AnnIndexStats* stats);
diff --git a/be/src/storage/index/ann/ann_index_reader.cpp
b/be/src/storage/index/ann/ann_index_reader.cpp
index 2833d27bf6c..5fb94677f40 100644
--- a/be/src/storage/index/ann/ann_index_reader.cpp
+++ b/be/src/storage/index/ann/ann_index_reader.cpp
@@ -74,6 +74,7 @@ Status AnnIndexReader::load_index(io::IOContext* io_ctx) {
DorisMetrics::instance()->ann_index_load_cnt->increment(1);
try {
+ // An exception will be thrown if loading fails
RETURN_IF_ERROR(
_index_file_reader->init(config::inverted_index_read_buffer_size, io_ctx));
Result<std::unique_ptr<DorisCompoundReader, DirectoryDeleter>>
compound_dir;
@@ -87,6 +88,7 @@ Status AnnIndexReader::load_index(io::IOContext* io_ctx) {
_vector_index->set_type(_index_type);
RETURN_IF_ERROR(_vector_index->load(compound_dir->get()));
} catch (CLuceneError& err) {
+ LOG_ERROR("Failed to load ann index: {}", err.what());
return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
"CLuceneError occur when open ann idx file, error msg:
{}", err.what());
}
@@ -94,16 +96,22 @@ Status AnnIndexReader::load_index(io::IOContext* io_ctx) {
});
}
-Status AnnIndexReader::query(io::IOContext* io_ctx, AnnTopNParam* param,
AnnIndexStats* stats) {
+bool AnnIndexReader::try_load_index(io::IOContext* io_ctx) {
#ifndef BE_TEST
- {
- SCOPED_TIMER(&(stats->load_index_costs_ns));
- RETURN_IF_ERROR(load_index(io_ctx));
- double load_costs_ms =
static_cast<double>(stats->load_index_costs_ns.value()) / 1000.0;
- DorisMetrics::instance()->ann_index_load_costs_ms->increment(
- static_cast<int64_t>(load_costs_ms));
+ Status st = load_index(io_ctx);
+ if (!st.ok()) {
+ LOG_WARNING("Failed to load ann index, will fallback to brute force
search: {}",
+ st.to_string());
+ return false;
}
#endif
+ return true;
+}
+
+Status AnnIndexReader::query(io::IOContext* io_ctx, AnnTopNParam* param,
AnnIndexStats* stats) {
+ // Index should be loaded before calling query
+ DCHECK(_vector_index != nullptr);
+
{
DorisMetrics::instance()->ann_index_search_cnt->increment(1);
SCOPED_TIMER(&(stats->search_costs_ns));
@@ -162,16 +170,10 @@ Status AnnIndexReader::range_search(const
AnnRangeSearchParams& params,
const VectorSearchUserParams&
custom_params,
segment_v2::AnnRangeSearchResult* result,
segment_v2::AnnIndexStats* stats,
io::IOContext* io_ctx) {
+ // Index should be loaded before calling range_search
+ DCHECK(_vector_index != nullptr);
+
DCHECK(stats != nullptr);
-#ifndef BE_TEST
- {
- SCOPED_TIMER(&(stats->load_index_costs_ns));
- RETURN_IF_ERROR(load_index(io_ctx));
- double load_costs_ms =
static_cast<double>(stats->load_index_costs_ns.value()) / 1000.0;
- DorisMetrics::instance()->ann_index_load_costs_ms->increment(
- static_cast<int64_t>(load_costs_ms));
- }
-#endif
{
DorisMetrics::instance()->ann_index_search_cnt->increment(1);
SCOPED_TIMER(&(stats->search_costs_ns));
diff --git a/be/src/storage/index/ann/ann_index_reader.h
b/be/src/storage/index/ann/ann_index_reader.h
index 06f864afdd1..7ab677fc279 100644
--- a/be/src/storage/index/ann/ann_index_reader.h
+++ b/be/src/storage/index/ann/ann_index_reader.h
@@ -45,6 +45,10 @@ public:
Status load_index(io::IOContext* io_ctx);
+ // Try to load index, return true if successful, false if failed
+ // This method is used to check if index can be loaded before query
+ bool try_load_index(io::IOContext* io_ctx);
+
Status query(io::IOContext* io_ctx, AnnTopNParam* param, AnnIndexStats*
stats);
Status range_search(const AnnRangeSearchParams& params,
diff --git a/be/src/storage/index/ann/ann_index_writer.cpp
b/be/src/storage/index/ann/ann_index_writer.cpp
index a406454515b..7690ee764ab 100644
--- a/be/src/storage/index/ann/ann_index_writer.cpp
+++ b/be/src/storage/index/ann/ann_index_writer.cpp
@@ -127,6 +127,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t
field_size, const void* val
RETURN_IF_ERROR(
_vector_index->add(AnnIndexColumnWriter::chunk_size(),
_float_array.data()));
_float_array.clear();
+ _need_save_index = true;
}
}
@@ -151,16 +152,55 @@ int64_t AnnIndexColumnWriter::size() const {
}
Status AnnIndexColumnWriter::finish() {
+ Int64 min_train_rows = _vector_index->get_min_train_rows();
+
+ // Check if we have enough rows to train the index
// train/add the remaining data
- if (!_float_array.empty()) {
+ if (_float_array.empty()) {
+ if (_need_save_index) {
+ return _vector_index->save(_dir.get());
+ } else {
+ // No data was added at all. This can happen if the segment has 0
rows
+ // or all rows were filtered out. We need to delete the directory
entry
+ // to avoid writing an empty/invalid index file.
+ LOG_INFO("No data to train/add for ANN index. Skipping index
building.");
+ return _index_file_writer->delete_index(_index_meta);
+ }
+ } else {
DCHECK(_float_array.size() % _vector_index->get_dimension() == 0);
+
Int64 num_rows = _float_array.size() / _vector_index->get_dimension();
- RETURN_IF_ERROR(_vector_index->train(num_rows, _float_array.data()));
- RETURN_IF_ERROR(_vector_index->add(num_rows, _float_array.data()));
- _float_array.clear();
- }
- return _vector_index->save(_dir.get());
+ if (num_rows >= min_train_rows) {
+ RETURN_IF_ERROR(_vector_index->train(num_rows,
_float_array.data()));
+ RETURN_IF_ERROR(_vector_index->add(num_rows, _float_array.data()));
+ _float_array.clear();
+ return _vector_index->save(_dir.get());
+ } else {
+ // It happens to have not enough data to train.
+ // If we have data to add before, we still need to save the index.
+ if (_need_save_index) {
+ // For IVF indexes, adding remaining vectors without training
is acceptable
+ // because the quantizer was already trained on previous
batches. These vectors
+ // are simply added to the nearest clusters without retraining.
+ RETURN_IF_ERROR(_vector_index->add(num_rows,
_float_array.data()));
+ _float_array.clear();
+ return _vector_index->save(_dir.get());
+ } else {
+ // Not enough data to train and no data added before.
+ // Means this is a very small segment, we can skip the index
building.
+ // We need to delete the directory entry from
index_file_writer to avoid
+ // writing an empty/invalid index file which causes
"IndexInput read past EOF" error.
+ LOG_INFO(
+ "Remaining data size {} is less than minimum {} rows
required for ANN "
+ "index "
+ "training. Skipping index building for this segment.",
+ num_rows, min_train_rows);
+ _float_array.clear();
+ return _index_file_writer->delete_index(_index_meta);
+ }
+ }
+ }
}
#include "common/compile_check_end.h"
} // namespace doris::segment_v2
diff --git a/be/src/storage/index/ann/ann_index_writer.h
b/be/src/storage/index/ann/ann_index_writer.h
index 9c6fe9cf978..d8524b37485 100644
--- a/be/src/storage/index/ann/ann_index_writer.h
+++ b/be/src/storage/index/ann/ann_index_writer.h
@@ -79,10 +79,13 @@ private:
// VectorIndex should be weak shared by AnnIndexWriter and
VectorIndexReader
// This should be a weak_ptr
std::shared_ptr<VectorIndex> _vector_index;
+ // _float_array is used to buffer the float data before training/adding to
vector index
+ // if we dont do this, the performance(recall) will be very poor when
adding small number of vectors one by one
PODArray<float> _float_array;
IndexFileWriter* _index_file_writer;
const TabletIndex* _index_meta;
std::shared_ptr<DorisFSDirectory> _dir;
+ bool _need_save_index = false;
};
#include "common/compile_check_end.h"
} // namespace doris::segment_v2
diff --git a/be/src/storage/index/ann/ann_search_params.h
b/be/src/storage/index/ann/ann_search_params.h
index 06bae2742bd..37ba6f875e4 100644
--- a/be/src/storage/index/ann/ann_search_params.h
+++ b/be/src/storage/index/ann/ann_search_params.h
@@ -49,7 +49,8 @@ struct AnnIndexStats {
engine_search_ns(TUnit::TIME_NS, 0),
result_process_costs_ns(TUnit::TIME_NS, 0),
engine_convert_ns(TUnit::TIME_NS, 0),
- engine_prepare_ns(TUnit::TIME_NS, 0) {}
+ engine_prepare_ns(TUnit::TIME_NS, 0),
+ fall_back_brute_force_cnt(0) {}
AnnIndexStats(const AnnIndexStats& other)
: search_costs_ns(TUnit::TIME_NS, other.search_costs_ns.value()),
@@ -57,7 +58,8 @@ struct AnnIndexStats {
engine_search_ns(TUnit::TIME_NS, other.engine_search_ns.value()),
result_process_costs_ns(TUnit::TIME_NS,
other.result_process_costs_ns.value()),
engine_convert_ns(TUnit::TIME_NS,
other.engine_convert_ns.value()),
- engine_prepare_ns(TUnit::TIME_NS,
other.engine_prepare_ns.value()) {}
+ engine_prepare_ns(TUnit::TIME_NS,
other.engine_prepare_ns.value()),
+ fall_back_brute_force_cnt(other.fall_back_brute_force_cnt) {}
AnnIndexStats& operator=(const AnnIndexStats& other) {
if (this != &other) {
@@ -67,6 +69,7 @@ struct AnnIndexStats {
result_process_costs_ns.set(other.result_process_costs_ns.value());
engine_convert_ns.set(other.engine_convert_ns.value());
engine_prepare_ns.set(other.engine_prepare_ns.value());
+ fall_back_brute_force_cnt = other.fall_back_brute_force_cnt;
}
return *this;
}
@@ -77,7 +80,8 @@ struct AnnIndexStats {
RuntimeProfile::Counter result_process_costs_ns; // time cost of
processing search results
RuntimeProfile::Counter engine_convert_ns; // time cost of
engine-side conversions
RuntimeProfile::Counter
- engine_prepare_ns; // time cost before engine search (allocations,
setup)
+ engine_prepare_ns; // time cost before engine search
(allocations, setup)
+ int64_t fall_back_brute_force_cnt; // fallback count when ANN range search
is bypassed
};
struct AnnTopNParam {
diff --git a/be/src/storage/index/ann/ann_topn_runtime.cpp
b/be/src/storage/index/ann/ann_topn_runtime.cpp
index 8b63fe8ad08..488dff5f6a3 100644
--- a/be/src/storage/index/ann/ann_topn_runtime.cpp
+++ b/be/src/storage/index/ann/ann_topn_runtime.cpp
@@ -191,15 +191,12 @@ Status AnnTopNRuntime::prepare(RuntimeState* state, const
RowDescriptor& row_des
return Status::OK();
}
-Status AnnTopNRuntime::evaluate_vector_ann_search(segment_v2::IndexIterator*
ann_index_iterator,
+Status
AnnTopNRuntime::evaluate_vector_ann_search(segment_v2::AnnIndexIterator*
ann_index_iterator,
roaring::Roaring* roaring,
size_t rows_of_segment,
IColumn::MutablePtr&
result_column,
std::unique_ptr<std::vector<uint64_t>>& row_ids,
segment_v2::AnnIndexStats&
ann_index_stats) {
DCHECK(ann_index_iterator != nullptr);
- segment_v2::AnnIndexIterator* ann_index_iterator_casted =
- dynamic_cast<segment_v2::AnnIndexIterator*>(ann_index_iterator);
- DCHECK(ann_index_iterator_casted != nullptr);
DCHECK(_order_by_expr_ctx != nullptr);
DCHECK(_order_by_expr_ctx->root() != nullptr);
size_t query_array_size = _query_array->size();
@@ -209,7 +206,7 @@ Status
AnnTopNRuntime::evaluate_vector_ann_search(segment_v2::IndexIterator* ann
// TODO:(zhiqiang) Maybe we can move this dimension check to prepare phase.
- auto index_reader =
ann_index_iterator_casted->get_reader(AnnIndexReaderType::ANN);
+ auto index_reader =
ann_index_iterator->get_reader(AnnIndexReaderType::ANN);
auto ann_index_reader =
std::dynamic_pointer_cast<AnnIndexReader>(index_reader);
DCHECK(ann_index_reader != nullptr);
if (ann_index_reader->get_dimension() != query_array_size) {
diff --git a/be/src/storage/index/ann/ann_topn_runtime.h
b/be/src/storage/index/ann/ann_topn_runtime.h
index 6d40a32b349..59cb444b517 100644
--- a/be/src/storage/index/ann/ann_topn_runtime.h
+++ b/be/src/storage/index/ann/ann_topn_runtime.h
@@ -49,6 +49,7 @@
namespace doris::segment_v2 {
#include "common/compile_check_begin.h"
struct AnnIndexStats;
+class AnnIndexIterator;
Result<IColumn::Ptr> extract_query_vector(std::shared_ptr<VExpr> arg_expr);
@@ -67,7 +68,7 @@ Result<IColumn::Ptr>
extract_query_vector(std::shared_ptr<VExpr> arg_expr);
* - Thread-safe execution in parallel query contexts
*
* Typical usage in SQL:
- * SELECT * FROM table ORDER BY l2_distance(vec_column, [1,2,3]) LIMIT 10;
+ * SELECT * FROM table ORDER BY l2_distance_approximate(vec_column, [1,2,3])
LIMIT 10;
*/
class AnnTopNRuntime {
ENABLE_FACTORY_CREATOR(AnnTopNRuntime);
@@ -116,7 +117,7 @@ public:
* @param ann_index_stats Statistics collector for performance monitoring
* @return Status indicating success or failure
*/
- Status evaluate_vector_ann_search(segment_v2::IndexIterator*
ann_index_iterator,
+ Status evaluate_vector_ann_search(segment_v2::AnnIndexIterator*
ann_index_iterator,
roaring::Roaring* row_bitmap, size_t
rows_of_segment,
IColumn::MutablePtr& result_column,
std::unique_ptr<std::vector<uint64_t>>&
row_ids,
@@ -167,4 +168,4 @@ private:
doris::VectorSearchUserParams _user_params; ///< User-defined search
parameters
};
#include "common/compile_check_end.h"
-} // namespace doris::segment_v2
\ No newline at end of file
+} // namespace doris::segment_v2
diff --git a/be/src/storage/index/ann/faiss_ann_index.cpp
b/be/src/storage/index/ann/faiss_ann_index.cpp
index 70a2ed01bc9..448dcee5749 100644
--- a/be/src/storage/index/ann/faiss_ann_index.cpp
+++ b/be/src/storage/index/ann/faiss_ann_index.cpp
@@ -296,6 +296,32 @@ doris::Status FaissVectorIndex::add(Int64 n, const float*
vec) {
return doris::Status::OK();
}
+Int64 FaissVectorIndex::get_min_train_rows() const {
+ // For IVF indexes, the minimum number of training points should be at
least
+ // equal to the number of clusters (nlist). FAISS requires this for
k-means clustering.
+ Int64 ivf_min = 0;
+ if (_params.index_type == FaissBuildParameter::IndexType::IVF) {
+ ivf_min = _params.ivf_nlist;
+ }
+
+ // Calculate minimum training rows required by the quantizer
+ Int64 quantizer_min = 0;
+ if (_params.quantizer == FaissBuildParameter::Quantizer::PQ) {
+ // For PQ, FAISS uses ksub = 2^pq_nbits and recommends ksub * 100
training vectors.
+ // This threshold depends on pq_nbits only (independent of pq_m).
+ // See code from contrib/faiss/faiss/impl/ProductQuantizer.cpp::65
+ quantizer_min = (1LL << _params.pq_nbits) * 100;
+ } else if (_params.quantizer == FaissBuildParameter::Quantizer::SQ4 ||
+ _params.quantizer == FaissBuildParameter::Quantizer::SQ8) {
+ // For SQ, minimal training requirement as scalar quantization is
simpler
+ quantizer_min = 1;
+ }
+ // For FLAT, no minimum training data required
+
+ // Return the maximum of IVF and quantizer requirements
+ return std::max(ivf_min, quantizer_min);
+}
+
void FaissVectorIndex::build(const FaissBuildParameter& params) {
_params = params;
_dimension = params.dim;
diff --git a/be/src/storage/index/ann/faiss_ann_index.h
b/be/src/storage/index/ann/faiss_ann_index.h
index 2a0e5aecda6..5560c6fc2d6 100644
--- a/be/src/storage/index/ann/faiss_ann_index.h
+++ b/be/src/storage/index/ann/faiss_ann_index.h
@@ -208,6 +208,16 @@ public:
*/
doris::Status add(Int64 n, const float* vec) override;
+ /**
+ * @brief Returns the minimum number of rows required for training the
index.
+ *
+ * For IVF index types, this returns ivf_nlist (the number of clusters).
+ * For HNSW, this returns 0 as it doesn't require minimum training data.
+ *
+ * @return Minimum number of rows required for training
+ */
+ Int64 get_min_train_rows() const override;
+
/**
* @brief Sets the build parameters for the index.
*
diff --git a/be/src/storage/index/index_file_writer.cpp
b/be/src/storage/index/index_file_writer.cpp
index ec3a8b40a69..9acfadf4172 100644
--- a/be/src/storage/index/index_file_writer.cpp
+++ b/be/src/storage/index/index_file_writer.cpp
@@ -241,8 +241,7 @@ Status IndexFileWriter::finish_close() {
if (_idx_v2_writer != nullptr && _idx_v2_writer->state() !=
io::FileWriter::State::CLOSED) {
RETURN_IF_ERROR(_idx_v2_writer->close(false));
}
- LOG_INFO("IndexFileWriter finish_close, enable_write_index_searcher_cache:
{}",
- config::enable_write_index_searcher_cache);
+
Status st = Status::OK();
if (config::enable_write_index_searcher_cache) {
st = add_into_searcher_cache();
diff --git a/be/src/storage/olap_common.h b/be/src/storage/olap_common.h
index ac8634e16ab..d1df2059c75 100644
--- a/be/src/storage/olap_common.h
+++ b/be/src/storage/olap_common.h
@@ -398,6 +398,7 @@ struct OlapReaderStatistics {
int64_t ann_range_result_convert_ns = 0; // time spent processing range
results
int64_t ann_range_engine_convert_ns = 0; // time spent on FAISS-side
conversions (Range)
int64_t rows_ann_index_range_filtered = 0;
+ int64_t ann_fall_back_brute_force_cnt = 0;
int64_t output_index_result_column_timer = 0;
// number of segment filtered by column stat when creating seg iterator
diff --git a/be/src/storage/segment/segment_iterator.cpp
b/be/src/storage/segment/segment_iterator.cpp
index d6612071efc..cc0a117ce33 100644
--- a/be/src/storage/segment/segment_iterator.cpp
+++ b/be/src/storage/segment/segment_iterator.cpp
@@ -76,6 +76,7 @@
#include "storage/field.h"
#include "storage/id_manager.h"
#include "storage/index/ann/ann_index.h"
+#include "storage/index/ann/ann_index_iterator.h"
#include "storage/index/ann/ann_index_reader.h"
#include "storage/index/ann/ann_topn_runtime.h"
#include "storage/index/index_file_reader.h"
@@ -857,6 +858,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
has_ann_index, has_common_expr_push_down,
has_column_predicate);
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
@@ -870,6 +872,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
"Asc topn for inner product can not be evaluated by ann
index");
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
} else {
@@ -877,6 +880,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
VLOG_DEBUG << fmt::format("Desc topn for l2/cosine can not be
evaluated by ann index");
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
}
@@ -889,6 +893,7 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
metric_to_string(ann_index_reader->get_metric_type()));
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
@@ -902,14 +907,41 @@ Status SegmentIterator::_apply_ann_topn_predicate() {
pre_size, rows_of_segment);
// Disable index-only scan on ann indexed column.
_need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
return Status::OK();
}
IColumn::MutablePtr result_column;
std::unique_ptr<std::vector<uint64_t>> result_row_ids;
segment_v2::AnnIndexStats ann_index_stats;
-
RETURN_IF_ERROR(_ann_topn_runtime->evaluate_vector_ann_search(ann_index_iterator,
&_row_bitmap,
-
rows_of_segment, result_column,
-
result_row_ids, ann_index_stats));
+
+ // Try to load ANN index before search
+ auto ann_index_iterator_casted =
+ dynamic_cast<segment_v2::AnnIndexIterator*>(ann_index_iterator);
+ if (ann_index_iterator_casted == nullptr) {
+ VLOG_DEBUG << "Failed to cast index iterator to AnnIndexIterator,
fallback to brute force";
+ _need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
+ return Status::OK();
+ }
+
+ // Track load index timing
+ {
+ SCOPED_TIMER(&(ann_index_stats.load_index_costs_ns));
+ if (!ann_index_iterator_casted->try_load_index()) {
+ VLOG_DEBUG << "Failed to load ANN index, fallback to brute force
search";
+ _need_read_data_indices[src_cid] = true;
+ _opts.stats->ann_fall_back_brute_force_cnt += 1;
+ return Status::OK();
+ }
+ double load_costs_ms =
+
static_cast<double>(ann_index_stats.load_index_costs_ns.value()) / 1000000.0;
+ DorisMetrics::instance()->ann_index_load_costs_ms->increment(
+ static_cast<int64_t>(load_costs_ms));
+ }
+
+ RETURN_IF_ERROR(_ann_topn_runtime->evaluate_vector_ann_search(
+ ann_index_iterator_casted, &_row_bitmap, rows_of_segment,
result_column, result_row_ids,
+ ann_index_stats));
VLOG_DEBUG << fmt::format("Ann topn filtered {} - {} = {} rows", pre_size,
_row_bitmap.cardinality(), pre_size -
_row_bitmap.cardinality());
@@ -1163,8 +1195,8 @@ Status SegmentIterator::_apply_index_expr() {
}
// Apply ann range search
- segment_v2::AnnIndexStats ann_index_stats;
for (const auto& expr_ctx : _common_expr_ctxs_push_down) {
+ segment_v2::AnnIndexStats ann_index_stats;
size_t origin_rows = _row_bitmap.cardinality();
RETURN_IF_ERROR(expr_ctx->evaluate_ann_range_search(
_index_iterators, _schema->column_ids(), _column_iterators,
@@ -1176,6 +1208,7 @@ Status SegmentIterator::_apply_index_expr() {
_opts.stats->ann_range_result_convert_ns +=
ann_index_stats.result_process_costs_ns.value();
_opts.stats->ann_range_engine_convert_ns +=
ann_index_stats.engine_convert_ns.value();
_opts.stats->ann_range_pre_process_ns +=
ann_index_stats.engine_prepare_ns.value();
+ _opts.stats->ann_fall_back_brute_force_cnt +=
ann_index_stats.fall_back_brute_force_cnt;
}
for (auto it = _common_expr_ctxs_push_down.begin(); it !=
_common_expr_ctxs_push_down.end();) {
diff --git a/be/test/storage/index/ann/ann_index_writer_test.cpp
b/be/test/storage/index/ann/ann_index_writer_test.cpp
index dfb8089ee28..9035c79320c 100644
--- a/be/test/storage/index/ann/ann_index_writer_test.cpp
+++ b/be/test/storage/index/ann/ann_index_writer_test.cpp
@@ -53,6 +53,7 @@ public:
(override));
MOCK_METHOD(doris::Status, save, (lucene::store::Directory * dir),
(override));
MOCK_METHOD(doris::Status, load, (lucene::store::Directory * dir),
(override));
+ MOCK_METHOD(Int64, get_min_train_rows, (), (const, override));
};
class TestAnnIndexColumnWriter : public AnnIndexColumnWriter {
@@ -61,6 +62,7 @@ public:
: AnnIndexColumnWriter(index_file_writer, index_meta) {}
void set_vector_index(std::shared_ptr<VectorIndex> index) { _vector_index
= index; }
+ void set_need_save_index(bool value) { _need_save_index = value; }
};
class AnnIndexWriterTest : public ::testing::Test {
@@ -654,4 +656,432 @@ TEST_F(AnnIndexWriterTest, TestAddMoreThanChunkSizeIVF) {
EXPECT_TRUE(status.ok());
}
+TEST_F(AnnIndexWriterTest, TestSkipTrainWhenRemainderLessThanNlist) {
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto properties = _properties;
+ properties["index_type"] = "ivf";
+ properties["nlist"] = "5"; // Set nlist to 5
+ properties["quantizer"] = "flat";
+
+ auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_properties = properties;
+ tablet_index->_index_id = 1;
+
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // CHUNK_SIZE = 10, nlist = 5
+ // Add 12 rows: first 10 will be trained/added in one batch, remaining 2 <
5
+ // Since we have trained data before (_need_save_index = true), we should
add the remaining 2 rows and save
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(5));
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(1)
+ .WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(2,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add 12 rows total
+ {
+ const size_t num_rows = 10;
+ std::vector<float> vectors(10 * 4);
+ for (size_t i = 0; i < 10 * 4; ++i) {
+ vectors[i] = static_cast<float>(i);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * 4);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Add 2 more rows
+ {
+ const size_t num_rows = 2;
+ std::vector<float> vectors = {
+ 40.0f, 41.0f, 42.0f, 43.0f, // Row 10
+ 44.0f, 45.0f, 46.0f, 47.0f // Row 11
+ };
+ std::vector<size_t> offsets = {0, 4, 8};
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestLargeDataVolumeWithRemainderSkip) {
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto properties = _properties;
+ properties["index_type"] = "ivf";
+ properties["nlist"] = "3"; // Set nlist to 3
+ properties["quantizer"] = "flat";
+
+ auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_properties = properties;
+ tablet_index->_index_id = 1;
+
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // CHUNK_SIZE = 10, nlist = 3
+ // Add 23 rows: 2 full chunks of 10, remaining 3 == nlist, so train
remaining
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(3));
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(2)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10, testing::_))
+ .Times(2)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, train(3,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(3,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add 3 batches: 10 + 10 + 3 = 23 rows
+ for (int batch = 0; batch < 2; ++batch) {
+ const size_t num_rows = 10;
+ std::vector<float> vectors(10 * 4);
+ for (size_t i = 0; i < 10 * 4; ++i) {
+ vectors[i] = static_cast<float>(batch * 40 + i);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * 4);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Add remaining 3 rows
+ {
+ const size_t num_rows = 3;
+ std::vector<float> vectors = {
+ 80.0f, 81.0f, 82.0f, 83.0f, // Row 20
+ 84.0f, 85.0f, 86.0f, 87.0f, // Row 21
+ 88.0f, 89.0f, 90.0f, 91.0f // Row 22
+ };
+ std::vector<size_t> offsets = {0, 4, 8, 12};
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestLargeDataVolumeSkipRemainder) {
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto properties = _properties;
+ properties["index_type"] = "ivf";
+ properties["nlist"] = "4"; // Set nlist to 4
+ properties["quantizer"] = "flat";
+
+ auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_properties = properties;
+ tablet_index->_index_id = 1;
+
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // CHUNK_SIZE = 10, nlist = 4
+ // Add 22 rows: 2 full chunks of 10, remaining 2 < 4
+ // Since we have trained data before (_need_save_index = true), we should
add the remaining 2 rows and save
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(4));
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(2)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10, testing::_))
+ .Times(2)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(2,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add 2 batches of 10 rows
+ for (int batch = 0; batch < 2; ++batch) {
+ const size_t num_rows = 10;
+ std::vector<float> vectors(10 * 4);
+ for (size_t i = 0; i < 10 * 4; ++i) {
+ vectors[i] = static_cast<float>(batch * 40 + i);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * 4);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Add remaining 2 rows
+ {
+ const size_t num_rows = 2;
+ std::vector<float> vectors = {
+ 80.0f, 81.0f, 82.0f, 83.0f, // Row 20
+ 84.0f, 85.0f, 86.0f, 87.0f // Row 21
+ };
+ std::vector<size_t> offsets = {0, 4, 8};
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanNlist) {
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto properties = _properties;
+ properties["index_type"] = "ivf";
+ properties["nlist"] = "5"; // Set nlist to 5
+ properties["quantizer"] = "flat";
+
+ auto tablet_index = std::make_unique<TabletIndex>();
+ tablet_index->_properties = properties;
+ tablet_index->_index_id = 1;
+
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+ writer->set_need_save_index(false); // No previous training, so should
skip entirely
+
+ // Add only 3 rows, which is less than nlist (5)
+ // Since no data was trained before (_need_save_index = false), we should
skip index building entirely
+ // No train, add, or save should be called
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(5));
+ EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0);
+ EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0);
+ EXPECT_CALL(*mock_index, save(testing::_)).Times(0);
+
+ const size_t dim = 4;
+
+ // Add 3 rows
+ {
+ const size_t num_rows = 3;
+ std::vector<float> vectors = {
+ 1.0f, 2.0f, 3.0f, 4.0f, // Row 0
+ 5.0f, 6.0f, 7.0f, 8.0f, // Row 1
+ 9.0f, 10.0f, 11.0f, 12.0f // Row 2
+ };
+ std::vector<size_t> offsets = {0, 4, 8, 12};
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestPQMinTrainRows) {
+ // Test writer behavior under a large mocked min_train_rows threshold.
+
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
_tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // Set up expectations: mock a very large min_train_rows threshold.
+ // Since we only provide 1000 vectors, which is less than 131072, training
will happen in batches
+ // and finish() will still save the index, since batches were trained
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(131072));
+ // 1000 vectors will be processed in 100 batches of 10 vectors each
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(100)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10, testing::_))
+ .Times(100)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ // Since we have trained data in batches, the index will be saved even
though total data is insufficient
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add only 1000 rows, which is less than the required 131072
+ {
+ const size_t num_rows = 1000;
+ std::vector<float> vectors(num_rows * dim);
+ for (size_t i = 0; i < num_rows * dim; ++i) {
+ vectors[i] = static_cast<float>(i % 100);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * dim);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Finish should still save the index because training happened in batches
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestSQMinTrainRows) {
+ // Test that SQ quantizer requires sufficient training data
+ // SQ requires at least nlist * 2 = 10 * 2 = 20 training vectors
+
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
_tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // Set up expectations: SQ should require at least 20 training vectors
+ // Since we only provide 15 vectors, training happens in batches and
finish() will still save the index
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(20));
+ // 15 vectors will be processed in 1 batch of 10 vectors and remaining 5
vectors
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(1)
+ .WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(5,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ // Since we have trained data, the index will be saved even though total
data is insufficient
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add only 15 rows, which is less than the required 20
+ {
+ const size_t num_rows = 15;
+ std::vector<float> vectors(num_rows * dim);
+ for (size_t i = 0; i < num_rows * dim; ++i) {
+ vectors[i] = static_cast<float>(i % 100);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * dim);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Finish should still save the index because training happened in batches
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
+TEST_F(AnnIndexWriterTest, TestPQWithSufficientData) {
+ // Test that PQ works when sufficient training data is provided
+
+ auto mock_index = std::make_shared<MockVectorIndex>();
+ auto writer =
std::make_unique<TestAnnIndexColumnWriter>(_index_file_writer.get(),
+
_tablet_index.get());
+
+ auto fs_dir = std::make_shared<DorisRAMFSDirectory>();
+ fs_dir->init(doris::io::global_local_filesystem(),
"./ut_dir/tmp_vector_search", nullptr);
+ EXPECT_CALL(*_index_file_writer,
open(testing::_)).WillOnce(testing::Return(fs_dir));
+
+ ASSERT_TRUE(writer->init().ok());
+ writer->set_vector_index(mock_index);
+
+ // Mock min_train_rows to 131072 and provide exactly that amount.
+ EXPECT_CALL(*mock_index,
get_min_train_rows()).WillRepeatedly(testing::Return(131072));
+ // Since we provide exactly 131072 vectors, they will be trained and added
in chunks
+ // Each chunk is 10 vectors, so we expect 13107 train calls and 13107 add
calls for full chunks
+ EXPECT_CALL(*mock_index, train(10, testing::_))
+ .Times(13107)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index, add(10, testing::_))
+ .Times(13107)
+ .WillRepeatedly(testing::Return(Status::OK()));
+ // The remaining 2 vectors will be added without training since
min_train_rows > 2
+ EXPECT_CALL(*mock_index, add(2,
testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+ EXPECT_CALL(*mock_index,
save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK()));
+
+ const size_t dim = 4;
+
+ // Add exactly 131072 rows
+ {
+ const size_t num_rows = 131072;
+ std::vector<float> vectors(num_rows * dim);
+ for (size_t i = 0; i < num_rows * dim; ++i) {
+ vectors[i] = static_cast<float>(i % 100);
+ }
+ std::vector<size_t> offsets;
+ for (size_t i = 0; i <= num_rows; ++i) {
+ offsets.push_back(i * dim);
+ }
+
+ Status status = writer->add_array_values(sizeof(float),
vectors.data(), nullptr,
+ reinterpret_cast<const
uint8_t*>(offsets.data()),
+ num_rows);
+ EXPECT_TRUE(status.ok());
+ }
+
+ // Finish should successfully build the index
+ Status status = writer->finish();
+ EXPECT_TRUE(status.ok());
+}
+
} // namespace doris::segment_v2
diff --git a/regression-test/data/ann_index_p0/product_quantization.out
b/regression-test/data/ann_index_p0/product_quantization.out
index 91be506ee85..2621845b8d4 100644
--- a/regression-test/data/ann_index_p0/product_quantization.out
+++ b/regression-test/data/ann_index_p0/product_quantization.out
@@ -5,3 +5,6 @@
3 [3, 4, 5, 6]
4 [4, 5, 6, 7]
+-- !sql --
+1 [1, 2, 3, 4]
+
diff --git a/regression-test/data/ann_index_p0/quantizer_min_train_rows.out
b/regression-test/data/ann_index_p0/quantizer_min_train_rows.out
new file mode 100644
index 00000000000..b49f93c9ee8
--- /dev/null
+++ b/regression-test/data/ann_index_p0/quantizer_min_train_rows.out
@@ -0,0 +1,32 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+1 [1, 2, 3, 4]
+2 [2, 3, 4, 5]
+3 [3, 4, 5, 6]
+
+-- !sql --
+1 [1, 2, 3, 4]
+2 [2, 3, 4, 5]
+3 [3, 4, 5, 6]
+
+-- !sql --
+400
+
+-- !sql --
+12
+
+-- !sql --
+1 [1, 2, 3, 4]
+2 [2, 3, 4, 5]
+3 [3, 4, 5, 6]
+
+-- !sql --
+1 [1, 2, 3, 4]
+2 [2, 3, 4, 5]
+3 [3, 4, 5, 6]
+
+-- !sql --
+400
+
+-- !sql --
+12
diff --git a/regression-test/data/ann_index_p0/small_segment.out
b/regression-test/data/ann_index_p0/small_segment.out
new file mode 100644
index 00000000000..e6c48002cd2
--- /dev/null
+++ b/regression-test/data/ann_index_p0/small_segment.out
@@ -0,0 +1,19 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !sql --
+1 [1, 2, 3]
+2 [0.5, 2.1, 2.9]
+3 [10, 10, 10]
+
+-- !sql --
+1 [1, 2, 3]
+2 [0.5, 2.1, 2.9]
+
+-- !sql --
+1 [1, 2, 3]
+2 [0.5, 2.1, 2.9]
+3 [10, 10, 10]
+
+-- !sql --
+1 [1, 2, 3]
+2 [0.5, 2.1, 2.9]
+
diff --git a/regression-test/suites/ann_index_p0/ivf_index_test.groovy
b/regression-test/suites/ann_index_p0/ivf_index_test.groovy
index 231e728068a..767cad27fcd 100644
--- a/regression-test/suites/ann_index_p0/ivf_index_test.groovy
+++ b/regression-test/suites/ann_index_p0/ivf_index_test.groovy
@@ -84,15 +84,12 @@ suite ("ivf_index_test") {
DISTRIBUTED BY HASH(id) BUCKETS 1
PROPERTIES ("replication_num" = "1");
"""
- test {
- // not enough training points
- sql """
- INSERT INTO tbl_ann_l2 VALUES
- (1, [1.0, 2.0, 3.0]),
- (2, [0.5, 2.1, 2.9]);
- """
- exception """exception occurred during training"""
- }
+ // Not enough training points: should not throw exception anymore, just
skip index building.
+ sql """
+ INSERT INTO tbl_ann_l2 VALUES
+ (1, [1.0, 2.0, 3.0]),
+ (2, [0.5, 2.1, 2.9]);
+ """
sql "drop table if exists tbl_ann_ip"
sql """
diff --git a/regression-test/suites/ann_index_p0/product_quantization.groovy
b/regression-test/suites/ann_index_p0/product_quantization.groovy
index f9e5abd501b..8a99364a913 100644
--- a/regression-test/suites/ann_index_p0/product_quantization.groovy
+++ b/regression-test/suites/ann_index_p0/product_quantization.groovy
@@ -44,10 +44,8 @@ suite("product_quantization") {
duplicate key(id)
distributed by hash(id) buckets 1
properties('replication_num' = '1');"""
- test {
- sql """insert into product_quantization values (1, [1.0, 2.0, 3.0,
4.0])"""
- exception """exception occurred during training"""
- }
+ sql """insert into product_quantization values (1, [1.0, 2.0, 3.0, 4.0])"""
+ qt_sql """select * from product_quantization order by id"""
sql """drop table if exists product_quantization"""
test {
diff --git
a/regression-test/suites/ann_index_p0/quantizer_min_train_rows.groovy
b/regression-test/suites/ann_index_p0/quantizer_min_train_rows.groovy
new file mode 100644
index 00000000000..15999e5445c
--- /dev/null
+++ b/regression-test/suites/ann_index_p0/quantizer_min_train_rows.groovy
@@ -0,0 +1,300 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("quantizer_min_train_rows") {
+ sql "set enable_common_expr_pushdown=true;"
+
+ // Test PQ quantizer minimum training rows requirement
+ // PQ min_train_rows formula is (1 << pq_nbits) * 100.
+ // For pq_nbits=8, it requires 25600 training vectors.
+ sql "drop table if exists tbl_pq_insufficient_data"
+ sql """
+ CREATE TABLE tbl_pq_insufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="10",
+ "dim"="4",
+ "quantizer"="pq",
+ "pq_m"="2",
+ "pq_nbits"="8"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than required (25600), this should not throw an
exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_pq_insufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_pq_insufficient_data order by
l2_distance_approximate(embedding, [1.0,2.0,3.0,4.0]) limit 100;"
+
+ // Test SQ quantizer minimum training rows requirement
+ // SQ requires 20 training vectors
+ sql "drop table if exists tbl_sq_insufficient_data"
+ sql """
+ CREATE TABLE tbl_sq_insufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="10",
+ "dim"="4",
+ "quantizer"="sq8"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than required (20), this should not throw an exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_sq_insufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_sq_insufficient_data order by
l2_distance_approximate(embedding, [1.0,2.0,3.0,4.0]) limit 100;"
+
+ // Test PQ with sufficient data - should build index successfully
+ sql "drop table if exists tbl_pq_sufficient_data"
+ sql """
+ CREATE TABLE tbl_pq_sufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="10",
+ "dim"="4",
+ "quantizer"="pq",
+ "pq_m"="2",
+ "pq_nbits"="2"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // PQ with pq_nbits=2 requires (1 << 2) * 100 = 400 training vectors.
+ // Insert exactly 400 rows to meet the requirement.
+ def insert_data = []
+ for (int i = 1; i <= 400; i++) {
+ insert_data.add("(${i}, [${i % 10}.0, ${(i + 1) % 10}.0, ${(i + 2) %
10}.0, ${(i + 3) % 10}.0])")
+ }
+ sql "INSERT INTO tbl_pq_sufficient_data VALUES ${insert_data.join(', ')};"
+
+ // Verify data is inserted successfully
+ qt_sql "select count(*) from tbl_pq_sufficient_data;"
+
+ // Test SQ with sufficient data - should build index successfully
+ sql "drop table if exists tbl_sq_sufficient_data"
+ sql """
+ CREATE TABLE tbl_sq_sufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="5",
+ "dim"="4",
+ "quantizer"="sq4"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // SQ requires 20 training vectors
+ // Insert 12 rows -- NOTE(review): fewer than the 20 cited above; confirm
this meets the minimum for this quantizer
+ sql """
+ INSERT INTO tbl_sq_sufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]),
+ (4, [4.0, 5.0, 6.0, 7.0]),
+ (5, [5.0, 6.0, 7.0, 8.0]),
+ (6, [6.0, 7.0, 8.0, 9.0]),
+ (7, [7.0, 8.0, 9.0, 10.0]),
+ (8, [8.0, 9.0, 10.0, 11.0]),
+ (9, [9.0, 10.0, 11.0, 12.0]),
+ (10, [10.0, 11.0, 12.0, 13.0]),
+ (11, [11.0, 12.0, 13.0, 14.0]),
+ (12, [12.0, 13.0, 14.0, 15.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select count(*) from tbl_sq_sufficient_data;"
+
+ // Test HNSW PQ quantizer minimum training rows requirement
+ sql "drop table if exists tbl_hnsw_pq_insufficient_data"
+ sql """
+ CREATE TABLE tbl_hnsw_pq_insufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="4",
+ "quantizer"="pq",
+ "pq_m"="2",
+ "pq_nbits"="8",
+ "max_degree"="16",
+ "ef_construction"="200"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than required (25600), this should not throw an
exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_hnsw_pq_insufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_hnsw_pq_insufficient_data order by id;"
+
+ // Test HNSW SQ quantizer minimum training rows requirement
+ sql "drop table if exists tbl_hnsw_sq_insufficient_data"
+ sql """
+ CREATE TABLE tbl_hnsw_sq_insufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="4",
+ "quantizer"="sq8",
+ "max_degree"="16",
+ "ef_construction"="200"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than required (20), this should not throw an exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_hnsw_sq_insufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_hnsw_sq_insufficient_data order by id;"
+
+ // Test HNSW with sufficient data - should build index successfully
+ sql "drop table if exists tbl_hnsw_pq_sufficient_data"
+ sql """
+ CREATE TABLE tbl_hnsw_pq_sufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="4",
+ "quantizer"="pq",
+ "pq_m"="2",
+ "pq_nbits"="2",
+ "max_degree"="16",
+ "ef_construction"="200"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // PQ with pq_nbits=2 requires (1 << 2) * 100 = 400 training vectors.
+ // Insert exactly 400 rows to meet the requirement.
+ def insert_data_hnsw_pq = []
+ for (int i = 1; i <= 400; i++) {
+ insert_data_hnsw_pq.add("(${i}, [${i % 10}.0, ${(i + 1) % 10}.0, ${(i
+ 2) % 10}.0, ${(i + 3) % 10}.0])")
+ }
+ sql "INSERT INTO tbl_hnsw_pq_sufficient_data VALUES
${insert_data_hnsw_pq.join(', ')};"
+
+ // Verify data is inserted successfully
+ qt_sql "select count(*) from tbl_hnsw_pq_sufficient_data;"
+
+ // Test HNSW SQ with sufficient data - should build index successfully
+ sql "drop table if exists tbl_hnsw_sq_sufficient_data"
+ sql """
+ CREATE TABLE tbl_hnsw_sq_sufficient_data (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="4",
+ "quantizer"="sq4",
+ "max_degree"="16",
+ "ef_construction"="200"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // SQ requires 20 training vectors
+ // Insert 12 rows -- NOTE(review): fewer than the 20 cited above; confirm
this meets the minimum for this quantizer
+ sql """
+ INSERT INTO tbl_hnsw_sq_sufficient_data VALUES
+ (1, [1.0, 2.0, 3.0, 4.0]),
+ (2, [2.0, 3.0, 4.0, 5.0]),
+ (3, [3.0, 4.0, 5.0, 6.0]),
+ (4, [4.0, 5.0, 6.0, 7.0]),
+ (5, [5.0, 6.0, 7.0, 8.0]),
+ (6, [6.0, 7.0, 8.0, 9.0]),
+ (7, [7.0, 8.0, 9.0, 10.0]),
+ (8, [8.0, 9.0, 10.0, 11.0]),
+ (9, [9.0, 10.0, 11.0, 12.0]),
+ (10, [10.0, 11.0, 12.0, 13.0]),
+ (11, [11.0, 12.0, 13.0, 14.0]),
+ (12, [12.0, 13.0, 14.0, 15.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select count(*) from tbl_hnsw_sq_sufficient_data;"
+
+}
diff --git a/regression-test/suites/ann_index_p0/small_segment.groovy
b/regression-test/suites/ann_index_p0/small_segment.groovy
new file mode 100644
index 00000000000..cb40ba951f2
--- /dev/null
+++ b/regression-test/suites/ann_index_p0/small_segment.groovy
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("small_segment") {
+ sql "set enable_common_expr_pushdown=true;"
+
+ // Test that ANN index is not built when segment size is smaller than
required training rows
+ sql "drop table if exists tbl_small_segment"
+ sql """
+ CREATE TABLE tbl_small_segment (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "nlist"="10",
+ "dim"="3"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert fewer rows than nlist (10), this should not throw an exception
+ // and should skip building the ANN index
+ sql """
+ INSERT INTO tbl_small_segment VALUES
+ (1, [1.0, 2.0, 3.0]),
+ (2, [0.5, 2.1, 2.9]),
+ (3, [10.0, 10.0, 10.0]);
+ """
+
+ // Verify data is inserted successfully
+ qt_sql "select * from tbl_small_segment order by id;"
+
+ // Test range search with IVF (should fall back to brute force since index
is not built)
+ qt_sql "select * from tbl_small_segment where
l2_distance_approximate(embedding, [1.0,2.0,3.0]) < 5.0 order by id;"
+
+ // Test with HNSW index as well - HNSW should work even with small segments
+ sql "drop table if exists tbl_small_segment_hnsw"
+ sql """
+ CREATE TABLE tbl_small_segment_hnsw (
+ id INT NOT NULL,
+ embedding ARRAY<FLOAT> NOT NULL,
+ INDEX idx_emb (`embedding`) USING ANN PROPERTIES(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="3",
+ "quantizer"="pq",
+ "pq_m"="1",
+ "pq_nbits"="2"
+ )
+ ) ENGINE=OLAP
+ DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num" = "1");
+ """
+
+ // Insert small amount of data - HNSW should work
+ sql """
+ INSERT INTO tbl_small_segment_hnsw VALUES
+ (1, [1.0, 2.0, 3.0]),
+ (2, [0.5, 2.1, 2.9]),
+ (3, [10.0, 10.0, 10.0]);
+ """
+
+ qt_sql "select * from tbl_small_segment_hnsw order by id;"
+
+ // Test approximate search with HNSW (should work)
+ sql "select id, l2_distance_approximate(embedding, [1.0,2.0,3.0]) as dist
from tbl_small_segment_hnsw order by dist limit 2;"
+
+ // Test range search with HNSW (should work)
+ qt_sql "select * from tbl_small_segment_hnsw where
l2_distance_approximate(embedding, [1.0,2.0,3.0]) < 5.0 order by id;"
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]