(doris) branch master updated: [opt](ann index) Make chunk size of index train configurable (#58645)

airborne Thu, 04 Dec 2025 04:46:30 -0800

This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new d50a794427d [opt](ann index) Make chunk size of index train 
configurable (#58645)
d50a794427d is described below

commit d50a794427dfaaca8fc7e9bfc30872efa85d5d15
Author: zhiqiang <[email protected]>
AuthorDate: Thu Dec 4 20:46:12 2025 +0800

    [opt](ann index) Make chunk size of index train configurable (#58645)
    
    ### What problem does this PR solve?
    Previous pr: https://github.com/apache/doris/pull/57623
    
    The current granularity for index training and data ingestion is set to
    1M and is hard-coded, which makes index construction unnecessarily slow
    in some scenarios. This should be made configurable and reduced when
    appropriate.
    
    For example, when there are 1M vectors to add and the batch size of
    stream load is set to 0.3M, we will have 3 stream load requests. If a
    request carrying 0.3M vectors ends up with only 1 thread doing the
    adding, the whole load process will be very slow. Typical CPU usage
    looks like this:
    <img width="1902" height="552" alt="image"
    
src="https://github.com/user-attachments/assets/65728e56-f333-4bd5-a54a-8c12d01668f1";
    />
    
    We need to make the batch size configurable so that we can adjust it
    when necessary.
    
    For example, when we set the batch size to 30K, we get a higher
    average CPU usage, like this:
    <img width="1890" height="554" alt="image"
    
src="https://github.com/user-attachments/assets/7d664b0e-b017-4a2e-bed8-e40f56ff97b7";
    />
    
    **The default value is still 1M; a small batch size will hurt the
    recall of HNSW.**
---
 be/src/common/config.cpp                                     |  6 ++++++
 be/src/common/config.h                                       |  2 ++
 be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp | 10 ++++++----
 be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h   |  6 ++++--
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index c4910f1b756..66750950621 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1612,6 +1612,12 @@ DEFINE_mInt32(max_segment_partial_column_cache_size, 
"100");
 DEFINE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction, "true");
 DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true");
 
+// Chunk size for ANN/vector index building per training/adding batch
+// 1M By default.
+DEFINE_mInt64(ann_index_build_chunk_size, "1000000");
+DEFINE_Validator(ann_index_build_chunk_size,
+                 [](const int64_t config) -> bool { return config > 0; });
+
 DEFINE_mBool(enable_wal_tde, "false");
 
 DEFINE_mBool(print_stack_when_cache_miss, "false");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index c4fa8f82cc8..13720207eea 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1674,6 +1674,8 @@ DECLARE_mInt64(max_csv_line_reader_output_buffer_size);
 DECLARE_Int32(omp_threads_limit);
 // The capacity of segment partial column cache, used to cache column readers 
for each segment.
 DECLARE_mInt32(max_segment_partial_column_cache_size);
+// Chunk size for ANN/vector index building per training/adding batch
+DECLARE_mInt64(ann_index_build_chunk_size);
 
 DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction);
 DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction);
diff --git a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
index 562565d565b..e18b89e4da4 100644
--- a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
@@ -78,7 +78,7 @@ Status AnnIndexColumnWriter::init() {
             index_type, build_parameter.dim, metric_type, 
build_parameter.max_degree,
             build_parameter.ef_construction, quantizer);
 
-    size_t block_size = CHUNK_SIZE * build_parameter.dim;
+    size_t block_size = AnnIndexColumnWriter::chunk_size() * 
build_parameter.dim;
     _float_array.reserve(block_size);
 
     return Status::OK();
@@ -110,7 +110,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t 
field_size, const void* val
 
     const float* p = reinterpret_cast<const float*>(value_ptr);
 
-    const size_t full_elements = CHUNK_SIZE * dim;
+    const size_t full_elements = AnnIndexColumnWriter::chunk_size() * dim;
     size_t remaining_elements = num_rows * dim;
     size_t src_offset = 0;
     while (remaining_elements > 0) {
@@ -122,8 +122,10 @@ Status AnnIndexColumnWriter::add_array_values(size_t 
field_size, const void* val
         remaining_elements -= elements_to_add;
 
         if (_float_array.size() == full_elements) {
-            RETURN_IF_ERROR(_vector_index->train(CHUNK_SIZE, 
_float_array.data()));
-            RETURN_IF_ERROR(_vector_index->add(CHUNK_SIZE, 
_float_array.data()));
+            RETURN_IF_ERROR(
+                    _vector_index->train(AnnIndexColumnWriter::chunk_size(), 
_float_array.data()));
+            RETURN_IF_ERROR(
+                    _vector_index->add(AnnIndexColumnWriter::chunk_size(), 
_float_array.data()));
             _float_array.clear();
         }
     }
diff --git a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h 
b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
index 0fb7ef11706..2110c91d307 100644
--- a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
@@ -40,11 +40,13 @@ namespace doris::segment_v2 {
 #include "common/compile_check_begin.h"
 class AnnIndexColumnWriter : public IndexColumnWriter {
 public:
+    static inline int64_t chunk_size() {
 #ifdef BE_TEST
-    static constexpr int64_t CHUNK_SIZE = 10;
+        return 10;
 #else
-    static constexpr int64_t CHUNK_SIZE = 1'000'000;
+        return config::ann_index_build_chunk_size;
 #endif
+    }
     static constexpr const char* INDEX_TYPE = "index_type";
     static constexpr const char* METRIC_TYPE = "metric_type";
     static constexpr const char* DIM = "dim";


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch master updated: [opt](ann index) Make chunk size of index train configurable (#58645)

Reply via email to