This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 56b02a586f9 branch-4.0: [opt](ann index) Make chunk size of index train configurable #58645 (#58727)
56b02a586f9 is described below

commit 56b02a586f98750603229ac8b881b0ece3c6aa9d
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Dec 5 11:50:18 2025 +0800

    branch-4.0: [opt](ann index) Make chunk size of index train configurable #58645 (#58727)
    
    Cherry-picked from #58645
    
    Co-authored-by: zhiqiang <[email protected]>
---
 be/src/common/config.cpp                                     |  6 ++++++
 be/src/common/config.h                                       |  2 ++
 be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp | 10 ++++++----
 be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h   |  6 ++++--
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index d5fdcc4d12b..7a184b7e5c4 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1600,6 +1600,12 @@ DEFINE_mInt32(max_segment_partial_column_cache_size, "100");
 DEFINE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction, "true");
 DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true");
 
+// Chunk size for ANN/vector index building per training/adding batch
+// 1M By default.
+DEFINE_mInt64(ann_index_build_chunk_size, "1000000");
+DEFINE_Validator(ann_index_build_chunk_size,
+                 [](const int64_t config) -> bool { return config > 0; });
+
 DEFINE_mBool(enable_wal_tde, "false");
 
 DEFINE_mBool(print_stack_when_cache_miss, "false");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index d6c83b87138..e7844e02395 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1663,6 +1663,8 @@ DECLARE_mInt64(max_csv_line_reader_output_buffer_size);
 DECLARE_Int32(omp_threads_limit);
 // The capacity of segment partial column cache, used to cache column readers for each segment.
 DECLARE_mInt32(max_segment_partial_column_cache_size);
+// Chunk size for ANN/vector index building per training/adding batch
+DECLARE_mInt64(ann_index_build_chunk_size);
 
 DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction);
 DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction);
diff --git a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
index 562565d565b..e18b89e4da4 100644
--- a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
@@ -78,7 +78,7 @@ Status AnnIndexColumnWriter::init() {
             index_type, build_parameter.dim, metric_type, build_parameter.max_degree,
             build_parameter.ef_construction, quantizer);
 
-    size_t block_size = CHUNK_SIZE * build_parameter.dim;
+    size_t block_size = AnnIndexColumnWriter::chunk_size() * build_parameter.dim;
     _float_array.reserve(block_size);
 
     return Status::OK();
@@ -110,7 +110,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val
 
     const float* p = reinterpret_cast<const float*>(value_ptr);
 
-    const size_t full_elements = CHUNK_SIZE * dim;
+    const size_t full_elements = AnnIndexColumnWriter::chunk_size() * dim;
     size_t remaining_elements = num_rows * dim;
     size_t src_offset = 0;
     while (remaining_elements > 0) {
@@ -122,8 +122,10 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val
         remaining_elements -= elements_to_add;
 
         if (_float_array.size() == full_elements) {
-            RETURN_IF_ERROR(_vector_index->train(CHUNK_SIZE, _float_array.data()));
-            RETURN_IF_ERROR(_vector_index->add(CHUNK_SIZE, _float_array.data()));
+            RETURN_IF_ERROR(
+                    _vector_index->train(AnnIndexColumnWriter::chunk_size(), _float_array.data()));
+            RETURN_IF_ERROR(
+                    _vector_index->add(AnnIndexColumnWriter::chunk_size(), _float_array.data()));
             _float_array.clear();
         }
     }
diff --git a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
index 0fb7ef11706..2110c91d307 100644
--- a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
@@ -40,11 +40,13 @@ namespace doris::segment_v2 {
 #include "common/compile_check_begin.h"
 class AnnIndexColumnWriter : public IndexColumnWriter {
 public:
+    static inline int64_t chunk_size() {
 #ifdef BE_TEST
-    static constexpr int64_t CHUNK_SIZE = 10;
+        return 10;
 #else
-    static constexpr int64_t CHUNK_SIZE = 1'000'000;
+        return config::ann_index_build_chunk_size;
 #endif
+    }
     static constexpr const char* INDEX_TYPE = "index_type";
     static constexpr const char* METRIC_TYPE = "metric_type";
     static constexpr const char* DIM = "dim";


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to