This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 56b02a586f9 branch-4.0: [opt](ann index) Make chunk size of index train configurable #58645 (#58727)
56b02a586f9 is described below
commit 56b02a586f98750603229ac8b881b0ece3c6aa9d
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Fri Dec 5 11:50:18 2025 +0800
branch-4.0: [opt](ann index) Make chunk size of index train configurable #58645 (#58727)
Cherry-picked from #58645
Co-authored-by: zhiqiang <[email protected]>
---
be/src/common/config.cpp | 6 ++++++
be/src/common/config.h | 2 ++
be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp | 10 ++++++----
be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h | 6 ++++--
4 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index d5fdcc4d12b..7a184b7e5c4 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1600,6 +1600,12 @@ DEFINE_mInt32(max_segment_partial_column_cache_size, "100");
DEFINE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction, "true");
DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true");
+// Chunk size for ANN/vector index building per training/adding batch
+// 1M By default.
+DEFINE_mInt64(ann_index_build_chunk_size, "1000000");
+DEFINE_Validator(ann_index_build_chunk_size,
+ [](const int64_t config) -> bool { return config > 0; });
+
DEFINE_mBool(enable_wal_tde, "false");
DEFINE_mBool(print_stack_when_cache_miss, "false");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index d6c83b87138..e7844e02395 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1663,6 +1663,8 @@ DECLARE_mInt64(max_csv_line_reader_output_buffer_size);
DECLARE_Int32(omp_threads_limit);
// The capacity of segment partial column cache, used to cache column readers for each segment.
DECLARE_mInt32(max_segment_partial_column_cache_size);
+// Chunk size for ANN/vector index building per training/adding batch
+DECLARE_mInt64(ann_index_build_chunk_size);
DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction);
DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction);
diff --git a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
index 562565d565b..e18b89e4da4 100644
--- a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
@@ -78,7 +78,7 @@ Status AnnIndexColumnWriter::init() {
index_type, build_parameter.dim, metric_type,
build_parameter.max_degree,
build_parameter.ef_construction, quantizer);
- size_t block_size = CHUNK_SIZE * build_parameter.dim;
+ size_t block_size = AnnIndexColumnWriter::chunk_size() * build_parameter.dim;
_float_array.reserve(block_size);
return Status::OK();
@@ -110,7 +110,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val
const float* p = reinterpret_cast<const float*>(value_ptr);
- const size_t full_elements = CHUNK_SIZE * dim;
+ const size_t full_elements = AnnIndexColumnWriter::chunk_size() * dim;
size_t remaining_elements = num_rows * dim;
size_t src_offset = 0;
while (remaining_elements > 0) {
@@ -122,8 +122,10 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val
remaining_elements -= elements_to_add;
if (_float_array.size() == full_elements) {
- RETURN_IF_ERROR(_vector_index->train(CHUNK_SIZE, _float_array.data()));
- RETURN_IF_ERROR(_vector_index->add(CHUNK_SIZE, _float_array.data()));
+ RETURN_IF_ERROR(
+ _vector_index->train(AnnIndexColumnWriter::chunk_size(), _float_array.data()));
+ RETURN_IF_ERROR(
+ _vector_index->add(AnnIndexColumnWriter::chunk_size(), _float_array.data()));
_float_array.clear();
}
}
diff --git a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
index 0fb7ef11706..2110c91d307 100644
--- a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
@@ -40,11 +40,13 @@ namespace doris::segment_v2 {
#include "common/compile_check_begin.h"
class AnnIndexColumnWriter : public IndexColumnWriter {
public:
+ static inline int64_t chunk_size() {
#ifdef BE_TEST
- static constexpr int64_t CHUNK_SIZE = 10;
+ return 10;
#else
- static constexpr int64_t CHUNK_SIZE = 1'000'000;
+ return config::ann_index_build_chunk_size;
#endif
+ }
static constexpr const char* INDEX_TYPE = "index_type";
static constexpr const char* METRIC_TYPE = "metric_type";
static constexpr const char* DIM = "dim";
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]