This is an automated email from the ASF dual-hosted git repository.
airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d50a794427d [opt](ann index) Make chunk size of index train
configurable (#58645)
d50a794427d is described below
commit d50a794427dfaaca8fc7e9bfc30872efa85d5d15
Author: zhiqiang <[email protected]>
AuthorDate: Thu Dec 4 20:46:12 2025 +0800
[opt](ann index) Make chunk size of index train configurable (#58645)
### What problem does this PR solve?
Previous PR: https://github.com/apache/doris/pull/57623
The current granularity for index training and data ingestion is set to
1M and is hard-coded, which makes index construction unnecessarily slow
in some scenarios. This should be made configurable and reduced when
appropriate.
For example, when there are 1M vectors to add and the batch size of
stream load is set to 0.3M, the load is split into 3 stream load
requests. If a request carrying 0.3M vectors ends up with only 1 thread
for adding, the whole load process will be very slow. Typical CPU usage
looks like this:
<img width="1902" height="552" alt="image"
src="https://github.com/user-attachments/assets/65728e56-f333-4bd5-a54a-8c12d01668f1"
/>
We need to make the batch size configurable so that we can change it
when needed.
For example, when we set the batch size to 30K, we get a higher average
CPU usage, like this:
<img width="1890" height="554" alt="image"
src="https://github.com/user-attachments/assets/7d664b0e-b017-4a2e-bed8-e40f56ff97b7"
/>
**The default value is still 1M; a small batch size will damage the
recall of HNSW.**
---
be/src/common/config.cpp | 6 ++++++
be/src/common/config.h | 2 ++
be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp | 10 ++++++----
be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h | 6 ++++--
4 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index c4910f1b756..66750950621 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1612,6 +1612,12 @@ DEFINE_mInt32(max_segment_partial_column_cache_size,
"100");
DEFINE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction, "true");
DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true");
+// Chunk size for ANN/vector index building per training/adding batch
+// 1M By default.
+DEFINE_mInt64(ann_index_build_chunk_size, "1000000");
+DEFINE_Validator(ann_index_build_chunk_size,
+ [](const int64_t config) -> bool { return config > 0; });
+
DEFINE_mBool(enable_wal_tde, "false");
DEFINE_mBool(print_stack_when_cache_miss, "false");
diff --git a/be/src/common/config.h b/be/src/common/config.h
index c4fa8f82cc8..13720207eea 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1674,6 +1674,8 @@ DECLARE_mInt64(max_csv_line_reader_output_buffer_size);
DECLARE_Int32(omp_threads_limit);
// The capacity of segment partial column cache, used to cache column readers
for each segment.
DECLARE_mInt32(max_segment_partial_column_cache_size);
+// Chunk size for ANN/vector index building per training/adding batch
+DECLARE_mInt64(ann_index_build_chunk_size);
DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction);
DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction);
diff --git a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
index 562565d565b..e18b89e4da4 100644
--- a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.cpp
@@ -78,7 +78,7 @@ Status AnnIndexColumnWriter::init() {
index_type, build_parameter.dim, metric_type,
build_parameter.max_degree,
build_parameter.ef_construction, quantizer);
- size_t block_size = CHUNK_SIZE * build_parameter.dim;
+ size_t block_size = AnnIndexColumnWriter::chunk_size() *
build_parameter.dim;
_float_array.reserve(block_size);
return Status::OK();
@@ -110,7 +110,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t
field_size, const void* val
const float* p = reinterpret_cast<const float*>(value_ptr);
- const size_t full_elements = CHUNK_SIZE * dim;
+ const size_t full_elements = AnnIndexColumnWriter::chunk_size() * dim;
size_t remaining_elements = num_rows * dim;
size_t src_offset = 0;
while (remaining_elements > 0) {
@@ -122,8 +122,10 @@ Status AnnIndexColumnWriter::add_array_values(size_t
field_size, const void* val
remaining_elements -= elements_to_add;
if (_float_array.size() == full_elements) {
- RETURN_IF_ERROR(_vector_index->train(CHUNK_SIZE,
_float_array.data()));
- RETURN_IF_ERROR(_vector_index->add(CHUNK_SIZE,
_float_array.data()));
+ RETURN_IF_ERROR(
+ _vector_index->train(AnnIndexColumnWriter::chunk_size(),
_float_array.data()));
+ RETURN_IF_ERROR(
+ _vector_index->add(AnnIndexColumnWriter::chunk_size(),
_float_array.data()));
_float_array.clear();
}
}
diff --git a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
index 0fb7ef11706..2110c91d307 100644
--- a/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
+++ b/be/src/olap/rowset/segment_v2/ann_index/ann_index_writer.h
@@ -40,11 +40,13 @@ namespace doris::segment_v2 {
#include "common/compile_check_begin.h"
class AnnIndexColumnWriter : public IndexColumnWriter {
public:
+ static inline int64_t chunk_size() {
#ifdef BE_TEST
- static constexpr int64_t CHUNK_SIZE = 10;
+ return 10;
#else
- static constexpr int64_t CHUNK_SIZE = 1'000'000;
+ return config::ann_index_build_chunk_size;
#endif
+ }
static constexpr const char* INDEX_TYPE = "index_type";
static constexpr const char* METRIC_TYPE = "metric_type";
static constexpr const char* DIM = "dim";
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]