This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 12645bda20a branch-4.1: [fix](compaction) Fix incorrect memory 
availability check in RowSourceBuffer during vertical compaction #63152 (#63232)
12645bda20a is described below

commit 12645bda20a3a63a0db4e25dc1ecd5bb33d83ff2
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed May 20 14:13:41 2026 +0800

    branch-4.1: [fix](compaction) Fix incorrect memory availability check in 
RowSourceBuffer during vertical compaction #63152 (#63232)
    
    Cherry-picked from #63152
    
    Co-authored-by: Lijia Liu <[email protected]>
    Co-authored-by: liutang123 <[email protected]>
---
 .../storage/iterator/vertical_merge_iterator.cpp   | 10 ++-
 .../compaction/vertical_compaction_test.cpp        | 77 ++++++++++++++++++++++
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/be/src/storage/iterator/vertical_merge_iterator.cpp 
b/be/src/storage/iterator/vertical_merge_iterator.cpp
index 60562a93d1d..731686f7218 100644
--- a/be/src/storage/iterator/vertical_merge_iterator.cpp
+++ b/be/src/storage/iterator/vertical_merge_iterator.cpp
@@ -70,8 +70,14 @@ uint16_t RowSource::data() const {
 Status RowSourcesBuffer::append(const std::vector<RowSource>& row_sources) {
     if (_buffer.allocated_bytes() + row_sources.size() * sizeof(UInt16) >
         config::vertical_compaction_max_row_source_memory_mb * 1024 * 1024) {
-        if (_buffer.allocated_bytes() - _buffer.size() * sizeof(UInt16) <
-            row_sources.size() * sizeof(UInt16)) {
+        // Use capacity() - size() to get the truly available element slots.
+        // Note: PODArrayBase::allocated_bytes() includes pad_left and 
pad_right,
+        // which are NOT usable for storing elements. Using allocated_bytes() 
here
+        // would over-estimate the available space and lead to a missed spill,
+        // causing _buffer to grow beyond the configured memory limit when
+        // push_back triggers reallocation below.
+        size_t available_slots = _buffer.capacity() - _buffer.size();
+        if (available_slots < row_sources.size()) {
             VLOG_NOTICE << "RowSourceBuffer is too large, serialize and reset 
buffer: "
                         << _buffer.allocated_bytes() << ", total size: " << 
_total_size;
             // serialize current buffer
diff --git a/be/test/storage/compaction/vertical_compaction_test.cpp 
b/be/test/storage/compaction/vertical_compaction_test.cpp
index a39932e2a01..37eedcca319 100644
--- a/be/test/storage/compaction/vertical_compaction_test.cpp
+++ b/be/test/storage/compaction/vertical_compaction_test.cpp
@@ -600,6 +600,83 @@ TEST_F(VerticalCompactionTest, TestRowSourcesBuffer) {
     }
 }
 
+// Regression test for RowSourcesBuffer::append spill threshold.
+//
+// Background:
+// PaddedPODArray::allocated_bytes() returns the total allocated memory which
+// INCLUDES pad_left and pad_right. These padding bytes are NOT usable for
+// storing elements. Earlier, append() used `allocated_bytes() - size*sizeof`
+// as "available room" to decide whether to skip spilling. This over-estimates
+// the truly usable space (by pad_left + pad_right bytes), so when the buffer
+// has already crossed the configured memory limit, append() may incorrectly
+// decide that the upcoming push_back will fit without reallocation, skip the
+// spill, and then push_back triggers a reallocation that doubles the buffer,
+// exceeding the configured `vertical_compaction_max_row_source_memory_mb`.
+//
+// This test simulates the case by setting a very small memory limit (1 MB) and
+// repeatedly appending row sources. After the first time the buffer crosses
+// the limit, the next append must trigger a spill (file write + reset) instead
+// of silently growing the in-memory buffer beyond the limit.
+TEST_F(VerticalCompactionTest, TestRowSourcesBufferSpillThreshold) {
+    // 1 MB limit (set in SetUp as well, but make it explicit here).
+    config::vertical_compaction_max_row_source_memory_mb = 1;
+    const size_t mem_limit_bytes =
+            
static_cast<size_t>(config::vertical_compaction_max_row_source_memory_mb) * 
1024 * 1024;
+
+    RowSourcesBuffer buffer(200, absolute_dir, 
ReaderType::READER_CUMULATIVE_COMPACTION);
+
+    // Build a batch of row sources. Use a moderate batch size so that the
+    // buffer's allocated_bytes() can become very close to the limit before
+    // a single append crosses it.
+    constexpr size_t kBatchSize = 4096;
+    std::vector<RowSource> batch;
+    batch.reserve(kBatchSize);
+    for (size_t i = 0; i < kBatchSize; ++i) {
+        batch.emplace_back(static_cast<uint16_t>(i % 8), false);
+    }
+
+    // Total elements that fit in the memory limit (a safe upper bound).
+    // Each element is 2 bytes (UInt16), so ~512K elements per MB.
+    const size_t total_appends = (mem_limit_bytes / sizeof(uint16_t)) * 4 / 
kBatchSize + 8;
+
+    size_t expected_total = 0;
+    for (size_t i = 0; i < total_appends; ++i) {
+        ASSERT_TRUE(buffer.append(batch).ok());
+        expected_total += kBatchSize;
+
+        // Invariant: in-memory buffered_size() must never exceed what the
+        // memory limit allows (in elements). Otherwise the spill logic is
+        // broken (the bug described above).
+        // Allow a small slack equal to one batch because the spill check is
+        // performed BEFORE the push_back that crosses the threshold.
+        size_t buffered_elems = buffer.buffered_size();
+        size_t buffered_bytes = buffered_elems * sizeof(uint16_t);
+        // After each append, buffered_bytes should be <= mem_limit + one 
batch size.
+        // It must NOT grow unboundedly (e.g., 2x of the limit due to PODArray
+        // reallocation that the buggy version would allow).
+        EXPECT_LE(buffered_bytes, mem_limit_bytes + kBatchSize * 
sizeof(uint16_t))
+                << "RowSourcesBuffer in-memory size exceeded the configured 
limit, "
+                << "spill threshold logic is broken. iter=" << i
+                << ", buffered_elems=" << buffered_elems;
+    }
+
+    EXPECT_EQ(buffer.total_size(), expected_total);
+
+    // Make sure data is persisted and can be read back correctly.
+    ASSERT_TRUE(buffer.flush().ok());
+    ASSERT_TRUE(buffer.seek_to_begin().ok());
+
+    size_t read_back = 0;
+    while (buffer.has_remaining().ok()) {
+        // Verify that the source num matches the pattern we wrote.
+        auto cur = buffer.current().get_source_num();
+        EXPECT_EQ(cur, (read_back % kBatchSize) % 8);
+        buffer.advance(1);
+        ++read_back;
+    }
+    EXPECT_EQ(read_back, expected_total);
+}
+
 TEST_F(VerticalCompactionTest, TestDupKeyVerticalMerge) {
     auto num_input_rowset = 2;
     auto num_segments = 2;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to