This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2abb3fb709 GH-32570: [C++] Fix the issue of `ExecBatchBuilder` when
appending consecutive tail rows with the same id may exceed buffer boundary
(#39234)
2abb3fb709 is described below
commit 2abb3fb7095241300e2bb2aadd953b0f23970237
Author: Rossi(Ruoxi) Sun <[email protected]>
AuthorDate: Thu Dec 21 14:14:45 2023 -0800
GH-32570: [C++] Fix the issue of `ExecBatchBuilder` when appending
consecutive tail rows with the same id may exceed buffer boundary (#39234)
### Rationale for this change
Addressed in
https://github.com/apache/arrow/issues/32570#issuecomment-1856473812
### What changes are included in this PR?
1. Skip consecutive rows with the same id when calculating rows to skip
when appending to `ExecBatchBuilder`.
2. Fix the bug that column offset is neglected when calculating rows to
skip.
### Are these changes tested?
Yes. New UT included and the change is also protected by the existing case
mentioned in the issue.
### Are there any user-facing changes?
No.
**This PR contains a "Critical Fix".**
Because #32570 is labeled critical, and causes a crash even when the API
contract is upheld.
* Closes: #32570
Authored-by: zanmato <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/compute/light_array.cc | 7 +++++--
cpp/src/arrow/compute/light_array.h | 4 +++-
cpp/src/arrow/compute/light_array_test.cc | 26 ++++++++++++++++++++++++++
3 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/cpp/src/arrow/compute/light_array.cc
b/cpp/src/arrow/compute/light_array.cc
index 4e8b2b2d7c..93a054de19 100644
--- a/cpp/src/arrow/compute/light_array.cc
+++ b/cpp/src/arrow/compute/light_array.cc
@@ -398,9 +398,12 @@ int ExecBatchBuilder::NumRowsToSkip(const
std::shared_ptr<ArrayData>& column,
} else {
--num_rows_left;
int row_id_removed = row_ids[num_rows_left];
- const uint32_t* offsets =
- reinterpret_cast<const uint32_t*>(column->buffers[1]->data());
+ const int32_t* offsets = column->GetValues<int32_t>(1);
num_bytes_skipped += offsets[row_id_removed + 1] -
offsets[row_id_removed];
+ // Skip consecutive rows with the same id
+ while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left -
1]) {
+ --num_rows_left;
+ }
}
}
diff --git a/cpp/src/arrow/compute/light_array.h
b/cpp/src/arrow/compute/light_array.h
index 87f6b6c76a..84aa86d64b 100644
--- a/cpp/src/arrow/compute/light_array.h
+++ b/cpp/src/arrow/compute/light_array.h
@@ -416,7 +416,9 @@ class ARROW_EXPORT ExecBatchBuilder {
// without checking buffer bounds (useful with SIMD or fixed size memory
loads
// and stores).
//
- // The sequence of row_ids provided must be non-decreasing.
+ // The sequence of row_ids provided must be non-decreasing. In case of
consecutive rows
+ // with the same row id, they are skipped all at once because they occupy
the same
+ // space.
//
static int NumRowsToSkip(const std::shared_ptr<ArrayData>& column, int
num_rows,
const uint16_t* row_ids, int
num_tail_bytes_to_skip);
diff --git a/cpp/src/arrow/compute/light_array_test.cc
b/cpp/src/arrow/compute/light_array_test.cc
index 4e33f7b578..52121530fe 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -471,6 +471,32 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
ASSERT_EQ(0, pool->bytes_allocated());
}
+TEST(ExecBatchBuilder, AppendBatchDupRows) {
+ std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+ MemoryPool* pool = owned_pool.get();
+ // Case of cross-word copying for the last row, which may exceed the buffer
boundary.
+ // This is a simplified case of GH-32570
+ {
+ // 64-byte data fully occupying one minimal 64-byte aligned memory region.
+ ExecBatch batch_string = JSONToExecBatch({binary()},
R"([["123456789ABCDEF0"],
+ ["123456789ABCDEF0"],
+ ["123456789ABCDEF0"],
+ ["ABCDEF0"],
+ ["123456789"]])"); // 9-byte tail row, larger than a word.
+ ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64);
+ ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64);
+ ExecBatchBuilder builder;
+ uint16_t row_ids[2] = {4, 4};
+ ASSERT_OK(builder.AppendSelected(pool, batch_string, 2, row_ids,
/*num_cols=*/1));
+ ExecBatch built = builder.Flush();
+ ExecBatch batch_string_appended =
+ JSONToExecBatch({binary()}, R"([["123456789"], ["123456789"]])");
+ ASSERT_EQ(batch_string_appended, built);
+ ASSERT_NE(0, pool->bytes_allocated());
+ }
+ ASSERT_EQ(0, pool->bytes_allocated());
+}
+
TEST(ExecBatchBuilder, AppendBatchesSomeCols) {
std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
MemoryPool* pool = owned_pool.get();