This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1dc3b81875 GH-39583: [C++] Fix the issue of ExecBatchBuilder when
appending consecutive tail rows with the same id may exceed buffer boundary
(for fixed size types) (#39585)
1dc3b81875 is described below
commit 1dc3b81875d243f5d135caa6b2db7a669888ecb4
Author: Rossi(Ruoxi) Sun <[email protected]>
AuthorDate: Thu Jan 18 00:26:48 2024 +0800
GH-39583: [C++] Fix the issue of ExecBatchBuilder when appending
consecutive tail rows with the same id may exceed buffer boundary (for fixed
size types) (#39585)
### Rationale for this change
#39583 is a follow-up issue to #32570 (fixed by #39234). The previous issue
and its fix only addressed variable-length types. It turns out fixed-size
types have the same issue.
### What changes are included in this PR?
Apply the same fix as #39234 to fixed-size types.
### Are these changes tested?
UT included.
### Are there any user-facing changes?
* Closes: #39583
Authored-by: zanmato1984 <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/compute/light_array.cc | 21 ++++-----
cpp/src/arrow/compute/light_array_test.cc | 75 ++++++++++++++++++++++++++++---
2 files changed, 77 insertions(+), 19 deletions(-)
diff --git a/cpp/src/arrow/compute/light_array.cc
b/cpp/src/arrow/compute/light_array.cc
index 73ea01a03a..66d8477b02 100644
--- a/cpp/src/arrow/compute/light_array.cc
+++ b/cpp/src/arrow/compute/light_array.cc
@@ -383,27 +383,22 @@ int ExecBatchBuilder::NumRowsToSkip(const
std::shared_ptr<ArrayData>& column,
KeyColumnMetadata column_metadata =
ColumnMetadataFromDataType(column->type).ValueOrDie();
+ ARROW_DCHECK(!column_metadata.is_fixed_length ||
column_metadata.fixed_length > 0);
int num_rows_left = num_rows;
int num_bytes_skipped = 0;
while (num_rows_left > 0 && num_bytes_skipped < num_tail_bytes_to_skip) {
+ --num_rows_left;
+ int row_id_removed = row_ids[num_rows_left];
if (column_metadata.is_fixed_length) {
- if (column_metadata.fixed_length == 0) {
- num_rows_left = std::max(num_rows_left, 8) - 8;
- ++num_bytes_skipped;
- } else {
- --num_rows_left;
- num_bytes_skipped += column_metadata.fixed_length;
- }
+ num_bytes_skipped += column_metadata.fixed_length;
} else {
- --num_rows_left;
- int row_id_removed = row_ids[num_rows_left];
const int32_t* offsets = column->GetValues<int32_t>(1);
num_bytes_skipped += offsets[row_id_removed + 1] -
offsets[row_id_removed];
- // Skip consecutive rows with the same id
- while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left -
1]) {
- --num_rows_left;
- }
+ }
+ // Skip consecutive rows with the same id
+ while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) {
+ --num_rows_left;
}
}
diff --git a/cpp/src/arrow/compute/light_array_test.cc
b/cpp/src/arrow/compute/light_array_test.cc
index 3ceba43604..d50e967551 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -474,15 +474,18 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
TEST(ExecBatchBuilder, AppendBatchDupRows) {
std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
MemoryPool* pool = owned_pool.get();
+
// Case of cross-word copying for the last row, which may exceed the buffer
boundary.
- // This is a simplified case of GH-32570
+ //
{
+ // This is a simplified case of GH-32570
// 64-byte data fully occupying one minimal 64-byte aligned memory region.
- ExecBatch batch_string = JSONToExecBatch({binary()},
R"([["123456789ABCDEF0"],
- ["123456789ABCDEF0"],
- ["123456789ABCDEF0"],
- ["ABCDEF0"],
- ["123456789"]])"); // 9-byte tail row, larger than a word.
+ ExecBatch batch_string = JSONToExecBatch({binary()}, R"([
+ ["123456789ABCDEF0"],
+ ["123456789ABCDEF0"],
+ ["123456789ABCDEF0"],
+ ["ABCDEF0"],
+ ["123456789"]])"); // 9-byte tail row, larger than a word.
ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64);
ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64);
ExecBatchBuilder builder;
@@ -494,6 +497,66 @@ TEST(ExecBatchBuilder, AppendBatchDupRows) {
ASSERT_EQ(batch_string_appended, built);
ASSERT_NE(0, pool->bytes_allocated());
}
+
+ {
+ // This is a simplified case of GH-39583, using fsb(3) type.
+ // 63-byte data occupying almost one minimal 64-byte aligned memory region.
+ ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(3)}, R"([
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["000"],
+ ["123"]])"); // 3-byte tail row, not aligned to a word.
+ ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64);
+ ExecBatchBuilder builder;
+ uint16_t row_ids[4] = {20, 20, 20,
+ 20}; // Get the last row 4 times, 3 to skip a word.
+ ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 4, row_ids,
/*num_cols=*/1));
+ ExecBatch built = builder.Flush();
+ ExecBatch batch_fsb_appended = JSONToExecBatch(
+ {fixed_size_binary(3)}, R"([["123"], ["123"], ["123"], ["123"]])");
+ ASSERT_EQ(batch_fsb_appended, built);
+ ASSERT_NE(0, pool->bytes_allocated());
+ }
+
+ {
+ // This is a simplified case of GH-39583, using fsb(9) type.
+ // 63-byte data occupying almost one minimal 64-byte aligned memory region.
+ ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(9)}, R"([
+ ["000000000"],
+ ["000000000"],
+ ["000000000"],
+ ["000000000"],
+ ["000000000"],
+ ["000000000"],
+ ["123456789"]])"); // 9-byte tail row, not aligned to a word.
+ ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64);
+ ExecBatchBuilder builder;
+ uint16_t row_ids[2] = {6, 6}; // Get the last row 2 times, 1 to skip a
word.
+ ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 2, row_ids,
/*num_cols=*/1));
+ ExecBatch built = builder.Flush();
+ ExecBatch batch_fsb_appended =
+ JSONToExecBatch({fixed_size_binary(9)}, R"([["123456789"],
["123456789"]])");
+ ASSERT_EQ(batch_fsb_appended, built);
+ ASSERT_NE(0, pool->bytes_allocated());
+ }
+
ASSERT_EQ(0, pool->bytes_allocated());
}