This is an automated email from the ASF dual-hosted git repository.
zanmato pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 1302889c01 GH-43124: [C++] Initialize offset vector head as 0 after
memory allocated in grouper.cc (#43123)
1302889c01 is described below
commit 1302889c01bfbd04dc2dca4995078d2eca9311cd
Author: flashzxi <[email protected]>
AuthorDate: Tue Nov 19 17:56:50 2024 +0800
GH-43124: [C++] Initialize offset vector head as 0 after memory allocated
in grouper.cc (#43123)
## bug description
In grouper.cc:780, memory is allocated for offset vector of varlen column,
however the vector is initialized in `encoder_.DecodeFixedLengthBuffers`,
which will never be called when `num_groups==0`(see line 786). Then
`fixedlen_bufs[i][0]` will be a uninitialized value that means a random
uint32_t. Later this random uint32_t is used to
`AllocatePaddedBuffer(varlen_size)`. However an random uint32_t is up to 4GB
memory, the program may run normally without being affected.
## how to fix
set offset vector head as 0 after memory allocated in case it won't be
initialized when `num_groups==0`
* GitHub Issue: #43124
Lead-authored-by: FlashZXi <[email protected]>
Co-authored-by: Rossi Sun <[email protected]>
Co-authored-by: Ruoxi Sun <[email protected]>
Signed-off-by: Ruoxi Sun <[email protected]>
---
cpp/src/arrow/compute/row/grouper.cc | 3 +++
cpp/src/arrow/compute/row/grouper_test.cc | 22 ++++++++++++++++++++++
2 files changed, 25 insertions(+)
diff --git a/cpp/src/arrow/compute/row/grouper.cc
b/cpp/src/arrow/compute/row/grouper.cc
index 6c1a93fb89..137e9a290e 100644
--- a/cpp/src/arrow/compute/row/grouper.cc
+++ b/cpp/src/arrow/compute/row/grouper.cc
@@ -877,6 +877,9 @@ struct GrouperFastImpl : public Grouper {
} else {
ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i],
AllocatePaddedBuffer((num_groups + 1) *
sizeof(uint32_t)));
+ // Set offset[0] to 0 so the later allocation of varlen_bufs doesn't
see an
+ // uninitialized value when num_groups == 0.
+ reinterpret_cast<uint32_t*>(fixedlen_bufs[i]->mutable_data())[0] = 0;
}
cols_[i] =
KeyColumnArray(col_metadata_[i], num_groups,
non_null_bufs[i]->mutable_data(),
diff --git a/cpp/src/arrow/compute/row/grouper_test.cc
b/cpp/src/arrow/compute/row/grouper_test.cc
index 1e853be5e4..fcee46863f 100644
--- a/cpp/src/arrow/compute/row/grouper_test.cc
+++ b/cpp/src/arrow/compute/row/grouper_test.cc
@@ -64,5 +64,27 @@ TEST(Grouper, ResortedColumnsWithLargeNullRows) {
}
}
+// Reproduction of GH-43124: Provoke var length buffer size if a grouper
produces zero
+// groups.
+TEST(Grouper, EmptyGroups) {
+ ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make({int32(), utf8()}));
+ ASSERT_OK_AND_ASSIGN(auto groups, grouper->GetUniques());
+
+ ASSERT_TRUE(groups[0].is_array());
+ ASSERT_EQ(groups[0].array()->buffers.size(), 2);
+ ASSERT_EQ(groups[0].array()->buffers[0], nullptr);
+ ASSERT_NE(groups[0].array()->buffers[1], nullptr);
+ ASSERT_EQ(groups[0].array()->buffers[1]->size(), 0);
+
+ ASSERT_TRUE(groups[1].is_array());
+ ASSERT_EQ(groups[1].array()->buffers.size(), 3);
+ ASSERT_EQ(groups[1].array()->buffers[0], nullptr);
+ ASSERT_NE(groups[1].array()->buffers[1], nullptr);
+ ASSERT_EQ(groups[1].array()->buffers[1]->size(), 4);
+ ASSERT_EQ(groups[1].array()->buffers[1]->data_as<const uint32_t>()[0], 0);
+ ASSERT_NE(groups[1].array()->buffers[2], nullptr);
+ ASSERT_EQ(groups[1].array()->buffers[2]->size(), 0);
+}
+
} // namespace compute
} // namespace arrow