This is an automated email from the ASF dual-hosted git repository.

zanmato pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 1302889c01 GH-43124: [C++] Initialize offset vector head as 0 after 
memory allocated in grouper.cc (#43123)
1302889c01 is described below

commit 1302889c01bfbd04dc2dca4995078d2eca9311cd
Author: flashzxi <[email protected]>
AuthorDate: Tue Nov 19 17:56:50 2024 +0800

    GH-43124: [C++] Initialize offset vector head as 0 after memory allocated 
in grouper.cc (#43123)
    
    ## bug description
    In grouper.cc:780, memory is allocated for offset vector of varlen column, 
however the vector is initialized in  `encoder_.DecodeFixedLengthBuffers`, 
which will never be called when `num_groups==0`(see line 786). Then 
`fixedlen_bufs[i][0]` will be a uninitialized value that means a random 
uint32_t. Later this random uint32_t is used to 
`AllocatePaddedBuffer(varlen_size)`. However an random uint32_t is up to 4GB 
memory, the program may run normally without being affected.
    
    ## how to fix
    set offset vector head as 0 after memory allocated in case it won't be 
initialized when `num_groups==0`
    
    * GitHub Issue: #43124
    
    Lead-authored-by: FlashZXi <[email protected]>
    Co-authored-by: Rossi Sun <[email protected]>
    Co-authored-by: Ruoxi Sun <[email protected]>
    Signed-off-by: Ruoxi Sun <[email protected]>
---
 cpp/src/arrow/compute/row/grouper.cc      |  3 +++
 cpp/src/arrow/compute/row/grouper_test.cc | 22 ++++++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/cpp/src/arrow/compute/row/grouper.cc 
b/cpp/src/arrow/compute/row/grouper.cc
index 6c1a93fb89..137e9a290e 100644
--- a/cpp/src/arrow/compute/row/grouper.cc
+++ b/cpp/src/arrow/compute/row/grouper.cc
@@ -877,6 +877,9 @@ struct GrouperFastImpl : public Grouper {
       } else {
         ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i],
                               AllocatePaddedBuffer((num_groups + 1) * 
sizeof(uint32_t)));
+        // Set offset[0] to 0 so the later allocation of varlen_bufs doesn't 
see an
+        // uninitialized value when num_groups == 0.
+        reinterpret_cast<uint32_t*>(fixedlen_bufs[i]->mutable_data())[0] = 0;
       }
       cols_[i] =
           KeyColumnArray(col_metadata_[i], num_groups, 
non_null_bufs[i]->mutable_data(),
diff --git a/cpp/src/arrow/compute/row/grouper_test.cc 
b/cpp/src/arrow/compute/row/grouper_test.cc
index 1e853be5e4..fcee46863f 100644
--- a/cpp/src/arrow/compute/row/grouper_test.cc
+++ b/cpp/src/arrow/compute/row/grouper_test.cc
@@ -64,5 +64,27 @@ TEST(Grouper, ResortedColumnsWithLargeNullRows) {
   }
 }
 
+// Reproduction of GH-43124: Provoke var length buffer size if a grouper 
produces zero
+// groups.
+TEST(Grouper, EmptyGroups) {
+  ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make({int32(), utf8()}));
+  ASSERT_OK_AND_ASSIGN(auto groups, grouper->GetUniques());
+
+  ASSERT_TRUE(groups[0].is_array());
+  ASSERT_EQ(groups[0].array()->buffers.size(), 2);
+  ASSERT_EQ(groups[0].array()->buffers[0], nullptr);
+  ASSERT_NE(groups[0].array()->buffers[1], nullptr);
+  ASSERT_EQ(groups[0].array()->buffers[1]->size(), 0);
+
+  ASSERT_TRUE(groups[1].is_array());
+  ASSERT_EQ(groups[1].array()->buffers.size(), 3);
+  ASSERT_EQ(groups[1].array()->buffers[0], nullptr);
+  ASSERT_NE(groups[1].array()->buffers[1], nullptr);
+  ASSERT_EQ(groups[1].array()->buffers[1]->size(), 4);
+  ASSERT_EQ(groups[1].array()->buffers[1]->data_as<const uint32_t>()[0], 0);
+  ASSERT_NE(groups[1].array()->buffers[2], nullptr);
+  ASSERT_EQ(groups[1].array()->buffers[2]->size(), 0);
+}
+
 }  // namespace compute
 }  // namespace arrow

Reply via email to