This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 1dc3b81875 GH-39583: [C++] Fix the issue of ExecBatchBuilder when 
appending consecutive tail rows with the same id may exceed buffer boundary 
(for fixed size types) (#39585)
1dc3b81875 is described below

commit 1dc3b81875d243f5d135caa6b2db7a669888ecb4
Author: Rossi(Ruoxi) Sun <[email protected]>
AuthorDate: Thu Jan 18 00:26:48 2024 +0800

    GH-39583: [C++] Fix the issue of ExecBatchBuilder when appending 
consecutive tail rows with the same id may exceed buffer boundary (for fixed 
size types) (#39585)
    
    
    
    ### Rationale for this change
    
    #39583 is a subsequent issue of #32570 (fixed by #39234). The last issue 
and its fix only resolved var-length types. It turns out fixed-size types have 
the same issue.
    
    ### What changes are included in this PR?
    
    Do the same fix of #39234 for fixed size types.
    
    ### Are these changes tested?
    
    UT included.
    
    ### Are there any user-facing changes?
    
    * Closes: #39583
    
    Authored-by: zanmato1984 <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/compute/light_array.cc      | 21 ++++-----
 cpp/src/arrow/compute/light_array_test.cc | 75 ++++++++++++++++++++++++++++---
 2 files changed, 77 insertions(+), 19 deletions(-)

diff --git a/cpp/src/arrow/compute/light_array.cc 
b/cpp/src/arrow/compute/light_array.cc
index 73ea01a03a..66d8477b02 100644
--- a/cpp/src/arrow/compute/light_array.cc
+++ b/cpp/src/arrow/compute/light_array.cc
@@ -383,27 +383,22 @@ int ExecBatchBuilder::NumRowsToSkip(const 
std::shared_ptr<ArrayData>& column,
 
   KeyColumnMetadata column_metadata =
       ColumnMetadataFromDataType(column->type).ValueOrDie();
+  ARROW_DCHECK(!column_metadata.is_fixed_length || 
column_metadata.fixed_length > 0);
 
   int num_rows_left = num_rows;
   int num_bytes_skipped = 0;
   while (num_rows_left > 0 && num_bytes_skipped < num_tail_bytes_to_skip) {
+    --num_rows_left;
+    int row_id_removed = row_ids[num_rows_left];
     if (column_metadata.is_fixed_length) {
-      if (column_metadata.fixed_length == 0) {
-        num_rows_left = std::max(num_rows_left, 8) - 8;
-        ++num_bytes_skipped;
-      } else {
-        --num_rows_left;
-        num_bytes_skipped += column_metadata.fixed_length;
-      }
+      num_bytes_skipped += column_metadata.fixed_length;
     } else {
-      --num_rows_left;
-      int row_id_removed = row_ids[num_rows_left];
       const int32_t* offsets = column->GetValues<int32_t>(1);
       num_bytes_skipped += offsets[row_id_removed + 1] - 
offsets[row_id_removed];
-      // Skip consecutive rows with the same id
-      while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 
1]) {
-        --num_rows_left;
-      }
+    }
+    // Skip consecutive rows with the same id
+    while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) {
+      --num_rows_left;
     }
   }
 
diff --git a/cpp/src/arrow/compute/light_array_test.cc 
b/cpp/src/arrow/compute/light_array_test.cc
index 3ceba43604..d50e967551 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -474,15 +474,18 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
 TEST(ExecBatchBuilder, AppendBatchDupRows) {
   std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
   MemoryPool* pool = owned_pool.get();
+
   // Case of cross-word copying for the last row, which may exceed the buffer 
boundary.
-  // This is a simplified case of GH-32570
+  //
   {
+    // This is a simplified case of GH-32570
     // 64-byte data fully occupying one minimal 64-byte aligned memory region.
-    ExecBatch batch_string = JSONToExecBatch({binary()}, 
R"([["123456789ABCDEF0"],
-      ["123456789ABCDEF0"],
-      ["123456789ABCDEF0"],
-      ["ABCDEF0"],
-      ["123456789"]])");  // 9-byte tail row, larger than a word.
+    ExecBatch batch_string = JSONToExecBatch({binary()}, R"([
+        ["123456789ABCDEF0"],
+        ["123456789ABCDEF0"],
+        ["123456789ABCDEF0"],
+        ["ABCDEF0"],
+        ["123456789"]])");  // 9-byte tail row, larger than a word.
     ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64);
     ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64);
     ExecBatchBuilder builder;
@@ -494,6 +497,66 @@ TEST(ExecBatchBuilder, AppendBatchDupRows) {
     ASSERT_EQ(batch_string_appended, built);
     ASSERT_NE(0, pool->bytes_allocated());
   }
+
+  {
+    // This is a simplified case of GH-39583, using fsb(3) type.
+    // 63-byte data occupying almost one minimal 64-byte aligned memory region.
+    ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(3)}, R"([
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["000"],
+        ["123"]])");  // 3-byte tail row, not aligned to a word.
+    ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64);
+    ExecBatchBuilder builder;
+    uint16_t row_ids[4] = {20, 20, 20,
+                           20};  // Get the last row 4 times, 3 to skip a word.
+    ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 4, row_ids, 
/*num_cols=*/1));
+    ExecBatch built = builder.Flush();
+    ExecBatch batch_fsb_appended = JSONToExecBatch(
+        {fixed_size_binary(3)}, R"([["123"], ["123"], ["123"], ["123"]])");
+    ASSERT_EQ(batch_fsb_appended, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+
+  {
+    // This is a simplified case of GH-39583, using fsb(9) type.
+    // 63-byte data occupying almost one minimal 64-byte aligned memory region.
+    ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(9)}, R"([
+        ["000000000"],
+        ["000000000"],
+        ["000000000"],
+        ["000000000"],
+        ["000000000"],
+        ["000000000"],
+        ["123456789"]])");  // 9-byte tail row, not aligned to a word.
+    ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64);
+    ExecBatchBuilder builder;
+    uint16_t row_ids[2] = {6, 6};  // Get the last row 2 times, 1 to skip a 
word.
+    ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 2, row_ids, 
/*num_cols=*/1));
+    ExecBatch built = builder.Flush();
+    ExecBatch batch_fsb_appended =
+        JSONToExecBatch({fixed_size_binary(9)}, R"([["123456789"], 
["123456789"]])");
+    ASSERT_EQ(batch_fsb_appended, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+
   ASSERT_EQ(0, pool->bytes_allocated());
 }
 

Reply via email to