This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 0e85c12a38 GH-46407: [C++] Fix IPC serialization of sliced list arrays 
(#46408)
0e85c12a38 is described below

commit 0e85c12a381d8bcdd07ed5d8ed07ff4b535b597d
Author: Bruno <[email protected]>
AuthorDate: Tue Jun 3 20:28:33 2025 +0200

    GH-46407: [C++] Fix IPC serialization of sliced list arrays (#46408)
    
    ### Rationale for this change
    
    Arrow C++ slices arrays by bumping the top-level `offset` value.
    However, Arrow Rust slices list arrays by slicing the `value_offsets`
    buffer. When receiving a Rust Arrow Array in C++ (via the C data
    interface), its IPC serialization fails to notice that the
    `value_offsets` buffer needed to be updated, but it still updates the
    `values` buffer.  This leads to a corrupt array on deserialization, with
a `value_offsets` buffer that points past the end of the values array.
    
    This PR fixes the IPC serialization by also looking at value_offset(0) to
    determine whether the `value_offsets` buffer needs reconstructing,
    instead of only looking at offset().
    This works because value_offset(int) reads the offsets buffer, shifted by
    the top-level offset.
    We still need to check offset(), to account for an array starting with an
    empty list (multiple zeroes at the start of the offsets buffer).
    
    ### What changes are included in this PR?
    
    The fix and nothing else
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    
    No (well, unless they are affected by the bug)
    
    **This PR contains a "Critical Fix".** (The changes fix a bug that caused
    incorrect or invalid data to be produced: valid operations on valid data
    produced invalid data.)
    
    * GitHub Issue: #46407
    
    Lead-authored-by: Bruno Cauet <[email protected]>
    Co-authored-by: Bruno Cauet <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/ipc/feather_test.cc    | 18 ++++++++++++++++++
 cpp/src/arrow/ipc/read_write_test.cc | 23 +++++++++++++++++++++++
 cpp/src/arrow/ipc/test_common.cc     | 14 ++++++++++----
 cpp/src/arrow/ipc/test_common.h      |  6 ++++++
 cpp/src/arrow/ipc/writer.cc          | 36 +++++++++++++++++++-----------------
 5 files changed, 76 insertions(+), 21 deletions(-)

diff --git a/cpp/src/arrow/ipc/feather_test.cc 
b/cpp/src/arrow/ipc/feather_test.cc
index ba3f4d828c..e1dc6046a1 100644
--- a/cpp/src/arrow/ipc/feather_test.cc
+++ b/cpp/src/arrow/ipc/feather_test.cc
@@ -319,6 +319,24 @@ TEST_P(TestFeather, SliceBooleanRoundTrip) {
   CheckSlices(batch);
 }
 
+TEST_P(TestFeather, SliceListRoundTrip) {
+  if (GetParam().version == kFeatherV1Version) {
+    GTEST_SKIP() << "Feather V1 does not support list types";
+  }
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(ipc::test::MakeListRecordBatchSized(600, &batch));
+  CheckSlices(batch);
+}
+
+TEST_P(TestFeather, SliceListViewRoundTrip) {
+  if (GetParam().version == kFeatherV1Version) {
+    GTEST_SKIP() << "Feather V1 does not support list view types";
+  }
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK(ipc::test::MakeListViewRecordBatchSized(600, &batch));
+  CheckSlices(batch);
+}
+
 INSTANTIATE_TEST_SUITE_P(
     FeatherTests, TestFeather,
     ::testing::Values(TestParam(kFeatherV1Version), 
TestParam(kFeatherV2Version),
diff --git a/cpp/src/arrow/ipc/read_write_test.cc 
b/cpp/src/arrow/ipc/read_write_test.cc
index 8bd28e4d58..84ec923ce8 100644
--- a/cpp/src/arrow/ipc/read_write_test.cc
+++ b/cpp/src/arrow/ipc/read_write_test.cc
@@ -579,6 +579,29 @@ TEST_F(TestIpcRoundTrip, SpecificMetadataVersion) {
   TestMetadataVersion(MetadataVersion::V5);
 }
 
+TEST_F(TestIpcRoundTrip, ListWithSlicedValues) {
+  // This tests serialization of a sliced ListArray that got sliced "the Rust
+  // way": by slicing the value_offsets buffer, but keeping top-level offset at
+  // 0.
+  auto child_data = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5]")->data();
+
+  // Offsets buffer [2, 5]
+  TypedBufferBuilder<int32_t> offsets_builder;
+  ASSERT_OK(offsets_builder.Reserve(2));
+  ASSERT_OK(offsets_builder.Append(2));
+  ASSERT_OK(offsets_builder.Append(5));
+  ASSERT_OK_AND_ASSIGN(auto offsets_buffer, offsets_builder.Finish());
+
+  auto list_data = ArrayData::Make(list(int32()),
+                                   /*num_rows=*/1,
+                                   /*buffers=*/{nullptr, offsets_buffer});
+  list_data->child_data = {child_data};
+  std::shared_ptr<Array> list_array = MakeArray(list_data);
+  ASSERT_OK(list_array->ValidateFull());
+
+  CheckRoundtrip(list_array);
+}
+
 TEST(TestReadMessage, CorruptedSmallInput) {
   std::string data = "abc";
   auto reader = io::BufferReader::FromString(data);
diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc
index 3d7137e496..46060a0db1 100644
--- a/cpp/src/arrow/ipc/test_common.cc
+++ b/cpp/src/arrow/ipc/test_common.cc
@@ -421,7 +421,7 @@ Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>* 
out) {
   return Status::OK();
 }
 
-Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
+Status MakeListRecordBatchSized(const int length, 
std::shared_ptr<RecordBatch>* out) {
   // Make the schema
   auto f0 = field("f0", list(int32()));
   auto f1 = field("f1", list(list(int32())));
@@ -431,7 +431,6 @@ Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* 
out) {
   // Example data
 
   MemoryPool* pool = default_memory_pool();
-  const int length = 200;
   std::shared_ptr<Array> leaf_values, list_array, list_list_array, 
large_list_array;
   const bool include_nulls = true;
   RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values));
@@ -446,7 +445,11 @@ Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* 
out) {
   return Status::OK();
 }
 
-Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
+Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
+  return MakeListRecordBatchSized(200, out);
+}
+
+Status MakeListViewRecordBatchSized(const int length, 
std::shared_ptr<RecordBatch>* out) {
   // Make the schema
   auto f0 = field("f0", list_view(int32()));
   auto f1 = field("f1", list_view(list_view(int32())));
@@ -456,7 +459,6 @@ Status 
MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
   // Example data
 
   MemoryPool* pool = default_memory_pool();
-  const int length = 200;
   std::shared_ptr<Array> leaf_values, list_array, list_list_array, 
large_list_array;
   const bool include_nulls = true;
   RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values));
@@ -471,6 +473,10 @@ Status 
MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
   return Status::OK();
 }
 
+Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
+  return MakeListViewRecordBatchSized(200, out);
+}
+
 Status MakeFixedSizeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
   // Make the schema
   auto f0 = field("f0", fixed_size_list(int32(), 1));
diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h
index 189de28879..6044ef207b 100644
--- a/cpp/src/arrow/ipc/test_common.h
+++ b/cpp/src/arrow/ipc/test_common.h
@@ -104,9 +104,15 @@ Status 
MakeStringTypesRecordBatchWithNulls(std::shared_ptr<RecordBatch>* out);
 ARROW_TESTING_EXPORT
 Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>* out);
 
+ARROW_TESTING_EXPORT
+Status MakeListRecordBatchSized(int length, std::shared_ptr<RecordBatch>* out);
+
 ARROW_TESTING_EXPORT
 Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out);
 
+ARROW_TESTING_EXPORT
+Status MakeListViewRecordBatchSized(int length, std::shared_ptr<RecordBatch>* 
out);
+
 ARROW_TESTING_EXPORT
 Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out);
 
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 90574c7cb6..8b7d943fc7 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -324,34 +324,36 @@ class RecordBatchSerializer {
     // Share slicing logic between ListArray, BinaryArray and LargeBinaryArray
     using offset_type = typename ArrayType::offset_type;
 
-    auto offsets = array.value_offsets();
+    if (array.length() == 0) {
+      *value_offsets = array.value_offsets();
+      return Status::OK();
+    }
 
     int64_t required_bytes = sizeof(offset_type) * (array.length() + 1);
-    if (array.offset() != 0) {
-      // If we have a non-zero offset, then the value offsets do not start at
-      // zero. We must a) create a new offsets array with shifted offsets and
-      // b) slice the values array accordingly
-
+    if (array.value_offset(0) > 0) {
+      // If the offset of the first value is non-zero, then we must create a 
new
+      // offsets buffer with shifted offsets.
       ARROW_ASSIGN_OR_RAISE(auto shifted_offsets,
                             AllocateBuffer(required_bytes, 
options_.memory_pool));
 
       auto dest_offsets = shifted_offsets->mutable_span_as<offset_type>();
-      const offset_type start_offset = array.value_offset(0);
+      const offset_type* source_offsets = array.raw_value_offsets();
+      const offset_type start_offset = source_offsets[0];
 
-      for (int i = 0; i < array.length(); ++i) {
-        dest_offsets[i] = array.value_offset(i) - start_offset;
+      for (int i = 0; i <= array.length(); ++i) {
+        dest_offsets[i] = source_offsets[i] - start_offset;
       }
-      // Final offset
-      dest_offsets[array.length()] = array.value_offset(array.length()) - 
start_offset;
-      offsets = std::move(shifted_offsets);
+      *value_offsets = std::move(shifted_offsets);
     } else {
-      // ARROW-6046: Slice offsets to used extent, in case we have a truncated
-      // slice
-      if (offsets != nullptr && offsets->size() > required_bytes) {
-        offsets = SliceBuffer(offsets, 0, required_bytes);
+      // ARROW-6046: if we have a truncated slice with unused leading or
+      // trailing data, then we slice it.
+      if (array.offset() > 0 || array.value_offsets()->size() > 
required_bytes) {
+        *value_offsets = SliceBuffer(
+            array.value_offsets(), array.offset() * sizeof(offset_type), 
required_bytes);
+      } else {
+        *value_offsets = array.value_offsets();
       }
     }
-    *value_offsets = std::move(offsets);
     return Status::OK();
   }
 

Reply via email to