This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 0e85c12a38 GH-46407: [C++] Fix IPC serialization of sliced list arrays
(#46408)
0e85c12a38 is described below
commit 0e85c12a381d8bcdd07ed5d8ed07ff4b535b597d
Author: Bruno <[email protected]>
AuthorDate: Tue Jun 3 20:28:33 2025 +0200
GH-46407: [C++] Fix IPC serialization of sliced list arrays (#46408)
### Rationale for this change
Arrow C++ slices arrays by bumping the top-level `offset` value.
However, Arrow Rust slices list arrays by slicing the `value_offsets`
buffer. When receiving a Rust Arrow Array in C++ (via the C data
interface), its IPC serialization fails to notice that the
`value_offsets` buffer needed to be updated, but it still updates the
`values` buffer. This leads to a corrupt array on deserialization, with
a `value_offsets` buffer that points past the end of the values array.
This PR fixes the IPC serialization by also looking at value_offset(0) to
determine whether the `value_offsets` buffer needs reconstructing,
instead of only looking at offset().
This works because value_offset(int) is the offsets buffer, shifted by the
top-level offset.
We still need to check for offset(), to account for an array starting with an
empty list (multiple zeroes at the start of the offsets buffer).
### What changes are included in this PR?
The fix and nothing else
### Are these changes tested?
Yes
### Are there any user-facing changes?
No (well, unless they are affected by the bug)
**This PR contains a "Critical Fix".** (the changes fix (b) a bug that
caused incorrect or invalid data to be produced): valid operations on valid
data produced invalid data.
* GitHub Issue: #46407
Lead-authored-by: Bruno Cauet <[email protected]>
Co-authored-by: Bruno Cauet <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/ipc/feather_test.cc | 18 ++++++++++++++++++
cpp/src/arrow/ipc/read_write_test.cc | 23 +++++++++++++++++++++++
cpp/src/arrow/ipc/test_common.cc | 14 ++++++++++----
cpp/src/arrow/ipc/test_common.h | 6 ++++++
cpp/src/arrow/ipc/writer.cc | 36 +++++++++++++++++++-----------------
5 files changed, 76 insertions(+), 21 deletions(-)
diff --git a/cpp/src/arrow/ipc/feather_test.cc
b/cpp/src/arrow/ipc/feather_test.cc
index ba3f4d828c..e1dc6046a1 100644
--- a/cpp/src/arrow/ipc/feather_test.cc
+++ b/cpp/src/arrow/ipc/feather_test.cc
@@ -319,6 +319,24 @@ TEST_P(TestFeather, SliceBooleanRoundTrip) {
CheckSlices(batch);
}
+TEST_P(TestFeather, SliceListRoundTrip) {
+ if (GetParam().version == kFeatherV1Version) {
+ GTEST_SKIP() << "Feather V1 does not support list types";
+ }
+ std::shared_ptr<RecordBatch> batch;
+ ASSERT_OK(ipc::test::MakeListRecordBatchSized(600, &batch));
+ CheckSlices(batch);
+}
+
+TEST_P(TestFeather, SliceListViewRoundTrip) {
+ if (GetParam().version == kFeatherV1Version) {
+ GTEST_SKIP() << "Feather V1 does not support list view types";
+ }
+ std::shared_ptr<RecordBatch> batch;
+ ASSERT_OK(ipc::test::MakeListViewRecordBatchSized(600, &batch));
+ CheckSlices(batch);
+}
+
INSTANTIATE_TEST_SUITE_P(
FeatherTests, TestFeather,
::testing::Values(TestParam(kFeatherV1Version),
TestParam(kFeatherV2Version),
diff --git a/cpp/src/arrow/ipc/read_write_test.cc
b/cpp/src/arrow/ipc/read_write_test.cc
index 8bd28e4d58..84ec923ce8 100644
--- a/cpp/src/arrow/ipc/read_write_test.cc
+++ b/cpp/src/arrow/ipc/read_write_test.cc
@@ -579,6 +579,29 @@ TEST_F(TestIpcRoundTrip, SpecificMetadataVersion) {
TestMetadataVersion(MetadataVersion::V5);
}
+TEST_F(TestIpcRoundTrip, ListWithSlicedValues) {
+ // This tests serialization of a sliced ListArray that got sliced "the Rust
+ // way": by slicing the value_offsets buffer, but keeping top-level offset at
+ // 0.
+ auto child_data = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5]")->data();
+
+ // Offsets buffer [2, 5]
+ TypedBufferBuilder<int32_t> offsets_builder;
+ ASSERT_OK(offsets_builder.Reserve(2));
+ ASSERT_OK(offsets_builder.Append(2));
+ ASSERT_OK(offsets_builder.Append(5));
+ ASSERT_OK_AND_ASSIGN(auto offsets_buffer, offsets_builder.Finish());
+
+ auto list_data = ArrayData::Make(list(int32()),
+ /*num_rows=*/1,
+ /*buffers=*/{nullptr, offsets_buffer});
+ list_data->child_data = {child_data};
+ std::shared_ptr<Array> list_array = MakeArray(list_data);
+ ASSERT_OK(list_array->ValidateFull());
+
+ CheckRoundtrip(list_array);
+}
+
TEST(TestReadMessage, CorruptedSmallInput) {
std::string data = "abc";
auto reader = io::BufferReader::FromString(data);
diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc
index 3d7137e496..46060a0db1 100644
--- a/cpp/src/arrow/ipc/test_common.cc
+++ b/cpp/src/arrow/ipc/test_common.cc
@@ -421,7 +421,7 @@ Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>*
out) {
return Status::OK();
}
-Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
+Status MakeListRecordBatchSized(const int length,
std::shared_ptr<RecordBatch>* out) {
// Make the schema
auto f0 = field("f0", list(int32()));
auto f1 = field("f1", list(list(int32())));
@@ -431,7 +431,6 @@ Status MakeListRecordBatch(std::shared_ptr<RecordBatch>*
out) {
// Example data
MemoryPool* pool = default_memory_pool();
- const int length = 200;
std::shared_ptr<Array> leaf_values, list_array, list_list_array,
large_list_array;
const bool include_nulls = true;
RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values));
@@ -446,7 +445,11 @@ Status MakeListRecordBatch(std::shared_ptr<RecordBatch>*
out) {
return Status::OK();
}
-Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
+Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
+ return MakeListRecordBatchSized(200, out);
+}
+
+Status MakeListViewRecordBatchSized(const int length,
std::shared_ptr<RecordBatch>* out) {
// Make the schema
auto f0 = field("f0", list_view(int32()));
auto f1 = field("f1", list_view(list_view(int32())));
@@ -456,7 +459,6 @@ Status
MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
// Example data
MemoryPool* pool = default_memory_pool();
- const int length = 200;
std::shared_ptr<Array> leaf_values, list_array, list_list_array,
large_list_array;
const bool include_nulls = true;
RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values));
@@ -471,6 +473,10 @@ Status
MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
return Status::OK();
}
+Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out) {
+  return MakeListViewRecordBatchSized(200, out);
+}
+
Status MakeFixedSizeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
// Make the schema
auto f0 = field("f0", fixed_size_list(int32(), 1));
diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h
index 189de28879..6044ef207b 100644
--- a/cpp/src/arrow/ipc/test_common.h
+++ b/cpp/src/arrow/ipc/test_common.h
@@ -104,9 +104,15 @@ Status
MakeStringTypesRecordBatchWithNulls(std::shared_ptr<RecordBatch>* out);
ARROW_TESTING_EXPORT
Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>* out);
+ARROW_TESTING_EXPORT
+Status MakeListRecordBatchSized(int length, std::shared_ptr<RecordBatch>* out);
+
ARROW_TESTING_EXPORT
Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out);
+ARROW_TESTING_EXPORT
+Status MakeListViewRecordBatchSized(int length, std::shared_ptr<RecordBatch>*
out);
+
ARROW_TESTING_EXPORT
Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out);
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 90574c7cb6..8b7d943fc7 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -324,34 +324,36 @@ class RecordBatchSerializer {
// Share slicing logic between ListArray, BinaryArray and LargeBinaryArray
using offset_type = typename ArrayType::offset_type;
- auto offsets = array.value_offsets();
+ if (array.length() == 0) {
+ *value_offsets = array.value_offsets();
+ return Status::OK();
+ }
int64_t required_bytes = sizeof(offset_type) * (array.length() + 1);
- if (array.offset() != 0) {
- // If we have a non-zero offset, then the value offsets do not start at
- // zero. We must a) create a new offsets array with shifted offsets and
- // b) slice the values array accordingly
-
+ if (array.value_offset(0) > 0) {
+ // If the offset of the first value is non-zero, then we must create a
new
+ // offsets buffer with shifted offsets.
ARROW_ASSIGN_OR_RAISE(auto shifted_offsets,
AllocateBuffer(required_bytes,
options_.memory_pool));
auto dest_offsets = shifted_offsets->mutable_span_as<offset_type>();
- const offset_type start_offset = array.value_offset(0);
+ const offset_type* source_offsets = array.raw_value_offsets();
+ const offset_type start_offset = source_offsets[0];
- for (int i = 0; i < array.length(); ++i) {
- dest_offsets[i] = array.value_offset(i) - start_offset;
+ for (int i = 0; i <= array.length(); ++i) {
+ dest_offsets[i] = source_offsets[i] - start_offset;
}
- // Final offset
- dest_offsets[array.length()] = array.value_offset(array.length()) -
start_offset;
- offsets = std::move(shifted_offsets);
+ *value_offsets = std::move(shifted_offsets);
} else {
- // ARROW-6046: Slice offsets to used extent, in case we have a truncated
- // slice
- if (offsets != nullptr && offsets->size() > required_bytes) {
- offsets = SliceBuffer(offsets, 0, required_bytes);
+ // ARROW-6046: if we have a truncated slice with unused leading or
+ // trailing data, then we slice it.
+ if (array.offset() > 0 || array.value_offsets()->size() >
required_bytes) {
+ *value_offsets = SliceBuffer(
+ array.value_offsets(), array.offset() * sizeof(offset_type),
required_bytes);
+ } else {
+ *value_offsets = array.value_offsets();
}
}
- *value_offsets = std::move(offsets);
return Status::OK();
}