This is an automated email from the ASF dual-hosted git repository. raulcd pushed a commit to branch maint-16.x.x in repository https://gitbox.apache.org/repos/asf/arrow.git
commit f0b570bc7b9eb3860dac55acc0f146a6b3f39d8b Author: Adam Reeve <[email protected]> AuthorDate: Wed Apr 17 00:25:49 2024 +1200 GH-41225: [C#] Slice value buffers when writing sliced list or binary arrays in IPC format (#41230) ### Rationale for this change This reduces file sizes when writing sliced binary or list arrays to IPC format. ### What changes are included in this PR? Changes `ArrowStreamWriter` to write only the subset of the values that is needed rather than the full value buffer when writing a `ListArray` or `BinaryArray`, and compute shifted value offset buffers. ### Are these changes tested? This code is covered by existing tests and the change doesn't introduce any difference in the observed array values, so I haven't added new tests or checks. I did change how list arrays are compared though as we can no longer compare the value and value offset buffers directly, so the tests now get list items as arrays and create a new `ArrayComparer` to compare them. This meant that array offsets are no longer always zero, so I've changed the offset assertions to only be used in strict mode. ### Are there any user-facing changes? Yes, this might reduce IPC file sizes for users writing sliced data. * GitHub Issue: #41225 Authored-by: Adam Reeve <[email protected]> Signed-off-by: Curt Hagenlocher <[email protected]> --- csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 54 ++++++++++++++++++++-- .../Apache.Arrow.Tests/ArrowFileWriterTests.cs | 1 + .../test/Apache.Arrow.Tests/ArrowReaderVerifier.cs | 50 ++++++++++++++------ csharp/test/Apache.Arrow.Tests/TestData.cs | 47 ++++++++++++++----- 4 files changed, 122 insertions(+), 30 deletions(-) diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index 6127c5a662..1b83735925 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -163,9 +163,18 @@ namespace Apache.Arrow.Ipc public void Visit(ListArray array) { _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); - _buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, array.Offset, array.Length + 1)); + _buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length))); - VisitArray(array.Values); + int valuesOffset = array.ValueOffsets[0]; + int valuesLength = array.ValueOffsets[array.Length] - valuesOffset; + + var values = array.Values; + if (valuesOffset > 0 || valuesLength < values.Length) + { + values = ArrowArrayFactory.Slice(values, valuesOffset, valuesLength); + } + + VisitArray(values); } public void Visit(ListViewArray array) @@ -195,8 +204,12 @@ namespace Apache.Arrow.Ipc public void Visit(BinaryArray array) { _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); - _buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, array.Offset, array.Length + 1)); - _buffers.Add(CreateBuffer(array.ValueBuffer)); + _buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, array.Offset, array.Length))); + + int valuesOffset = array.ValueOffsets[0]; + int valuesLength = array.ValueOffsets[array.Length] - valuesOffset; + + _buffers.Add(CreateSlicedBuffer<byte>(array.ValueBuffer, valuesOffset, valuesLength)); } public void Visit(BinaryViewArray array) @@ -263,6 +276,39 @@ namespace Apache.Arrow.Ipc // There are no buffers for a NullArray } + private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer valueOffsetsBuffer, int arrayOffset, int arrayLength) + { + var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * (arrayLength + 1)); + + if (arrayOffset != 0) + { + // Array has been sliced, so we need to shift and adjust the offsets + var originalOffsets = valueOffsetsBuffer.Span.CastTo<int>().Slice(arrayOffset, arrayLength + 1); + var firstOffset = arrayLength > 0 ? originalOffsets[0] : 0; + + var newValueOffsetsBuffer = _allocator.Allocate(requiredBytes); + var newValueOffsets = newValueOffsetsBuffer.Memory.Span.CastTo<int>(); + + for (int i = 0; i < arrayLength + 1; ++i) + { + newValueOffsets[i] = originalOffsets[i] - firstOffset; + } + + return new ArrowBuffer(newValueOffsetsBuffer); + } + else if (valueOffsetsBuffer.Length > requiredBytes) + { + // Array may have been sliced but the offset is zero, + // so we can truncate the existing offsets + return new ArrowBuffer(valueOffsetsBuffer.Memory.Slice(0, requiredBytes)); + } + else + { + // Use the full buffer + return valueOffsetsBuffer; + } + } + private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, int length) { if (buffer.IsEmpty) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs index faf650973d..baea4d61e5 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs @@ -113,6 +113,7 @@ namespace Apache.Arrow.Tests [InlineData(0, 45)] [InlineData(3, 45)] [InlineData(16, 45)] + [InlineData(10, 0)] public async Task WriteSlicedArrays(int sliceOffset, int sliceLength) { var originalBatch = TestData.CreateSampleRecordBatch(length: 100); diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs index 07c8aa3f56..9597219321 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs @@ -160,10 +160,14 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, array.Length); Assert.Equal(expectedArray.NullCount, array.NullCount); - Assert.Equal(0, array.Offset); Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length); Assert.Equal(expectedArray.Fields.Count, array.Fields.Count); + if (_strictCompare) + { + Assert.Equal(expectedArray.Offset, array.Offset); + } + for (int i = 0; i < array.Fields.Count; i++) { array.Fields[i].Accept(new ArrayComparer(expectedArray.Fields[i], _strictCompare)); @@ -178,12 +182,12 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Mode, array.Mode); Assert.Equal(expectedArray.Length, array.Length); Assert.Equal(expectedArray.NullCount, array.NullCount); - Assert.Equal(0, array.Offset); Assert.Equal(expectedArray.Data.Children.Length, array.Data.Children.Length); Assert.Equal(expectedArray.Fields.Count, array.Fields.Count); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, array.Offset); Assert.True(expectedArray.TypeBuffer.Span.SequenceEqual(array.TypeBuffer.Span)); } else @@ -252,12 +256,12 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length))); } @@ -284,7 +288,11 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); + + if (_strictCompare) + { + Assert.Equal(expectedArray.Offset, actualArray.Offset); + } CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); @@ -309,12 +317,12 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueBuffer.Span.Slice(0, expectedArray.Length).SequenceEqual(actualArray.ValueBuffer.Span.Slice(0, actualArray.Length))); } else @@ -338,12 +346,12 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.Values.Slice(0, expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, actualArray.Length))); } else @@ -370,12 +378,12 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); int booleanByteCount = BitUtility.ByteCount(expectedArray.Length); Assert.True(expectedArray.Values.Slice(0, booleanByteCount).SequenceEqual(actualArray.Values.Slice(0, booleanByteCount))); } @@ -397,22 +405,31 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); + actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } else { - int offsetsStart = (expectedArray.Offset) * sizeof(int); - int offsetsLength = (expectedArray.Length + 1) * sizeof(int); - Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(offsetsStart, offsetsLength).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, offsetsLength))); + for (int i = 0; i < actualArray.Length; ++i) + { + if (expectedArray.IsNull(i)) + { + Assert.True(actualArray.IsNull(i)); + } + else + { + var expectedList = expectedArray.GetSlicedValues(i); + var actualList = actualArray.GetSlicedValues(i); + actualList.Accept(new ArrayComparer(expectedList, _strictCompare)); + } + } } - - actualArray.Values.Accept(new ArrayComparer(expectedArray.Values, _strictCompare)); } private void CompareArrays(ListViewArray actualArray) @@ -424,12 +441,12 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); if (_strictCompare) { + Assert.Equal(expectedArray.Offset, actualArray.Offset); Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span)); Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span)); } @@ -453,7 +470,10 @@ namespace Apache.Arrow.Tests Assert.Equal(expectedArray.Length, actualArray.Length); Assert.Equal(expectedArray.NullCount, actualArray.NullCount); - Assert.Equal(0, actualArray.Offset); + if (_strictCompare) + { + Assert.Equal(expectedArray.Offset, actualArray.Offset); + } CompareValidityBuffer(expectedArray.NullCount, _expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, actualArray.NullBitmapBuffer); diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs b/csharp/test/Apache.Arrow.Tests/TestData.cs index 29ddef2864..3ea42ee0fb 100644 --- a/csharp/test/Apache.Arrow.Tests/TestData.cs +++ b/csharp/test/Apache.Arrow.Tests/TestData.cs @@ -294,7 +294,18 @@ namespace Apache.Arrow.Tests for (var i = 0; i < Length; i++) { - builder.Append(str); + switch (i % 3) + { + case 0: + builder.AppendNull(); + break; + case 1: + builder.Append(str); + break; + case 2: + builder.Append(str + str); + break; + } } Array = builder.Build(); @@ -328,15 +339,21 @@ namespace Apache.Arrow.Tests { var builder = new ListArray.Builder(type.ValueField).Reserve(Length); - var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1); + var valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(Length * 3 / 2); for (var i = 0; i < Length; i++) { - builder.Append(); - valueBuilder.Append(i); + if (i % 10 == 2) + { + builder.AppendNull(); + } + else + { + builder.Append(); + var listLength = i % 4; + valueBuilder.AppendRange(Enumerable.Range(i, listLength).Select(x => (long)x)); + } } - //Add a value to check if Values.Length can exceed ListArray.Length - valueBuilder.Append(0); Array = builder.Build(); } @@ -352,8 +369,12 @@ namespace Apache.Arrow.Tests builder.Append(); valueBuilder.Append(i); } - //Add a value to check if Values.Length can exceed ListArray.Length - valueBuilder.Append(0); + + if (Length > 0) + { + // Add a value to check if Values.Length can exceed ListArray.Length + valueBuilder.Append(0); + } Array = builder.Build(); } @@ -562,9 +583,13 @@ namespace Apache.Arrow.Tests keyBuilder.Append(i.ToString()); valueBuilder.Append(i); } - //Add a value to check if Values.Length can exceed MapArray.Length - keyBuilder.Append("0"); - valueBuilder.Append(0); + + if (Length > 0) + { + // Add a value to check if Values.Length can exceed MapArray.Length + keyBuilder.Append("0"); + valueBuilder.Append(0); + } Array = builder.Build(); }
