This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-16.x.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 4155a298ec54c9dc3852b4becb54cb552a5974b4
Author: Adam Reeve <[email protected]>
AuthorDate: Fri Apr 19 15:00:15 2024 +1200

    GH-41231: [C#] Slice values array when writing a sliced list view array to 
IPC format (#41255)
    
    ### Rationale for this change
    
    Reduces IPC file sizes when writing sliced list view arrays.
    
    ### What changes are included in this PR?
    
    Updates `ArrowSreamWriter` so it only writes the required range of values 
for a list view array, and adjusts the offset values accordingly.
    
    ### Are these changes tested?
    
    Yes, this is covered by existing tests and I've also added a new test to 
verify the behaviour with list view arrays that have unordered offsets.
    
    ### Are there any user-facing changes?
    
    Yes, this might reduce IPC file sizes for users writing sliced data.
    * GitHub Issue: #41231
    
    Lead-authored-by: Adam Reeve <[email protected]>
    Co-authored-by: Curt Hagenlocher <[email protected]>
    Signed-off-by: Curt Hagenlocher <[email protected]>
---
 csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs   | 64 +++++++++++++++++-
 .../Apache.Arrow.Tests/ArrowFileWriterTests.cs     | 77 ++++++++++++++++++++++
 .../test/Apache.Arrow.Tests/ArrowReaderVerifier.cs | 20 ++++--
 3 files changed, 153 insertions(+), 8 deletions(-)

diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs 
b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
index a7e4c13525..b11479c0d4 100644
--- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
+++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
@@ -184,11 +184,19 @@ namespace Apache.Arrow.Ipc
 
             public void Visit(ListViewArray array)
             {
+                var (valueOffsetsBuffer, minOffset, maxEnd) = 
GetZeroBasedListViewOffsets(array);
+
                 _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, 
array.Offset, array.Length));
-                _buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, 
array.Offset, array.Length));
+                _buffers.Add(CreateBuffer(valueOffsetsBuffer));
                 _buffers.Add(CreateSlicedBuffer<int>(array.SizesBuffer, 
array.Offset, array.Length));
 
-                VisitArray(array.Values);
+                IArrowArray values = array.Values;
+                if (minOffset != 0 || values.Length != maxEnd)
+                {
+                    values = ArrowArrayFactory.Slice(values, minOffset, maxEnd 
- minOffset);
+                }
+
+                VisitArray(values);
             }
 
             public void Visit(FixedSizeListArray array)
@@ -319,6 +327,58 @@ namespace Apache.Arrow.Ipc
                 }
             }
 
+            private (ArrowBuffer Buffer, int minOffset, int maxEnd) 
GetZeroBasedListViewOffsets(ListViewArray array)
+            {
+                if (array.Length == 0)
+                {
+                    return (ArrowBuffer.Empty, 0, 0);
+                }
+
+                var offsets = array.ValueOffsets;
+                var sizes = array.Sizes;
+
+                int minOffset = offsets[0];
+                int maxEnd = offsets[array.Length - 1] + sizes[array.Length - 
1];
+
+                // Min possible offset is zero, and max possible end is the 
values length.
+                // If these match the first offset and last end we don't need 
to do anything further,
+                // but otherwise we need to iterate over each index in case 
the offsets aren't ordered.
+                if (minOffset != 0 || maxEnd != array.Values.Length)
+                {
+                    for (int i = 0; i < array.Length; ++i)
+                    {
+                        minOffset = Math.Min(minOffset, offsets[i]);
+                        maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
+                    }
+                }
+
+                var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * 
array.Length);
+
+                if (minOffset == 0)
+                {
+                    // No need to adjust the offsets, but we may need to slice 
the offsets buffer.
+                    ArrowBuffer buffer = array.ValueOffsetsBuffer;
+                    if (array.Offset != 0 || buffer.Length > requiredBytes)
+                    {
+                        var byteOffset = sizeof(int) * array.Offset;
+                        var sliceLength = Math.Min(requiredBytes, 
buffer.Length - byteOffset);
+                        buffer = new 
ArrowBuffer(buffer.Memory.Slice(byteOffset, sliceLength));
+                    }
+
+                    return (buffer, minOffset, maxEnd);
+                }
+
+                // Compute shifted offsets
+                var newOffsetsBuffer = _allocator.Allocate(requiredBytes);
+                var newOffsets = newOffsetsBuffer.Memory.Span.CastTo<int>();
+                for (int i = 0; i < array.Length; ++i)
+                {
+                    newOffsets[i] = offsets[i] - minOffset;
+                }
+
+                return (new ArrowBuffer(newOffsetsBuffer), minOffset, maxEnd);
+            }
+
             private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, 
int length)
             {
                 if (buffer.IsEmpty)
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs 
b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
index 297cb5e181..7b25910114 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
@@ -135,6 +135,71 @@ namespace Apache.Arrow.Tests
             await ValidateRecordBatchFile(stream, slicedBatch, strictCompare: 
false);
         }
 
+        [Theory]
+        [InlineData(0, 100)]
+        [InlineData(0, 50)]
+        [InlineData(50, 50)]
+        [InlineData(25, 50)]
+        public async Task WriteListViewDataWithUnorderedOffsets(int 
sliceOffset, int sliceLength)
+        {
+            // A list-view array doesn't require that offsets are ordered,
+            // so verify that we can round trip a list-view array with 
out-of-order offsets.
+            const int length = 100;
+            var random = new Random();
+
+            var randomizedIndices = Enumerable.Range(0, length).ToArray();
+            Shuffle(randomizedIndices, random);
+
+            var offsetsBuilder = new ArrowBuffer.Builder<int>().Resize(length);
+            var sizesBuilder = new ArrowBuffer.Builder<int>().Resize(length);
+            var validityBuilder = new 
ArrowBuffer.BitmapBuilder().Reserve(length);
+
+            var valuesLength = 0;
+            for (int i = 0; i < length; ++i)
+            {
+                var index = randomizedIndices[i];
+                var listLength = random.Next(0, 10);
+                offsetsBuilder.Span[index] = valuesLength;
+                sizesBuilder.Span[index] = listLength;
+                valuesLength += listLength;
+
+                validityBuilder.Append(random.NextDouble() < 0.9);
+            }
+
+            var valuesBuilder = new Int64Array.Builder().Reserve(valuesLength);
+            for (int i = 0; i < valuesLength; ++i)
+            {
+                valuesBuilder.Append(random.Next(0, 1_000));
+            }
+
+            var type = new ListViewType(new Int64Type());
+            var offsets = offsetsBuilder.Build();
+            var sizes = sizesBuilder.Build();
+            var values = valuesBuilder.Build();
+            var nullCount = validityBuilder.UnsetBitCount;
+            var validityBuffer = validityBuilder.Build();
+
+            IArrowArray listViewArray = new ListViewArray(
+                type, length, offsets, sizes, values, validityBuffer, 
nullCount);
+
+            if (sliceOffset != 0 || sliceLength != length)
+            {
+                listViewArray = ArrowArrayFactory.Slice(listViewArray, 
sliceOffset, sliceLength);
+            }
+
+            var recordBatch = new RecordBatch.Builder().Append("x", true, 
listViewArray).Build();
+
+            var stream = new MemoryStream();
+            var writer = new ArrowFileWriter(stream, recordBatch.Schema, 
leaveOpen: true);
+
+            await writer.WriteRecordBatchAsync(recordBatch);
+            await writer.WriteEndAsync();
+
+            stream.Position = 0;
+
+            await ValidateRecordBatchFile(stream, recordBatch, strictCompare: 
false);
+        }
+
         private async Task ValidateRecordBatchFile(Stream stream, RecordBatch 
recordBatch, bool strictCompare = true)
         {
             var reader = new ArrowFileReader(stream);
@@ -245,5 +310,17 @@ namespace Apache.Arrow.Tests
 
             await ValidateRecordBatchFile(stream, recordBatch, strictCompare: 
false);
         }
+
+        private static void Shuffle(int[] values, Random random)
+        {
+            var length = values.Length;
+            for (int i = 0; i < length - 1; ++i)
+            {
+                var j = random.Next(i, length);
+                var tmp = values[i];
+                values[i] = values[j];
+                values[j] = tmp;
+            }
+        }
     }
 }
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs 
b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
index 9597219321..555c08a90f 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
@@ -449,16 +449,24 @@ namespace Apache.Arrow.Tests
                     Assert.Equal(expectedArray.Offset, actualArray.Offset);
                     
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
                     
Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span));
+                    actualArray.Values.Accept(new 
ArrayComparer(expectedArray.Values, _strictCompare));
                 }
                 else
                 {
-                    int start = expectedArray.Offset * sizeof(int);
-                    int length = expectedArray.Length * sizeof(int);
-                    
Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(start, 
length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length)));
-                    Assert.True(expectedArray.SizesBuffer.Span.Slice(start, 
length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length)));
+                    for (int i = 0; i < actualArray.Length; ++i)
+                    {
+                        if (expectedArray.IsNull(i))
+                        {
+                            Assert.True(actualArray.IsNull(i));
+                        }
+                        else
+                        {
+                            var expectedList = 
expectedArray.GetSlicedValues(i);
+                            var actualList = actualArray.GetSlicedValues(i);
+                            actualList.Accept(new ArrayComparer(expectedList, 
_strictCompare));
+                        }
+                    }
                 }
-
-                actualArray.Values.Accept(new 
ArrayComparer(expectedArray.Values, _strictCompare));
             }
 
             private void CompareArrays(FixedSizeListArray actualArray)

Reply via email to