This is an automated email from the ASF dual-hosted git repository.

curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 56186b994d GH-41225: [C#] Slice value buffers when writing sliced list 
or binary arrays in IPC format (#41230)
56186b994d is described below

commit 56186b994db3eab8b2684fde9e1726f0b0658ef6
Author: Adam Reeve <[email protected]>
AuthorDate: Wed Apr 17 00:25:49 2024 +1200

    GH-41225: [C#] Slice value buffers when writing sliced list or binary 
arrays in IPC format (#41230)
    
    ### Rationale for this change
    
    This reduces file sizes when writing sliced binary or list arrays to IPC 
format.
    
    ### What changes are included in this PR?
    
    Changes `ArrowStreamWriter` to write only the subset of the values that is 
needed rather than the full value buffer when writing a `ListArray` or 
`BinaryArray`, and compute shifted value offset buffers.
    
    ### Are these changes tested?
    
    This code is covered by existing tests and the change doesn't introduce any 
difference in the observed array values, so I haven't added new tests or checks.
    
    I did change how list arrays are compared though as we can no longer 
compare the value and value offset buffers directly, so the tests now get list 
items as arrays and create a new `ArrayComparer` to compare them. This meant 
that array offsets are no longer always zero, so I've changed the offset 
assertions to only be used in strict mode.
    
    ### Are there any user-facing changes?
    
    Yes, this might reduce IPC file sizes for users writing sliced data.
    * GitHub Issue: #41225
    
    Authored-by: Adam Reeve <[email protected]>
    Signed-off-by: Curt Hagenlocher <[email protected]>
---
 csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs   | 54 ++++++++++++++++++++--
 .../Apache.Arrow.Tests/ArrowFileWriterTests.cs     |  1 +
 .../test/Apache.Arrow.Tests/ArrowReaderVerifier.cs | 50 ++++++++++++++------
 csharp/test/Apache.Arrow.Tests/TestData.cs         | 47 ++++++++++++++-----
 4 files changed, 122 insertions(+), 30 deletions(-)

diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs 
b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
index 6127c5a662..1b83735925 100644
--- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
+++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
@@ -163,9 +163,18 @@ namespace Apache.Arrow.Ipc
             public void Visit(ListArray array)
             {
                 _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, 
array.Offset, array.Length));
-                _buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, 
array.Offset, array.Length + 1));
+                
_buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, 
array.Offset, array.Length)));
 
-                VisitArray(array.Values);
+                int valuesOffset = array.ValueOffsets[0];
+                int valuesLength = array.ValueOffsets[array.Length] - 
valuesOffset;
+
+                var values = array.Values;
+                if (valuesOffset > 0 || valuesLength < values.Length)
+                {
+                    values = ArrowArrayFactory.Slice(values, valuesOffset, 
valuesLength);
+                }
+
+                VisitArray(values);
             }
 
             public void Visit(ListViewArray array)
@@ -195,8 +204,12 @@ namespace Apache.Arrow.Ipc
             public void Visit(BinaryArray array)
             {
                 _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, 
array.Offset, array.Length));
-                _buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer, 
array.Offset, array.Length + 1));
-                _buffers.Add(CreateBuffer(array.ValueBuffer));
+                
_buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer, 
array.Offset, array.Length)));
+
+                int valuesOffset = array.ValueOffsets[0];
+                int valuesLength = array.ValueOffsets[array.Length] - 
valuesOffset;
+
+                _buffers.Add(CreateSlicedBuffer<byte>(array.ValueBuffer, 
valuesOffset, valuesLength));
             }
 
             public void Visit(BinaryViewArray array)
@@ -263,6 +276,39 @@ namespace Apache.Arrow.Ipc
                 // There are no buffers for a NullArray
             }
 
+            private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer 
valueOffsetsBuffer, int arrayOffset, int arrayLength)
+            {
+                var requiredBytes = CalculatePaddedBufferLength(sizeof(int) * 
(arrayLength + 1));
+
+                if (arrayOffset != 0)
+                {
+                    // Array has been sliced, so we need to shift and adjust 
the offsets
+                    var originalOffsets = 
valueOffsetsBuffer.Span.CastTo<int>().Slice(arrayOffset, arrayLength + 1);
+                    var firstOffset = arrayLength > 0 ? originalOffsets[0] : 0;
+
+                    var newValueOffsetsBuffer = 
_allocator.Allocate(requiredBytes);
+                    var newValueOffsets = 
newValueOffsetsBuffer.Memory.Span.CastTo<int>();
+
+                    for (int i = 0; i < arrayLength + 1; ++i)
+                    {
+                        newValueOffsets[i] = originalOffsets[i] - firstOffset;
+                    }
+
+                    return new ArrowBuffer(newValueOffsetsBuffer);
+                }
+                else if (valueOffsetsBuffer.Length > requiredBytes)
+                {
+                    // Array may have been sliced but the offset is zero,
+                    // so we can truncate the existing offsets
+                    return new ArrowBuffer(valueOffsetsBuffer.Memory.Slice(0, 
requiredBytes));
+                }
+                else
+                {
+                    // Use the full buffer
+                    return valueOffsetsBuffer;
+                }
+            }
+
             private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset, 
int length)
             {
                 if (buffer.IsEmpty)
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs 
b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
index faf650973d..baea4d61e5 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
@@ -113,6 +113,7 @@ namespace Apache.Arrow.Tests
         [InlineData(0, 45)]
         [InlineData(3, 45)]
         [InlineData(16, 45)]
+        [InlineData(10, 0)]
         public async Task WriteSlicedArrays(int sliceOffset, int sliceLength)
         {
             var originalBatch = TestData.CreateSampleRecordBatch(length: 100);
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs 
b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
index 07c8aa3f56..9597219321 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
@@ -160,10 +160,14 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, array.Length);
                 Assert.Equal(expectedArray.NullCount, array.NullCount);
-                Assert.Equal(0, array.Offset);
                 Assert.Equal(expectedArray.Data.Children.Length, 
array.Data.Children.Length);
                 Assert.Equal(expectedArray.Fields.Count, array.Fields.Count);
 
+                if (_strictCompare)
+                {
+                    Assert.Equal(expectedArray.Offset, array.Offset);
+                }
+
                 for (int i = 0; i < array.Fields.Count; i++)
                 {
                     array.Fields[i].Accept(new 
ArrayComparer(expectedArray.Fields[i], _strictCompare));
@@ -178,12 +182,12 @@ namespace Apache.Arrow.Tests
                 Assert.Equal(expectedArray.Mode, array.Mode);
                 Assert.Equal(expectedArray.Length, array.Length);
                 Assert.Equal(expectedArray.NullCount, array.NullCount);
-                Assert.Equal(0, array.Offset);
                 Assert.Equal(expectedArray.Data.Children.Length, 
array.Data.Children.Length);
                 Assert.Equal(expectedArray.Fields.Count, array.Fields.Count);
 
                 if (_strictCompare)
                 {
+                    Assert.Equal(expectedArray.Offset, array.Offset);
                     
Assert.True(expectedArray.TypeBuffer.Span.SequenceEqual(array.TypeBuffer.Span));
                 }
                 else
@@ -252,12 +256,12 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, actualArray.Length);
                 Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
-                Assert.Equal(0, actualArray.Offset);
 
                 CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, 
actualArray.NullBitmapBuffer);
 
                 if (_strictCompare)
                 {
+                    Assert.Equal(expectedArray.Offset, actualArray.Offset);
                     
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
                     Assert.True(expectedArray.Values.Slice(0, 
expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, 
actualArray.Length)));
                 }
@@ -284,7 +288,11 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, actualArray.Length);
                 Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
-                Assert.Equal(0, actualArray.Offset);
+
+                if (_strictCompare)
+                {
+                    Assert.Equal(expectedArray.Offset, actualArray.Offset);
+                }
 
                 CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, 
actualArray.NullBitmapBuffer);
 
@@ -309,12 +317,12 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, actualArray.Length);
                 Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
-                Assert.Equal(0, actualArray.Offset);
 
                 CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, 
actualArray.NullBitmapBuffer);
 
                 if (_strictCompare)
                 {
+                    Assert.Equal(expectedArray.Offset, actualArray.Offset);
                     Assert.True(expectedArray.ValueBuffer.Span.Slice(0, 
expectedArray.Length).SequenceEqual(actualArray.ValueBuffer.Span.Slice(0, 
actualArray.Length)));
                 }
                 else
@@ -338,12 +346,12 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, actualArray.Length);
                 Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
-                Assert.Equal(0, actualArray.Offset);
 
                 CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, 
actualArray.NullBitmapBuffer);
 
                 if (_strictCompare)
                 {
+                    Assert.Equal(expectedArray.Offset, actualArray.Offset);
                     Assert.True(expectedArray.Values.Slice(0, 
expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0, 
actualArray.Length)));
                 }
                 else
@@ -370,12 +378,12 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, actualArray.Length);
                 Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
-                Assert.Equal(0, actualArray.Offset);
 
                 CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, 
actualArray.NullBitmapBuffer);
 
                 if (_strictCompare)
                 {
+                    Assert.Equal(expectedArray.Offset, actualArray.Offset);
                     int booleanByteCount = 
BitUtility.ByteCount(expectedArray.Length);
                     Assert.True(expectedArray.Values.Slice(0, 
booleanByteCount).SequenceEqual(actualArray.Values.Slice(0, booleanByteCount)));
                 }
@@ -397,22 +405,31 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, actualArray.Length);
                 Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
-                Assert.Equal(0, actualArray.Offset);
 
                 CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, 
actualArray.NullBitmapBuffer);
 
                 if (_strictCompare)
                 {
+                    Assert.Equal(expectedArray.Offset, actualArray.Offset);
                     
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
+                    actualArray.Values.Accept(new 
ArrayComparer(expectedArray.Values, _strictCompare));
                 }
                 else
                 {
-                    int offsetsStart = (expectedArray.Offset) * sizeof(int);
-                    int offsetsLength = (expectedArray.Length + 1) * 
sizeof(int);
-                    
Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(offsetsStart, 
offsetsLength).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, 
offsetsLength)));
+                    for (int i = 0; i < actualArray.Length; ++i)
+                    {
+                        if (expectedArray.IsNull(i))
+                        {
+                            Assert.True(actualArray.IsNull(i));
+                        }
+                        else
+                        {
+                            var expectedList = 
expectedArray.GetSlicedValues(i);
+                            var actualList = actualArray.GetSlicedValues(i);
+                            actualList.Accept(new ArrayComparer(expectedList, 
_strictCompare));
+                        }
+                    }
                 }
-
-                actualArray.Values.Accept(new 
ArrayComparer(expectedArray.Values, _strictCompare));
             }
 
             private void CompareArrays(ListViewArray actualArray)
@@ -424,12 +441,12 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, actualArray.Length);
                 Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
-                Assert.Equal(0, actualArray.Offset);
 
                 CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, 
actualArray.NullBitmapBuffer);
 
                 if (_strictCompare)
                 {
+                    Assert.Equal(expectedArray.Offset, actualArray.Offset);
                     
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
                     
Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span));
                 }
@@ -453,7 +470,10 @@ namespace Apache.Arrow.Tests
 
                 Assert.Equal(expectedArray.Length, actualArray.Length);
                 Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
-                Assert.Equal(0, actualArray.Offset);
+                if (_strictCompare)
+                {
+                    Assert.Equal(expectedArray.Offset, actualArray.Offset);
+                }
 
                 CompareValidityBuffer(expectedArray.NullCount, 
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset, 
actualArray.NullBitmapBuffer);
 
diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs 
b/csharp/test/Apache.Arrow.Tests/TestData.cs
index 29ddef2864..3ea42ee0fb 100644
--- a/csharp/test/Apache.Arrow.Tests/TestData.cs
+++ b/csharp/test/Apache.Arrow.Tests/TestData.cs
@@ -294,7 +294,18 @@ namespace Apache.Arrow.Tests
 
                 for (var i = 0; i < Length; i++)
                 {
-                    builder.Append(str);
+                    switch (i % 3)
+                    {
+                        case 0:
+                            builder.AppendNull();
+                            break;
+                        case 1:
+                            builder.Append(str);
+                            break;
+                        case 2:
+                            builder.Append(str + str);
+                            break;
+                    }
                 }
 
                 Array = builder.Build();
@@ -328,15 +339,21 @@ namespace Apache.Arrow.Tests
             {
                 var builder = new 
ListArray.Builder(type.ValueField).Reserve(Length);
 
-                var valueBuilder = 
(Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1);
+                var valueBuilder = 
(Int64Array.Builder)builder.ValueBuilder.Reserve(Length * 3 / 2);
 
                 for (var i = 0; i < Length; i++)
                 {
-                    builder.Append();
-                    valueBuilder.Append(i);
+                    if (i % 10 == 2)
+                    {
+                        builder.AppendNull();
+                    }
+                    else
+                    {
+                        builder.Append();
+                        var listLength = i % 4;
+                        valueBuilder.AppendRange(Enumerable.Range(i, 
listLength).Select(x => (long)x));
+                    }
                 }
-                //Add a value to check if Values.Length can exceed 
ListArray.Length
-                valueBuilder.Append(0);
 
                 Array = builder.Build();
             }
@@ -352,8 +369,12 @@ namespace Apache.Arrow.Tests
                     builder.Append();
                     valueBuilder.Append(i);
                 }
-                //Add a value to check if Values.Length can exceed 
ListArray.Length
-                valueBuilder.Append(0);
+
+                if (Length > 0)
+                {
+                    // Add a value to check if Values.Length can exceed 
ListArray.Length
+                    valueBuilder.Append(0);
+                }
 
                 Array = builder.Build();
             }
@@ -562,9 +583,13 @@ namespace Apache.Arrow.Tests
                     keyBuilder.Append(i.ToString());
                     valueBuilder.Append(i);
                 }
-                //Add a value to check if Values.Length can exceed 
MapArray.Length
-                keyBuilder.Append("0");
-                valueBuilder.Append(0);
+
+                if (Length > 0)
+                {
+                    // Add a value to check if Values.Length can exceed 
MapArray.Length
+                    keyBuilder.Append("0");
+                    valueBuilder.Append(0);
+                }
 
                 Array = builder.Build();
             }

Reply via email to