This is an automated email from the ASF dual-hosted git repository.
curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 56186b994d GH-41225: [C#] Slice value buffers when writing sliced list
or binary arrays in IPC format (#41230)
56186b994d is described below
commit 56186b994db3eab8b2684fde9e1726f0b0658ef6
Author: Adam Reeve <[email protected]>
AuthorDate: Wed Apr 17 00:25:49 2024 +1200
GH-41225: [C#] Slice value buffers when writing sliced list or binary
arrays in IPC format (#41230)
### Rationale for this change
This reduces file sizes when writing sliced binary or list arrays to IPC
format.
### What changes are included in this PR?
Changes `ArrowStreamWriter` to write only the subset of the values that is
needed rather than the full value buffer when writing a `ListArray` or
`BinaryArray`, and compute shifted value offset buffers.
### Are these changes tested?
This code is covered by existing tests and the change doesn't introduce any
difference in the observed array values, so I haven't added new tests or checks.
I did change how list arrays are compared though as we can no longer
compare the value and value offset buffers directly, so the tests now get list
items as arrays and create a new `ArrayComparer` to compare them. This meant
that array offsets are no longer always zero, so I've changed the offset
assertions to only be used in strict mode.
### Are there any user-facing changes?
Yes, this might reduce IPC file sizes for users writing sliced data.
* GitHub Issue: #41225
Authored-by: Adam Reeve <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
---
csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 54 ++++++++++++++++++++--
.../Apache.Arrow.Tests/ArrowFileWriterTests.cs | 1 +
.../test/Apache.Arrow.Tests/ArrowReaderVerifier.cs | 50 ++++++++++++++------
csharp/test/Apache.Arrow.Tests/TestData.cs | 47 ++++++++++++++-----
4 files changed, 122 insertions(+), 30 deletions(-)
diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
index 6127c5a662..1b83735925 100644
--- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
+++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
@@ -163,9 +163,18 @@ namespace Apache.Arrow.Ipc
public void Visit(ListArray array)
{
_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer,
array.Offset, array.Length));
- _buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer,
array.Offset, array.Length + 1));
+
_buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer,
array.Offset, array.Length)));
- VisitArray(array.Values);
+ int valuesOffset = array.ValueOffsets[0];
+ int valuesLength = array.ValueOffsets[array.Length] -
valuesOffset;
+
+ var values = array.Values;
+ if (valuesOffset > 0 || valuesLength < values.Length)
+ {
+ values = ArrowArrayFactory.Slice(values, valuesOffset,
valuesLength);
+ }
+
+ VisitArray(values);
}
public void Visit(ListViewArray array)
@@ -195,8 +204,12 @@ namespace Apache.Arrow.Ipc
public void Visit(BinaryArray array)
{
_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer,
array.Offset, array.Length));
- _buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer,
array.Offset, array.Length + 1));
- _buffers.Add(CreateBuffer(array.ValueBuffer));
+
_buffers.Add(CreateBuffer(GetZeroBasedValueOffsets(array.ValueOffsetsBuffer,
array.Offset, array.Length)));
+
+ int valuesOffset = array.ValueOffsets[0];
+ int valuesLength = array.ValueOffsets[array.Length] -
valuesOffset;
+
+ _buffers.Add(CreateSlicedBuffer<byte>(array.ValueBuffer,
valuesOffset, valuesLength));
}
public void Visit(BinaryViewArray array)
@@ -263,6 +276,39 @@ namespace Apache.Arrow.Ipc
// There are no buffers for a NullArray
}
+ private ArrowBuffer GetZeroBasedValueOffsets(ArrowBuffer
valueOffsetsBuffer, int arrayOffset, int arrayLength)
+ {
+ var requiredBytes = CalculatePaddedBufferLength(sizeof(int) *
(arrayLength + 1));
+
+ if (arrayOffset != 0)
+ {
+ // Array has been sliced, so we need to shift and adjust
the offsets
+ var originalOffsets =
valueOffsetsBuffer.Span.CastTo<int>().Slice(arrayOffset, arrayLength + 1);
+ var firstOffset = arrayLength > 0 ? originalOffsets[0] : 0;
+
+ var newValueOffsetsBuffer =
_allocator.Allocate(requiredBytes);
+ var newValueOffsets =
newValueOffsetsBuffer.Memory.Span.CastTo<int>();
+
+ for (int i = 0; i < arrayLength + 1; ++i)
+ {
+ newValueOffsets[i] = originalOffsets[i] - firstOffset;
+ }
+
+ return new ArrowBuffer(newValueOffsetsBuffer);
+ }
+ else if (valueOffsetsBuffer.Length > requiredBytes)
+ {
+ // Array may have been sliced but the offset is zero,
+ // so we can truncate the existing offsets
+ return new ArrowBuffer(valueOffsetsBuffer.Memory.Slice(0,
requiredBytes));
+ }
+ else
+ {
+ // Use the full buffer
+ return valueOffsetsBuffer;
+ }
+ }
+
private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset,
int length)
{
if (buffer.IsEmpty)
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
index faf650973d..baea4d61e5 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
@@ -113,6 +113,7 @@ namespace Apache.Arrow.Tests
[InlineData(0, 45)]
[InlineData(3, 45)]
[InlineData(16, 45)]
+ [InlineData(10, 0)]
public async Task WriteSlicedArrays(int sliceOffset, int sliceLength)
{
var originalBatch = TestData.CreateSampleRecordBatch(length: 100);
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
index 07c8aa3f56..9597219321 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
@@ -160,10 +160,14 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, array.Length);
Assert.Equal(expectedArray.NullCount, array.NullCount);
- Assert.Equal(0, array.Offset);
Assert.Equal(expectedArray.Data.Children.Length,
array.Data.Children.Length);
Assert.Equal(expectedArray.Fields.Count, array.Fields.Count);
+ if (_strictCompare)
+ {
+ Assert.Equal(expectedArray.Offset, array.Offset);
+ }
+
for (int i = 0; i < array.Fields.Count; i++)
{
array.Fields[i].Accept(new
ArrayComparer(expectedArray.Fields[i], _strictCompare));
@@ -178,12 +182,12 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Mode, array.Mode);
Assert.Equal(expectedArray.Length, array.Length);
Assert.Equal(expectedArray.NullCount, array.NullCount);
- Assert.Equal(0, array.Offset);
Assert.Equal(expectedArray.Data.Children.Length,
array.Data.Children.Length);
Assert.Equal(expectedArray.Fields.Count, array.Fields.Count);
if (_strictCompare)
{
+ Assert.Equal(expectedArray.Offset, array.Offset);
Assert.True(expectedArray.TypeBuffer.Span.SequenceEqual(array.TypeBuffer.Span));
}
else
@@ -252,12 +256,12 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
- Assert.Equal(0, actualArray.Offset);
CompareValidityBuffer(expectedArray.NullCount,
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset,
actualArray.NullBitmapBuffer);
if (_strictCompare)
{
+ Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
Assert.True(expectedArray.Values.Slice(0,
expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0,
actualArray.Length)));
}
@@ -284,7 +288,11 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
- Assert.Equal(0, actualArray.Offset);
+
+ if (_strictCompare)
+ {
+ Assert.Equal(expectedArray.Offset, actualArray.Offset);
+ }
CompareValidityBuffer(expectedArray.NullCount,
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset,
actualArray.NullBitmapBuffer);
@@ -309,12 +317,12 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
- Assert.Equal(0, actualArray.Offset);
CompareValidityBuffer(expectedArray.NullCount,
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset,
actualArray.NullBitmapBuffer);
if (_strictCompare)
{
+ Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueBuffer.Span.Slice(0,
expectedArray.Length).SequenceEqual(actualArray.ValueBuffer.Span.Slice(0,
actualArray.Length)));
}
else
@@ -338,12 +346,12 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
- Assert.Equal(0, actualArray.Offset);
CompareValidityBuffer(expectedArray.NullCount,
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset,
actualArray.NullBitmapBuffer);
if (_strictCompare)
{
+ Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.Values.Slice(0,
expectedArray.Length).SequenceEqual(actualArray.Values.Slice(0,
actualArray.Length)));
}
else
@@ -370,12 +378,12 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
- Assert.Equal(0, actualArray.Offset);
CompareValidityBuffer(expectedArray.NullCount,
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset,
actualArray.NullBitmapBuffer);
if (_strictCompare)
{
+ Assert.Equal(expectedArray.Offset, actualArray.Offset);
int booleanByteCount =
BitUtility.ByteCount(expectedArray.Length);
Assert.True(expectedArray.Values.Slice(0,
booleanByteCount).SequenceEqual(actualArray.Values.Slice(0, booleanByteCount)));
}
@@ -397,22 +405,31 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
- Assert.Equal(0, actualArray.Offset);
CompareValidityBuffer(expectedArray.NullCount,
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset,
actualArray.NullBitmapBuffer);
if (_strictCompare)
{
+ Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
+ actualArray.Values.Accept(new
ArrayComparer(expectedArray.Values, _strictCompare));
}
else
{
- int offsetsStart = (expectedArray.Offset) * sizeof(int);
- int offsetsLength = (expectedArray.Length + 1) *
sizeof(int);
-
Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(offsetsStart,
offsetsLength).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0,
offsetsLength)));
+ for (int i = 0; i < actualArray.Length; ++i)
+ {
+ if (expectedArray.IsNull(i))
+ {
+ Assert.True(actualArray.IsNull(i));
+ }
+ else
+ {
+ var expectedList =
expectedArray.GetSlicedValues(i);
+ var actualList = actualArray.GetSlicedValues(i);
+ actualList.Accept(new ArrayComparer(expectedList,
_strictCompare));
+ }
+ }
}
-
- actualArray.Values.Accept(new
ArrayComparer(expectedArray.Values, _strictCompare));
}
private void CompareArrays(ListViewArray actualArray)
@@ -424,12 +441,12 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
- Assert.Equal(0, actualArray.Offset);
CompareValidityBuffer(expectedArray.NullCount,
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset,
actualArray.NullBitmapBuffer);
if (_strictCompare)
{
+ Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span));
}
@@ -453,7 +470,10 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Length, actualArray.Length);
Assert.Equal(expectedArray.NullCount, actualArray.NullCount);
- Assert.Equal(0, actualArray.Offset);
+ if (_strictCompare)
+ {
+ Assert.Equal(expectedArray.Offset, actualArray.Offset);
+ }
CompareValidityBuffer(expectedArray.NullCount,
_expectedArray.Length, expectedArray.NullBitmapBuffer, expectedArray.Offset,
actualArray.NullBitmapBuffer);
diff --git a/csharp/test/Apache.Arrow.Tests/TestData.cs
b/csharp/test/Apache.Arrow.Tests/TestData.cs
index 29ddef2864..3ea42ee0fb 100644
--- a/csharp/test/Apache.Arrow.Tests/TestData.cs
+++ b/csharp/test/Apache.Arrow.Tests/TestData.cs
@@ -294,7 +294,18 @@ namespace Apache.Arrow.Tests
for (var i = 0; i < Length; i++)
{
- builder.Append(str);
+ switch (i % 3)
+ {
+ case 0:
+ builder.AppendNull();
+ break;
+ case 1:
+ builder.Append(str);
+ break;
+ case 2:
+ builder.Append(str + str);
+ break;
+ }
}
Array = builder.Build();
@@ -328,15 +339,21 @@ namespace Apache.Arrow.Tests
{
var builder = new
ListArray.Builder(type.ValueField).Reserve(Length);
- var valueBuilder =
(Int64Array.Builder)builder.ValueBuilder.Reserve(Length + 1);
+ var valueBuilder =
(Int64Array.Builder)builder.ValueBuilder.Reserve(Length * 3 / 2);
for (var i = 0; i < Length; i++)
{
- builder.Append();
- valueBuilder.Append(i);
+ if (i % 10 == 2)
+ {
+ builder.AppendNull();
+ }
+ else
+ {
+ builder.Append();
+ var listLength = i % 4;
+ valueBuilder.AppendRange(Enumerable.Range(i,
listLength).Select(x => (long)x));
+ }
}
- //Add a value to check if Values.Length can exceed
ListArray.Length
- valueBuilder.Append(0);
Array = builder.Build();
}
@@ -352,8 +369,12 @@ namespace Apache.Arrow.Tests
builder.Append();
valueBuilder.Append(i);
}
- //Add a value to check if Values.Length can exceed
ListArray.Length
- valueBuilder.Append(0);
+
+ if (Length > 0)
+ {
+ // Add a value to check if Values.Length can exceed
ListArray.Length
+ valueBuilder.Append(0);
+ }
Array = builder.Build();
}
@@ -562,9 +583,13 @@ namespace Apache.Arrow.Tests
keyBuilder.Append(i.ToString());
valueBuilder.Append(i);
}
- //Add a value to check if Values.Length can exceed
MapArray.Length
- keyBuilder.Append("0");
- valueBuilder.Append(0);
+
+ if (Length > 0)
+ {
+ // Add a value to check if Values.Length can exceed
MapArray.Length
+ keyBuilder.Append("0");
+ valueBuilder.Append(0);
+ }
Array = builder.Build();
}