This is an automated email from the ASF dual-hosted git repository.
curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2d977e4085 GH-41231: [C#] Slice values array when writing a sliced
list view array to IPC format (#41255)
2d977e4085 is described below
commit 2d977e408519d524a259246487a4c6b0f355ae21
Author: Adam Reeve <[email protected]>
AuthorDate: Fri Apr 19 15:00:15 2024 +1200
GH-41231: [C#] Slice values array when writing a sliced list view array to
IPC format (#41255)
### Rationale for this change
Reduces IPC file sizes when writing sliced list view arrays.
### What changes are included in this PR?
Updates `ArrowSreamWriter` so it only writes the required range of values
for a list view array, and adjusts the offset values accordingly.
### Are these changes tested?
Yes, this is covered by existing tests and I've also added a new test to
verify the behaviour with list view arrays that have unordered offsets.
### Are there any user-facing changes?
Yes, this might reduce IPC file sizes for users writing sliced data.
* GitHub Issue: #41231
Lead-authored-by: Adam Reeve <[email protected]>
Co-authored-by: Curt Hagenlocher <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
---
csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 64 +++++++++++++++++-
.../Apache.Arrow.Tests/ArrowFileWriterTests.cs | 77 ++++++++++++++++++++++
.../test/Apache.Arrow.Tests/ArrowReaderVerifier.cs | 20 ++++--
3 files changed, 153 insertions(+), 8 deletions(-)
diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
index a7e4c13525..b11479c0d4 100644
--- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
+++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
@@ -184,11 +184,19 @@ namespace Apache.Arrow.Ipc
public void Visit(ListViewArray array)
{
+ var (valueOffsetsBuffer, minOffset, maxEnd) =
GetZeroBasedListViewOffsets(array);
+
_buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer,
array.Offset, array.Length));
- _buffers.Add(CreateSlicedBuffer<int>(array.ValueOffsetsBuffer,
array.Offset, array.Length));
+ _buffers.Add(CreateBuffer(valueOffsetsBuffer));
_buffers.Add(CreateSlicedBuffer<int>(array.SizesBuffer,
array.Offset, array.Length));
- VisitArray(array.Values);
+ IArrowArray values = array.Values;
+ if (minOffset != 0 || values.Length != maxEnd)
+ {
+ values = ArrowArrayFactory.Slice(values, minOffset, maxEnd
- minOffset);
+ }
+
+ VisitArray(values);
}
public void Visit(FixedSizeListArray array)
@@ -319,6 +327,58 @@ namespace Apache.Arrow.Ipc
}
}
+ private (ArrowBuffer Buffer, int minOffset, int maxEnd)
GetZeroBasedListViewOffsets(ListViewArray array)
+ {
+ if (array.Length == 0)
+ {
+ return (ArrowBuffer.Empty, 0, 0);
+ }
+
+ var offsets = array.ValueOffsets;
+ var sizes = array.Sizes;
+
+ int minOffset = offsets[0];
+ int maxEnd = offsets[array.Length - 1] + sizes[array.Length -
1];
+
+ // Min possible offset is zero, and max possible end is the
values length.
+ // If these match the first offset and last end we don't need
to do anything further,
+ // but otherwise we need to iterate over each index in case
the offsets aren't ordered.
+ if (minOffset != 0 || maxEnd != array.Values.Length)
+ {
+ for (int i = 0; i < array.Length; ++i)
+ {
+ minOffset = Math.Min(minOffset, offsets[i]);
+ maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
+ }
+ }
+
+ var requiredBytes = CalculatePaddedBufferLength(sizeof(int) *
array.Length);
+
+ if (minOffset == 0)
+ {
+ // No need to adjust the offsets, but we may need to slice
the offsets buffer.
+ ArrowBuffer buffer = array.ValueOffsetsBuffer;
+ if (array.Offset != 0 || buffer.Length > requiredBytes)
+ {
+ var byteOffset = sizeof(int) * array.Offset;
+ var sliceLength = Math.Min(requiredBytes,
buffer.Length - byteOffset);
+ buffer = new
ArrowBuffer(buffer.Memory.Slice(byteOffset, sliceLength));
+ }
+
+ return (buffer, minOffset, maxEnd);
+ }
+
+ // Compute shifted offsets
+ var newOffsetsBuffer = _allocator.Allocate(requiredBytes);
+ var newOffsets = newOffsetsBuffer.Memory.Span.CastTo<int>();
+ for (int i = 0; i < array.Length; ++i)
+ {
+ newOffsets[i] = offsets[i] - minOffset;
+ }
+
+ return (new ArrowBuffer(newOffsetsBuffer), minOffset, maxEnd);
+ }
+
private Buffer CreateBitmapBuffer(ArrowBuffer buffer, int offset,
int length)
{
if (buffer.IsEmpty)
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
index 297cb5e181..7b25910114 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowFileWriterTests.cs
@@ -135,6 +135,71 @@ namespace Apache.Arrow.Tests
await ValidateRecordBatchFile(stream, slicedBatch, strictCompare:
false);
}
+ [Theory]
+ [InlineData(0, 100)]
+ [InlineData(0, 50)]
+ [InlineData(50, 50)]
+ [InlineData(25, 50)]
+ public async Task WriteListViewDataWithUnorderedOffsets(int
sliceOffset, int sliceLength)
+ {
+ // A list-view array doesn't require that offsets are ordered,
+ // so verify that we can round trip a list-view array with
out-of-order offsets.
+ const int length = 100;
+ var random = new Random();
+
+ var randomizedIndices = Enumerable.Range(0, length).ToArray();
+ Shuffle(randomizedIndices, random);
+
+ var offsetsBuilder = new ArrowBuffer.Builder<int>().Resize(length);
+ var sizesBuilder = new ArrowBuffer.Builder<int>().Resize(length);
+ var validityBuilder = new
ArrowBuffer.BitmapBuilder().Reserve(length);
+
+ var valuesLength = 0;
+ for (int i = 0; i < length; ++i)
+ {
+ var index = randomizedIndices[i];
+ var listLength = random.Next(0, 10);
+ offsetsBuilder.Span[index] = valuesLength;
+ sizesBuilder.Span[index] = listLength;
+ valuesLength += listLength;
+
+ validityBuilder.Append(random.NextDouble() < 0.9);
+ }
+
+ var valuesBuilder = new Int64Array.Builder().Reserve(valuesLength);
+ for (int i = 0; i < valuesLength; ++i)
+ {
+ valuesBuilder.Append(random.Next(0, 1_000));
+ }
+
+ var type = new ListViewType(new Int64Type());
+ var offsets = offsetsBuilder.Build();
+ var sizes = sizesBuilder.Build();
+ var values = valuesBuilder.Build();
+ var nullCount = validityBuilder.UnsetBitCount;
+ var validityBuffer = validityBuilder.Build();
+
+ IArrowArray listViewArray = new ListViewArray(
+ type, length, offsets, sizes, values, validityBuffer,
nullCount);
+
+ if (sliceOffset != 0 || sliceLength != length)
+ {
+ listViewArray = ArrowArrayFactory.Slice(listViewArray,
sliceOffset, sliceLength);
+ }
+
+ var recordBatch = new RecordBatch.Builder().Append("x", true,
listViewArray).Build();
+
+ var stream = new MemoryStream();
+ var writer = new ArrowFileWriter(stream, recordBatch.Schema,
leaveOpen: true);
+
+ await writer.WriteRecordBatchAsync(recordBatch);
+ await writer.WriteEndAsync();
+
+ stream.Position = 0;
+
+ await ValidateRecordBatchFile(stream, recordBatch, strictCompare:
false);
+ }
+
private async Task ValidateRecordBatchFile(Stream stream, RecordBatch
recordBatch, bool strictCompare = true)
{
var reader = new ArrowFileReader(stream);
@@ -245,5 +310,17 @@ namespace Apache.Arrow.Tests
await ValidateRecordBatchFile(stream, recordBatch, strictCompare:
false);
}
+
+ private static void Shuffle(int[] values, Random random)
+ {
+ var length = values.Length;
+ for (int i = 0; i < length - 1; ++i)
+ {
+ var j = random.Next(i, length);
+ var tmp = values[i];
+ values[i] = values[j];
+ values[j] = tmp;
+ }
+ }
}
}
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
index f656979c22..360c3d7129 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
@@ -449,16 +449,24 @@ namespace Apache.Arrow.Tests
Assert.Equal(expectedArray.Offset, actualArray.Offset);
Assert.True(expectedArray.ValueOffsetsBuffer.Span.SequenceEqual(actualArray.ValueOffsetsBuffer.Span));
Assert.True(expectedArray.SizesBuffer.Span.SequenceEqual(actualArray.SizesBuffer.Span));
+ actualArray.Values.Accept(new
ArrayComparer(expectedArray.Values, _strictCompare));
}
else
{
- int start = expectedArray.Offset * sizeof(int);
- int length = expectedArray.Length * sizeof(int);
-
Assert.True(expectedArray.ValueOffsetsBuffer.Span.Slice(start,
length).SequenceEqual(actualArray.ValueOffsetsBuffer.Span.Slice(0, length)));
- Assert.True(expectedArray.SizesBuffer.Span.Slice(start,
length).SequenceEqual(actualArray.SizesBuffer.Span.Slice(0, length)));
+ for (int i = 0; i < actualArray.Length; ++i)
+ {
+ if (expectedArray.IsNull(i))
+ {
+ Assert.True(actualArray.IsNull(i));
+ }
+ else
+ {
+ var expectedList =
expectedArray.GetSlicedValues(i);
+ var actualList = actualArray.GetSlicedValues(i);
+ actualList.Accept(new ArrayComparer(expectedList,
_strictCompare));
+ }
+ }
}
-
- actualArray.Values.Accept(new
ArrayComparer(expectedArray.Values, _strictCompare));
}
private void CompareArrays(FixedSizeListArray actualArray)