This is an automated email from the ASF dual-hosted git repository.
curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 9ba789d111 GH-43267: [C#] Correctly import sliced arrays through the C
Data interface (#44117)
9ba789d111 is described below
commit 9ba789d1115494b00e6772a3170c8ba2f1a9a02c
Author: Curt Hagenlocher <[email protected]>
AuthorDate: Sun Sep 15 19:20:59 2024 -0700
GH-43267: [C#] Correctly import sliced arrays through the C Data interface
(#44117)
### What changes are included in this PR?
Changes to the C Data importer to correctly handle nonzero offsets.
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
Closes #43267
* GitHub Issue: #43267
Authored-by: Curt Hagenlocher <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
---
csharp/src/Apache.Arrow/Apache.Arrow.csproj | 6 +--
csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs | 22 +++++------
csharp/src/Apache.Arrow/RecordBatch.cs | 11 ++++++
csharp/src/Apache.Arrow/Utility.cs | 2 -
.../test/Apache.Arrow.Tests/ArrowReaderVerifier.cs | 4 +-
.../Apache.Arrow.Tests/CDataInterfaceDataTests.cs | 18 +++++++++
.../CDataInterfacePythonTests.cs | 43 ++++++++++++++++++++++
7 files changed, 88 insertions(+), 18 deletions(-)
diff --git a/csharp/src/Apache.Arrow/Apache.Arrow.csproj
b/csharp/src/Apache.Arrow/Apache.Arrow.csproj
index 034876a114..a845f8e693 100644
--- a/csharp/src/Apache.Arrow/Apache.Arrow.csproj
+++ b/csharp/src/Apache.Arrow/Apache.Arrow.csproj
@@ -7,18 +7,16 @@
<Description>Apache Arrow is a cross-language development platform for
in-memory data. It specifies a standardized language-independent columnar
memory format for flat and hierarchical data, organized for efficient analytic
operations on modern hardware.</Description>
</PropertyGroup>
- <PropertyGroup Condition="'$(IsWindows)'=='true'">
+ <PropertyGroup>
<TargetFrameworks>netstandard2.0;net6.0;net8.0;net462</TargetFrameworks>
</PropertyGroup>
- <PropertyGroup Condition="'$(IsWindows)'!='true'">
- <TargetFrameworks>netstandard2.0;net6.0;net8.0</TargetFrameworks>
- </PropertyGroup>
<ItemGroup Condition="'$(TargetFrameworkIdentifier)' == '.NETStandard' or
'$(TargetFramework)' == 'net462'">
<PackageReference Include="System.Buffers" Version="4.5.1" />
<PackageReference Include="System.Memory" Version="4.5.5" />
<PackageReference Include="System.Runtime.CompilerServices.Unsafe"
Version="4.7.1" />
<PackageReference Include="System.Threading.Tasks.Extensions"
Version="4.5.4" />
+ <PackageReference Include="System.ValueTuple" Version="4.5.0" />
</ItemGroup>
<ItemGroup>
diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs
b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs
index 68b67f3d7c..c454380e17 100644
--- a/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs
+++ b/csharp/src/Apache.Arrow/C/CArrowArrayImporter.cs
@@ -260,7 +260,7 @@ namespace Apache.Arrow.C
private ArrowBuffer ImportValidityBuffer(CArrowArray* cArray)
{
- int length = checked((int)cArray->length);
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int validityLength =
checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8);
return (cArray->buffers[0] == null) ? ArrowBuffer.Empty : new
ArrowBuffer(AddMemory((IntPtr)cArray->buffers[0], 0, validityLength));
}
@@ -285,7 +285,7 @@ namespace Apache.Arrow.C
throw new InvalidOperationException("Byte arrays are
expected to have exactly three buffers");
}
- int length = checked((int)cArray->length);
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int offsetsLength = (length + 1) * 4;
int* offsets = (int*)cArray->buffers[1];
Debug.Assert(offsets != null);
@@ -306,7 +306,7 @@ namespace Apache.Arrow.C
throw new InvalidOperationException("Byte array views are
expected to have at least three buffers");
}
- int length = checked((int)cArray->length);
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int viewsLength = length * 16;
long* bufferLengths = (long*)cArray->buffers[cArray->n_buffers
- 1];
@@ -336,7 +336,7 @@ namespace Apache.Arrow.C
$"is greater than the maximum supported large byte
array length ({maxLength})");
}
- int length = (int)cArray->length;
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int offsetsLength = (length + 1) * 8;
long* offsets = (long*)cArray->buffers[1];
Debug.Assert(offsets != null);
@@ -364,7 +364,7 @@ namespace Apache.Arrow.C
throw new InvalidOperationException("List arrays are
expected to have exactly two buffers");
}
- int length = checked((int)cArray->length);
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int offsetsLength = (length + 1) * 4;
ArrowBuffer[] buffers = new ArrowBuffer[2];
@@ -381,7 +381,7 @@ namespace Apache.Arrow.C
throw new InvalidOperationException("List view arrays are
expected to have exactly three buffers");
}
- int length = checked((int)cArray->length);
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int offsetsLength = length * 4;
ArrowBuffer[] buffers = new ArrowBuffer[3];
@@ -407,7 +407,7 @@ namespace Apache.Arrow.C
$"is greater than the maximum supported large list
array length ({maxLength})");
}
- int length = (int)cArray->length;
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int offsetsLength = (length + 1) * 8;
ArrowBuffer[] buffers = new ArrowBuffer[2];
@@ -436,7 +436,7 @@ namespace Apache.Arrow.C
{
throw new InvalidOperationException("Dense union arrays
are expected to have exactly two children");
}
- int length = checked((int)cArray->length);
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int offsetsLength = length * 4;
ArrowBuffer[] buffers = new ArrowBuffer[2];
@@ -454,7 +454,7 @@ namespace Apache.Arrow.C
}
ArrowBuffer[] buffers = new ArrowBuffer[1];
- buffers[0] = ImportCArrayBuffer(cArray, 0,
checked((int)cArray->length));
+ buffers[0] = ImportCArrayBuffer(cArray, 0,
checked((int)cArray->offset + (int)cArray->length));
return buffers;
}
@@ -467,10 +467,10 @@ namespace Apache.Arrow.C
}
// validity, data
- int length = checked((int)cArray->length);
+ int length = checked((int)cArray->offset +
(int)cArray->length);
int valuesLength;
if (bitWidth >= 8)
- valuesLength = checked((int)(cArray->length * bitWidth /
8));
+ valuesLength = checked(length * bitWidth / 8);
else
valuesLength =
checked((int)BitUtility.RoundUpToMultipleOf8(length) / 8);
diff --git a/csharp/src/Apache.Arrow/RecordBatch.cs
b/csharp/src/Apache.Arrow/RecordBatch.cs
index 9cc81b1648..4067ba9ac6 100644
--- a/csharp/src/Apache.Arrow/RecordBatch.cs
+++ b/csharp/src/Apache.Arrow/RecordBatch.cs
@@ -100,6 +100,17 @@ namespace Apache.Arrow
return new RecordBatch(Schema, arrays, Length);
}
+ public RecordBatch Slice(int offset, int length)
+ {
+ if (offset > Length)
+ {
+ throw new ArgumentException($"Offset {offset} cannot be
greater than Length {Length} for RecordBatch.Slice");
+ }
+
+ length = Math.Min(Length - offset, length);
+ return new RecordBatch(Schema, _arrays.Select(a =>
ArrowArrayFactory.Slice(a, offset, length)), length);
+ }
+
public void Accept(IArrowArrayVisitor visitor)
{
switch (visitor)
diff --git a/csharp/src/Apache.Arrow/Utility.cs
b/csharp/src/Apache.Arrow/Utility.cs
index c4e5732e6e..22b3ff15f1 100644
--- a/csharp/src/Apache.Arrow/Utility.cs
+++ b/csharp/src/Apache.Arrow/Utility.cs
@@ -13,10 +13,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-using Apache.Arrow.Flatbuf;
using System;
using System.Collections.Generic;
-using System.Text;
namespace Apache.Arrow
{
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
index 85f7b75f93..35b2c4e7f2 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowReaderVerifier.cs
@@ -566,7 +566,9 @@ namespace Apache.Arrow.Tests
var listSize =
((FixedSizeListType)expectedArray.Data.DataType).ListSize;
var expectedValuesSlice = ArrowArrayFactory.Slice(
expectedArray.Values, expectedArray.Offset * listSize,
expectedArray.Length * listSize);
- actualArray.Values.Accept(new
ArrayComparer(expectedValuesSlice, _strictCompare));
+ var actualValuesSlice = ArrowArrayFactory.Slice(
+ actualArray.Values, actualArray.Offset * listSize,
actualArray.Length * listSize);
+ actualValuesSlice.Accept(new
ArrayComparer(expectedValuesSlice, _strictCompare));
}
private void CompareValidityBuffer(int nullCount, int arrayLength,
ArrowBuffer expectedValidityBuffer, int expectedBufferOffset, ArrowBuffer
actualValidityBuffer, int actualBufferOffset)
diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs
b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs
index 2bd4d4d661..70ab1fdae2 100644
--- a/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/CDataInterfaceDataTests.cs
@@ -92,5 +92,23 @@ namespace Apache.Arrow.Tests
GC.KeepAlive(releaseCallback);
}
#endif
+
+ [Fact]
+ public unsafe void RoundTripInt32ArrayWithOffset()
+ {
+ Int32Array array = new Int32Array.Builder()
+ .AppendRange(new[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 })
+ .Build();
+ IArrowArray sliced = array.Slice(2, 6);
+ CArrowArray* cArray = CArrowArray.Create();
+ CArrowArrayExporter.ExportArray(sliced, cArray);
+ using (var importedSlice =
(Int32Array)CArrowArrayImporter.ImportArray(cArray, array.Data.DataType))
+ {
+ Assert.Equal(6, importedSlice.Length);
+ Assert.Equal(2, importedSlice.Offset);
+ Assert.Equal(2, importedSlice.GetValue(0));
+ }
+ CArrowArray.Free(cArray);
+ }
}
}
diff --git a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
index fee18d165c..638cbfb272 100644
--- a/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/CDataInterfacePythonTests.cs
@@ -792,6 +792,49 @@ namespace Apache.Arrow.Tests
CArrowSchema.Free(cImportSchema);
}
+ [SkippableFact]
+ public unsafe void RoundTripTestSlicedBatch()
+ {
+ // TODO: Enable these once the version of pyarrow referenced
during testing supports them
+ HashSet<ArrowTypeId> unsupported = new HashSet<ArrowTypeId> {
ArrowTypeId.ListView, ArrowTypeId.BinaryView, ArrowTypeId.StringView };
+ RecordBatch batch1 = TestData.CreateSampleRecordBatch(4,
excludedTypes: unsupported);
+ RecordBatch batch1slice = batch1.Slice(1, 2);
+ RecordBatch batch2 = batch1slice.Clone();
+
+ CArrowArray* cExportArray = CArrowArray.Create();
+ CArrowArrayExporter.ExportRecordBatch(batch1slice, cExportArray);
+
+ CArrowSchema* cExportSchema = CArrowSchema.Create();
+ CArrowSchemaExporter.ExportSchema(batch1.Schema, cExportSchema);
+
+ CArrowArray* cImportArray = CArrowArray.Create();
+ CArrowSchema* cImportSchema = CArrowSchema.Create();
+
+ // For Python, we need to provide the pointers
+ long exportArrayPtr = ((IntPtr)cExportArray).ToInt64();
+ long exportSchemaPtr = ((IntPtr)cExportSchema).ToInt64();
+ long importArrayPtr = ((IntPtr)cImportArray).ToInt64();
+ long importSchemaPtr = ((IntPtr)cImportSchema).ToInt64();
+
+ using (Py.GIL())
+ {
+ dynamic pa = Py.Import("pyarrow");
+ dynamic exportedPyArray =
pa.RecordBatch._import_from_c(exportArrayPtr, exportSchemaPtr);
+ exportedPyArray._export_to_c(importArrayPtr, importSchemaPtr);
+ }
+
+ Schema schema = CArrowSchemaImporter.ImportSchema(cImportSchema);
+ RecordBatch importedBatch =
CArrowArrayImporter.ImportRecordBatch(cImportArray, schema);
+
+ ArrowReaderVerifier.CompareBatches(batch2, importedBatch,
strictCompare: false); // Non-strict because span lengths won't match.
+
+ // Since we allocated, we are responsible for freeing the pointer.
+ CArrowArray.Free(cExportArray);
+ CArrowSchema.Free(cExportSchema);
+ CArrowArray.Free(cImportArray);
+ CArrowSchema.Free(cImportSchema);
+ }
+
[SkippableFact]
public unsafe void ExportBatchReader()
{