This is an automated email from the ASF dual-hosted git repository. raulcd pushed a commit to branch maint-16.x.x in repository https://gitbox.apache.org/repos/asf/arrow.git
commit dcfeceb93f29b343cedbee4a2f93cbdbc4eed8e7 Author: Adam Reeve <[email protected]> AuthorDate: Sat Apr 13 01:04:36 2024 +1200 GH-41140: [C#] Account for offset and length in union arrays (#41165) ### Rationale for this change See #41140. This makes a sliced union array behave as expected without having to manually account for the array offset unless accessing the underlying buffers. ### What changes are included in this PR? Accounts for the offset and length when getting type ids, value offsets and field arrays for sparse and dense union arrays. ### Are these changes tested? Yes, I've updated the union array tests to cover this. ### Are there any user-facing changes? Yes, this is a user facing bug fix. * GitHub Issue: #41140 Authored-by: Adam Reeve <[email protected]> Signed-off-by: Curt Hagenlocher <[email protected]> --- csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs | 3 +- csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs | 1 - csharp/src/Apache.Arrow/Arrays/UnionArray.cs | 13 +++- csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs | 87 ++++++++++++++++++---- 4 files changed, 85 insertions(+), 19 deletions(-) diff --git a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs index 459a30e221..79880c894b 100644 --- a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs @@ -24,7 +24,7 @@ namespace Apache.Arrow { public ArrowBuffer ValueOffsetBuffer => Data.Buffers[1]; - public ReadOnlySpan<int> ValueOffsets => ValueOffsetBuffer.Span.CastTo<int>(); + public ReadOnlySpan<int> ValueOffsets => ValueOffsetBuffer.Span.CastTo<int>().Slice(Offset, Length); public DenseUnionArray( IArrowType dataType, @@ -38,7 +38,6 @@ namespace Apache.Arrow dataType, length, nullCount, offset, new[] { typeIds, valuesOffsetBuffer }, children.Select(child => child.Data))) { - _fields = children.ToArray(); ValidateMode(UnionMode.Dense, Type.Mode); } diff --git a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs index ef55786f01..5b29489ebb 100644 --- a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs @@ -32,7 +32,6 @@ namespace Apache.Arrow dataType, length, nullCount, offset, new[] { typeIds }, children.Select(child => child.Data))) { - _fields = children.ToArray(); ValidateMode(UnionMode.Sparse, Type.Mode); } diff --git a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs index f96f527135..c1deb9b651 100644 --- a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs @@ -25,7 +25,7 @@ namespace Apache.Arrow protected IReadOnlyList<IArrowArray> _fields; public IReadOnlyList<IArrowArray> Fields => - LazyInitializer.EnsureInitialized(ref _fields, () => InitializeFields()); + LazyInitializer.EnsureInitialized(ref _fields, InitializeFields); public ArrayData Data { get; } @@ -35,7 +35,7 @@ namespace Apache.Arrow public ArrowBuffer TypeBuffer => Data.Buffers[0]; - public ReadOnlySpan<byte> TypeIds => TypeBuffer.Span; + public ReadOnlySpan<byte> TypeIds => TypeBuffer.Span.Slice(Offset, Length); public int Length => Data.Length; @@ -106,7 +106,14 @@ namespace Apache.Arrow IArrowArray[] result = new IArrowArray[Data.Children.Length]; for (int i = 0; i < Data.Children.Length; i++) { - result[i] = ArrowArrayFactory.BuildArray(Data.Children[i]); + var childData = Data.Children[i]; + if (Mode == UnionMode.Sparse && (Data.Offset != 0 || childData.Length != Data.Length)) + { + // We only slice the child data for sparse mode, + // so that the sliced value offsets remain valid in dense mode + childData = childData.Slice(Data.Offset, Data.Length); + } + result[i] = ArrowArrayFactory.BuildArray(childData); } return result; } diff --git a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs index 45fed722a7..712a87a252 100644 --- a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +using System; using System.Linq; using Apache.Arrow.Types; using Xunit; @@ -24,7 +25,7 @@ public class UnionArrayTests [Theory] [InlineData(UnionMode.Sparse)] [InlineData(UnionMode.Dense)] - public void UnionArray_IsNull(UnionMode mode) + public void UnionArrayIsNull(UnionMode mode) { var (array, expectedNull) = BuildUnionArray(mode, 100); @@ -38,7 +39,7 @@ public class UnionArrayTests [Theory] [InlineData(UnionMode.Sparse)] [InlineData(UnionMode.Dense)] - public void UnionArray_Slice(UnionMode mode) + public void UnionArraySlice(UnionMode mode) { var (array, expectedNull) = BuildUnionArray(mode, 10); @@ -46,32 +47,92 @@ public class UnionArrayTests { for (var length = 0; length < array.Length - offset; ++length) { - var slicedArray = ArrowArrayFactory.Slice(array, offset, length); + var slicedArray = (UnionArray)ArrowArrayFactory.Slice(array, offset, length); var nullCount = 0; for (var i = 0; i < slicedArray.Length; ++i) { - // TODO: Shouldn't need to add offset in IsNull/IsValid calls, - // see https://github.com/apache/arrow/issues/41140 - Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(offset + i)); - Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(offset + i)); + Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(i)); + Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(i)); nullCount += expectedNull[offset + i] ? 1 : 0; + + CompareValue(array, offset + i, slicedArray, i); } - Assert.True(nullCount == slicedArray.NullCount, $"offset = {offset}, length = {length}"); Assert.Equal(nullCount, slicedArray.NullCount); } } } - private static (UnionArray array, bool[] isNull) BuildUnionArray(UnionMode mode, int length) + [Theory] + [InlineData(UnionMode.Sparse)] + [InlineData(UnionMode.Dense)] + public void UnionArrayConstructedWithOffset(UnionMode mode) + { + const int length = 10; + var (array, expectedNull) = BuildUnionArray(mode, length); + + for (var offset = 0; offset < array.Length; ++offset) + { + var (slicedArray, _) = BuildUnionArray(mode, length, offset); + + var nullCount = 0; + for (var i = 0; i < slicedArray.Length; ++i) + { + Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(i)); + Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(i)); + nullCount += expectedNull[offset + i] ? 1 : 0; + + CompareValue(array, offset + i, slicedArray, i); + } + + Assert.Equal(nullCount, slicedArray.NullCount); + } + } + + private static void CompareValue(UnionArray originalArray, int originalIndex, UnionArray slicedArray, int sliceIndex) + { + var typeId = originalArray.TypeIds[originalIndex]; + var sliceTypeId = slicedArray.TypeIds[sliceIndex]; + Assert.Equal(typeId, sliceTypeId); + + switch (typeId) + { + case 0: + CompareFieldValue<int, Int32Array>(typeId, originalArray, originalIndex, slicedArray, sliceIndex); + break; + case 1: + CompareFieldValue<float, FloatArray>(typeId, originalArray, originalIndex, slicedArray, sliceIndex); + break; + default: + throw new Exception($"Unexpected type id {typeId}"); + } + } + + private static void CompareFieldValue<T, TArray>(byte typeId, UnionArray originalArray, int originalIndex, UnionArray slicedArray, int sliceIndex) + where T: struct + where TArray : PrimitiveArray<T> + { + if (originalArray is DenseUnionArray denseOriginalArray) + { + Assert.IsType<DenseUnionArray>(slicedArray); + + originalIndex = denseOriginalArray.ValueOffsets[originalIndex]; + sliceIndex = ((DenseUnionArray)slicedArray).ValueOffsets[sliceIndex]; + } + var originalValue = ((TArray)originalArray.Fields[typeId]).GetValue(originalIndex); + var sliceValue = ((TArray)slicedArray.Fields[typeId]).GetValue(sliceIndex); + Assert.Equal(originalValue, sliceValue); + } + + private static (UnionArray array, bool[] isNull) BuildUnionArray(UnionMode mode, int length, int offset=0) { var fields = new Field[] { new Field("field0", new Int32Type(), true), new Field("field1", new FloatType(), true), }; - var typeIds = fields.Select(f => (int) f.DataType.TypeId).ToArray(); + var typeIds = new[] { 0, 1 }; var type = new UnionType(fields, typeIds, mode); var nullCount = 0; @@ -85,7 +146,7 @@ public class UnionArrayTests { var isNull = i % 3 == 0; expectedNull[i] = isNull; - nullCount += isNull ? 1 : 0; + nullCount += (isNull && i >= offset) ? 1 : 0; if (i % 2 == 0) { @@ -140,8 +201,8 @@ public class UnionArrayTests }; UnionArray array = mode == UnionMode.Dense - ? new DenseUnionArray(type, length, children, typeIdsBuffer, valuesOffsetBuffer, nullCount) - : new SparseUnionArray(type, length, children, typeIdsBuffer, nullCount); + ? new DenseUnionArray(type, length - offset, children, typeIdsBuffer, valuesOffsetBuffer, nullCount, offset) + : new SparseUnionArray(type, length - offset, children, typeIdsBuffer, nullCount, offset); return (array, expectedNull); }
