This is an automated email from the ASF dual-hosted git repository.

curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f20c1e8150 GH-41136: [C#] Recompute null count for sliced arrays on 
demand (#41144)
f20c1e8150 is described below

commit f20c1e815089f9113273393634cf925fde59958a
Author: Adam Reeve <[email protected]>
AuthorDate: Fri Apr 12 15:00:02 2024 +1200

    GH-41136: [C#] Recompute null count for sliced arrays on demand (#41144)
    
    ### Rationale for this change
    
    See #41136. This is required for writing sliced arrays to IPC files and 
streams. It should also be generally useful too by allowing users to be able to 
rely on valid null count values for sliced arrays.
    
    ### What changes are included in this PR?
    
    Adds a new `ArrayData.GetNullCount()` method which computes the actual null 
count if it is unknown, and changes most uses of `ArrayData.NullCount` to 
`GetNullCount()`.
    
    ### Are these changes tested?
    
    Yes, I've extended the existing slicing tests to also verify the null 
count, and added new tests specifically for union arrays, which needed special 
handling.
    
    ### Are there any user-facing changes?
    
    Yes, this is a user-facing behaviour change. I don't expect that it should 
break any existing code as users are unlikely to rely on the NullCount being -1.
    
    * GitHub Issue: #41136
    
    Authored-by: Adam Reeve <[email protected]>
    Signed-off-by: Curt Hagenlocher <[email protected]>
---
 csharp/src/Apache.Arrow/Arrays/Array.cs            |  2 +-
 csharp/src/Apache.Arrow/Arrays/ArrayData.cs        | 60 ++++++++++++++++++++--
 .../Apache.Arrow/Arrays/ArrayDataConcatenator.cs   |  2 +-
 csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs  | 23 +++++++++
 csharp/src/Apache.Arrow/Arrays/NullArray.cs        |  2 +-
 csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs | 21 ++++++++
 csharp/src/Apache.Arrow/Arrays/UnionArray.cs       | 12 ++++-
 csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs   |  2 +-
 csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs   |  2 +-
 csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs  | 15 ++++++
 csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs  | 49 +++++++++++++++---
 11 files changed, 174 insertions(+), 16 deletions(-)

diff --git a/csharp/src/Apache.Arrow/Arrays/Array.cs 
b/csharp/src/Apache.Arrow/Arrays/Array.cs
index 0838134b19..4abe63e05a 100644
--- a/csharp/src/Apache.Arrow/Arrays/Array.cs
+++ b/csharp/src/Apache.Arrow/Arrays/Array.cs
@@ -31,7 +31,7 @@ namespace Apache.Arrow
 
         public int Offset => Data.Offset;
 
-        public int NullCount => Data.NullCount;
+        public int NullCount => Data.GetNullCount();
 
         public ArrowBuffer NullBitmapBuffer => Data.Buffers[0];
 
diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs 
b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs
index 55d77f598c..cdb6ed6b39 100644
--- a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs
+++ b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs
@@ -15,7 +15,6 @@
 
 using Apache.Arrow.Memory;
 using Apache.Arrow.Types;
-using Google.FlatBuffers;
 using System;
 using System.Collections.Generic;
 using System.Linq;
@@ -28,12 +27,30 @@ namespace Apache.Arrow
 
         public readonly IArrowType DataType;
         public readonly int Length;
-        public readonly int NullCount;
+
+        /// <summary>
+        /// The number of null values in the Array. May be -1 if the null 
count has not been computed.
+        /// </summary>
+        public int NullCount;
+
         public readonly int Offset;
         public readonly ArrowBuffer[] Buffers;
         public readonly ArrayData[] Children;
         public readonly ArrayData Dictionary; // Only used for dictionary type
 
+        /// <summary>
+        /// Get the number of null values in the Array, computing the count if 
required.
+        /// </summary>
+        public int GetNullCount()
+        {
+            if (NullCount == RecalculateNullCount)
+            {
+                NullCount = ComputeNullCount();
+            }
+
+            return NullCount;
+        }
+
         // This is left for compatibility with lower version binaries
         // before the dictionary type was supported.
         public ArrayData(
@@ -111,7 +128,25 @@ namespace Apache.Arrow
             length = Math.Min(Length - offset, length);
             offset += Offset;
 
-            return new ArrayData(DataType, length, RecalculateNullCount, 
offset, Buffers, Children, Dictionary);
+            int nullCount;
+            if (NullCount == 0)
+            {
+                nullCount = 0;
+            }
+            else if (NullCount == Length)
+            {
+                nullCount = length;
+            }
+            else if (offset == Offset && length == Length)
+            {
+                nullCount = NullCount;
+            }
+            else
+            {
+                nullCount = RecalculateNullCount;
+            }
+
+            return new ArrayData(DataType, length, nullCount, offset, Buffers, 
Children, Dictionary);
         }
 
         public ArrayData Clone(MemoryAllocator allocator = default)
@@ -125,5 +160,24 @@ namespace Apache.Arrow
                 Children?.Select(b => b.Clone(allocator))?.ToArray(),
                 Dictionary?.Clone(allocator));
         }
+
+        private int ComputeNullCount()
+        {
+            if (DataType.TypeId == ArrowTypeId.Union)
+            {
+                return UnionArray.ComputeNullCount(this);
+            }
+
+            if (Buffers == null || Buffers.Length == 0 || Buffers[0].IsEmpty)
+            {
+                return 0;
+            }
+
+            // Note: Dictionary arrays may be logically null if there is a 
null in the dictionary values,
+            // but this isn't accounted for by the IArrowArray.IsNull 
implementation,
+            // so we maintain consistency with that behaviour here.
+
+            return Length - BitUtility.CountBits(Buffers[0].Span, Offset, 
Length);
+        }
     }
 }
diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs 
b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
index 698d74e4ba..84658a5fab 100644
--- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
+++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
@@ -71,7 +71,7 @@ namespace Apache.Arrow
                 foreach (ArrayData arrayData in _arrayDataList)
                 {
                     _totalLength += arrayData.Length;
-                    _totalNullCount += arrayData.NullCount;
+                    _totalNullCount += arrayData.GetNullCount();
                 }
             }
 
diff --git a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs 
b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs
index b6b61c560e..459a30e221 100644
--- a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs
@@ -53,5 +53,28 @@ namespace Apache.Arrow
         {
             return fieldArray.IsValid(ValueOffsets[index]);
         }
+
+        internal new static int ComputeNullCount(ArrayData data)
+        {
+            var offset = data.Offset;
+            var length = data.Length;
+            var typeIds = data.Buffers[0].Span.Slice(offset, length);
+            var valueOffsets = 
data.Buffers[1].Span.CastTo<int>().Slice(offset, length);
+            var childArrays = new IArrowArray[data.Children.Length];
+            for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx)
+            {
+                childArrays[childIdx] = 
ArrowArrayFactory.BuildArray(data.Children[childIdx]);
+            }
+
+            var nullCount = 0;
+            for (var i = 0; i < length; ++i)
+            {
+                var typeId = typeIds[i];
+                var valueOffset = valueOffsets[i];
+                nullCount += childArrays[typeId].IsNull(valueOffset) ? 1 : 0;
+            }
+
+            return nullCount;
+        }
     }
 }
diff --git a/csharp/src/Apache.Arrow/Arrays/NullArray.cs 
b/csharp/src/Apache.Arrow/Arrays/NullArray.cs
index 762540065c..7f3e183829 100644
--- a/csharp/src/Apache.Arrow/Arrays/NullArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/NullArray.cs
@@ -95,7 +95,7 @@ namespace Apache.Arrow
 
         public int Offset => Data.Offset;
 
-        public int NullCount => Data.NullCount;
+        public int NullCount => Data.GetNullCount();
 
         public void Dispose() { }
         public bool IsNull(int index) => true;
diff --git a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs 
b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs
index 07d36e25cf..ef55786f01 100644
--- a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs
@@ -47,5 +47,26 @@ namespace Apache.Arrow
         {
             return fieldArray.IsValid(index);
         }
+
+        internal new static int ComputeNullCount(ArrayData data)
+        {
+            var offset = data.Offset;
+            var length = data.Length;
+            var typeIds = data.Buffers[0].Span.Slice(offset, length);
+            var childArrays = new IArrowArray[data.Children.Length];
+            for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx)
+            {
+                childArrays[childIdx] = 
ArrowArrayFactory.BuildArray(data.Children[childIdx]);
+            }
+
+            var nullCount = 0;
+            for (var i = 0; i < data.Length; ++i)
+            {
+                var typeId = typeIds[i];
+                nullCount += childArrays[typeId].IsNull(offset + i) ? 1 : 0;
+            }
+
+            return nullCount;
+        }
     }
 }
diff --git a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs 
b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs
index 5fcb276655..f96f527135 100644
--- a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs
+++ b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs
@@ -41,7 +41,7 @@ namespace Apache.Arrow
 
         public int Offset => Data.Offset;
 
-        public int NullCount => Data.NullCount;
+        public int NullCount => Data.GetNullCount();
 
         public bool IsValid(int index) => NullCount == 0 || 
FieldIsValid(Fields[TypeIds[index]], index);
 
@@ -91,6 +91,16 @@ namespace Apache.Arrow
             }
         }
 
+        internal static int ComputeNullCount(ArrayData data)
+        {
+            return ((UnionType)data.DataType).Mode switch
+            {
+                UnionMode.Sparse => SparseUnionArray.ComputeNullCount(data),
+                UnionMode.Dense => DenseUnionArray.ComputeNullCount(data),
+                _ => throw new InvalidOperationException("unknown union mode 
in null count computation")
+            };
+        }
+
         private IReadOnlyList<IArrowArray> InitializeFields()
         {
             IArrowArray[] result = new IArrowArray[Data.Children.Length];
diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs 
b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs
index 03059eaf5d..b241fdfea3 100644
--- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs
+++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs
@@ -115,7 +115,7 @@ namespace Apache.Arrow.C
         {
             cArray->length = array.Length;
             cArray->offset = array.Offset;
-            cArray->null_count = array.NullCount;
+            cArray->null_count = array.NullCount; // The C Data interface 
allows the null count to be -1
             cArray->release = ReleaseArrayPtr;
             cArray->private_data = MakePrivateData(sharedOwner);
 
diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs 
b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
index b002f8c8b1..7b319b03d7 100644
--- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
+++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
@@ -376,7 +376,7 @@ namespace Apache.Arrow.Ipc
                     CreateSelfAndChildrenFieldNodes(data.Children[i]);
                 }
             }
-            Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, 
data.NullCount);
+            Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, 
data.GetNullCount());
         }
 
         private static int CountAllNodes(IReadOnlyList<Field> fields)
diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs 
b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs
index a0e90cbbc7..682ebec323 100644
--- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs
@@ -185,6 +185,7 @@ namespace Apache.Arrow.Tests
             TestSlice<Date64Array, Date64Array.Builder>(x => x.Append(new 
DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new 
DateTime(2019, 1, 3)));
             TestSlice<Time32Array, Time32Array.Builder>(x => 
x.Append(10).Append(20).AppendNull().Append(30));
             TestSlice<Time64Array, Time64Array.Builder>(x => 
x.Append(10).Append(20).AppendNull().Append(30));
+            TestSlice<Int32Array, Int32Array.Builder>(x => 
x.AppendNull().AppendNull().AppendNull());  // All nulls
 
             static void TestNumberSlice<T, TArray, TBuilder>()
                 where T : struct, INumber<T>
@@ -314,6 +315,8 @@ namespace Apache.Arrow.Tests
                         .SequenceEqual(slicedArray.Values));
 
                 Assert.Equal(baseArray.GetValue(slicedArray.Offset), 
slicedArray.GetValue(0));
+
+                ValidateNullCount(slicedArray);
             }
 
             private void ValidateArrays(BooleanArray slicedArray)
@@ -333,6 +336,8 @@ namespace Apache.Arrow.Tests
 #pragma warning disable CS0618
                 Assert.Equal(baseArray.GetBoolean(slicedArray.Offset), 
slicedArray.GetBoolean(0));
 #pragma warning restore CS0618
+
+                ValidateNullCount(slicedArray);
             }
 
             private void ValidateArrays(BinaryArray slicedArray)
@@ -347,6 +352,16 @@ namespace Apache.Arrow.Tests
                         .SequenceEqual(slicedArray.ValueOffsets));
 
                 
Assert.True(baseArray.GetBytes(slicedArray.Offset).SequenceEqual(slicedArray.GetBytes(0)));
+
+                ValidateNullCount(slicedArray);
+            }
+
+            private static void ValidateNullCount(IArrowArray slicedArray)
+            {
+                var expectedNullCount = Enumerable.Range(0, slicedArray.Length)
+                    .Select(i => slicedArray.IsNull(i) ? 1 : 0)
+                    .Sum();
+                Assert.Equal(expectedNullCount, slicedArray.NullCount);
             }
         }
     }
diff --git a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs 
b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs
index 1fb5cf2415..45fed722a7 100644
--- a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs
+++ b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs
@@ -25,6 +25,46 @@ public class UnionArrayTests
     [InlineData(UnionMode.Sparse)]
     [InlineData(UnionMode.Dense)]
     public void UnionArray_IsNull(UnionMode mode)
+    {
+        var (array, expectedNull) = BuildUnionArray(mode, 100);
+
+        for (var i = 0; i < array.Length; ++i)
+        {
+            Assert.Equal(expectedNull[i], array.IsNull(i));
+            Assert.Equal(!expectedNull[i], array.IsValid(i));
+        }
+    }
+
+    [Theory]
+    [InlineData(UnionMode.Sparse)]
+    [InlineData(UnionMode.Dense)]
+    public void UnionArray_Slice(UnionMode mode)
+    {
+        var (array, expectedNull) = BuildUnionArray(mode, 10);
+
+        for (var offset = 0; offset < array.Length; ++offset)
+        {
+            for (var length = 0; length < array.Length - offset; ++length)
+            {
+                var slicedArray = ArrowArrayFactory.Slice(array, offset, 
length);
+
+                var nullCount = 0;
+                for (var i = 0; i < slicedArray.Length; ++i)
+                {
+                    // TODO: Shouldn't need to add offset in IsNull/IsValid 
calls,
+                    // see https://github.com/apache/arrow/issues/41140
+                    Assert.Equal(expectedNull[offset + i], 
slicedArray.IsNull(offset + i));
+                    Assert.Equal(!expectedNull[offset + i], 
slicedArray.IsValid(offset + i));
+                    nullCount += expectedNull[offset + i] ? 1 : 0;
+                }
+
+                Assert.True(nullCount == slicedArray.NullCount, $"offset = 
{offset}, length = {length}");
+                Assert.Equal(nullCount, slicedArray.NullCount);
+            }
+        }
+    }
+
+    private static (UnionArray array, bool[] isNull) BuildUnionArray(UnionMode 
mode, int length)
     {
         var fields = new Field[]
         {
@@ -34,7 +74,6 @@ public class UnionArrayTests
         var typeIds = fields.Select(f => (int) f.DataType.TypeId).ToArray();
         var type = new UnionType(fields, typeIds, mode);
 
-        const int length = 100;
         var nullCount = 0;
         var field0Builder = new Int32Array.Builder();
         var field1Builder = new FloatArray.Builder();
@@ -44,7 +83,7 @@ public class UnionArrayTests
 
         for (var i = 0; i < length; ++i)
         {
-            var isNull = i % 5 == 0;
+            var isNull = i % 3 == 0;
             expectedNull[i] = isNull;
             nullCount += isNull ? 1 : 0;
 
@@ -104,10 +143,6 @@ public class UnionArrayTests
             ? new DenseUnionArray(type, length, children, typeIdsBuffer, 
valuesOffsetBuffer, nullCount)
             : new SparseUnionArray(type, length, children, typeIdsBuffer, 
nullCount);
 
-        for (var i = 0; i < length; ++i)
-        {
-            Assert.Equal(expectedNull[i], array.IsNull(i));
-            Assert.Equal(!expectedNull[i], array.IsValid(i));
-        }
+        return (array, expectedNull);
     }
 }

Reply via email to