eerhardt commented on a change in pull request #7032:
URL: https://github.com/apache/arrow/pull/7032#discussion_r417707980



##########
File path: csharp/src/Apache.Arrow/Arrays/BinaryArray.cs
##########
@@ -111,23 +130,32 @@ public TBuilder AppendRange(IEnumerable<byte[]> values)
             public TBuilder AppendRange(IEnumerable<byte> values)
             {
                 var len = ValueBuffer.Length;
-                ValueOffsets.Append(Offset);
                 ValueBuffer.AppendRange(values);
-                Offset += ValueBuffer.Length - len;
+                var valOffset = ValueBuffer.Length - len;
+                if (valOffset == 0)

Review comment:
       I think I would check for `values` to be `null` explicitly, and only 
`AppendNull()` in that case. That way an empty Enumerable and `null` are 
treated separately (as they should be).

##########
File path: csharp/src/Apache.Arrow/Arrays/BinaryArray.cs
##########
@@ -73,24 +76,34 @@ public TArray Build(MemoryAllocator allocator = default)
             {
                 ValueOffsets.Append(Offset);
 
-                var data = new ArrayData(DataType, ValueOffsets.Length - 1, 0, 
0,
-                    new[] { ArrowBuffer.Empty, ValueOffsets.Build(allocator), 
ValueBuffer.Build(allocator) });
+                var data = new ArrayData(DataType, ValueOffsets.Length - 1, 
NullCount, 0,
+                    new[] { ValidityBuffer.Build(allocator).ValueBuffer, 
ValueOffsets.Build(allocator), ValueBuffer.Build(allocator) });

Review comment:
       Were we going to check for `NullCount == 0` and use an empty buffer for 
the validity buffer in that case?
   
   The same goes for `PrimitiveArrayBuilder` and `BoleanArray.Builder`.

##########
File path: csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs
##########
@@ -99,55 +105,75 @@ public abstract class PrimitiveArrayBuilder<T, TArray, 
TBuilder> : IArrowArrayBu
     {
         protected TBuilder Instance => this as TBuilder;
         protected ArrowBuffer.Builder<T> ValueBuffer { get; }
+        protected BooleanArray.Builder ValidityBuffer { get; }
 
         public int Length => ValueBuffer.Length;
-
-        // TODO: Implement support for null values (null bitmaps)
+        protected int NullCount { get; set; }
 
         internal PrimitiveArrayBuilder()
         {
             ValueBuffer = new ArrowBuffer.Builder<T>();
+            ValidityBuffer = new BooleanArray.Builder();
         }
 
         public TBuilder Resize(int length)
         {
             ValueBuffer.Resize(length);
+            ValidityBuffer.Resize(length + 1);
             return Instance;
         }
 
         public TBuilder Reserve(int capacity)
         {
             ValueBuffer.Reserve(capacity);
+            ValidityBuffer.Reserve(capacity + 1);
             return Instance;
         }
 
         public TBuilder Append(T value)
         {
             ValueBuffer.Append(value);
+            ValidityBuffer.Append(true);
             return Instance;
         }
 
         public TBuilder Append(ReadOnlySpan<T> span)
         {
+            var len = ValueBuffer.Length;
             ValueBuffer.Append(span);
+            ValidityBuffer.AppendRange(Enumerable.Repeat(true, 
ValueBuffer.Length - len));
             return Instance;
         }
 
         public TBuilder AppendRange(IEnumerable<T> values)
         {
+            var len = ValueBuffer.Length;
             ValueBuffer.AppendRange(values);
+            ValidityBuffer.AppendRange(Enumerable.Repeat(true, 
ValueBuffer.Length - len));
+            return Instance;
+        }
+
+        public TBuilder AppendNull()
+        {
+            ValidityBuffer.Append(false);
+            NullCount++;
+            // Need this until this is refactored to use

Review comment:
       I'm not sure I understand this comment. Can you explain?

##########
File path: csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs
##########
@@ -99,55 +105,75 @@ public abstract class PrimitiveArrayBuilder<T, TArray, 
TBuilder> : IArrowArrayBu
     {
         protected TBuilder Instance => this as TBuilder;
         protected ArrowBuffer.Builder<T> ValueBuffer { get; }
+        protected BooleanArray.Builder ValidityBuffer { get; }
 
         public int Length => ValueBuffer.Length;
-
-        // TODO: Implement support for null values (null bitmaps)
+        protected int NullCount { get; set; }
 
         internal PrimitiveArrayBuilder()
         {
             ValueBuffer = new ArrowBuffer.Builder<T>();
+            ValidityBuffer = new BooleanArray.Builder();
         }
 
         public TBuilder Resize(int length)
         {
             ValueBuffer.Resize(length);
+            ValidityBuffer.Resize(length + 1);

Review comment:
       Why the `+ 1` here? Shouldn't the ValidityBuffer and ValueBuffer be the 
same size? (Same question for Reserve below).

##########
File path: csharp/src/Apache.Arrow/Arrays/StringArray.cs
##########
@@ -71,6 +76,15 @@ public string GetString(int index, Encoding encoding = 
default)
 
             var bytes = GetBytes(index);
 
+            if (bytes == Span<byte>.Empty)

Review comment:
       We had similar conversations about this in the `DataFrame` type we made 
here: https://github.com/dotnet/corefxlab/pull/2710#issuecomment-521799900.
   
   I think it makes more sense to say `if (bytes == default)` here.
   
   Or another idea is to add a `ReadOnlySpan<byte> GetBytes(int index, out bool 
isNull)` method to `BinaryArray`, which would allow callers know explicitly if 
the value is `null` without having a double `IsValid` check.

##########
File path: csharp/src/Apache.Arrow/Arrays/StringArray.cs
##########
@@ -71,6 +76,15 @@ public string GetString(int index, Encoding encoding = 
default)
 
             var bytes = GetBytes(index);
 
+            if (bytes == Span<byte>.Empty)
+            {
+                return null;
+            }
+            if (bytes.Length == 0)

Review comment:
       Do we have unit tests for both of these cases to make sure they act 
appropriately? A test for `null` and a test for `string.Empty`.

##########
File path: csharp/src/Apache.Arrow/Arrays/BooleanArray.cs
##########
@@ -153,17 +182,25 @@ private void CheckIndex(int index)
                 new[] { nullBitmapBuffer, valueBuffer }))
         { }
 
-        public BooleanArray(ArrayData data) 
+        public BooleanArray(ArrayData data)
             : base(data)
         {
             data.EnsureDataType(ArrowTypeId.Boolean);
         }
 
         public override void Accept(IArrowArrayVisitor visitor) => 
Accept(this, visitor);
 
+        [Obsolete("GetBoolean does not support null values. Use GetValue 
instead (which this method invokes internally).")]
         public bool GetBoolean(int index)
         {
-            return BitUtility.GetBit(Values, index);
+            return GetValue(index).GetValueOrDefault();
+        }
+
+        public bool? GetValue(int index)
+        {
+            return BitUtility.GetBit(Nulls, Offset + index)

Review comment:
       I think we are double counting `Offset` here, since `Nulls` and `Values` 
already account for `Offset`. Should the calls here just be 
`BitUtility.GetBit(Nulls, index)` and `BitUtility.GetBit(Values, index)`?

##########
File path: csharp/src/Apache.Arrow/Arrays/BooleanArray.cs
##########
@@ -153,17 +182,25 @@ private void CheckIndex(int index)
                 new[] { nullBitmapBuffer, valueBuffer }))
         { }
 
-        public BooleanArray(ArrayData data) 
+        public BooleanArray(ArrayData data)
             : base(data)
         {
             data.EnsureDataType(ArrowTypeId.Boolean);
         }
 
         public override void Accept(IArrowArrayVisitor visitor) => 
Accept(this, visitor);
 
+        [Obsolete("GetBoolean does not support null values. Use GetValue 
instead (which this method invokes internally).")]
         public bool GetBoolean(int index)
         {
-            return BitUtility.GetBit(Values, index);
+            return GetValue(index).GetValueOrDefault();
+        }
+
+        public bool? GetValue(int index)
+        {
+            return BitUtility.GetBit(Nulls, Offset + index)

Review comment:
       Do we have a unit test for this situation? Where Offset is non-zero, and 
we are calling `GetValue(index)`.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to