Copilot commented on code in PR #308:
URL: https://github.com/apache/arrow-dotnet/pull/308#discussion_r3047801024


##########
src/Apache.Arrow/Ipc/ArrowStreamWriter.cs:
##########
@@ -797,6 +840,7 @@ private protected void WriteRecordBatchInternal(RecordBatch 
recordBatch)
                 recordBatchOffset, recordBatchBuilder.TotalLength);
 
             long bufferLength = WriteBufferData(recordBatchBuilder.Buffers);
+            recordBatchBuilder.DisposeDeferredArrays();
 

Review Comment:
   DisposeDeferredArrays() is only called after WriteBufferData. If 
WriteMessage/WriteBufferData throws, the normalized REE arrays held in 
_deferredDisposals will leak (and may retain native buffers) because they’re 
never disposed. Wrap the message/body write in a try/finally (or make 
ArrowRecordBatchFlatBufferBuilder IDisposable and dispose it in finally) so 
deferred arrays are always released.



##########
src/Apache.Arrow/Ipc/ArrowStreamWriter.cs:
##########
@@ -837,6 +881,7 @@ private protected async Task 
WriteRecordBatchInternalAsync(RecordBatch recordBat
                 cancellationToken).ConfigureAwait(false);
 
             long bufferLength = await 
WriteBufferDataAsync(recordBatchBuilder.Buffers, 
cancellationToken).ConfigureAwait(false);
+            recordBatchBuilder.DisposeDeferredArrays();
 

Review Comment:
   DisposeDeferredArrays() should run in a finally block here as well. As 
written, an exception from WriteMessageAsync/WriteBufferDataAsync will skip 
disposal of any deferred normalized arrays, causing a memory/resource leak.



##########
src/Apache.Arrow/Ipc/ArrowStreamWriter.cs:
##########
@@ -986,6 +1031,7 @@ private protected void WriteDictionary(long id, IArrowType 
valueType, IArrowArra
                 dictionaryBatchOffset, recordBatchBuilder.TotalLength);
 
             long bufferLength = WriteBufferData(recordBatchBuilder.Buffers);
+            recordBatchBuilder.DisposeDeferredArrays();
 

Review Comment:
   Same issue as record batches: DisposeDeferredArrays() is not in a finally. 
If WriteMessage/WriteBufferData throws while writing a dictionary batch, 
deferred arrays (e.g., normalized REE slices) won’t be disposed.



##########
src/Apache.Arrow/Arrays/RunEndEncodedArray.cs:
##########
@@ -0,0 +1,399 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using Apache.Arrow.Memory;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow;
+
+/// <summary>
+/// Represents a run-end encoded array.
+/// A run-end encoded array stores consecutive runs of the same value more 
efficiently.
+/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any 
type).
+/// The run_ends array stores the cumulative end positions of each run.
+/// </summary>
+public class RunEndEncodedArray : Array
+{
+    /// <summary>
+    /// Gets the run ends array (Int16Array, Int32Array, or Int64Array).
+    /// This array contains the cumulative end indices for each run.
+    /// </summary>
+    public IArrowArray RunEnds { get; }
+
+    /// <summary>
+    /// Gets the values array.
+    /// This array contains the actual values that are run-length encoded.
+    /// </summary>
+    public IArrowArray Values { get; }
+
+    /// <summary>
+    /// Creates a new RunEndEncodedArray from ArrayData.
+    /// </summary>
+    /// <param name="data">The array data containing run ends and values as 
children.</param>
+    public RunEndEncodedArray(ArrayData data)
+        : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), 
ArrowArrayFactory.BuildArray(data.Children[1]))
+    {
+    }
+
+    /// <summary>
+    /// Creates a new RunEndEncodedArray with specified run ends and values 
arrays.
+    /// </summary>
+    /// <param name="runEnds">The run ends array (must be Int16Array, 
Int32Array, or Int64Array).</param>
+    /// <param name="values">The values array (can be any type).</param>
+    public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values)
+        : this(CreateArrayData(runEnds, values), runEnds, values)
+    {
+    }
+
+    private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, 
IArrowArray values)
+        : base(data)
+    {
+        data.EnsureBufferCount(0); // REE arrays have no buffers, only children
+        data.EnsureDataType(ArrowTypeId.RunEndEncoded);
+
+        if (data.NullCount != 0)
+        {
+            throw new ArgumentException(
+                $"Run-end encoded arrays have no top-level validity bitmap and 
must report null count 0, but got {data.NullCount}.",
+                nameof(data));
+        }
+
+        ValidateRunEndsType(runEnds);
+
+        if (runEnds.Length != values.Length)
+        {
+            throw new ArgumentException(
+                $"Run ends array length ({runEnds.Length}) must equal values 
array length ({values.Length}).");
+        }

Review Comment:
   When constructing from ArrayData, the top-level Length/Offset is not 
validated against the run_ends values. This allows malformed ArrayData where 
Offset+Length exceeds the last run_end, which will break 
Normalize()/FindPhysicalIndex semantics. Add a validation that the slice range 
(data.Offset + data.Length) does not exceed the logical length implied by the 
last run_end value.



##########
src/Apache.Arrow/Ipc/ArrowStreamWriter.cs:
##########
@@ -1010,6 +1056,7 @@ private protected async Task WriteDictionaryAsync(long 
id, IArrowType valueType,
                 dictionaryBatchOffset, recordBatchBuilder.TotalLength, 
cancellationToken).ConfigureAwait(false);
 
             long bufferLength = await 
WriteBufferDataAsync(recordBatchBuilder.Buffers, 
cancellationToken).ConfigureAwait(false);
+            recordBatchBuilder.DisposeDeferredArrays();
 

Review Comment:
   Async dictionary writing has the same leak hazard: DisposeDeferredArrays() 
should be guaranteed via try/finally so deferred normalized arrays are disposed 
even when WriteMessageAsync/WriteBufferDataAsync throws.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to