Copilot commented on code in PR #308: URL: https://github.com/apache/arrow-dotnet/pull/308#discussion_r3045035711
########## src/Apache.Arrow/Arrays/RunEndEncodedArray.cs: ########## @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow.Types; + +namespace Apache.Arrow; + +/// <summary> +/// Represents a run-end encoded array. +/// A run-end encoded array stores consecutive runs of the same value more efficiently. +/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type). +/// The run_ends array stores the cumulative end positions of each run. +/// </summary> +public class RunEndEncodedArray : Array +{ + /// <summary> + /// Gets the run ends array (Int16Array, Int32Array, or Int64Array). + /// This array contains the cumulative end indices for each run. + /// </summary> + public IArrowArray RunEnds { get; } + + /// <summary> + /// Gets the values array. + /// This array contains the actual values that are run-length encoded. + /// </summary> + public IArrowArray Values { get; } + + /// <summary> + /// Creates a new RunEndEncodedArray from ArrayData. + /// </summary> + /// <param name="data">The array data containing run ends and values as children.</param> + public RunEndEncodedArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1])) + { + } + + /// <summary> + /// Creates a new RunEndEncodedArray with specified run ends and values arrays. + /// </summary> + /// <param name="runEnds">The run ends array (must be Int16Array, Int32Array, or Int64Array).</param> + /// <param name="values">The values array (can be any type).</param> + public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values) + : this(CreateArrayData(runEnds, values), runEnds, values) + { + } + + private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values) + : base(data) + { + data.EnsureBufferCount(0); // REE arrays have no buffers, only children + data.EnsureDataType(ArrowTypeId.RunEndEncoded); + + ValidateRunEndsType(runEnds); + RunEnds = runEnds; + Values = values; + } + + private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values) + { + ValidateRunEndsType(runEnds); + + // The logical length of a REE array is determined by the last value in run_ends + int logicalLength = GetLogicalLength(runEnds); + + var dataType = new RunEndEncodedType(runEnds.Data.DataType, values.Data.DataType); + + return new ArrayData( + dataType, + logicalLength, + nullCount: 0, // REE arrays don't have a validity bitmap + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + } + + private static void ValidateRunEndsType(IArrowArray runEnds) + { + ArrowTypeId typeId = runEnds.Data.DataType.TypeId; + if (typeId != ArrowTypeId.Int16 && + typeId != ArrowTypeId.Int32 && + typeId != ArrowTypeId.Int64) + { + throw new ArgumentException( + $"Run ends array must be Int16, Int32, or Int64, but got {typeId}", + nameof(runEnds)); + } + } + + private static int GetLogicalLength(IArrowArray runEnds) + { + if (runEnds.Length == 0) + { + return 0; + } + + // Get the last run end value which represents the logical length + switch (runEnds) + { + case Int16Array int16Array: + return int16Array.GetValue(int16Array.Length - 1) ?? throw new ArgumentException("invalid length"); + case Int32Array int32Array: + return int32Array.GetValue(int32Array.Length - 1) ?? throw new ArgumentException("invalid length"); + case Int64Array int64Array: + { + long? lastValue = int64Array.GetValue(int64Array.Length - 1); + if (lastValue.HasValue && lastValue.Value > int.MaxValue) + { + throw new ArgumentException("Run ends value exceeds maximum supported length."); + } + return (int)(lastValue ?? throw new ArgumentException("invalid length")); + } + default: + throw new InvalidOperationException($"Unexpected run ends array type: {runEnds.Data.DataType.TypeId}"); + } + } + + /// <summary> + /// Finds the physical index in the run_ends array that contains the specified logical index. + /// </summary> + /// <param name="logicalIndex">The logical index in the decoded array.</param> + /// <returns>The physical index in the run_ends/values arrays.</returns> + public int FindPhysicalIndex(int logicalIndex) + { + if (logicalIndex < 0 || logicalIndex >= Length) + { + throw new ArgumentOutOfRangeException(nameof(logicalIndex)); + } + + // Binary search to find the run that contains this logical index + return RunEnds switch + { + Int16Array int16Array => BinarySearchRunEnds(int16Array, logicalIndex), + Int32Array int32Array => BinarySearchRunEnds(int32Array, logicalIndex), + Int64Array int64Array => BinarySearchRunEnds(int64Array, logicalIndex), Review Comment: FindPhysicalIndex doesn't account for the REE array's Data.Offset (slices). For a sliced RunEndEncodedArray, logical indices are relative to the slice, but the binary search compares against run_ends values from the unsliced physical representation, producing incorrect physical indices. Adjust the searched logical index by Data.Offset (or otherwise make FindPhysicalIndex slice-aware) and add a test covering FindPhysicalIndex on a sliced REE array. ```suggestion int searchLogicalIndex = logicalIndex + Data.Offset; // Binary search to find the run that contains this logical index. // For sliced arrays, run_ends remain in the original logical coordinate // space, so we need to account for the array offset before searching. return RunEnds switch { Int16Array int16Array => BinarySearchRunEnds(int16Array, searchLogicalIndex), Int32Array int32Array => BinarySearchRunEnds(int32Array, searchLogicalIndex), Int64Array int64Array => BinarySearchRunEnds(int64Array, searchLogicalIndex), ``` ########## src/Apache.Arrow/Arrays/RunEndEncodedArray.cs: ########## @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow.Types; + +namespace Apache.Arrow; + +/// <summary> +/// Represents a run-end encoded array. +/// A run-end encoded array stores consecutive runs of the same value more efficiently. +/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type). +/// The run_ends array stores the cumulative end positions of each run. +/// </summary> +public class RunEndEncodedArray : Array +{ + /// <summary> + /// Gets the run ends array (Int16Array, Int32Array, or Int64Array). + /// This array contains the cumulative end indices for each run. + /// </summary> + public IArrowArray RunEnds { get; } + + /// <summary> + /// Gets the values array. + /// This array contains the actual values that are run-length encoded. + /// </summary> + public IArrowArray Values { get; } + + /// <summary> + /// Creates a new RunEndEncodedArray from ArrayData. + /// </summary> + /// <param name="data">The array data containing run ends and values as children.</param> + public RunEndEncodedArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1])) + { + } + + /// <summary> + /// Creates a new RunEndEncodedArray with specified run ends and values arrays. + /// </summary> + /// <param name="runEnds">The run ends array (must be Int16Array, Int32Array, or Int64Array).</param> + /// <param name="values">The values array (can be any type).</param> + public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values) + : this(CreateArrayData(runEnds, values), runEnds, values) + { + } + + private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values) + : base(data) + { + data.EnsureBufferCount(0); // REE arrays have no buffers, only children + data.EnsureDataType(ArrowTypeId.RunEndEncoded); + + ValidateRunEndsType(runEnds); + RunEnds = runEnds; + Values = values; + } + + private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values) + { + ValidateRunEndsType(runEnds); + + // The logical length of a REE array is determined by the last value in run_ends + int logicalLength = GetLogicalLength(runEnds); + + var dataType = new RunEndEncodedType(runEnds.Data.DataType, values.Data.DataType); + + return new ArrayData( + dataType, + logicalLength, + nullCount: 0, // REE arrays don't have a validity bitmap + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + } + + private static void ValidateRunEndsType(IArrowArray runEnds) + { + ArrowTypeId typeId = runEnds.Data.DataType.TypeId; + if (typeId != ArrowTypeId.Int16 && + typeId != ArrowTypeId.Int32 && + typeId != ArrowTypeId.Int64) + { + throw new ArgumentException( + $"Run ends array must be Int16, Int32, or Int64, but got {typeId}", + nameof(runEnds)); + } + } + + private static int GetLogicalLength(IArrowArray runEnds) + { + if (runEnds.Length == 0) + { + return 0; + } + + // Get the last run end value which represents the logical length + switch (runEnds) + { + case Int16Array int16Array: + return int16Array.GetValue(int16Array.Length - 1) ?? throw new ArgumentException("invalid length"); + case Int32Array int32Array: + return int32Array.GetValue(int32Array.Length - 1) ?? throw new ArgumentException("invalid length"); + case Int64Array int64Array: + { + long? lastValue = int64Array.GetValue(int64Array.Length - 1); + if (lastValue.HasValue && lastValue.Value > int.MaxValue) + { + throw new ArgumentException("Run ends value exceeds maximum supported length."); + } + return (int)(lastValue ?? throw new ArgumentException("invalid length")); + } + default: + throw new InvalidOperationException($"Unexpected run ends array type: {runEnds.Data.DataType.TypeId}"); + } Review Comment: GetLogicalLength can return a negative logical length if the last run_end value is negative (Int16/Int32/Int64). ArrayData doesn't validate length >= 0, so this can create invalid arrays and downstream failures. Validate that the final run_end is >= 0 (and ideally > 0 when runEnds.Length > 0) and throw a clear ArgumentException when invalid. ########## src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs: ########## @@ -286,6 +287,201 @@ public void Visit(UnionType type) public void Visit(MapType type) => ConcatenateLists(type.UnsortedKey()); /* Can't tell if the output is still sorted */ + public void Visit(RunEndEncodedType type) + { + ArrowTypeId runEndsTypeId = type.RunEndsDataType.TypeId; + if (runEndsTypeId != ArrowTypeId.Int16 && + runEndsTypeId != ArrowTypeId.Int32 && + runEndsTypeId != ArrowTypeId.Int64) + { + throw new InvalidOperationException( + $"Run-ends array must be Int16, Int32, or Int64, but got {runEndsTypeId}"); + } + + var slicedValues = new List<ArrayData>(_arrayDataList.Count); + ArrowBuffer.Builder<short> int16Builder = null; + ArrowBuffer.Builder<int> int32Builder = null; + ArrowBuffer.Builder<long> int64Builder = null; + + switch (runEndsTypeId) + { + case ArrowTypeId.Int16: int16Builder = new ArrowBuffer.Builder<short>(); break; + case ArrowTypeId.Int32: int32Builder = new ArrowBuffer.Builder<int>(); break; + case ArrowTypeId.Int64: int64Builder = new ArrowBuffer.Builder<long>(); break; + } + + long baseOffset = 0; + int physicalRunCount = 0; + + foreach (ArrayData arrayData in _arrayDataList) + { + arrayData.EnsureDataType(type.TypeId); + + ArrayData runEndsData = arrayData.Children[0]; + ArrayData valuesData = arrayData.Children[1]; + + if (runEndsData.DataType.TypeId != runEndsTypeId) + { + throw new ArgumentException( + $"All run-end encoded arrays must have the same run-ends type. Expected <{runEndsTypeId}> but got <{runEndsData.DataType.TypeId}>."); + } + if (valuesData.DataType.TypeId != type.ValuesDataType.TypeId) + { + throw new ArgumentException( + $"All run-end encoded arrays must have the same values type. Expected <{type.ValuesDataType.TypeId}> but got <{valuesData.DataType.TypeId}>."); + } + + if (arrayData.Length == 0) + { + continue; + } + + int logicalOffset = arrayData.Offset; + int logicalLength = arrayData.Length; + int logicalEnd = logicalOffset + logicalLength; + + int physicalStart; + int physicalEndExclusive; + + switch (runEndsTypeId) + { + case ArrowTypeId.Int16: + { + ReadOnlySpan<short> re = runEndsData.Buffers[1].Span.CastTo<short>() + .Slice(runEndsData.Offset, runEndsData.Length); + physicalStart = FindPhysicalStartInt16(re, logicalOffset); + physicalEndExclusive = FindPhysicalEndExclusiveInt16(re, logicalEnd, physicalStart); + for (int p = physicalStart; p < physicalEndExclusive; p++) + { + int adjustedEnd = Math.Min((int)re[p], logicalEnd) - logicalOffset; + int16Builder.Append(checked((short)(baseOffset + adjustedEnd))); + } Review Comment: For Int16 run_ends, concatenation appends checked((short)(baseOffset + adjustedEnd)). If the concatenated logical length exceeds Int16.MaxValue, this will throw OverflowException mid-build. Consider proactively validating that the final logical length fits in the chosen run_ends type and throw a targeted ArgumentException with a clearer message. ########## src/Apache.Arrow/Arrays/RunEndEncodedArray.cs: ########## @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow.Types; + +namespace Apache.Arrow; + +/// <summary> +/// Represents a run-end encoded array. +/// A run-end encoded array stores consecutive runs of the same value more efficiently. +/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type). +/// The run_ends array stores the cumulative end positions of each run. +/// </summary> +public class RunEndEncodedArray : Array +{ + /// <summary> + /// Gets the run ends array (Int16Array, Int32Array, or Int64Array). + /// This array contains the cumulative end indices for each run. + /// </summary> + public IArrowArray RunEnds { get; } + + /// <summary> + /// Gets the values array. + /// This array contains the actual values that are run-length encoded. + /// </summary> + public IArrowArray Values { get; } + + /// <summary> + /// Creates a new RunEndEncodedArray from ArrayData. + /// </summary> + /// <param name="data">The array data containing run ends and values as children.</param> + public RunEndEncodedArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1])) + { + } + + /// <summary> + /// Creates a new RunEndEncodedArray with specified run ends and values arrays. + /// </summary> + /// <param name="runEnds">The run ends array (must be Int16Array, Int32Array, or Int64Array).</param> + /// <param name="values">The values array (can be any type).</param> + public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values) + : this(CreateArrayData(runEnds, values), runEnds, values) + { + } + + private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values) + : base(data) + { + data.EnsureBufferCount(0); // REE arrays have no buffers, only children + data.EnsureDataType(ArrowTypeId.RunEndEncoded); + + ValidateRunEndsType(runEnds); + RunEnds = runEnds; + Values = values; + } + + private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values) + { + ValidateRunEndsType(runEnds); Review Comment: RunEndEncodedArray doesn't validate that runEnds.Length == values.Length (one value per run). Creating an instance with mismatched lengths yields an invalid REE array and can lead to out-of-range access in consumers. Add a check in the constructor(s) (and potentially also validate runEnds.NullCount == 0). ```suggestion ValidateRunEndsType(runEnds); ValidateRunEndEncodedChildren(runEnds, values); RunEnds = runEnds; Values = values; } private static void ValidateRunEndEncodedChildren(IArrowArray runEnds, IArrowArray values) { if (runEnds.Length != values.Length) { throw new ArgumentException("Run-end encoded arrays require runEnds.Length to equal values.Length.", nameof(values)); } if (runEnds.NullCount != 0) { throw new ArgumentException("Run-end encoded arrays require runEnds to have no nulls.", nameof(runEnds)); } } private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values) { ValidateRunEndsType(runEnds); ValidateRunEndEncodedChildren(runEnds, values); ``` ########## src/Apache.Arrow/Arrays/RunEndEncodedArray.cs: ########## @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow.Types; + +namespace Apache.Arrow; + +/// <summary> +/// Represents a run-end encoded array. +/// A run-end encoded array stores consecutive runs of the same value more efficiently. +/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type). +/// The run_ends array stores the cumulative end positions of each run. +/// </summary> +public class RunEndEncodedArray : Array +{ + /// <summary> + /// Gets the run ends array (Int16Array, Int32Array, or Int64Array). + /// This array contains the cumulative end indices for each run. + /// </summary> + public IArrowArray RunEnds { get; } + + /// <summary> + /// Gets the values array. + /// This array contains the actual values that are run-length encoded. + /// </summary> + public IArrowArray Values { get; } + + /// <summary> + /// Creates a new RunEndEncodedArray from ArrayData. + /// </summary> + /// <param name="data">The array data containing run ends and values as children.</param> + public RunEndEncodedArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1])) + { + } + + /// <summary> + /// Creates a new RunEndEncodedArray with specified run ends and values arrays. + /// </summary> + /// <param name="runEnds">The run ends array (must be Int16Array, Int32Array, or Int64Array).</param> + /// <param name="values">The values array (can be any type).</param> + public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values) + : this(CreateArrayData(runEnds, values), runEnds, values) + { + } + + private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values) + : base(data) + { + data.EnsureBufferCount(0); // REE arrays have no buffers, only children + data.EnsureDataType(ArrowTypeId.RunEndEncoded); + + ValidateRunEndsType(runEnds); + RunEnds = runEnds; + Values = values; + } + + private static ArrayData CreateArrayData(IArrowArray runEnds, IArrowArray values) + { + ValidateRunEndsType(runEnds); + + // The logical length of a REE array is determined by the last value in run_ends + int logicalLength = GetLogicalLength(runEnds); + + var dataType = new RunEndEncodedType(runEnds.Data.DataType, values.Data.DataType); + + return new ArrayData( + dataType, + logicalLength, + nullCount: 0, // REE arrays don't have a validity bitmap + offset: 0, + buffers: [], + children: [runEnds.Data, values.Data]); + } + + private static void ValidateRunEndsType(IArrowArray runEnds) + { + ArrowTypeId typeId = runEnds.Data.DataType.TypeId; + if (typeId != ArrowTypeId.Int16 && + typeId != ArrowTypeId.Int32 && + typeId != ArrowTypeId.Int64) + { + throw new ArgumentException( + $"Run ends array must be Int16, Int32, or Int64, but got {typeId}", + nameof(runEnds)); + } + } + + private static int GetLogicalLength(IArrowArray runEnds) + { + if (runEnds.Length == 0) + { + return 0; + } + + // Get the last run end value which represents the logical length + switch (runEnds) + { + case Int16Array int16Array: + return int16Array.GetValue(int16Array.Length - 1) ?? throw new ArgumentException("invalid length"); + case Int32Array int32Array: + return int32Array.GetValue(int32Array.Length - 1) ?? throw new ArgumentException("invalid length"); + case Int64Array int64Array: + { + long? lastValue = int64Array.GetValue(int64Array.Length - 1); + if (lastValue.HasValue && lastValue.Value > int.MaxValue) + { + throw new ArgumentException("Run ends value exceeds maximum supported length."); + } + return (int)(lastValue ?? throw new ArgumentException("invalid length")); + } + default: + throw new InvalidOperationException($"Unexpected run ends array type: {runEnds.Data.DataType.TypeId}"); + } + } + + /// <summary> + /// Finds the physical index in the run_ends array that contains the specified logical index. + /// </summary> + /// <param name="logicalIndex">The logical index in the decoded array.</param> + /// <returns>The physical index in the run_ends/values arrays.</returns> + public int FindPhysicalIndex(int logicalIndex) + { + if (logicalIndex < 0 || logicalIndex >= Length) + { + throw new ArgumentOutOfRangeException(nameof(logicalIndex)); + } + + // Binary search to find the run that contains this logical index + return RunEnds switch + { + Int16Array int16Array => BinarySearchRunEnds(int16Array, logicalIndex), + Int32Array int32Array => BinarySearchRunEnds(int32Array, logicalIndex), + Int64Array int64Array => BinarySearchRunEnds(int64Array, logicalIndex), + _ => throw new InvalidOperationException($"Unexpected run ends array type: {RunEnds.GetType()}"), + }; + } + + private static int BinarySearchRunEnds(Int16Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; + + while (left < right) + { + int mid = left + (right - left) / 2; + int runEnd = runEnds.GetValue(mid) ?? throw new ArgumentException("invalid length"); ; + + if (logicalIndex < runEnd) + { + right = mid; + } + else + { + left = mid + 1; + } + } + + return left; + } + + private static int BinarySearchRunEnds(Int32Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; + + while (left < right) + { + int mid = left + (right - left) / 2; + int runEnd = runEnds.GetValue(mid) ?? 0; + + if (logicalIndex < runEnd) + { + right = mid; + } + else + { + left = mid + 1; + } + } + + return left; + } + + private static int BinarySearchRunEnds(Int64Array runEnds, int logicalIndex) + { + int left = 0; + int right = runEnds.Length - 1; + + while (left < right) + { + int mid = left + (right - left) / 2; + long runEnd = runEnds.GetValue(mid) ?? 0; + Review Comment: BinarySearchRunEnds for Int32/Int64 treats null run_end values as 0 ("?? 0"), which can silently return incorrect physical indices and mask invalid data. This should be consistent with the Int16 path (throw if a run_end is null) since run_ends must be non-null. ########## src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs: ########## @@ -286,6 +287,201 @@ public void Visit(UnionType type) public void Visit(MapType type) => ConcatenateLists(type.UnsortedKey()); /* Can't tell if the output is still sorted */ + public void Visit(RunEndEncodedType type) + { + ArrowTypeId runEndsTypeId = type.RunEndsDataType.TypeId; + if (runEndsTypeId != ArrowTypeId.Int16 && + runEndsTypeId != ArrowTypeId.Int32 && + runEndsTypeId != ArrowTypeId.Int64) + { + throw new InvalidOperationException( + $"Run-ends array must be Int16, Int32, or Int64, but got {runEndsTypeId}"); + } + + var slicedValues = new List<ArrayData>(_arrayDataList.Count); + ArrowBuffer.Builder<short> int16Builder = null; + ArrowBuffer.Builder<int> int32Builder = null; + ArrowBuffer.Builder<long> int64Builder = null; + + switch (runEndsTypeId) + { + case ArrowTypeId.Int16: int16Builder = new ArrowBuffer.Builder<short>(); break; + case ArrowTypeId.Int32: int32Builder = new ArrowBuffer.Builder<int>(); break; + case ArrowTypeId.Int64: int64Builder = new ArrowBuffer.Builder<long>(); break; + } + + long baseOffset = 0; + int physicalRunCount = 0; + + foreach (ArrayData arrayData in _arrayDataList) + { + arrayData.EnsureDataType(type.TypeId); + + ArrayData runEndsData = arrayData.Children[0]; + ArrayData valuesData = arrayData.Children[1]; + + if (runEndsData.DataType.TypeId != runEndsTypeId) + { + throw new ArgumentException( + $"All run-end encoded arrays must have the same run-ends type. Expected <{runEndsTypeId}> but got <{runEndsData.DataType.TypeId}>."); + } + if (valuesData.DataType.TypeId != type.ValuesDataType.TypeId) + { + throw new ArgumentException( + $"All run-end encoded arrays must have the same values type. Expected <{type.ValuesDataType.TypeId}> but got <{valuesData.DataType.TypeId}>."); + } + + if (arrayData.Length == 0) + { + continue; + } + + int logicalOffset = arrayData.Offset; + int logicalLength = arrayData.Length; + int logicalEnd = logicalOffset + logicalLength; + + int physicalStart; + int physicalEndExclusive; + + switch (runEndsTypeId) + { + case ArrowTypeId.Int16: + { + ReadOnlySpan<short> re = runEndsData.Buffers[1].Span.CastTo<short>() + .Slice(runEndsData.Offset, runEndsData.Length); + physicalStart = FindPhysicalStartInt16(re, logicalOffset); + physicalEndExclusive = FindPhysicalEndExclusiveInt16(re, logicalEnd, physicalStart); + for (int p = physicalStart; p < physicalEndExclusive; p++) + { + int adjustedEnd = Math.Min((int)re[p], logicalEnd) - logicalOffset; + int16Builder.Append(checked((short)(baseOffset + adjustedEnd))); + } + break; + } + case ArrowTypeId.Int32: + { + ReadOnlySpan<int> re = runEndsData.Buffers[1].Span.CastTo<int>() + .Slice(runEndsData.Offset, runEndsData.Length); + physicalStart = FindPhysicalStartInt32(re, logicalOffset); + physicalEndExclusive = FindPhysicalEndExclusiveInt32(re, logicalEnd, physicalStart); + for (int p = physicalStart; p < physicalEndExclusive; p++) + { + int adjustedEnd = Math.Min(re[p], logicalEnd) - logicalOffset; + int32Builder.Append(checked((int)(baseOffset + adjustedEnd))); + } + break; + } + default: // Int64 + { + ReadOnlySpan<long> re = runEndsData.Buffers[1].Span.CastTo<long>() + .Slice(runEndsData.Offset, runEndsData.Length); + physicalStart = FindPhysicalStartInt64(re, logicalOffset); + physicalEndExclusive = FindPhysicalEndExclusiveInt64(re, logicalEnd, physicalStart); + for (int p = physicalStart; p < physicalEndExclusive; p++) + { + long adjustedEnd = Math.Min(re[p], (long)logicalEnd) - logicalOffset; + int64Builder.Append(baseOffset + adjustedEnd); + } + break; + } + } + + int physicalCount = physicalEndExclusive - physicalStart; + physicalRunCount += physicalCount; + slicedValues.Add(valuesData.Slice(physicalStart, physicalCount)); + baseOffset += logicalLength; + } + + ArrowBuffer runEndsValueBuffer = runEndsTypeId switch + { + ArrowTypeId.Int16 => int16Builder.Build(_allocator), + ArrowTypeId.Int32 => int32Builder.Build(_allocator), + _ => int64Builder.Build(_allocator), + }; + + ArrayData runEndsResult = new ArrayData( + type.RunEndsDataType, physicalRunCount, 0, 0, + new[] { ArrowBuffer.Empty, runEndsValueBuffer }); + + ArrayData valuesResult = slicedValues.Count == 0 + ? new ArrayData(type.ValuesDataType, 0, 0, 0, + new[] { ArrowBuffer.Empty }) + : Concatenate(slicedValues, _allocator); + Review Comment: When concatenating REE arrays, valuesResult for the all-empty case is created with a single empty buffer. This is invalid for many values types (e.g., primitive arrays require 2 buffers, binary requires 3, etc.) and will fail when building the values child array. Construct an empty ArrayData for type.ValuesDataType with the correct buffer/child layout (or reuse existing concatenation logic to produce an empty of the right shape). ```suggestion ArrayData valuesResult; if (slicedValues.Count == 0) { if (_arrayDataList.Count == 0) { throw new InvalidOperationException( "Cannot construct REE values array data without any input arrays."); } // Reuse an existing empty values child so the buffer/child layout // matches type.ValuesDataType for all Arrow value types. valuesResult = _arrayDataList[0].Children[1]; } else { valuesResult = Concatenate(slicedValues, _allocator); } ``` ########## src/Apache.Arrow/Types/RunEndEncodedType.cs: ########## @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; + +namespace Apache.Arrow.Types; + +/// <summary> +/// Represents a run-end encoded array type. +/// Contains two child arrays: run_ends and values. +/// The run_ends child array must be a 16/32/64-bit signed integer array +/// which encodes the indices at which the run with the value in +/// each corresponding index in the values child array ends. +/// </summary> +public sealed class RunEndEncodedType : NestedType +{ + public override ArrowTypeId TypeId => ArrowTypeId.RunEndEncoded; + public override string Name => "run_end_encoded"; + + /// <summary> + /// Gets the run ends field (must be Int16, Int32, or Int64). + /// </summary> + public Field RunEndsField => Fields[0]; + + /// <summary> + /// Gets the values field (can be any type). + /// </summary> + public Field ValuesField => Fields[1]; + + /// <summary> + /// Gets the data type of the run ends array. + /// </summary> + public IArrowType RunEndsDataType => RunEndsField.DataType; + + /// <summary> + /// Gets the data type of the values array. + /// </summary> + public IArrowType ValuesDataType => ValuesField.DataType; + + /// <summary> + /// Creates a new RunEndEncodedType with the specified run ends and values fields. + /// </summary> + /// <param name="runEndsField">The run ends field (must be Int16, Int32, or Int64).</param> + /// <param name="valuesField">The values field (can be any type).</param> + public RunEndEncodedType(Field runEndsField, Field valuesField) + : base([runEndsField, valuesField]) + { + ValidateRunEndsType(runEndsField.DataType); + } Review Comment: ValidateRunEndsType throws an ArgumentException with paramName "runEndsDataType", but in the Field-based constructor the invalid input originates from runEndsField.DataType. Consider passing a paramName that matches the public constructor parameter (e.g., nameof(runEndsField)) to make exception reporting clearer for callers. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
