Copilot commented on code in PR #308: URL: https://github.com/apache/arrow-dotnet/pull/308#discussion_r3045654722
########## test/Apache.Arrow.Tests/RunEndEncodedArrayTests.cs: ########## @@ -0,0 +1,366 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.IO; +using Apache.Arrow.Ipc; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Tests; + +public class RunEndEncodedArrayTests +{ + [Fact] + public void TestRunEndEncodedTypeCreation() + { + // Test with explicit fields + var runEndsField = new Field("run_ends", Int32Type.Default, nullable: false); + var valuesField = new Field("values", StringType.Default, nullable: true); + var reeType = new RunEndEncodedType(runEndsField, valuesField); + + Assert.Equal(ArrowTypeId.RunEndEncoded, reeType.TypeId); + Assert.Equal("run_end_encoded", reeType.Name); + Assert.Equal(runEndsField, reeType.RunEndsField); + Assert.Equal(valuesField, reeType.ValuesField); + Assert.Equal(Int32Type.Default.TypeId, reeType.RunEndsDataType.TypeId); + Assert.Equal(StringType.Default.TypeId, reeType.ValuesDataType.TypeId); + } + + [Fact] + public void TestRunEndEncodedTypeCreationWithDataTypes() + { + // Test with data types (uses default field names) + var reeType = new RunEndEncodedType(Int32Type.Default, StringType.Default); + + Assert.Equal(ArrowTypeId.RunEndEncoded, reeType.TypeId); + Assert.Equal("run_ends", reeType.RunEndsField.Name); + Assert.Equal("values", reeType.ValuesField.Name); + } + + [Fact] + public void TestRunEndEncodedTypeValidation() + { + // Invalid run ends type (must be Int16, Int32, or Int64) + Assert.Throws<ArgumentException>(() => new RunEndEncodedType(Int8Type.Default, StringType.Default)); + Assert.Throws<ArgumentException>(() => new RunEndEncodedType(FloatType.Default, StringType.Default)); + Assert.Throws<ArgumentException>(() => new RunEndEncodedType(StringType.Default, StringType.Default)); + + // Valid run ends types + Assert.NotNull(new RunEndEncodedType(Int16Type.Default, StringType.Default)); // Should not throw + Assert.NotNull(new RunEndEncodedType(Int32Type.Default, StringType.Default)); // Should not throw + Assert.NotNull(new RunEndEncodedType(Int64Type.Default, StringType.Default)); // Should not throw + } + + [Fact] + public void TestRunEndEncodedArrayWithInt32RunEnds() + { + // Create run ends: [3, 7, 10, 15] + // This represents: 3 'A's, 4 'B's, 3 'C's, 5 'D's + var runEndsBuilder = new Int32Array.Builder(); + runEndsBuilder.AppendRange([3, 7, 10, 15]); + Int32Array runEnds = runEndsBuilder.Build(); + + // Create values: ['A', 'B', 'C', 'D'] + var valuesBuilder = new StringArray.Builder(); + valuesBuilder.AppendRange(["A", "B", "C", "D"]); + StringArray values = valuesBuilder.Build(); + + // Create REE array + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(15, reeArray.Length); // Logical length is the last run end value + Assert.Equal(0, reeArray.NullCount); // REE arrays don't have nulls at the top level + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } + + [Fact] + public void TestRunEndEncodedArrayWithInt16RunEnds() + { + var runEndsBuilder = new Int16Array.Builder(); + runEndsBuilder.AppendRange([2, 5, 8]); + Int16Array runEnds = runEndsBuilder.Build(); + + var valuesBuilder = new Int32Array.Builder(); + valuesBuilder.AppendRange([100, 200, 300]); + Int32Array values = valuesBuilder.Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(8, reeArray.Length); + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } + + [Fact] + public void TestRunEndEncodedArrayWithInt64RunEnds() + { + var runEndsBuilder = new Int64Array.Builder(); + runEndsBuilder.AppendRange([1000, 2000, 3000]); + Int64Array runEnds = runEndsBuilder.Build(); + + var valuesBuilder = new DoubleArray.Builder(); + valuesBuilder.AppendRange([1.5, 2.5, 3.5]); + DoubleArray values = valuesBuilder.Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(3000, reeArray.Length); + Assert.Equal(runEnds, reeArray.RunEnds); + Assert.Equal(values, reeArray.Values); + } + + [Fact] + public void TestRunEndEncodedArrayInvalidRunEndsType() + { + Int8Array invalidRunEnds = new Int8Array.Builder().AppendRange([1, 2, 3]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B", "C"]).Build(); + + Assert.Throws<ArgumentException>(() => new RunEndEncodedArray(invalidRunEnds, values)); + } + + [Fact] + public void TestRunEndEncodedArrayEmpty() + { + Int32Array runEnds = new Int32Array.Builder().Build(); + StringArray values = new StringArray.Builder().Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(0, reeArray.Length); + } + + [Fact] + public void TestFindPhysicalIndexInt32() + { + // Run ends: [3, 7, 10, 15] means: + // Logical indices 0-2 map to physical index 0 (value 'A') + // Logical indices 3-6 map to physical index 1 (value 'B') + // Logical indices 7-9 map to physical index 2 (value 'C') + // Logical indices 10-14 map to physical index 3 (value 'D') + Int32Array runEnds = new Int32Array.Builder() + .AppendRange([3, 7, 10, 15]) + .Build(); + StringArray values = new StringArray.Builder() + .AppendRange(["A", "B", "C", "D"]) + .Build(); + + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Equal(0, reeArray.FindPhysicalIndex(0)); + Assert.Equal(0, reeArray.FindPhysicalIndex(1)); + Assert.Equal(0, reeArray.FindPhysicalIndex(2)); + Assert.Equal(1, reeArray.FindPhysicalIndex(3)); + Assert.Equal(1, reeArray.FindPhysicalIndex(4)); + Assert.Equal(1, reeArray.FindPhysicalIndex(5)); + Assert.Equal(1, reeArray.FindPhysicalIndex(6)); + Assert.Equal(2, reeArray.FindPhysicalIndex(7)); + Assert.Equal(2, reeArray.FindPhysicalIndex(8)); + Assert.Equal(2, reeArray.FindPhysicalIndex(9)); + Assert.Equal(3, reeArray.FindPhysicalIndex(10)); + Assert.Equal(3, reeArray.FindPhysicalIndex(11)); + Assert.Equal(3, reeArray.FindPhysicalIndex(14)); + } + + [Fact] + public void TestRunEndEncodedLengthMismatchThrows() + { + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B", "C"]).Build(); + + var ex = Assert.Throws<ArgumentException>(() => new RunEndEncodedArray(runEnds, values)); + Assert.Contains("length", ex.Message); + } + + [Fact] + public void TestRunEndEncodedNullRunEndsThrows() + { + Int32Array runEnds = new Int32Array.Builder().Append(3).AppendNull().Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B"]).Build(); + + var ex = Assert.Throws<ArgumentException>(() => new RunEndEncodedArray(runEnds, values)); + Assert.Contains("null", ex.Message); + } + + [Fact] + public void TestFindPhysicalIndexOnSlicedArray() + { + // Run ends: [3, 7, 10] → run 0 (A): positions 0..2, run 1 (B): 3..6, run 2 (C): 7..9 + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7, 10]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B", "C"]).Build(); + var reeArray = new RunEndEncodedArray(runEnds, values); + + // Slice covering underlying positions 2..7 (length 6). + // Slice positions: 0→A(2), 1→B(3), 2→B(4), 3→B(5), 4→B(6), 5→C(7) + var sliced = (RunEndEncodedArray)ArrowArrayFactory.Slice(reeArray, 2, 6); + + Assert.Equal(6, sliced.Length); + Assert.Equal(0, sliced.FindPhysicalIndex(0)); + Assert.Equal(1, sliced.FindPhysicalIndex(1)); + Assert.Equal(1, sliced.FindPhysicalIndex(2)); + Assert.Equal(1, sliced.FindPhysicalIndex(3)); + Assert.Equal(1, sliced.FindPhysicalIndex(4)); + Assert.Equal(2, sliced.FindPhysicalIndex(5)); + + // Out-of-range against the slice length, even though they would be valid against + // the underlying physical array. + Assert.Throws<ArgumentOutOfRangeException>(() => sliced.FindPhysicalIndex(6)); + } + + [Fact] + public void TestFindPhysicalIndexOutOfRange() + { + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["A", "B"]).Build(); + var reeArray = new RunEndEncodedArray(runEnds, values); + + Assert.Throws<ArgumentOutOfRangeException>(() => reeArray.FindPhysicalIndex(-1)); + Assert.Throws<ArgumentOutOfRangeException>(() => reeArray.FindPhysicalIndex(7)); + Assert.Throws<ArgumentOutOfRangeException>(() => reeArray.FindPhysicalIndex(100)); + } + + [Fact] + public void TestRunEndEncodedArraySerialization() + { + // Create a REE array + Int32Array runEnds = new Int32Array.Builder().AppendRange([3, 7, 10]).Build(); + StringArray values = new StringArray.Builder().AppendRange(["foo", "bar", "baz"]).Build(); + var reeArray = new RunEndEncodedArray(runEnds, values); + + // Create a record batch with the REE array + var reeField = new Field("ree_column", reeArray.Data.DataType, nullable: false); + var schema = new Schema([reeField], null); + var recordBatch = new RecordBatch(schema, [reeArray], reeArray.Length); + Review Comment: The serialization test only covers a non-sliced RunEndEncodedArray (Offset=0). Given REE requires special handling to serialize slices correctly (since IPC always encodes arrays with offset=0), it would be good to add a round-trip test for a sliced REE array (e.g., ArrowArrayFactory.Slice on a larger REE) to prevent regressions in slice normalization during write/read. ########## src/Apache.Arrow/Ipc/ArrowReaderImplementation.cs: ########## @@ -276,6 +276,9 @@ private ArrayData LoadField( { case ArrowTypeId.Null: return new ArrayData(field.DataType, fieldLength, fieldNullCount, 0, System.Array.Empty<ArrowBuffer>()); + case ArrowTypeId.RunEndEncoded: Review Comment: When reading RunEndEncoded arrays, the IPC field node’s nullCount should always be 0 (REE has no top-level validity bitmap). If a message provides a non-zero nullCount, the resulting ArrayData will have NullCount != 0 but BufferCount==0, which can later trigger IndexOutOfRangeException via Array.IsValid/NullBitmapBuffer access. Consider validating fieldNullCount==0 here and throwing InvalidDataException for non-canonical/invalid input. ```suggestion case ArrowTypeId.RunEndEncoded: if (fieldNullCount != 0) { throw new InvalidDataException("RunEndEncoded array top-level null count must be 0"); } ``` ########## src/Apache.Arrow/Ipc/ArrowStreamWriter.cs: ########## @@ -358,6 +359,15 @@ public void Visit(DictionaryArray array) array.Indices.Accept(this); } + public void Visit(RunEndEncodedArray array) + { + // REE arrays have no buffers at the top level, only child arrays + // Visit the run_ends array + VisitArray(array.RunEnds); + // Visit the values array + VisitArray(array.Values); + } Review Comment: Arrow IPC serialization doesn’t account for sliced RunEndEncodedArray instances. When array.Offset != 0 (or when serializing a logical slice), this currently writes the full RunEnds/Values children without normalizing run_ends to the slice range, but the stream format always encodes arrays with offset=0. This will round-trip to incorrect data (logical index 0 in the deserialized array won’t correspond to the slice’s start). Consider serializing a normalized/sliced REE representation: compute the physical run range for [array.Offset, array.Offset+array.Length) and emit adjusted run_ends (ending at array.Length) plus the corresponding values slice. ########## src/Apache.Arrow/Arrays/RunEndEncodedArray.cs: ########## @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow.Types; + +namespace Apache.Arrow; + +/// <summary> +/// Represents a run-end encoded array. +/// A run-end encoded array stores consecutive runs of the same value more efficiently. +/// It contains two child arrays: run_ends (Int16/Int32/Int64) and values (any type). +/// The run_ends array stores the cumulative end positions of each run. +/// </summary> +public class RunEndEncodedArray : Array +{ + /// <summary> + /// Gets the run ends array (Int16Array, Int32Array, or Int64Array). + /// This array contains the cumulative end indices for each run. + /// </summary> + public IArrowArray RunEnds { get; } + + /// <summary> + /// Gets the values array. + /// This array contains the actual values that are run-length encoded. + /// </summary> + public IArrowArray Values { get; } + + /// <summary> + /// Creates a new RunEndEncodedArray from ArrayData. + /// </summary> + /// <param name="data">The array data containing run ends and values as children.</param> + public RunEndEncodedArray(ArrayData data) + : this(data, ArrowArrayFactory.BuildArray(data.Children[0]), ArrowArrayFactory.BuildArray(data.Children[1])) + { + } + + /// <summary> + /// Creates a new RunEndEncodedArray with specified run ends and values arrays. + /// </summary> + /// <param name="runEnds">The run ends array (must be Int16Array, Int32Array, or Int64Array).</param> + /// <param name="values">The values array (can be any type).</param> + public RunEndEncodedArray(IArrowArray runEnds, IArrowArray values) + : this(CreateArrayData(runEnds, values), runEnds, values) + { + } + + private RunEndEncodedArray(ArrayData data, IArrowArray runEnds, IArrowArray values) + : base(data) + { + data.EnsureBufferCount(0); // REE arrays have no buffers, only children + data.EnsureDataType(ArrowTypeId.RunEndEncoded); + Review Comment: RunEndEncodedArray relies on NullCount==0 to avoid Array.IsValid touching NullBitmapBuffer (since buffer count is 0). If a RunEndEncodedArray is constructed from ArrayData with non-zero nullCount (e.g., malformed IPC/C data), IsValid/IsNull can throw IndexOutOfRangeException when it tries to access Data.Buffers[0]. Consider validating data.NullCount==0 in this constructor (and throwing ArgumentException/InvalidDataException) to fail fast with a clearer message. ```suggestion if (data.NullCount != 0) { throw new ArgumentException( $"Run-end encoded arrays must not have parent nulls, but had {data.NullCount} null(s).", nameof(data)); } ``` ########## src/Apache.Arrow/C/CArrowArrayImporter.cs: ########## @@ -206,6 +206,11 @@ private ArrayData GetAsArrayData(CArrowArray* cArray, IArrowType type) children = ProcessListChildren(cArray, mapType.Fields[0].DataType); buffers = ImportListBuffers(cArray); break; + case ArrowTypeId.RunEndEncoded: + RunEndEncodedType reeType = (RunEndEncodedType)storageType; + children = ProcessStructChildren(cArray, reeType.Fields); Review Comment: For REE import, buffers are set to empty (correct for canonical REE), but ArrayData created later will still carry cArray->null_count. If a C producer sets null_count > 0, the resulting RunEndEncodedArray can later throw when IsValid accesses Data.Buffers[0]. Consider validating cArray->null_count==0 for ArrowTypeId.RunEndEncoded (or otherwise rejecting/normalizing) to avoid constructing an invalid in-memory array shape. ```suggestion children = ProcessStructChildren(cArray, reeType.Fields); if (cArray->null_count != 0) { throw new InvalidOperationException("Imported RunEndEncoded arrays must have null_count == 0."); } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
