Copilot commented on code in PR #324: URL: https://github.com/apache/arrow-dotnet/pull/324#discussion_r3121123114
########## src/Apache.Arrow/Arrays/TimestampWithOffsetArray.cs: ########## @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections; +using System.Collections.Generic; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + /// <summary> + /// Extension definition for the "arrow.timestamp_with_offset" canonical extension type. + /// Storage is a struct with fields "timestamp" (Timestamp(unit, "UTC")) and + /// "offset_minutes" (Int16). The offset_minutes field may be dictionary-encoded + /// or run-end encoded. + /// </summary> + public class TimestampWithOffsetExtensionDefinition : ExtensionDefinition + { + public static readonly TimestampWithOffsetExtensionDefinition Instance = new TimestampWithOffsetExtensionDefinition(); + + public override string ExtensionName => "arrow.timestamp_with_offset"; + + private TimestampWithOffsetExtensionDefinition() { } + + public override bool TryCreateType(IArrowType storageType, string metadata, out ExtensionType type) + { + type = null; + + if (!(storageType is StructType structType) || structType.Fields.Count != 2) + return false; + + // Validate field order and names per spec + Field tsField = structType.Fields[0]; + Field offsetField = structType.Fields[1]; + + if (tsField.Name != "timestamp" || offsetField.Name != "offset_minutes") + return false; + + if (!(tsField.DataType is TimestampType tsType) || tsType.Timezone != "UTC") + return false; + + // offset_minutes must logically be Int16, but may be dict/REE encoded + if (!IsLogicallyInt16(offsetField.DataType)) + return false; Review Comment: TryCreateType validates field names and types, but doesn't enforce the spec's non-nullable fields (both 'timestamp' and 'offset_minutes' are declared nullable: false in the default storage type). Without checking Field.IsNullable, this definition may accept incompatible storage schemas that TimestampWithOffsetArray doesn't handle well. Consider rejecting storage types where either field is nullable. ########## src/Apache.Arrow/Arrays/TimestampWithOffsetArray.cs: ########## @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections; +using System.Collections.Generic; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + /// <summary> + /// Extension definition for the "arrow.timestamp_with_offset" canonical extension type. + /// Storage is a struct with fields "timestamp" (Timestamp(unit, "UTC")) and + /// "offset_minutes" (Int16). The offset_minutes field may be dictionary-encoded + /// or run-end encoded. + /// </summary> + public class TimestampWithOffsetExtensionDefinition : ExtensionDefinition + { + public static readonly TimestampWithOffsetExtensionDefinition Instance = new TimestampWithOffsetExtensionDefinition(); + + public override string ExtensionName => "arrow.timestamp_with_offset"; + + private TimestampWithOffsetExtensionDefinition() { } + + public override bool TryCreateType(IArrowType storageType, string metadata, out ExtensionType type) + { + type = null; + + if (!(storageType is StructType structType) || structType.Fields.Count != 2) + return false; + + // Validate field order and names per spec + Field tsField = structType.Fields[0]; + Field offsetField = structType.Fields[1]; + + if (tsField.Name != "timestamp" || offsetField.Name != "offset_minutes") + return false; + + if (!(tsField.DataType is TimestampType tsType) || tsType.Timezone != "UTC") + return false; + + // offset_minutes must logically be Int16, but may be dict/REE encoded + if (!IsLogicallyInt16(offsetField.DataType)) + return false; + + type = new TimestampWithOffsetType(tsType.Unit, structType); + return true; + } + + private static bool IsLogicallyInt16(IArrowType type) + { + switch (type) + { + case Int16Type _: + return true; + case DictionaryType dictType: + return dictType.ValueType is Int16Type; + case RunEndEncodedType reeType: + return reeType.ValuesDataType is Int16Type; + default: + return false; + } + } + } + + /// <summary> + /// Extension type for timestamps with per-value UTC offset, stored as + /// Struct(timestamp: Timestamp(unit, "UTC"), offset_minutes: Int16). + /// </summary> + public class TimestampWithOffsetType : ExtensionType + { + public static readonly TimestampWithOffsetType Default = + new TimestampWithOffsetType(TimeUnit.Microsecond); + + public override string Name => "arrow.timestamp_with_offset"; + public override string ExtensionMetadata => ""; + + public TimeUnit Unit { get; } + + public TimestampWithOffsetType(TimeUnit unit = TimeUnit.Microsecond) + : base(CreateDefaultStorageType(unit)) + { + Unit = unit; + } + + internal TimestampWithOffsetType(TimeUnit unit, StructType storageType) + : base(storageType) + { + Unit = unit; + } + + public override ExtensionArray CreateArray(IArrowArray storageArray) + { + return new TimestampWithOffsetArray(this, storageArray); + } + + private static StructType CreateDefaultStorageType(TimeUnit unit) + { + return new StructType(new[] + { + new Field("timestamp", new TimestampType(unit, "UTC"), nullable: false), + new Field("offset_minutes", Int16Type.Default, nullable: false), + }); + } + } + + /// <summary> + /// Extension array for the "arrow.timestamp_with_offset" canonical extension type. + /// Implements <see cref="IReadOnlyList{T}"/> of nullable <see cref="DateTimeOffset"/>. + /// </summary> + public class TimestampWithOffsetArray : ExtensionArray, IReadOnlyList<DateTimeOffset?> + { + private readonly StructArray _struct; + private readonly TimestampArray _timestamps; + private readonly IReadOnlyList<short?> _offsetMinutes; + + public TimestampWithOffsetArray(TimestampWithOffsetType type, IArrowArray storage) + : base(type, storage) + { + _struct = (StructArray)storage; + var structType = (StructType)storage.Data.DataType; + + int tsIndex = structType.GetFieldIndex("timestamp"); + int offsetIndex = structType.GetFieldIndex("offset_minutes"); + if (tsIndex < 0 || offsetIndex < 0) + throw new ArgumentException("Storage struct must have 'timestamp' and 'offset_minutes' fields."); + + _timestamps = (TimestampArray)_struct.Fields[tsIndex]; + _offsetMinutes = _struct.Fields[offsetIndex].AsDecodedReadOnlyList<short?>(); + } + + /// <summary> + /// Gets the value at the specified index as a <see cref="DateTimeOffset"/> + /// with the original timezone offset preserved. + /// </summary> + public DateTimeOffset? GetValue(int index) + { + if (index < 0 || index >= Length) + throw new ArgumentOutOfRangeException(nameof(index)); + + if (IsNull(index)) + return null; + + DateTimeOffset utc = _timestamps.GetTimestampUnchecked(index); + short offsetMins = _offsetMinutes[index] ?? 0; + TimeSpan offset = TimeSpan.FromMinutes(offsetMins); + return utc.ToOffset(offset); + } Review Comment: GetValue assumes both child fields are present and non-null: it uses _timestamps.GetTimestampUnchecked(index) and treats a null offset (_offsetMinutes[index] == null) as 0 minutes. If the underlying storage violates the declared non-nullable fields (e.g., offset_minutes has nulls due to malformed data), this silently returns an incorrect DateTimeOffset. Consider validating that the timestamp is valid and that offset_minutes is non-null for non-null struct rows, and throw InvalidDataException/ArgumentException if not. ########## src/Apache.Arrow/Extensions/IArrowArrayExtensions.cs: ########## @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections; +using System.Collections.Generic; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + /// <summary> + /// Provides factory methods that return <see cref="IReadOnlyList{T}"/> views + /// over Arrow arrays, transparently handling plain, dictionary-encoded, + /// and run-end encoded layouts. + /// </summary> + public static class IArrowArrayExtensions + { + /// <summary> + /// Returns an <see cref="IReadOnlyList{T}"/> of nullable T values + /// for the given array, regardless of encoding. + /// </summary> + public static IReadOnlyList<T> AsDecodedReadOnlyList<T>(this IArrowArray array) + { + if (array == null) + throw new ArgumentNullException(nameof(array)); + + switch (array) + { + case IReadOnlyList<T> plain: + return plain; + + case DictionaryArray dict: + IReadOnlyList<T> values = dict.Dictionary as IReadOnlyList<T>; + if (values == null) + throw new ArgumentException( + $"Dictionary value type {dict.Dictionary.Data.DataType.TypeId} cannot be read as {typeof(T).Name}."); + return new DictionaryReadOnlyList<T>(dict, values); + + case RunEndEncodedArray ree: + IReadOnlyList<T> reeValues = ree.Values as IReadOnlyList<T>; + if (reeValues == null) + throw new ArgumentException( + $"Run-end encoded value type {ree.Values.Data.DataType.TypeId} cannot be read as {typeof(T).Name}."); + return new ReeReadOnlyList<T>(ree, reeValues); + + default: + throw new ArgumentException( + $"Cannot create {typeof(T).Name} reader for array of type {array.Data.DataType.TypeId}.", + nameof(array)); + } + } + + private sealed class DictionaryReadOnlyList<T> : IReadOnlyList<T> + { + private readonly IArrowArray _indices; + private readonly IReadOnlyList<T> _values; + private readonly Func<IArrowArray, int, int> _indexLookup; + + public DictionaryReadOnlyList(DictionaryArray dict, IReadOnlyList<T> values) + { + _indices = dict.Indices; + _values = values; + _indexLookup = GetDictionaryIndex(dict.Indices.Data.DataType); + } + + public int Count => _indices.Length; + + public T this[int index] + { + get + { + if (_indices.IsNull(index)) + return default; + + int dictIndex = _indexLookup(_indices, index); + return _values[dictIndex]; + } + } Review Comment: DictionaryReadOnlyList<T>.this[int] calls _indices.IsNull(index) without an explicit bounds check. Since IArrowArray.IsNull/IsValid don't validate index bounds, invalid indices can throw inconsistent exceptions (or read the bitmap out of range) rather than the IReadOnlyList contract's ArgumentOutOfRangeException. Add a bounds check (e.g., unsigned compare against Count) at the start of the indexer. ########## src/Apache.Arrow/Extensions/IArrowArrayExtensions.cs: ########## @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections; +using System.Collections.Generic; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + /// <summary> + /// Provides factory methods that return <see cref="IReadOnlyList{T}"/> views + /// over Arrow arrays, transparently handling plain, dictionary-encoded, + /// and run-end encoded layouts. + /// </summary> + public static class IArrowArrayExtensions + { + /// <summary> + /// Returns an <see cref="IReadOnlyList{T}"/> of nullable T values + /// for the given array, regardless of encoding. Review Comment: The XML docs state this returns an IReadOnlyList of "nullable T values", but the method is generic over T and will return default(T) for null slots. Consider updating the docs to clarify that nulls are represented as default(T) and that callers should use nullable value types (e.g., int?) when nulls are possible. ```suggestion /// Returns an <see cref="IReadOnlyList{T}"/> view for the given array, /// regardless of encoding. /// Null slots are represented as <c>default(T)</c>. Callers should use /// nullable value types (for example, <c>int?</c>) when nulls are possible /// and need to be distinguished from the default value of <typeparamref name="T"/>. ``` ########## src/Apache.Arrow/Arrays/TimestampWithOffsetArray.cs: ########## @@ -0,0 +1,255 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections; +using System.Collections.Generic; +using Apache.Arrow.Types; + +namespace Apache.Arrow +{ + /// <summary> + /// Extension definition for the "arrow.timestamp_with_offset" canonical extension type. + /// Storage is a struct with fields "timestamp" (Timestamp(unit, "UTC")) and + /// "offset_minutes" (Int16). The offset_minutes field may be dictionary-encoded + /// or run-end encoded. + /// </summary> + public class TimestampWithOffsetExtensionDefinition : ExtensionDefinition + { + public static readonly TimestampWithOffsetExtensionDefinition Instance = new TimestampWithOffsetExtensionDefinition(); + + public override string ExtensionName => "arrow.timestamp_with_offset"; + + private TimestampWithOffsetExtensionDefinition() { } + + public override bool TryCreateType(IArrowType storageType, string metadata, out ExtensionType type) + { + type = null; + + if (!(storageType is StructType structType) || structType.Fields.Count != 2) + return false; + + // Validate field order and names per spec + Field tsField = structType.Fields[0]; + Field offsetField = structType.Fields[1]; + + if (tsField.Name != "timestamp" || offsetField.Name != "offset_minutes") + return false; + + if (!(tsField.DataType is TimestampType tsType) || tsType.Timezone != "UTC") + return false; + + // offset_minutes must logically be Int16, but may be dict/REE encoded + if (!IsLogicallyInt16(offsetField.DataType)) + return false; + + type = new TimestampWithOffsetType(tsType.Unit, structType); + return true; + } + + private static bool IsLogicallyInt16(IArrowType type) + { + switch (type) + { + case Int16Type _: + return true; + case DictionaryType dictType: + return dictType.ValueType is Int16Type; + case RunEndEncodedType reeType: + return reeType.ValuesDataType is Int16Type; + default: + return false; + } + } + } + + /// <summary> + /// Extension type for timestamps with per-value UTC offset, stored as + /// Struct(timestamp: Timestamp(unit, "UTC"), offset_minutes: Int16). + /// </summary> + public class TimestampWithOffsetType : ExtensionType + { + public static readonly TimestampWithOffsetType Default = + new TimestampWithOffsetType(TimeUnit.Microsecond); + + public override string Name => "arrow.timestamp_with_offset"; + public override string ExtensionMetadata => ""; + + public TimeUnit Unit { get; } + + public TimestampWithOffsetType(TimeUnit unit = TimeUnit.Microsecond) + : base(CreateDefaultStorageType(unit)) + { + Unit = unit; + } + + internal TimestampWithOffsetType(TimeUnit unit, StructType storageType) + : base(storageType) + { + Unit = unit; + } + + public override ExtensionArray CreateArray(IArrowArray storageArray) + { + return new TimestampWithOffsetArray(this, storageArray); + } + + private static StructType CreateDefaultStorageType(TimeUnit unit) + { + return new StructType(new[] + { + new Field("timestamp", new TimestampType(unit, "UTC"), nullable: false), + new Field("offset_minutes", Int16Type.Default, nullable: false), + }); + } + } + + /// <summary> + /// Extension array for the "arrow.timestamp_with_offset" canonical extension type. + /// Implements <see cref="IReadOnlyList{T}"/> of nullable <see cref="DateTimeOffset"/>. + /// </summary> + public class TimestampWithOffsetArray : ExtensionArray, IReadOnlyList<DateTimeOffset?> + { + private readonly StructArray _struct; + private readonly TimestampArray _timestamps; + private readonly IReadOnlyList<short?> _offsetMinutes; + + public TimestampWithOffsetArray(TimestampWithOffsetType type, IArrowArray storage) + : base(type, storage) + { + _struct = (StructArray)storage; + var structType = (StructType)storage.Data.DataType; + + int tsIndex = structType.GetFieldIndex("timestamp"); + int offsetIndex = structType.GetFieldIndex("offset_minutes"); + if (tsIndex < 0 || offsetIndex < 0) + throw new ArgumentException("Storage struct must have 'timestamp' and 'offset_minutes' fields."); + + _timestamps = (TimestampArray)_struct.Fields[tsIndex]; + _offsetMinutes = _struct.Fields[offsetIndex].AsDecodedReadOnlyList<short?>(); + } + + /// <summary> + /// Gets the value at the specified index as a <see cref="DateTimeOffset"/> + /// with the original timezone offset preserved. + /// </summary> + public DateTimeOffset? GetValue(int index) + { + if (index < 0 || index >= Length) + throw new ArgumentOutOfRangeException(nameof(index)); + + if (IsNull(index)) + return null; + + DateTimeOffset utc = _timestamps.GetTimestampUnchecked(index); + short offsetMins = _offsetMinutes[index] ?? 0; + TimeSpan offset = TimeSpan.FromMinutes(offsetMins); + return utc.ToOffset(offset); + } + + public int Count => Length; + public DateTimeOffset? this[int index] => GetValue(index); + + public IEnumerator<DateTimeOffset?> GetEnumerator() + { + for (int i = 0; i < Length; i++) + yield return GetValue(i); + } + + IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); + + /// <summary> + /// Builder for <see cref="TimestampWithOffsetArray"/>. + /// </summary> + public class Builder + { + private readonly TimestampArray.Builder _timestampBuilder; + private readonly Int16Array.Builder _offsetBuilder; + private readonly ArrowBuffer.BitmapBuilder _validityBuilder; + private readonly TimestampWithOffsetType _type; + private int _length; + private int _nullCount; + + public Builder(TimeUnit unit = TimeUnit.Microsecond) + { + _type = new TimestampWithOffsetType(unit); + _timestampBuilder = new TimestampArray.Builder(unit, "UTC"); + _offsetBuilder = new Int16Array.Builder(); + _validityBuilder = new ArrowBuffer.BitmapBuilder(); + } + + public Builder Append(DateTimeOffset value) + { + _timestampBuilder.Append(value.ToUniversalTime()); + _offsetBuilder.Append(checked((short)value.Offset.TotalMinutes)); + _validityBuilder.Append(true); + _length++; + return this; + } + + public Builder AppendNull() + { + _timestampBuilder.Append(default(DateTimeOffset)); + _offsetBuilder.Append(0); + _validityBuilder.Append(false); + _length++; + _nullCount++; + return this; + } + + public Builder AppendRange(IEnumerable<DateTimeOffset> values) + { + if (values == null) + throw new ArgumentNullException(nameof(values)); + + foreach (var value in values) + Append(value); + + return this; + } + + public Builder AppendRange(IEnumerable<DateTimeOffset?> values) + { + if (values == null) + throw new ArgumentNullException(nameof(values)); + + foreach (var value in values) + { + if (value.HasValue) + Append(value.Value); + else + AppendNull(); + } + + return this; + } + + public TimestampWithOffsetArray Build() + { + TimestampArray timestamps = _timestampBuilder.Build(); + Int16Array offsets = _offsetBuilder.Build(); + ArrowBuffer validityBuffer = _validityBuilder.Build(); + + var structType = (StructType)_type.StorageType; + var structArray = new StructArray( + structType, _length, + new IArrowArray[] { timestamps, offsets }, + validityBuffer, _nullCount); + Review Comment: Builder.Build() always materializes a validity bitmap via _validityBuilder.Build(), even when _nullCount == 0. Other builders in this repo commonly use ArrowBuffer.Empty when there are no nulls to avoid an unnecessary allocation. Consider building ArrowBuffer.Empty when _nullCount == 0 (and only building the bitmap when needed). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
