Copilot commented on code in PR #328: URL: https://github.com/apache/arrow-dotnet/pull/328#discussion_r3141187913
########## src/Apache.Arrow.Operations/Shredding/ShredOptions.cs: ########## @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace Apache.Arrow.Operations.Shredding +{ + /// <summary> + /// Options controlling how <see cref="ShredSchemaInferer"/> infers a shredding schema. + /// </summary> + public sealed class ShredOptions + { + /// <summary> + /// Maximum nesting depth for shredded objects and arrays. + /// 0 means only top-level fields are shredded. + /// Default is 3. + /// </summary> + public int MaxDepth { get; set; } = 3; + + /// <summary> + /// Minimum fraction of values (0.0–1.0) in which a field must appear + /// to be considered for shredding. Fields appearing less frequently + /// than this threshold are left in the binary residual. + /// Default is 0.5 (50%). + /// </summary> + public double MinFieldFrequency { get; set; } = 0.5; + + /// <summary> + /// Minimum fraction of non-null values (0.0–1.0) for a field that must + /// share the same type for the field to be shredded as a typed column. + /// If the type consistency is below this threshold, the field gets a + /// <see cref="ShredType.None"/> schema (binary-only). + /// Default is 0.8 (80%). + /// </summary> + public double MinTypeConsistency { get; set; } = 0.8; + + /// <summary>Default options.</summary> + public static readonly ShredOptions Default = new ShredOptions(); Review Comment: `ShredOptions.Default` is a mutable static instance (the type has settable properties). Any consumer that does `ShredOptions.Default.MaxDepth = ...` will mutate global state for the entire process, which is easy to do accidentally and hard to debug. Consider making `Default` return a new instance each time (e.g., `=> new ShredOptions()`), making the options immutable (`init`-only), or exposing a `CreateDefault()` factory instead. ```suggestion public static ShredOptions Default => new ShredOptions(); ``` ########## test/shredded_variant_ipc/regen.py: ########## @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Converts shredded-variant Parquet test cases from +test/parquet-testing/shredded_variant/*.parquet into Arrow IPC (.arrow) files +under this directory, so that .NET tests can load them without a Parquet +reader. The Parquet test corpus comes from apache/parquet-testing. + +Requires: pyarrow (tested with 23.0). + +Run from the repo root: + + python test/shredded_variant_ipc/regen.py + +Existing .arrow files are overwritten in place. +""" + +import json +import os +import sys + +import pyarrow as pa +import pyarrow.ipc as ipc +import pyarrow.parquet as pq + + +def main() -> int: + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + src = os.path.join(repo_root, "test", "parquet-testing", "shredded_variant") + dst = os.path.join(repo_root, "test", "shredded_variant_ipc") + cases_json = os.path.join(src, "cases.json") + + if not os.path.exists(cases_json): + print(f"cases.json not found at {cases_json}", file=sys.stderr) + return 1 + + with open(cases_json) as f: + cases = json.load(f) + + os.makedirs(dst, exist_ok=True) + + written = 0 + for case in cases: + parquet_files = [] + if "parquet_file" in case: + parquet_files.append(case["parquet_file"]) + + for pf in parquet_files: + src_path = os.path.join(src, pf) + if not os.path.exists(src_path): + continue + Review Comment: regen.py silently skips a Parquet file when `src_path` doesn't exist (`continue`). This can lead to an incomplete regenerated IPC corpus without any signal (e.g., if the parquet-testing subtree isn't fully checked out or a filename in cases.json changes). Consider failing fast or at least emitting a warning/error to stderr when a listed Parquet file is missing, and optionally track a nonzero exit code if any were skipped. ########## src/Apache.Arrow.Scalars/Variant/VariantValueWriter.cs: ########## @@ -424,6 +424,92 @@ public void WriteTimestampNtzNanos(long nanos) WriteInt64LE(ms, nanos); } + // --------------------------------------------------------------- + // Transcode from a VariantReader + // --------------------------------------------------------------- + + /// <summary> + /// Copies the variant value pointed to by <paramref name="source"/> into this + /// writer. Useful when copying between metadata dictionaries: field IDs in the + /// source are re-looked-up against this writer's <see cref="VariantMetadataBuilder"/> + /// on the fly, via <see cref="WriteFieldName"/>. + /// </summary> + /// <remarks> + /// All field names referenced anywhere in <paramref name="source"/> must already + /// exist in the metadata builder used to construct this writer. Use + /// <see cref="VariantMetadataBuilder.CollectFieldNames(VariantReader)"/> during + /// the metadata-collection phase of a two-pass encode to accumulate them. + /// </remarks> + public void CopyValue(VariantReader source) + { + switch (source.BasicType) + { + case VariantBasicType.Primitive: + CopyPrimitive(source); + return; + + case VariantBasicType.ShortString: + WriteString(source.GetString()); + return; + + case VariantBasicType.Object: + VariantObjectReader obj = new VariantObjectReader(source.Metadata, source.Value); + BeginObject(); + for (int i = 0; i < obj.FieldCount; i++) + { + WriteFieldName(obj.GetFieldName(i)); + CopyValue(obj.GetFieldValue(i)); + } + EndObject(); + return; + + case VariantBasicType.Array: + VariantArrayReader arr = new VariantArrayReader(source.Metadata, source.Value); + BeginArray(); + for (int i = 0; i < arr.ElementCount; i++) + { + CopyValue(arr.GetElement(i)); + } + EndArray(); + return; + + default: + throw new NotSupportedException($"Unsupported basic type: {source.BasicType}"); + } + } + + private void CopyPrimitive(VariantReader source) + { + VariantPrimitiveType? pt = source.PrimitiveType; + switch (pt) + { + case VariantPrimitiveType.NullType: WriteNull(); return; + case VariantPrimitiveType.BooleanTrue: WriteBoolean(true); return; + case VariantPrimitiveType.BooleanFalse: WriteBoolean(false); return; + case VariantPrimitiveType.Int8: WriteInt8(source.GetInt8()); return; + case VariantPrimitiveType.Int16: WriteInt16(source.GetInt16()); return; + case VariantPrimitiveType.Int32: WriteInt32(source.GetInt32()); return; + case VariantPrimitiveType.Int64: WriteInt64(source.GetInt64()); return; + case VariantPrimitiveType.Float: WriteFloat(source.GetFloat()); return; + case VariantPrimitiveType.Double: WriteDouble(source.GetDouble()); return; + case VariantPrimitiveType.Decimal4: WriteDecimal4(source.GetDecimal4()); return; + case VariantPrimitiveType.Decimal8: WriteDecimal8(source.GetDecimal8()); return; + // Decimal16 may exceed System.Decimal's range, so route through SqlDecimal. + case VariantPrimitiveType.Decimal16: WriteDecimal16(source.GetSqlDecimal()); return; + case VariantPrimitiveType.Date: WriteDateDays(source.GetDateDays()); return; + case VariantPrimitiveType.Timestamp: WriteTimestampMicros(source.GetTimestampMicros()); return; + case VariantPrimitiveType.TimestampNtz: WriteTimestampNtzMicros(source.GetTimestampNtzMicros()); return; + case VariantPrimitiveType.TimeNtz: WriteTimeNtzMicros(source.GetTimeNtzMicros()); return; + case VariantPrimitiveType.TimestampTzNanos: WriteTimestampTzNanos(source.GetTimestampTzNanos()); return; + case VariantPrimitiveType.TimestampNtzNanos: WriteTimestampNtzNanos(source.GetTimestampNtzNanos()); return; + case VariantPrimitiveType.String: WriteString(source.GetString()); return; + case VariantPrimitiveType.Binary: WriteBinary(source.GetBinary().ToArray()); return; + case VariantPrimitiveType.Uuid: WriteUuid(source.GetUuid()); return; Review Comment: `CopyPrimitive` allocates for binary values via `source.GetBinary().ToArray()` because `WriteBinary` only accepts `byte[]`. For large variants or bulk transcoding this can be a significant overhead. Consider adding a `WriteBinary(ReadOnlySpan<byte>)`/`WriteBinary(ReadOnlyMemory<byte>)` overload (or equivalent) and using it here to avoid the intermediate allocation. ########## src/Apache.Arrow.Operations/Shredding/ShreddingHelpers.cs: ########## @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow; +using Apache.Arrow.Types; + +namespace Apache.Arrow.Operations.Shredding +{ + /// <summary> + /// Internal helpers shared by the shredded-variant reader trio. + /// </summary> + internal static class ShreddingHelpers + { + /// <summary> + /// Builds a <see cref="ShreddedVariant"/> slot for the given index of an element-group + /// struct (one with <c>value</c> and/or <c>typed_value</c> sub-fields). Either sub-field + /// may be absent from the struct. + /// </summary> + public static ShreddedVariant BuildSlot( + ShredSchema slotSchema, + ReadOnlySpan<byte> metadata, + StructArray elementGroup, + int index) + { + StructType elementGroupType = (StructType)elementGroup.Data.DataType; + int valueIdx = elementGroupType.GetFieldIndex("value"); + int typedIdx = elementGroupType.GetFieldIndex("typed_value"); + + IArrowArray valueArr = valueIdx >= 0 ? elementGroup.Fields[valueIdx] : null; + IArrowArray typedArr = typedIdx >= 0 ? elementGroup.Fields[typedIdx] : null; Review Comment: `BuildSlot` calls `StructType.GetFieldIndex("value")` / `GetFieldIndex("typed_value")` on every invocation. This method is a linear scan (StructType.cs even notes caching if on a hot path), and `BuildSlot` is used inside per-element loops in ShreddedArray/ShreddedObject. Consider caching these field indices once per element-group StructType (or passing them in) to avoid repeated linear lookups. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
