adamreeve commented on code in PR #332:
URL: https://github.com/apache/arrow-dotnet/pull/332#discussion_r3158217982


##########
test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantReaderTests.cs:
##########
@@ -0,0 +1,511 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Apache.Arrow;
+using Apache.Arrow.Ipc;
+using Apache.Arrow.Operations.Shredding;
+using Apache.Arrow.Scalars.Variant;
+using Apache.Arrow.Types;
+using Xunit;
+
+namespace Apache.Arrow.Operations.Tests.Shredding
+{
+    /// <summary>
+    /// Reader-style API tests: exercise <see 
cref="ShreddedVariant.GetShreddedVariant"/>,
+    /// <see cref="ShreddedObject"/>, and <see cref="ShreddedArray"/> typed 
accessors
+    /// without going through full variant materialization. These mirror what a
+    /// query engine would do for push-down reads against typed Parquet 
columns.
+    /// </summary>
+    public class ShreddedVariantReaderTests
+    {
+        private static readonly string IpcDir = FindIpcDir();
+
+        private static string FindIpcDir()
+        {
+            string dir = AppContext.BaseDirectory;
+            for (int i = 0; i < 10; i++)
+            {
+                string candidate = Path.Combine(dir, "test", 
"shredded_variant_ipc");
+                if (Directory.Exists(candidate) && 
Directory.GetFiles(candidate, "*.arrow").Length > 0)
+                    return candidate;
+                string parent = Path.GetDirectoryName(dir);
+                if (parent == null || parent == dir) break;
+                dir = parent;
+            }
+            return null;
+        }
+
+        private static VariantArray LoadCase(string caseStem)
+        {
+            Skip.If(IpcDir == null, "regen.py has not been run");
+            string path = Path.Combine(IpcDir, caseStem + ".arrow");
+            Skip.IfNot(File.Exists(path), $"Missing {path}");
+
+            using Stream stream = File.OpenRead(path);
+            using ArrowFileReader reader = new ArrowFileReader(stream);
+            RecordBatch batch = reader.ReadNextRecordBatch();
+            return new 
VariantArray(batch.Column(batch.Schema.GetFieldIndex("var")));
+        }
+
+        // ---------------------------------------------------------------
+        // Schema + state introspection
+        // ---------------------------------------------------------------
+
+        [SkippableFact]
+        public void GetShredSchema_ReflectsPrimitiveTypedValue()
+        {
+            VariantArray array = LoadCase("case-010"); // typed_value: int32
+            ShredSchema schema = array.GetShredSchema();
+            Assert.Equal(ShredType.Int32, schema.TypedValueType);
+        }
+
+        [SkippableFact]
+        public void GetShreddedVariant_HasTypedValue_WhenColumnPopulated()
+        {
+            VariantArray array = LoadCase("case-010"); // Int32 = 12345
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.True(slot.HasTypedValue);
+            Assert.False(slot.HasResidual);
+            Assert.False(slot.IsMissing);
+        }
+
+        [SkippableFact]
+        public void GetShreddedVariant_HasResidual_WhenUnshredded()
+        {
+            VariantArray array = LoadCase("case-048"); // 
testUnshreddedVariants (bool true)
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.False(slot.HasTypedValue);
+            Assert.True(slot.HasResidual);
+        }
+
+        // ---------------------------------------------------------------
+        // Typed primitive accessors
+        // ---------------------------------------------------------------
+
+        [SkippableFact]
+        public void GetInt32_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-010"); // Int32 = 12345
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal(12345, slot.GetInt32());
+        }
+
+        [SkippableFact]
+        public void GetInt8_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-006"); // Int8 = 34
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal((sbyte)34, slot.GetInt8());
+        }
+
+        [SkippableFact]
+        public void GetInt64_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-012"); // Int64 = 9876543210
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal(9876543210L, slot.GetInt64());
+        }
+
+        [SkippableFact]
+        public void GetBoolean_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-004"); // Bool = true
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.True(slot.GetBoolean());
+        }
+
+        [SkippableFact]
+        public void GetString_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-031"); // String
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal("iceberg", slot.GetString());
+        }
+
+        [SkippableFact]
+        public void GetDouble_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-016"); // Double = 14.3
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal(14.3, slot.GetDouble());
+        }
+
+        [SkippableFact]
+        public void GetDecimal_ReadsShreddedDecimal4()
+        {
+            VariantArray array = LoadCase("case-024"); // Decimal4 = 
123456.789 (scale 3?)
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            decimal d = slot.GetDecimal();
+            Assert.NotEqual(0m, d);

Review Comment:
   ```suggestion
               VariantArray array = LoadCase("case-024"); // Decimal4 (scale 4)
               ShreddedVariant slot = array.GetShreddedVariant(0);
               decimal d = slot.GetDecimal();
               Assert.Equal(12345.6789m, d);
   ```



##########
src/Apache.Arrow/Arrays/VariantArray.cs:
##########
@@ -79,14 +92,46 @@ public class VariantType : ExtensionType
         public override string Name => ExtensionName;
         public override string ExtensionMetadata => "";
 
+        /// <summary>
+        /// True if the storage layout has a <c>value</c> binary field 
(unshredded or
+        /// partially shredded). False for fully shredded layouts that omit 
the column.
+        /// </summary>
+        public bool HasValueColumn { get; }
+
+        /// <summary>
+        /// True if the storage layout has a <c>typed_value</c> field 
(shredded).
+        /// </summary>
+        public bool HasTypedValueColumn { get; }
+
+        /// <summary>
+        /// True if the storage layout includes any shredding (has a 
<c>typed_value</c>
+        /// column, or lacks a <c>value</c> column indicating full shredding).
+        /// </summary>
+        public bool IsShredded => HasTypedValueColumn || !HasValueColumn;

Review Comment:
   The check for `!HasValueColumn` is redundant, if there's no `value` column 
then there must be a `typed_value` column. Do we need both `IsShredded` and 
`HasTypedValueColumn`?



##########
test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantArrayBuilderTests.cs:
##########
@@ -0,0 +1,424 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Collections.Generic;
+using Apache.Arrow;
+using Apache.Arrow.Operations.Shredding;
+using Apache.Arrow.Scalars.Variant;
+using Xunit;
+
+namespace Apache.Arrow.Operations.Tests.Shredding
+{
+    /// <summary>
+    /// Round-trip tests for the producer path: take <see 
cref="VariantValue"/>s,
+    /// shred them, assemble into a shredded <see cref="VariantArray"/>, and 
then
+    /// read each row back via <c>GetLogicalVariantValue</c>. The reader is the
+    /// trusted oracle (validated against the Iceberg corpus), so equality here
+    /// confirms the builder produces a correct Arrow structure.
+    /// </summary>
+    public class ShreddedVariantArrayBuilderTests
+    {
+        private static VariantArray ShredAndBuild(IReadOnlyList<VariantValue> 
values, ShredSchema schema)
+        {
+            (byte[] metadata, IReadOnlyList<ShredResult> rows) = 
VariantShredder.Shred(values, schema);
+            return ShreddedVariantArrayBuilder.Build(schema, metadata, rows);
+        }
+
+        private static void AssertRoundTrip(IReadOnlyList<VariantValue> 
values, ShredSchema schema)
+        {
+            VariantArray array = ShredAndBuild(values, schema);
+            Assert.Equal(values.Count, array.Length);
+            for (int i = 0; i < values.Count; i++)
+            {
+                VariantValue actual = array.GetLogicalVariantValue(i);
+                Assert.Equal(values[i], actual);
+            }
+        }
+
+        // ---------------------------------------------------------------
+        // Unshredded (schema = None)
+        // ---------------------------------------------------------------
+
+        [Fact]
+        public void Unshredded_Column_HasNoTypedValue()
+        {
+            var values = new List<VariantValue>
+            {
+                VariantValue.FromInt32(42),
+                VariantValue.FromString("hello"),
+            };
+            VariantArray array = ShredAndBuild(values, 
ShredSchema.Unshredded());
+
+            Assert.False(array.IsShredded);
+            AssertRoundTrip(values, ShredSchema.Unshredded());

Review Comment:
   `AssertRoundTrip` could return the shredded array so we don't need to call 
`ShredAndBuild` twice here and in `Object_FullyShredded`, and there are a few 
other tests where it might make sense to make assertions about whether the 
array is shredded, or check if there are any residual values.



##########
src/Apache.Arrow.Operations/Shredding/ShreddedVariantArrayBuilder.cs:
##########
@@ -0,0 +1,513 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Collections.Generic;
+using Apache.Arrow;
+using Apache.Arrow.Arrays;
+using Apache.Arrow.Memory;
+using Apache.Arrow.Scalars.Variant;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow.Operations.Shredding
+{
+    /// <summary>
+    /// Assembles a shredded <see cref="VariantArray"/> from pre-shredded rows.
+    /// Produces an Arrow struct with shared <c>metadata</c>, residual 
<c>value</c>,
+    /// and the <c>typed_value</c> tree whose Arrow shape matches the <see 
cref="ShredSchema"/>.
+    /// </summary>
+    public static class ShreddedVariantArrayBuilder
+    {
+        /// <summary>
+        /// Builds a shredded <see cref="VariantArray"/> from the output of
+        /// <see 
cref="VariantShredder.Shred(System.Collections.Generic.IEnumerable{VariantValue},
 ShredSchema)"/>.
+        /// </summary>
+        /// <param name="schema">The shredding schema applied to each 
row.</param>
+        /// <param name="metadata">The column-level variant metadata (shared 
across rows).</param>
+        /// <param name="rows">Per-row shred results whose residual bytes 
reference <paramref name="metadata"/>.</param>
+        /// <param name="allocator">Arrow memory allocator, or default if 
null.</param>
+        public static VariantArray Build(
+            ShredSchema schema,
+            byte[] metadata,
+            IReadOnlyList<ShredResult> rows,
+            MemoryAllocator allocator = null)
+        {
+            if (schema == null) throw new 
ArgumentNullException(nameof(schema));
+            if (metadata == null) throw new 
ArgumentNullException(nameof(metadata));
+            if (rows == null) throw new ArgumentNullException(nameof(rows));
+
+            int rowCount = rows.Count;
+
+            // metadata column: emit the shared bytes once per row. (A 
dictionary-encoded
+            // or run-end-encoded representation would compress this; 
VariantArray's reader
+            // already handles those, but for simplicity we emit the plain 
binary form.)
+            BinaryArray.Builder metadataBuilder = new BinaryArray.Builder();
+            for (int i = 0; i < rowCount; i++)
+            {
+                metadataBuilder.Append((ReadOnlySpan<byte>)metadata);
+            }
+            BinaryArray metadataArr = metadataBuilder.Build(allocator);
+
+            // value column: residual bytes (or null).
+            BinaryArray valueArr = BuildBinaryColumn(rows, allocator);

Review Comment:
   Should we omit the value array if values are fully shredded? Probably fine 
to add that as an optimisation later though if there's a need for it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to