Copilot commented on code in PR #284: URL: https://github.com/apache/arrow-dotnet/pull/284#discussion_r2935955635
########## src/Apache.Arrow.Serialization.Generator/JsonSchemaEmitter.cs: ########## @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System.Text; + +#nullable enable + +namespace Apache.Arrow.Serialization.Generator +{ + +/// <summary> +/// Emits a JSON schema descriptor as a static ArrowSchemaJson property. +/// The JSON describes the Arrow schema for cross-language codegen tools. 
+/// </summary> +internal static class JsonSchemaEmitter +{ + public static void Emit(StringBuilder sb, TypeModel model) + { + var json = BuildJson(model); + + sb.AppendLine("// <auto-generated/>"); + sb.AppendLine("#nullable enable"); + sb.AppendLine(); + + if (model.Namespace != null) + { + sb.AppendLine($"namespace {model.Namespace};"); + sb.AppendLine(); + } + + var typeKeyword = GetTypeKeyword(model); + sb.AppendLine($"partial {typeKeyword} {model.TypeName}"); + sb.AppendLine("{"); + sb.AppendLine($" public static string ArrowSchemaJson => @\"{json.Replace("\"", "\"\"")}\";"); + sb.AppendLine("}"); + } + + private static string BuildJson(TypeModel model) + { + var sb = new StringBuilder(); + sb.AppendLine("{"); + sb.AppendLine($" \"type\": \"{JsonEscape(model.FullTypeName)}\","); + sb.AppendLine($" \"namespace\": {(model.Namespace != null ? $"\"{JsonEscape(model.Namespace)}\"" : "null")},"); + sb.AppendLine($" \"typeName\": \"{JsonEscape(model.TypeName)}\","); + + // Metadata + sb.Append(" \"metadata\": {"); + if (model.Metadata.Count > 0) + { + sb.AppendLine(); + for (int i = 0; i < model.Metadata.Count; i++) + { + var kv = model.Metadata[i]; + var comma = i < model.Metadata.Count - 1 ? "," : ""; + sb.AppendLine($" \"{JsonEscape(kv.Key)}\": \"{JsonEscape(kv.Value)}\"{comma}"); + } + sb.AppendLine(" },"); + } + else + { + sb.AppendLine("},"); + } + + // Fields + sb.AppendLine(" \"fields\": ["); + for (int i = 0; i < model.Properties.Count; i++) + { + var prop = model.Properties[i]; + var comma = i < model.Properties.Count - 1 ? "," : ""; + EmitField(sb, prop); + // Remove trailing newline, add comma + sb.Length -= sb.ToString().EndsWith("\r\n") ? 2 : 1; Review Comment: `sb.ToString().EndsWith(...)` is called repeatedly inside `EmitField` and in the field loop to check for trailing newlines. Each call allocates a full string copy of the `StringBuilder` contents. 
This runs at compile time in the source generator, so it's not a hot-path runtime concern, but for types with many properties or deeply nested schemas, this could noticeably slow down source generation. Consider tracking newlines with a flag or using `sb[sb.Length - 1]` / `sb[sb.Length - 2]` character checks instead. ########## Directory.Packages.props: ########## @@ -32,6 +32,8 @@ <PackageVersion Include="Grpc.Net.ClientFactory" Version="2.71.0" /> <PackageVersion Include="Grpc.Tools" Version="2.72.0" PrivateAssets="All" /> <PackageVersion Include="K4os.Compression.LZ4.Streams" Version="1.3.8" /> + <PackageVersion Include="Microsoft.CodeAnalysis.Analyzers" Version="3.3.4" /> + <PackageVersion Include="Microsoft.CodeAnalysis.CSharp" Version="4.11.0" /> <PackageVersion Include="Microsoft.Bcl.AsyncInterfaces" Version="8.0.0" /> Review Comment: The package versions for `Microsoft.CodeAnalysis.Analyzers` and `Microsoft.CodeAnalysis.CSharp` are added out of alphabetical order relative to `Microsoft.Bcl.AsyncInterfaces`. The existing file maintains alphabetical ordering of package versions (e.g., `Grpc.*` then `K4os.*` then `Microsoft.*`). These two new entries should be placed after `Microsoft.Bcl.AsyncInterfaces` to maintain consistency (equivalently, the `Microsoft.Bcl.AsyncInterfaces` entry should be moved before them). ########## src/Apache.Arrow.Serialization.Generator/PolymorphicCodeEmitter.cs: ########## @@ -0,0 +1,816 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.Text; + +#nullable enable + +namespace Apache.Arrow.Serialization.Generator +{ + +internal class PolymorphicCodeEmitter +{ + private readonly StringBuilder _sb; + private readonly PolymorphicModel _model; + private int _indent; + + public PolymorphicCodeEmitter(StringBuilder sb, PolymorphicModel model) + { + _sb = sb; + _model = model; + } + + public void Emit() + { + Line("// <auto-generated/>"); + Line("#nullable enable"); + Line("#pragma warning disable CS8629 // Nullable value type may be null"); + Line(); + Line("using System.Collections.Generic;"); + Line("using System.Linq;"); + Line("using Apache.Arrow;"); + Line("using Apache.Arrow.Arrays;"); + Line("using Apache.Arrow.Types;"); + Line("using Apache.Arrow.Serialization;"); + Line(); + + if (_model.Namespace != null) + { + Line($"namespace {_model.Namespace};"); + Line(); + } + + var typeKeyword = _model.IsInterface ? "interface" : (_model.IsRecord ? 
"record" : "class"); + Line($"partial {typeKeyword} {_model.TypeName} : IArrowSerializer<{_model.TypeName}>"); + Line("{"); + _indent++; + + EmitSchemaField(); + Line(); + EmitSerialize(); + Line(); + EmitDeserialize(); + Line(); + EmitMultiRowSerialize(); + Line(); + EmitMultiRowDeserialize(); + + _indent--; + Line("}"); + } + + private void EmitSchemaField() + { + Line($"private static readonly Schema _arrowSchema = new Schema.Builder()"); + _indent++; + + // Discriminator field + Line($".Field(new Field(\"{Escape(_model.TypeDiscriminatorFieldName)}\", StringType.Default, false))"); + + // Union properties — all nullable + foreach (var prop in _model.UnionProperties) + { + var arrowType = CodeEmitterHelpers.GetArrowTypeExpression(prop.Type); + Line($".Field(new Field(\"{Escape(prop.FieldName)}\", {arrowType}, true))"); + } + + _indent--; + Line($".Build();"); + Line(); + Line($"public static Schema ArrowSchema => _arrowSchema;"); + } + + private void EmitSerialize() + { + Line($"public static RecordBatch ToRecordBatch({_model.TypeName} value)"); + Line("{"); + _indent++; + + // Build discriminator column + Line("var discriminatorBuilder = new StringArray.Builder();"); + + // Build union property builders + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + EmitBuilderDeclaration(prop, i); + } + + Line(); + // Switch on concrete type + Line("switch (value)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case {dt.FullTypeName} v_{dt.TypeName}:"); + Line("{"); + _indent++; + Line($"discriminatorBuilder.Append(\"{Escape(dt.TypeDiscriminator)}\");"); + + // For each union property, either append the value or null + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var unionProp = _model.UnionProperties[i]; + var derivedProp = FindProperty(dt, unionProp.FieldName); + if (derivedProp != null) + { + EmitAppendValue(derivedProp, i, 
$"v_{dt.TypeName}.{derivedProp.PropertyName}"); + } + else + { + EmitAppendNull(unionProp, i); + } + } + + Line("break;"); + _indent--; + Line("}"); + } + + Line($"default:"); + _indent++; + Line($"throw new System.ArgumentException($\"Unknown derived type: {{value.GetType().Name}}\");"); + _indent--; + + _indent--; + Line("}"); + + Line(); + // Build arrays and RecordBatch + Line("var columns = new IArrowArray[]"); + Line("{"); + _indent++; + Line("discriminatorBuilder.Build(),"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + EmitBuildColumn(i); + } + _indent--; + Line("};"); + Line("return new RecordBatch(_arrowSchema, columns, 1);"); + + _indent--; + Line("}"); + } + + private void EmitDeserialize() + { + Line($"public static {_model.TypeName} FromRecordBatch(RecordBatch batch)"); + Line("{"); + _indent++; + + Line("var discriminator = ((StringArray)batch.Column(0)).GetString(0)!;"); + Line("switch (discriminator)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case \"{Escape(dt.TypeDiscriminator)}\":"); + Line("{"); + _indent++; + + // Read each property for this derived type from the batch columns + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + var unionIndex = FindUnionPropertyIndex(prop.FieldName); + var colIndex = unionIndex + 1; // +1 for discriminator + EmitReadProperty(prop, i, colIndex); + } + + // Construct the derived type + Line($"return new {dt.FullTypeName}"); + Line("{"); + _indent++; + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + Line($"{prop.PropertyName} = prop_{i},"); + } + _indent--; + Line("};"); + + _indent--; + Line("}"); + } + + Line("default:"); + _indent++; + Line("throw new System.ArgumentException($\"Unknown type discriminator: {discriminator}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + } + + private void EmitMultiRowSerialize() + { + Line($"public static RecordBatch 
ToRecordBatch(IReadOnlyList<{_model.TypeName}> values)"); + Line("{"); + _indent++; + + Line("var discriminatorBuilder = new StringArray.Builder();"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + EmitBuilderDeclaration(prop, i); + } + + Line(); + Line("foreach (var value in values)"); + Line("{"); + _indent++; + + Line("switch (value)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case {dt.FullTypeName} v_{dt.TypeName}:"); + Line("{"); + _indent++; + Line($"discriminatorBuilder.Append(\"{Escape(dt.TypeDiscriminator)}\");"); + + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var unionProp = _model.UnionProperties[i]; + var derivedProp = FindProperty(dt, unionProp.FieldName); + if (derivedProp != null) + { + EmitAppendValue(derivedProp, i, $"v_{dt.TypeName}.{derivedProp.PropertyName}"); + } + else + { + EmitAppendNull(unionProp, i); + } + } + + Line("break;"); + _indent--; + Line("}"); + } + + Line($"default:"); + _indent++; + Line($"throw new System.ArgumentException($\"Unknown derived type: {{value.GetType().Name}}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + + Line(); + Line("var columns = new IArrowArray[]"); + Line("{"); + _indent++; + Line("discriminatorBuilder.Build(),"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + EmitBuildColumn(i); + } + _indent--; + Line("};"); + Line("return new RecordBatch(_arrowSchema, columns, values.Count);"); + + _indent--; + Line("}"); + } + + private void EmitBuildColumn(int index) + { + var prop = _model.UnionProperties[index]; + if (prop.Type.Kind == TypeKind2.Enum) + { + // Build DictionaryArray from index + dict builders + Line($"new DictionaryArray(new DictionaryType(Int16Type.Default, StringType.Default, false), bld_{index}_idx.Build(), new StringArray.Builder().AppendRange(bld_{index}_dict).Build()),"); + } + else + { + Line($"bld_{index}.Build(),"); + } + } + + 
private void EmitMultiRowDeserialize() + { + Line($"public static IReadOnlyList<{_model.TypeName}> ListFromRecordBatch(RecordBatch batch)"); + Line("{"); + _indent++; + + Line("var discriminatorCol = (StringArray)batch.Column(0);"); + + // Cast all union property columns + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + var colIndex = i + 1; + var castType = CodeEmitterHelpers.GetArrayCastType(prop); + Line($"var col_{i} = ({castType})batch.Column({colIndex});"); + } + + Line(); + Line($"var result = new List<{_model.TypeName}>(batch.Length);"); + Line("for (int row = 0; row < batch.Length; row++)"); + Line("{"); + _indent++; + + Line("var discriminator = discriminatorCol.GetString(row)!;"); + Line("switch (discriminator)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case \"{Escape(dt.TypeDiscriminator)}\":"); + Line("{"); + _indent++; + + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + var unionIndex = FindUnionPropertyIndex(prop.FieldName); + EmitMultiRowReadProperty(prop, i, unionIndex); + } + + Line($"result.Add(new {dt.FullTypeName}"); + Line("{"); + _indent++; + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + Line($"{prop.PropertyName} = prop_{i},"); + } + _indent--; + Line("});"); + Line("break;"); + + _indent--; + Line("}"); + } + + Line("default:"); + _indent++; + Line("throw new System.ArgumentException($\"Unknown type discriminator: {discriminator}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + Line("return result;"); + + _indent--; + Line("}"); + } + + // --- Helpers --- + + private void EmitBuilderDeclaration(PropertyModel prop, int index) + { + if (prop.Type.Kind == TypeKind2.Enum) + { + // Enum uses Dictionary(Int16, Utf8) — built manually + Line($"var bld_{index}_idx = new Int16Array.Builder();"); + Line($"var bld_{index}_dict = new List<string>();"); + } + 
else + { + var builderType = CodeEmitterHelpers.GetNullableBuilderDeclaration(prop); + Line($"var bld_{index} = {builderType};"); + } + } + + private void EmitAppendValue(PropertyModel prop, int index, string access) + { + // For non-nullable properties in the derived type, we still need to handle them + // as nullable in the union schema + switch (prop.Type.Kind) + { + case TypeKind2.String: + if (prop.IsNullable) + Line($"if ({access} != null) bld_{index}.Append({access}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Bool: + case TypeKind2.Byte: + case TypeKind2.SByte: + case TypeKind2.Int16: + case TypeKind2.UInt16: + case TypeKind2.Int32: + case TypeKind2.UInt32: + case TypeKind2.Int64: + case TypeKind2.UInt64: + case TypeKind2.Float: + case TypeKind2.Double: + case TypeKind2.Half: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.DateTime: + case TypeKind2.DateTimeOffset: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(Apache.Arrow.Serialization.ArrowArrayHelper.ToUtcDateTimeOffset(v_{index})); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append(Apache.Arrow.Serialization.ArrowArrayHelper.ToUtcDateTimeOffset({access}));"); + break; + case TypeKind2.DateOnly: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Decimal: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Enum: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) {{ 
bld_{index}_idx.Append((short)bld_{index}_dict.Count); bld_{index}_dict.Add(v_{index}.ToString()); }} else bld_{index}_idx.AppendNull();"); + else + { + Line($"bld_{index}_idx.Append((short)bld_{index}_dict.Count);"); + Line($"bld_{index}_dict.Add({access}.ToString());"); + } + break; + default: + // For complex types (Binary, Guid, TimeOnly, TimeSpan, nested, collections), + // fall back to AppendNull for now — these are less common in polymorphic scenarios + Line($"bld_{index}.AppendNull(); // TODO: complex type {prop.Type.Kind}"); Review Comment: The `default` case emits a `// TODO: complex type` comment in the generated code and silently appends null for complex types (Binary, Guid, TimeOnly, TimeSpan, nested records, collections) in polymorphic serialization. This means data is silently lost at runtime with no warning or error — the generated code will compile and run, but produce incorrect null values. At minimum, the generated code should throw `NotSupportedException` instead of silently losing data, or these types should be properly supported. ########## src/Apache.Arrow.Serialization/RecordBatchBuilder.cs: ########## @@ -0,0 +1,711 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using Apache.Arrow; +using Apache.Arrow.Arrays; +using Apache.Arrow.Types; + +namespace Apache.Arrow.Serialization; + +/// <summary> +/// Reflection-based serializer for converting arbitrary .NET objects (including anonymous types) +/// to Arrow RecordBatches. Analogous to System.Text.Json's reflection-based path — +/// works without attributes or source generation but is not AOT-safe. +/// </summary> +public static class RecordBatchBuilder +{ + /// <summary> + /// Convert a collection of objects to a RecordBatch. Schema is inferred from the + /// public readable properties of <typeparamref name="T"/>. + /// Works with anonymous types, records, classes, and structs. + /// </summary> + [RequiresUnreferencedCode("Uses reflection to inspect properties. Use [ArrowSerializable] for AOT-safe serialization.")] + public static RecordBatch FromObjects<T>(IEnumerable<T> items) + { + var list = items as IReadOnlyList<T> ?? items.ToList(); + if (list.Count == 0) + throw new ArgumentException("Cannot infer schema from empty collection.", nameof(items)); + + var properties = typeof(T).GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(p => p.CanRead) + .ToArray(); + + var fields = new List<Field>(); + var builders = new List<IColumnBuilder>(); + + foreach (var prop in properties) + { + var propType = prop.PropertyType; + var (arrowType, nullable) = InferArrowType(propType); + fields.Add(new Field(prop.Name, arrowType, nullable)); + builders.Add(CreateColumnBuilder(propType, arrowType)); + } + + var schema = new Schema.Builder(); + foreach (var f in fields) schema.Field(f); + + // Populate builders + for (int row = 0; row < list.Count; row++) + { + var item = list[row]!; + for (int col = 0; col < properties.Length; col++) + { + var value = properties[col].GetValue(item); + builders[col].Append(value); + } + } + + var arrays = builders.Select(b => b.Build()).ToArray(); + return new 
RecordBatch(schema.Build(), arrays, list.Count); + } + + /// <summary> + /// Convert a single object to a single-row RecordBatch. + /// </summary> + [RequiresUnreferencedCode("Uses reflection to inspect properties. Use [ArrowSerializable] for AOT-safe serialization.")] + public static RecordBatch FromObject<T>(T item) + => FromObjects(new[] { item }); + + private static (IArrowType Type, bool Nullable) InferArrowType(Type clrType) + { + var underlying = Nullable.GetUnderlyingType(clrType); + if (underlying is not null) + { + var (inner, _) = InferArrowType(underlying); + return (inner, true); + } + + if (clrType == typeof(string)) return (StringType.Default, true); + if (clrType == typeof(bool)) return (BooleanType.Default, false); + if (clrType == typeof(sbyte)) return (Int8Type.Default, false); + if (clrType == typeof(byte)) return (UInt8Type.Default, false); + if (clrType == typeof(short)) return (Int16Type.Default, false); + if (clrType == typeof(ushort)) return (UInt16Type.Default, false); + if (clrType == typeof(int)) return (Int32Type.Default, false); + if (clrType == typeof(uint)) return (UInt32Type.Default, false); + if (clrType == typeof(long)) return (Int64Type.Default, false); + if (clrType == typeof(ulong)) return (UInt64Type.Default, false); + if (clrType == typeof(Half)) return (HalfFloatType.Default, false); + if (clrType == typeof(float)) return (FloatType.Default, false); + if (clrType == typeof(double)) return (DoubleType.Default, false); + if (clrType == typeof(decimal)) return (new Decimal128Type(38, 18), false); + if (clrType == typeof(DateTime)) return (new TimestampType(TimeUnit.Microsecond, "UTC"), false); + if (clrType == typeof(DateTimeOffset)) return (new TimestampType(TimeUnit.Microsecond, "UTC"), false); + if (clrType == typeof(DateOnly)) return (Date32Type.Default, false); + if (clrType == typeof(TimeOnly)) return (new Time64Type(TimeUnit.Microsecond), false); + if (clrType == typeof(TimeSpan)) return (DurationType.Microsecond, 
false); + if (clrType == typeof(Guid)) return (new GuidType(), false); + if (clrType == typeof(byte[])) return (BinaryType.Default, true); + if (clrType == typeof(ReadOnlyMemory<byte>)) return (BinaryType.Default, false); + + if (clrType.IsEnum) + return (new DictionaryType(Int16Type.Default, StringType.Default, false), false); + + // T[] arrays (not byte[] which is handled above) + if (clrType.IsArray) + { + var elemType = clrType.GetElementType()!; + var (elemArrow, elemNullable) = InferArrowType(elemType); + return (new ListType(new Field("item", elemArrow, elemNullable)), true); + } + + if (clrType.IsGenericType) + { + var genDef = clrType.GetGenericTypeDefinition(); + if (genDef == typeof(List<>) || genDef == typeof(HashSet<>)) + { + var elemType = clrType.GetGenericArguments()[0]; + var (elemArrow, elemNullable) = InferArrowType(elemType); + return (new ListType(new Field("item", elemArrow, elemNullable)), true); + } + if (genDef == typeof(Dictionary<,>)) + { + var args = clrType.GetGenericArguments(); + var (keyArrow, _) = InferArrowType(args[0]); + var (valArrow, valNullable) = InferArrowType(args[1]); + return (new MapType(new Field("key", keyArrow, false), new Field("value", valArrow, valNullable)), true); + } + } + + // Check for [ArrowSerializable] types with source-generated IArrowSerializer<T> + var genSchema = GetGeneratedArrowSchema(clrType); + if (genSchema is not null) + { + var structFields = new List<Field>(genSchema.FieldsList); + return (new StructType(structFields), true); + } + + // Nested object type (anonymous, record, class, struct with readable properties) + if (clrType.IsClass || clrType.IsValueType) + { + var nestedProps = clrType.GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(p => p.CanRead) + .ToArray(); + if (nestedProps.Length > 0) + { + var nestedFields = nestedProps.Select(p => + { + var (ft, fn) = InferArrowType(p.PropertyType); + return new Field(p.Name, ft, fn); + }).ToList(); + return (new 
StructType(nestedFields), true); + } + } + + throw new NotSupportedException($"Cannot infer Arrow type for {clrType.FullName}"); + } + + /// <summary> + /// Check if a type implements IArrowSerializer<T> (i.e. has [ArrowSerializable] source-generated code) + /// and return its static ArrowSchema if so. + /// </summary> + private static Schema? GetGeneratedArrowSchema(Type clrType) + { + var iface = clrType.GetInterfaces() + .FirstOrDefault(i => i.IsGenericType && i.GetGenericTypeDefinition() == typeof(IArrowSerializer<>)); + if (iface is null) return null; + + var schemaProp = clrType.GetProperty("ArrowSchema", BindingFlags.Public | BindingFlags.Static | BindingFlags.FlattenHierarchy); + return schemaProp?.GetValue(null) as Schema; + } + + /// <summary> + /// Try to get the static ToRecordBatch(IReadOnlyList<T>) method from a source-generated type. + /// </summary> + private static MethodInfo? GetGeneratedToRecordBatchList(Type clrType) + { + var listType = typeof(IReadOnlyList<>).MakeGenericType(clrType); + return clrType.GetMethod("ToRecordBatch", BindingFlags.Public | BindingFlags.Static | BindingFlags.FlattenHierarchy, [listType]); + } + + private static IColumnBuilder CreateColumnBuilder(Type clrType, IArrowType arrowType) + { + var underlying = Nullable.GetUnderlyingType(clrType); + if (underlying is not null) + return CreateColumnBuilder(underlying, arrowType); // inner builders all handle null + + if (clrType == typeof(string)) return new StringColumnBuilder(); + if (clrType == typeof(bool)) return new BoolColumnBuilder(); + if (clrType == typeof(sbyte)) return new TypedColumnBuilder<sbyte, Int8Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(byte)) return new TypedColumnBuilder<byte, UInt8Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(short)) return new TypedColumnBuilder<short, Int16Array.Builder>(new(), (b, v) => b.Append(v), b => 
b.AppendNull(), b => b.Build()); + if (clrType == typeof(ushort)) return new TypedColumnBuilder<ushort, UInt16Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(int)) return new TypedColumnBuilder<int, Int32Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(uint)) return new TypedColumnBuilder<uint, UInt32Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(long)) return new TypedColumnBuilder<long, Int64Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(ulong)) return new TypedColumnBuilder<ulong, UInt64Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(Half)) return new TypedColumnBuilder<Half, HalfFloatArray.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(float)) return new TypedColumnBuilder<float, FloatArray.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(double)) return new TypedColumnBuilder<double, DoubleArray.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(decimal)) return new DecimalColumnBuilder(); + if (clrType == typeof(DateTime)) return new DateTimeColumnBuilder(); + if (clrType == typeof(DateTimeOffset)) return new DateTimeOffsetColumnBuilder(); + if (clrType == typeof(DateOnly)) return new TypedColumnBuilder<DateOnly, Date32Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(TimeOnly)) return new TimeOnlyColumnBuilder(); + if (clrType == typeof(TimeSpan)) return new TimeSpanColumnBuilder(); + if (clrType == typeof(Guid)) return new GuidColumnBuilder(); + if (clrType == typeof(byte[])) return new BinaryColumnBuilder(); + if (clrType == typeof(ReadOnlyMemory<byte>)) 
return new ReadOnlyMemoryByteColumnBuilder(); + if (clrType.IsEnum) return new EnumColumnBuilder(); + + // List<T>, T[], HashSet<T> → ListArray + if (arrowType is ListType listType) + { + var elemClrType = clrType.IsArray + ? clrType.GetElementType()! + : clrType.GetGenericArguments()[0]; + var elemBuilder = CreateColumnBuilder(elemClrType, listType.ValueDataType); + return new ListColumnBuilder(listType, elemClrType, elemBuilder); + } + + // Dictionary<K,V> → MapArray + if (arrowType is MapType mapType) + { + var args = clrType.GetGenericArguments(); + var keyBuilder = CreateColumnBuilder(args[0], mapType.KeyField.DataType); + var valBuilder = CreateColumnBuilder(args[1], mapType.ValueField.DataType); + return new MapColumnBuilder(mapType, args[0], args[1], keyBuilder, valBuilder); + } + + // Nested object → StructArray + if (arrowType is StructType structType) + { + // If the type has source-generated IArrowSerializer<T>, delegate to it + var toRecordBatchList = GetGeneratedToRecordBatchList(clrType); + if (toRecordBatchList is not null) + return new SourceGenStructColumnBuilder(clrType, structType, toRecordBatchList); + + // Otherwise, fall back to reflection-based struct builder + var nestedProps = clrType.GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(p => p.CanRead) + .ToArray(); + var childBuilders = new List<IColumnBuilder>(); + for (int i = 0; i < nestedProps.Length; i++) + { + var childArrowType = structType.Fields[i].DataType; + childBuilders.Add(CreateColumnBuilder(nestedProps[i].PropertyType, childArrowType)); + } + return new StructColumnBuilder(structType, nestedProps, childBuilders); + } + + throw new NotSupportedException($"Column builder not available for {clrType.FullName}"); + } + + // --- Column builder interface and implementations --- + + private interface IColumnBuilder + { + void Append(object? 
value); + IArrowArray Build(); + } + + private sealed class StringColumnBuilder : IColumnBuilder + { + private readonly StringArray.Builder _b = new(); + public void Append(object? value) { if (value is null) _b.AppendNull(); else _b.Append((string)value); } + public IArrowArray Build() => _b.Build(); + } + + private sealed class BoolColumnBuilder : IColumnBuilder + { + private readonly BooleanArray.Builder _b = new(); + public void Append(object? value) { if (value is null) _b.AppendNull(); else _b.Append((bool)value); } + public IArrowArray Build() => _b.Build(); + } + + private sealed class TypedColumnBuilder<T, TBuilder> : IColumnBuilder + where T : struct + where TBuilder : class + { + private readonly TBuilder _builder; + private readonly Action<TBuilder, T> _append; + private readonly Action<TBuilder> _appendNull; + private readonly Func<TBuilder, IArrowArray> _build; + + public TypedColumnBuilder(TBuilder builder, Action<TBuilder, T> append, + Action<TBuilder> appendNull, Func<TBuilder, IArrowArray> build) + { + _builder = builder; + _append = append; + _appendNull = appendNull; + _build = build; + } + + public void Append(object? value) + { + if (value is null) _appendNull(_builder); + else _append(_builder, (T)value); + } + + public IArrowArray Build() => _build(_builder); + } + + private sealed class DecimalColumnBuilder : IColumnBuilder + { + private readonly List<(decimal Value, bool IsNull)> _values = new(); + public void Append(object? value) + { + if (value is null) _values.Add((0, true)); + else _values.Add(((decimal)value, false)); + } + public IArrowArray Build() + { + var b = new Decimal128Array.Builder(new Decimal128Type(38, 18)); + foreach (var (v, isNull) in _values) + if (isNull) b.AppendNull(); else b.Append(v); + return b.Build(); + } + } + + private sealed class DateTimeColumnBuilder : IColumnBuilder + { + private readonly TimestampArray.Builder _b = new(new TimestampType(TimeUnit.Microsecond, "UTC")); + public void Append(object? 
value) + { + if (value is null) _b.AppendNull(); + else _b.Append(new DateTimeOffset((DateTime)value, TimeSpan.Zero)); Review Comment: The `DateTimeColumnBuilder.Append` creates a `DateTimeOffset` with `TimeSpan.Zero` regardless of the `DateTime.Kind`. If the input `DateTime` has `DateTimeKind.Unspecified`, this will silently interpret its wall-clock value as UTC, and if it has `DateTimeKind.Local` the `DateTimeOffset(DateTime, TimeSpan)` constructor throws `ArgumentException` whenever the machine's local offset is not zero, so non-UTC inputs either fail or yield timestamps that disagree with the source-generated path. The source-generated path (in `ArrowArrayHelper.ToUtcDateTimeOffset`) correctly handles this by calling `value.ToUniversalTime()` for non-UTC kinds. This reflection-based path should use the same logic, e.g., call `ArrowArrayHelper.ToUtcDateTimeOffset((DateTime)value)` instead. ########## src/Apache.Arrow.Serialization/RecordBatchBuilder.cs: ########## @@ -0,0 +1,711 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using Apache.Arrow; +using Apache.Arrow.Arrays; +using Apache.Arrow.Types; + +namespace Apache.Arrow.Serialization; + +/// <summary> +/// Reflection-based serializer for converting arbitrary .NET objects (including anonymous types) +/// to Arrow RecordBatches. 
Analogous to System.Text.Json's reflection-based path — +/// works without attributes or source generation but is not AOT-safe. +/// </summary> +public static class RecordBatchBuilder +{ + /// <summary> + /// Convert a collection of objects to a RecordBatch. Schema is inferred from the + /// public readable properties of <typeparamref name="T"/>. + /// Works with anonymous types, records, classes, and structs. + /// </summary> + [RequiresUnreferencedCode("Uses reflection to inspect properties. Use [ArrowSerializable] for AOT-safe serialization.")] + public static RecordBatch FromObjects<T>(IEnumerable<T> items) + { + var list = items as IReadOnlyList<T> ?? items.ToList(); + if (list.Count == 0) + throw new ArgumentException("Cannot infer schema from empty collection.", nameof(items)); + + var properties = typeof(T).GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(p => p.CanRead) + .ToArray(); + + var fields = new List<Field>(); + var builders = new List<IColumnBuilder>(); + + foreach (var prop in properties) + { + var propType = prop.PropertyType; + var (arrowType, nullable) = InferArrowType(propType); + fields.Add(new Field(prop.Name, arrowType, nullable)); + builders.Add(CreateColumnBuilder(propType, arrowType)); + } + + var schema = new Schema.Builder(); + foreach (var f in fields) schema.Field(f); + + // Populate builders + for (int row = 0; row < list.Count; row++) + { + var item = list[row]!; + for (int col = 0; col < properties.Length; col++) + { + var value = properties[col].GetValue(item); + builders[col].Append(value); + } + } + + var arrays = builders.Select(b => b.Build()).ToArray(); + return new RecordBatch(schema.Build(), arrays, list.Count); + } + + /// <summary> + /// Convert a single object to a single-row RecordBatch. + /// </summary> + [RequiresUnreferencedCode("Uses reflection to inspect properties. 
Use [ArrowSerializable] for AOT-safe serialization.")] + public static RecordBatch FromObject<T>(T item) + => FromObjects(new[] { item }); + + private static (IArrowType Type, bool Nullable) InferArrowType(Type clrType) + { + var underlying = Nullable.GetUnderlyingType(clrType); + if (underlying is not null) + { + var (inner, _) = InferArrowType(underlying); + return (inner, true); + } + + if (clrType == typeof(string)) return (StringType.Default, true); + if (clrType == typeof(bool)) return (BooleanType.Default, false); + if (clrType == typeof(sbyte)) return (Int8Type.Default, false); + if (clrType == typeof(byte)) return (UInt8Type.Default, false); + if (clrType == typeof(short)) return (Int16Type.Default, false); + if (clrType == typeof(ushort)) return (UInt16Type.Default, false); + if (clrType == typeof(int)) return (Int32Type.Default, false); + if (clrType == typeof(uint)) return (UInt32Type.Default, false); + if (clrType == typeof(long)) return (Int64Type.Default, false); + if (clrType == typeof(ulong)) return (UInt64Type.Default, false); + if (clrType == typeof(Half)) return (HalfFloatType.Default, false); + if (clrType == typeof(float)) return (FloatType.Default, false); + if (clrType == typeof(double)) return (DoubleType.Default, false); + if (clrType == typeof(decimal)) return (new Decimal128Type(38, 18), false); + if (clrType == typeof(DateTime)) return (new TimestampType(TimeUnit.Microsecond, "UTC"), false); + if (clrType == typeof(DateTimeOffset)) return (new TimestampType(TimeUnit.Microsecond, "UTC"), false); + if (clrType == typeof(DateOnly)) return (Date32Type.Default, false); + if (clrType == typeof(TimeOnly)) return (new Time64Type(TimeUnit.Microsecond), false); + if (clrType == typeof(TimeSpan)) return (DurationType.Microsecond, false); + if (clrType == typeof(Guid)) return (new GuidType(), false); + if (clrType == typeof(byte[])) return (BinaryType.Default, true); + if (clrType == typeof(ReadOnlyMemory<byte>)) return (BinaryType.Default, false); 
+ + if (clrType.IsEnum) + return (new DictionaryType(Int16Type.Default, StringType.Default, false), false); + + // T[] arrays (not byte[] which is handled above) + if (clrType.IsArray) + { + var elemType = clrType.GetElementType()!; + var (elemArrow, elemNullable) = InferArrowType(elemType); + return (new ListType(new Field("item", elemArrow, elemNullable)), true); + } + + if (clrType.IsGenericType) + { + var genDef = clrType.GetGenericTypeDefinition(); + if (genDef == typeof(List<>) || genDef == typeof(HashSet<>)) + { + var elemType = clrType.GetGenericArguments()[0]; + var (elemArrow, elemNullable) = InferArrowType(elemType); + return (new ListType(new Field("item", elemArrow, elemNullable)), true); + } + if (genDef == typeof(Dictionary<,>)) + { + var args = clrType.GetGenericArguments(); + var (keyArrow, _) = InferArrowType(args[0]); + var (valArrow, valNullable) = InferArrowType(args[1]); + return (new MapType(new Field("key", keyArrow, false), new Field("value", valArrow, valNullable)), true); + } + } + + // Check for [ArrowSerializable] types with source-generated IArrowSerializer<T> + var genSchema = GetGeneratedArrowSchema(clrType); + if (genSchema is not null) + { + var structFields = new List<Field>(genSchema.FieldsList); + return (new StructType(structFields), true); + } + + // Nested object type (anonymous, record, class, struct with readable properties) + if (clrType.IsClass || clrType.IsValueType) + { + var nestedProps = clrType.GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(p => p.CanRead) + .ToArray(); + if (nestedProps.Length > 0) + { + var nestedFields = nestedProps.Select(p => + { + var (ft, fn) = InferArrowType(p.PropertyType); + return new Field(p.Name, ft, fn); + }).ToList(); + return (new StructType(nestedFields), true); + } + } + + throw new NotSupportedException($"Cannot infer Arrow type for {clrType.FullName}"); + } + + /// <summary> + /// Check if a type implements IArrowSerializer<T> (i.e. 
has [ArrowSerializable] source-generated code) + /// and return its static ArrowSchema if so. + /// </summary> + private static Schema? GetGeneratedArrowSchema(Type clrType) + { + var iface = clrType.GetInterfaces() + .FirstOrDefault(i => i.IsGenericType && i.GetGenericTypeDefinition() == typeof(IArrowSerializer<>)); + if (iface is null) return null; + + var schemaProp = clrType.GetProperty("ArrowSchema", BindingFlags.Public | BindingFlags.Static | BindingFlags.FlattenHierarchy); + return schemaProp?.GetValue(null) as Schema; + } + + /// <summary> + /// Try to get the static ToRecordBatch(IReadOnlyList<T>) method from a source-generated type. + /// </summary> + private static MethodInfo? GetGeneratedToRecordBatchList(Type clrType) + { + var listType = typeof(IReadOnlyList<>).MakeGenericType(clrType); + return clrType.GetMethod("ToRecordBatch", BindingFlags.Public | BindingFlags.Static | BindingFlags.FlattenHierarchy, [listType]); + } + + private static IColumnBuilder CreateColumnBuilder(Type clrType, IArrowType arrowType) + { + var underlying = Nullable.GetUnderlyingType(clrType); + if (underlying is not null) + return CreateColumnBuilder(underlying, arrowType); // inner builders all handle null + + if (clrType == typeof(string)) return new StringColumnBuilder(); + if (clrType == typeof(bool)) return new BoolColumnBuilder(); + if (clrType == typeof(sbyte)) return new TypedColumnBuilder<sbyte, Int8Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(byte)) return new TypedColumnBuilder<byte, UInt8Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(short)) return new TypedColumnBuilder<short, Int16Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(ushort)) return new TypedColumnBuilder<ushort, UInt16Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == 
typeof(int)) return new TypedColumnBuilder<int, Int32Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(uint)) return new TypedColumnBuilder<uint, UInt32Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(long)) return new TypedColumnBuilder<long, Int64Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(ulong)) return new TypedColumnBuilder<ulong, UInt64Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(Half)) return new TypedColumnBuilder<Half, HalfFloatArray.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(float)) return new TypedColumnBuilder<float, FloatArray.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(double)) return new TypedColumnBuilder<double, DoubleArray.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(decimal)) return new DecimalColumnBuilder(); + if (clrType == typeof(DateTime)) return new DateTimeColumnBuilder(); + if (clrType == typeof(DateTimeOffset)) return new DateTimeOffsetColumnBuilder(); + if (clrType == typeof(DateOnly)) return new TypedColumnBuilder<DateOnly, Date32Array.Builder>(new(), (b, v) => b.Append(v), b => b.AppendNull(), b => b.Build()); + if (clrType == typeof(TimeOnly)) return new TimeOnlyColumnBuilder(); + if (clrType == typeof(TimeSpan)) return new TimeSpanColumnBuilder(); + if (clrType == typeof(Guid)) return new GuidColumnBuilder(); + if (clrType == typeof(byte[])) return new BinaryColumnBuilder(); + if (clrType == typeof(ReadOnlyMemory<byte>)) return new ReadOnlyMemoryByteColumnBuilder(); + if (clrType.IsEnum) return new EnumColumnBuilder(); + + // List<T>, T[], HashSet<T> → ListArray + if (arrowType is ListType listType) + { + var elemClrType = 
clrType.IsArray + ? clrType.GetElementType()! + : clrType.GetGenericArguments()[0]; + var elemBuilder = CreateColumnBuilder(elemClrType, listType.ValueDataType); + return new ListColumnBuilder(listType, elemClrType, elemBuilder); + } + + // Dictionary<K,V> → MapArray + if (arrowType is MapType mapType) + { + var args = clrType.GetGenericArguments(); + var keyBuilder = CreateColumnBuilder(args[0], mapType.KeyField.DataType); + var valBuilder = CreateColumnBuilder(args[1], mapType.ValueField.DataType); + return new MapColumnBuilder(mapType, args[0], args[1], keyBuilder, valBuilder); + } + + // Nested object → StructArray + if (arrowType is StructType structType) + { + // If the type has source-generated IArrowSerializer<T>, delegate to it + var toRecordBatchList = GetGeneratedToRecordBatchList(clrType); + if (toRecordBatchList is not null) + return new SourceGenStructColumnBuilder(clrType, structType, toRecordBatchList); + + // Otherwise, fall back to reflection-based struct builder + var nestedProps = clrType.GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(p => p.CanRead) + .ToArray(); + var childBuilders = new List<IColumnBuilder>(); + for (int i = 0; i < nestedProps.Length; i++) + { + var childArrowType = structType.Fields[i].DataType; + childBuilders.Add(CreateColumnBuilder(nestedProps[i].PropertyType, childArrowType)); + } + return new StructColumnBuilder(structType, nestedProps, childBuilders); + } + + throw new NotSupportedException($"Column builder not available for {clrType.FullName}"); + } + + // --- Column builder interface and implementations --- + + private interface IColumnBuilder + { + void Append(object? value); + IArrowArray Build(); + } + + private sealed class StringColumnBuilder : IColumnBuilder + { + private readonly StringArray.Builder _b = new(); + public void Append(object? 
value) { if (value is null) _b.AppendNull(); else _b.Append((string)value); } + public IArrowArray Build() => _b.Build(); + } + + private sealed class BoolColumnBuilder : IColumnBuilder + { + private readonly BooleanArray.Builder _b = new(); + public void Append(object? value) { if (value is null) _b.AppendNull(); else _b.Append((bool)value); } + public IArrowArray Build() => _b.Build(); + } + + private sealed class TypedColumnBuilder<T, TBuilder> : IColumnBuilder + where T : struct + where TBuilder : class + { + private readonly TBuilder _builder; + private readonly Action<TBuilder, T> _append; + private readonly Action<TBuilder> _appendNull; + private readonly Func<TBuilder, IArrowArray> _build; + + public TypedColumnBuilder(TBuilder builder, Action<TBuilder, T> append, + Action<TBuilder> appendNull, Func<TBuilder, IArrowArray> build) + { + _builder = builder; + _append = append; + _appendNull = appendNull; + _build = build; + } + + public void Append(object? value) + { + if (value is null) _appendNull(_builder); + else _append(_builder, (T)value); + } + + public IArrowArray Build() => _build(_builder); + } + + private sealed class DecimalColumnBuilder : IColumnBuilder + { + private readonly List<(decimal Value, bool IsNull)> _values = new(); + public void Append(object? value) + { + if (value is null) _values.Add((0, true)); + else _values.Add(((decimal)value, false)); + } + public IArrowArray Build() + { + var b = new Decimal128Array.Builder(new Decimal128Type(38, 18)); + foreach (var (v, isNull) in _values) + if (isNull) b.AppendNull(); else b.Append(v); + return b.Build(); + } + } + + private sealed class DateTimeColumnBuilder : IColumnBuilder + { + private readonly TimestampArray.Builder _b = new(new TimestampType(TimeUnit.Microsecond, "UTC")); + public void Append(object? 
value) + { + if (value is null) _b.AppendNull(); + else _b.Append(new DateTimeOffset((DateTime)value, TimeSpan.Zero)); + } + public IArrowArray Build() => _b.Build(); + } + + private sealed class DateTimeOffsetColumnBuilder : IColumnBuilder + { + private readonly TimestampArray.Builder _b = new(new TimestampType(TimeUnit.Microsecond, "UTC")); + public void Append(object? value) + { + if (value is null) _b.AppendNull(); + else _b.Append((DateTimeOffset)value); + } + public IArrowArray Build() => _b.Build(); + } + + private sealed class TimeOnlyColumnBuilder : IColumnBuilder + { + private readonly List<(TimeOnly Value, bool IsNull)> _values = new(); + public void Append(object? value) + { + if (value is null) _values.Add((default, true)); + else _values.Add(((TimeOnly)value, false)); + } + public IArrowArray Build() + { + var b = new Time64Array.Builder(new Time64Type(TimeUnit.Microsecond)); + foreach (var (v, isNull) in _values) + if (isNull) b.AppendNull(); else b.Append(v); + return b.Build(); + } + } + + private sealed class TimeSpanColumnBuilder : IColumnBuilder + { + private readonly List<(TimeSpan Value, bool IsNull)> _values = new(); + public void Append(object? value) + { + if (value is null) _values.Add((default, true)); + else _values.Add(((TimeSpan)value, false)); + } + public IArrowArray Build() + { + var b = new DurationArray.Builder(DurationType.Microsecond); + foreach (var (v, isNull) in _values) + if (isNull) b.AppendNull(); else b.Append(v); + return b.Build(); + } + } + + private sealed class GuidColumnBuilder : IColumnBuilder + { + private readonly GuidArray.Builder _b = new(); + public void Append(object? value) + { + if (value is null) _b.AppendNull(); + else _b.Append((Guid)value); + } + public IArrowArray Build() => _b.Build(); + } + + private sealed class BinaryColumnBuilder : IColumnBuilder + { + private readonly BinaryArray.Builder _b = new(); + public void Append(object? 
value) + { + if (value is null) _b.AppendNull(); + else _b.Append((ReadOnlySpan<byte>)(byte[])value); + } + public IArrowArray Build() => _b.Build(); + } + + private sealed class EnumColumnBuilder : IColumnBuilder + { + private readonly Dictionary<string, short> _dict = new(); + private readonly List<string?> _values = new(); + + public void Append(object? value) + { + if (value is null) { _values.Add(null); return; } + var name = value.ToString()!; + if (!_dict.ContainsKey(name)) + _dict[name] = (short)_dict.Count; + _values.Add(name); + } + + public IArrowArray Build() + { + var dictNames = _dict.OrderBy(kv => kv.Value).Select(kv => kv.Key).ToArray(); + var dictBuilder = new StringArray.Builder(); + foreach (var n in dictNames) dictBuilder.Append(n); + var dictArray = dictBuilder.Build(); + + var idxBuilder = new Int16Array.Builder(); + foreach (var v in _values) + { + if (v is null) idxBuilder.AppendNull(); + else idxBuilder.Append(_dict[v]); + } + + return new DictionaryArray( + new DictionaryType(Int16Type.Default, StringType.Default, false), + idxBuilder.Build(), dictArray); + } + } + + private sealed class ReadOnlyMemoryByteColumnBuilder : IColumnBuilder + { + private readonly BinaryArray.Builder _b = new(); + public void Append(object? value) + { + if (value is null) _b.AppendNull(); + else _b.Append(((ReadOnlyMemory<byte>)value).Span); + } + public IArrowArray Build() => _b.Build(); + } + + private sealed class ListColumnBuilder : IColumnBuilder + { + private readonly ListType _listType; + private readonly IColumnBuilder _elemBuilder; + private readonly List<int> _offsets = new() { 0 }; + private readonly List<bool> _validity = new(); + private int _totalElements; + + public ListColumnBuilder(ListType listType, Type elemClrType, IColumnBuilder elemBuilder) + { + _listType = listType; + _elemBuilder = elemBuilder; + } + + public void Append(object? 
value) + { + if (value is null) + { + _validity.Add(false); + _offsets.Add(_totalElements); + return; + } + + _validity.Add(true); + var enumerable = (System.Collections.IEnumerable)value; + foreach (var item in enumerable) + { + _elemBuilder.Append(item); + _totalElements++; + } + _offsets.Add(_totalElements); + } + + public IArrowArray Build() + { + var valueArray = _elemBuilder.Build(); + int length = _validity.Count; + int nullCount = _validity.Count(v => !v); + + var offsetBuffer = new ArrowBuffer( + _offsets.SelectMany(BitConverter.GetBytes).ToArray()); + + ArrowBuffer nullBitmap; + if (nullCount == 0) + { + nullBitmap = ArrowBuffer.Empty; + } + else + { + var bitmapBytes = new byte[(length + 7) / 8]; + for (int i = 0; i < length; i++) + if (_validity[i]) + bitmapBytes[i / 8] |= (byte)(1 << (i % 8)); + nullBitmap = new ArrowBuffer(bitmapBytes); + } + + var data = new ArrayData(_listType, length, nullCount, + 0, [nullBitmap, offsetBuffer], [valueArray.Data]); + return new ListArray(data); + } + } + + private sealed class MapColumnBuilder : IColumnBuilder + { + private readonly MapType _mapType; + private readonly IColumnBuilder _keyBuilder; + private readonly IColumnBuilder _valBuilder; + private readonly List<int> _offsets = new() { 0 }; + private readonly List<bool> _validity = new(); + private int _totalEntries; + + public MapColumnBuilder(MapType mapType, Type keyClrType, Type valClrType, + IColumnBuilder keyBuilder, IColumnBuilder valBuilder) + { + _mapType = mapType; + _keyBuilder = keyBuilder; + _valBuilder = valBuilder; + } + + public void Append(object? 
value) + { + if (value is null) + { + _validity.Add(false); + _offsets.Add(_totalEntries); + return; + } + + _validity.Add(true); + var dict = (System.Collections.IDictionary)value; + foreach (System.Collections.DictionaryEntry entry in dict) + { + _keyBuilder.Append(entry.Key); + _valBuilder.Append(entry.Value); + _totalEntries++; + } + _offsets.Add(_totalEntries); + } + + public IArrowArray Build() + { + var keyArray = _keyBuilder.Build(); + var valArray = _valBuilder.Build(); + int length = _validity.Count; + int nullCount = _validity.Count(v => !v); + + var offsetBuffer = new ArrowBuffer( + _offsets.SelectMany(BitConverter.GetBytes).ToArray()); + + ArrowBuffer nullBitmap; + if (nullCount == 0) + { + nullBitmap = ArrowBuffer.Empty; + } + else + { + var bitmapBytes = new byte[(length + 7) / 8]; + for (int i = 0; i < length; i++) + if (_validity[i]) + bitmapBytes[i / 8] |= (byte)(1 << (i % 8)); + nullBitmap = new ArrowBuffer(bitmapBytes); + } + + // MapArray's child is a StructArray of (key, value) entries + var entryType = new StructType(new List<Field> { _mapType.KeyField, _mapType.ValueField }); + var entryArray = new StructArray(entryType, _totalEntries, + new IArrowArray[] { keyArray, valArray }, ArrowBuffer.Empty, 0); + + var data = new ArrayData(_mapType, length, nullCount, + 0, [nullBitmap, offsetBuffer], [entryArray.Data]); + return new MapArray(data); + } + } + + /// <summary> + /// Column builder that delegates to source-generated ToRecordBatch(IReadOnlyList<T>) + /// for [ArrowSerializable] types, then wraps the RecordBatch columns into a StructArray. 
+ /// </summary> + private sealed class SourceGenStructColumnBuilder : IColumnBuilder + { + private readonly Type _clrType; + private readonly StructType _structType; + private readonly MethodInfo _toRecordBatchList; + private readonly List<object?> _items = new(); + + public SourceGenStructColumnBuilder(Type clrType, StructType structType, MethodInfo toRecordBatchList) + { + _clrType = clrType; + _structType = structType; + _toRecordBatchList = toRecordBatchList; + } + + public void Append(object? value) => _items.Add(value); + + public IArrowArray Build() + { + int length = _items.Count; + int nullCount = _items.Count(v => v is null); + + // Build a typed list for the source-generated method + var listType = typeof(List<>).MakeGenericType(_clrType); + var typedList = (System.Collections.IList)Activator.CreateInstance(listType, length)!; + + // For null slots, we need a stand-in value (first non-null item) + object? standIn = _items.FirstOrDefault(v => v is not null); + foreach (var item in _items) + typedList.Add(item ?? standIn!); Review Comment: The `SourceGenStructColumnBuilder` will crash when **all** items are null, because `standIn` will be `null` and the null-forgiving operator in `standIn!` on line 623 only suppresses the compiler warning; it performs no runtime check and doesn't actually prevent the issue — `typedList.Add(null)` will still be called, and for a `List<SomeValueType>` the non-generic `IList.Add(null)` throws `ArgumentNullException`. Consider throwing a descriptive exception or creating a default instance via `Activator.CreateInstance(_clrType)` as the stand-in when no non-null item exists. ########## src/Apache.Arrow.Serialization/Apache.Arrow.Serialization.csproj: ########## @@ -0,0 +1,20 @@ +<Project Sdk="Microsoft.NET.Sdk"> + + <PropertyGroup> + <TargetFramework>net8.0</TargetFramework> + <Nullable>enable</Nullable> + <ImplicitUsings>enable</ImplicitUsings> + <Description>Source-generated Apache Arrow serialization for .NET. 
Provides [ArrowSerializable] attribute and IArrowSerializer<T> interface for compile-time Arrow schema derivation, serialization, and deserialization.</Description> Review Comment: The `Apache.Arrow.Serialization` runtime library targets only `net8.0`, while the core `Apache.Arrow` library targets `netstandard2.0;net6.0;net8.0;net462`. This means consumers on .NET 6 or netstandard2.0 cannot use the serialization library, even though the core Arrow library supports them. Consider multi-targeting (at least `net6.0;net8.0`) or documenting this as a deliberate requirement. The `IArrowSerializer<T>` interface uses C# 11 static abstract interface members, which requires .NET 7+, so `net8.0` as a minimum is reasonable if that's the intent, but it should be documented. ########## src/Apache.Arrow.Serialization.Generator/PolymorphicCodeEmitter.cs: ########## @@ -0,0 +1,816 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System; +using System.Collections.Generic; +using System.Text; + +#nullable enable + +namespace Apache.Arrow.Serialization.Generator +{ + +internal class PolymorphicCodeEmitter +{ + private readonly StringBuilder _sb; + private readonly PolymorphicModel _model; + private int _indent; + + public PolymorphicCodeEmitter(StringBuilder sb, PolymorphicModel model) + { + _sb = sb; + _model = model; + } + + public void Emit() + { + Line("// <auto-generated/>"); + Line("#nullable enable"); + Line("#pragma warning disable CS8629 // Nullable value type may be null"); + Line(); + Line("using System.Collections.Generic;"); + Line("using System.Linq;"); + Line("using Apache.Arrow;"); + Line("using Apache.Arrow.Arrays;"); + Line("using Apache.Arrow.Types;"); + Line("using Apache.Arrow.Serialization;"); + Line(); + + if (_model.Namespace != null) + { + Line($"namespace {_model.Namespace};"); + Line(); + } + + var typeKeyword = _model.IsInterface ? "interface" : (_model.IsRecord ? "record" : "class"); + Line($"partial {typeKeyword} {_model.TypeName} : IArrowSerializer<{_model.TypeName}>"); + Line("{"); + _indent++; + + EmitSchemaField(); + Line(); + EmitSerialize(); + Line(); + EmitDeserialize(); + Line(); + EmitMultiRowSerialize(); + Line(); + EmitMultiRowDeserialize(); + + _indent--; + Line("}"); + } + + private void EmitSchemaField() + { + Line($"private static readonly Schema _arrowSchema = new Schema.Builder()"); + _indent++; + + // Discriminator field + Line($".Field(new Field(\"{Escape(_model.TypeDiscriminatorFieldName)}\", StringType.Default, false))"); + + // Union properties — all nullable + foreach (var prop in _model.UnionProperties) + { + var arrowType = CodeEmitterHelpers.GetArrowTypeExpression(prop.Type); + Line($".Field(new Field(\"{Escape(prop.FieldName)}\", {arrowType}, true))"); + } + + _indent--; + Line($".Build();"); + Line(); + Line($"public static Schema ArrowSchema => _arrowSchema;"); + } + + private void EmitSerialize() + { + Line($"public static 
RecordBatch ToRecordBatch({_model.TypeName} value)"); + Line("{"); + _indent++; + + // Build discriminator column + Line("var discriminatorBuilder = new StringArray.Builder();"); + + // Build union property builders + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + EmitBuilderDeclaration(prop, i); + } + + Line(); + // Switch on concrete type + Line("switch (value)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case {dt.FullTypeName} v_{dt.TypeName}:"); + Line("{"); + _indent++; + Line($"discriminatorBuilder.Append(\"{Escape(dt.TypeDiscriminator)}\");"); + + // For each union property, either append the value or null + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var unionProp = _model.UnionProperties[i]; + var derivedProp = FindProperty(dt, unionProp.FieldName); + if (derivedProp != null) + { + EmitAppendValue(derivedProp, i, $"v_{dt.TypeName}.{derivedProp.PropertyName}"); + } + else + { + EmitAppendNull(unionProp, i); + } + } + + Line("break;"); + _indent--; + Line("}"); + } + + Line($"default:"); + _indent++; + Line($"throw new System.ArgumentException($\"Unknown derived type: {{value.GetType().Name}}\");"); + _indent--; + + _indent--; + Line("}"); + + Line(); + // Build arrays and RecordBatch + Line("var columns = new IArrowArray[]"); + Line("{"); + _indent++; + Line("discriminatorBuilder.Build(),"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + EmitBuildColumn(i); + } + _indent--; + Line("};"); + Line("return new RecordBatch(_arrowSchema, columns, 1);"); + + _indent--; + Line("}"); + } + + private void EmitDeserialize() + { + Line($"public static {_model.TypeName} FromRecordBatch(RecordBatch batch)"); + Line("{"); + _indent++; + + Line("var discriminator = ((StringArray)batch.Column(0)).GetString(0)!;"); + Line("switch (discriminator)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case 
\"{Escape(dt.TypeDiscriminator)}\":"); + Line("{"); + _indent++; + + // Read each property for this derived type from the batch columns + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + var unionIndex = FindUnionPropertyIndex(prop.FieldName); + var colIndex = unionIndex + 1; // +1 for discriminator + EmitReadProperty(prop, i, colIndex); + } + + // Construct the derived type + Line($"return new {dt.FullTypeName}"); + Line("{"); + _indent++; + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + Line($"{prop.PropertyName} = prop_{i},"); + } + _indent--; + Line("};"); + + _indent--; + Line("}"); + } + + Line("default:"); + _indent++; + Line("throw new System.ArgumentException($\"Unknown type discriminator: {discriminator}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + } + + private void EmitMultiRowSerialize() + { + Line($"public static RecordBatch ToRecordBatch(IReadOnlyList<{_model.TypeName}> values)"); + Line("{"); + _indent++; + + Line("var discriminatorBuilder = new StringArray.Builder();"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + EmitBuilderDeclaration(prop, i); + } + + Line(); + Line("foreach (var value in values)"); + Line("{"); + _indent++; + + Line("switch (value)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case {dt.FullTypeName} v_{dt.TypeName}:"); + Line("{"); + _indent++; + Line($"discriminatorBuilder.Append(\"{Escape(dt.TypeDiscriminator)}\");"); + + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var unionProp = _model.UnionProperties[i]; + var derivedProp = FindProperty(dt, unionProp.FieldName); + if (derivedProp != null) + { + EmitAppendValue(derivedProp, i, $"v_{dt.TypeName}.{derivedProp.PropertyName}"); + } + else + { + EmitAppendNull(unionProp, i); + } + } + + Line("break;"); + _indent--; + Line("}"); + } + + Line($"default:"); + 
_indent++; + Line($"throw new System.ArgumentException($\"Unknown derived type: {{value.GetType().Name}}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + + Line(); + Line("var columns = new IArrowArray[]"); + Line("{"); + _indent++; + Line("discriminatorBuilder.Build(),"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + EmitBuildColumn(i); + } + _indent--; + Line("};"); + Line("return new RecordBatch(_arrowSchema, columns, values.Count);"); + + _indent--; + Line("}"); + } + + private void EmitBuildColumn(int index) + { + var prop = _model.UnionProperties[index]; + if (prop.Type.Kind == TypeKind2.Enum) + { + // Build DictionaryArray from index + dict builders + Line($"new DictionaryArray(new DictionaryType(Int16Type.Default, StringType.Default, false), bld_{index}_idx.Build(), new StringArray.Builder().AppendRange(bld_{index}_dict).Build()),"); + } + else + { + Line($"bld_{index}.Build(),"); + } + } + + private void EmitMultiRowDeserialize() + { + Line($"public static IReadOnlyList<{_model.TypeName}> ListFromRecordBatch(RecordBatch batch)"); + Line("{"); + _indent++; + + Line("var discriminatorCol = (StringArray)batch.Column(0);"); + + // Cast all union property columns + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + var colIndex = i + 1; + var castType = CodeEmitterHelpers.GetArrayCastType(prop); + Line($"var col_{i} = ({castType})batch.Column({colIndex});"); + } + + Line(); + Line($"var result = new List<{_model.TypeName}>(batch.Length);"); + Line("for (int row = 0; row < batch.Length; row++)"); + Line("{"); + _indent++; + + Line("var discriminator = discriminatorCol.GetString(row)!;"); + Line("switch (discriminator)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case \"{Escape(dt.TypeDiscriminator)}\":"); + Line("{"); + _indent++; + + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + var 
unionIndex = FindUnionPropertyIndex(prop.FieldName); + EmitMultiRowReadProperty(prop, i, unionIndex); + } + + Line($"result.Add(new {dt.FullTypeName}"); + Line("{"); + _indent++; + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + Line($"{prop.PropertyName} = prop_{i},"); + } + _indent--; + Line("});"); + Line("break;"); + + _indent--; + Line("}"); + } + + Line("default:"); + _indent++; + Line("throw new System.ArgumentException($\"Unknown type discriminator: {discriminator}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + Line("return result;"); + + _indent--; + Line("}"); + } + + // --- Helpers --- + + private void EmitBuilderDeclaration(PropertyModel prop, int index) + { + if (prop.Type.Kind == TypeKind2.Enum) + { + // Enum uses Dictionary(Int16, Utf8) — built manually + Line($"var bld_{index}_idx = new Int16Array.Builder();"); + Line($"var bld_{index}_dict = new List<string>();"); + } + else + { + var builderType = CodeEmitterHelpers.GetNullableBuilderDeclaration(prop); + Line($"var bld_{index} = {builderType};"); + } + } + + private void EmitAppendValue(PropertyModel prop, int index, string access) + { + // For non-nullable properties in the derived type, we still need to handle them + // as nullable in the union schema + switch (prop.Type.Kind) + { + case TypeKind2.String: + if (prop.IsNullable) + Line($"if ({access} != null) bld_{index}.Append({access}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Bool: + case TypeKind2.Byte: + case TypeKind2.SByte: + case TypeKind2.Int16: + case TypeKind2.UInt16: + case TypeKind2.Int32: + case TypeKind2.UInt32: + case TypeKind2.Int64: + case TypeKind2.UInt64: + case TypeKind2.Float: + case TypeKind2.Double: + case TypeKind2.Half: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + 
Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.DateTime: + case TypeKind2.DateTimeOffset: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(Apache.Arrow.Serialization.ArrowArrayHelper.ToUtcDateTimeOffset(v_{index})); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append(Apache.Arrow.Serialization.ArrowArrayHelper.ToUtcDateTimeOffset({access}));"); + break; + case TypeKind2.DateOnly: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Decimal: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Enum: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) {{ bld_{index}_idx.Append((short)bld_{index}_dict.Count); bld_{index}_dict.Add(v_{index}.ToString()); }} else bld_{index}_idx.AppendNull();"); + else + { + Line($"bld_{index}_idx.Append((short)bld_{index}_dict.Count);"); + Line($"bld_{index}_dict.Add({access}.ToString());"); + } Review Comment: The polymorphic enum serialization always appends a new dictionary entry per value, even for duplicate enum names. On line 459, `bld_{index}_dict.Add({access}.ToString())` is called for every non-null value, so the dictionary accumulates one entry per row rather than one entry per distinct enum name. Each index still points at the entry that was just added, but the dictionary is never deduplicated, and the `(short)` cast of `bld_{index}_dict.Count` can wrap once `Count` exceeds `short.MaxValue`, at which point the emitted index values become incorrect. The dictionary-based approach used in `EnumColumnBuilder` (tracking seen values with a `Dictionary<string, short>`) should be used here instead, so that repeated enum values share the same index. ########## src/Apache.Arrow.Serialization.Generator/PolymorphicCodeEmitter.cs: ########## @@ -0,0 +1,816 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.Text; + +#nullable enable + +namespace Apache.Arrow.Serialization.Generator +{ + +internal class PolymorphicCodeEmitter +{ + private readonly StringBuilder _sb; + private readonly PolymorphicModel _model; + private int _indent; + + public PolymorphicCodeEmitter(StringBuilder sb, PolymorphicModel model) + { + _sb = sb; + _model = model; + } + + public void Emit() + { + Line("// <auto-generated/>"); + Line("#nullable enable"); + Line("#pragma warning disable CS8629 // Nullable value type may be null"); + Line(); + Line("using System.Collections.Generic;"); + Line("using System.Linq;"); + Line("using Apache.Arrow;"); + Line("using Apache.Arrow.Arrays;"); + Line("using Apache.Arrow.Types;"); + Line("using Apache.Arrow.Serialization;"); + Line(); + + if (_model.Namespace != null) + { + Line($"namespace {_model.Namespace};"); + Line(); + } + + var typeKeyword = _model.IsInterface ? "interface" : (_model.IsRecord ? 
"record" : "class"); + Line($"partial {typeKeyword} {_model.TypeName} : IArrowSerializer<{_model.TypeName}>"); + Line("{"); + _indent++; + + EmitSchemaField(); + Line(); + EmitSerialize(); + Line(); + EmitDeserialize(); + Line(); + EmitMultiRowSerialize(); + Line(); + EmitMultiRowDeserialize(); + + _indent--; + Line("}"); + } + + private void EmitSchemaField() + { + Line($"private static readonly Schema _arrowSchema = new Schema.Builder()"); + _indent++; + + // Discriminator field + Line($".Field(new Field(\"{Escape(_model.TypeDiscriminatorFieldName)}\", StringType.Default, false))"); + + // Union properties — all nullable + foreach (var prop in _model.UnionProperties) + { + var arrowType = CodeEmitterHelpers.GetArrowTypeExpression(prop.Type); + Line($".Field(new Field(\"{Escape(prop.FieldName)}\", {arrowType}, true))"); + } + + _indent--; + Line($".Build();"); + Line(); + Line($"public static Schema ArrowSchema => _arrowSchema;"); + } + + private void EmitSerialize() + { + Line($"public static RecordBatch ToRecordBatch({_model.TypeName} value)"); + Line("{"); + _indent++; + + // Build discriminator column + Line("var discriminatorBuilder = new StringArray.Builder();"); + + // Build union property builders + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + EmitBuilderDeclaration(prop, i); + } + + Line(); + // Switch on concrete type + Line("switch (value)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case {dt.FullTypeName} v_{dt.TypeName}:"); + Line("{"); + _indent++; + Line($"discriminatorBuilder.Append(\"{Escape(dt.TypeDiscriminator)}\");"); + + // For each union property, either append the value or null + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var unionProp = _model.UnionProperties[i]; + var derivedProp = FindProperty(dt, unionProp.FieldName); + if (derivedProp != null) + { + EmitAppendValue(derivedProp, i, 
$"v_{dt.TypeName}.{derivedProp.PropertyName}"); + } + else + { + EmitAppendNull(unionProp, i); + } + } + + Line("break;"); + _indent--; + Line("}"); + } + + Line($"default:"); + _indent++; + Line($"throw new System.ArgumentException($\"Unknown derived type: {{value.GetType().Name}}\");"); + _indent--; + + _indent--; + Line("}"); + + Line(); + // Build arrays and RecordBatch + Line("var columns = new IArrowArray[]"); + Line("{"); + _indent++; + Line("discriminatorBuilder.Build(),"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + EmitBuildColumn(i); + } + _indent--; + Line("};"); + Line("return new RecordBatch(_arrowSchema, columns, 1);"); + + _indent--; + Line("}"); + } + + private void EmitDeserialize() + { + Line($"public static {_model.TypeName} FromRecordBatch(RecordBatch batch)"); + Line("{"); + _indent++; + + Line("var discriminator = ((StringArray)batch.Column(0)).GetString(0)!;"); + Line("switch (discriminator)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case \"{Escape(dt.TypeDiscriminator)}\":"); + Line("{"); + _indent++; + + // Read each property for this derived type from the batch columns + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + var unionIndex = FindUnionPropertyIndex(prop.FieldName); + var colIndex = unionIndex + 1; // +1 for discriminator + EmitReadProperty(prop, i, colIndex); + } + + // Construct the derived type + Line($"return new {dt.FullTypeName}"); + Line("{"); + _indent++; + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + Line($"{prop.PropertyName} = prop_{i},"); + } + _indent--; + Line("};"); + + _indent--; + Line("}"); + } + + Line("default:"); + _indent++; + Line("throw new System.ArgumentException($\"Unknown type discriminator: {discriminator}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + } + + private void EmitMultiRowSerialize() + { + Line($"public static RecordBatch 
ToRecordBatch(IReadOnlyList<{_model.TypeName}> values)"); + Line("{"); + _indent++; + + Line("var discriminatorBuilder = new StringArray.Builder();"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + EmitBuilderDeclaration(prop, i); + } + + Line(); + Line("foreach (var value in values)"); + Line("{"); + _indent++; + + Line("switch (value)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case {dt.FullTypeName} v_{dt.TypeName}:"); + Line("{"); + _indent++; + Line($"discriminatorBuilder.Append(\"{Escape(dt.TypeDiscriminator)}\");"); + + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var unionProp = _model.UnionProperties[i]; + var derivedProp = FindProperty(dt, unionProp.FieldName); + if (derivedProp != null) + { + EmitAppendValue(derivedProp, i, $"v_{dt.TypeName}.{derivedProp.PropertyName}"); + } + else + { + EmitAppendNull(unionProp, i); + } + } + + Line("break;"); + _indent--; + Line("}"); + } + + Line($"default:"); + _indent++; + Line($"throw new System.ArgumentException($\"Unknown derived type: {{value.GetType().Name}}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + + Line(); + Line("var columns = new IArrowArray[]"); + Line("{"); + _indent++; + Line("discriminatorBuilder.Build(),"); + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + EmitBuildColumn(i); + } + _indent--; + Line("};"); + Line("return new RecordBatch(_arrowSchema, columns, values.Count);"); + + _indent--; + Line("}"); + } + + private void EmitBuildColumn(int index) + { + var prop = _model.UnionProperties[index]; + if (prop.Type.Kind == TypeKind2.Enum) + { + // Build DictionaryArray from index + dict builders + Line($"new DictionaryArray(new DictionaryType(Int16Type.Default, StringType.Default, false), bld_{index}_idx.Build(), new StringArray.Builder().AppendRange(bld_{index}_dict).Build()),"); + } + else + { + Line($"bld_{index}.Build(),"); + } + } + + 
private void EmitMultiRowDeserialize() + { + Line($"public static IReadOnlyList<{_model.TypeName}> ListFromRecordBatch(RecordBatch batch)"); + Line("{"); + _indent++; + + Line("var discriminatorCol = (StringArray)batch.Column(0);"); + + // Cast all union property columns + for (int i = 0; i < _model.UnionProperties.Count; i++) + { + var prop = _model.UnionProperties[i]; + var colIndex = i + 1; + var castType = CodeEmitterHelpers.GetArrayCastType(prop); + Line($"var col_{i} = ({castType})batch.Column({colIndex});"); + } + + Line(); + Line($"var result = new List<{_model.TypeName}>(batch.Length);"); + Line("for (int row = 0; row < batch.Length; row++)"); + Line("{"); + _indent++; + + Line("var discriminator = discriminatorCol.GetString(row)!;"); + Line("switch (discriminator)"); + Line("{"); + _indent++; + + foreach (var dt in _model.DerivedTypes) + { + Line($"case \"{Escape(dt.TypeDiscriminator)}\":"); + Line("{"); + _indent++; + + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + var unionIndex = FindUnionPropertyIndex(prop.FieldName); + EmitMultiRowReadProperty(prop, i, unionIndex); + } + + Line($"result.Add(new {dt.FullTypeName}"); + Line("{"); + _indent++; + for (int i = 0; i < dt.Properties.Count; i++) + { + var prop = dt.Properties[i]; + Line($"{prop.PropertyName} = prop_{i},"); + } + _indent--; + Line("});"); + Line("break;"); + + _indent--; + Line("}"); + } + + Line("default:"); + _indent++; + Line("throw new System.ArgumentException($\"Unknown type discriminator: {discriminator}\");"); + _indent--; + + _indent--; + Line("}"); + + _indent--; + Line("}"); + Line("return result;"); + + _indent--; + Line("}"); + } + + // --- Helpers --- + + private void EmitBuilderDeclaration(PropertyModel prop, int index) + { + if (prop.Type.Kind == TypeKind2.Enum) + { + // Enum uses Dictionary(Int16, Utf8) — built manually + Line($"var bld_{index}_idx = new Int16Array.Builder();"); + Line($"var bld_{index}_dict = new List<string>();"); + } + 
else + { + var builderType = CodeEmitterHelpers.GetNullableBuilderDeclaration(prop); + Line($"var bld_{index} = {builderType};"); + } + } + + private void EmitAppendValue(PropertyModel prop, int index, string access) + { + // For non-nullable properties in the derived type, we still need to handle them + // as nullable in the union schema + switch (prop.Type.Kind) + { + case TypeKind2.String: + if (prop.IsNullable) + Line($"if ({access} != null) bld_{index}.Append({access}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Bool: + case TypeKind2.Byte: + case TypeKind2.SByte: + case TypeKind2.Int16: + case TypeKind2.UInt16: + case TypeKind2.Int32: + case TypeKind2.UInt32: + case TypeKind2.Int64: + case TypeKind2.UInt64: + case TypeKind2.Float: + case TypeKind2.Double: + case TypeKind2.Half: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.DateTime: + case TypeKind2.DateTimeOffset: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(Apache.Arrow.Serialization.ArrowArrayHelper.ToUtcDateTimeOffset(v_{index})); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append(Apache.Arrow.Serialization.ArrowArrayHelper.ToUtcDateTimeOffset({access}));"); + break; + case TypeKind2.DateOnly: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Decimal: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) bld_{index}.Append(v_{index}); else bld_{index}.AppendNull();"); + else + Line($"bld_{index}.Append({access});"); + break; + case TypeKind2.Enum: + if (prop.IsNullable) + Line($"if ({access} is {{ }} v_{index}) {{ 
bld_{index}_idx.Append((short)bld_{index}_dict.Count); bld_{index}_dict.Add(v_{index}.ToString()); }} else bld_{index}_idx.AppendNull();"); + else + { + Line($"bld_{index}_idx.Append((short)bld_{index}_dict.Count);"); + Line($"bld_{index}_dict.Add({access}.ToString());"); + } + break; + default: + // For complex types (Binary, Guid, TimeOnly, TimeSpan, nested, collections), + // fall back to AppendNull for now — these are less common in polymorphic scenarios + Line($"bld_{index}.AppendNull(); // TODO: complex type {prop.Type.Kind}"); + break; + } + } + + private void EmitAppendNull(PropertyModel prop, int index) + { + if (prop.Type.Kind == TypeKind2.Enum) + Line($"bld_{index}_idx.AppendNull();"); + else + Line($"bld_{index}.AppendNull();"); + } + + private void EmitReadProperty(PropertyModel prop, int propIndex, int colIndex) + { + var col = $"batch.Column({colIndex})"; + switch (prop.Type.Kind) + { + case TypeKind2.String: + if (prop.IsNullable) + Line($"var prop_{propIndex} = ((StringArray){col}).IsNull(0) ? null : ((StringArray){col}).GetString(0);"); + else + Line($"var prop_{propIndex} = ((StringArray){col}).GetString(0)!;"); + break; + case TypeKind2.Bool: + case TypeKind2.Byte: + case TypeKind2.SByte: + case TypeKind2.Int16: + case TypeKind2.UInt16: + case TypeKind2.Int32: + case TypeKind2.UInt32: + case TypeKind2.Int64: + case TypeKind2.UInt64: + case TypeKind2.Float: + case TypeKind2.Double: + case TypeKind2.Half: + { + var castType = CodeEmitterHelpers.GetArrayCastType(prop); + if (prop.IsNullable) + Line($"var prop_{propIndex} = (({castType}){col}).IsNull(0) ? ({prop.Type.FullTypeName}?)null : (({castType}){col}).GetValue(0).Value;"); + else + Line($"var prop_{propIndex} = (({castType}){col}).GetValue(0).Value;"); + break; + } + case TypeKind2.DateTime: + { + if (prop.IsNullable) + Line($"var prop_{propIndex} = ((TimestampArray){col}).IsNull(0) ? 
(System.DateTime?)null : ((TimestampArray){col}).GetTimestamp(0)!.Value.UtcDateTime;"); + else + Line($"var prop_{propIndex} = ((TimestampArray){col}).GetTimestamp(0)!.Value.UtcDateTime;"); + break; + } + case TypeKind2.DateTimeOffset: + { + if (prop.IsNullable) + Line($"var prop_{propIndex} = ((TimestampArray){col}).IsNull(0) ? (System.DateTimeOffset?)null : ((TimestampArray){col}).GetTimestamp(0)!.Value;"); + else + Line($"var prop_{propIndex} = ((TimestampArray){col}).GetTimestamp(0)!.Value;"); + break; + } + case TypeKind2.DateOnly: + { + var castType = CodeEmitterHelpers.GetArrayCastType(prop); + if (prop.IsNullable) + Line($"var prop_{propIndex} = (({castType}){col}).IsNull(0) ? (System.DateOnly?)null : (({castType}){col}).GetDateOnly(0);"); + else + Line($"var prop_{propIndex} = (({castType}){col}).GetDateOnly(0)!.Value;"); + break; + } + case TypeKind2.Decimal: + { + if (prop.IsNullable) + Line($"var prop_{propIndex} = ((Decimal128Array){col}).IsNull(0) ? (decimal?)null : ((Decimal128Array){col}).GetValue(0).Value;"); + else + Line($"var prop_{propIndex} = ((Decimal128Array){col}).GetValue(0)!.Value;"); + break; + } + case TypeKind2.Enum: + { + var enumType = prop.Type.FullTypeName; + if (prop.IsNullable) + { + Line($"{enumType}? prop_{propIndex} = null;"); + Line($"{{ var da = (DictionaryArray){col}; if (!da.IsNull(0)) {{ var idx = ((Int16Array)da.Indices).GetValue(0).Value; var name = ((StringArray)da.Dictionary).GetString(idx); prop_{propIndex} = System.Enum.Parse<{enumType}>(name!); }} }}"); + } + else + { + Line($"var da_{propIndex} = (DictionaryArray){col};"); + Line($"var idx_{propIndex} = ((Int16Array)da_{propIndex}.Indices).GetValue(0).Value;"); + Line($"var prop_{propIndex} = System.Enum.Parse<{enumType}>(((StringArray)da_{propIndex}.Dictionary).GetString(idx_{propIndex})!);"); + } + break; + } + default: + Line($"object? 
prop_{propIndex} = null; // TODO: unsupported type {prop.Type.Kind}"); + break; + } + } + + private void EmitMultiRowReadProperty(PropertyModel prop, int propIndex, int unionIndex) + { + var col = $"col_{unionIndex}"; + switch (prop.Type.Kind) + { + case TypeKind2.String: + if (prop.IsNullable) + Line($"var prop_{propIndex} = {col}.IsNull(row) ? null : {col}.GetString(row);"); + else + Line($"var prop_{propIndex} = {col}.GetString(row)!;"); + break; + case TypeKind2.Bool: + case TypeKind2.Byte: + case TypeKind2.SByte: + case TypeKind2.Int16: + case TypeKind2.UInt16: + case TypeKind2.Int32: + case TypeKind2.UInt32: + case TypeKind2.Int64: + case TypeKind2.UInt64: + case TypeKind2.Float: + case TypeKind2.Double: + case TypeKind2.Half: + if (prop.IsNullable) + Line($"var prop_{propIndex} = {col}.IsNull(row) ? ({prop.Type.FullTypeName}?)null : {col}.GetValue(row).Value;"); + else + Line($"var prop_{propIndex} = {col}.GetValue(row).Value;"); + break; + case TypeKind2.DateTime: + if (prop.IsNullable) + Line($"var prop_{propIndex} = {col}.IsNull(row) ? (System.DateTime?)null : {col}.GetTimestamp(row)!.Value.UtcDateTime;"); + else + Line($"var prop_{propIndex} = {col}.GetTimestamp(row)!.Value.UtcDateTime;"); + break; + case TypeKind2.DateTimeOffset: + if (prop.IsNullable) + Line($"var prop_{propIndex} = {col}.IsNull(row) ? (System.DateTimeOffset?)null : {col}.GetTimestamp(row)!.Value;"); + else + Line($"var prop_{propIndex} = {col}.GetTimestamp(row)!.Value;"); + break; + case TypeKind2.DateOnly: + if (prop.IsNullable) + Line($"var prop_{propIndex} = {col}.IsNull(row) ? (System.DateOnly?)null : {col}.GetDateOnly(row);"); + else + Line($"var prop_{propIndex} = {col}.GetDateOnly(row)!.Value;"); + break; + case TypeKind2.Decimal: + if (prop.IsNullable) + Line($"var prop_{propIndex} = {col}.IsNull(row) ? 
(decimal?)null : {col}.GetValue(row).Value;"); + else + Line($"var prop_{propIndex} = {col}.GetValue(row)!.Value;"); + break; + case TypeKind2.Enum: + { + var enumType = prop.Type.FullTypeName; + if (prop.IsNullable) + { + Line($"{enumType}? prop_{propIndex} = null;"); + Line($"if (!{col}.IsNull(row)) {{ var idx = ((Int16Array){col}.Indices).GetValue(row).Value; var name = ((StringArray){col}.Dictionary).GetString(idx); prop_{propIndex} = System.Enum.Parse<{enumType}>(name!); }}"); + } + else + { + Line($"var idx_{propIndex} = ((Int16Array){col}.Indices).GetValue(row).Value;"); + Line($"var prop_{propIndex} = System.Enum.Parse<{enumType}>(((StringArray){col}.Dictionary).GetString(idx_{propIndex})!);"); + } + break; + } + default: + Line($"object? prop_{propIndex} = null; // TODO: unsupported type {prop.Type.Kind}"); Review Comment: Similarly, the `default` case in the multi-row read path (and single-row read path at line 560) sets `prop_{propIndex}` to `null` with a `// TODO` comment for unsupported types. This means deserialization will silently produce null values for Guid, TimeOnly, TimeSpan, Binary, nested records, and collections in polymorphic types, potentially causing `NullReferenceException` downstream when the property is non-nullable. This should throw `NotSupportedException` instead. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
