scovich commented on code in PR #8348:
URL: https://github.com/apache/arrow-rs/pull/8348#discussion_r2357882638


##########
arrow-avro/src/codec.rs:
##########
@@ -401,6 +423,14 @@ impl AvroDataType {
                     ))
                 }
             },
+            Codec::Union(encodings, _, _) => {
+                if encodings.is_empty() {
+                    return Err(ArrowError::SchemaError(
+                        "Union with no branches cannot have a 
default".to_string(),
+                    ));
+                }
+                encodings[0].parse_default_literal(default_json)?
+            }

Review Comment:
   nit
   ```suggestion
                   let Some(default_encoding) = encodings.first() else {
                       return Err(ArrowError::SchemaError(
                           "Union with no branches cannot have a 
default".to_string(),
                       ));
                   };
                   default_encoding.parse_default_literal(default_json)?
               }
   ```



##########
arrow-avro/src/codec.rs:
##########
@@ -267,6 +284,11 @@ impl AvroDataType {
         if default_json.is_null() {
             return match self.codec() {
                 Codec::Null => Ok(AvroLiteral::Null),
+                Codec::Union(encodings, _, _) if !encodings.is_empty()
+                    && matches!(encodings[0].codec(), Codec::Null) =>
+                    {
+                        Ok(AvroLiteral::Null)
+                    }

Review Comment:
   aside: that is some funky formatting, but I guess it's what `fmt` produced?



##########
arrow-avro/src/codec.rs:
##########
@@ -915,6 +1020,76 @@ fn nullable_union_variants<'x, 'y>(
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+enum UnionBranchKey {
+    Named(String),
+    Primitive(PrimitiveType),
+    Array,
+    Map,
+}
+
+fn branch_key_of<'a>(s: &Schema<'a>, enclosing_ns: Option<&'a str>) -> 
Option<UnionBranchKey> {
+    match s {
+        // Primitives
+        Schema::TypeName(TypeName::Primitive(p)) => 
Some(UnionBranchKey::Primitive(*p)),
+        Schema::Type(Type {
+            r#type: TypeName::Primitive(p),
+            ..
+        }) => Some(UnionBranchKey::Primitive(*p)),
+        // Named references
+        Schema::TypeName(TypeName::Ref(name)) => {
+            let (full, _) = make_full_name(name, None, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        Schema::Type(Type {
+            r#type: TypeName::Ref(name),
+            ..
+        }) => {
+            let (full, _) = make_full_name(name, None, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        // Complex non‑named
+        Schema::Complex(ComplexType::Array(_)) => Some(UnionBranchKey::Array),
+        Schema::Complex(ComplexType::Map(_)) => Some(UnionBranchKey::Map),
+        // Inline named definitions
+        Schema::Complex(ComplexType::Record(r)) => {
+            let (full, _) = make_full_name(r.name, r.namespace, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        Schema::Complex(ComplexType::Enum(e)) => {
+            let (full, _) = make_full_name(e.name, e.namespace, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        Schema::Complex(ComplexType::Fixed(f)) => {
+            let (full, _) = make_full_name(f.name, f.namespace, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }

Review Comment:
   qq: Would it be cleaner -- or not -- to rearrange this match as follows?
   ```rust
   let (name, namespace) = match s {
       Schema::TypeName(TypeName::Primitive(p))
       | Schema::Type(Type {
           r#type: TypeName::Primitive(p),
           ..
       }) => return Some(UnionBranchKey::Primitive(*p)),
       Schema::TypeName(TypeName::Ref(name))
       | Schema::Type(Type {
           r#type: TypeName::Ref(name),
           ..
       }) => (name, None),
       Schema::Complex(ComplexType::Array(_)) => return 
Some(UnionBranchKey::Array),
       Schema::Complex(ComplexType::Map(_)) => return Some(UnionBranchKey::Map),
       Schema::Complex(ComplexType::Record(r)) => (r.name, r.namespace),
       Schema::Complex(ComplexType::Enum(e)) => (e.name, e.namespace),
       Schema::Complex(ComplexType::Fixed(f)) => (f.name, f.namespace),
       Schema::Union(_) => return None,
   };
   
   let (full, _) = make_full_name(name, namespace, enclosing_ns);
   Some(UnionBranchKey::Named(full))
   ```



##########
arrow-avro/src/codec.rs:
##########
@@ -1149,6 +1345,67 @@ impl<'a> Maker<'a> {
             return self.resolve_primitives(write_primitive, read_primitive, 
reader_schema);
         }
         match (writer_schema, reader_schema) {
+            (Schema::Union(writer_variants), Schema::Union(reader_variants)) 
=> {
+                match (
+                    nullable_union_variants(writer_variants.as_slice()),
+                    nullable_union_variants(reader_variants.as_slice()),
+                ) {
+                    (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => {
+                        let mut dt = self.make_data_type(w_nonnull, 
Some(r_nonnull), namespace)?;
+                        dt.nullability = Some(w_nb);
+                        Ok(dt)
+                    }
+                    _ => self.resolve_unions(
+                        writer_variants.as_slice(),
+                        reader_variants.as_slice(),
+                        namespace,
+                    ),
+                }
+            }
+            (Schema::Union(writer_variants), reader_non_union) => {
+                let mut writer_to_reader: Vec<Option<(usize, Promotion)>> =
+                    Vec::with_capacity(writer_variants.len());
+                for writer in writer_variants {
+                    match self.resolve_type(writer, reader_non_union, 
namespace) {
+                        Ok(tmp) => writer_to_reader.push(Some((0usize, 
Self::coercion_from(&tmp)))),
+                        Err(_) => writer_to_reader.push(None),
+                    }
+                }
+                let mut dt = self.parse_type(reader_non_union, namespace)?;
+                dt.resolution = Some(ResolutionInfo::Union(ResolvedUnion {
+                    writer_to_reader: Arc::from(writer_to_reader),

Review Comment:
   nit
   ```suggestion
                   let writer_to_reader = 
writer_variants.iter().filter_map(|writer| {
                       let tmp = self.resolve_type(writer, reader_non_union, 
namespace).ok()?;
                       Some((0usize, Self::coercion_from(&tmp))))
                   }));
                   let mut dt = self.parse_type(reader_non_union, namespace)?;
                   dt.resolution = Some(ResolutionInfo::Union(ResolvedUnion {
                       writer_to_reader: Arc::from(writer_to_reader.collect()),
   ```



##########
arrow-avro/src/schema.rs:
##########
@@ -970,13 +973,59 @@ fn merge_extras(schema: Value, mut extras: 
JsonMap<String, Value>) -> Value {
     }
 }
 
+#[inline]
+fn is_avro_json_null(v: &Value) -> bool {
+    matches!(v, Value::String(s) if s == "null")
+}
+
 fn wrap_nullable(inner: Value, null_order: Nullability) -> Value {
     let null = Value::String("null".into());
-    let elements = match null_order {
-        Nullability::NullFirst => vec![null, inner],
-        Nullability::NullSecond => vec![inner, null],
-    };
-    Value::Array(elements)
+    match inner {
+        Value::Array(mut union) => {
+            union.retain(|v| !is_avro_json_null(v));
+            match null_order {
+                Nullability::NullFirst => {
+                    let mut out = Vec::with_capacity(union.len() + 1);
+                    out.push(null);
+                    out.extend(union);
+                    Value::Array(out)
+                }
+                Nullability::NullSecond => {
+                    union.push(null);
+                    Value::Array(union)
+                }
+            }

Review Comment:
   ```suggestion
               match null_order {
                   Nullability::NullFirst => union.insert(0, null),
                   Nullability::NullSecond => union.push(null),
               }
               Value::Array(union)
   ```
   
   (I guess it should really be called `Nullability::NullLast`?)



##########
arrow-avro/src/schema.rs:
##########
@@ -970,13 +973,59 @@ fn merge_extras(schema: Value, mut extras: 
JsonMap<String, Value>) -> Value {
     }
 }
 
+#[inline]
+fn is_avro_json_null(v: &Value) -> bool {
+    matches!(v, Value::String(s) if s == "null")
+}
+
 fn wrap_nullable(inner: Value, null_order: Nullability) -> Value {
     let null = Value::String("null".into());
-    let elements = match null_order {
-        Nullability::NullFirst => vec![null, inner],
-        Nullability::NullSecond => vec![inner, null],
-    };
-    Value::Array(elements)
+    match inner {
+        Value::Array(mut union) => {
+            union.retain(|v| !is_avro_json_null(v));
+            match null_order {
+                Nullability::NullFirst => {
+                    let mut out = Vec::with_capacity(union.len() + 1);
+                    out.push(null);
+                    out.extend(union);
+                    Value::Array(out)
+                }
+                Nullability::NullSecond => {
+                    union.push(null);
+                    Value::Array(union)
+                }
+            }
+        }
+        other => match null_order {
+            Nullability::NullFirst => Value::Array(vec![null, other]),
+            Nullability::NullSecond => Value::Array(vec![other, null]),
+        },
+    }
+}
+
+fn union_branch_signature(branch: &Value) -> Result<String, ArrowError> {
+    match branch {
+        Value::String(t) => Ok(format!("P:{t}")),
+        Value::Object(map) => {
+            let t = map.get("type").and_then(|v| v.as_str()).ok_or_else(|| {
+                ArrowError::SchemaError("Union branch object missing string 
'type'".into())
+            })?;
+            match t {
+                "record" | "enum" | "fixed" => {
+                    let name = map.get("name").and_then(|v| 
v.as_str()).unwrap_or_default();

Review Comment:
   What is the default `&str`, out of curiosity?



##########
arrow-avro/src/codec.rs:
##########
@@ -915,6 +1020,76 @@ fn nullable_union_variants<'x, 'y>(
     }
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+enum UnionBranchKey {
+    Named(String),
+    Primitive(PrimitiveType),
+    Array,
+    Map,
+}
+
+fn branch_key_of<'a>(s: &Schema<'a>, enclosing_ns: Option<&'a str>) -> 
Option<UnionBranchKey> {
+    match s {
+        // Primitives
+        Schema::TypeName(TypeName::Primitive(p)) => 
Some(UnionBranchKey::Primitive(*p)),
+        Schema::Type(Type {
+            r#type: TypeName::Primitive(p),
+            ..
+        }) => Some(UnionBranchKey::Primitive(*p)),
+        // Named references
+        Schema::TypeName(TypeName::Ref(name)) => {
+            let (full, _) = make_full_name(name, None, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        Schema::Type(Type {
+            r#type: TypeName::Ref(name),
+            ..
+        }) => {
+            let (full, _) = make_full_name(name, None, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        // Complex non‑named
+        Schema::Complex(ComplexType::Array(_)) => Some(UnionBranchKey::Array),
+        Schema::Complex(ComplexType::Map(_)) => Some(UnionBranchKey::Map),
+        // Inline named definitions
+        Schema::Complex(ComplexType::Record(r)) => {
+            let (full, _) = make_full_name(r.name, r.namespace, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        Schema::Complex(ComplexType::Enum(e)) => {
+            let (full, _) = make_full_name(e.name, e.namespace, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        Schema::Complex(ComplexType::Fixed(f)) => {
+            let (full, _) = make_full_name(f.name, f.namespace, enclosing_ns);
+            Some(UnionBranchKey::Named(full))
+        }
+        // Unions are validated separately (and disallowed as immediate 
branches)
+        Schema::Union(_) => None,
+    }
+}
+
+fn union_first_duplicate<'a>(
+    branches: &'a [Schema<'a>],
+    enclosing_ns: Option<&'a str>,
+) -> Option<String> {
+    let mut seen: HashSet<UnionBranchKey> = 
HashSet::with_capacity(branches.len());

Review Comment:
   nit: type annotation shouldn't be necessary?



##########
arrow-avro/src/codec.rs:
##########
@@ -1149,6 +1345,67 @@ impl<'a> Maker<'a> {
             return self.resolve_primitives(write_primitive, read_primitive, 
reader_schema);
         }
         match (writer_schema, reader_schema) {
+            (Schema::Union(writer_variants), Schema::Union(reader_variants)) 
=> {
+                match (
+                    nullable_union_variants(writer_variants.as_slice()),
+                    nullable_union_variants(reader_variants.as_slice()),
+                ) {
+                    (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => {
+                        let mut dt = self.make_data_type(w_nonnull, 
Some(r_nonnull), namespace)?;
+                        dt.nullability = Some(w_nb);
+                        Ok(dt)
+                    }
+                    _ => self.resolve_unions(
+                        writer_variants.as_slice(),
+                        reader_variants.as_slice(),
+                        namespace,
+                    ),
+                }
+            }
+            (Schema::Union(writer_variants), reader_non_union) => {
+                let mut writer_to_reader: Vec<Option<(usize, Promotion)>> =
+                    Vec::with_capacity(writer_variants.len());

Review Comment:
   again, type annotation shouldn't be needed?



##########
arrow-avro/src/codec.rs:
##########
@@ -1149,6 +1345,67 @@ impl<'a> Maker<'a> {
             return self.resolve_primitives(write_primitive, read_primitive, 
reader_schema);
         }
         match (writer_schema, reader_schema) {
+            (Schema::Union(writer_variants), Schema::Union(reader_variants)) 
=> {
+                match (
+                    nullable_union_variants(writer_variants.as_slice()),
+                    nullable_union_variants(reader_variants.as_slice()),
+                ) {
+                    (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => {
+                        let mut dt = self.make_data_type(w_nonnull, 
Some(r_nonnull), namespace)?;
+                        dt.nullability = Some(w_nb);
+                        Ok(dt)
+                    }
+                    _ => self.resolve_unions(
+                        writer_variants.as_slice(),
+                        reader_variants.as_slice(),
+                        namespace,
+                    ),

Review Comment:
   ```suggestion
                   let writer_variants = writer_variants.as_slice();
                   let reader_variants = reader_variants.as_slice();
                   match (
                       nullable_union_variants(writer_variants),
                       nullable_union_variants(reader_variants),
                   ) {
                       (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => {
                           let mut dt = self.make_data_type(w_nonnull, 
Some(r_nonnull), namespace)?;
                           dt.nullability = Some(w_nb);
                           Ok(dt)
                       }
                       _ => self.resolve_unions(writer_variants, 
reader_variants, namespace),
   ```



##########
arrow-avro/src/codec.rs:
##########
@@ -1557,6 +1857,24 @@ mod tests {
             .expect("promotion should resolve")
     }
 
+    fn mk_primitive(pt: PrimitiveType) -> Schema<'static> {
+        Schema::TypeName(TypeName::Primitive(pt))
+    }
+    fn mk_union(branches: Vec<Schema<'static>>) -> Schema<'static> {
+        Schema::Union(branches)
+    }
+
+    fn mk_record_named(name: &'static str) -> Schema<'static> {

Review Comment:
   static lifetimes will pretty strongly constrain real-world usage... is there 
a reason it needs to be fixed? Why not just 
   
   ```rust
   fn mk_record_name<'a>(name: &'a str) -> Schema<'a>
   ```



##########
arrow-avro/src/codec.rs:
##########
@@ -1149,6 +1345,67 @@ impl<'a> Maker<'a> {
             return self.resolve_primitives(write_primitive, read_primitive, 
reader_schema);
         }
         match (writer_schema, reader_schema) {
+            (Schema::Union(writer_variants), Schema::Union(reader_variants)) 
=> {
+                match (
+                    nullable_union_variants(writer_variants.as_slice()),
+                    nullable_union_variants(reader_variants.as_slice()),
+                ) {
+                    (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => {
+                        let mut dt = self.make_data_type(w_nonnull, 
Some(r_nonnull), namespace)?;
+                        dt.nullability = Some(w_nb);
+                        Ok(dt)
+                    }
+                    _ => self.resolve_unions(
+                        writer_variants.as_slice(),
+                        reader_variants.as_slice(),
+                        namespace,
+                    ),
+                }
+            }
+            (Schema::Union(writer_variants), reader_non_union) => {
+                let mut writer_to_reader: Vec<Option<(usize, Promotion)>> =
+                    Vec::with_capacity(writer_variants.len());
+                for writer in writer_variants {
+                    match self.resolve_type(writer, reader_non_union, 
namespace) {
+                        Ok(tmp) => writer_to_reader.push(Some((0usize, 
Self::coercion_from(&tmp)))),
+                        Err(_) => writer_to_reader.push(None),
+                    }
+                }
+                let mut dt = self.parse_type(reader_non_union, namespace)?;
+                dt.resolution = Some(ResolutionInfo::Union(ResolvedUnion {
+                    writer_to_reader: Arc::from(writer_to_reader),
+                    writer_is_union: true,
+                    reader_is_union: false,
+                }));
+                Ok(dt)
+            }
+            (writer_non_union, Schema::Union(reader_variants)) => {
+                let mut direct: Option<(usize, Promotion)> = None;
+                let mut promo: Option<(usize, Promotion)> = None;
+                for (reader_index, reader) in 
reader_variants.iter().enumerate() {
+                    if let Ok(tmp) = self.resolve_type(writer_non_union, 
reader, namespace) {
+                        let how = Self::coercion_from(&tmp);
+                        if how == Promotion::Direct {
+                            direct = Some((reader_index, how));
+                            break; // first exact match wins
+                        } else if promo.is_none() {
+                            promo = Some((reader_index, how));
+                        }

Review Comment:
   Double checking intent -- Use the first-found promo, unless a direct match 
is found?
   
   If so, I think we can use just the `promo` option for both:
   ```rust
   if how == Promotion::Direct {
       promo = Some((reader_index, how));
       break; // first exact match wins
   }
   if promo.is_none() {
       // first promo wins, unless an exact match is found later
       promo = Some((reader_index, how));
   }
   ```
   and then
   ```rust
   let Some((reader_index, promotion) = promo else {
       return ArrowError::SchemaError(...);
   };
   ```
   (again below)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to