scovich commented on code in PR #8348: URL: https://github.com/apache/arrow-rs/pull/8348#discussion_r2357882638
########## arrow-avro/src/codec.rs: ########## @@ -401,6 +423,14 @@ impl AvroDataType { )) } }, + Codec::Union(encodings, _, _) => { + if encodings.is_empty() { + return Err(ArrowError::SchemaError( + "Union with no branches cannot have a default".to_string(), + )); + } + encodings[0].parse_default_literal(default_json)? + } Review Comment: nit ```suggestion let Some(default_encoding) = encodings.first() else { return Err(ArrowError::SchemaError( "Union with no branches cannot have a default".to_string(), )); }; default_encoding.parse_default_literal(default_json)? } ``` ########## arrow-avro/src/codec.rs: ########## @@ -267,6 +284,11 @@ impl AvroDataType { if default_json.is_null() { return match self.codec() { Codec::Null => Ok(AvroLiteral::Null), + Codec::Union(encodings, _, _) if !encodings.is_empty() + && matches!(encodings[0].codec(), Codec::Null) => + { + Ok(AvroLiteral::Null) + } Review Comment: aside: that is some funky formatting, but I guess it's what `fmt` produced? ########## arrow-avro/src/codec.rs: ########## @@ -915,6 +1020,76 @@ fn nullable_union_variants<'x, 'y>( } } +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +enum UnionBranchKey { + Named(String), + Primitive(PrimitiveType), + Array, + Map, +} + +fn branch_key_of<'a>(s: &Schema<'a>, enclosing_ns: Option<&'a str>) -> Option<UnionBranchKey> { + match s { + // Primitives + Schema::TypeName(TypeName::Primitive(p)) => Some(UnionBranchKey::Primitive(*p)), + Schema::Type(Type { + r#type: TypeName::Primitive(p), + .. + }) => Some(UnionBranchKey::Primitive(*p)), + // Named references + Schema::TypeName(TypeName::Ref(name)) => { + let (full, _) = make_full_name(name, None, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + Schema::Type(Type { + r#type: TypeName::Ref(name), + .. + }) => { + let (full, _) = make_full_name(name, None, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + // Complex non‑named + Schema::Complex(ComplexType::Array(_)) => Some(UnionBranchKey::Array), + Schema::Complex(ComplexType::Map(_)) => Some(UnionBranchKey::Map), + // Inline named definitions + Schema::Complex(ComplexType::Record(r)) => { + let (full, _) = make_full_name(r.name, r.namespace, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + Schema::Complex(ComplexType::Enum(e)) => { + let (full, _) = make_full_name(e.name, e.namespace, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + Schema::Complex(ComplexType::Fixed(f)) => { + let (full, _) = make_full_name(f.name, f.namespace, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } Review Comment: qq: Would it be cleaner -- or not -- to rearrange this match as follows? ```rust let (name, namespace) = match s { Schema::TypeName(TypeName::Primitive(p)) | Schema::Type(Type { r#type: TypeName::Primitive(p), .. }) => return Some(UnionBranchKey::Primitive(*p)), Schema::TypeName(TypeName::Ref(name)) | Schema::Type(Type { r#type: TypeName::Ref(name), .. }) => (name, None), Schema::Complex(ComplexType::Array(_)) => return Some(UnionBranchKey::Array), Schema::Complex(ComplexType::Map(_)) => return Some(UnionBranchKey::Map), Schema::Complex(ComplexType::Record(r)) => (r.name, r.namespace), Schema::Complex(ComplexType::Enum(e)) => (e.name, e.namespace), Schema::Complex(ComplexType::Fixed(f)) => (f.name, f.namespace), Schema::Union(_) => return None, }; let (full, _) = make_full_name(name, namespace, enclosing_ns); Some(UnionBranchKey::Named(full)) ``` ########## arrow-avro/src/codec.rs: ########## @@ -1149,6 +1345,67 @@ impl<'a> Maker<'a> { return self.resolve_primitives(write_primitive, read_primitive, reader_schema); } match (writer_schema, reader_schema) { + (Schema::Union(writer_variants), Schema::Union(reader_variants)) => { + match ( + nullable_union_variants(writer_variants.as_slice()), + nullable_union_variants(reader_variants.as_slice()), + ) { + (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => { + let mut dt = self.make_data_type(w_nonnull, Some(r_nonnull), namespace)?; + dt.nullability = Some(w_nb); + Ok(dt) + } + _ => self.resolve_unions( + writer_variants.as_slice(), + reader_variants.as_slice(), + namespace, + ), + } + } + (Schema::Union(writer_variants), reader_non_union) => { + let mut writer_to_reader: Vec<Option<(usize, Promotion)>> = + Vec::with_capacity(writer_variants.len()); + for writer in writer_variants { + match self.resolve_type(writer, reader_non_union, namespace) { + Ok(tmp) => writer_to_reader.push(Some((0usize, Self::coercion_from(&tmp)))), + Err(_) => writer_to_reader.push(None), + } + } + let mut dt = self.parse_type(reader_non_union, namespace)?; + dt.resolution = Some(ResolutionInfo::Union(ResolvedUnion { + writer_to_reader: Arc::from(writer_to_reader), Review Comment: nit ```suggestion let writer_to_reader = writer_variants.iter().filter_map(|writer| { let tmp = self.resolve_type(writer, reader_non_union, namespace).ok()?; Some((0usize, Self::coercion_from(&tmp)))) })); let mut dt = self.parse_type(reader_non_union, namespace)?; dt.resolution = Some(ResolutionInfo::Union(ResolvedUnion { writer_to_reader: Arc::from(writer_to_reader.collect()), ``` ########## arrow-avro/src/schema.rs: ########## @@ -970,13 +973,59 @@ fn merge_extras(schema: Value, mut extras: JsonMap<String, Value>) -> Value { } } +#[inline] +fn is_avro_json_null(v: &Value) -> bool { + matches!(v, Value::String(s) if s == "null") +} + fn wrap_nullable(inner: Value, null_order: Nullability) -> Value { let null = Value::String("null".into()); - let elements = match null_order { - Nullability::NullFirst => vec![null, inner], - Nullability::NullSecond => vec![inner, null], - }; - Value::Array(elements) + match inner { + Value::Array(mut union) => { + union.retain(|v| !is_avro_json_null(v)); + match null_order { + Nullability::NullFirst => { + let mut out = Vec::with_capacity(union.len() + 1); + out.push(null); + out.extend(union); + Value::Array(out) + } + Nullability::NullSecond => { + union.push(null); + Value::Array(union) + } + } Review Comment: ```suggestion match null_order { Nullability::NullFirst => union.insert(0, null), Nullability::NullSecond => union.push(null), } Value::Array(union) ``` (I guess it should really be called `Nullability::NullLast`?) ########## arrow-avro/src/schema.rs: ########## @@ -970,13 +973,59 @@ fn merge_extras(schema: Value, mut extras: JsonMap<String, Value>) -> Value { } } +#[inline] +fn is_avro_json_null(v: &Value) -> bool { + matches!(v, Value::String(s) if s == "null") +} + fn wrap_nullable(inner: Value, null_order: Nullability) -> Value { let null = Value::String("null".into()); - let elements = match null_order { - Nullability::NullFirst => vec![null, inner], - Nullability::NullSecond => vec![inner, null], - }; - Value::Array(elements) + match inner { + Value::Array(mut union) => { + union.retain(|v| !is_avro_json_null(v)); + match null_order { + Nullability::NullFirst => { + let mut out = Vec::with_capacity(union.len() + 1); + out.push(null); + out.extend(union); + Value::Array(out) + } + Nullability::NullSecond => { + union.push(null); + Value::Array(union) + } + } + } + other => match null_order { + Nullability::NullFirst => Value::Array(vec![null, other]), + Nullability::NullSecond => Value::Array(vec![other, null]), + }, + } +} + +fn union_branch_signature(branch: &Value) -> Result<String, ArrowError> { + match branch { + Value::String(t) => Ok(format!("P:{t}")), + Value::Object(map) => { + let t = map.get("type").and_then(|v| v.as_str()).ok_or_else(|| { + ArrowError::SchemaError("Union branch object missing string 'type'".into()) + })?; + match t { + "record" | "enum" | "fixed" => { + let name = map.get("name").and_then(|v| v.as_str()).unwrap_or_default(); Review Comment: What is the default `&str`, out of curiosity? ########## arrow-avro/src/codec.rs: ########## @@ -915,6 +1020,76 @@ fn nullable_union_variants<'x, 'y>( } } +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +enum UnionBranchKey { + Named(String), + Primitive(PrimitiveType), + Array, + Map, +} + +fn branch_key_of<'a>(s: &Schema<'a>, enclosing_ns: Option<&'a str>) -> Option<UnionBranchKey> { + match s { + // Primitives + Schema::TypeName(TypeName::Primitive(p)) => Some(UnionBranchKey::Primitive(*p)), + Schema::Type(Type { + r#type: TypeName::Primitive(p), + .. + }) => Some(UnionBranchKey::Primitive(*p)), + // Named references + Schema::TypeName(TypeName::Ref(name)) => { + let (full, _) = make_full_name(name, None, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + Schema::Type(Type { + r#type: TypeName::Ref(name), + .. + }) => { + let (full, _) = make_full_name(name, None, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + // Complex non‑named + Schema::Complex(ComplexType::Array(_)) => Some(UnionBranchKey::Array), + Schema::Complex(ComplexType::Map(_)) => Some(UnionBranchKey::Map), + // Inline named definitions + Schema::Complex(ComplexType::Record(r)) => { + let (full, _) = make_full_name(r.name, r.namespace, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + Schema::Complex(ComplexType::Enum(e)) => { + let (full, _) = make_full_name(e.name, e.namespace, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + Schema::Complex(ComplexType::Fixed(f)) => { + let (full, _) = make_full_name(f.name, f.namespace, enclosing_ns); + Some(UnionBranchKey::Named(full)) + } + // Unions are validated separately (and disallowed as immediate branches) + Schema::Union(_) => None, + } +} + +fn union_first_duplicate<'a>( + branches: &'a [Schema<'a>], + enclosing_ns: Option<&'a str>, +) -> Option<String> { + let mut seen: HashSet<UnionBranchKey> = HashSet::with_capacity(branches.len()); Review Comment: nit: type annotation shouldn't be necessary? ########## arrow-avro/src/codec.rs: ########## @@ -1149,6 +1345,67 @@ impl<'a> Maker<'a> { return self.resolve_primitives(write_primitive, read_primitive, reader_schema); } match (writer_schema, reader_schema) { + (Schema::Union(writer_variants), Schema::Union(reader_variants)) => { + match ( + nullable_union_variants(writer_variants.as_slice()), + nullable_union_variants(reader_variants.as_slice()), + ) { + (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => { + let mut dt = self.make_data_type(w_nonnull, Some(r_nonnull), namespace)?; + dt.nullability = Some(w_nb); + Ok(dt) + } + _ => self.resolve_unions( + writer_variants.as_slice(), + reader_variants.as_slice(), + namespace, + ), + } + } + (Schema::Union(writer_variants), reader_non_union) => { + let mut writer_to_reader: Vec<Option<(usize, Promotion)>> = + Vec::with_capacity(writer_variants.len()); Review Comment: again, type annotation shouldn't be needed? ########## arrow-avro/src/codec.rs: ########## @@ -1149,6 +1345,67 @@ impl<'a> Maker<'a> { return self.resolve_primitives(write_primitive, read_primitive, reader_schema); } match (writer_schema, reader_schema) { + (Schema::Union(writer_variants), Schema::Union(reader_variants)) => { + match ( + nullable_union_variants(writer_variants.as_slice()), + nullable_union_variants(reader_variants.as_slice()), + ) { + (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => { + let mut dt = self.make_data_type(w_nonnull, Some(r_nonnull), namespace)?; + dt.nullability = Some(w_nb); + Ok(dt) + } + _ => self.resolve_unions( + writer_variants.as_slice(), + reader_variants.as_slice(), + namespace, + ), Review Comment: ```suggestion let writer_variants = writer_variants.as_slice(); let reader_variants = reader_variants.as_slice(); match ( nullable_union_variants(writer_variants), nullable_union_variants(reader_variants), ) { (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => { let mut dt = self.make_data_type(w_nonnull, Some(r_nonnull), namespace)?; dt.nullability = Some(w_nb); Ok(dt) } _ => self.resolve_unions(writer_variants, reader_variants, namespace), ``` ########## arrow-avro/src/codec.rs: ########## @@ -1557,6 +1857,24 @@ mod tests { .expect("promotion should resolve") } + fn mk_primitive(pt: PrimitiveType) -> Schema<'static> { + Schema::TypeName(TypeName::Primitive(pt)) + } + fn mk_union(branches: Vec<Schema<'static>>) -> Schema<'static> { + Schema::Union(branches) + } + + fn mk_record_named(name: &'static str) -> Schema<'static> { Review Comment: static lifetimes will pretty strongly constrain real-world usage... is there a reason it needs to be fixed? Why not just ```rust fn mk_record_name<'a>(name: &'a str) -> Schema<'a> ``` ########## arrow-avro/src/codec.rs: ########## @@ -1149,6 +1345,67 @@ impl<'a> Maker<'a> { return self.resolve_primitives(write_primitive, read_primitive, reader_schema); } match (writer_schema, reader_schema) { + (Schema::Union(writer_variants), Schema::Union(reader_variants)) => { + match ( + nullable_union_variants(writer_variants.as_slice()), + nullable_union_variants(reader_variants.as_slice()), + ) { + (Some((w_nb, w_nonnull)), Some((_r_nb, r_nonnull))) => { + let mut dt = self.make_data_type(w_nonnull, Some(r_nonnull), namespace)?; + dt.nullability = Some(w_nb); + Ok(dt) + } + _ => self.resolve_unions( + writer_variants.as_slice(), + reader_variants.as_slice(), + namespace, + ), + } + } + (Schema::Union(writer_variants), reader_non_union) => { + let mut writer_to_reader: Vec<Option<(usize, Promotion)>> = + Vec::with_capacity(writer_variants.len()); + for writer in writer_variants { + match self.resolve_type(writer, reader_non_union, namespace) { + Ok(tmp) => writer_to_reader.push(Some((0usize, Self::coercion_from(&tmp)))), + Err(_) => writer_to_reader.push(None), + } + } + let mut dt = self.parse_type(reader_non_union, namespace)?; + dt.resolution = Some(ResolutionInfo::Union(ResolvedUnion { + writer_to_reader: Arc::from(writer_to_reader), + writer_is_union: true, + reader_is_union: false, + })); + Ok(dt) + } + (writer_non_union, Schema::Union(reader_variants)) => { + let mut direct: Option<(usize, Promotion)> = None; + let mut promo: Option<(usize, Promotion)> = None; + for (reader_index, reader) in reader_variants.iter().enumerate() { + if let Ok(tmp) = self.resolve_type(writer_non_union, reader, namespace) { + let how = Self::coercion_from(&tmp); + if how == Promotion::Direct { + direct = Some((reader_index, how)); + break; // first exact match wins + } else if promo.is_none() { + promo = Some((reader_index, how)); + } Review Comment: Double checking intent -- Use the first-found promo, unless a direct match is found? If so, I think we can use just the `promo` option for both: ```rust if how == Promotion::Direct { promo = Some((reader_index, how)); break; // first exact match wins } if promo.is_none() { // first promo wins, unless an exact match is found later promo = Some((reader_index, how)); } ``` and then ```rust let Some((reader_index, promotion) = promo else { return ArrowError::SchemaError(...); }; ``` (again below) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org