jecsand838 commented on code in PR #8075: URL: https://github.com/apache/arrow-rs/pull/8075#discussion_r2268150437
##########
arrow-avro/src/schema.rs:
##########
@@ -647,12 +715,336 @@ pub(crate) fn compute_fingerprint_rabin(canonical_form: &str) -> u64 {
     fp
 }
+#[inline]
+fn is_internal_arrow_key(key: &str) -> bool {
+    key.starts_with("ARROW:") || key == SCHEMA_METADATA_KEY
+}
+
+// Sanitize an arbitrary string so it is a valid Avro field or type name
+fn sanitise_avro_name(base_name: &str) -> String {
+    if base_name.is_empty() {
+        return "_".to_owned();
+    }
+    let mut out: String = base_name
+        .chars()
+        .map(|char| {
+            if char.is_ascii_alphanumeric() || char == '_' {
+                char
+            } else {
+                '_'
+            }
+        })
+        .collect();
+    if out.as_bytes()[0].is_ascii_digit() {
+        out.insert(0, '_');
+    }
+    out
+}
+
+#[derive(Default)]
+struct NameGenerator {
+    used: HashSet<String>,
+    counters: HashMap<String, usize>,
+}
+
+impl NameGenerator {
+    fn make_unique(&mut self, field_name: &str) -> String {
+        let field_name = sanitise_avro_name(field_name);
+        if self.used.insert(field_name.clone()) {
+            self.counters.insert(field_name.clone(), 1);
+            return field_name;
+        }
+        let counter = self.counters.entry(field_name.clone()).or_insert(1);
+        loop {
+            let candidate = format!("{field_name}_{}", *counter);
+            if self.used.insert(candidate.clone()) {
+                return candidate;
+            }
+            *counter += 1;
+        }
+    }
+}
+
+fn merge_extras(schema: Value, mut extras: JsonMap<String, Value>) -> Value {
+    if extras.is_empty() {
+        return schema;
+    }
+    match schema {
+        Value::Object(mut map) => {
+            map.extend(extras);
+            Value::Object(map)
+        }
+        Value::Array(mut union) => {
+            if let Some(non_null) = union.iter_mut().find(|val| val.as_str() != Some("null")) {
+                let original = std::mem::take(non_null);
+                *non_null = merge_extras(original, extras);
+            }
+            Value::Array(union)
+        }
+        primitive => {
+            let mut map = JsonMap::with_capacity(extras.len() + 1);
+            map.insert("type".into(), primitive);
+            map.extend(extras);
+            Value::Object(map)
+        }
+    }
+}
+
+// Convert an Arrow `DataType` into an Avro schema `Value`.
+fn datatype_to_avro(
+    dt: &DataType,
+    field_name: &str,
+    metadata: &HashMap<String, String>,
+    name_gen: &mut NameGenerator,
+) -> Result<(Value, JsonMap<String, Value>), ArrowError> {

Review Comment:
   That's a good callout. There are definitely some follow-up improvements to be made, both here and in the `AvroSchema` type.

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org