alamb commented on code in PR #8075: URL: https://github.com/apache/arrow-rs/pull/8075#discussion_r2267965800
########## arrow-avro/src/schema.rs: ########## @@ -647,12 +715,336 @@ pub(crate) fn compute_fingerprint_rabin(canonical_form: &str) -> u64 { fp } +#[inline] +fn is_internal_arrow_key(key: &str) -> bool { + key.starts_with("ARROW:") || key == SCHEMA_METADATA_KEY +} + +// Sanitize an arbitrary string so it is a valid Avro field or type name +fn sanitise_avro_name(base_name: &str) -> String { + if base_name.is_empty() { + return "_".to_owned(); + } + let mut out: String = base_name + .chars() + .map(|char| { + if char.is_ascii_alphanumeric() || char == '_' { + char + } else { + '_' + } + }) + .collect(); + if out.as_bytes()[0].is_ascii_digit() { + out.insert(0, '_'); + } + out +} + +#[derive(Default)] +struct NameGenerator { + used: HashSet<String>, + counters: HashMap<String, usize>, +} + +impl NameGenerator { + fn make_unique(&mut self, field_name: &str) -> String { + let field_name = sanitise_avro_name(field_name); + if self.used.insert(field_name.clone()) { + self.counters.insert(field_name.clone(), 1); + return field_name; + } + let counter = self.counters.entry(field_name.clone()).or_insert(1); + loop { + let candidate = format!("{field_name}_{}", *counter); + if self.used.insert(candidate.clone()) { + return candidate; + } + *counter += 1; + } + } +} + +fn merge_extras(schema: Value, mut extras: JsonMap<String, Value>) -> Value { + if extras.is_empty() { + return schema; + } + match schema { + Value::Object(mut map) => { + map.extend(extras); + Value::Object(map) + } + Value::Array(mut union) => { + if let Some(non_null) = union.iter_mut().find(|val| val.as_str() != Some("null")) { + let original = std::mem::take(non_null); + *non_null = merge_extras(original, extras); + } + Value::Array(union) + } + primitive => { + let mut map = JsonMap::with_capacity(extras.len() + 1); + map.insert("type".into(), primitive); + map.extend(extras); + Value::Object(map) + } + } +} + +// Convert an Arrow `DataType` into an Avro schema `Value`. 
+fn datatype_to_avro( + dt: &DataType, + field_name: &str, + metadata: &HashMap<String, String>, + name_gen: &mut NameGenerator, +) -> Result<(Value, JsonMap<String, Value>), ArrowError> { Review Comment: Longer term, it might be worth considering moving to something other than the JSON value representation unless its flexibility is really needed -- for example, this code might be better / more efficient if it directly created an AvroSchema element or something similar. I haven't followed this code enough to have any sort of specific proposal; I am just pattern matching based on the Json::Value usage here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org