goldmedal commented on code in PR #12816: URL: https://github.com/apache/datafusion/pull/12816#discussion_r1795799303
########## datafusion/core/src/datasource/file_format/mod.rs: ########## @@ -302,6 +302,87 @@ pub(crate) fn coerce_file_schema_to_view_type( )) } +/// Transform a schema to force binary types to be strings +pub fn transform_binary_to_string(schema: &Schema) -> Schema { + let transformed_fields: Vec<Arc<Field>> = schema + .fields + .iter() + .map(|field| match field.data_type() { + DataType::Binary => Arc::new( + Field::new(field.name(), DataType::Utf8, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), + DataType::LargeBinary => Arc::new( + Field::new(field.name(), DataType::LargeUtf8, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), + _ => field.clone(), + }) + .collect(); + Schema::new_with_metadata(transformed_fields, schema.metadata.clone()) +} + +/// If the table schema uses a string type, coerce the file schema to use a string type. +pub(crate) fn coerce_file_schema_to_string_type( + table_schema: &Schema, + file_schema: &Schema, +) -> Option<Schema> { + let mut transform = false; + let table_fields: HashMap<_, _> = table_schema + .fields + .iter() + .map(|f| (f.name(), f.data_type())) + .collect(); + let transformed_fields: Vec<Arc<Field>> = file_schema + .fields + .iter() + .map( + |field| match (table_fields.get(field.name()), field.data_type()) { + (Some(DataType::Utf8), DataType::Binary) => { + transform = true; + Arc::new(Field::new( + field.name(), + DataType::Utf8, + field.is_nullable(), + )) + } + (Some(DataType::LargeUtf8), DataType::LargeBinary) => { Review Comment: Actually, this case isn't covered by testing because the Arrow reader always marks `BYTE_ARRAY` as `Utf8` or `Binary`. I'm not quite sure whether we need it. 🤔 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org