goldmedal commented on code in PR #12816: URL: https://github.com/apache/datafusion/pull/12816#discussion_r1795800690
########## datafusion/core/src/datasource/file_format/mod.rs: ########## @@ -302,6 +302,87 @@ pub(crate) fn coerce_file_schema_to_view_type( )) } +/// Transform a schema to force binary types to be strings +pub fn transform_binary_to_string(schema: &Schema) -> Schema { + let transformed_fields: Vec<Arc<Field>> = schema + .fields + .iter() + .map(|field| match field.data_type() { + DataType::Binary => Arc::new( + Field::new(field.name(), DataType::Utf8, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), + DataType::LargeBinary => Arc::new( + Field::new(field.name(), DataType::LargeUtf8, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), + _ => field.clone(), + }) + .collect(); + Schema::new_with_metadata(transformed_fields, schema.metadata.clone()) +} + +/// If the table schema uses a string type, coerce the file schema to use a string type. +pub(crate) fn coerce_file_schema_to_string_type( + table_schema: &Schema, + file_schema: &Schema, +) -> Option<Schema> { + let mut transform = false; + let table_fields: HashMap<_, _> = table_schema + .fields + .iter() + .map(|f| (f.name(), f.data_type())) + .collect(); + let transformed_fields: Vec<Arc<Field>> = file_schema + .fields + .iter() + .map( + |field| match (table_fields.get(field.name()), field.data_type()) { + (Some(DataType::Utf8), DataType::Binary) => { + transform = true; + Arc::new(Field::new( + field.name(), + DataType::Utf8, + field.is_nullable(), + )) + } + (Some(DataType::LargeUtf8), DataType::LargeBinary) => { + transform = true; + Arc::new(Field::new( + field.name(), + DataType::LargeUtf8, + field.is_nullable(), + )) + } + // If `schema_force_view_types` is enabled, the actual data could be `Binary` or `LargeBinary` + // because we will first change the table schema for binary-to-string coercion, then apply the + // string-to-view transformation. So we need all binary types to be coerced to `Utf8View` here. 
+ ( + Some(DataType::Utf8View), + DataType::Binary | DataType::LargeBinary | DataType::BinaryView, Review Comment: Same as above: the tests don't cover the `DataType::LargeBinary` and `DataType::BinaryView` cases. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org