goldmedal commented on code in PR #12816:
URL: https://github.com/apache/datafusion/pull/12816#discussion_r1795800690


##########
datafusion/core/src/datasource/file_format/mod.rs:
##########
@@ -302,6 +302,87 @@ pub(crate) fn coerce_file_schema_to_view_type(
     ))
 }
 
+/// Transform a schema to force binary types to be strings
+pub fn transform_binary_to_string(schema: &Schema) -> Schema {
+    let transformed_fields: Vec<Arc<Field>> = schema
+        .fields
+        .iter()
+        .map(|field| match field.data_type() {
+            DataType::Binary => Arc::new(
+                Field::new(field.name(), DataType::Utf8, field.is_nullable())
+                    .with_metadata(field.metadata().to_owned()),
+            ),
+            DataType::LargeBinary => Arc::new(
+                Field::new(field.name(), DataType::LargeUtf8, field.is_nullable())
+                    .with_metadata(field.metadata().to_owned()),
+            ),
+            _ => field.clone(),
+        })
+        .collect();
+    Schema::new_with_metadata(transformed_fields, schema.metadata.clone())
+}
+
+/// If the table schema uses a string type, coerce the file schema to use a string type.
+pub(crate) fn coerce_file_schema_to_string_type(
+    table_schema: &Schema,
+    file_schema: &Schema,
+) -> Option<Schema> {
+    let mut transform = false;
+    let table_fields: HashMap<_, _> = table_schema
+        .fields
+        .iter()
+        .map(|f| (f.name(), f.data_type()))
+        .collect();
+    let transformed_fields: Vec<Arc<Field>> = file_schema
+        .fields
+        .iter()
+        .map(
+            |field| match (table_fields.get(field.name()), field.data_type()) {
+                (Some(DataType::Utf8), DataType::Binary) => {
+                    transform = true;
+                    Arc::new(Field::new(
+                        field.name(),
+                        DataType::Utf8,
+                        field.is_nullable(),
+                    ))
+                }
+                (Some(DataType::LargeUtf8), DataType::LargeBinary) => {
+                    transform = true;
+                    Arc::new(Field::new(
+                        field.name(),
+                        DataType::LargeUtf8,
+                        field.is_nullable(),
+                    ))
+                }
+                // If `schema_force_view_types` is enabled, the actual data could be `Binary` or `LargeBinary`
+                // because we will first change the table schema for binary-to-string coercion, then apply the
+                // string-to-view transformation. So we need all binary types to be coerced to `Utf8View` here.
+                (
+                    Some(DataType::Utf8View),
+                    DataType::Binary | DataType::LargeBinary | DataType::BinaryView,

Review Comment:
   Same as above: the tests don't cover the `DataType::LargeBinary` and
   `DataType::BinaryView` cases.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to