martin-g commented on code in PR #22695:
URL: https://github.com/apache/datafusion/pull/22695#discussion_r3334456186


##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -222,13 +297,18 @@ fn get_extension(path: &str) -> String {
     }
 }
 
+fn schemas_have_same_fields(left: &SchemaRef, right: &SchemaRef) -> bool {
+    left.fields() == right.fields()

Review Comment:
   
https://github.com/apache/datafusion/pull/22695/changes#diff-8ce463e0d56c67a237fce2942f0759984d56764b13b62156472132ab84c78192R226-R227
 says `Schema metadata may differ between files ...` but here the equality 
check compares the fields' metadata too - 
https://docs.rs/arrow-schema/58.3.0/src/arrow_schema/field.rs.html#113
   Is this intentional ?



##########
datafusion/proto-models/src/generated/prost.rs:
##########
@@ -243,8 +243,8 @@ pub struct EmptyRelationNode {
 pub struct CreateExternalTableNode {
     #[prost(message, optional, tag = "9")]
     pub name: ::core::option::Option<TableReference>,
-    #[prost(string, tag = "2")]
-    pub location: ::prost::alloc::string::String,

Review Comment:
   What is the lifetime of a serialized CreateExternalTableNode ?
   If there is a use case where a CreateExternalTableNode serialized with DF 
v.X and deserialized with v. X+1 then it would be better to keep `location` 
around until v. X+2 (assuming users don't upgrade from X to X+2).
   I.e. deserialize should do:
   1) if `locations` is not empty then use it
   2) otherwise if `location` is not an empty string then insert it to 
`locations` 
   
   Serialize should: 1) write non-empty `locations` and `location=""`



##########
datafusion/core/src/test_util/mod.rs:
##########
@@ -188,7 +188,7 @@ impl TableProviderFactory for TestTableFactory {
         cmd: &CreateExternalTable,
     ) -> Result<Arc<dyn TableProvider>> {
         Ok(Arc::new(TestTableProvider {
-            url: cmd.location.to_string(),
+            url: cmd.locations.first().cloned().unwrap_or_default(),

Review Comment:
   This could silently use `""` if no locations are provided. Better return an 
error.



##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -142,36 +182,71 @@ impl TableProviderFactory for ListingTableFactory {
 
         options = options.with_table_partition_cols(table_partition_cols);
 
-        options
-            .validate_partitions(session_state, &table_path)
-            .await?;
+        // Validate partitions against every location before any glob 
rewriting.
+        for table_path in &table_paths {
+            options
+                .validate_partitions(session_state, table_path)
+                .await?;
+        }
 
-        let resolved_schema = match provided_schema {
+        let (resolved_table_paths, resolved_schema) = match provided_schema {
             // We will need to check the table columns against the schema
             // this is done so that we can do an ORDER BY for external table 
creation
             // specifically for parquet file format.
             // See: https://github.com/apache/datafusion/issues/7317
             None => {
-                // if the folder then rewrite a file path as 'path/*.parquet'
-                // to only read the files the reader can understand
-                if table_path.is_folder() && table_path.get_glob().is_none() {
-                    // Since there are no files yet to infer an actual 
extension,
-                    // derive the pattern based on compression type.
-                    // So for gzipped CSV the pattern is `*.csv.gz`
-                    let glob = match options.format.compression_type() {
-                        Some(compression) => {
-                            match 
options.format.get_ext_with_compression(&compression) {
-                                // Use glob based on `FileFormat` extension
-                                Ok(ext) => format!("*.{ext}"),
-                                // Fallback to `file_type`, if not supported 
by `FileFormat`
-                                Err(_) => format!("*.{}", 
cmd.file_type.to_lowercase()),
+                let mut resolved_paths = Vec::with_capacity(table_paths.len());
+                let mut inferred_schema: Option<SchemaRef> = None;
+                for mut table_path in table_paths {
+                    // if the folder then rewrite a file path as 
'path/*.parquet'
+                    // to only read the files the reader can understand
+                    if table_path.is_folder() && 
table_path.get_glob().is_none() {
+                        // Since there are no files yet to infer an actual 
extension,
+                        // derive the pattern based on compression type.
+                        // So for gzipped CSV the pattern is `*.csv.gz`
+                        let glob = match options.format.compression_type() {
+                            Some(compression) => {
+                                match options
+                                    .format
+                                    .get_ext_with_compression(&compression)
+                                {
+                                    // Use glob based on `FileFormat` extension
+                                    Ok(ext) => format!("*.{ext}"),
+                                    // Fallback to `file_type`, if not 
supported by `FileFormat`
+                                    Err(_) => {
+                                        format!("*.{}", 
cmd.file_type.to_lowercase())
+                                    }
+                                }
                             }
+                            None => format!("*.{}", 
cmd.file_type.to_lowercase()),
+                        };
+                        table_path = table_path.with_glob(glob.as_ref())?;
+                    }
+                    let schema = options.infer_schema(session_state, 
&table_path).await?;
+                    // All locations must resolve to the same fields. Schema
+                    // metadata may differ between files without changing the
+                    // fields read by the table.
+                    match &inferred_schema {
+                        None => inferred_schema = Some(schema),
+                        Some(existing)
+                            if !schemas_have_same_fields(existing, &schema) =>
+                        {
+                            return plan_err!(
+                                "All locations of a CREATE EXTERNAL TABLE must 
have the \
+                                 same schema, but the provided locations have 
differing schemas"

Review Comment:
   This error message does not help the user to find the problematic 
schema/location.



##########
datafusion/proto/src/logical_plan/mod.rs:
##########
@@ -795,10 +725,15 @@ impl AsLogicalPlan for LogicalPlanNode {
                             create_extern_table.name.as_ref(),
                             "CreateExternalTable",
                         )?,
-                        create_extern_table.location.clone(),
+                        create_extern_table

Review Comment:
   If backward compatibility is needed, i.e. if a CreateExternalTableNode is 
serialized with v. X and it has to be read with v. X+1 then you need to keep 
`location` around for at least one major version.
   If such compatibility is not needed then you can just use `""` here. It is 
overwritten by `.with_locations(...)` below



##########
datafusion/sql/src/parser.rs:
##########
@@ -289,7 +257,18 @@ impl fmt::Display for CreateExternalTable {
             }
             write!(f, ") ")?;
         }
-        write!(f, "LOCATION {}", self.location)
+        if self.locations.len() > 1 {
+            write!(f, "LOCATION (")?;
+            for (idx, location) in self.locations.iter().enumerate() {
+                if idx > 0 {
+                    write!(f, ", ")?;
+                }
+                write!(f, "{location}")?;
+            }
+            write!(f, ")")
+        } else {
+            write!(f, "LOCATION {}", self.location)

Review Comment:
   I see that this is how it worked before but shouldn't the locations be 
quoted ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to