martin-g commented on code in PR #17702:
URL: https://github.com/apache/datafusion/pull/17702#discussion_r2528968804


##########
datafusion/proto/src/logical_plan/mod.rs:
##########
@@ -625,7 +625,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                             create_extern_table.name.as_ref(),
                             "CreateExternalTable",
                         )?,
-                        location: create_extern_table.location.clone(),
+                        locations: vec![create_extern_table.location.clone()],

Review Comment:
   This should split by comma, no ?
   At line 1502 below you serialize the locations to a location by joining them 
with comma.
   I.e. `vec!["path1", "path2"]` serializes to `"path1,path2"` but deserializes 
back to `vec!["path1,path2"]`



##########
datafusion/core/src/test_util/mod.rs:
##########
@@ -194,7 +194,7 @@ impl TableProviderFactory for TestTableFactory {
 #[derive(Debug)]
 pub struct TestTableProvider {
     /// URL of table files or folder
-    pub url: String,
+    pub url: Vec<String>,

Review Comment:
   ```suggestion
       pub urls: Vec<String>,
   ```
   ?
   `location` has been renamed to `locations` already



##########
datafusion/sql/src/statement.rs:
##########
@@ -1548,7 +1548,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
             PlanCreateExternalTable {
                 schema: df_schema,
                 name,
-                location,
+                locations: location.split(",").map(|x| 
x.to_string()).collect(),

Review Comment:
   ```suggestion
                   locations: location.split(",").map(|x| 
x.to_string().trim()).collect(),
   ```



##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -63,137 +65,190 @@ impl TableProviderFactory for ListingTableFactory {
             ))?
             .create(session_state, &cmd.options)?;
 
-        let mut table_path = ListingTableUrl::parse(&cmd.location)?;
-        let file_extension = match table_path.is_collection() {
-            // Setting the extension to be empty instead of allowing the 
default extension seems
-            // odd, but was done to ensure existing behavior isn't modified. 
It seems like this
-            // could be refactored to either use the default extension or set 
the fully expected
-            // extension when compression is included (e.g. ".csv.gz")
-            true => "",
-            false => &get_extension(cmd.location.as_str()),
+        let file_extension = match cmd.locations.len() {
+            1 => {
+                let table_path = ListingTableUrl::parse(&cmd.locations[0])?;
+                match table_path.is_collection() {
+                    // Setting the extension to be empty instead of allowing 
the default extension seems
+                    // odd, but was done to ensure existing behavior isn't 
modified. It seems like this
+                    // could be refactored to either use the default extension 
or set the fully expected
+                    // extension when compression is included (e.g. ".csv.gz").
+                    // We do the same if there are multiple locations provided 
for the table.
+                    true => "",
+                    false => &get_extension(cmd.locations[0].as_str()),
+                }
+            }
+            _ => "",

Review Comment:
   You could make an attempt here - if all files have the same extension then 
use it, otherwise fall back to `""`



##########
datafusion/proto/tests/cases/roundtrip_logical_plan.rs:
##########
@@ -196,7 +196,7 @@ impl LogicalExtensionCodec for TestTableProviderCodec {
         })?;
         assert_eq!(msg.table_name, table_ref.to_string());
         let provider = TestTableProvider {
-            url: msg.url,
+            url: vec![msg.url],

Review Comment:
   Same error as earlier - here you need to split by comma because the encode 
below joins by comma.
   But as I already said the idea of using comma as separator is fragile.



##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -63,137 +65,190 @@ impl TableProviderFactory for ListingTableFactory {
             ))?
             .create(session_state, &cmd.options)?;
 
-        let mut table_path = ListingTableUrl::parse(&cmd.location)?;
-        let file_extension = match table_path.is_collection() {
-            // Setting the extension to be empty instead of allowing the 
default extension seems
-            // odd, but was done to ensure existing behavior isn't modified. 
It seems like this
-            // could be refactored to either use the default extension or set 
the fully expected
-            // extension when compression is included (e.g. ".csv.gz")
-            true => "",
-            false => &get_extension(cmd.location.as_str()),
+        let file_extension = match cmd.locations.len() {
+            1 => {
+                let table_path = ListingTableUrl::parse(&cmd.locations[0])?;
+                match table_path.is_collection() {
+                    // Setting the extension to be empty instead of allowing 
the default extension seems
+                    // odd, but was done to ensure existing behavior isn't 
modified. It seems like this
+                    // could be refactored to either use the default extension 
or set the fully expected
+                    // extension when compression is included (e.g. ".csv.gz").
+                    // We do the same if there are multiple locations provided 
for the table.
+                    true => "",
+                    false => &get_extension(cmd.locations[0].as_str()),
+                }
+            }
+            _ => "",
         };
+
         let mut options = ListingOptions::new(file_format)
             .with_session_config_options(session_state.config())
             .with_file_extension(file_extension);
 
-        let (provided_schema, table_partition_cols) = if 
cmd.schema.fields().is_empty() {
-            let infer_parts = session_state
-                .config_options()
-                .execution
-                .listing_table_factory_infer_partitions;
-            let part_cols = if cmd.table_partition_cols.is_empty() && 
infer_parts {
-                options
-                    .infer_partitions(session_state, &table_path)
-                    .await?
-                    .into_iter()
-            } else {
-                cmd.table_partition_cols.clone().into_iter()
-            };
-
-            (
-                None,
-                part_cols
-                    .map(|p| {
-                        (
-                            p,
-                            DataType::Dictionary(
-                                Box::new(DataType::UInt16),
-                                Box::new(DataType::Utf8),
-                            ),
-                        )
-                    })
-                    .collect::<Vec<_>>(),
+        let table_paths: Vec<ListingTableUrl> = cmd
+            .locations
+            .iter()
+            .map(|loc| ListingTableUrl::parse(loc))
+            .collect::<Result<Vec<_>>>()?;
+
+        // We use the first location to infer the partition columns,
+        // primarily for performance and simplicity reasons.
+        let partition_columns = infer_partition_columns(
+            &options,
+            session_state,
+            &table_paths[0],

Review Comment:
   Could this fail with out of bounds error ?
   Above while extracting the file extension you make a check whether the 
locations.len() is 1 or anything else (`_ => ""`), i.e. even 0 will match there.



##########
datafusion-cli/src/exec.rs:
##########
@@ -524,14 +529,16 @@ mod tests {
 
         if let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = 
&plan {
             let format = config_file_type_from_str(&cmd.file_type);
-            register_object_store_and_config_extensions(
-                &ctx,
-                &cmd.location,
-                &cmd.options,
-                format,
-                false,
-            )
-            .await?;
+            for location in &cmd.locations {
+                register_object_store_and_config_extensions(
+                    &ctx,
+                    &location,
+                    &cmd.options,
+                    format.clone(),
+                    false,
+                )
+                .await?;

Review Comment:
   At line 428 you join the `register_futures`. Here you don't. Why the 
difference ?



##########
datafusion/proto/src/logical_plan/mod.rs:
##########
@@ -1499,7 +1499,7 @@ impl AsLogicalPlan for LogicalPlanNode {
                     logical_plan_type: 
Some(LogicalPlanType::CreateExternalTable(
                         protobuf::CreateExternalTableNode {
                             name: Some(name.clone().into()),
-                            location: location.clone(),
+                            location: location.clone().join(","),

Review Comment:
   Comma is an allowed character in file names.
   Joining the file names by comma may break later when splitting if a file 
already has a comma in its name.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to