alamb commented on code in PR #17050:
URL: https://github.com/apache/datafusion/pull/17050#discussion_r2265216268


##########
datafusion/core/src/datasource/listing/table.rs:
##########
@@ -802,6 +802,7 @@ impl ListingOptions {
                     .rev()
                     .skip(1) // get parents only; skip the file itself
                     .rev()
+                    .filter(|s| s.contains('='))

Review Comment:
   can we please add a comment about why this is done?
   I know it is obvious from the context of this PR, but it would help to have 
some sort of
   comment for the future / link to a description / spec of hive datasources to 
explain why paths with `=` are ignored



##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -375,4 +384,86 @@ mod tests {
             Pattern::new("*.csv").unwrap()
         );
     }
+
+    #[tokio::test]
+    async fn test_create_with_hive_partitions() {
+        let dir = tempfile::tempdir().unwrap();
+        let mut path = PathBuf::from(dir.path());
+        path.extend(["key1=value1", "key2=value2"]);

Review Comment:
   This test fails like this without the changes in this PR
   
   ```
   assertion `left == right` failed
     left: [("key1", Dictionary(UInt16, Utf8)), ("key2", Dictionary(UInt16, 
Utf8))]
    right: []
   
   ```



##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -375,4 +384,86 @@ mod tests {
             Pattern::new("*.csv").unwrap()
         );
     }
+
+    #[tokio::test]
+    async fn test_create_with_hive_partitions() {
+        let dir = tempfile::tempdir().unwrap();
+        let mut path = PathBuf::from(dir.path());
+        path.extend(["key1=value1", "key2=value2"]);
+        fs::create_dir_all(&path).unwrap();
+        path.push("data.parquet");
+        fs::File::create_new(&path).unwrap();
+
+        let factory = ListingTableFactory::new();
+        let context = SessionContext::new();
+        let state = context.state();
+        let name = TableReference::bare("foo");
+
+        let cmd = CreateExternalTable {
+            name,
+            location: dir.path().to_str().unwrap().to_string(),
+            file_type: "parquet".to_string(),
+            schema: Arc::new(DFSchema::empty()),
+            table_partition_cols: vec![],
+            if_not_exists: false,
+            temporary: false,
+            definition: None,
+            order_exprs: vec![],
+            unbounded: false,
+            options: HashMap::new(),
+            constraints: Constraints::default(),
+            column_defaults: HashMap::new(),
+        };
+        let table_provider = factory.create(&state, &cmd).await.unwrap();
+        let listing_table = table_provider
+            .as_any()
+            .downcast_ref::<ListingTable>()
+            .unwrap();
+
+        let listing_options = listing_table.options();
+        let dtype =
+            DataType::Dictionary(Box::new(DataType::UInt16), 
Box::new(DataType::Utf8));
+        let expected_cols = vec![
+            (String::from("key1"), dtype.clone()),
+            (String::from("key2"), dtype.clone()),
+        ];
+        assert_eq!(expected_cols, listing_options.table_partition_cols);
+    }
+
+    #[tokio::test]
+    async fn test_odd_directory_names() {

Review Comment:
   this test passes even wihtout the changes in this PR



##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -63,16 +63,29 @@ impl TableProviderFactory for ListingTableFactory {
             ))?
             .create(session_state, &cmd.options)?;
 
-        let file_extension = get_extension(cmd.location.as_str());
+        let mut table_path = ListingTableUrl::parse(&cmd.location)?;
+        let file_extension = match table_path.is_collection() {
+            true => "",
+            false => &get_extension(cmd.location.as_str()),
+        };

Review Comment:
   Can you please also add a comment explaining the rationale for the change?
   
   I agree it looks odd -- I haven't studied the code but it feels like maybe 
this is treating the symptom of some larger problem and a real fix elsewhere 
would be simpler 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to