alamb commented on code in PR #17050: URL: https://github.com/apache/datafusion/pull/17050#discussion_r2265216268
########## datafusion/core/src/datasource/listing/table.rs: ########## @@ -802,6 +802,7 @@ impl ListingOptions { .rev() .skip(1) // get parents only; skip the file itself .rev() + .filter(|s| s.contains('=')) Review Comment: can we please add a comment about why this is done? I know it is obvious from the context of this PR, but it would help to have some sort of comment for the future / link to a description / spec of hive datasources to explain why paths with `=` are ignored ########## datafusion/core/src/datasource/listing_table_factory.rs: ########## @@ -375,4 +384,86 @@ mod tests { Pattern::new("*.csv").unwrap() ); } + + #[tokio::test] + async fn test_create_with_hive_partitions() { + let dir = tempfile::tempdir().unwrap(); + let mut path = PathBuf::from(dir.path()); + path.extend(["key1=value1", "key2=value2"]); Review Comment: This test fails like this without the changes in this PR ``` assertion `left == right` failed left: [("key1", Dictionary(UInt16, Utf8)), ("key2", Dictionary(UInt16, Utf8))] right: [] ``` ########## datafusion/core/src/datasource/listing_table_factory.rs: ########## @@ -375,4 +384,86 @@ mod tests { Pattern::new("*.csv").unwrap() ); } + + #[tokio::test] + async fn test_create_with_hive_partitions() { + let dir = tempfile::tempdir().unwrap(); + let mut path = PathBuf::from(dir.path()); + path.extend(["key1=value1", "key2=value2"]); + fs::create_dir_all(&path).unwrap(); + path.push("data.parquet"); + fs::File::create_new(&path).unwrap(); + + let factory = ListingTableFactory::new(); + let context = SessionContext::new(); + let state = context.state(); + let name = TableReference::bare("foo"); + + let cmd = CreateExternalTable { + name, + location: dir.path().to_str().unwrap().to_string(), + file_type: "parquet".to_string(), + schema: Arc::new(DFSchema::empty()), + table_partition_cols: vec![], + if_not_exists: false, + temporary: false, + definition: None, + order_exprs: vec![], + unbounded: false, + options: HashMap::new(), + constraints: Constraints::default(), + column_defaults: HashMap::new(), + }; + let table_provider = factory.create(&state, &cmd).await.unwrap(); + let listing_table = table_provider + .as_any() + .downcast_ref::<ListingTable>() + .unwrap(); + + let listing_options = listing_table.options(); + let dtype = + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)); + let expected_cols = vec![ + (String::from("key1"), dtype.clone()), + (String::from("key2"), dtype.clone()), + ]; + assert_eq!(expected_cols, listing_options.table_partition_cols); + } + + #[tokio::test] + async fn test_odd_directory_names() { Review Comment: this test passes even wihtout the changes in this PR ########## datafusion/core/src/datasource/listing_table_factory.rs: ########## @@ -63,16 +63,29 @@ impl TableProviderFactory for ListingTableFactory { ))? .create(session_state, &cmd.options)?; - let file_extension = get_extension(cmd.location.as_str()); + let mut table_path = ListingTableUrl::parse(&cmd.location)?; + let file_extension = match table_path.is_collection() { + true => "", + false => &get_extension(cmd.location.as_str()), + }; Review Comment: Can you please also add a comment explaining the rationale for the change? I agree it looks odd -- I haven't studied the code but it feels like maybe this is treating the symptom of some larger problem and a real fix elsewhere would be simpler -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org