This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new e9a77e0ea3 Add a hint about expected extension in error message in
register_csv,… (#14168)
e9a77e0ea3 is described below
commit e9a77e0ea3e30b7f2718c9cea1fed023dca1f646
Author: Sergey Zhukov <[email protected]>
AuthorDate: Sat Jan 18 20:35:48 2025 +0300
Add a hint about expected extension in error message in register_csv,…
(#14168)
* Add a hint about expected extension in error message in register_csv,
register_parquet, register_json, register_avro (#14144)
* Add tests for error
* fix test
* fmt
* Fix issues causing GitHub checks to fail
* revert datafusion-testing change
---------
Co-authored-by: Sergey Zhukov <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/core/src/execution/context/avro.rs | 2 +
datafusion/core/src/execution/context/csv.rs | 2 +
datafusion/core/src/execution/context/json.rs | 2 +
datafusion/core/src/execution/context/mod.rs | 23 +++++++++
datafusion/core/src/execution/context/parquet.rs | 2 +
datafusion/core/tests/dataframe/mod.rs | 65 +++++++++++++++++++++++-
6 files changed, 94 insertions(+), 2 deletions(-)
diff --git a/datafusion/core/src/execution/context/avro.rs b/datafusion/core/src/execution/context/avro.rs
index a31f2af642..be15e26961 100644
--- a/datafusion/core/src/execution/context/avro.rs
+++ b/datafusion/core/src/execution/context/avro.rs
@@ -46,6 +46,8 @@ impl SessionContext {
let listing_options = options
.to_listing_options(&self.copied_config(), self.copied_table_options());
+ self.register_type_check(table_path.as_ref(), &listing_options.file_extension)?;
+
self.register_listing_table(
table_ref,
table_path,
diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs
index e97c70ef98..9b4c0e3b29 100644
--- a/datafusion/core/src/execution/context/csv.rs
+++ b/datafusion/core/src/execution/context/csv.rs
@@ -62,6 +62,8 @@ impl SessionContext {
let listing_options = options
.to_listing_options(&self.copied_config(), self.copied_table_options());
+ self.register_type_check(table_path.as_ref(), &listing_options.file_extension)?;
+
self.register_listing_table(
table_ref,
table_path,
diff --git a/datafusion/core/src/execution/context/json.rs b/datafusion/core/src/execution/context/json.rs
index c9a9492f91..013c47d046 100644
--- a/datafusion/core/src/execution/context/json.rs
+++ b/datafusion/core/src/execution/context/json.rs
@@ -48,6 +48,8 @@ impl SessionContext {
let listing_options = options
.to_listing_options(&self.copied_config(), self.copied_table_options());
+ self.register_type_check(table_path.as_ref(), &listing_options.file_extension)?;
+
self.register_listing_table(
table_ref,
table_path,
diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs
index e5da49ad7b..e377dd6297 100644
--- a/datafusion/core/src/execution/context/mod.rs
+++ b/datafusion/core/src/execution/context/mod.rs
@@ -1379,6 +1379,29 @@ impl SessionContext {
Ok(())
}
+ fn register_type_check<P: DataFilePaths>(
+ &self,
+ table_paths: P,
+ extension: impl AsRef<str>,
+ ) -> Result<()> {
+ let table_paths = table_paths.to_urls()?;
+ if table_paths.is_empty() {
+ return exec_err!("No table paths were provided");
+ }
+
+ // check if the file extension matches the expected extension
+ let extension = extension.as_ref();
+ for path in &table_paths {
+ let file_path = path.as_str();
+ if !file_path.ends_with(extension) && !path.is_collection() {
+ return exec_err!(
+ "File path '{file_path}' does not match the expected extension '{extension}'"
+ );
+ }
+ }
+ Ok(())
+ }
+
/// Registers an Arrow file as a table that can be referenced from
/// SQL statements executed against this context.
pub async fn register_arrow(
diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs
index be87c7cac1..67ccacaea6 100644
--- a/datafusion/core/src/execution/context/parquet.rs
+++ b/datafusion/core/src/execution/context/parquet.rs
@@ -50,6 +50,8 @@ impl SessionContext {
let listing_options = options
.to_listing_options(&self.copied_config(), self.copied_table_options());
+ self.register_type_check(table_path.as_ref(), &listing_options.file_extension)?;
+
self.register_listing_table(
table_ref,
table_path,
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index b6023a726a..954c46ab27 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -54,8 +54,9 @@ use datafusion::error::Result;
use datafusion::execution::context::SessionContext;
use datafusion::execution::session_state::SessionStateBuilder;
use datafusion::logical_expr::{ColumnarValue, Volatility};
-use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
-use datafusion::prelude::{JoinType, NdJsonReadOptions};
+use datafusion::prelude::{
+ AvroReadOptions, CsvReadOptions, JoinType, NdJsonReadOptions, ParquetReadOptions,
+};
use datafusion::test_util::{
parquet_test_data, populate_csv_partitions, register_aggregate_csv, test_table,
test_table_with_name,
@@ -5121,3 +5122,63 @@ async fn test_alias_nested() -> Result<()> {
);
Ok(())
}
+
+#[tokio::test]
+async fn register_non_json_file() {
+ let ctx = SessionContext::new();
+ let err = ctx
+ .register_json(
+ "data",
+ "tests/data/test_binary.parquet",
+ NdJsonReadOptions::default(),
+ )
+ .await;
+ assert_contains!(
+ err.unwrap_err().to_string(),
+ "test_binary.parquet' does not match the expected extension '.json'"
+ );
+}
+
+#[tokio::test]
+async fn register_non_csv_file() {
+ let ctx = SessionContext::new();
+ let err = ctx
+ .register_csv(
+ "data",
+ "tests/data/test_binary.parquet",
+ CsvReadOptions::default(),
+ )
+ .await;
+ assert_contains!(
+ err.unwrap_err().to_string(),
+ "test_binary.parquet' does not match the expected extension '.csv'"
+ );
+}
+
+#[tokio::test]
+async fn register_non_avro_file() {
+ let ctx = SessionContext::new();
+ let err = ctx
+ .register_avro(
+ "data",
+ "tests/data/test_binary.parquet",
+ AvroReadOptions::default(),
+ )
+ .await;
+ assert_contains!(
+ err.unwrap_err().to_string(),
+ "test_binary.parquet' does not match the expected extension '.avro'"
+ );
+}
+
+#[tokio::test]
+async fn register_non_parquet_file() {
+ let ctx = SessionContext::new();
+ let err = ctx
+ .register_parquet("data", "tests/data/1.json", ParquetReadOptions::default())
+ .await;
+ assert_contains!(
+ err.unwrap_err().to_string(),
+ "1.json' does not match the expected extension '.parquet'"
+ );
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]