This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new e9a77e0ea3 Add a hint about expected extension in error message in register_csv,… (#14168)
e9a77e0ea3 is described below

commit e9a77e0ea3e30b7f2718c9cea1fed023dca1f646
Author: Sergey Zhukov <[email protected]>
AuthorDate: Sat Jan 18 20:35:48 2025 +0300

    Add a hint about expected extension in error message in register_csv,… (#14168)
    
    * Add a hint about expected extension in error message in register_csv, register_parquet, register_json, register_avro (#14144)
    
    * Add tests for error
    
    * fix test
    
    * fmt
    
    * Fix issues causing GitHub checks to fail
    
    * revert datafusion-testing change
    
    ---------
    
    Co-authored-by: Sergey Zhukov <[email protected]>
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/core/src/execution/context/avro.rs    |  2 +
 datafusion/core/src/execution/context/csv.rs     |  2 +
 datafusion/core/src/execution/context/json.rs    |  2 +
 datafusion/core/src/execution/context/mod.rs     | 23 +++++++++
 datafusion/core/src/execution/context/parquet.rs |  2 +
 datafusion/core/tests/dataframe/mod.rs           | 65 +++++++++++++++++++++++-
 6 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/datafusion/core/src/execution/context/avro.rs b/datafusion/core/src/execution/context/avro.rs
index a31f2af642..be15e26961 100644
--- a/datafusion/core/src/execution/context/avro.rs
+++ b/datafusion/core/src/execution/context/avro.rs
@@ -46,6 +46,8 @@ impl SessionContext {
         let listing_options = options
             .to_listing_options(&self.copied_config(), self.copied_table_options());
 
+        self.register_type_check(table_path.as_ref(), &listing_options.file_extension)?;
+
         self.register_listing_table(
             table_ref,
             table_path,
diff --git a/datafusion/core/src/execution/context/csv.rs b/datafusion/core/src/execution/context/csv.rs
index e97c70ef98..9b4c0e3b29 100644
--- a/datafusion/core/src/execution/context/csv.rs
+++ b/datafusion/core/src/execution/context/csv.rs
@@ -62,6 +62,8 @@ impl SessionContext {
         let listing_options = options
             .to_listing_options(&self.copied_config(), self.copied_table_options());
 
+        self.register_type_check(table_path.as_ref(), &listing_options.file_extension)?;
+
         self.register_listing_table(
             table_ref,
             table_path,
diff --git a/datafusion/core/src/execution/context/json.rs b/datafusion/core/src/execution/context/json.rs
index c9a9492f91..013c47d046 100644
--- a/datafusion/core/src/execution/context/json.rs
+++ b/datafusion/core/src/execution/context/json.rs
@@ -48,6 +48,8 @@ impl SessionContext {
         let listing_options = options
             .to_listing_options(&self.copied_config(), self.copied_table_options());
 
+        self.register_type_check(table_path.as_ref(), &listing_options.file_extension)?;
+
         self.register_listing_table(
             table_ref,
             table_path,
diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs
index e5da49ad7b..e377dd6297 100644
--- a/datafusion/core/src/execution/context/mod.rs
+++ b/datafusion/core/src/execution/context/mod.rs
@@ -1379,6 +1379,29 @@ impl SessionContext {
         Ok(())
     }
 
+    fn register_type_check<P: DataFilePaths>(
+        &self,
+        table_paths: P,
+        extension: impl AsRef<str>,
+    ) -> Result<()> {
+        let table_paths = table_paths.to_urls()?;
+        if table_paths.is_empty() {
+            return exec_err!("No table paths were provided");
+        }
+
+        // check if the file extension matches the expected extension
+        let extension = extension.as_ref();
+        for path in &table_paths {
+            let file_path = path.as_str();
+            if !file_path.ends_with(extension) && !path.is_collection() {
+                return exec_err!(
+                    "File path '{file_path}' does not match the expected extension '{extension}'"
+                );
+            }
+        }
+        Ok(())
+    }
+
     /// Registers an Arrow file as a table that can be referenced from
     /// SQL statements executed against this context.
     pub async fn register_arrow(
diff --git a/datafusion/core/src/execution/context/parquet.rs b/datafusion/core/src/execution/context/parquet.rs
index be87c7cac1..67ccacaea6 100644
--- a/datafusion/core/src/execution/context/parquet.rs
+++ b/datafusion/core/src/execution/context/parquet.rs
@@ -50,6 +50,8 @@ impl SessionContext {
         let listing_options = options
             .to_listing_options(&self.copied_config(), self.copied_table_options());
 
+        self.register_type_check(table_path.as_ref(), &listing_options.file_extension)?;
+
         self.register_listing_table(
             table_ref,
             table_path,
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index b6023a726a..954c46ab27 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -54,8 +54,9 @@ use datafusion::error::Result;
 use datafusion::execution::context::SessionContext;
 use datafusion::execution::session_state::SessionStateBuilder;
 use datafusion::logical_expr::{ColumnarValue, Volatility};
-use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
-use datafusion::prelude::{JoinType, NdJsonReadOptions};
+use datafusion::prelude::{
+    AvroReadOptions, CsvReadOptions, JoinType, NdJsonReadOptions, ParquetReadOptions,
+};
 use datafusion::test_util::{
     parquet_test_data, populate_csv_partitions, register_aggregate_csv, test_table,
     test_table_with_name,
@@ -5121,3 +5122,63 @@ async fn test_alias_nested() -> Result<()> {
     );
     Ok(())
 }
+
+#[tokio::test]
+async fn register_non_json_file() {
+    let ctx = SessionContext::new();
+    let err = ctx
+        .register_json(
+            "data",
+            "tests/data/test_binary.parquet",
+            NdJsonReadOptions::default(),
+        )
+        .await;
+    assert_contains!(
+        err.unwrap_err().to_string(),
+        "test_binary.parquet' does not match the expected extension '.json'"
+    );
+}
+
+#[tokio::test]
+async fn register_non_csv_file() {
+    let ctx = SessionContext::new();
+    let err = ctx
+        .register_csv(
+            "data",
+            "tests/data/test_binary.parquet",
+            CsvReadOptions::default(),
+        )
+        .await;
+    assert_contains!(
+        err.unwrap_err().to_string(),
+        "test_binary.parquet' does not match the expected extension '.csv'"
+    );
+}
+
+#[tokio::test]
+async fn register_non_avro_file() {
+    let ctx = SessionContext::new();
+    let err = ctx
+        .register_avro(
+            "data",
+            "tests/data/test_binary.parquet",
+            AvroReadOptions::default(),
+        )
+        .await;
+    assert_contains!(
+        err.unwrap_err().to_string(),
+        "test_binary.parquet' does not match the expected extension '.avro'"
+    );
+}
+
+#[tokio::test]
+async fn register_non_parquet_file() {
+    let ctx = SessionContext::new();
+    let err = ctx
+        .register_parquet("data", "tests/data/1.json", ParquetReadOptions::default())
+        .await;
+    assert_contains!(
+        err.unwrap_err().to_string(),
+        "1.json' does not match the expected extension '.parquet'"
+    );
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to