alamb commented on code in PR #10404:
URL: https://github.com/apache/datafusion/pull/10404#discussion_r1592885993


##########
datafusion/common/src/config.rs:
##########
@@ -1564,7 +1564,7 @@ config_namespace_with_hashmap! {
 config_namespace! {
     /// Options controlling CSV format
     pub struct CsvOptions {
-        pub has_header: bool, default = true
+        pub has_header: Option<bool>, default = None

Review Comment:
   👍 



##########
datafusion/sql/src/parser.rs:
##########
@@ -1009,34 +996,33 @@ mod tests {
             columns: vec![make_column_def("c1", DataType::Int(None))],
             file_type: "CSV".to_string(),
             has_header: false,
-            delimiter: ',',
             location: "foo.csv".into(),
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
-            file_compression_type: UNCOMPRESSED,
             unbounded: false,
             options: HashMap::new(),
             constraints: vec![],
         });
         expect_parse_ok(sql, expected)?;
 
         // positive case with delimiter
-        let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV DELIMITER '|' 
LOCATION 'foo.csv'";
+        let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 
'foo.csv' OPTIONS ('format.delimiter' '|')";

Review Comment:
   Is it required to removing the existing syntax support for `DELIMTER` and 
`COMPRESSION` -- I think other users will see that as a regression. 
   
   It seems like it would be possible to apply the same type of logic for 
`has_header` to these other options to prevent conflicts and silent igores
   
   



##########
datafusion/core/src/datasource/file_format/csv.rs:
##########
@@ -803,9 +819,11 @@ mod tests {
         file_name: &str,
         projection: Option<Vec<usize>>,
         limit: Option<usize>,
+        hash_header: bool,

Review Comment:
   possibly a typo
   
   ```suggestion
           has_header: bool,
   ```



##########
datafusion/core/src/datasource/file_format/csv.rs:
##########
@@ -136,13 +136,13 @@ impl CsvFormat {
     /// Set true to indicate that the first line is a header.
     /// - default to true
     pub fn with_has_header(mut self, has_header: bool) -> Self {
-        self.options.has_header = has_header;
+        self.options.has_header = Some(has_header);
         self
     }
 
     /// True if the first line is a header.
     pub fn has_header(&self) -> bool {

Review Comment:
   Likewise here, I wonder if returning `Option<bool>` would be less confusing



##########
datafusion/core/src/datasource/file_format/csv.rs:
##########
@@ -236,13 +236,15 @@ impl FileFormat for CsvFormat {
 
     async fn create_physical_plan(
         &self,
-        _state: &SessionState,
+        state: &SessionState,
         conf: FileScanConfig,
         _filters: Option<&Arc<dyn PhysicalExpr>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let exec = CsvExec::new(
             conf,
-            self.options.has_header,
+            self.options
+                .has_header
+                .unwrap_or(state.config().options().catalog.has_header),

Review Comment:
   It took me a while to understand why this is reading from 
`catalog.has_header` (which is different than `format.has_header`). I think it 
is because catalog.has_header is the default value that is used when has_header 
has not been explicitly specified for the table.
   
   If so this make sense - but maybe it is worth a comment on 
`catalog.has_header` explaining this (I can make a follow on PR too)



##########
datafusion/core/src/datasource/file_format/csv.rs:
##########
@@ -298,7 +301,12 @@ impl CsvFormat {
 
         while let Some(chunk) = stream.next().await.transpose()? {
             let format = arrow::csv::reader::Format::default()
-                .with_header(self.options.has_header && first_chunk)
+                .with_header(
+                    self.options
+                        .has_header
+                        .unwrap_or(state.config_options().catalog.has_header)
+                        && first_chunk,

Review Comment:
   It might be possible to avoid some redundant code (and also document the 
behavior) by pulling this into a function like
   
   ```rust
   impl CsvFormat { 
   ..
     /// returns true if the first line should be treated like a header
     /// if not explicitly specified by the `CREATE TABLE` statement or format 
options
     /// defaults to the default `catalog.has_header` value
     fn has_header(&self) -> bool {
       self.options
         .has_header
         .unwrap_or(state.config_options().catalog.has_header)
     }
   ..
   }
   ```



##########
datafusion/common/src/config.rs:
##########
@@ -1600,13 +1600,13 @@ impl CsvOptions {
     /// Set true to indicate that the first line is a header.
     /// - default to true
     pub fn with_has_header(mut self, has_header: bool) -> Self {
-        self.has_header = has_header;
+        self.has_header = Some(has_header);
         self
     }
 
     /// True if the first line is a header.
     pub fn has_header(&self) -> bool {

Review Comment:
   I wonder if it would be clearer to change this to to return `Option<bool>` 
to mirror the underlying struct 🤔 



##########
datafusion/core/src/datasource/listing_table_factory.rs:
##########
@@ -67,9 +67,13 @@ impl TableProviderFactory for ListingTableFactory {
         let file_format: Arc<dyn FileFormat> = match file_type {
             FileType::CSV => {
                 let mut csv_options = table_options.csv;
-                csv_options.has_header = cmd.has_header;
-                csv_options.delimiter = cmd.delimiter as u8;
-                csv_options.compression = cmd.file_compression_type;
+                if let Some(has_header) = csv_options.has_header {
+                    if cmd.has_header && !has_header {
+                        return plan_err!("Conflicting header options for CSV 
file");

Review Comment:
   Nice!
   
   Can you please add a test for this error? Perhaps using the example from 
https://github.com/apache/datafusion/issues/9945 ?



##########
datafusion/common/src/config.rs:
##########
@@ -1564,7 +1564,7 @@ config_namespace_with_hashmap! {
 config_namespace! {
     /// Options controlling CSV format
     pub struct CsvOptions {
-        pub has_header: bool, default = true
+        pub has_header: Option<bool>, default = None

Review Comment:
   I think it would help to explain why this is an `Option` which is different 
than the other options. Something like
   
   ```suggestion
           /// If the first line of the CSV is column names. 
           /// If not specified, uses default from `CREATE TABLE` command, if 
any. 
           pub has_header: Option<bool>, default = None
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to