Jefffrey commented on code in PR #20813:
URL: https://github.com/apache/datafusion/pull/20813#discussion_r3090727569


##########
datafusion/common/src/file_options/csv_writer.rs:
##########
@@ -94,6 +94,19 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {
         if let Some(v) = &value.double_quote {
             builder = builder.with_double_quote(*v)
         }
+        let style = match value.quote_style {
+            CsvQuoteStyle::Always => QuoteStyle::Always,

Review Comment:
   nit: pull this into a `From` impl where `CsvQuoteStyle` is defined



##########
datafusion/common/src/parsers.rs:
##########
@@ -73,3 +73,48 @@ impl CompressionTypeVariant {
         !matches!(self, &Self::UNCOMPRESSED)
     }
 }
+
+/// CSV quote style
+///
+/// Controls when fields are quoted when writing CSV files.
+/// Corresponds to [`arrow::csv::QuoteStyle`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
+pub enum CsvQuoteStyle {
+    /// Quote all fields
+    Always,
+    /// Only quote fields when necessary (default)
+    #[default]
+    Necessary,
+    /// Quote all non-numeric fields
+    NonNumeric,
+    /// Never quote fields
+    Never,
+}
+
+impl FromStr for CsvQuoteStyle {
+    type Err = DataFusionError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "Always" | "ALWAYS" | "always" => Ok(Self::Always),
+            "Necessary" | "NECESSARY" | "necessary" | "" => 
Ok(Self::Necessary),
+            "NonNumeric" | "NON_NUMERIC" | "nonnumeric" => 
Ok(Self::NonNumeric),
+            "Never" | "NEVER" | "never" => Ok(Self::Never),
+            _ => Err(DataFusionError::NotImplemented(format!(
+                "Unsupported CSV quote style {s}"
+            ))),
+        }

Review Comment:
   ```suggestion
           match s.to_lowercase().as_str() {
               "always" => Ok(Self::Always),
               "necessary" | "" => Ok(Self::Necessary),
               "non_numeric" | "nonnumeric" => Ok(Self::NonNumeric),
               "never" => Ok(Self::Never),
               _ => Err(DataFusionError::NotImplemented(format!(
                   "Unsupported CSV quote style {s}"
               ))),
           }
   ```
   
   I think it might be a better idea to be permissive like this; also not sure 
about that empty string parsing as the default 🤔 



##########
datafusion/common/src/config.rs:
##########
@@ -2927,6 +2938,13 @@ config_namespace! {
         pub terminator: Option<u8>, default = None
         pub escape: Option<u8>, default = None
         pub double_quote: Option<bool>, default = None
+        /// Quote style for CSV writing.
+        /// One of: "Always", "Necessary", "NonNumeric", "Never"
+        pub quote_style: CsvQuoteStyle, default = CsvQuoteStyle::Necessary
+        /// Whether to ignore leading whitespace in string values when writing 
CSV.
+        pub ignore_leading_whitespace: Option<bool>, default = None
+        /// Whether to ignore trailing whitespace in string values when 
writing CSV.
+        pub ignore_trailing_whitespace: Option<bool>, default = None
         /// Specifies whether newlines in (quoted) values are supported.

Review Comment:
   I'm wondering why we have this trend of doing `Option<bool>` instead of just 
`bool` 🤔
   
   What is the default in case of `None`, and how is that different from just 
`true`/`false` (whichever it is)?



##########
datafusion/sqllogictest/test_files/csv_files.slt:
##########
@@ -380,3 +380,200 @@ SET datafusion.optimizer.repartition_file_min_size = 
10485760;
 
 statement ok
 drop table stored_table_with_cr_terminator;
+
+# Test quote_style option
+
+statement ok
+CREATE TABLE quote_style_source (
+  int_col INT,
+  string_col TEXT,
+  float_col DOUBLE
+) AS VALUES
+(1, 'hello', 1.1),
+(2, 'world', 2.2),
+(3, 'comma,value', 3.3);
+
+# QuoteStyle::Always - all fields are quoted
+query I
+COPY quote_style_source TO 
'test_files/scratch/csv_files/quote_style_always.csv'
+STORED AS csv
+OPTIONS ('format.has_header' 'true', 'format.quote_style' 'Always');
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE stored_quote_style_always (
+  int_col TEXT,
+  string_col TEXT,
+  float_col TEXT
+) STORED AS CSV
+LOCATION 'test_files/scratch/csv_files/quote_style_always.csv'
+OPTIONS ('format.has_header' 'true', 'format.quote_style' 'Never');
+
+# All values should have been quoted, but reading them back strips the quotes
+query TTT
+select * from stored_quote_style_always;
+----
+1 hello 1.1
+2 world 2.2
+3 comma,value 3.3
+
+statement ok
+DROP TABLE stored_quote_style_always;
+
+# QuoteStyle::NonNumeric - only string fields are quoted
+query I
+COPY quote_style_source TO 
'test_files/scratch/csv_files/quote_style_nonnumeric.csv'
+STORED AS csv
+OPTIONS ('format.has_header' 'true', 'format.quote_style' 'NonNumeric');
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE stored_quote_style_nonnumeric (
+  int_col TEXT,
+  string_col TEXT,
+  float_col TEXT
+) STORED AS CSV
+LOCATION 'test_files/scratch/csv_files/quote_style_nonnumeric.csv'
+OPTIONS ('format.has_header' 'true', 'format.quote_style' 'Never');
+
+query TTT
+select * from stored_quote_style_nonnumeric;
+----
+1 hello 1.1
+2 world 2.2
+3 comma,value 3.3
+
+statement ok
+DROP TABLE stored_quote_style_nonnumeric;
+
+# QuoteStyle::Never - no fields are quoted (can produce invalid CSV)
+# Note: the comma in 'comma,value' will NOT be quoted, so reading back
+# will see an extra column
+query I
+COPY quote_style_source TO 'test_files/scratch/csv_files/quote_style_never.csv'
+STORED AS csv
+OPTIONS ('format.has_header' 'true', 'format.quote_style' 'Never');
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE stored_quote_style_never (
+  int_col TEXT,
+  string_col TEXT,
+  float_col TEXT,
+  extra TEXT
+) STORED AS CSV
+LOCATION 'test_files/scratch/csv_files/quote_style_never.csv'
+OPTIONS ('format.has_header' 'false', 'format.truncated_rows' 'true');
+
+# Rows without commas in data parse normally (extra column is NULL),
+# while the row with 'comma,value' splits across columns
+query TTTT
+select * from stored_quote_style_never order by int_col;
+----
+1 hello 1.1 NULL
+2 world 2.2 NULL
+3 comma value 3.3
+int_col string_col float_col NULL
+
+statement ok
+DROP TABLE stored_quote_style_never;
+
+statement ok
+DROP TABLE quote_style_source;
+
+# Test ignore_leading_whitespace and ignore_trailing_whitespace options
+
+statement ok
+CREATE TABLE whitespace_source (
+  id INT,
+  value TEXT
+) AS VALUES
+(1, '  hello  '),
+(2, '  world  '),
+(3, 'no_space');
+
+# Write with ignore_leading_whitespace to trim leading spaces
+query I
+COPY whitespace_source TO 'test_files/scratch/csv_files/trim_leading.csv'
+STORED AS csv
+OPTIONS ('format.has_header' 'true', 'format.ignore_leading_whitespace' 
'true');
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE stored_trim_leading (
+  id INT,
+  value TEXT
+) STORED AS CSV
+LOCATION 'test_files/scratch/csv_files/trim_leading.csv'
+OPTIONS ('format.has_header' 'true');
+
+query IT
+select * from stored_trim_leading order by id;
+----
+1 hello

Review Comment:
   Do we expect to see trailing whitespace here? Or is there an issue with the 
runner on how it treats trailing whitespaces 🤔 



##########
datafusion/sqllogictest/test_files/csv_files.slt:
##########
@@ -380,3 +380,200 @@ SET datafusion.optimizer.repartition_file_min_size = 
10485760;
 
 statement ok
 drop table stored_table_with_cr_terminator;
+
+# Test quote_style option
+
+statement ok
+CREATE TABLE quote_style_source (
+  int_col INT,
+  string_col TEXT,
+  float_col DOUBLE
+) AS VALUES
+(1, 'hello', 1.1),
+(2, 'world', 2.2),
+(3, 'comma,value', 3.3);
+
+# QuoteStyle::Always - all fields are quoted
+query I
+COPY quote_style_source TO 
'test_files/scratch/csv_files/quote_style_always.csv'
+STORED AS csv
+OPTIONS ('format.has_header' 'true', 'format.quote_style' 'Always');
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE stored_quote_style_always (
+  int_col TEXT,
+  string_col TEXT,
+  float_col TEXT
+) STORED AS CSV
+LOCATION 'test_files/scratch/csv_files/quote_style_always.csv'
+OPTIONS ('format.has_header' 'true', 'format.quote_style' 'Never');

Review Comment:
   Bit confused why we set quote style if it's meant to be a write property?



##########
datafusion/common/src/parsers.rs:
##########
@@ -73,3 +73,48 @@ impl CompressionTypeVariant {
         !matches!(self, &Self::UNCOMPRESSED)
     }
 }
+
+/// CSV quote style
+///
+/// Controls when fields are quoted when writing CSV files.
+/// Corresponds to [`arrow::csv::QuoteStyle`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
+pub enum CsvQuoteStyle {
+    /// Quote all fields
+    Always,
+    /// Only quote fields when necessary (default)
+    #[default]
+    Necessary,
+    /// Quote all non-numeric fields
+    NonNumeric,
+    /// Never quote fields
+    Never,
+}

Review Comment:
   I believe this is in line with trend of having types here be DF versions; 
for example we have a DF version of parquet writer version as well:
   
   
https://github.com/apache/datafusion/blob/cc4717a14452e65a4d99c4b180d62c2793b713bd/datafusion/common/src/parquet_config.rs#L24-L36



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to