This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 0bd880931e fix: csv schema_infer_max_records set to 0 return null
datatype (#19432)
0bd880931e is described below
commit 0bd880931e9e31179c7a39c0c3dcef9150ec07ee
Author: Huaijin <[email protected]>
AuthorDate: Wed Dec 24 09:01:25 2025 +0800
fix: csv schema_infer_max_records set to 0 return null datatype (#19432)
## Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax. For example
`Closes #123` indicates that this PR will close issue #123.
-->
- close https://github.com/apache/datafusion/issues/19417
## Rationale for this change
- see https://github.com/apache/datafusion/issues/19417
- related to https://github.com/apache/datafusion/pull/17796
## What changes are included in this PR?
When `schema_infer_max_records` is set to 0 for CSV, the datatype of every field is returned as string (Utf8) instead of Null.
## Are these changes tested?
Yes — a test case was added covering `schema_infer_max_records` equal to 0.
## Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
-->
<!--
If there are any breaking changes to public APIs, please add the `api
change` label.
-->
---
datafusion/core/src/datasource/file_format/csv.rs | 28 +++++++++++++
datafusion/datasource-csv/src/file_format.rs | 50 +++++++++++++++++++----
docs/source/user-guide/sql/format_options.md | 38 ++++++++---------
3 files changed, 88 insertions(+), 28 deletions(-)
diff --git a/datafusion/core/src/datasource/file_format/csv.rs
b/datafusion/core/src/datasource/file_format/csv.rs
index ddbf810796..719bc4361a 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -1536,4 +1536,32 @@ mod tests {
Ok(())
}
+
+ #[tokio::test]
+ async fn test_infer_schema_with_zero_max_records() -> Result<()> {
+ let session_ctx = SessionContext::new();
+ let state = session_ctx.state();
+
+ let root = format!("{}/csv", arrow_test_data());
+ let format = CsvFormat::default()
+ .with_has_header(true)
+ .with_schema_infer_max_rec(0); // Set to 0 to disable inference
+ let exec = scan_format(
+ &state,
+ &format,
+ None,
+ &root,
+ "aggregate_test_100.csv",
+ None,
+ None,
+ )
+ .await?;
+
+ // related to https://github.com/apache/datafusion/issues/19417
+ for f in exec.schema().fields() {
+ assert_eq!(*f.data_type(), DataType::Utf8);
+ }
+
+ Ok(())
+ }
}
diff --git a/datafusion/datasource-csv/src/file_format.rs
b/datafusion/datasource-csv/src/file_format.rs
index 1bb8679102..efb7829179 100644
--- a/datafusion/datasource-csv/src/file_format.rs
+++ b/datafusion/datasource-csv/src/file_format.rs
@@ -211,6 +211,11 @@ impl CsvFormat {
/// Set a limit in terms of records to scan to infer the schema
/// - default to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
+ ///
+ /// # Behavior when set to 0
+ ///
+ /// When `max_rec` is set to 0, schema inference is disabled and all fields
+ /// will be inferred as `Utf8` (string) type, regardless of their actual
content.
pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self {
self.options.schema_infer_max_rec = Some(max_rec);
self
@@ -529,6 +534,7 @@ impl CsvFormat {
let mut column_names = vec![];
let mut column_type_possibilities = vec![];
let mut record_number = -1;
+ let initial_records_to_read = records_to_read;
pin_mut!(stream);
@@ -619,12 +625,31 @@ impl CsvFormat {
}
}
- let schema = build_schema_helper(column_names,
column_type_possibilities);
+ let schema = build_schema_helper(
+ column_names,
+ column_type_possibilities,
+ initial_records_to_read == 0,
+ );
Ok((schema, total_records_read))
}
}
-fn build_schema_helper(names: Vec<String>, types: Vec<HashSet<DataType>>) ->
Schema {
+/// Builds a schema from column names and their possible data types.
+///
+/// # Arguments
+///
+/// * `names` - Vector of column names
+/// * `types` - Vector of possible data types for each column (as HashSets)
+/// * `disable_inference` - When true, forces all columns with no inferred
types to be Utf8.
+/// This should be set to true when `schema_infer_max_rec` is explicitly
+/// set to 0, indicating the user wants to skip type inference and treat
+/// all fields as strings. When false, columns with no inferred types
+/// will be set to Null, allowing schema merging to work properly.
+fn build_schema_helper(
+ names: Vec<String>,
+ types: Vec<HashSet<DataType>>,
+ disable_inference: bool,
+) -> Schema {
let fields = names
.into_iter()
.zip(types)
@@ -637,10 +662,17 @@ fn build_schema_helper(names: Vec<String>, types:
Vec<HashSet<DataType>>) -> Sch
data_type_possibilities.remove(&DataType::Null);
match data_type_possibilities.len() {
- // Return Null for columns with only nulls / empty files
- // This allows schema merging to work when reading folders
- // such files along with normal files.
- 0 => Field::new(field_name, DataType::Null, true),
+ // When no types were inferred (empty HashSet):
+ // - If schema_infer_max_rec was explicitly set to 0, return
Utf8
+ // - Otherwise return Null (whether from reading null values
or empty files)
+ // This allows schema merging to work when reading folders
with empty files
+ 0 => {
+ if disable_inference {
+ Field::new(field_name, DataType::Utf8, true)
+ } else {
+ Field::new(field_name, DataType::Null, true)
+ }
+ }
1 => Field::new(
field_name,
data_type_possibilities.iter().next().unwrap().clone(),
@@ -832,7 +864,7 @@ mod tests {
HashSet::from([DataType::Utf8]), // col5
];
- let schema = build_schema_helper(column_names,
column_type_possibilities);
+ let schema = build_schema_helper(column_names,
column_type_possibilities, false);
// Verify schema has 5 columns
assert_eq!(schema.fields().len(), 5);
@@ -862,7 +894,7 @@ mod tests {
HashSet::from([DataType::Utf8]), // Should
remain Utf8
];
- let schema = build_schema_helper(column_names,
column_type_possibilities);
+ let schema = build_schema_helper(column_names,
column_type_possibilities, false);
// col1 should be Float64 due to Int64 + Float64 = Float64
assert_eq!(*schema.field(0).data_type(), DataType::Float64);
@@ -880,7 +912,7 @@ mod tests {
HashSet::from([DataType::Boolean, DataType::Int64,
DataType::Utf8]), // Should resolve to Utf8 due to conflicts
];
- let schema = build_schema_helper(column_names,
column_type_possibilities);
+ let schema = build_schema_helper(column_names,
column_type_possibilities, false);
// Should default to Utf8 for conflicting types
assert_eq!(*schema.field(0).data_type(), DataType::Utf8);
diff --git a/docs/source/user-guide/sql/format_options.md
b/docs/source/user-guide/sql/format_options.md
index e8008eafb1..d349bc1c98 100644
--- a/docs/source/user-guide/sql/format_options.md
+++ b/docs/source/user-guide/sql/format_options.md
@@ -99,25 +99,25 @@ OPTIONS('COMPRESSION' 'gzip');
The following options are available when reading or writing CSV files. Note:
If any unsupported option is specified, an error will be raised and the query
will fail.
-| Option | Description
|
Default Value |
-| -------------------- |
---------------------------------------------------------------------------------------------------------------------------------
| ------------------ |
-| COMPRESSION | Sets the compression that should be applied to the
entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED.
| UNCOMPRESSED |
-| HAS_HEADER | Sets if the CSV file should include column headers.
If not set, uses session or system default. |
None |
-| DELIMITER | Sets the character which should be used as the column
delimiter within the CSV file. |
`,` (comma) |
-| QUOTE | Sets the character which should be used for quoting
values within the CSV file. |
`"` (double quote) |
-| TERMINATOR | Sets the character which should be used as the line
terminator within the CSV file. |
None |
-| ESCAPE | Sets the character which should be used for escaping
special characters within the CSV file. |
None |
-| DOUBLE_QUOTE | Sets if quotes within quoted fields should be escaped
by doubling them (e.g., `"aaa""bbb"`). |
None |
-| NEWLINES_IN_VALUES | Sets if newlines in quoted values are supported. If
not set, uses session or system default. |
None |
-| DATE_FORMAT | Sets the format that dates should be encoded in
within the CSV file.
| None |
-| DATETIME_FORMAT | Sets the format that datetimes should be encoded in
within the CSV file. |
None |
-| TIMESTAMP_FORMAT | Sets the format that timestamps should be encoded in
within the CSV file. |
None |
-| TIMESTAMP_TZ_FORMAT | Sets the format that timestamps with timezone should
be encoded in within the CSV file. |
None |
-| TIME_FORMAT | Sets the format that times should be encoded in
within the CSV file.
| None |
-| NULL_VALUE | Sets the string which should be used to indicate null
values within the CSV file. |
None |
-| NULL_REGEX | Sets the regex pattern to match null values when
loading CSVs.
| None |
-| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer
the schema. |
None |
-| COMMENT | Sets the character which should be used to indicate
comment lines in the CSV file. |
None |
+| Option | Description
| Default Value |
+| -------------------- |
----------------------------------------------------------------------------------------------------------------------------------------------------------------
| ------------------ |
+| COMPRESSION | Sets the compression that should be applied to the
entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED.
| UNCOMPRESSED |
+| HAS_HEADER | Sets if the CSV file should include column headers.
If not set, uses session or system default.
| None |
+| DELIMITER | Sets the character which should be used as the column
delimiter within the CSV file.
| `,` (comma) |
+| QUOTE | Sets the character which should be used for quoting
values within the CSV file.
| `"` (double quote) |
+| TERMINATOR | Sets the character which should be used as the line
terminator within the CSV file.
| None |
+| ESCAPE | Sets the character which should be used for escaping
special characters within the CSV file.
| None |
+| DOUBLE_QUOTE | Sets if quotes within quoted fields should be escaped
by doubling them (e.g., `"aaa""bbb"`).
| None |
+| NEWLINES_IN_VALUES | Sets if newlines in quoted values are supported. If
not set, uses session or system default.
| None |
+| DATE_FORMAT | Sets the format that dates should be encoded in
within the CSV file.
| None |
+| DATETIME_FORMAT | Sets the format that datetimes should be encoded in
within the CSV file.
| None |
+| TIMESTAMP_FORMAT | Sets the format that timestamps should be encoded in
within the CSV file.
| None |
+| TIMESTAMP_TZ_FORMAT | Sets the format that timestamps with timezone should
be encoded in within the CSV file.
| None |
+| TIME_FORMAT | Sets the format that times should be encoded in
within the CSV file.
| None |
+| NULL_VALUE | Sets the string which should be used to indicate null
values within the CSV file.
| None |
+| NULL_REGEX | Sets the regex pattern to match null values when
loading CSVs.
| None |
+| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer
the schema. If set to 0, schema inference is disabled and all fields will be
inferred as Utf8 (string) type. | None |
+| COMMENT | Sets the character which should be used to indicate
comment lines in the CSV file.
| None |
**Example:**
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]