This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 0bd880931e fix: csv schema_infer_max_records set to 0 return null
datatype (#19432)
0bd880931e is described below
commit 0bd880931e9e31179c7a39c0c3dcef9150ec07ee
Author: Huaijin <[email protected]>
AuthorDate: Wed Dec 24 09:01:25 2025 +0800
fix: csv schema_infer_max_records set to 0 return null datatype (#19432)
## Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax. For example
`Closes #123` indicates that this PR will close issue #123.
-->
- close https://github.com/apache/datafusion/issues/19417
## Rationale for this change
- see https://github.com/apache/datafusion/issues/19417
- related to https://github.com/apache/datafusion/pull/17796
## What changes are included in this PR?
When `schema_infer_max_records` is set to 0 for CSV, the datatype of every field is returned as string (Utf8) instead of Null.
## Are these changes tested?
Yes — a test case was added covering `schema_infer_max_records` equal to 0.
## Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
-->
<!--
If there are any breaking changes to public APIs, please add the `api
change` label.
-->
---
datafusion/core/src/datasource/file_format/csv.rs | 28 +++++++++++++
datafusion/datasource-csv/src/file_format.rs | 50 +++++++++++++++++++----
docs/source/user-guide/sql/format_options.md | 38 ++++++++---------
3 files changed, 88 insertions(+), 28 deletions(-)
diff --git a/datafusion/core/src/datasource/file_format/csv.rs
b/datafusion/core/src/datasource/file_format/csv.rs
index ddbf810796..719bc4361a 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -1536,4 +1536,32 @@ mod tests {
Ok(())
}
+
+ #[tokio::test]
+ async fn test_infer_schema_with_zero_max_records() -> Result<()> {
+ let session_ctx = SessionContext::new();
+ let state = session_ctx.state();
+
+ let root = format!("{}/csv", arrow_test_data());
+ let format = CsvFormat::default()
+ .with_has_header(true)
+ .with_schema_infer_max_rec(0); // Set to 0 to disable inference
+ let exec = scan_format(
+ &state,
+ &format,
+ None,
+ &root,
+ "aggregate_test_100.csv",
+ None,
+ None,
+ )
+ .await?;
+
+ // related to https://github.com/apache/datafusion/issues/19417
+ for f in exec.schema().fields() {
+ assert_eq!(*f.data_type(), DataType::Utf8);
+ }
+
+ Ok(())
+ }
}
diff --git a/datafusion/datasource-csv/src/file_format.rs
b/datafusion/datasource-csv/src/file_format.rs
index 1bb8679102..efb7829179 100644
--- a/datafusion/datasource-csv/src/file_format.rs
+++ b/datafusion/datasource-csv/src/file_format.rs
@@ -211,6 +211,11 @@ impl CsvFormat {
/// Set a limit in terms of records to scan to infer the schema
/// - default to `DEFAULT_SCHEMA_INFER_MAX_RECORD`
+ ///
+ /// # Behavior when set to 0
+ ///
+ /// When `max_rec` is set to 0, schema inference is disabled and all fields
+ /// will be inferred as `Utf8` (string) type, regardless of their actual
content.
pub fn with_schema_infer_max_rec(mut self, max_rec: usize) -> Self {
self.options.schema_infer_max_rec = Some(max_rec);
self
@@ -529,6 +534,7 @@ impl CsvFormat {
let mut column_names = vec![];
let mut column_type_possibilities = vec![];
let mut record_number = -1;
+ let initial_records_to_read = records_to_read;
pin_mut!(stream);
@@ -619,12 +625,31 @@ impl CsvFormat {
}
}
- let schema = build_schema_helper(column_names,
column_type_possibilities);
+ let schema = build_schema_helper(
+ column_names,
+ column_type_possibilities,
+ initial_records_to_read == 0,
+ );
Ok((schema, total_records_read))
}
}
-fn build_schema_helper(names: Vec<String>, types: Vec<HashSet<DataType>>) ->
Schema {
+/// Builds a schema from column names and their possible data types.
+///
+/// # Arguments
+///
+/// * `names` - Vector of column names
+/// * `types` - Vector of possible data types for each column (as HashSets)
+/// * `disable_inference` - When true, forces all columns with no inferred
types to be Utf8.
+/// This should be set to true when `schema_infer_max_rec` is explicitly
+/// set to 0, indicating the user wants to skip type inference and treat
+/// all fields as strings. When false, columns with no inferred types
+/// will be set to Null, allowing schema merging to work properly.
+fn build_schema_helper(
+ names: Vec<String>,
+ types: Vec<HashSet<DataType>>,
+ disable_inference: bool,
+) -> Schema {
let fields = names
.into_iter()
.zip(types)
@@ -637,10 +662,17 @@ fn build_schema_helper(names: Vec<String>, types:
Vec<HashSet<DataType>>) -> Sch
data_type_possibilities.remove(&DataType::Null);
match data_type_possibilities.len() {
- // Return Null for columns with only nulls / empty files
- // This allows schema merging to work when reading folders
- // such files along with normal files.
- 0 => Field::new(field_name, DataType::Null, true),
+ // When no types were inferred (empty HashSet):
+ // - If schema_infer_max_rec was explicitly set to 0, return
Utf8
+ // - Otherwise return Null (whether from reading null values
or empty files)
+ // This allows schema merging to work when reading folders
with empty files
+ 0 => {
+ if disable_inference {
+ Field::new(field_name, DataType::Utf8, true)
+ } else {
+ Field::new(field_name, DataType::Null, true)
+ }
+ }
1 => Field::new(
field_name,
data_type_possibilities.iter().next().unwrap().clone(),
@@ -832,7 +864,7 @@ mod tests {
HashSet::from([DataType::Utf8]), // col5
];
- let schema = build_schema_helper(column_names,
column_type_possibilities);
+ let schema = build_schema_helper(column_names,
column_type_possibilities, false);
// Verify schema has 5 columns
assert_eq!(schema.fields().len(), 5);
@@ -862,7 +894,7 @@ mod tests {
HashSet::from([DataType::Utf8]), // Should
remain Utf8
];
- let schema = build_schema_helper(column_names,
column_type_possibilities);
+ let schema = build_schema_helper(column_names,
column_type_possibilities, false);
// col1 should be Float64 due to Int64 + Float64 = Float64
assert_eq!(*schema.field(0).data_type(), DataType::Float64);
@@ -880,7 +912,7 @@ mod tests {
HashSet::from([DataType::Boolean, DataType::Int64,
DataType::Utf8]), // Should resolve to Utf8 due to conflicts
];
- let schema = build_schema_helper(column_names,
column_type_possibilities);
+ let schema = build_schema_helper(column_names,
column_type_possibilities, false);
// Should default to Utf8 for conflicting types
assert_eq!(*schema.field(0).data_type(), DataType::Utf8);
diff --git a/docs/source/user-guide/sql/format_options.md
b/docs/source/user-guide/sql/format_options.md
index e8008eafb1..d349bc1c98 100644
--- a/docs/source/user-guide/sql/format_options.md
+++ b/docs/source/user-guide/sql/format_options.md
@@ -99,25 +99,25 @@ OPTIONS('COMPRESSION' 'gzip');
The following options are available when reading or writing CSV files. Note:
If any unsupported option is specified, an error will be raised and the query
will fail.
-| Option | Description
|
Default Value |
-| -------------------- |
---------------------------------------------------------------------------------------------------------------------------------
| ------------------ |
-| COMPRESSION | Sets the compression that should be applied to the
entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED.
| UNCOMPRESSED |
-| HAS_HEADER | Sets if the CSV file should include column headers.
If not set, uses session or system default. |
None |
-| DELIMITER | Sets the character which should be used as the column
delimiter within the CSV file. |
`,` (comma) |
-| QUOTE | Sets the character which should be used for quoting
values within the CSV file. |
`"` (double quote) |
-| TERMINATOR | Sets the character which should be used as the line
terminator within the CSV file. |
None |
-| ESCAPE | Sets the character which should be used for escaping
special characters within the CSV file. |
None |
-| DOUBLE_QUOTE | Sets if quotes within quoted fields should be escaped
by doubling them (e.g., `"aaa""bbb"`). |
None |
-| NEWLINES_IN_VALUES | Sets if newlines in quoted values are supported. If
not set, uses session or system default. |
None |
-| DATE_FORMAT | Sets the format that dates should be encoded in
within the CSV file.
| None |
-| DATETIME_FORMAT | Sets the format that datetimes should be encoded in
within the CSV file. |
None |
-| TIMESTAMP_FORMAT | Sets the format that timestamps should be encoded in
within the CSV file. |
None |
-| TIMESTAMP_TZ_FORMAT | Sets the format that timestamps with timezone should
be encoded in within the CSV file. |
None |
-| TIME_FORMAT | Sets the format that times should be encoded in
within the CSV file.
| None |
-| NULL_VALUE | Sets the string which should be used to indicate null
values within the CSV file. |
None |
-| NULL_REGEX | Sets the regex pattern to match null values when
loading CSVs.
| None |
-| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer
the schema. |
None |
-| COMMENT | Sets the character which should be used to indicate
comment lines in the CSV file. |
None |
+| Option | Description
| Default Value |
+| -------------------- |
----------------------------------------------------------------------------------------------------------------------------------------------------------------
| ------------------ |
+| COMPRESSION | Sets the compression that should be applied to the
entire CSV file. Supported values are GZIP, BZIP2, XZ, ZSTD, and UNCOMPRESSED.
| UNCOMPRESSED |
+| HAS_HEADER | Sets if the CSV file should include column headers.
If not set, uses session or system default.
| None |
+| DELIMITER | Sets the character which should be used as the column
delimiter within the CSV file.
| `,` (comma) |
+| QUOTE | Sets the character which should be used for quoting
values within the CSV file.
| `"` (double quote) |
+| TERMINATOR | Sets the character which should be used as the line
terminator within the CSV file.
| None |
+| ESCAPE | Sets the character which should be used for escaping
special characters within the CSV file.
| None |
+| DOUBLE_QUOTE | Sets if quotes within quoted fields should be escaped
by doubling them (e.g., `"aaa""bbb"`).
| None |
+| NEWLINES_IN_VALUES | Sets if newlines in quoted values are supported. If
not set, uses session or system default.
| None |
+| DATE_FORMAT | Sets the format that dates should be encoded in
within the CSV file.
| None |
+| DATETIME_FORMAT | Sets the format that datetimes should be encoded in
within the CSV file.
| None |
+| TIMESTAMP_FORMAT | Sets the format that timestamps should be encoded in
within the CSV file.
| None |
+| TIMESTAMP_TZ_FORMAT | Sets the format that timestamps with timezone should
be encoded in within the CSV file.
| None |
+| TIME_FORMAT | Sets the format that times should be encoded in
within the CSV file.
| None |
+| NULL_VALUE | Sets the string which should be used to indicate null
values within the CSV file.
| None |
+| NULL_REGEX | Sets the regex pattern to match null values when
loading CSVs.
| None |
+| SCHEMA_INFER_MAX_REC | Sets the maximum number of records to scan to infer
the schema. If set to 0, schema inference is disabled and all fields will be
inferred as Utf8 (string) type. | None |
+| COMMENT | Sets the character which should be used to indicate
comment lines in the CSV file.
| None |
**Example:**
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]