This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c6387c1ffc fix(csv)!: infer null for empty column. (#4910)
c6387c1ffc is described below
commit c6387c1ffc27cbf9180253648c4ba461d92d586d
Author: Kamil Skalski <[email protected]>
AuthorDate: Tue Oct 10 11:13:34 2023 +0200
fix(csv)!: infer null for empty column. (#4910)
* Infer null for empty column.
* Add test file.
---
arrow-csv/src/reader/mod.rs | 62 +++++++++++++++++++++++++++++++++-
arrow-csv/test/data/init_null_test.csv | 6 ++++
2 files changed, 67 insertions(+), 1 deletion(-)
diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 17db7a34e0..2ba49cadc7 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -193,6 +193,7 @@ impl InferredDataType {
/// Returns the inferred data type
fn get(&self) -> DataType {
match self.packed {
+ 0 => DataType::Null,
1 => DataType::Boolean,
2 => DataType::Int64,
4 | 6 => DataType::Float64, // Promote Int64 to Float64
@@ -785,6 +786,9 @@ fn parse(
null_regex,
)
}
+ DataType::Null => {
+ Ok(Arc::new(NullArray::builder(rows.len()).finish()) as ArrayRef)
+ }
DataType::Utf8 => Ok(Arc::new(
rows.iter()
.map(|row| Some(row.get(i)))
@@ -1511,6 +1515,62 @@ mod tests {
assert!(!batch.column(1).is_null(4));
}
+ #[test]
+ fn test_init_nulls() {
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("c_int", DataType::UInt64, true),
+ Field::new("c_float", DataType::Float32, true),
+ Field::new("c_string", DataType::Utf8, true),
+ Field::new("c_bool", DataType::Boolean, true),
+ Field::new("c_null", DataType::Null, true),
+ ]));
+ let file = File::open("test/data/init_null_test.csv").unwrap();
+
+ let mut csv = ReaderBuilder::new(schema)
+ .has_header(true)
+ .build(file)
+ .unwrap();
+
+ let batch = csv.next().unwrap().unwrap();
+
+ assert!(batch.column(1).is_null(0));
+ assert!(!batch.column(1).is_null(1));
+ assert!(batch.column(1).is_null(2));
+ assert!(!batch.column(1).is_null(3));
+ assert!(!batch.column(1).is_null(4));
+ }
+
+ #[test]
+ fn test_init_nulls_with_inference() {
+ let format = Format::default().with_header(true).with_delimiter(b',');
+
+ let mut file = File::open("test/data/init_null_test.csv").unwrap();
+ let (schema, _) = format.infer_schema(&mut file, None).unwrap();
+ file.rewind().unwrap();
+
+ let expected_schema = Schema::new(vec![
+ Field::new("c_int", DataType::Int64, true),
+ Field::new("c_float", DataType::Float64, true),
+ Field::new("c_string", DataType::Utf8, true),
+ Field::new("c_bool", DataType::Boolean, true),
+ Field::new("c_null", DataType::Null, true),
+ ]);
+ assert_eq!(schema, expected_schema);
+
+ let mut csv = ReaderBuilder::new(Arc::new(schema))
+ .with_format(format)
+ .build(file)
+ .unwrap();
+
+ let batch = csv.next().unwrap().unwrap();
+
+ assert!(batch.column(1).is_null(0));
+ assert!(!batch.column(1).is_null(1));
+ assert!(batch.column(1).is_null(2));
+ assert!(!batch.column(1).is_null(3));
+ assert!(!batch.column(1).is_null(4));
+ }
+
#[test]
fn test_custom_nulls() {
let schema = Arc::new(Schema::new(vec![
@@ -2283,7 +2343,7 @@ mod tests {
#[test]
fn test_inference() {
let cases: &[(&[&str], DataType)] = &[
- (&[], DataType::Utf8),
+ (&[], DataType::Null),
(&["false", "12"], DataType::Utf8),
(&["12", "cupcakes"], DataType::Utf8),
(&["12", "12.4"], DataType::Float64),
diff --git a/arrow-csv/test/data/init_null_test.csv b/arrow-csv/test/data/init_null_test.csv
new file mode 100644
index 0000000000..f7d8a29964
--- /dev/null
+++ b/arrow-csv/test/data/init_null_test.csv
@@ -0,0 +1,6 @@
+c_int,c_float,c_string,c_bool,c_null
+,,,,
+2,2.2,"a",TRUE,
+3,,"b",true,
+4,4.4,,False,
+5,6.6,"",FALSE,
\ No newline at end of file