This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new c6387c1ffc fix(csv)!: infer null for empty column. (#4910)
c6387c1ffc is described below

commit c6387c1ffc27cbf9180253648c4ba461d92d586d
Author: Kamil Skalski <[email protected]>
AuthorDate: Tue Oct 10 11:13:34 2023 +0200

    fix(csv)!: infer null for empty column. (#4910)
    
    * Infer null for empty column.
    
    * Add test file.
---
 arrow-csv/src/reader/mod.rs            | 62 +++++++++++++++++++++++++++++++++-
 arrow-csv/test/data/init_null_test.csv |  6 ++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 17db7a34e0..2ba49cadc7 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -193,6 +193,7 @@ impl InferredDataType {
     /// Returns the inferred data type
     fn get(&self) -> DataType {
         match self.packed {
+            0 => DataType::Null,
             1 => DataType::Boolean,
             2 => DataType::Int64,
             4 | 6 => DataType::Float64, // Promote Int64 to Float64
@@ -785,6 +786,9 @@ fn parse(
                         null_regex,
                     )
                 }
+                DataType::Null => {
+                    Ok(Arc::new(NullArray::builder(rows.len()).finish()) as ArrayRef)
+                }
                 DataType::Utf8 => Ok(Arc::new(
                     rows.iter()
                         .map(|row| Some(row.get(i)))
@@ -1511,6 +1515,62 @@ mod tests {
         assert!(!batch.column(1).is_null(4));
     }
 
+    #[test]
+    fn test_init_nulls() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("c_int", DataType::UInt64, true),
+            Field::new("c_float", DataType::Float32, true),
+            Field::new("c_string", DataType::Utf8, true),
+            Field::new("c_bool", DataType::Boolean, true),
+            Field::new("c_null", DataType::Null, true),
+        ]));
+        let file = File::open("test/data/init_null_test.csv").unwrap();
+
+        let mut csv = ReaderBuilder::new(schema)
+            .has_header(true)
+            .build(file)
+            .unwrap();
+
+        let batch = csv.next().unwrap().unwrap();
+
+        assert!(batch.column(1).is_null(0));
+        assert!(!batch.column(1).is_null(1));
+        assert!(batch.column(1).is_null(2));
+        assert!(!batch.column(1).is_null(3));
+        assert!(!batch.column(1).is_null(4));
+    }
+
+    #[test]
+    fn test_init_nulls_with_inference() {
+        let format = Format::default().with_header(true).with_delimiter(b',');
+
+        let mut file = File::open("test/data/init_null_test.csv").unwrap();
+        let (schema, _) = format.infer_schema(&mut file, None).unwrap();
+        file.rewind().unwrap();
+
+        let expected_schema = Schema::new(vec![
+            Field::new("c_int", DataType::Int64, true),
+            Field::new("c_float", DataType::Float64, true),
+            Field::new("c_string", DataType::Utf8, true),
+            Field::new("c_bool", DataType::Boolean, true),
+            Field::new("c_null", DataType::Null, true),
+        ]);
+        assert_eq!(schema, expected_schema);
+
+        let mut csv = ReaderBuilder::new(Arc::new(schema))
+            .with_format(format)
+            .build(file)
+            .unwrap();
+
+        let batch = csv.next().unwrap().unwrap();
+
+        assert!(batch.column(1).is_null(0));
+        assert!(!batch.column(1).is_null(1));
+        assert!(batch.column(1).is_null(2));
+        assert!(!batch.column(1).is_null(3));
+        assert!(!batch.column(1).is_null(4));
+    }
+
     #[test]
     fn test_custom_nulls() {
         let schema = Arc::new(Schema::new(vec![
@@ -2283,7 +2343,7 @@ mod tests {
     #[test]
     fn test_inference() {
         let cases: &[(&[&str], DataType)] = &[
-            (&[], DataType::Utf8),
+            (&[], DataType::Null),
             (&["false", "12"], DataType::Utf8),
             (&["12", "cupcakes"], DataType::Utf8),
             (&["12", "12.4"], DataType::Float64),
diff --git a/arrow-csv/test/data/init_null_test.csv b/arrow-csv/test/data/init_null_test.csv
new file mode 100644
index 0000000000..f7d8a29964
--- /dev/null
+++ b/arrow-csv/test/data/init_null_test.csv
@@ -0,0 +1,6 @@
+c_int,c_float,c_string,c_bool,c_null
+,,,,
+2,2.2,"a",TRUE,
+3,,"b",true,
+4,4.4,,False,
+5,6.6,"",FALSE,
\ No newline at end of file

Reply via email to