This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 95b015cf7b Evaluate null_regex for string type in csv (now such values will be parsed as `Null` rather than `""`) (#4942)
95b015cf7b is described below

commit 95b015cf7b5d57c7fe66a8feada4f48a987cb020
Author: Huaijin <[email protected]>
AuthorDate: Tue Oct 17 01:52:27 2023 +0800

    Evaluate null_regex for string type in csv (now such values will be parsed as `Null` rather than `""`) (#4942)
    
    * fix: add null_regex for string type in csv
    
    * Update arrow-csv/src/reader/mod.rs
    
    Co-authored-by: Raphael Taylor-Davies <[email protected]>
    
    ---------
    
    Co-authored-by: Raphael Taylor-Davies <[email protected]>
---
 arrow-csv/src/reader/mod.rs | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 2ba49cadc7..1106b16bc4 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -791,7 +791,10 @@ fn parse(
                 }
                 DataType::Utf8 => Ok(Arc::new(
                     rows.iter()
-                        .map(|row| Some(row.get(i)))
+                        .map(|row| {
+                            let s = row.get(i);
+                            (!null_regex.is_null(s)).then_some(s)
+                        })
                         .collect::<StringArray>(),
                 ) as ArrayRef),
                 DataType::Dictionary(key_type, value_type)
@@ -1495,7 +1498,7 @@ mod tests {
         let schema = Arc::new(Schema::new(vec![
             Field::new("c_int", DataType::UInt64, false),
             Field::new("c_float", DataType::Float32, true),
-            Field::new("c_string", DataType::Utf8, false),
+            Field::new("c_string", DataType::Utf8, true),
             Field::new("c_bool", DataType::Boolean, false),
         ]));
 
@@ -1596,8 +1599,7 @@ mod tests {
         assert!(batch.column(0).is_null(1));
         assert!(batch.column(1).is_null(2));
         assert!(batch.column(3).is_null(4));
-        // String won't be empty
-        assert!(!batch.column(2).is_null(3));
+        assert!(batch.column(2).is_null(3));
         assert!(!batch.column(2).is_null(4));
     }
 
@@ -2237,8 +2239,8 @@ mod tests {
 
     fn err_test(csv: &[u8], expected: &str) {
         let schema = Arc::new(Schema::new(vec![
-            Field::new("text1", DataType::Utf8, false),
-            Field::new("text2", DataType::Utf8, false),
+            Field::new("text1", DataType::Utf8, true),
+            Field::new("text2", DataType::Utf8, true),
         ]));
         let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
         let b = ReaderBuilder::new(schema)

Reply via email to