This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 95b015cf7b Evaluate null_regex for string type in csv (now such values
will be parsed as `Null` rather than `""`) (#4942)
95b015cf7b is described below
commit 95b015cf7b5d57c7fe66a8feada4f48a987cb020
Author: Huaijin <[email protected]>
AuthorDate: Tue Oct 17 01:52:27 2023 +0800
Evaluate null_regex for string type in csv (now such values will be parsed
as `Null` rather than `""`) (#4942)
* fix: add null_regex for string type in csv
* Update arrow-csv/src/reader/mod.rs
Co-authored-by: Raphael Taylor-Davies <[email protected]>
---------
Co-authored-by: Raphael Taylor-Davies <[email protected]>
---
arrow-csv/src/reader/mod.rs | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 2ba49cadc7..1106b16bc4 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -791,7 +791,10 @@ fn parse(
}
DataType::Utf8 => Ok(Arc::new(
rows.iter()
- .map(|row| Some(row.get(i)))
+ .map(|row| {
+ let s = row.get(i);
+ (!null_regex.is_null(s)).then_some(s)
+ })
.collect::<StringArray>(),
) as ArrayRef),
DataType::Dictionary(key_type, value_type)
@@ -1495,7 +1498,7 @@ mod tests {
let schema = Arc::new(Schema::new(vec![
Field::new("c_int", DataType::UInt64, false),
Field::new("c_float", DataType::Float32, true),
- Field::new("c_string", DataType::Utf8, false),
+ Field::new("c_string", DataType::Utf8, true),
Field::new("c_bool", DataType::Boolean, false),
]));
@@ -1596,8 +1599,7 @@ mod tests {
assert!(batch.column(0).is_null(1));
assert!(batch.column(1).is_null(2));
assert!(batch.column(3).is_null(4));
- // String won't be empty
- assert!(!batch.column(2).is_null(3));
+ assert!(batch.column(2).is_null(3));
assert!(!batch.column(2).is_null(4));
}
@@ -2237,8 +2239,8 @@ mod tests {
fn err_test(csv: &[u8], expected: &str) {
let schema = Arc::new(Schema::new(vec![
- Field::new("text1", DataType::Utf8, false),
- Field::new("text2", DataType::Utf8, false),
+ Field::new("text1", DataType::Utf8, true),
+ Field::new("text2", DataType::Utf8, true),
]));
let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
let b = ReaderBuilder::new(schema)