This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b2458bd686 StringView support in arrow-csv (#6062)
b2458bd686 is described below

commit b2458bd686e5bc75397fde4a25f3a8b6c42ab064
Author: Yongting You <[email protected]>
AuthorDate: Wed Jul 17 06:29:34 2024 +0800

    StringView support in arrow-csv (#6062)
    
    * StringView support in arrow-csv
    
    * review and micro-benches
---
 arrow-csv/src/reader/mod.rs | 94 +++++++++++++++++++++++++++++++++++++++++----
 arrow/benches/csv_reader.rs | 42 ++++++++++++++++++++
 2 files changed, 128 insertions(+), 8 deletions(-)

diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 9721349b01..c5057599b8 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -795,6 +795,14 @@ fn parse(
                         })
                         .collect::<StringArray>(),
                 ) as ArrayRef),
+                DataType::Utf8View => Ok(Arc::new(
+                    rows.iter()
+                        .map(|row| {
+                            let s = row.get(i);
+                            (!null_regex.is_null(s)).then_some(s)
+                        })
+                        .collect::<StringViewArray>(),
+                ) as ArrayRef),
                 DataType::Dictionary(key_type, value_type)
                     if value_type.as_ref() == &DataType::Utf8 =>
                 {
@@ -2380,17 +2388,27 @@ mod tests {
     }
 
     fn err_test(csv: &[u8], expected: &str) {
-        let schema = Arc::new(Schema::new(vec![
+        fn err_test_with_schema(csv: &[u8], expected: &str, schema: Arc<Schema>) {
+            let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
+            let b = ReaderBuilder::new(schema)
+                .with_batch_size(2)
+                .build_buffered(buffer)
+                .unwrap();
+            let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
+            assert_eq!(err, expected)
+        }
+
+        let schema_utf8 = Arc::new(Schema::new(vec![
             Field::new("text1", DataType::Utf8, true),
             Field::new("text2", DataType::Utf8, true),
         ]));
-        let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
-        let b = ReaderBuilder::new(schema)
-            .with_batch_size(2)
-            .build_buffered(buffer)
-            .unwrap();
-        let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
-        assert_eq!(err, expected)
+        err_test_with_schema(csv, expected, schema_utf8);
+
+        let schema_utf8view = Arc::new(Schema::new(vec![
+            Field::new("text1", DataType::Utf8View, true),
+            Field::new("text2", DataType::Utf8View, true),
+        ]));
+        err_test_with_schema(csv, expected, schema_utf8view);
     }
 
     #[test]
@@ -2587,4 +2605,64 @@ mod tests {
             &vec![2, 22]
         );
     }
+
+    #[test]
+    fn test_parse_string_view_single_column() {
+        let csv = ["foo", "something_cannot_be_inlined", "foobar"].join("\n");
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "c1",
+            DataType::Utf8View,
+            true,
+        )]));
+
+        let mut decoder = ReaderBuilder::new(schema).build_decoder();
+
+        let decoded = decoder.decode(csv.as_bytes()).unwrap();
+        assert_eq!(decoded, csv.len());
+        decoder.decode(&[]).unwrap();
+
+        let batch = decoder.flush().unwrap().unwrap();
+        assert_eq!(batch.num_columns(), 1);
+        assert_eq!(batch.num_rows(), 3);
+        let col = batch.column(0).as_string_view();
+        assert_eq!(col.data_type(), &DataType::Utf8View);
+        assert_eq!(col.value(0), "foo");
+        assert_eq!(col.value(1), "something_cannot_be_inlined");
+        assert_eq!(col.value(2), "foobar");
+    }
+
+    #[test]
+    fn test_parse_string_view_multi_column() {
+        let csv = ["foo,", ",something_cannot_be_inlined", "foobarfoobar,bar"].join("\n");
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("c1", DataType::Utf8View, true),
+            Field::new("c2", DataType::Utf8View, true),
+        ]));
+
+        let mut decoder = ReaderBuilder::new(schema).build_decoder();
+
+        let decoded = decoder.decode(csv.as_bytes()).unwrap();
+        assert_eq!(decoded, csv.len());
+        decoder.decode(&[]).unwrap();
+
+        let batch = decoder.flush().unwrap().unwrap();
+        assert_eq!(batch.num_columns(), 2);
+        assert_eq!(batch.num_rows(), 3);
+        let c1 = batch.column(0).as_string_view();
+        let c2 = batch.column(1).as_string_view();
+        assert_eq!(c1.data_type(), &DataType::Utf8View);
+        assert_eq!(c2.data_type(), &DataType::Utf8View);
+
+        assert!(!c1.is_null(0));
+        assert!(c1.is_null(1));
+        assert!(!c1.is_null(2));
+        assert_eq!(c1.value(0), "foo");
+        assert_eq!(c1.value(2), "foobarfoobar");
+
+        assert!(c2.is_null(0));
+        assert!(!c2.is_null(1));
+        assert!(!c2.is_null(2));
+        assert_eq!(c2.value(1), "something_cannot_be_inlined");
+        assert_eq!(c2.value(2), "bar");
+    }
 }
diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs
index 38e091548b..74a47ef892 100644
--- a/arrow/benches/csv_reader.rs
+++ b/arrow/benches/csv_reader.rs
@@ -21,6 +21,7 @@ extern crate criterion;
 use std::io::Cursor;
 use std::sync::Arc;
 
+use arrow::util::bench_util::create_string_view_array_with_len;
 use criterion::*;
 use rand::Rng;
 
@@ -59,6 +60,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec<ArrayRef>) {
 fn criterion_benchmark(c: &mut Criterion) {
     let mut rng = seedable_rng();
 
+    // Single Primitive Column tests
     let values = Int32Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024)));
     let cols = vec![Arc::new(values) as ArrayRef];
     do_bench(c, "4096 i32_small(0)", cols);
@@ -101,6 +103,7 @@ fn criterion_benchmark(c: &mut Criterion) {
     let cols = vec![Arc::new(values) as ArrayRef];
     do_bench(c, "4096 f64(0)", cols);
 
+    // Single String Column tests
     let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0., 10)) as ArrayRef];
     do_bench(c, "4096 string(10, 0)", cols);
 
@@ -113,6 +116,20 @@ fn criterion_benchmark(c: &mut Criterion) {
     let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef];
     do_bench(c, "4096 string(100, 0.5)", cols);
 
+    // Single StringView Column tests
+    let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef];
+    do_bench(c, "4096 StringView(10, 0)", cols);
+
+    let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef];
+    do_bench(c, "4096 StringView(30, 0)", cols);
+
+    let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef];
+    do_bench(c, "4096 StringView(100, 0)", cols);
+
+    let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
+    do_bench(c, "4096 StringView(100, 0.5)", cols);
+
+    // Multi-Column(with String) tests
     let cols = vec![
         Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 20)) as ArrayRef,
         Arc::new(create_string_array_with_len::<i32>(4096, 0., 30)) as ArrayRef,
@@ -136,6 +153,31 @@ fn criterion_benchmark(c: &mut Criterion) {
         "4096 string(20, 0.5), string(30, 0), f64(0), i64(0)",
         cols,
     );
+
+    // Multi-Column(with StringView) tests
+    let cols = vec![
+        Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef,
+        Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef,
+        Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef,
+        Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef,
+    ];
+    do_bench(
+        c,
+        "4096 StringView(20, 0.5), StringView(30, 0), StringView(100, 0), i64(0)",
+        cols,
+    );
+
+    let cols = vec![
+        Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef,
+        Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef,
+        Arc::new(create_primitive_array::<Float64Type>(4096, 0.)) as ArrayRef,
+        Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef,
+    ];
+    do_bench(
+        c,
+        "4096 StringView(20, 0.5), StringView(30, 0), f64(0), i64(0)",
+        cols,
+    );
 }
 
 criterion_group!(benches, criterion_benchmark);

Reply via email to