This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new b2458bd686 StringView support in arrow-csv (#6062)
b2458bd686 is described below
commit b2458bd686e5bc75397fde4a25f3a8b6c42ab064
Author: Yongting You <[email protected]>
AuthorDate: Wed Jul 17 06:29:34 2024 +0800
StringView support in arrow-csv (#6062)
* StringView support in arrow-csv
* review and micro-benches
---
arrow-csv/src/reader/mod.rs | 94 +++++++++++++++++++++++++++++++++++++++++----
arrow/benches/csv_reader.rs | 42 ++++++++++++++++++++
2 files changed, 128 insertions(+), 8 deletions(-)
diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs
index 9721349b01..c5057599b8 100644
--- a/arrow-csv/src/reader/mod.rs
+++ b/arrow-csv/src/reader/mod.rs
@@ -795,6 +795,14 @@ fn parse(
})
.collect::<StringArray>(),
) as ArrayRef),
+ DataType::Utf8View => Ok(Arc::new(
+ rows.iter()
+ .map(|row| {
+ let s = row.get(i);
+ (!null_regex.is_null(s)).then_some(s)
+ })
+ .collect::<StringViewArray>(),
+ ) as ArrayRef),
DataType::Dictionary(key_type, value_type)
if value_type.as_ref() == &DataType::Utf8 =>
{
@@ -2380,17 +2388,27 @@ mod tests {
}
fn err_test(csv: &[u8], expected: &str) {
- let schema = Arc::new(Schema::new(vec![
+ fn err_test_with_schema(csv: &[u8], expected: &str, schema: Arc<Schema>) {
+ let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
+ let b = ReaderBuilder::new(schema)
+ .with_batch_size(2)
+ .build_buffered(buffer)
+ .unwrap();
+ let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
+ assert_eq!(err, expected)
+ }
+
+ let schema_utf8 = Arc::new(Schema::new(vec![
Field::new("text1", DataType::Utf8, true),
Field::new("text2", DataType::Utf8, true),
]));
- let buffer = std::io::BufReader::with_capacity(2, Cursor::new(csv));
- let b = ReaderBuilder::new(schema)
- .with_batch_size(2)
- .build_buffered(buffer)
- .unwrap();
- let err = b.collect::<Result<Vec<_>, _>>().unwrap_err().to_string();
- assert_eq!(err, expected)
+ err_test_with_schema(csv, expected, schema_utf8);
+
+ let schema_utf8view = Arc::new(Schema::new(vec![
+ Field::new("text1", DataType::Utf8View, true),
+ Field::new("text2", DataType::Utf8View, true),
+ ]));
+ err_test_with_schema(csv, expected, schema_utf8view);
}
#[test]
@@ -2587,4 +2605,64 @@ mod tests {
&vec![2, 22]
);
}
+
+ #[test]
+ fn test_parse_string_view_single_column() {
+ let csv = ["foo", "something_cannot_be_inlined", "foobar"].join("\n");
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "c1",
+ DataType::Utf8View,
+ true,
+ )]));
+
+ let mut decoder = ReaderBuilder::new(schema).build_decoder();
+
+ let decoded = decoder.decode(csv.as_bytes()).unwrap();
+ assert_eq!(decoded, csv.len());
+ decoder.decode(&[]).unwrap();
+
+ let batch = decoder.flush().unwrap().unwrap();
+ assert_eq!(batch.num_columns(), 1);
+ assert_eq!(batch.num_rows(), 3);
+ let col = batch.column(0).as_string_view();
+ assert_eq!(col.data_type(), &DataType::Utf8View);
+ assert_eq!(col.value(0), "foo");
+ assert_eq!(col.value(1), "something_cannot_be_inlined");
+ assert_eq!(col.value(2), "foobar");
+ }
+
+ #[test]
+ fn test_parse_string_view_multi_column() {
+ let csv = ["foo,", ",something_cannot_be_inlined", "foobarfoobar,bar"].join("\n");
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("c1", DataType::Utf8View, true),
+ Field::new("c2", DataType::Utf8View, true),
+ ]));
+
+ let mut decoder = ReaderBuilder::new(schema).build_decoder();
+
+ let decoded = decoder.decode(csv.as_bytes()).unwrap();
+ assert_eq!(decoded, csv.len());
+ decoder.decode(&[]).unwrap();
+
+ let batch = decoder.flush().unwrap().unwrap();
+ assert_eq!(batch.num_columns(), 2);
+ assert_eq!(batch.num_rows(), 3);
+ let c1 = batch.column(0).as_string_view();
+ let c2 = batch.column(1).as_string_view();
+ assert_eq!(c1.data_type(), &DataType::Utf8View);
+ assert_eq!(c2.data_type(), &DataType::Utf8View);
+
+ assert!(!c1.is_null(0));
+ assert!(c1.is_null(1));
+ assert!(!c1.is_null(2));
+ assert_eq!(c1.value(0), "foo");
+ assert_eq!(c1.value(2), "foobarfoobar");
+
+ assert!(c2.is_null(0));
+ assert!(!c2.is_null(1));
+ assert!(!c2.is_null(2));
+ assert_eq!(c2.value(1), "something_cannot_be_inlined");
+ assert_eq!(c2.value(2), "bar");
+ }
}
diff --git a/arrow/benches/csv_reader.rs b/arrow/benches/csv_reader.rs
index 38e091548b..74a47ef892 100644
--- a/arrow/benches/csv_reader.rs
+++ b/arrow/benches/csv_reader.rs
@@ -21,6 +21,7 @@ extern crate criterion;
use std::io::Cursor;
use std::sync::Arc;
+use arrow::util::bench_util::create_string_view_array_with_len;
use criterion::*;
use rand::Rng;
@@ -59,6 +60,7 @@ fn do_bench(c: &mut Criterion, name: &str, cols: Vec<ArrayRef>) {
fn criterion_benchmark(c: &mut Criterion) {
let mut rng = seedable_rng();
+ // Single Primitive Column tests
let values = Int32Array::from_iter_values((0..4096).map(|_| rng.gen_range(0..1024)));
let cols = vec![Arc::new(values) as ArrayRef];
do_bench(c, "4096 i32_small(0)", cols);
@@ -101,6 +103,7 @@ fn criterion_benchmark(c: &mut Criterion) {
let cols = vec![Arc::new(values) as ArrayRef];
do_bench(c, "4096 f64(0)", cols);
+ // Single String Column tests
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0., 10)) as ArrayRef];
do_bench(c, "4096 string(10, 0)", cols);
@@ -113,6 +116,20 @@ fn criterion_benchmark(c: &mut Criterion) {
let cols = vec![Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 100)) as ArrayRef];
do_bench(c, "4096 string(100, 0.5)", cols);
+ // Single StringView Column tests
+ let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 10, false)) as ArrayRef];
+ do_bench(c, "4096 StringView(10, 0)", cols);
+
+ let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef];
+ do_bench(c, "4096 StringView(30, 0)", cols);
+
+ let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef];
+ do_bench(c, "4096 StringView(100, 0)", cols);
+
+ let cols = vec![Arc::new(create_string_view_array_with_len(4096, 0.5, 100, false)) as ArrayRef];
+ do_bench(c, "4096 StringView(100, 0.5)", cols);
+
+ // Multi-Column(with String) tests
let cols = vec![
Arc::new(create_string_array_with_len::<i32>(4096, 0.5, 20)) as ArrayRef,
Arc::new(create_string_array_with_len::<i32>(4096, 0., 30)) as ArrayRef,
@@ -136,6 +153,31 @@ fn criterion_benchmark(c: &mut Criterion) {
"4096 string(20, 0.5), string(30, 0), f64(0), i64(0)",
cols,
);
+
+ // Multi-Column(with StringView) tests
+ let cols = vec![
+ Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef,
+ Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef,
+ Arc::new(create_string_view_array_with_len(4096, 0., 100, false)) as ArrayRef,
+ Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef,
+ ];
+ do_bench(
+ c,
+ "4096 StringView(20, 0.5), StringView(30, 0), StringView(100, 0), i64(0)",
+ cols,
+ );
+
+ let cols = vec![
+ Arc::new(create_string_view_array_with_len(4096, 0.5, 20, false)) as ArrayRef,
+ Arc::new(create_string_view_array_with_len(4096, 0., 30, false)) as ArrayRef,
+ Arc::new(create_primitive_array::<Float64Type>(4096, 0.)) as ArrayRef,
+ Arc::new(create_primitive_array::<Int64Type>(4096, 0.)) as ArrayRef,
+ ];
+ do_bench(
+ c,
+ "4096 StringView(20, 0.5), StringView(30, 0), f64(0), i64(0)",
+ cols,
+ );
}
criterion_group!(benches, criterion_benchmark);