This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 586c060 ARROW-9615: [Rust] Added kernel to compute length of a string.
586c060 is described below
commit 586c060c8b1851f1077911fae6d02a10ed83e7fb
Author: Jorge C. Leitao <[email protected]>
AuthorDate: Thu Aug 13 19:15:36 2020 +0200
ARROW-9615: [Rust] Added kernel to compute length of a string.
Closes #7876 from jorgecarleitao/length
Authored-by: Jorge C. Leitao <[email protected]>
Signed-off-by: Neville Dipale <[email protected]>
---
rust/arrow/Cargo.toml | 4 +
.../kernels/mod.rs => benches/length_kernel.rs} | 45 +++--
rust/arrow/src/compute/kernels/length.rs | 186 +++++++++++++++++++++
rust/arrow/src/compute/kernels/mod.rs | 1 +
4 files changed, 223 insertions(+), 13 deletions(-)
diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index e43e00e..e41557a 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -94,5 +94,9 @@ name = "take_kernels"
harness = false
[[bench]]
+name = "length_kernel"
+harness = false
+
+[[bench]]
name = "csv_writer"
harness = false
diff --git a/rust/arrow/src/compute/kernels/mod.rs
b/rust/arrow/benches/length_kernel.rs
similarity index 50%
copy from rust/arrow/src/compute/kernels/mod.rs
copy to rust/arrow/benches/length_kernel.rs
index 1f437ed..39223d2 100644
--- a/rust/arrow/src/compute/kernels/mod.rs
+++ b/rust/arrow/benches/length_kernel.rs
@@ -15,16 +15,35 @@
// specific language governing permissions and limitations
// under the License.
-//! Computation kernels on Arrow Arrays
-
-pub mod aggregate;
-pub mod arithmetic;
-pub mod boolean;
-pub mod cast;
-pub mod comparison;
-pub mod concat;
-pub mod filter;
-pub mod limit;
-pub mod sort;
-pub mod take;
-pub mod temporal;
+#[macro_use]
+extern crate criterion;
+use criterion::Criterion;
+
+extern crate arrow;
+
+use arrow::array::*;
+use arrow::compute::kernels::length::length;
+
+fn bench_length() {
+ fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
+ [&v[..], &v[..]].concat()
+ }
+
+ // double ["hello", " ", "world", "!"] 10 times
+ let mut values = vec!["one", "on", "o", ""];
+ let mut expected = vec![3, 2, 1, 0];
+ for _ in 0..10 {
+ values = double_vec(values);
+ expected = double_vec(expected);
+ }
+ let array = StringArray::from(values);
+
+ criterion::black_box(length(&array).unwrap());
+}
+
+fn add_benchmark(c: &mut Criterion) {
+ c.bench_function("length", |b| b.iter(|| bench_length()));
+}
+
+criterion_group!(benches, add_benchmark);
+criterion_main!(benches);
diff --git a/rust/arrow/src/compute/kernels/length.rs
b/rust/arrow/src/compute/kernels/length.rs
new file mode 100644
index 0000000..b4ae26f
--- /dev/null
+++ b/rust/arrow/src/compute/kernels/length.rs
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Defines kernel for length of a string array
+
+use crate::array::*;
+use crate::{
+ datatypes::DataType,
+ datatypes::UInt32Type,
+ error::{ArrowError, Result},
+};
+use std::sync::Arc;
+
+/// Returns an array of UInt32 denoting the number of characters in each
string in the array.
+///
+/// * this only accepts StringArray
+/// * length of null is null.
+/// * length is in number of bytes
+pub fn length(array: &Array) -> Result<UInt32Array> {
+ match array.data_type() {
+ DataType::Utf8 => {
+ // note: offsets are stored as u8, but they can be interpreted as
u32
+ let offsets = array.data_ref().clone().buffers()[0].clone();
+ // this is a 30% improvement over iterating over u8s and building
u32, which
+ // justifies the usage of `unsafe`.
+ let slice: &[u32] = unsafe { offsets.typed_data::<u32>() };
+
+ let mut builder = UInt32BufferBuilder::new(array.len());
+ let lengths: Vec<u32> = slice
+ .windows(2)
+ .map(|offset| offset[1] - offset[0])
+ .collect();
+ builder.append_slice(lengths.as_slice())?;
+
+ let null_bit_buffer = array
+ .data_ref()
+ .null_bitmap()
+ .as_ref()
+ .map(|b| b.bits.clone());
+
+ let data = ArrayData::new(
+ DataType::UInt32,
+ array.len(),
+ None,
+ null_bit_buffer,
+ 0,
+ vec![builder.finish()],
+ vec![],
+ );
+ Ok(PrimitiveArray::<UInt32Type>::from(Arc::new(data)))
+ }
+ _ => Err(ArrowError::ComputeError(format!(
+ "length not supported for {:?}",
+ array.data_type()
+ ))),
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ /// Tests a vector whose len is not a multiple of 4
+ #[test]
+ fn len_3() -> Result<()> {
+ let array = StringArray::from(vec!["hello", " ", "world"]);
+ let result = length(&array)?;
+ assert_eq!(3, result.len());
+ assert_eq!(
+ vec![5, 1, 5],
+ vec![result.value(0), result.value(1), result.value(2)]
+ );
+ Ok(())
+ }
+
+ /// Tests a vector whose len is multiple of 4
+ #[test]
+ fn len_4() -> Result<()> {
+ let array = StringArray::from(vec!["hello", " ", "world", "!"]);
+ let result = length(&array)?;
+ assert_eq!(4, result.len());
+ assert_eq!(
+ vec![5, 1, 5, 1],
+ vec![
+ result.value(0),
+ result.value(1),
+ result.value(2),
+ result.value(3)
+ ]
+ );
+ Ok(())
+ }
+
+ /// Tests a vector with a character with more than one code point.
+ #[test]
+ fn special() -> Result<()> {
+ let mut builder: StringBuilder = StringBuilder::new(1);
+ builder.append_value("💖")?;
+ let array = builder.finish();
+
+ let result = length(&array)?;
+
+ assert_eq!(1, result.len());
+
+ assert_eq!(4, result.value(0));
+ Ok(())
+ }
+
+ /// Tests a vector with more than 255 entries, to ensure that offsets are
correctly computed beyond simple cases
+ #[test]
+ fn long_array() -> Result<()> {
+ fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
+ [&v[..], &v[..]].concat()
+ }
+
+ // double ["hello", " ", "world", "!"] 10 times
+ let mut values = vec!["one", "on", "o", ""];
+ let mut expected = vec![3, 2, 1, 0];
+ for _ in 0..10 {
+ values = double_vec(values);
+ expected = double_vec(expected);
+ }
+
+ let a = StringArray::from(values);
+
+ let result = length(&a)?;
+
+ assert_eq!(4096, result.len()); // 2^12
+
+ let mut builder = UInt32Builder::new(expected.len());
+ for e in expected {
+ builder.append_value(e)?
+ }
+ assert_eq!(builder.finish(), result);
+ Ok(())
+ }
+
+ /// Tests handling of null values
+ #[test]
+ fn null() -> Result<()> {
+ let mut builder: StringBuilder = StringBuilder::new(4);
+ builder.append_value("one")?;
+ builder.append_null()?;
+ builder.append_value("three")?;
+ builder.append_value("four")?;
+ let array = builder.finish();
+
+ let a = length(&array)?;
+ assert_eq!(a.len(), array.len());
+
+ let mut expected = UInt32Builder::new(4);
+ expected.append_value(3)?;
+ expected.append_null()?;
+ expected.append_value(5)?;
+ expected.append_value(4)?;
+ let expected = expected.finish();
+
+ assert_eq!(expected.data(), a.data());
+ Ok(())
+ }
+
+ /// Tests that length is not valid for u64.
+ #[test]
+ fn wrong_type() -> Result<()> {
+ let mut builder = UInt64Builder::new(1);
+ builder.append_value(1)?;
+ let array = builder.finish();
+
+ assert!(length(&array).is_err());
+ Ok(())
+ }
+}
diff --git a/rust/arrow/src/compute/kernels/mod.rs
b/rust/arrow/src/compute/kernels/mod.rs
index 1f437ed..0cd3d70 100644
--- a/rust/arrow/src/compute/kernels/mod.rs
+++ b/rust/arrow/src/compute/kernels/mod.rs
@@ -24,6 +24,7 @@ pub mod cast;
pub mod comparison;
pub mod concat;
pub mod filter;
+pub mod length;
pub mod limit;
pub mod sort;
pub mod take;