This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8abb936 ARROW-11430: [Rust] zip kernel: combine arrays based on
boolean mask
8abb936 is described below
commit 8abb93655d41d53baef7ecd436250e2a3f2a19e0
Author: Ritchie Vink <[email protected]>
AuthorDate: Mon Feb 8 06:53:07 2021 -0500
ARROW-11430: [Rust] zip kernel: combine arrays based on boolean mask
This PR proposes a kernel that combines two arrays based on a boolean mask.
It reuses the logic from the filter kernel to efficiently skip chunks that
evaluate false/true.
Closes #9363 from ritchie46/zip_kernel
Authored-by: Ritchie Vink <[email protected]>
Signed-off-by: Andrew Lamb <[email protected]>
---
rust/arrow/src/compute/kernels/filter.rs | 4 +-
rust/arrow/src/compute/kernels/mod.rs | 1 +
rust/arrow/src/compute/kernels/zip.rs | 88 ++++++++++++++++++++++++++++++++
3 files changed, 91 insertions(+), 2 deletions(-)
diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs
index 8bd0d3e..0896502 100644
--- a/rust/arrow/src/compute/kernels/filter.rs
+++ b/rust/arrow/src/compute/kernels/filter.rs
@@ -42,7 +42,7 @@ enum State {
/// slots of a [BooleanArray] are true. Each interval corresponds to a
contiguous region of memory to be
/// "taken" from an array to be filtered.
#[derive(Debug)]
-struct SlicesIterator<'a> {
+pub(crate) struct SlicesIterator<'a> {
iter: Enumerate<BitChunkIterator<'a>>,
state: State,
filter_count: usize,
@@ -57,7 +57,7 @@ struct SlicesIterator<'a> {
}
impl<'a> SlicesIterator<'a> {
- fn new(filter: &'a BooleanArray) -> Self {
+ pub(crate) fn new(filter: &'a BooleanArray) -> Self {
let values = &filter.data_ref().buffers()[0];
// this operation is performed before iteration
diff --git a/rust/arrow/src/compute/kernels/mod.rs b/rust/arrow/src/compute/kernels/mod.rs
index 7c3c8c7..62d3642 100644
--- a/rust/arrow/src/compute/kernels/mod.rs
+++ b/rust/arrow/src/compute/kernels/mod.rs
@@ -32,3 +32,4 @@ pub mod substring;
pub mod take;
pub mod temporal;
pub mod window;
+pub mod zip;
diff --git a/rust/arrow/src/compute/kernels/zip.rs b/rust/arrow/src/compute/kernels/zip.rs
new file mode 100644
index 0000000..ba84568
--- /dev/null
+++ b/rust/arrow/src/compute/kernels/zip.rs
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::array::*;
+use crate::compute::SlicesIterator;
+use crate::error::{ArrowError, Result};
+use std::sync::Arc;
+
+/// Combines two arrays slot by slot according to a boolean mask: where the mask evaluates `true`
values of `truthy`
+/// are taken, and where the mask evaluates `false` values of `falsy` are taken.
+/// Returns `ArrowError::InvalidArgumentError` if `truthy` and `falsy` differ in data type, or if the three inputs differ in length.
+/// # Arguments
+/// * `mask` - Boolean array, same length as both inputs, that decides from which array to take the
values.
+/// * `truthy` - Values of this array are taken for the slots where mask evaluates `true`
+/// * `falsy` - Values of this array are taken for the slots where mask evaluates `false`
+pub fn zip(
+ mask: &BooleanArray,
+ truthy: &dyn Array,
+ falsy: &dyn Array,
+) -> Result<ArrayRef> {
+ if truthy.data_type() != falsy.data_type() {
+ return Err(ArrowError::InvalidArgumentError(
+ "arguments need to have the same data type".into(),
+ ));
+ }
+ if truthy.len() != falsy.len() || falsy.len() != mask.len() {
+ return Err(ArrowError::InvalidArgumentError(
+ "all arrays should have the same length".into(),
+ ));
+ }
+ let falsy = falsy.data();
+ let truthy = truthy.data();
+
+ let mut mutable = MutableArrayData::new(vec![&*truthy, &*falsy], false,
truthy.len());
+
+ // the SlicesIterator yields `(start, end)` ranges for the true values only, so the gaps left by
this iterator we need to
+ // fill with falsy values. NOTE(review): how null mask slots are classified depends on SlicesIterator - confirm
+
+ // number of leading output slots written so far; doubles as the start of the next gap
+ let mut filled = 0;
+
+ SlicesIterator::new(mask).for_each(|(start, end)| {
+ // slots `filled..start` were not yielded by the iterator, so they are copied from falsy (source index 1)
+ if start > filled {
+ mutable.extend(1, filled, start);
+ }
+ // slots `start..end` were yielded by the iterator, so they are copied from truthy (source index 0)
+ mutable.extend(0, start, end);
+ filled = end;
+ });
+ // everything after the last yielded range (or the whole array if nothing was yielded) is copied from falsy
+ if filled < truthy.len() {
+ mutable.extend(1, filled, truthy.len());
+ }
+
+ let data = mutable.freeze();
+ Ok(make_array(Arc::new(data)))
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_zip_kernel() { // mixes true and false mask slots over two inputs that both contain nulls
+ let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]);
+ let b = Int32Array::from(vec![None, Some(3), Some(6), Some(7),
Some(3)]);
+ let mask = BooleanArray::from(vec![true, true, false, false, true]); // slots 0, 1, 4 come from `a`; slots 2, 3 come from `b`
+ let out = zip(&mask, &a, &b).unwrap();
+ let actual = out.as_any().downcast_ref::<Int32Array>().unwrap();
+ let expected = Int32Array::from(vec![Some(5), None, Some(6), Some(7),
Some(1)]);
+ assert_eq!(actual, &expected); // slot 1 stays null: the null in `a` is carried over as-is
+ }
+}