This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 8abb936  ARROW-11430: [Rust] zip kernel: combine arrays based on boolean mask
8abb936 is described below

commit 8abb93655d41d53baef7ecd436250e2a3f2a19e0
Author: Ritchie Vink <[email protected]>
AuthorDate: Mon Feb 8 06:53:07 2021 -0500

    ARROW-11430: [Rust] zip kernel: combine arrays based on boolean mask
    
    This PR proposes a kernel that combines two arrays based on a boolean mask.
    
    It reuses the logic from the filter kernel to efficiently skip chunks that evaluate false/true.
    
    Closes #9363 from ritchie46/zip_kernel
    
    Authored-by: Ritchie Vink <[email protected]>
    Signed-off-by: Andrew Lamb <[email protected]>
---
 rust/arrow/src/compute/kernels/filter.rs |  4 +-
 rust/arrow/src/compute/kernels/mod.rs    |  1 +
 rust/arrow/src/compute/kernels/zip.rs    | 88 ++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs
index 8bd0d3e..0896502 100644
--- a/rust/arrow/src/compute/kernels/filter.rs
+++ b/rust/arrow/src/compute/kernels/filter.rs
@@ -42,7 +42,7 @@ enum State {
 /// slots of a [BooleanArray] are true. Each interval corresponds to a contiguous region of memory to be
 /// "taken" from an array to be filtered.
 #[derive(Debug)]
-struct SlicesIterator<'a> {
+pub(crate) struct SlicesIterator<'a> {
     iter: Enumerate<BitChunkIterator<'a>>,
     state: State,
     filter_count: usize,
@@ -57,7 +57,7 @@ struct SlicesIterator<'a> {
 }
 
 impl<'a> SlicesIterator<'a> {
-    fn new(filter: &'a BooleanArray) -> Self {
+    pub(crate) fn new(filter: &'a BooleanArray) -> Self {
         let values = &filter.data_ref().buffers()[0];
 
         // this operation is performed before iteration
diff --git a/rust/arrow/src/compute/kernels/mod.rs b/rust/arrow/src/compute/kernels/mod.rs
index 7c3c8c7..62d3642 100644
--- a/rust/arrow/src/compute/kernels/mod.rs
+++ b/rust/arrow/src/compute/kernels/mod.rs
@@ -32,3 +32,4 @@ pub mod substring;
 pub mod take;
 pub mod temporal;
 pub mod window;
+pub mod zip;
diff --git a/rust/arrow/src/compute/kernels/zip.rs b/rust/arrow/src/compute/kernels/zip.rs
new file mode 100644
index 0000000..ba84568
--- /dev/null
+++ b/rust/arrow/src/compute/kernels/zip.rs
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::array::*;
+use crate::compute::SlicesIterator;
+use crate::error::{ArrowError, Result};
+use std::sync::Arc;
+
+/// Zip two arrays by some boolean mask. Where the mask evaluates `true` values of `truthy`
+/// are taken, where the mask evaluates `false` values of `falsy` are taken.
+///
+/// # Arguments
+/// * `mask` - Boolean values used to determine from which array to take the values.
+/// * `truthy` - Values of this array are taken if mask evaluates `true`
+/// * `falsy` - Values of this array are taken if mask evaluates `false`
+pub fn zip(
+    mask: &BooleanArray,
+    truthy: &dyn Array,
+    falsy: &dyn Array,
+) -> Result<ArrayRef> {
+    if truthy.data_type() != falsy.data_type() {
+        return Err(ArrowError::InvalidArgumentError(
+            "arguments need to have the same data type".into(),
+        ));
+    }
+    if truthy.len() != falsy.len() || falsy.len() != mask.len() {
+        return Err(ArrowError::InvalidArgumentError(
+            "all arrays should have the same length".into(),
+        ));
+    }
+    let falsy = falsy.data();
+    let truthy = truthy.data();
+
+    let mut mutable = MutableArrayData::new(vec![&*truthy, &*falsy], false, truthy.len());
+
+    // the SlicesIterator slices only the true values. So the gaps left by this iterator we need to
+    // fill with falsy values
+
+    // keep track of how much is filled
+    let mut filled = 0;
+
+    SlicesIterator::new(mask).for_each(|(start, end)| {
+        // the gap needs to be filled with falsy values
+        if start > filled {
+            mutable.extend(1, filled, start);
+        }
+        // fill with truthy values
+        mutable.extend(0, start, end);
+        filled = end;
+    });
+    // the remaining part is falsy
+    if filled < truthy.len() {
+        mutable.extend(1, filled, truthy.len());
+    }
+
+    let data = mutable.freeze();
+    Ok(make_array(Arc::new(data)))
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_zip_kernel() {
+        let a = Int32Array::from(vec![Some(5), None, Some(7), None, Some(1)]);
+        let b = Int32Array::from(vec![None, Some(3), Some(6), Some(7), Some(3)]);
+        let mask = BooleanArray::from(vec![true, true, false, false, true]);
+        let out = zip(&mask, &a, &b).unwrap();
+        let actual = out.as_any().downcast_ref::<Int32Array>().unwrap();
+        let expected = Int32Array::from(vec![Some(5), None, Some(6), Some(7), Some(1)]);
+        assert_eq!(actual, &expected);
+    }
+}

Reply via email to