This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e81f93b05 Arbitrary size concat elements utf8 (#1787)
e81f93b05 is described below

commit e81f93b052531b9d9fe5313e551fefa2ec939538
Author: Ismail-Maj <[email protected]>
AuthorDate: Wed Jun 29 21:14:37 2022 +0200

    Arbitrary size concat elements utf8 (#1787)
    
    * arbitrary size combine_option_bitmap and tests
    
    * more tests and error
    
    * format
    
    * more tests
    
    * clone and reduce
    
    * arbitrary size concat_elements_utf8
    
    * nit
    
    * tests
    
    * Update arrow/src/compute/kernels/concat_elements.rs
    
    * support one element input
    
    * split implementations
    
    * fmt
---
 arrow/src/compute/kernels/concat_elements.rs | 121 ++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 1 deletion(-)

diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow/src/compute/kernels/concat_elements.rs
index bc341df88..7d460b21c 100644
--- a/arrow/src/compute/kernels/concat_elements.rs
+++ b/arrow/src/compute/kernels/concat_elements.rs
@@ -85,6 +85,86 @@ pub fn concat_elements_utf8<Offset: OffsetSizeTrait>(
     Ok(unsafe { builder.build_unchecked() }.into())
 }
 
+/// Returns the elementwise concatenation of every [`GenericStringArray`] in `arrays`; as in the example below, a null in any input yields a null output element.
+/// ```text
+/// e.g:
+///   ["a", "b"] + [None, "c"] + [None, "d"] = [None, "bcd"]
+/// ```
+///
+/// Returns an error if `arrays` is empty or if the arrays do not all have the same length as the first one.
+pub fn concat_elements_utf8_many<Offset: OffsetSizeTrait>(
+    arrays: &[&GenericStringArray<Offset>],
+) -> Result<GenericStringArray<Offset>> {
+    if arrays.is_empty() {
+        return Err(ArrowError::ComputeError(
+            "concat requires input of at least one array".to_string(),
+        ));
+    }
+
+    let size = arrays[0].len();
+    if !arrays.iter().all(|array| array.len() == size) {
+        return Err(ArrowError::ComputeError(format!(
+            "Arrays must have the same length of {}",
+            size,
+        )));
+    }
+
+    let output_bitmap = combine_option_bitmap(
+        arrays
+            .iter()
+            .map(|a| a.data())
+            .collect::<Vec<_>>()
+            .as_slice(),
+        size,
+    )?;
+
+    let data_buffers = arrays
+        .iter()
+        .map(|array| array.value_data())
+        .collect::<Vec<_>>();
+
+    let data_values = data_buffers
+        .iter()
+        .map(|buffer| buffer.as_slice())
+        .collect::<Vec<_>>();
+
+    let mut offsets = arrays
+        .iter()
+        .map(|a| a.value_offsets().iter().peekable())
+        .collect::<Vec<_>>();
+
+    let mut output_values = BufferBuilder::<u8>::new(
+        data_values
+            .iter()
+            .zip(offsets.iter_mut())
+            .map(|(data, offset)| data.len() - 
offset.peek().unwrap().to_usize().unwrap())
+            .sum(),
+    );
+
+    let mut output_offsets = BufferBuilder::<Offset>::new(size + 1);
+    output_offsets.append(Offset::zero());
+    for _ in 0..size {
+        data_values
+            .iter()
+            .zip(offsets.iter_mut())
+            .for_each(|(values, offset)| {
+                let index_start = offset.next().unwrap().to_usize().unwrap();
+                let index_end = offset.peek().unwrap().to_usize().unwrap();
+                output_values.append_slice(&values[index_start..index_end]);
+            });
+        
output_offsets.append(Offset::from_usize(output_values.len()).unwrap());
+    }
+
+    let builder = 
ArrayDataBuilder::new(GenericStringArray::<Offset>::get_data_type())
+        .len(size)
+        .add_buffer(output_offsets.finish())
+        .add_buffer(output_values.finish())
+        .null_bit_buffer(output_bitmap);
+
+    // SAFETY: the first offset is zero and every later offset is the running length of `output_values`, so offsets are valid by construction
+    Ok(unsafe { builder.build_unchecked() }.into())
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -143,7 +223,10 @@ mod tests {
 
         let output = concat_elements_utf8(&left, &right);
 
-        assert!(output.is_err());
+        assert_eq!(
+            output.unwrap_err().to_string(),
+            "Compute error: Arrays must have the same length: 2 != 
1".to_string()
+        );
     }
 
     #[test]
@@ -190,4 +273,40 @@ mod tests {
 
         assert_eq!(output, expected);
     }
+
+    #[test]
+    fn test_string_concat_error_empty() {
+        assert_eq!(
+            concat_elements_utf8_many::<i32>(&[])
+                .unwrap_err()
+                .to_string(),
+            "Compute error: concat requires input of at least one 
array".to_string()
+        );
+    }
+
+    #[test]
+    fn test_string_concat_one() {
+        let expected = [None, Some("baryyy"), None]
+            .into_iter()
+            .collect::<StringArray>();
+
+        let output = concat_elements_utf8_many(&[&expected]).unwrap();
+
+        assert_eq!(output, expected);
+    }
+
+    #[test]
+    fn test_string_concat_many() {
+        let foo = StringArray::from(vec![Some("f"), Some("o"), Some("o"), 
None]);
+        let bar = StringArray::from(vec![None, Some("b"), Some("a"), 
Some("r")]);
+        let baz = StringArray::from(vec![Some("b"), None, Some("a"), 
Some("z")]);
+
+        let output = concat_elements_utf8_many(&[&foo, &bar, &baz]).unwrap();
+
+        let expected = [None, None, Some("oaa"), None]
+            .into_iter()
+            .collect::<StringArray>();
+
+        assert_eq!(output, expected);
+    }
 }

Reply via email to