This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 6850288  ARROW-3954: [Rust] Add Slice to Array and ArrayData
6850288 is described below

commit 685028801a05629be6afcb6411f295b85324a761
Author: Chao Sun <[email protected]>
AuthorDate: Sun Mar 10 17:16:32 2019 +0100

    ARROW-3954: [Rust] Add Slice to Array and ArrayData
    
    Author: Chao Sun <[email protected]>
    
    Closes #3856 from sunchao/ARROW-3954 and squashes the following commits:
    
    eb922ca9 <Chao Sun> Fix warnings
    2cb76f7f <Chao Sun> ARROW-3954:  Add Slice to Array and ArrayData
---
 rust/arrow/src/array.rs         | 61 +++++++++++++++++++++++++++++++++++++++
 rust/arrow/src/array_data.rs    | 20 +++++++------
 rust/arrow/src/bitmap.rs        |  2 +-
 rust/arrow/src/util/bit_util.rs | 63 +++++++++++++++++++++++++----------------
 rust/parquet/src/data_type.rs   |  2 +-
 5 files changed, 113 insertions(+), 35 deletions(-)

diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs
index 0d702ee..d15d88b 100644
--- a/rust/arrow/src/array.rs
+++ b/rust/arrow/src/array.rs
@@ -96,6 +96,11 @@ pub trait Array: Send + Sync {
         self.data_ref().data_type()
     }
 
+    /// Returns a zero-copy slice of this array with the indicated offset and length.
+    fn slice(&self, offset: usize, length: usize) -> ArrayRef {
+        make_array(slice_data(self.data(), offset, length))
+    }
+
     /// Returns the length (i.e., number of elements) of this array
     fn len(&self) -> usize {
         self.data().len()
@@ -148,6 +153,28 @@ fn make_array(data: ArrayDataRef) -> ArrayRef {
     }
 }
 
+fn slice_data(data: ArrayDataRef, mut offset: usize, length: usize) -> ArrayDataRef {
+    assert!((offset + length) <= data.len());
+
+    let mut new_data = data.as_ref().clone();
+    let len = ::std::cmp::min(new_data.len - offset, length);
+
+    offset += data.offset;
+    new_data.len = len;
+    new_data.offset = offset;
+
+    // Calculate the new null count based on the offset
+    new_data.null_count = if let Some(bitmap) = new_data.null_bitmap() {
+        let valid_bits = bitmap.bits.data();
+        len.checked_sub(bit_util::count_set_bits_offset(valid_bits, offset, length))
+            .unwrap()
+    } else {
+        0
+    };
+
+    Arc::new(new_data)
+}
+
 /// ----------------------------------------------------------------------------
 /// Implementations of different array types
 
@@ -1095,6 +1122,40 @@ mod tests {
     }
 
     #[test]
+    fn test_primitive_array_slice() {
+        let arr = Int32Array::from(vec![
+            Some(0),
+            None,
+            Some(2),
+            None,
+            Some(4),
+            Some(5),
+            Some(6),
+            None,
+            None,
+        ]);
+        assert_eq!(9, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(4, arr.null_count());
+
+        let arr2 = arr.slice(2, 5);
+        assert_eq!(5, arr2.len());
+        assert_eq!(2, arr2.offset());
+        assert_eq!(1, arr2.null_count());
+        assert!(arr2.is_null(1));
+
+        let arr3 = arr2.slice(2, 3);
+        assert_eq!(3, arr3.len());
+        assert_eq!(4, arr3.offset());
+        assert_eq!(0, arr3.null_count());
+
+        let int_arr = arr3.as_any().downcast_ref::<Int32Array>().unwrap();
+        assert_eq!(4, int_arr.value(0));
+        assert_eq!(5, int_arr.value(1));
+        assert_eq!(6, int_arr.value(2));
+    }
+
+    #[test]
     fn test_value_slice_no_bounds_check() {
         let arr = Int32Array::from(vec![2, 3, 4]);
         let _slice = arr.value_slice(0, 4);
diff --git a/rust/arrow/src/array_data.rs b/rust/arrow/src/array_data.rs
index a24dd01..ac6ad4f 100644
--- a/rust/arrow/src/array_data.rs
+++ b/rust/arrow/src/array_data.rs
@@ -28,19 +28,19 @@ use crate::util::bit_util;
 /// An generic representation of Arrow array data which encapsulates common attributes and
 /// operations for Arrow array. Specific operations for different arrays types (e.g.,
 /// primitive, list, struct) are implemented in `Array`.
-#[derive(PartialEq, Debug)]
+#[derive(PartialEq, Debug, Clone)]
 pub struct ArrayData {
     /// The data type for this array data
     data_type: DataType,
 
     /// The number of elements in this array data
-    len: usize,
+    pub(crate) len: usize,
 
     /// The number of null elements in this array data
-    null_count: usize,
+    pub(crate) null_count: usize,
 
     /// The offset into this array data
-    offset: usize,
+    pub(crate) offset: usize,
 
     /// The buffers for this array data. Note that depending on the array types, this
     /// could hold different kinds of buffers (e.g., value buffer, value offset buffer)
@@ -71,8 +71,12 @@ impl ArrayData {
         let null_count = match null_count {
             None => {
                 if let Some(ref buf) = null_bit_buffer {
-                    len.checked_sub(bit_util::count_set_bits_offset(buf.data(), offset))
-                        .unwrap()
+                    len.checked_sub(bit_util::count_set_bits_offset(
+                        buf.data(),
+                        offset,
+                        len,
+                    ))
+                    .unwrap()
                 } else {
                     0
                 }
@@ -294,10 +298,10 @@ mod tests {
         bit_util::set_bit(&mut bit_v, 3);
         bit_util::set_bit(&mut bit_v, 10);
         let arr_data = ArrayData::builder(DataType::Int32)
-            .len(16)
+            .len(12)
             .offset(2)
             .null_bit_buffer(Buffer::from(bit_v))
             .build();
-        assert_eq!(14, arr_data.null_count());
+        assert_eq!(10, arr_data.null_count());
     }
 }
diff --git a/rust/arrow/src/bitmap.rs b/rust/arrow/src/bitmap.rs
index e42e560..cd05b59 100644
--- a/rust/arrow/src/bitmap.rs
+++ b/rust/arrow/src/bitmap.rs
@@ -24,7 +24,7 @@ use crate::util::bit_util;
 
 use std::ops::{BitAnd, BitOr};
 
-#[derive(PartialEq, Debug)]
+#[derive(PartialEq, Debug, Clone)]
 pub struct Bitmap {
     pub(crate) bits: Buffer,
 }
diff --git a/rust/arrow/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs
index 89ebd95..7924f5f 100644
--- a/rust/arrow/src/util/bit_util.rs
+++ b/rust/arrow/src/util/bit_util.rs
@@ -87,26 +87,33 @@ pub fn count_set_bits(data: &[u8]) -> usize {
     count
 }
 
-/// Returns the number of 1-bits in `data`, starting from `offset`.
+/// Returns the number of 1-bits in `data`, starting from `offset` with `length` bits
+/// inspected. Note that both `offset` and `length` are measured in bits.
 #[inline]
-pub fn count_set_bits_offset(data: &[u8], offset: usize) -> usize {
-    debug_assert!(offset <= (data.len() << 3));
-
-    let start_byte_pos = offset >> 3;
-    let start_bit_pos = offset & 7;
-
-    if start_bit_pos == 0 {
-        count_set_bits(&data[start_byte_pos..])
-    } else {
-        let mut result = 0;
-        result += count_set_bits(&data[start_byte_pos + 1..]);
-        for i in start_bit_pos..8 {
-            if get_bit(&data[start_byte_pos..start_byte_pos + 1], i) {
-                result += 1;
-            }
+pub fn count_set_bits_offset(data: &[u8], offset: usize, length: usize) -> usize {
+    let bit_end = offset + length;
+    assert!(bit_end <= (data.len() << 3));
+
+    let byte_start = ::std::cmp::min(round_upto_power_of_2(offset, 8), bit_end);
+    let num_bytes = (bit_end - byte_start) >> 3;
+
+    let mut result = 0;
+
+    for i in offset..byte_start {
+        if get_bit(data, i) {
+            result += 1;
         }
-        result
     }
+    for i in 0..num_bytes {
+        result += POPCOUNT_TABLE[data[(byte_start >> 3) + i] as usize] as usize;
+    }
+    for i in (byte_start + (num_bytes << 3))..bit_end {
+        if get_bit(data, i) {
+            result += 1;
+        }
+    }
+
+    result
 }
 
 /// Returns the ceil of `value`/`divisor`
@@ -264,14 +271,20 @@ mod tests {
 
     #[test]
     fn test_count_bits_offset_slice() {
-        assert_eq!(8, count_set_bits_offset(&[0b11111111], 0));
-        assert_eq!(5, count_set_bits_offset(&[0b11111111], 3));
-        assert_eq!(0, count_set_bits_offset(&[0b11111111], 8));
-        assert_eq!(16, count_set_bits_offset(&[0b11111111, 0b11111111], 0));
-        assert_eq!(13, count_set_bits_offset(&[0b11111111, 0b11111111], 3));
-        assert_eq!(8, count_set_bits_offset(&[0b11111111, 0b11111111], 8));
-        assert_eq!(5, count_set_bits_offset(&[0b11111111, 0b11111111], 11));
-        assert_eq!(0, count_set_bits_offset(&[0b11111111, 0b11111111], 16));
+        assert_eq!(8, count_set_bits_offset(&[0b11111111], 0, 8));
+        assert_eq!(3, count_set_bits_offset(&[0b11111111], 0, 3));
+        assert_eq!(5, count_set_bits_offset(&[0b11111111], 3, 5));
+        assert_eq!(1, count_set_bits_offset(&[0b11111111], 3, 1));
+        assert_eq!(0, count_set_bits_offset(&[0b11111111], 8, 0));
+        assert_eq!(2, count_set_bits_offset(&[0b01010101], 0, 3));
+        assert_eq!(16, count_set_bits_offset(&[0b11111111, 0b11111111], 0, 16));
+        assert_eq!(10, count_set_bits_offset(&[0b11111111, 0b11111111], 0, 10));
+        assert_eq!(10, count_set_bits_offset(&[0b11111111, 0b11111111], 3, 10));
+        assert_eq!(8, count_set_bits_offset(&[0b11111111, 0b11111111], 8, 8));
+        assert_eq!(5, count_set_bits_offset(&[0b11111111, 0b11111111], 11, 5));
+        assert_eq!(0, count_set_bits_offset(&[0b11111111, 0b11111111], 16, 0));
+        assert_eq!(2, count_set_bits_offset(&[0b01101101, 0b10101010], 7, 5));
+        assert_eq!(4, count_set_bits_offset(&[0b01101101, 0b10101010], 7, 9));
     }
 
     #[test]
diff --git a/rust/parquet/src/data_type.rs b/rust/parquet/src/data_type.rs
index e09ba2a..4ef472e 100644
--- a/rust/parquet/src/data_type.rs
+++ b/rust/parquet/src/data_type.rs
@@ -362,7 +362,7 @@ macro_rules! make_type {
     };
 }
 
-/// Generate struct definitions for all physical types
+// Generate struct definitions for all physical types
 
 make_type!(BoolType, Type::BOOLEAN, bool, 1);
 make_type!(Int32Type, Type::INT32, i32, 4);

Reply via email to