This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6850288 ARROW-3954: [Rust] Add Slice to Array and ArrayData
6850288 is described below
commit 685028801a05629be6afcb6411f295b85324a761
Author: Chao Sun <[email protected]>
AuthorDate: Sun Mar 10 17:16:32 2019 +0100
ARROW-3954: [Rust] Add Slice to Array and ArrayData
Author: Chao Sun <[email protected]>
Closes #3856 from sunchao/ARROW-3954 and squashes the following commits:
eb922ca9 <Chao Sun> Fix warnings
2cb76f7f <Chao Sun> ARROW-3954: Add Slice to Array and ArrayData
---
rust/arrow/src/array.rs | 61 +++++++++++++++++++++++++++++++++++++++
rust/arrow/src/array_data.rs | 20 +++++++------
rust/arrow/src/bitmap.rs | 2 +-
rust/arrow/src/util/bit_util.rs | 63 +++++++++++++++++++++++++----------------
rust/parquet/src/data_type.rs | 2 +-
5 files changed, 113 insertions(+), 35 deletions(-)
diff --git a/rust/arrow/src/array.rs b/rust/arrow/src/array.rs
index 0d702ee..d15d88b 100644
--- a/rust/arrow/src/array.rs
+++ b/rust/arrow/src/array.rs
@@ -96,6 +96,11 @@ pub trait Array: Send + Sync {
self.data_ref().data_type()
}
+ /// Returns a zero-copy slice of this array with the indicated offset and length.
+ fn slice(&self, offset: usize, length: usize) -> ArrayRef {
+ make_array(slice_data(self.data(), offset, length))
+ }
+
/// Returns the length (i.e., number of elements) of this array
fn len(&self) -> usize {
self.data().len()
@@ -148,6 +153,28 @@ fn make_array(data: ArrayDataRef) -> ArrayRef {
}
}
+fn slice_data(data: ArrayDataRef, mut offset: usize, length: usize) ->
ArrayDataRef {
+ assert!((offset + length) <= data.len());
+
+ let mut new_data = data.as_ref().clone();
+ let len = ::std::cmp::min(new_data.len - offset, length);
+
+ offset += data.offset;
+ new_data.len = len;
+ new_data.offset = offset;
+
+ // Calculate the new null count based on the offset
+ new_data.null_count = if let Some(bitmap) = new_data.null_bitmap() {
+ let valid_bits = bitmap.bits.data();
+ len.checked_sub(bit_util::count_set_bits_offset(valid_bits, offset,
length))
+ .unwrap()
+ } else {
+ 0
+ };
+
+ Arc::new(new_data)
+}
+
/// ----------------------------------------------------------------------------
/// Implementations of different array types
@@ -1095,6 +1122,40 @@ mod tests {
}
#[test]
+ fn test_primitive_array_slice() {
+ let arr = Int32Array::from(vec![
+ Some(0),
+ None,
+ Some(2),
+ None,
+ Some(4),
+ Some(5),
+ Some(6),
+ None,
+ None,
+ ]);
+ assert_eq!(9, arr.len());
+ assert_eq!(0, arr.offset());
+ assert_eq!(4, arr.null_count());
+
+ let arr2 = arr.slice(2, 5);
+ assert_eq!(5, arr2.len());
+ assert_eq!(2, arr2.offset());
+ assert_eq!(1, arr2.null_count());
+ assert!(arr2.is_null(1));
+
+ let arr3 = arr2.slice(2, 3);
+ assert_eq!(3, arr3.len());
+ assert_eq!(4, arr3.offset());
+ assert_eq!(0, arr3.null_count());
+
+ let int_arr = arr3.as_any().downcast_ref::<Int32Array>().unwrap();
+ assert_eq!(4, int_arr.value(0));
+ assert_eq!(5, int_arr.value(1));
+ assert_eq!(6, int_arr.value(2));
+ }
+
+ #[test]
fn test_value_slice_no_bounds_check() {
let arr = Int32Array::from(vec![2, 3, 4]);
let _slice = arr.value_slice(0, 4);
diff --git a/rust/arrow/src/array_data.rs b/rust/arrow/src/array_data.rs
index a24dd01..ac6ad4f 100644
--- a/rust/arrow/src/array_data.rs
+++ b/rust/arrow/src/array_data.rs
@@ -28,19 +28,19 @@ use crate::util::bit_util;
/// An generic representation of Arrow array data which encapsulates common attributes and
/// operations for Arrow array. Specific operations for different arrays types (e.g.,
/// primitive, list, struct) are implemented in `Array`.
-#[derive(PartialEq, Debug)]
+#[derive(PartialEq, Debug, Clone)]
pub struct ArrayData {
/// The data type for this array data
data_type: DataType,
/// The number of elements in this array data
- len: usize,
+ pub(crate) len: usize,
/// The number of null elements in this array data
- null_count: usize,
+ pub(crate) null_count: usize,
/// The offset into this array data
- offset: usize,
+ pub(crate) offset: usize,
/// The buffers for this array data. Note that depending on the array types, this
/// could hold different kinds of buffers (e.g., value buffer, value offset buffer)
@@ -71,8 +71,12 @@ impl ArrayData {
let null_count = match null_count {
None => {
if let Some(ref buf) = null_bit_buffer {
-
len.checked_sub(bit_util::count_set_bits_offset(buf.data(), offset))
- .unwrap()
+ len.checked_sub(bit_util::count_set_bits_offset(
+ buf.data(),
+ offset,
+ len,
+ ))
+ .unwrap()
} else {
0
}
@@ -294,10 +298,10 @@ mod tests {
bit_util::set_bit(&mut bit_v, 3);
bit_util::set_bit(&mut bit_v, 10);
let arr_data = ArrayData::builder(DataType::Int32)
- .len(16)
+ .len(12)
.offset(2)
.null_bit_buffer(Buffer::from(bit_v))
.build();
- assert_eq!(14, arr_data.null_count());
+ assert_eq!(10, arr_data.null_count());
}
}
diff --git a/rust/arrow/src/bitmap.rs b/rust/arrow/src/bitmap.rs
index e42e560..cd05b59 100644
--- a/rust/arrow/src/bitmap.rs
+++ b/rust/arrow/src/bitmap.rs
@@ -24,7 +24,7 @@ use crate::util::bit_util;
use std::ops::{BitAnd, BitOr};
-#[derive(PartialEq, Debug)]
+#[derive(PartialEq, Debug, Clone)]
pub struct Bitmap {
pub(crate) bits: Buffer,
}
diff --git a/rust/arrow/src/util/bit_util.rs b/rust/arrow/src/util/bit_util.rs
index 89ebd95..7924f5f 100644
--- a/rust/arrow/src/util/bit_util.rs
+++ b/rust/arrow/src/util/bit_util.rs
@@ -87,26 +87,33 @@ pub fn count_set_bits(data: &[u8]) -> usize {
count
}
-/// Returns the number of 1-bits in `data`, starting from `offset`.
+/// Returns the number of 1-bits in `data`, starting from `offset` with `length` bits
+/// inspected. Note that both `offset` and `length` are measured in bits.
#[inline]
-pub fn count_set_bits_offset(data: &[u8], offset: usize) -> usize {
- debug_assert!(offset <= (data.len() << 3));
-
- let start_byte_pos = offset >> 3;
- let start_bit_pos = offset & 7;
-
- if start_bit_pos == 0 {
- count_set_bits(&data[start_byte_pos..])
- } else {
- let mut result = 0;
- result += count_set_bits(&data[start_byte_pos + 1..]);
- for i in start_bit_pos..8 {
- if get_bit(&data[start_byte_pos..start_byte_pos + 1], i) {
- result += 1;
- }
+pub fn count_set_bits_offset(data: &[u8], offset: usize, length: usize) ->
usize {
+ let bit_end = offset + length;
+ assert!(bit_end <= (data.len() << 3));
+
+ let byte_start = ::std::cmp::min(round_upto_power_of_2(offset, 8),
bit_end);
+ let num_bytes = (bit_end - byte_start) >> 3;
+
+ let mut result = 0;
+
+ for i in offset..byte_start {
+ if get_bit(data, i) {
+ result += 1;
}
- result
}
+ for i in 0..num_bytes {
+ result += POPCOUNT_TABLE[data[(byte_start >> 3) + i] as usize] as
usize;
+ }
+ for i in (byte_start + (num_bytes << 3))..bit_end {
+ if get_bit(data, i) {
+ result += 1;
+ }
+ }
+
+ result
}
/// Returns the ceil of `value`/`divisor`
@@ -264,14 +271,20 @@ mod tests {
#[test]
fn test_count_bits_offset_slice() {
- assert_eq!(8, count_set_bits_offset(&[0b11111111], 0));
- assert_eq!(5, count_set_bits_offset(&[0b11111111], 3));
- assert_eq!(0, count_set_bits_offset(&[0b11111111], 8));
- assert_eq!(16, count_set_bits_offset(&[0b11111111, 0b11111111], 0));
- assert_eq!(13, count_set_bits_offset(&[0b11111111, 0b11111111], 3));
- assert_eq!(8, count_set_bits_offset(&[0b11111111, 0b11111111], 8));
- assert_eq!(5, count_set_bits_offset(&[0b11111111, 0b11111111], 11));
- assert_eq!(0, count_set_bits_offset(&[0b11111111, 0b11111111], 16));
+ assert_eq!(8, count_set_bits_offset(&[0b11111111], 0, 8));
+ assert_eq!(3, count_set_bits_offset(&[0b11111111], 0, 3));
+ assert_eq!(5, count_set_bits_offset(&[0b11111111], 3, 5));
+ assert_eq!(1, count_set_bits_offset(&[0b11111111], 3, 1));
+ assert_eq!(0, count_set_bits_offset(&[0b11111111], 8, 0));
+ assert_eq!(2, count_set_bits_offset(&[0b01010101], 0, 3));
+ assert_eq!(16, count_set_bits_offset(&[0b11111111, 0b11111111], 0,
16));
+ assert_eq!(10, count_set_bits_offset(&[0b11111111, 0b11111111], 0,
10));
+ assert_eq!(10, count_set_bits_offset(&[0b11111111, 0b11111111], 3,
10));
+ assert_eq!(8, count_set_bits_offset(&[0b11111111, 0b11111111], 8, 8));
+ assert_eq!(5, count_set_bits_offset(&[0b11111111, 0b11111111], 11, 5));
+ assert_eq!(0, count_set_bits_offset(&[0b11111111, 0b11111111], 16, 0));
+ assert_eq!(2, count_set_bits_offset(&[0b01101101, 0b10101010], 7, 5));
+ assert_eq!(4, count_set_bits_offset(&[0b01101101, 0b10101010], 7, 9));
}
#[test]
diff --git a/rust/parquet/src/data_type.rs b/rust/parquet/src/data_type.rs
index e09ba2a..4ef472e 100644
--- a/rust/parquet/src/data_type.rs
+++ b/rust/parquet/src/data_type.rs
@@ -362,7 +362,7 @@ macro_rules! make_type {
};
}
-/// Generate struct definitions for all physical types
+// Generate struct definitions for all physical types
make_type!(BoolType, Type::BOOLEAN, bool, 1);
make_type!(Int32Type, Type::INT32, i32, 4);