This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 979a070dc8 Logical Nullability (#4691)
979a070dc8 is described below
commit 979a070dc82eeb26b38a8651cac879b2c276c0ed
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Aug 14 20:02:45 2023 +0100
Logical Nullability (#4691)
---
arrow-arith/src/arity.rs | 13 +++--
arrow-arith/src/boolean.rs | 14 ++----
arrow-array/src/array/boolean_array.rs | 7 ++-
arrow-array/src/array/dictionary_array.rs | 49 ++++++++++++++++++
arrow-array/src/array/fixed_size_list_array.rs | 6 +--
arrow-array/src/array/list_array.rs | 4 +-
arrow-array/src/array/mod.rs | 60 +++++++++++++++++++++-
arrow-array/src/array/null_array.rs | 33 ++++++------
arrow-array/src/array/run_array.rs | 69 +++++++++++++++++++++++++-
arrow-array/src/array/struct_array.rs | 19 ++++---
arrow-array/src/builder/null_builder.rs | 9 ++--
arrow-array/src/iterator.rs | 16 +++++-
arrow-buffer/src/builder/boolean.rs | 6 +++
arrow-cast/src/cast.rs | 5 +-
arrow-ord/src/comparison.rs | 14 ++++++
arrow-ord/src/sort.rs | 44 ++++++++++++----
arrow-string/src/like.rs | 5 +-
parquet/src/arrow/arrow_writer/levels.rs | 34 +++++++++++--
parquet/src/arrow/arrow_writer/mod.rs | 2 +-
19 files changed, 333 insertions(+), 76 deletions(-)
diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs
index 2dac33a4f2..fdfb26f7f7 100644
--- a/arrow-arith/src/arity.rs
+++ b/arrow-arith/src/arity.rs
@@ -198,7 +198,7 @@ where
return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
}
- let nulls = NullBuffer::union(a.nulls(), b.nulls());
+ let nulls = NullBuffer::union(a.logical_nulls().as_ref(),
b.logical_nulls().as_ref());
let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r));
// JUSTIFICATION
@@ -248,7 +248,7 @@ where
))));
}
- let nulls = NullBuffer::union(a.nulls(), b.nulls());
+ let nulls = NullBuffer::union(a.logical_nulls().as_ref(),
b.logical_nulls().as_ref());
let mut builder = a.into_builder()?;
@@ -296,7 +296,9 @@ where
if a.null_count() == 0 && b.null_count() == 0 {
try_binary_no_nulls(len, a, b, op)
} else {
- let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
+ let nulls =
+ NullBuffer::union(a.logical_nulls().as_ref(),
b.logical_nulls().as_ref())
+ .unwrap();
let mut buffer = BufferBuilder::<O::Native>::new(len);
buffer.append_n_zeroed(len);
@@ -355,7 +357,10 @@ where
if a.null_count() == 0 && b.null_count() == 0 {
try_binary_no_nulls_mut(len, a, b, op)
} else {
- let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
+ let nulls =
+ NullBuffer::union(a.logical_nulls().as_ref(),
b.logical_nulls().as_ref())
+ .unwrap();
+
let mut builder = a.into_builder()?;
let slice = builder.values_slice_mut();
diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs
index 61e591d516..46e5998208 100644
--- a/arrow-arith/src/boolean.rs
+++ b/arrow-arith/src/boolean.rs
@@ -25,7 +25,7 @@
use arrow_array::*;
use arrow_buffer::buffer::{bitwise_bin_op_helper,
bitwise_quaternary_op_helper};
use arrow_buffer::{BooleanBuffer, NullBuffer};
-use arrow_schema::{ArrowError, DataType};
+use arrow_schema::ArrowError;
/// Logical 'and' boolean values with Kleene logic
///
@@ -311,11 +311,7 @@ pub fn not(left: &BooleanArray) -> Result<BooleanArray,
ArrowError> {
/// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true]));
/// ```
pub fn is_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
- let values = match input.nulls() {
- // NullArray has no nulls buffer yet all values are null
- None if input.data_type() == &DataType::Null => {
- BooleanBuffer::new_set(input.len())
- }
+ let values = match input.logical_nulls() {
None => BooleanBuffer::new_unset(input.len()),
Some(nulls) => !nulls.inner(),
};
@@ -335,11 +331,7 @@ pub fn is_null(input: &dyn Array) -> Result<BooleanArray,
ArrowError> {
/// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false]));
/// ```
pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
- let values = match input.nulls() {
- // NullArray has no nulls buffer yet all values are null
- None if input.data_type() == &DataType::Null => {
- BooleanBuffer::new_unset(input.len())
- }
+ let values = match input.logical_nulls() {
None => BooleanBuffer::new_set(input.len()),
Some(n) => n.inner().clone(),
};
diff --git a/arrow-array/src/array/boolean_array.rs
b/arrow-array/src/array/boolean_array.rs
index 14fa87e138..995bb7d510 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -205,7 +205,7 @@ impl BooleanArray {
where
F: FnMut(T::Item) -> bool,
{
- let nulls = left.nulls().cloned();
+ let nulls = left.logical_nulls();
let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
// SAFETY: i in range 0..len
op(left.value_unchecked(i))
@@ -239,7 +239,10 @@ impl BooleanArray {
{
assert_eq!(left.len(), right.len());
- let nulls = NullBuffer::union(left.nulls(), right.nulls());
+ let nulls = NullBuffer::union(
+ left.logical_nulls().as_ref(),
+ right.logical_nulls().as_ref(),
+ );
let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
// SAFETY: i in range 0..len
op(left.value_unchecked(i), right.value_unchecked(i))
diff --git a/arrow-array/src/array/dictionary_array.rs
b/arrow-array/src/array/dictionary_array.rs
index 5a2f439a8e..2d80c75f07 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -729,6 +729,31 @@ impl<T: ArrowDictionaryKeyType> Array for
DictionaryArray<T> {
self.keys.nulls()
}
+ fn logical_nulls(&self) -> Option<NullBuffer> {
+ match self.values.nulls() {
+ None => self.nulls().cloned(),
+ Some(value_nulls) => {
+ let mut builder = BooleanBufferBuilder::new(self.len());
+ match self.keys.nulls() {
+ Some(n) => builder.append_buffer(n.inner()),
+ None => builder.append_n(self.len(), true),
+ }
+ for (idx, k) in self.keys.values().iter().enumerate() {
+ let k = k.as_usize();
+ // Check range to allow for nulls
+ if k < value_nulls.len() && value_nulls.is_null(k) {
+ builder.set_bit(idx, false);
+ }
+ }
+ Some(builder.finish().into())
+ }
+ }
+ }
+
+ fn is_nullable(&self) -> bool {
+ !self.is_empty() && (self.nulls().is_some() ||
self.values.is_nullable())
+ }
+
fn get_buffer_memory_size(&self) -> usize {
self.keys.get_buffer_memory_size() +
self.values.get_buffer_memory_size()
}
@@ -843,6 +868,14 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for
TypedDictionaryArray<'a,
self.dictionary.nulls()
}
+ fn logical_nulls(&self) -> Option<NullBuffer> {
+ self.dictionary.logical_nulls()
+ }
+
+ fn is_nullable(&self) -> bool {
+ self.dictionary.is_nullable()
+ }
+
fn get_buffer_memory_size(&self) -> usize {
self.dictionary.get_buffer_memory_size()
}
@@ -1253,4 +1286,20 @@ mod tests {
assert_eq!(v, expected, "{idx}");
}
}
+
+ #[test]
+ fn test_iterator_nulls() {
+ let keys = Int32Array::new(
+ vec![0, 700, 1, 2].into(),
+ Some(NullBuffer::from(vec![true, false, true, true])),
+ );
+ let values = Int32Array::from(vec![Some(50), None, Some(2)]);
+ let dict = DictionaryArray::new(keys, Arc::new(values));
+ let values: Vec<_> = dict
+ .downcast_dict::<Int32Array>()
+ .unwrap()
+ .into_iter()
+ .collect();
+ assert_eq!(values, &[Some(50), None, None, Some(2)])
+ }
}
diff --git a/arrow-array/src/array/fixed_size_list_array.rs
b/arrow-array/src/array/fixed_size_list_array.rs
index 6c3abb556a..8996fc8da4 100644
--- a/arrow-array/src/array/fixed_size_list_array.rs
+++ b/arrow-array/src/array/fixed_size_list_array.rs
@@ -147,7 +147,7 @@ impl FixedSizeListArray {
/// * `size < 0`
/// * `values.len() / size != nulls.len()`
/// * `values.data_type() != field.data_type()`
- /// * `!field.is_nullable() &&
!nulls.expand(size).contains(values.nulls())`
+ /// * `!field.is_nullable() &&
!nulls.expand(size).contains(values.logical_nulls())`
pub fn try_new(
field: FieldRef,
size: i32,
@@ -181,11 +181,11 @@ impl FixedSizeListArray {
)));
}
- if let Some(a) = values.nulls() {
+ if let Some(a) = values.logical_nulls() {
let nulls_valid = field.is_nullable()
|| nulls
.as_ref()
- .map(|n| n.expand(size as _).contains(a))
+ .map(|n| n.expand(size as _).contains(&a))
.unwrap_or_default();
if !nulls_valid {
diff --git a/arrow-array/src/array/list_array.rs
b/arrow-array/src/array/list_array.rs
index 05628084c8..f5b7ae77c3 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -161,7 +161,7 @@ impl<OffsetSize: OffsetSizeTrait>
GenericListArray<OffsetSize> {
///
/// * `offsets.len() - 1 != nulls.len()`
/// * `offsets.last() > values.len()`
- /// * `!field.is_nullable() && values.null_count() != 0`
+ /// * `!field.is_nullable() && values.is_nullable()`
/// * `field.data_type() != values.data_type()`
pub fn try_new(
field: FieldRef,
@@ -189,7 +189,7 @@ impl<OffsetSize: OffsetSizeTrait>
GenericListArray<OffsetSize> {
)));
}
}
- if !field.is_nullable() && values.null_count() != 0 {
+ if !field.is_nullable() && values.is_nullable() {
return Err(ArrowError::InvalidArgumentError(format!(
"Non-nullable field of {}ListArray {:?} cannot contain nulls",
OffsetSize::PREFIX,
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index 0157279dfe..79240d105a 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -173,12 +173,33 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// ```
fn offset(&self) -> usize;
- /// Returns the null buffers of this array if any
+ /// Returns the null buffer of this array if any
+ ///
+ /// Note: some arrays can encode their nullability in their children, for
example,
+ /// [`DictionaryArray::values`] values or [`RunArray::values`], or without
a null buffer,
+ /// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a
computed mask encoding this
fn nulls(&self) -> Option<&NullBuffer>;
+ /// Returns the logical null buffer of this array if any
+ ///
+ /// In most cases this will be the same as [`Array::nulls`], except for:
+ ///
+ /// * DictionaryArray where [`DictionaryArray::values`] contains nulls
+ /// * RunArray where [`RunArray::values`] contains nulls
+ /// * NullArray where all indices are nulls
+ ///
+ /// In these cases a logical [`NullBuffer`] will be computed, encoding the
logical nullability
+ /// of these arrays, beyond what is encoded in [`Array::nulls`]
+ fn logical_nulls(&self) -> Option<NullBuffer> {
+ self.nulls().cloned()
+ }
+
/// Returns whether the element at `index` is null.
/// When using this function on a slice, the index is relative to the
slice.
///
+ /// Note: this method returns the physical nullability, i.e. that encoded
in [`Array::nulls`]
+ /// see [`Array::logical_nulls`] for logical nullability
+ ///
/// # Example:
///
/// ```
@@ -196,6 +217,9 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// Returns whether the element at `index` is not null.
/// When using this function on a slice, the index is relative to the
slice.
///
+ /// Note: this method returns the physical nullability, i.e. that encoded
in [`Array::nulls`]
+ /// see [`Array::logical_nulls`] for logical nullability
+ ///
/// # Example:
///
/// ```
@@ -210,7 +234,10 @@ pub trait Array: std::fmt::Debug + Send + Sync {
!self.is_null(index)
}
- /// Returns the total number of null values in this array.
+ /// Returns the total number of physical null values in this array.
+ ///
+ /// Note: this method returns the physical null count, i.e. that encoded
in [`Array::nulls`],
+ /// see [`Array::logical_nulls`] for logical nullability
///
/// # Example:
///
@@ -226,6 +253,19 @@ pub trait Array: std::fmt::Debug + Send + Sync {
self.nulls().map(|n| n.null_count()).unwrap_or_default()
}
+ /// Returns `false` if the array is guaranteed to not contain any logical
nulls
+ ///
+ /// In general this will be equivalent to `Array::null_count() != 0` but
may differ in the
+ /// presence of logical nullability, see [`Array::logical_nulls`].
+ ///
+ /// Implementations will return `true` unless they can cheaply prove no
logical nulls
+ /// are present. For example a [`DictionaryArray`] with nullable values
will still return true,
+ /// even if the nulls present in [`DictionaryArray::values`] are not
referenced by any key,
+ /// and therefore would not appear in [`Array::logical_nulls`].
+ fn is_nullable(&self) -> bool {
+ self.null_count() != 0
+ }
+
/// Returns the total number of bytes of memory pointed to by this array.
/// The buffers store bytes in the Arrow memory format, and include the
data as well as the validity map.
fn get_buffer_memory_size(&self) -> usize;
@@ -277,6 +317,10 @@ impl Array for ArrayRef {
self.as_ref().nulls()
}
+ fn logical_nulls(&self) -> Option<NullBuffer> {
+ self.as_ref().logical_nulls()
+ }
+
fn is_null(&self, index: usize) -> bool {
self.as_ref().is_null(index)
}
@@ -289,6 +333,10 @@ impl Array for ArrayRef {
self.as_ref().null_count()
}
+ fn is_nullable(&self) -> bool {
+ self.as_ref().is_nullable()
+ }
+
fn get_buffer_memory_size(&self) -> usize {
self.as_ref().get_buffer_memory_size()
}
@@ -335,6 +383,10 @@ impl<'a, T: Array> Array for &'a T {
T::nulls(self)
}
+ fn logical_nulls(&self) -> Option<NullBuffer> {
+ T::logical_nulls(self)
+ }
+
fn is_null(&self, index: usize) -> bool {
T::is_null(self, index)
}
@@ -347,6 +399,10 @@ impl<'a, T: Array> Array for &'a T {
T::null_count(self)
}
+ fn is_nullable(&self) -> bool {
+ T::is_nullable(self)
+ }
+
fn get_buffer_memory_size(&self) -> usize {
T::get_buffer_memory_size(self)
}
diff --git a/arrow-array/src/array/null_array.rs
b/arrow-array/src/array/null_array.rs
index c054c89043..af3ec0b57d 100644
--- a/arrow-array/src/array/null_array.rs
+++ b/arrow-array/src/array/null_array.rs
@@ -36,8 +36,10 @@ use std::sync::Arc;
///
/// let array = NullArray::new(10);
///
+/// assert!(array.is_nullable());
/// assert_eq!(array.len(), 10);
-/// assert_eq!(array.null_count(), 10);
+/// assert_eq!(array.null_count(), 0);
+/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10);
/// ```
#[derive(Clone)]
pub struct NullArray {
@@ -107,22 +109,12 @@ impl Array for NullArray {
None
}
- /// Returns whether the element at `index` is null.
- /// All elements of a `NullArray` are always null.
- fn is_null(&self, _index: usize) -> bool {
- true
+ fn logical_nulls(&self) -> Option<NullBuffer> {
+ (self.len != 0).then(|| NullBuffer::new_null(self.len))
}
- /// Returns whether the element at `index` is valid.
- /// All elements of a `NullArray` are always invalid.
- fn is_valid(&self, _index: usize) -> bool {
- false
- }
-
- /// Returns the total number of null values in this array.
- /// The null count of a `NullArray` always equals its length.
- fn null_count(&self) -> usize {
- self.len()
+ fn is_nullable(&self) -> bool {
+ !self.is_empty()
}
fn get_buffer_memory_size(&self) -> usize {
@@ -176,8 +168,10 @@ mod tests {
let null_arr = NullArray::new(32);
assert_eq!(null_arr.len(), 32);
- assert_eq!(null_arr.null_count(), 32);
- assert!(!null_arr.is_valid(0));
+ assert_eq!(null_arr.null_count(), 0);
+ assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32);
+ assert!(null_arr.is_valid(0));
+ assert!(null_arr.is_nullable());
}
#[test]
@@ -186,7 +180,10 @@ mod tests {
let array2 = array1.slice(8, 16);
assert_eq!(array2.len(), 16);
- assert_eq!(array2.null_count(), 16);
+ assert_eq!(array2.null_count(), 0);
+ assert_eq!(array2.logical_nulls().unwrap().null_count(), 16);
+ assert!(array2.is_valid(0));
+ assert!(array2.is_nullable());
}
#[test]
diff --git a/arrow-array/src/array/run_array.rs
b/arrow-array/src/array/run_array.rs
index 820d5c9ebf..30cefaeb4d 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -18,7 +18,7 @@
use std::any::Any;
use std::sync::Arc;
-use arrow_buffer::{ArrowNativeType, NullBuffer, RunEndBuffer};
+use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer,
RunEndBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field};
@@ -349,6 +349,43 @@ impl<T: RunEndIndexType> Array for RunArray<T> {
None
}
+ fn logical_nulls(&self) -> Option<NullBuffer> {
+ let len = self.len();
+ let nulls = self.values.logical_nulls()?;
+ let mut out = BooleanBufferBuilder::new(len);
+ let offset = self.run_ends.offset();
+ let mut valid_start = 0;
+ let mut last_end = 0;
+ for (idx, end) in self.run_ends.values().iter().enumerate() {
+ let end = end.as_usize();
+ if end < offset {
+ continue;
+ }
+ let end = (end - offset).min(len);
+ if nulls.is_null(idx) {
+ if valid_start < last_end {
+ out.append_n(last_end - valid_start, true);
+ }
+ out.append_n(end - last_end, false);
+ valid_start = end;
+ }
+ last_end = end;
+ if end == len {
+ break;
+ }
+ }
+ if valid_start < len {
+ out.append_n(len - valid_start, true)
+ }
+ // Sanity check
+ assert_eq!(out.len(), len);
+ Some(out.finish().into())
+ }
+
+ fn is_nullable(&self) -> bool {
+ !self.is_empty() && self.values.is_nullable()
+ }
+
fn get_buffer_memory_size(&self) -> usize {
self.run_ends.inner().inner().capacity() +
self.values.get_buffer_memory_size()
}
@@ -569,6 +606,14 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for
TypedRunArray<'a, R, V> {
self.run_array.nulls()
}
+ fn logical_nulls(&self) -> Option<NullBuffer> {
+ self.run_array.logical_nulls()
+ }
+
+ fn is_nullable(&self) -> bool {
+ self.run_array.is_nullable()
+ }
+
fn get_buffer_memory_size(&self) -> usize {
self.run_array.get_buffer_memory_size()
}
@@ -1041,4 +1086,26 @@ mod tests {
);
}
}
+
+ #[test]
+ fn test_logical_nulls() {
+ let run = Int32Array::from(vec![3, 6, 9, 12]);
+ let values = Int32Array::from(vec![Some(0), None, Some(1), None]);
+ let array = RunArray::try_new(&run, &values).unwrap();
+
+ let expected = vec![
+ true, true, true, false, false, false, true, true, true, false,
false, false,
+ ];
+
+ let n = array.logical_nulls().unwrap();
+ assert_eq!(n.null_count(), 6);
+
+ let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)];
+ for (offset, length) in slices {
+ let a = array.slice(offset, length);
+ let n = a.logical_nulls().unwrap();
+ let n = n.into_iter().collect::<Vec<_>>();
+ assert_eq!(&n, &expected[offset..offset + length], "{offset}
{length}");
+ }
+ }
}
diff --git a/arrow-array/src/array/struct_array.rs
b/arrow-array/src/array/struct_array.rs
index 1a79ebd95f..284c3b26a9 100644
--- a/arrow-array/src/array/struct_array.rs
+++ b/arrow-array/src/array/struct_array.rs
@@ -143,15 +143,14 @@ impl StructArray {
)));
}
- if let Some(a) = a.nulls() {
- let nulls_valid = f.is_nullable()
- || nulls.as_ref().map(|n|
n.contains(a)).unwrap_or_default();
-
- if !nulls_valid {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Found unmasked nulls for non-nullable StructArray
field {:?}",
- f.name()
- )));
+ if !f.is_nullable() {
+ if let Some(a) = a.logical_nulls() {
+ if !nulls.as_ref().map(|n|
n.contains(&a)).unwrap_or_default() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Found unmasked nulls for non-nullable StructArray
field {:?}",
+ f.name()
+ )));
+ }
}
}
}
@@ -314,7 +313,7 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
.into_iter()
.map(|(name, array)| {
(
- Field::new(name, array.data_type().clone(),
array.nulls().is_some()),
+ Field::new(name, array.data_type().clone(),
array.is_nullable()),
array,
)
})
diff --git a/arrow-array/src/builder/null_builder.rs
b/arrow-array/src/builder/null_builder.rs
index 94cb7f5cc2..53a6b103d5 100644
--- a/arrow-array/src/builder/null_builder.rs
+++ b/arrow-array/src/builder/null_builder.rs
@@ -40,7 +40,7 @@ use std::sync::Arc;
/// let arr = b.finish();
///
/// assert_eq!(8, arr.len());
-/// assert_eq!(8, arr.null_count());
+/// assert_eq!(0, arr.null_count());
/// ```
#[derive(Debug)]
pub struct NullBuilder {
@@ -160,7 +160,8 @@ mod tests {
let arr = builder.finish();
assert_eq!(20, arr.len());
assert_eq!(0, arr.offset());
- assert_eq!(20, arr.null_count());
+ assert_eq!(0, arr.null_count());
+ assert!(arr.is_nullable());
}
#[test]
@@ -170,10 +171,10 @@ mod tests {
builder.append_empty_value();
builder.append_empty_values(3);
let mut array = builder.finish_cloned();
- assert_eq!(21, array.null_count());
+ assert_eq!(21, array.len());
builder.append_empty_values(5);
array = builder.finish();
- assert_eq!(26, array.null_count());
+ assert_eq!(26, array.len());
}
}
diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs
index 86f5d99128..a198332ca5 100644
--- a/arrow-array/src/iterator.rs
+++ b/arrow-array/src/iterator.rs
@@ -22,6 +22,7 @@ use crate::array::{
GenericListArray, GenericStringArray, PrimitiveArray,
};
use crate::{FixedSizeListArray, MapArray};
+use arrow_buffer::NullBuffer;
/// An iterator that returns Some(T) or None, that can be used on any
[`ArrayAccessor`]
///
@@ -46,6 +47,7 @@ use crate::{FixedSizeListArray, MapArray};
#[derive(Debug)]
pub struct ArrayIter<T: ArrayAccessor> {
array: T,
+ logical_nulls: Option<NullBuffer>,
current: usize,
current_end: usize,
}
@@ -54,12 +56,22 @@ impl<T: ArrayAccessor> ArrayIter<T> {
/// create a new iterator
pub fn new(array: T) -> Self {
let len = array.len();
+ let logical_nulls = array.logical_nulls();
ArrayIter {
array,
+ logical_nulls,
current: 0,
current_end: len,
}
}
+
+ #[inline]
+ fn is_null(&self, idx: usize) -> bool {
+ self.logical_nulls
+ .as_ref()
+ .map(|x| x.is_null(idx))
+ .unwrap_or_default()
+ }
}
impl<T: ArrayAccessor> Iterator for ArrayIter<T> {
@@ -69,7 +81,7 @@ impl<T: ArrayAccessor> Iterator for ArrayIter<T> {
fn next(&mut self) -> Option<Self::Item> {
if self.current == self.current_end {
None
- } else if self.array.is_null(self.current) {
+ } else if self.is_null(self.current) {
self.current += 1;
Some(None)
} else {
@@ -98,7 +110,7 @@ impl<T: ArrayAccessor> DoubleEndedIterator for ArrayIter<T> {
None
} else {
self.current_end -= 1;
- Some(if self.array.is_null(self.current_end) {
+ Some(if self.is_null(self.current_end) {
None
} else {
// Safety:
diff --git a/arrow-buffer/src/builder/boolean.rs
b/arrow-buffer/src/builder/boolean.rs
index f84cfa79c2..f0e7f0f136 100644
--- a/arrow-buffer/src/builder/boolean.rs
+++ b/arrow-buffer/src/builder/boolean.rs
@@ -203,6 +203,12 @@ impl BooleanBufferBuilder {
);
}
+ /// Append [`BooleanBuffer`] to this [`BooleanBufferBuilder`]
+ pub fn append_buffer(&mut self, buffer: &BooleanBuffer) {
+ let range = buffer.offset()..buffer.offset() + buffer.len();
+ self.append_packed_range(range, buffer.values())
+ }
+
/// Returns the packed bits
pub fn as_slice(&self) -> &[u8] {
self.buffer.as_slice()
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index c7fd082de2..a08a7a4fd4 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -7233,9 +7233,8 @@ mod tests {
assert_eq!(array.data_type(), &data_type);
let cast_array = cast(&array, &DataType::Null).expect("cast failed");
assert_eq!(cast_array.data_type(), &DataType::Null);
- for i in 0..4 {
- assert!(cast_array.is_null(i));
- }
+ assert_eq!(cast_array.len(), 4);
+ assert_eq!(cast_array.logical_nulls().unwrap().null_count(), 4);
}
#[test]
diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs
index d18b0e36e9..21583fac08 100644
--- a/arrow-ord/src/comparison.rs
+++ b/arrow-ord/src/comparison.rs
@@ -6352,4 +6352,18 @@ mod tests {
.to_string()
.contains("Could not convert ToType with to_i128"));
}
+
+ #[test]
+ #[cfg(feature = "dyn_cmp_dict")]
+ fn test_dictionary_nested_nulls() {
+ let keys = Int32Array::from(vec![0, 1, 2]);
+ let v1 = Arc::new(Int32Array::from(vec![Some(0), None, Some(2)]));
+ let a = DictionaryArray::new(keys.clone(), v1);
+ let v2 = Arc::new(Int32Array::from(vec![None, Some(0), Some(2)]));
+ let b = DictionaryArray::new(keys, v2);
+
+ let r = eq_dyn(&a, &b).unwrap();
+ assert_eq!(r.null_count(), 2);
+ assert!(r.is_valid(2));
+ }
}
diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs
index 648a7d7afc..8785863059 100644
--- a/arrow-ord/src/sort.rs
+++ b/arrow-ord/src/sort.rs
@@ -719,19 +719,19 @@ where
}
}
-type LexicographicalCompareItem<'a> = (
- Option<&'a NullBuffer>, // nulls
- DynComparator, // comparator
- SortOptions, // sort_option
+type LexicographicalCompareItem = (
+ Option<NullBuffer>, // nulls
+ DynComparator, // comparator
+ SortOptions, // sort_option
);
/// A lexicographical comparator that wraps given array data (columns) and can
lexicographically compare data
/// at given two indices. The lifetime is the same at the data wrapped.
-pub struct LexicographicalComparator<'a> {
- compare_items: Vec<LexicographicalCompareItem<'a>>,
+pub struct LexicographicalComparator {
+ compare_items: Vec<LexicographicalCompareItem>,
}
-impl LexicographicalComparator<'_> {
+impl LexicographicalComparator {
/// lexicographically compare values at the wrapped columns with given
indices.
pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering {
for (nulls, comparator, sort_option) in &self.compare_items {
@@ -780,14 +780,14 @@ impl LexicographicalComparator<'_> {
/// results with two indices.
pub fn try_new(
columns: &[SortColumn],
- ) -> Result<LexicographicalComparator<'_>, ArrowError> {
+ ) -> Result<LexicographicalComparator, ArrowError> {
let compare_items = columns
.iter()
.map(|column| {
// flatten and convert build comparators
let values = column.values.as_ref();
Ok((
- values.nulls(),
+ values.logical_nulls(),
build_compare(values, values)?,
column.options.unwrap_or_default(),
))
@@ -4016,4 +4016,30 @@ mod tests {
vec![None, None, None, Some(5.1), Some(5.1), Some(3.0), Some(1.2)],
);
}
+
+ #[test]
+ fn test_lexicographic_comparator_null_dict_values() {
+ let values = Int32Array::new(
+ vec![1, 2, 3, 4].into(),
+ Some(NullBuffer::from(vec![true, false, false, true])),
+ );
+ let keys = Int32Array::new(
+ vec![0, 1, 53, 3].into(),
+ Some(NullBuffer::from(vec![true, true, false, true])),
+ );
+ // [1, NULL, NULL, 4]
+ let dict = DictionaryArray::new(keys, Arc::new(values));
+
+ let comparator = LexicographicalComparator::try_new(&[SortColumn {
+ values: Arc::new(dict),
+ options: None,
+ }])
+ .unwrap();
+ // 1.cmp(NULL)
+ assert_eq!(comparator.compare(0, 1), Ordering::Greater);
+ // NULL.cmp(NULL)
+ assert_eq!(comparator.compare(2, 1), Ordering::Equal);
+ // NULL.cmp(4)
+ assert_eq!(comparator.compare(2, 3), Ordering::Less);
+ }
}
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 1223280e37..9d3abea66f 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -581,7 +581,10 @@ where
));
}
- let nulls = NullBuffer::union(left.nulls(), right.nulls());
+ let nulls = NullBuffer::union(
+ left.logical_nulls().as_ref(),
+ right.logical_nulls().as_ref(),
+ );
let mut result = BooleanBufferBuilder::new(left.len());
for i in 0..left.len() {
diff --git a/parquet/src/arrow/arrow_writer/levels.rs
b/parquet/src/arrow/arrow_writer/levels.rs
index 47b0189030..48615dc3d5 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -494,7 +494,7 @@ impl LevelInfoBuilder {
def_levels.reserve(len);
info.non_null_indices.reserve(len);
- match array.nulls() {
+ match array.logical_nulls() {
Some(nulls) => {
// TODO: Faster bitmask iteration (#1757)
for i in range {
@@ -1751,7 +1751,6 @@ mod tests {
builder.write(&a, 0..4);
let levels = builder.finish();
- let list_level = levels.get(0).unwrap();
let expected_level = LevelInfo {
def_levels: Some(vec![5, 4, 5, 2, 5, 3, 5, 5, 4, 4, 0]),
rep_levels: Some(vec![0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0]),
@@ -1760,6 +1759,35 @@ mod tests {
max_rep_level: 2,
};
- assert_eq!(list_level, &expected_level);
+ assert_eq!(levels[0], expected_level);
+ }
+
+ #[test]
+ fn test_null_dictionary_values() {
+ let values = Int32Array::new(
+ vec![1, 2, 3, 4].into(),
+ Some(NullBuffer::from(vec![true, false, true, true])),
+ );
+ let keys = Int32Array::new(
+ vec![1, 54, 2, 0].into(),
+ Some(NullBuffer::from(vec![true, false, true, true])),
+ );
+ // [NULL, NULL, 3, 0]
+ let dict = DictionaryArray::new(keys, Arc::new(values));
+
+ let item_field = Field::new("item", dict.data_type().clone(), true);
+
+ let mut builder =
+ LevelInfoBuilder::try_new(&item_field,
Default::default()).unwrap();
+ builder.write(&dict, 0..4);
+ let levels = builder.finish();
+ let expected_level = LevelInfo {
+ def_levels: Some(vec![0, 0, 1, 1]),
+ rep_levels: None,
+ non_null_indices: vec![2, 3],
+ max_def_level: 1,
+ max_rep_level: 0,
+ };
+ assert_eq!(levels[0], expected_level);
}
}
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index d3d4e2626f..c4d174b6ad 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1965,7 +1965,7 @@ mod tests {
assert_eq!(a.value(0).len(), 0);
assert_eq!(a.value(2).len(), 2);
- assert_eq!(a.value(2).null_count(), 2);
+ assert_eq!(a.value(2).logical_nulls().unwrap().null_count(), 2);
let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(a)]).unwrap();
roundtrip(batch, None);