This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7e51d40e57 Add Array::logical_null_count for inspecting number of null
values (#6608)
7e51d40e57 is described below
commit 7e51d40e57392be1eca47d96f1393d8de561fa6d
Author: Piotr Findeisen <[email protected]>
AuthorDate: Tue Oct 22 10:07:48 2024 +0200
Add Array::logical_null_count for inspecting number of null values (#6608)
Add a counterpart of `Array::null_count` that counts the logical null
values. This will be useful in DataFusion. The current alternative is to
compute the null mask (via `Array::logical_nulls()`) and count the nulls in it.
Given that this might be expensive and verbose, callers may naturally feel
steered towards `Array::null_count`, which may or may not be applicable,
depending on the context.
---
arrow-array/src/array/boolean_array.rs | 4 +++
arrow-array/src/array/dictionary_array.rs | 1 +
arrow-array/src/array/fixed_size_binary_array.rs | 1 +
arrow-array/src/array/mod.rs | 37 +++++++++++++++++++++-
arrow-array/src/array/null_array.rs | 7 ++++
arrow-array/src/array/primitive_array.rs | 4 +++
arrow-array/src/array/run_array.rs | 6 ++++
.../src/builder/generic_byte_run_builder.rs | 3 ++
arrow-array/src/builder/generic_list_builder.rs | 2 ++
arrow-array/src/builder/primitive_run_builder.rs | 2 ++
arrow-select/src/concat.rs | 1 +
arrow/src/util/data_gen.rs | 2 ++
.../arrow/array_reader/byte_array_dictionary.rs | 2 ++
13 files changed, 71 insertions(+), 1 deletion(-)
diff --git a/arrow-array/src/array/boolean_array.rs
b/arrow-array/src/array/boolean_array.rs
index fea8616c91..0f95adacf1 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -316,6 +316,10 @@ impl Array for BooleanArray {
self.nulls.as_ref()
}
+ fn logical_null_count(&self) -> usize {
+ self.null_count()
+ }
+
fn get_buffer_memory_size(&self) -> usize {
let mut sum = self.values.inner().capacity();
if let Some(x) = &self.nulls {
diff --git a/arrow-array/src/array/dictionary_array.rs
b/arrow-array/src/array/dictionary_array.rs
index a3931d38d4..6f27b383c0 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -1311,6 +1311,7 @@ mod tests {
assert_eq!(array.values().data_type(), &DataType::Utf8);
assert_eq!(array.null_count(), 1);
+ assert_eq!(array.logical_null_count(), 1);
assert!(array.keys().is_valid(0));
assert!(array.keys().is_valid(1));
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs
b/arrow-array/src/array/fixed_size_binary_array.rs
index e393e2b15a..1371e81e26 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -858,6 +858,7 @@ mod tests {
let array = FixedSizeBinaryArray::from(values);
assert_eq!(array.len(), 4);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 0);
assert_eq!(array.value(0), b"one");
assert_eq!(array.value(1), b"two");
assert_eq!(array.value(2), b"six");
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index f1ed118826..4205d00ec8 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -279,9 +279,31 @@ pub trait Array: std::fmt::Debug + Send + Sync {
self.nulls().map(|n| n.null_count()).unwrap_or_default()
}
+ /// Returns the total number of logical null values in this array.
+ ///
+ /// Note: this method returns the logical null count, i.e. that encoded in
+ /// [`Array::logical_nulls`]. In general this is equivalent to
[`Array::null_count`] but may differ in the
+ /// presence of logical nullability, see [`Array::nulls`] and
[`Array::logical_nulls`].
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow_array::{Array, Int32Array};
+ ///
+ /// // Construct an array with values [1, NULL, NULL]
+ /// let array = Int32Array::from(vec![Some(1), None, None]);
+ ///
+ /// assert_eq!(array.logical_null_count(), 2);
+ /// ```
+ fn logical_null_count(&self) -> usize {
+ self.logical_nulls()
+ .map(|n| n.null_count())
+ .unwrap_or_default()
+ }
+
/// Returns `false` if the array is guaranteed to not contain any logical
nulls
///
- /// In general this will be equivalent to `Array::null_count() != 0` but
may differ in the
+ /// In general this will be equivalent to `Array::logical_null_count() !=
0` but may differ in the
/// presence of logical nullability, see [`Array::logical_nulls`].
///
/// Implementations will return `true` unless they can cheaply prove no
logical nulls
@@ -289,6 +311,7 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// even if the nulls present in [`DictionaryArray::values`] are not
referenced by any key,
/// and therefore would not appear in [`Array::logical_nulls`].
fn is_nullable(&self) -> bool {
+ // TODO this is not necessarily perfect default implementation, since
null_count() and logical_null_count() are not always equivalent
self.null_count() != 0
}
@@ -361,6 +384,10 @@ impl Array for ArrayRef {
self.as_ref().null_count()
}
+ fn logical_null_count(&self) -> usize {
+ self.as_ref().logical_null_count()
+ }
+
fn is_nullable(&self) -> bool {
self.as_ref().is_nullable()
}
@@ -427,6 +454,10 @@ impl<T: Array> Array for &T {
T::null_count(self)
}
+ fn logical_null_count(&self) -> usize {
+ T::logical_null_count(self)
+ }
+
fn is_nullable(&self) -> bool {
T::is_nullable(self)
}
@@ -959,11 +990,13 @@ mod tests {
let array = as_union_array(array.as_ref());
assert_eq!(array.len(), 4);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 4);
for i in 0..4 {
let a = array.value(i);
assert_eq!(a.len(), 1);
assert_eq!(a.null_count(), 1);
+ assert_eq!(a.logical_null_count(), 1);
assert!(a.is_null(0))
}
@@ -987,6 +1020,7 @@ mod tests {
array => {
assert_eq!(array.len(), 4);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 4);
assert_eq!(array.values().len(), 1);
assert_eq!(array.values().null_count(), 1);
assert_eq!(array.run_ends().len(), 4);
@@ -1012,6 +1046,7 @@ mod tests {
assert_eq!(array.len(), 6);
assert_eq!(array.null_count(), 6);
+ assert_eq!(array.logical_null_count(), 6);
array.iter().for_each(|x| assert!(x.is_none()));
}
}
diff --git a/arrow-array/src/array/null_array.rs
b/arrow-array/src/array/null_array.rs
index 88cc2d911f..9a7a5ebe17 100644
--- a/arrow-array/src/array/null_array.rs
+++ b/arrow-array/src/array/null_array.rs
@@ -39,6 +39,7 @@ use std::sync::Arc;
/// assert!(array.is_nullable());
/// assert_eq!(array.len(), 10);
/// assert_eq!(array.null_count(), 0);
+/// assert_eq!(array.logical_null_count(), 10);
/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10);
/// ```
#[derive(Clone)]
@@ -120,6 +121,10 @@ impl Array for NullArray {
!self.is_empty()
}
+ fn logical_null_count(&self) -> usize {
+ self.len
+ }
+
fn get_buffer_memory_size(&self) -> usize {
0
}
@@ -172,6 +177,7 @@ mod tests {
assert_eq!(null_arr.len(), 32);
assert_eq!(null_arr.null_count(), 0);
+ assert_eq!(null_arr.logical_null_count(), 32);
assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32);
assert!(null_arr.is_valid(0));
assert!(null_arr.is_nullable());
@@ -184,6 +190,7 @@ mod tests {
let array2 = array1.slice(8, 16);
assert_eq!(array2.len(), 16);
assert_eq!(array2.null_count(), 0);
+ assert_eq!(array2.logical_null_count(), 16);
assert_eq!(array2.logical_nulls().unwrap().null_count(), 16);
assert!(array2.is_valid(0));
assert!(array2.is_nullable());
diff --git a/arrow-array/src/array/primitive_array.rs
b/arrow-array/src/array/primitive_array.rs
index 0c100e39af..7b0d6c5ca1 100644
--- a/arrow-array/src/array/primitive_array.rs
+++ b/arrow-array/src/array/primitive_array.rs
@@ -1160,6 +1160,10 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
self.nulls.as_ref()
}
+ fn logical_null_count(&self) -> usize {
+ self.null_count()
+ }
+
fn get_buffer_memory_size(&self) -> usize {
let mut size = self.values.inner().capacity();
if let Some(n) = self.nulls.as_ref() {
diff --git a/arrow-array/src/array/run_array.rs
b/arrow-array/src/array/run_array.rs
index 95291f8829..81c8cdcea4 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -777,6 +777,7 @@ mod tests {
assert_eq!(array.len(), 20);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 0);
assert_eq!(
"RunArray {run_ends: [20], values: PrimitiveArray<UInt32>\n[\n
1,\n]}\n",
@@ -798,6 +799,7 @@ mod tests {
assert_eq!(array.len(), 4);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 1);
let array: RunArray<Int16Type> = test.into_iter().collect();
assert_eq!(
@@ -813,6 +815,7 @@ mod tests {
assert_eq!(array.len(), 4);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 0);
let run_ends = array.run_ends();
assert_eq!(&[1, 2, 3, 4], run_ends.values());
@@ -825,6 +828,7 @@ mod tests {
assert_eq!(array.len(), 6);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 3);
let run_ends = array.run_ends();
assert_eq!(&[1, 2, 3, 5, 6], run_ends.values());
@@ -841,6 +845,7 @@ mod tests {
assert_eq!(array.len(), 3);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 3);
let run_ends = array.run_ends();
assert_eq!(3, run_ends.len());
@@ -861,6 +866,7 @@ mod tests {
assert_eq!(array.values().data_type(), &DataType::Utf8);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 1);
assert_eq!(array.len(), 4);
assert_eq!(array.values().null_count(), 1);
diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs
b/arrow-array/src/builder/generic_byte_run_builder.rs
index 3cde76c4a0..0bf5658b29 100644
--- a/arrow-array/src/builder/generic_byte_run_builder.rs
+++ b/arrow-array/src/builder/generic_byte_run_builder.rs
@@ -403,6 +403,7 @@ mod tests {
assert_eq!(array.len(), 11);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 2);
assert_eq!(array.run_ends().values(), &[3, 5, 7, 11]);
@@ -448,6 +449,7 @@ mod tests {
assert_eq!(array.len(), 5);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 1);
assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
@@ -469,6 +471,7 @@ mod tests {
assert_eq!(array.len(), 8);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 1);
assert_eq!(array.run_ends().values(), &[1, 2, 4, 7, 8]);
diff --git a/arrow-array/src/builder/generic_list_builder.rs
b/arrow-array/src/builder/generic_list_builder.rs
index 6ff5f20df6..a7d16f45f5 100644
--- a/arrow-array/src/builder/generic_list_builder.rs
+++ b/arrow-array/src/builder/generic_list_builder.rs
@@ -545,10 +545,12 @@ mod tests {
let array = builder.finish();
assert_eq!(array.value_offsets(), [0, 4, 4, 6, 6]);
assert_eq!(array.null_count(), 1);
+ assert_eq!(array.logical_null_count(), 1);
assert!(array.is_null(3));
let elements = array.values().as_primitive::<Int32Type>();
assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]);
assert_eq!(elements.null_count(), 1);
+ assert_eq!(elements.logical_null_count(), 1);
assert!(elements.is_null(3));
}
diff --git a/arrow-array/src/builder/primitive_run_builder.rs
b/arrow-array/src/builder/primitive_run_builder.rs
index 01a989199b..1db9c91e08 100644
--- a/arrow-array/src/builder/primitive_run_builder.rs
+++ b/arrow-array/src/builder/primitive_run_builder.rs
@@ -277,6 +277,7 @@ mod tests {
let array = builder.finish();
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 1);
assert_eq!(array.len(), 6);
assert_eq!(array.run_ends().values(), &[3, 4, 6]);
@@ -302,6 +303,7 @@ mod tests {
assert_eq!(array.len(), 11);
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 0);
assert_eq!(array.run_ends().values(), &[1, 3, 5, 9, 10, 11]);
assert_eq!(
array.values().as_primitive::<Int16Type>().values(),
diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs
index ca6d0ac1f2..129b90ee04 100644
--- a/arrow-select/src/concat.rs
+++ b/arrow-select/src/concat.rs
@@ -849,5 +849,6 @@ mod tests {
let dict_b = DictionaryArray::new(keys, Arc::new(values));
let array = concat(&[&dict_a, &dict_b]).unwrap();
assert_eq!(array.null_count(), 10);
+ assert_eq!(array.logical_null_count(), 10);
}
}
diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index 55cab368af..56bbdefd52 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -551,6 +551,7 @@ mod tests {
assert_eq!(batch.num_columns(), schema_ref.fields().len());
for array in batch.columns() {
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 0);
}
// Test that the list's child values are non-null
let b_array = batch.column(1);
@@ -710,6 +711,7 @@ mod tests {
assert_eq!(array.len(), 100);
// Map field is not null
assert_eq!(array.null_count(), 0);
+ assert_eq!(array.logical_null_count(), 0);
// Maps have multiple values like a list, so internal arrays are longer
assert!(array.as_map().keys().len() > array.len());
assert!(array.as_map().values().len() > array.len());
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 3678f24621..440db641a2 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -653,6 +653,7 @@ mod tests {
assert_eq!(array.len(), 8);
assert_eq!(array.null_count(), 8);
+ assert_eq!(array.logical_null_count(), 8);
}
for (encoding, page) in pages {
@@ -667,6 +668,7 @@ mod tests {
assert_eq!(array.len(), 8);
assert_eq!(array.null_count(), 8);
+ assert_eq!(array.logical_null_count(), 8);
}
}
}