This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7c50fa8c44 Improve default implementation of Array::is_nullable (#6721)
7c50fa8c44 is described below
commit 7c50fa8c441fad05e15c85af27db050f3810bef2
Author: Piotr Findeisen <[email protected]>
AuthorDate: Wed Nov 20 23:42:33 2024 +0100
Improve default implementation of Array::is_nullable (#6721)
* Improve default implementation of Array::is_nullable
Since is_nullable returns `false` if the array is guaranteed to not
contain any logical nulls, the correct default implementation could
leverage `self.logical_null_count` more appropriately than
`self.null_count`.
To reduce chance of negative surprises in downstream projects, we could
introduce the new behavior under `is_logically_nullable` name and
deprecate `is_nullable` without changing it.
* Implement efficient DictionaryArray::logical_null_count
---
arrow-array/src/array/dictionary_array.rs | 20 ++++++++++++++++++++
arrow-array/src/array/mod.rs | 3 +--
2 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/arrow-array/src/array/dictionary_array.rs
b/arrow-array/src/array/dictionary_array.rs
index 920be65fcf..55ecd56f98 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -749,6 +749,26 @@ impl<T: ArrowDictionaryKeyType> Array for
DictionaryArray<T> {
}
}
+ fn logical_null_count(&self) -> usize {
+ match (self.keys.nulls(), self.values.logical_nulls()) {
+ (None, None) => 0,
+ (Some(key_nulls), None) => key_nulls.null_count(),
+ (None, Some(value_nulls)) => self
+ .keys
+ .values()
+ .iter()
+ .filter(|k| value_nulls.is_null(k.as_usize()))
+ .count(),
+ (Some(key_nulls), Some(value_nulls)) => self
+ .keys
+ .values()
+ .iter()
+ .enumerate()
+ .filter(|(idx, k)| key_nulls.is_null(*idx) ||
value_nulls.is_null(k.as_usize()))
+ .count(),
+ }
+ }
+
fn is_nullable(&self) -> bool {
!self.is_empty() && (self.nulls().is_some() ||
self.values.is_nullable())
}
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index 04d9883f5b..87577166ea 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -317,8 +317,7 @@ pub trait Array: std::fmt::Debug + Send + Sync {
/// even if the nulls present in [`DictionaryArray::values`] are not
referenced by any key,
/// and therefore would not appear in [`Array::logical_nulls`].
fn is_nullable(&self) -> bool {
- // TODO this is not necessarily perfect default implementation, since
null_count() and logical_null_count() are not always equivalent
- self.null_count() != 0
+ self.logical_null_count() != 0
}
/// Returns the total number of bytes of memory pointed to by this array.