This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 7e51d40e57 Add Array::logical_null_count for inspecting number of null values (#6608)
7e51d40e57 is described below

commit 7e51d40e57392be1eca47d96f1393d8de561fa6d
Author: Piotr Findeisen <[email protected]>
AuthorDate: Tue Oct 22 10:07:48 2024 +0200

    Add Array::logical_null_count for inspecting number of null values (#6608)
    
    Add a counterpart of `Array::null_count` that counts the logical null
    values. This will be useful in DataFusion. The current alternative is to
    compute the null mask (via `Array::logical_nulls()`) and count on it.
    Given that this might be expensive and verbose, callers may naturally feel
    steered towards `Array::null_count`, which may or may not be applicable
    depending on the context.
---
 arrow-array/src/array/boolean_array.rs             |  4 +++
 arrow-array/src/array/dictionary_array.rs          |  1 +
 arrow-array/src/array/fixed_size_binary_array.rs   |  1 +
 arrow-array/src/array/mod.rs                       | 37 +++++++++++++++++++++-
 arrow-array/src/array/null_array.rs                |  7 ++++
 arrow-array/src/array/primitive_array.rs           |  4 +++
 arrow-array/src/array/run_array.rs                 |  6 ++++
 .../src/builder/generic_byte_run_builder.rs        |  3 ++
 arrow-array/src/builder/generic_list_builder.rs    |  2 ++
 arrow-array/src/builder/primitive_run_builder.rs   |  2 ++
 arrow-select/src/concat.rs                         |  1 +
 arrow/src/util/data_gen.rs                         |  2 ++
 .../arrow/array_reader/byte_array_dictionary.rs    |  2 ++
 13 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/arrow-array/src/array/boolean_array.rs 
b/arrow-array/src/array/boolean_array.rs
index fea8616c91..0f95adacf1 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -316,6 +316,10 @@ impl Array for BooleanArray {
         self.nulls.as_ref()
     }
 
+    fn logical_null_count(&self) -> usize {
+        self.null_count()
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         let mut sum = self.values.inner().capacity();
         if let Some(x) = &self.nulls {
diff --git a/arrow-array/src/array/dictionary_array.rs 
b/arrow-array/src/array/dictionary_array.rs
index a3931d38d4..6f27b383c0 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -1311,6 +1311,7 @@ mod tests {
         assert_eq!(array.values().data_type(), &DataType::Utf8);
 
         assert_eq!(array.null_count(), 1);
+        assert_eq!(array.logical_null_count(), 1);
 
         assert!(array.keys().is_valid(0));
         assert!(array.keys().is_valid(1));
diff --git a/arrow-array/src/array/fixed_size_binary_array.rs 
b/arrow-array/src/array/fixed_size_binary_array.rs
index e393e2b15a..1371e81e26 100644
--- a/arrow-array/src/array/fixed_size_binary_array.rs
+++ b/arrow-array/src/array/fixed_size_binary_array.rs
@@ -858,6 +858,7 @@ mod tests {
         let array = FixedSizeBinaryArray::from(values);
         assert_eq!(array.len(), 4);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 0);
         assert_eq!(array.value(0), b"one");
         assert_eq!(array.value(1), b"two");
         assert_eq!(array.value(2), b"six");
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index f1ed118826..4205d00ec8 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -279,9 +279,31 @@ pub trait Array: std::fmt::Debug + Send + Sync {
         self.nulls().map(|n| n.null_count()).unwrap_or_default()
     }
 
+    /// Returns the total number of logical null values in this array.
+    ///
+    /// Note: this method returns the logical null count, i.e. that encoded in
+    /// [`Array::logical_nulls`]. In general this is equivalent to [`Array::null_count`] but may differ in the
+    /// presence of logical nullability, see [`Array::nulls`] and [`Array::logical_nulls`].
+    ///
+    /// # Example:
+    ///
+    /// ```
+    /// use arrow_array::{Array, Int32Array};
+    ///
+    /// // Construct an array with values [1, NULL, NULL]
+    /// let array = Int32Array::from(vec![Some(1), None, None]);
+    ///
+    /// assert_eq!(array.logical_null_count(), 2);
+    /// ```
+    fn logical_null_count(&self) -> usize {
+        self.logical_nulls()
+            .map(|n| n.null_count())
+            .unwrap_or_default()
+    }
+
     /// Returns `false` if the array is guaranteed to not contain any logical 
nulls
     ///
-    /// In general this will be equivalent to `Array::null_count() != 0` but 
may differ in the
+    /// In general this will be equivalent to `Array::logical_null_count() != 
0` but may differ in the
     /// presence of logical nullability, see [`Array::logical_nulls`].
     ///
     /// Implementations will return `true` unless they can cheaply prove no 
logical nulls
@@ -289,6 +311,7 @@ pub trait Array: std::fmt::Debug + Send + Sync {
     /// even if the nulls present in [`DictionaryArray::values`] are not 
referenced by any key,
     /// and therefore would not appear in [`Array::logical_nulls`].
     fn is_nullable(&self) -> bool {
+        // TODO this is not necessarily perfect default implementation, since 
null_count() and logical_null_count() are not always equivalent
         self.null_count() != 0
     }
 
@@ -361,6 +384,10 @@ impl Array for ArrayRef {
         self.as_ref().null_count()
     }
 
+    fn logical_null_count(&self) -> usize {
+        self.as_ref().logical_null_count()
+    }
+
     fn is_nullable(&self) -> bool {
         self.as_ref().is_nullable()
     }
@@ -427,6 +454,10 @@ impl<T: Array> Array for &T {
         T::null_count(self)
     }
 
+    fn logical_null_count(&self) -> usize {
+        T::logical_null_count(self)
+    }
+
     fn is_nullable(&self) -> bool {
         T::is_nullable(self)
     }
@@ -959,11 +990,13 @@ mod tests {
             let array = as_union_array(array.as_ref());
             assert_eq!(array.len(), 4);
             assert_eq!(array.null_count(), 0);
+            assert_eq!(array.logical_null_count(), 4);
 
             for i in 0..4 {
                 let a = array.value(i);
                 assert_eq!(a.len(), 1);
                 assert_eq!(a.null_count(), 1);
+                assert_eq!(a.logical_null_count(), 1);
                 assert!(a.is_null(0))
             }
 
@@ -987,6 +1020,7 @@ mod tests {
                 array => {
                     assert_eq!(array.len(), 4);
                     assert_eq!(array.null_count(), 0);
+                    assert_eq!(array.logical_null_count(), 4);
                     assert_eq!(array.values().len(), 1);
                     assert_eq!(array.values().null_count(), 1);
                     assert_eq!(array.run_ends().len(), 4);
@@ -1012,6 +1046,7 @@ mod tests {
 
             assert_eq!(array.len(), 6);
             assert_eq!(array.null_count(), 6);
+            assert_eq!(array.logical_null_count(), 6);
             array.iter().for_each(|x| assert!(x.is_none()));
         }
     }
diff --git a/arrow-array/src/array/null_array.rs 
b/arrow-array/src/array/null_array.rs
index 88cc2d911f..9a7a5ebe17 100644
--- a/arrow-array/src/array/null_array.rs
+++ b/arrow-array/src/array/null_array.rs
@@ -39,6 +39,7 @@ use std::sync::Arc;
 /// assert!(array.is_nullable());
 /// assert_eq!(array.len(), 10);
 /// assert_eq!(array.null_count(), 0);
+/// assert_eq!(array.logical_null_count(), 10);
 /// assert_eq!(array.logical_nulls().unwrap().null_count(), 10);
 /// ```
 #[derive(Clone)]
@@ -120,6 +121,10 @@ impl Array for NullArray {
         !self.is_empty()
     }
 
+    fn logical_null_count(&self) -> usize {
+        self.len
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         0
     }
@@ -172,6 +177,7 @@ mod tests {
 
         assert_eq!(null_arr.len(), 32);
         assert_eq!(null_arr.null_count(), 0);
+        assert_eq!(null_arr.logical_null_count(), 32);
         assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32);
         assert!(null_arr.is_valid(0));
         assert!(null_arr.is_nullable());
@@ -184,6 +190,7 @@ mod tests {
         let array2 = array1.slice(8, 16);
         assert_eq!(array2.len(), 16);
         assert_eq!(array2.null_count(), 0);
+        assert_eq!(array2.logical_null_count(), 16);
         assert_eq!(array2.logical_nulls().unwrap().null_count(), 16);
         assert!(array2.is_valid(0));
         assert!(array2.is_nullable());
diff --git a/arrow-array/src/array/primitive_array.rs 
b/arrow-array/src/array/primitive_array.rs
index 0c100e39af..7b0d6c5ca1 100644
--- a/arrow-array/src/array/primitive_array.rs
+++ b/arrow-array/src/array/primitive_array.rs
@@ -1160,6 +1160,10 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
         self.nulls.as_ref()
     }
 
+    fn logical_null_count(&self) -> usize {
+        self.null_count()
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         let mut size = self.values.inner().capacity();
         if let Some(n) = self.nulls.as_ref() {
diff --git a/arrow-array/src/array/run_array.rs 
b/arrow-array/src/array/run_array.rs
index 95291f8829..81c8cdcea4 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -777,6 +777,7 @@ mod tests {
 
         assert_eq!(array.len(), 20);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 0);
 
         assert_eq!(
             "RunArray {run_ends: [20], values: PrimitiveArray<UInt32>\n[\n  
1,\n]}\n",
@@ -798,6 +799,7 @@ mod tests {
 
         assert_eq!(array.len(), 4);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 1);
 
         let array: RunArray<Int16Type> = test.into_iter().collect();
         assert_eq!(
@@ -813,6 +815,7 @@ mod tests {
 
         assert_eq!(array.len(), 4);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 0);
 
         let run_ends = array.run_ends();
         assert_eq!(&[1, 2, 3, 4], run_ends.values());
@@ -825,6 +828,7 @@ mod tests {
 
         assert_eq!(array.len(), 6);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 3);
 
         let run_ends = array.run_ends();
         assert_eq!(&[1, 2, 3, 5, 6], run_ends.values());
@@ -841,6 +845,7 @@ mod tests {
 
         assert_eq!(array.len(), 3);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 3);
 
         let run_ends = array.run_ends();
         assert_eq!(3, run_ends.len());
@@ -861,6 +866,7 @@ mod tests {
         assert_eq!(array.values().data_type(), &DataType::Utf8);
 
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 1);
         assert_eq!(array.len(), 4);
         assert_eq!(array.values().null_count(), 1);
 
diff --git a/arrow-array/src/builder/generic_byte_run_builder.rs 
b/arrow-array/src/builder/generic_byte_run_builder.rs
index 3cde76c4a0..0bf5658b29 100644
--- a/arrow-array/src/builder/generic_byte_run_builder.rs
+++ b/arrow-array/src/builder/generic_byte_run_builder.rs
@@ -403,6 +403,7 @@ mod tests {
 
         assert_eq!(array.len(), 11);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 2);
 
         assert_eq!(array.run_ends().values(), &[3, 5, 7, 11]);
 
@@ -448,6 +449,7 @@ mod tests {
 
         assert_eq!(array.len(), 5);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 1);
 
         assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
 
@@ -469,6 +471,7 @@ mod tests {
 
         assert_eq!(array.len(), 8);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 1);
 
         assert_eq!(array.run_ends().values(), &[1, 2, 4, 7, 8]);
 
diff --git a/arrow-array/src/builder/generic_list_builder.rs 
b/arrow-array/src/builder/generic_list_builder.rs
index 6ff5f20df6..a7d16f45f5 100644
--- a/arrow-array/src/builder/generic_list_builder.rs
+++ b/arrow-array/src/builder/generic_list_builder.rs
@@ -545,10 +545,12 @@ mod tests {
         let array = builder.finish();
         assert_eq!(array.value_offsets(), [0, 4, 4, 6, 6]);
         assert_eq!(array.null_count(), 1);
+        assert_eq!(array.logical_null_count(), 1);
         assert!(array.is_null(3));
         let elements = array.values().as_primitive::<Int32Type>();
         assert_eq!(elements.values(), &[1, 2, 7, 0, 4, 5]);
         assert_eq!(elements.null_count(), 1);
+        assert_eq!(elements.logical_null_count(), 1);
         assert!(elements.is_null(3));
     }
 
diff --git a/arrow-array/src/builder/primitive_run_builder.rs 
b/arrow-array/src/builder/primitive_run_builder.rs
index 01a989199b..1db9c91e08 100644
--- a/arrow-array/src/builder/primitive_run_builder.rs
+++ b/arrow-array/src/builder/primitive_run_builder.rs
@@ -277,6 +277,7 @@ mod tests {
         let array = builder.finish();
 
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 1);
         assert_eq!(array.len(), 6);
 
         assert_eq!(array.run_ends().values(), &[3, 4, 6]);
@@ -302,6 +303,7 @@ mod tests {
 
         assert_eq!(array.len(), 11);
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 0);
         assert_eq!(array.run_ends().values(), &[1, 3, 5, 9, 10, 11]);
         assert_eq!(
             array.values().as_primitive::<Int16Type>().values(),
diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs
index ca6d0ac1f2..129b90ee04 100644
--- a/arrow-select/src/concat.rs
+++ b/arrow-select/src/concat.rs
@@ -849,5 +849,6 @@ mod tests {
         let dict_b = DictionaryArray::new(keys, Arc::new(values));
         let array = concat(&[&dict_a, &dict_b]).unwrap();
         assert_eq!(array.null_count(), 10);
+        assert_eq!(array.logical_null_count(), 10);
     }
 }
diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index 55cab368af..56bbdefd52 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -551,6 +551,7 @@ mod tests {
         assert_eq!(batch.num_columns(), schema_ref.fields().len());
         for array in batch.columns() {
             assert_eq!(array.null_count(), 0);
+            assert_eq!(array.logical_null_count(), 0);
         }
         // Test that the list's child values are non-null
         let b_array = batch.column(1);
@@ -710,6 +711,7 @@ mod tests {
         assert_eq!(array.len(), 100);
         // Map field is not null
         assert_eq!(array.null_count(), 0);
+        assert_eq!(array.logical_null_count(), 0);
         // Maps have multiple values like a list, so internal arrays are longer
         assert!(array.as_map().keys().len() > array.len());
         assert!(array.as_map().values().len() > array.len());
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs 
b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 3678f24621..440db641a2 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -653,6 +653,7 @@ mod tests {
 
             assert_eq!(array.len(), 8);
             assert_eq!(array.null_count(), 8);
+            assert_eq!(array.logical_null_count(), 8);
         }
 
         for (encoding, page) in pages {
@@ -667,6 +668,7 @@ mod tests {
 
             assert_eq!(array.len(), 8);
             assert_eq!(array.null_count(), 8);
+            assert_eq!(array.logical_null_count(), 8);
         }
     }
 }

Reply via email to