This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 979a070dc8 Logical Nullability (#4691)
979a070dc8 is described below

commit 979a070dc82eeb26b38a8651cac879b2c276c0ed
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Aug 14 20:02:45 2023 +0100

    Logical Nullability (#4691)
---
 arrow-arith/src/arity.rs                       | 13 +++--
 arrow-arith/src/boolean.rs                     | 14 ++----
 arrow-array/src/array/boolean_array.rs         |  7 ++-
 arrow-array/src/array/dictionary_array.rs      | 49 ++++++++++++++++++
 arrow-array/src/array/fixed_size_list_array.rs |  6 +--
 arrow-array/src/array/list_array.rs            |  4 +-
 arrow-array/src/array/mod.rs                   | 60 +++++++++++++++++++++-
 arrow-array/src/array/null_array.rs            | 33 ++++++------
 arrow-array/src/array/run_array.rs             | 69 +++++++++++++++++++++++++-
 arrow-array/src/array/struct_array.rs          | 19 ++++---
 arrow-array/src/builder/null_builder.rs        |  9 ++--
 arrow-array/src/iterator.rs                    | 16 +++++-
 arrow-buffer/src/builder/boolean.rs            |  6 +++
 arrow-cast/src/cast.rs                         |  5 +-
 arrow-ord/src/comparison.rs                    | 14 ++++++
 arrow-ord/src/sort.rs                          | 44 ++++++++++++----
 arrow-string/src/like.rs                       |  5 +-
 parquet/src/arrow/arrow_writer/levels.rs       | 34 +++++++++++--
 parquet/src/arrow/arrow_writer/mod.rs          |  2 +-
 19 files changed, 333 insertions(+), 76 deletions(-)

diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs
index 2dac33a4f2..fdfb26f7f7 100644
--- a/arrow-arith/src/arity.rs
+++ b/arrow-arith/src/arity.rs
@@ -198,7 +198,7 @@ where
         return Ok(PrimitiveArray::from(ArrayData::new_empty(&O::DATA_TYPE)));
     }
 
-    let nulls = NullBuffer::union(a.nulls(), b.nulls());
+    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), 
b.logical_nulls().as_ref());
 
     let values = a.values().iter().zip(b.values()).map(|(l, r)| op(*l, *r));
     // JUSTIFICATION
@@ -248,7 +248,7 @@ where
         ))));
     }
 
-    let nulls = NullBuffer::union(a.nulls(), b.nulls());
+    let nulls = NullBuffer::union(a.logical_nulls().as_ref(), 
b.logical_nulls().as_ref());
 
     let mut builder = a.into_builder()?;
 
@@ -296,7 +296,9 @@ where
     if a.null_count() == 0 && b.null_count() == 0 {
         try_binary_no_nulls(len, a, b, op)
     } else {
-        let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
+        let nulls =
+            NullBuffer::union(a.logical_nulls().as_ref(), 
b.logical_nulls().as_ref())
+                .unwrap();
 
         let mut buffer = BufferBuilder::<O::Native>::new(len);
         buffer.append_n_zeroed(len);
@@ -355,7 +357,10 @@ where
     if a.null_count() == 0 && b.null_count() == 0 {
         try_binary_no_nulls_mut(len, a, b, op)
     } else {
-        let nulls = NullBuffer::union(a.nulls(), b.nulls()).unwrap();
+        let nulls =
+            NullBuffer::union(a.logical_nulls().as_ref(), 
b.logical_nulls().as_ref())
+                .unwrap();
+
         let mut builder = a.into_builder()?;
 
         let slice = builder.values_slice_mut();
diff --git a/arrow-arith/src/boolean.rs b/arrow-arith/src/boolean.rs
index 61e591d516..46e5998208 100644
--- a/arrow-arith/src/boolean.rs
+++ b/arrow-arith/src/boolean.rs
@@ -25,7 +25,7 @@
 use arrow_array::*;
 use arrow_buffer::buffer::{bitwise_bin_op_helper, 
bitwise_quaternary_op_helper};
 use arrow_buffer::{BooleanBuffer, NullBuffer};
-use arrow_schema::{ArrowError, DataType};
+use arrow_schema::ArrowError;
 
 /// Logical 'and' boolean values with Kleene logic
 ///
@@ -311,11 +311,7 @@ pub fn not(left: &BooleanArray) -> Result<BooleanArray, 
ArrowError> {
 /// assert_eq!(a_is_null, BooleanArray::from(vec![false, false, true]));
 /// ```
 pub fn is_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
-    let values = match input.nulls() {
-        // NullArray has no nulls buffer yet all values are null
-        None if input.data_type() == &DataType::Null => {
-            BooleanBuffer::new_set(input.len())
-        }
+    let values = match input.logical_nulls() {
         None => BooleanBuffer::new_unset(input.len()),
         Some(nulls) => !nulls.inner(),
     };
@@ -335,11 +331,7 @@ pub fn is_null(input: &dyn Array) -> Result<BooleanArray, 
ArrowError> {
 /// assert_eq!(a_is_not_null, BooleanArray::from(vec![true, true, false]));
 /// ```
 pub fn is_not_null(input: &dyn Array) -> Result<BooleanArray, ArrowError> {
-    let values = match input.nulls() {
-        // NullArray has no nulls buffer yet all values are null
-        None if input.data_type() == &DataType::Null => {
-            BooleanBuffer::new_unset(input.len())
-        }
+    let values = match input.logical_nulls() {
         None => BooleanBuffer::new_set(input.len()),
         Some(n) => n.inner().clone(),
     };
diff --git a/arrow-array/src/array/boolean_array.rs 
b/arrow-array/src/array/boolean_array.rs
index 14fa87e138..995bb7d510 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -205,7 +205,7 @@ impl BooleanArray {
     where
         F: FnMut(T::Item) -> bool,
     {
-        let nulls = left.nulls().cloned();
+        let nulls = left.logical_nulls();
         let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
             // SAFETY: i in range 0..len
             op(left.value_unchecked(i))
@@ -239,7 +239,10 @@ impl BooleanArray {
     {
         assert_eq!(left.len(), right.len());
 
-        let nulls = NullBuffer::union(left.nulls(), right.nulls());
+        let nulls = NullBuffer::union(
+            left.logical_nulls().as_ref(),
+            right.logical_nulls().as_ref(),
+        );
         let values = BooleanBuffer::collect_bool(left.len(), |i| unsafe {
             // SAFETY: i in range 0..len
             op(left.value_unchecked(i), right.value_unchecked(i))
diff --git a/arrow-array/src/array/dictionary_array.rs 
b/arrow-array/src/array/dictionary_array.rs
index 5a2f439a8e..2d80c75f07 100644
--- a/arrow-array/src/array/dictionary_array.rs
+++ b/arrow-array/src/array/dictionary_array.rs
@@ -729,6 +729,31 @@ impl<T: ArrowDictionaryKeyType> Array for 
DictionaryArray<T> {
         self.keys.nulls()
     }
 
+    fn logical_nulls(&self) -> Option<NullBuffer> {
+        match self.values.nulls() {
+            None => self.nulls().cloned(),
+            Some(value_nulls) => {
+                let mut builder = BooleanBufferBuilder::new(self.len());
+                match self.keys.nulls() {
+                    Some(n) => builder.append_buffer(n.inner()),
+                    None => builder.append_n(self.len(), true),
+                }
+                for (idx, k) in self.keys.values().iter().enumerate() {
+                    let k = k.as_usize();
+                    // Check range to allow for nulls
+                    if k < value_nulls.len() && value_nulls.is_null(k) {
+                        builder.set_bit(idx, false);
+                    }
+                }
+                Some(builder.finish().into())
+            }
+        }
+    }
+
+    fn is_nullable(&self) -> bool {
+        !self.is_empty() && (self.nulls().is_some() || 
self.values.is_nullable())
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         self.keys.get_buffer_memory_size() + 
self.values.get_buffer_memory_size()
     }
@@ -843,6 +868,14 @@ impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for 
TypedDictionaryArray<'a,
         self.dictionary.nulls()
     }
 
+    fn logical_nulls(&self) -> Option<NullBuffer> {
+        self.dictionary.logical_nulls()
+    }
+
+    fn is_nullable(&self) -> bool {
+        self.dictionary.is_nullable()
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         self.dictionary.get_buffer_memory_size()
     }
@@ -1253,4 +1286,20 @@ mod tests {
             assert_eq!(v, expected, "{idx}");
         }
     }
+
+    #[test]
+    fn test_iterator_nulls() {
+        let keys = Int32Array::new(
+            vec![0, 700, 1, 2].into(),
+            Some(NullBuffer::from(vec![true, false, true, true])),
+        );
+        let values = Int32Array::from(vec![Some(50), None, Some(2)]);
+        let dict = DictionaryArray::new(keys, Arc::new(values));
+        let values: Vec<_> = dict
+            .downcast_dict::<Int32Array>()
+            .unwrap()
+            .into_iter()
+            .collect();
+        assert_eq!(values, &[Some(50), None, None, Some(2)])
+    }
 }
diff --git a/arrow-array/src/array/fixed_size_list_array.rs 
b/arrow-array/src/array/fixed_size_list_array.rs
index 6c3abb556a..8996fc8da4 100644
--- a/arrow-array/src/array/fixed_size_list_array.rs
+++ b/arrow-array/src/array/fixed_size_list_array.rs
@@ -147,7 +147,7 @@ impl FixedSizeListArray {
     /// * `size < 0`
     /// * `values.len() / size != nulls.len()`
     /// * `values.data_type() != field.data_type()`
-    /// * `!field.is_nullable() && 
!nulls.expand(size).contains(values.nulls())`
+    /// * `!field.is_nullable() && 
!nulls.expand(size).contains(values.logical_nulls())`
     pub fn try_new(
         field: FieldRef,
         size: i32,
@@ -181,11 +181,11 @@ impl FixedSizeListArray {
             )));
         }
 
-        if let Some(a) = values.nulls() {
+        if let Some(a) = values.logical_nulls() {
             let nulls_valid = field.is_nullable()
                 || nulls
                     .as_ref()
-                    .map(|n| n.expand(size as _).contains(a))
+                    .map(|n| n.expand(size as _).contains(&a))
                     .unwrap_or_default();
 
             if !nulls_valid {
diff --git a/arrow-array/src/array/list_array.rs 
b/arrow-array/src/array/list_array.rs
index 05628084c8..f5b7ae77c3 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -161,7 +161,7 @@ impl<OffsetSize: OffsetSizeTrait> 
GenericListArray<OffsetSize> {
     ///
     /// * `offsets.len() - 1 != nulls.len()`
     /// * `offsets.last() > values.len()`
-    /// * `!field.is_nullable() && values.null_count() != 0`
+    /// * `!field.is_nullable() && values.is_nullable()`
     /// * `field.data_type() != values.data_type()`
     pub fn try_new(
         field: FieldRef,
@@ -189,7 +189,7 @@ impl<OffsetSize: OffsetSizeTrait> 
GenericListArray<OffsetSize> {
                 )));
             }
         }
-        if !field.is_nullable() && values.null_count() != 0 {
+        if !field.is_nullable() && values.is_nullable() {
             return Err(ArrowError::InvalidArgumentError(format!(
                 "Non-nullable field of {}ListArray {:?} cannot contain nulls",
                 OffsetSize::PREFIX,
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index 0157279dfe..79240d105a 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -173,12 +173,33 @@ pub trait Array: std::fmt::Debug + Send + Sync {
     /// ```
     fn offset(&self) -> usize;
 
-    /// Returns the null buffers of this array if any
+    /// Returns the null buffer of this array if any
+    ///
+    /// Note: some arrays can encode their nullability in their children, for 
example,
+    /// [`DictionaryArray::values`] values or [`RunArray::values`], or without 
a null buffer,
+    /// such as [`NullArray`]. Use [`Array::logical_nulls`] to obtain a 
computed mask encoding this
     fn nulls(&self) -> Option<&NullBuffer>;
 
+    /// Returns the logical null buffer of this array if any
+    ///
+    /// In most cases this will be the same as [`Array::nulls`], except for:
+    ///
+    /// * DictionaryArray where [`DictionaryArray::values`] contains nulls
+    /// * RunArray where [`RunArray::values`] contains nulls
+    /// * NullArray where all indices are nulls
+    ///
+    /// In these cases a logical [`NullBuffer`] will be computed, encoding the 
logical nullability
+    /// of these arrays, beyond what is encoded in [`Array::nulls`]
+    fn logical_nulls(&self) -> Option<NullBuffer> {
+        self.nulls().cloned()
+    }
+
     /// Returns whether the element at `index` is null.
     /// When using this function on a slice, the index is relative to the 
slice.
     ///
+    /// Note: this method returns the physical nullability, i.e. that encoded 
in [`Array::nulls`]
+    /// see [`Array::logical_nulls`] for logical nullability
+    ///
     /// # Example:
     ///
     /// ```
@@ -196,6 +217,9 @@ pub trait Array: std::fmt::Debug + Send + Sync {
     /// Returns whether the element at `index` is not null.
     /// When using this function on a slice, the index is relative to the 
slice.
     ///
+    /// Note: this method returns the physical nullability, i.e. that encoded 
in [`Array::nulls`]
+    /// see [`Array::logical_nulls`] for logical nullability
+    ///
     /// # Example:
     ///
     /// ```
@@ -210,7 +234,10 @@ pub trait Array: std::fmt::Debug + Send + Sync {
         !self.is_null(index)
     }
 
-    /// Returns the total number of null values in this array.
+    /// Returns the total number of physical null values in this array.
+    ///
+    /// Note: this method returns the physical null count, i.e. that encoded 
in [`Array::nulls`],
+    /// see [`Array::logical_nulls`] for logical nullability
     ///
     /// # Example:
     ///
@@ -226,6 +253,19 @@ pub trait Array: std::fmt::Debug + Send + Sync {
         self.nulls().map(|n| n.null_count()).unwrap_or_default()
     }
 
+    /// Returns `false` if the array is guaranteed to not contain any logical 
nulls
+    ///
+    /// In general this will be equivalent to `Array::null_count() != 0` but 
may differ in the
+    /// presence of logical nullability, see [`Array::logical_nulls`].
+    ///
+    /// Implementations will return `true` unless they can cheaply prove no 
logical nulls
+    /// are present. For example a [`DictionaryArray`] with nullable values 
will still return true,
+    /// even if the nulls present in [`DictionaryArray::values`] are not 
referenced by any key,
+    /// and therefore would not appear in [`Array::logical_nulls`].
+    fn is_nullable(&self) -> bool {
+        self.null_count() != 0
+    }
+
     /// Returns the total number of bytes of memory pointed to by this array.
     /// The buffers store bytes in the Arrow memory format, and include the 
data as well as the validity map.
     fn get_buffer_memory_size(&self) -> usize;
@@ -277,6 +317,10 @@ impl Array for ArrayRef {
         self.as_ref().nulls()
     }
 
+    fn logical_nulls(&self) -> Option<NullBuffer> {
+        self.as_ref().logical_nulls()
+    }
+
     fn is_null(&self, index: usize) -> bool {
         self.as_ref().is_null(index)
     }
@@ -289,6 +333,10 @@ impl Array for ArrayRef {
         self.as_ref().null_count()
     }
 
+    fn is_nullable(&self) -> bool {
+        self.as_ref().is_nullable()
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         self.as_ref().get_buffer_memory_size()
     }
@@ -335,6 +383,10 @@ impl<'a, T: Array> Array for &'a T {
         T::nulls(self)
     }
 
+    fn logical_nulls(&self) -> Option<NullBuffer> {
+        T::logical_nulls(self)
+    }
+
     fn is_null(&self, index: usize) -> bool {
         T::is_null(self, index)
     }
@@ -347,6 +399,10 @@ impl<'a, T: Array> Array for &'a T {
         T::null_count(self)
     }
 
+    fn is_nullable(&self) -> bool {
+        T::is_nullable(self)
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         T::get_buffer_memory_size(self)
     }
diff --git a/arrow-array/src/array/null_array.rs 
b/arrow-array/src/array/null_array.rs
index c054c89043..af3ec0b57d 100644
--- a/arrow-array/src/array/null_array.rs
+++ b/arrow-array/src/array/null_array.rs
@@ -36,8 +36,10 @@ use std::sync::Arc;
 ///
 /// let array = NullArray::new(10);
 ///
+/// assert!(array.is_nullable());
 /// assert_eq!(array.len(), 10);
-/// assert_eq!(array.null_count(), 10);
+/// assert_eq!(array.null_count(), 0);
+/// assert_eq!(array.logical_nulls().unwrap().null_count(), 10);
 /// ```
 #[derive(Clone)]
 pub struct NullArray {
@@ -107,22 +109,12 @@ impl Array for NullArray {
         None
     }
 
-    /// Returns whether the element at `index` is null.
-    /// All elements of a `NullArray` are always null.
-    fn is_null(&self, _index: usize) -> bool {
-        true
+    fn logical_nulls(&self) -> Option<NullBuffer> {
+        (self.len != 0).then(|| NullBuffer::new_null(self.len))
     }
 
-    /// Returns whether the element at `index` is valid.
-    /// All elements of a `NullArray` are always invalid.
-    fn is_valid(&self, _index: usize) -> bool {
-        false
-    }
-
-    /// Returns the total number of null values in this array.
-    /// The null count of a `NullArray` always equals its length.
-    fn null_count(&self) -> usize {
-        self.len()
+    fn is_nullable(&self) -> bool {
+        !self.is_empty()
     }
 
     fn get_buffer_memory_size(&self) -> usize {
@@ -176,8 +168,10 @@ mod tests {
         let null_arr = NullArray::new(32);
 
         assert_eq!(null_arr.len(), 32);
-        assert_eq!(null_arr.null_count(), 32);
-        assert!(!null_arr.is_valid(0));
+        assert_eq!(null_arr.null_count(), 0);
+        assert_eq!(null_arr.logical_nulls().unwrap().null_count(), 32);
+        assert!(null_arr.is_valid(0));
+        assert!(null_arr.is_nullable());
     }
 
     #[test]
@@ -186,7 +180,10 @@ mod tests {
 
         let array2 = array1.slice(8, 16);
         assert_eq!(array2.len(), 16);
-        assert_eq!(array2.null_count(), 16);
+        assert_eq!(array2.null_count(), 0);
+        assert_eq!(array2.logical_nulls().unwrap().null_count(), 16);
+        assert!(array2.is_valid(0));
+        assert!(array2.is_nullable());
     }
 
     #[test]
diff --git a/arrow-array/src/array/run_array.rs 
b/arrow-array/src/array/run_array.rs
index 820d5c9ebf..30cefaeb4d 100644
--- a/arrow-array/src/array/run_array.rs
+++ b/arrow-array/src/array/run_array.rs
@@ -18,7 +18,7 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use arrow_buffer::{ArrowNativeType, NullBuffer, RunEndBuffer};
+use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, 
RunEndBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder};
 use arrow_schema::{ArrowError, DataType, Field};
 
@@ -349,6 +349,43 @@ impl<T: RunEndIndexType> Array for RunArray<T> {
         None
     }
 
+    fn logical_nulls(&self) -> Option<NullBuffer> {
+        let len = self.len();
+        let nulls = self.values.logical_nulls()?;
+        let mut out = BooleanBufferBuilder::new(len);
+        let offset = self.run_ends.offset();
+        let mut valid_start = 0;
+        let mut last_end = 0;
+        for (idx, end) in self.run_ends.values().iter().enumerate() {
+            let end = end.as_usize();
+            if end < offset {
+                continue;
+            }
+            let end = (end - offset).min(len);
+            if nulls.is_null(idx) {
+                if valid_start < last_end {
+                    out.append_n(last_end - valid_start, true);
+                }
+                out.append_n(end - last_end, false);
+                valid_start = end;
+            }
+            last_end = end;
+            if end == len {
+                break;
+            }
+        }
+        if valid_start < len {
+            out.append_n(len - valid_start, true)
+        }
+        // Sanity check
+        assert_eq!(out.len(), len);
+        Some(out.finish().into())
+    }
+
+    fn is_nullable(&self) -> bool {
+        !self.is_empty() && self.values.is_nullable()
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         self.run_ends.inner().inner().capacity() + 
self.values.get_buffer_memory_size()
     }
@@ -569,6 +606,14 @@ impl<'a, R: RunEndIndexType, V: Sync> Array for 
TypedRunArray<'a, R, V> {
         self.run_array.nulls()
     }
 
+    fn logical_nulls(&self) -> Option<NullBuffer> {
+        self.run_array.logical_nulls()
+    }
+
+    fn is_nullable(&self) -> bool {
+        self.run_array.is_nullable()
+    }
+
     fn get_buffer_memory_size(&self) -> usize {
         self.run_array.get_buffer_memory_size()
     }
@@ -1041,4 +1086,26 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn test_logical_nulls() {
+        let run = Int32Array::from(vec![3, 6, 9, 12]);
+        let values = Int32Array::from(vec![Some(0), None, Some(1), None]);
+        let array = RunArray::try_new(&run, &values).unwrap();
+
+        let expected = vec![
+            true, true, true, false, false, false, true, true, true, false, 
false, false,
+        ];
+
+        let n = array.logical_nulls().unwrap();
+        assert_eq!(n.null_count(), 6);
+
+        let slices = [(0, 12), (0, 2), (2, 5), (3, 0), (3, 3), (3, 4), (4, 8)];
+        for (offset, length) in slices {
+            let a = array.slice(offset, length);
+            let n = a.logical_nulls().unwrap();
+            let n = n.into_iter().collect::<Vec<_>>();
+            assert_eq!(&n, &expected[offset..offset + length], "{offset} 
{length}");
+        }
+    }
 }
diff --git a/arrow-array/src/array/struct_array.rs 
b/arrow-array/src/array/struct_array.rs
index 1a79ebd95f..284c3b26a9 100644
--- a/arrow-array/src/array/struct_array.rs
+++ b/arrow-array/src/array/struct_array.rs
@@ -143,15 +143,14 @@ impl StructArray {
                 )));
             }
 
-            if let Some(a) = a.nulls() {
-                let nulls_valid = f.is_nullable()
-                    || nulls.as_ref().map(|n| 
n.contains(a)).unwrap_or_default();
-
-                if !nulls_valid {
-                    return Err(ArrowError::InvalidArgumentError(format!(
-                        "Found unmasked nulls for non-nullable StructArray 
field {:?}",
-                        f.name()
-                    )));
+            if !f.is_nullable() {
+                if let Some(a) = a.logical_nulls() {
+                    if !nulls.as_ref().map(|n| 
n.contains(&a)).unwrap_or_default() {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Found unmasked nulls for non-nullable StructArray 
field {:?}",
+                            f.name()
+                        )));
+                    }
                 }
             }
         }
@@ -314,7 +313,7 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
             .into_iter()
             .map(|(name, array)| {
                 (
-                    Field::new(name, array.data_type().clone(), 
array.nulls().is_some()),
+                    Field::new(name, array.data_type().clone(), 
array.is_nullable()),
                     array,
                 )
             })
diff --git a/arrow-array/src/builder/null_builder.rs 
b/arrow-array/src/builder/null_builder.rs
index 94cb7f5cc2..53a6b103d5 100644
--- a/arrow-array/src/builder/null_builder.rs
+++ b/arrow-array/src/builder/null_builder.rs
@@ -40,7 +40,7 @@ use std::sync::Arc;
 /// let arr = b.finish();
 ///
 /// assert_eq!(8, arr.len());
-/// assert_eq!(8, arr.null_count());
+/// assert_eq!(0, arr.null_count());
 /// ```
 #[derive(Debug)]
 pub struct NullBuilder {
@@ -160,7 +160,8 @@ mod tests {
         let arr = builder.finish();
         assert_eq!(20, arr.len());
         assert_eq!(0, arr.offset());
-        assert_eq!(20, arr.null_count());
+        assert_eq!(0, arr.null_count());
+        assert!(arr.is_nullable());
     }
 
     #[test]
@@ -170,10 +171,10 @@ mod tests {
         builder.append_empty_value();
         builder.append_empty_values(3);
         let mut array = builder.finish_cloned();
-        assert_eq!(21, array.null_count());
+        assert_eq!(21, array.len());
 
         builder.append_empty_values(5);
         array = builder.finish();
-        assert_eq!(26, array.null_count());
+        assert_eq!(26, array.len());
     }
 }
diff --git a/arrow-array/src/iterator.rs b/arrow-array/src/iterator.rs
index 86f5d99128..a198332ca5 100644
--- a/arrow-array/src/iterator.rs
+++ b/arrow-array/src/iterator.rs
@@ -22,6 +22,7 @@ use crate::array::{
     GenericListArray, GenericStringArray, PrimitiveArray,
 };
 use crate::{FixedSizeListArray, MapArray};
+use arrow_buffer::NullBuffer;
 
 /// An iterator that returns Some(T) or None, that can be used on any 
[`ArrayAccessor`]
 ///
@@ -46,6 +47,7 @@ use crate::{FixedSizeListArray, MapArray};
 #[derive(Debug)]
 pub struct ArrayIter<T: ArrayAccessor> {
     array: T,
+    logical_nulls: Option<NullBuffer>,
     current: usize,
     current_end: usize,
 }
@@ -54,12 +56,22 @@ impl<T: ArrayAccessor> ArrayIter<T> {
     /// create a new iterator
     pub fn new(array: T) -> Self {
         let len = array.len();
+        let logical_nulls = array.logical_nulls();
         ArrayIter {
             array,
+            logical_nulls,
             current: 0,
             current_end: len,
         }
     }
+
+    #[inline]
+    fn is_null(&self, idx: usize) -> bool {
+        self.logical_nulls
+            .as_ref()
+            .map(|x| x.is_null(idx))
+            .unwrap_or_default()
+    }
 }
 
 impl<T: ArrayAccessor> Iterator for ArrayIter<T> {
@@ -69,7 +81,7 @@ impl<T: ArrayAccessor> Iterator for ArrayIter<T> {
     fn next(&mut self) -> Option<Self::Item> {
         if self.current == self.current_end {
             None
-        } else if self.array.is_null(self.current) {
+        } else if self.is_null(self.current) {
             self.current += 1;
             Some(None)
         } else {
@@ -98,7 +110,7 @@ impl<T: ArrayAccessor> DoubleEndedIterator for ArrayIter<T> {
             None
         } else {
             self.current_end -= 1;
-            Some(if self.array.is_null(self.current_end) {
+            Some(if self.is_null(self.current_end) {
                 None
             } else {
                 // Safety:
diff --git a/arrow-buffer/src/builder/boolean.rs 
b/arrow-buffer/src/builder/boolean.rs
index f84cfa79c2..f0e7f0f136 100644
--- a/arrow-buffer/src/builder/boolean.rs
+++ b/arrow-buffer/src/builder/boolean.rs
@@ -203,6 +203,12 @@ impl BooleanBufferBuilder {
         );
     }
 
+    /// Append [`BooleanBuffer`] to this [`BooleanBufferBuilder`]
+    pub fn append_buffer(&mut self, buffer: &BooleanBuffer) {
+        let range = buffer.offset()..buffer.offset() + buffer.len();
+        self.append_packed_range(range, buffer.values())
+    }
+
     /// Returns the packed bits
     pub fn as_slice(&self) -> &[u8] {
         self.buffer.as_slice()
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index c7fd082de2..a08a7a4fd4 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -7233,9 +7233,8 @@ mod tests {
         assert_eq!(array.data_type(), &data_type);
         let cast_array = cast(&array, &DataType::Null).expect("cast failed");
         assert_eq!(cast_array.data_type(), &DataType::Null);
-        for i in 0..4 {
-            assert!(cast_array.is_null(i));
-        }
+        assert_eq!(cast_array.len(), 4);
+        assert_eq!(cast_array.logical_nulls().unwrap().null_count(), 4);
     }
 
     #[test]
diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs
index d18b0e36e9..21583fac08 100644
--- a/arrow-ord/src/comparison.rs
+++ b/arrow-ord/src/comparison.rs
@@ -6352,4 +6352,18 @@ mod tests {
             .to_string()
             .contains("Could not convert ToType with to_i128"));
     }
+
+    #[test]
+    #[cfg(feature = "dyn_cmp_dict")]
+    fn test_dictionary_nested_nulls() {
+        let keys = Int32Array::from(vec![0, 1, 2]);
+        let v1 = Arc::new(Int32Array::from(vec![Some(0), None, Some(2)]));
+        let a = DictionaryArray::new(keys.clone(), v1);
+        let v2 = Arc::new(Int32Array::from(vec![None, Some(0), Some(2)]));
+        let b = DictionaryArray::new(keys, v2);
+
+        let r = eq_dyn(&a, &b).unwrap();
+        assert_eq!(r.null_count(), 2);
+        assert!(r.is_valid(2));
+    }
 }
diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs
index 648a7d7afc..8785863059 100644
--- a/arrow-ord/src/sort.rs
+++ b/arrow-ord/src/sort.rs
@@ -719,19 +719,19 @@ where
     }
 }
 
-type LexicographicalCompareItem<'a> = (
-    Option<&'a NullBuffer>, // nulls
-    DynComparator,          // comparator
-    SortOptions,            // sort_option
+type LexicographicalCompareItem = (
+    Option<NullBuffer>, // nulls
+    DynComparator,      // comparator
+    SortOptions,        // sort_option
 );
 
 /// A lexicographical comparator that wraps given array data (columns) and can 
lexicographically compare data
 /// at given two indices. The lifetime is the same at the data wrapped.
-pub struct LexicographicalComparator<'a> {
-    compare_items: Vec<LexicographicalCompareItem<'a>>,
+pub struct LexicographicalComparator {
+    compare_items: Vec<LexicographicalCompareItem>,
 }
 
-impl LexicographicalComparator<'_> {
+impl LexicographicalComparator {
     /// lexicographically compare values at the wrapped columns with given 
indices.
     pub fn compare(&self, a_idx: usize, b_idx: usize) -> Ordering {
         for (nulls, comparator, sort_option) in &self.compare_items {
@@ -780,14 +780,14 @@ impl LexicographicalComparator<'_> {
     /// results with two indices.
     pub fn try_new(
         columns: &[SortColumn],
-    ) -> Result<LexicographicalComparator<'_>, ArrowError> {
+    ) -> Result<LexicographicalComparator, ArrowError> {
         let compare_items = columns
             .iter()
             .map(|column| {
                 // flatten and convert build comparators
                 let values = column.values.as_ref();
                 Ok((
-                    values.nulls(),
+                    values.logical_nulls(),
                     build_compare(values, values)?,
                     column.options.unwrap_or_default(),
                 ))
@@ -4016,4 +4016,30 @@ mod tests {
             vec![None, None, None, Some(5.1), Some(5.1), Some(3.0), Some(1.2)],
         );
     }
+
+    #[test]
+    fn test_lexicographic_comparator_null_dict_values() {
+        let values = Int32Array::new(
+            vec![1, 2, 3, 4].into(),
+            Some(NullBuffer::from(vec![true, false, false, true])),
+        );
+        let keys = Int32Array::new(
+            vec![0, 1, 53, 3].into(),
+            Some(NullBuffer::from(vec![true, true, false, true])),
+        );
+        // [1, NULL, NULL, 4]
+        let dict = DictionaryArray::new(keys, Arc::new(values));
+
+        let comparator = LexicographicalComparator::try_new(&[SortColumn {
+            values: Arc::new(dict),
+            options: None,
+        }])
+        .unwrap();
+        // 1.cmp(NULL)
+        assert_eq!(comparator.compare(0, 1), Ordering::Greater);
+        // NULL.cmp(NULL)
+        assert_eq!(comparator.compare(2, 1), Ordering::Equal);
+        // NULL.cmp(4)
+        assert_eq!(comparator.compare(2, 3), Ordering::Less);
+    }
 }
diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs
index 1223280e37..9d3abea66f 100644
--- a/arrow-string/src/like.rs
+++ b/arrow-string/src/like.rs
@@ -581,7 +581,10 @@ where
         ));
     }
 
-    let nulls = NullBuffer::union(left.nulls(), right.nulls());
+    let nulls = NullBuffer::union(
+        left.logical_nulls().as_ref(),
+        right.logical_nulls().as_ref(),
+    );
 
     let mut result = BooleanBufferBuilder::new(left.len());
     for i in 0..left.len() {
diff --git a/parquet/src/arrow/arrow_writer/levels.rs 
b/parquet/src/arrow/arrow_writer/levels.rs
index 47b0189030..48615dc3d5 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -494,7 +494,7 @@ impl LevelInfoBuilder {
                 def_levels.reserve(len);
                 info.non_null_indices.reserve(len);
 
-                match array.nulls() {
+                match array.logical_nulls() {
                     Some(nulls) => {
                         // TODO: Faster bitmask iteration (#1757)
                         for i in range {
@@ -1751,7 +1751,6 @@ mod tests {
         builder.write(&a, 0..4);
         let levels = builder.finish();
 
-        let list_level = levels.get(0).unwrap();
         let expected_level = LevelInfo {
             def_levels: Some(vec![5, 4, 5, 2, 5, 3, 5, 5, 4, 4, 0]),
             rep_levels: Some(vec![0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0]),
@@ -1760,6 +1759,35 @@ mod tests {
             max_rep_level: 2,
         };
 
-        assert_eq!(list_level, &expected_level);
+        assert_eq!(levels[0], expected_level);
+    }
+
+    #[test]
+    fn test_null_dictionary_values() {
+        let values = Int32Array::new(
+            vec![1, 2, 3, 4].into(),
+            Some(NullBuffer::from(vec![true, false, true, true])),
+        );
+        let keys = Int32Array::new(
+            vec![1, 54, 2, 0].into(),
+            Some(NullBuffer::from(vec![true, false, true, true])),
+        );
+        // [NULL, NULL, 3, 0]
+        let dict = DictionaryArray::new(keys, Arc::new(values));
+
+        let item_field = Field::new("item", dict.data_type().clone(), true);
+
+        let mut builder =
+            LevelInfoBuilder::try_new(&item_field, 
Default::default()).unwrap();
+        builder.write(&dict, 0..4);
+        let levels = builder.finish();
+        let expected_level = LevelInfo {
+            def_levels: Some(vec![0, 0, 1, 1]),
+            rep_levels: None,
+            non_null_indices: vec![2, 3],
+            max_def_level: 1,
+            max_rep_level: 0,
+        };
+        assert_eq!(levels[0], expected_level);
     }
 }
diff --git a/parquet/src/arrow/arrow_writer/mod.rs 
b/parquet/src/arrow/arrow_writer/mod.rs
index d3d4e2626f..c4d174b6ad 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1965,7 +1965,7 @@ mod tests {
 
         assert_eq!(a.value(0).len(), 0);
         assert_eq!(a.value(2).len(), 2);
-        assert_eq!(a.value(2).null_count(), 2);
+        assert_eq!(a.value(2).logical_nulls().unwrap().null_count(), 2);
 
         let batch = RecordBatch::try_new(Arc::new(schema), 
vec![Arc::new(a)]).unwrap();
         roundtrip(batch, None);

Reply via email to