This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9c70e4acd Lazily materialize the null buffer builder for all array 
builders. (#2127)
9c70e4acd is described below

commit 9c70e4acd3a872c472a11bc12ae76837c1d4aaaa
Author: Remzi Yang <[email protected]>
AuthorDate: Tue Jul 26 23:18:30 2022 +0800

    Lazily materialize the null buffer builder for all array builders. (#2127)
    
    * create null buffer builder
    
    Signed-off-by: remzi <[email protected]>
    
    * use null buffer builder in boolean array builder
    
    Signed-off-by: remzi <[email protected]>
    
    * add capacity field and update docs
    
    Signed-off-by: remzi <[email protected]>
    
    * update fixed size binary builder
    
    Signed-off-by: remzi <[email protected]>
    
    * update fixed size list builder
    
    Signed-off-by: remzi <[email protected]>
    
    * update binary builder
    
    Signed-off-by: remzi <[email protected]>
    
    * update list builder
    
    Signed-off-by: remzi <[email protected]>
    
    * update map builder
    
    Signed-off-by: remzi <[email protected]>
    
    * update struct builder
    
    Signed-off-by: remzi <[email protected]>
    
    * update union builder
    
    Signed-off-by: remzi <[email protected]>
    
    * rename
    
    Signed-off-by: remzi <[email protected]>
    
    * add tests
    
    Signed-off-by: remzi <[email protected]>
    
    * optimize
    
    Signed-off-by: remzi <[email protected]>
    
    * expose null buffer builder in builder mod
    
    Signed-off-by: remzi <[email protected]>
    
    * add docs
    
    Signed-off-by: remzi <[email protected]>
    
    * reduce redundant computation
    
    Signed-off-by: remzi <[email protected]>
    
    * inline methods to achieve better performance
    
    Signed-off-by: remzi <[email protected]>
    
    * use instead of pub(self) use
    
    Signed-off-by: remzi <[email protected]>
    
    * Update arrow/src/array/builder/null_buffer_builder.rs
    
    fix spelling mistake
    
    Co-authored-by: Jörn Horstmann <[email protected]>
    
    * reorder
    
    Signed-off-by: remzi <[email protected]>
    
    * rename methods and update docs
    
    Signed-off-by: remzi <[email protected]>
    
    Co-authored-by: Jörn Horstmann <[email protected]>
---
 arrow/src/array/builder/boolean_builder.rs         |  44 ++---
 .../src/array/builder/fixed_size_binary_builder.rs |  16 +-
 arrow/src/array/builder/fixed_size_list_builder.rs |  16 +-
 arrow/src/array/builder/generic_binary_builder.rs  |   8 +-
 arrow/src/array/builder/generic_list_builder.rs    |  16 +-
 arrow/src/array/builder/map_builder.rs             |  12 +-
 arrow/src/array/builder/mod.rs                     |   2 +
 arrow/src/array/builder/null_buffer_builder.rs     | 204 +++++++++++++++++++++
 arrow/src/array/builder/primitive_builder.rs       |  64 ++-----
 arrow/src/array/builder/struct_builder.rs          |  21 +--
 arrow/src/array/builder/union_builder.rs           |  14 +-
 11 files changed, 284 insertions(+), 133 deletions(-)

diff --git a/arrow/src/array/builder/boolean_builder.rs 
b/arrow/src/array/builder/boolean_builder.rs
index 1e052b644..e28e37bc9 100644
--- a/arrow/src/array/builder/boolean_builder.rs
+++ b/arrow/src/array/builder/boolean_builder.rs
@@ -28,6 +28,7 @@ use crate::error::ArrowError;
 use crate::error::Result;
 
 use super::BooleanBufferBuilder;
+use super::NullBufferBuilder;
 
 ///  Array builder for fixed-width primitive types
 ///
@@ -62,9 +63,7 @@ use super::BooleanBufferBuilder;
 #[derive(Debug)]
 pub struct BooleanBuilder {
     values_builder: BooleanBufferBuilder,
-    /// We only materialize the builder when we add `false`.
-    /// This optimization is **very** important for the performance.
-    bitmap_builder: Option<BooleanBufferBuilder>,
+    null_buffer_builder: NullBufferBuilder,
 }
 
 impl BooleanBuilder {
@@ -72,7 +71,7 @@ impl BooleanBuilder {
     pub fn new(capacity: usize) -> Self {
         Self {
             values_builder: BooleanBufferBuilder::new(capacity),
-            bitmap_builder: None,
+            null_buffer_builder: NullBufferBuilder::new(capacity),
         }
     }
 
@@ -85,19 +84,23 @@ impl BooleanBuilder {
     #[inline]
     pub fn append_value(&mut self, v: bool) {
         self.values_builder.append(v);
-        if let Some(b) = self.bitmap_builder.as_mut() {
-            b.append(true)
-        }
+        self.null_buffer_builder.append_non_null();
     }
 
     /// Appends a null slot into the builder
     #[inline]
     pub fn append_null(&mut self) {
-        self.materialize_bitmap_builder();
-        self.bitmap_builder.as_mut().unwrap().append(false);
+        self.null_buffer_builder.append_null();
         self.values_builder.advance(1);
     }
 
+    /// Appends `n` `null`s into the builder.
+    #[inline]
+    pub fn append_nulls(&mut self, n: usize) {
+        self.null_buffer_builder.append_n_nulls(n);
+        self.values_builder.advance(n);
+    }
+
     /// Appends an `Option<T>` into the builder
     #[inline]
     pub fn append_option(&mut self, v: Option<bool>) {
@@ -110,10 +113,8 @@ impl BooleanBuilder {
     /// Appends a slice of type `T` into the builder
     #[inline]
     pub fn append_slice(&mut self, v: &[bool]) {
-        if let Some(b) = self.bitmap_builder.as_mut() {
-            b.append_n(v.len(), true)
-        }
         self.values_builder.append_slice(v);
+        self.null_buffer_builder.append_n_non_nulls(v.len());
     }
 
     /// Appends values from a slice of type `T` and a validity boolean slice.
@@ -126,13 +127,7 @@ impl BooleanBuilder {
                 "Value and validity lengths must be equal".to_string(),
             ))
         } else {
-            is_valid
-                .iter()
-                .any(|v| !*v)
-                .then(|| self.materialize_bitmap_builder());
-            if let Some(b) = self.bitmap_builder.as_mut() {
-                b.append_slice(is_valid)
-            }
+            self.null_buffer_builder.append_slice(is_valid);
             self.values_builder.append_slice(values);
             Ok(())
         }
@@ -141,7 +136,7 @@ impl BooleanBuilder {
     /// Builds the [BooleanArray] and reset this builder.
     pub fn finish(&mut self) -> BooleanArray {
         let len = self.len();
-        let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish());
+        let null_bit_buffer = self.null_buffer_builder.finish();
         let builder = ArrayData::builder(DataType::Boolean)
             .len(len)
             .add_buffer(self.values_builder.finish())
@@ -150,15 +145,6 @@ impl BooleanBuilder {
         let array_data = unsafe { builder.build_unchecked() };
         BooleanArray::from(array_data)
     }
-
-    fn materialize_bitmap_builder(&mut self) {
-        if self.bitmap_builder.is_none() {
-            let mut b = BooleanBufferBuilder::new(0);
-            b.reserve(self.values_builder.capacity());
-            b.append_n(self.values_builder.len(), true);
-            self.bitmap_builder = Some(b);
-        }
-    }
 }
 
 impl ArrayBuilder for BooleanBuilder {
diff --git a/arrow/src/array/builder/fixed_size_binary_builder.rs 
b/arrow/src/array/builder/fixed_size_binary_builder.rs
index 5ef89d8f4..96f90c4ca 100644
--- a/arrow/src/array/builder/fixed_size_binary_builder.rs
+++ b/arrow/src/array/builder/fixed_size_binary_builder.rs
@@ -23,12 +23,12 @@ use crate::error::{ArrowError, Result};
 use std::any::Any;
 use std::sync::Arc;
 
-use super::BooleanBufferBuilder;
+use super::NullBufferBuilder;
 
 #[derive(Debug)]
 pub struct FixedSizeBinaryBuilder {
     values_builder: UInt8BufferBuilder,
-    bitmap_builder: BooleanBufferBuilder,
+    null_buffer_builder: NullBufferBuilder,
     value_length: i32,
 }
 
@@ -43,7 +43,7 @@ impl FixedSizeBinaryBuilder {
         );
         Self {
             values_builder: UInt8BufferBuilder::new(capacity),
-            bitmap_builder: BooleanBufferBuilder::new(if byte_width > 0 {
+            null_buffer_builder: NullBufferBuilder::new(if byte_width > 0 {
                 capacity / byte_width as usize
             } else {
                 0
@@ -64,7 +64,7 @@ impl FixedSizeBinaryBuilder {
             ))
         } else {
             self.values_builder.append_slice(value.as_ref());
-            self.bitmap_builder.append(true);
+            self.null_buffer_builder.append_non_null();
             Ok(())
         }
     }
@@ -74,7 +74,7 @@ impl FixedSizeBinaryBuilder {
     pub fn append_null(&mut self) {
         self.values_builder
             .append_slice(&vec![0u8; self.value_length as usize][..]);
-        self.bitmap_builder.append(false);
+        self.null_buffer_builder.append_null();
     }
 
     /// Builds the [`FixedSizeBinaryArray`] and reset this builder.
@@ -83,7 +83,7 @@ impl FixedSizeBinaryBuilder {
         let array_data_builder =
             ArrayData::builder(DataType::FixedSizeBinary(self.value_length))
                 .add_buffer(self.values_builder.finish())
-                .null_bit_buffer(Some(self.bitmap_builder.finish()))
+                .null_bit_buffer(self.null_buffer_builder.finish())
                 .len(array_length);
         let array_data = unsafe { array_data_builder.build_unchecked() };
         FixedSizeBinaryArray::from(array_data)
@@ -108,12 +108,12 @@ impl ArrayBuilder for FixedSizeBinaryBuilder {
 
     /// Returns the number of array slots in the builder
     fn len(&self) -> usize {
-        self.bitmap_builder.len()
+        self.null_buffer_builder.len()
     }
 
     /// Returns whether the number of array slots is zero
     fn is_empty(&self) -> bool {
-        self.bitmap_builder.is_empty()
+        self.null_buffer_builder.is_empty()
     }
 
     /// Builds the array and reset this builder.
diff --git a/arrow/src/array/builder/fixed_size_list_builder.rs 
b/arrow/src/array/builder/fixed_size_list_builder.rs
index 4e80e585a..343ce3657 100644
--- a/arrow/src/array/builder/fixed_size_list_builder.rs
+++ b/arrow/src/array/builder/fixed_size_list_builder.rs
@@ -25,12 +25,12 @@ use crate::datatypes::DataType;
 use crate::datatypes::Field;
 
 use super::ArrayBuilder;
-use super::BooleanBufferBuilder;
+use super::NullBufferBuilder;
 
 ///  Array builder for [`FixedSizeListArray`]
 #[derive(Debug)]
 pub struct FixedSizeListBuilder<T: ArrayBuilder> {
-    bitmap_builder: BooleanBufferBuilder,
+    null_buffer_builder: NullBufferBuilder,
     values_builder: T,
     list_len: i32,
 }
@@ -48,7 +48,7 @@ impl<T: ArrayBuilder> FixedSizeListBuilder<T> {
     /// `capacity` is the number of items to pre-allocate space for in this 
builder
     pub fn with_capacity(values_builder: T, value_length: i32, capacity: 
usize) -> Self {
         Self {
-            bitmap_builder: BooleanBufferBuilder::new(capacity),
+            null_buffer_builder: NullBufferBuilder::new(capacity),
             values_builder,
             list_len: value_length,
         }
@@ -76,12 +76,12 @@ where
 
     /// Returns the number of array slots in the builder
     fn len(&self) -> usize {
-        self.bitmap_builder.len()
+        self.null_buffer_builder.len()
     }
 
     /// Returns whether the number of array slots is zero
     fn is_empty(&self) -> bool {
-        self.bitmap_builder.is_empty()
+        self.null_buffer_builder.is_empty()
     }
 
     /// Builds the array and reset this builder.
@@ -109,7 +109,7 @@ where
     /// Finish the current fixed-length list array slot
     #[inline]
     pub fn append(&mut self, is_valid: bool) {
-        self.bitmap_builder.append(is_valid);
+        self.null_buffer_builder.append(is_valid);
     }
 
     /// Builds the [`FixedSizeListBuilder`] and reset this builder.
@@ -131,14 +131,14 @@ where
             len,
         );
 
-        let null_bit_buffer = self.bitmap_builder.finish();
+        let null_bit_buffer = self.null_buffer_builder.finish();
         let array_data = ArrayData::builder(DataType::FixedSizeList(
             Box::new(Field::new("item", values_data.data_type().clone(), 
true)),
             self.list_len,
         ))
         .len(len)
         .add_child_data(values_data.clone())
-        .null_bit_buffer(Some(null_bit_buffer));
+        .null_bit_buffer(null_bit_buffer);
 
         let array_data = unsafe { array_data.build_unchecked() };
 
diff --git a/arrow/src/array/builder/generic_binary_builder.rs 
b/arrow/src/array/builder/generic_binary_builder.rs
index 54c1855a1..1e29c470c 100644
--- a/arrow/src/array/builder/generic_binary_builder.rs
+++ b/arrow/src/array/builder/generic_binary_builder.rs
@@ -25,14 +25,14 @@ use crate::{
 use std::any::Any;
 use std::sync::Arc;
 
-use super::{BooleanBufferBuilder, BufferBuilder};
+use super::{BufferBuilder, NullBufferBuilder};
 
 ///  Array builder for [`GenericBinaryArray`]
 #[derive(Debug)]
 pub struct GenericBinaryBuilder<OffsetSize: OffsetSizeTrait> {
     value_builder: UInt8BufferBuilder,
     offsets_builder: BufferBuilder<OffsetSize>,
-    null_buffer_builder: BooleanBufferBuilder,
+    null_buffer_builder: NullBufferBuilder,
 }
 
 impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
@@ -44,7 +44,7 @@ impl<OffsetSize: OffsetSizeTrait> 
GenericBinaryBuilder<OffsetSize> {
         Self {
             value_builder: UInt8BufferBuilder::new(capacity),
             offsets_builder,
-            null_buffer_builder: BooleanBufferBuilder::new(1024),
+            null_buffer_builder: NullBufferBuilder::new(1024),
         }
     }
 
@@ -76,7 +76,7 @@ impl<OffsetSize: OffsetSizeTrait> 
GenericBinaryBuilder<OffsetSize> {
             .len(self.len())
             .add_buffer(self.offsets_builder.finish())
             .add_buffer(self.value_builder.finish())
-            .null_bit_buffer(Some(self.null_buffer_builder.finish()));
+            .null_bit_buffer(self.null_buffer_builder.finish());
 
         self.offsets_builder.append(OffsetSize::zero());
         let array_data = unsafe { array_builder.build_unchecked() };
diff --git a/arrow/src/array/builder/generic_list_builder.rs 
b/arrow/src/array/builder/generic_list_builder.rs
index e478cc7a3..911182f65 100644
--- a/arrow/src/array/builder/generic_list_builder.rs
+++ b/arrow/src/array/builder/generic_list_builder.rs
@@ -25,13 +25,13 @@ use crate::array::OffsetSizeTrait;
 use crate::datatypes::DataType;
 use crate::datatypes::Field;
 
-use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder};
+use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder};
 
 ///  Array builder for [`GenericListArray`]
 #[derive(Debug)]
 pub struct GenericListBuilder<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> {
     offsets_builder: BufferBuilder<OffsetSize>,
-    bitmap_builder: BooleanBufferBuilder,
+    null_buffer_builder: NullBufferBuilder,
     values_builder: T,
 }
 
@@ -49,7 +49,7 @@ impl<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> 
GenericListBuilder<OffsetSize
         offsets_builder.append(OffsetSize::zero());
         Self {
             offsets_builder,
-            bitmap_builder: BooleanBufferBuilder::new(capacity),
+            null_buffer_builder: NullBufferBuilder::new(capacity),
             values_builder,
         }
     }
@@ -77,12 +77,12 @@ where
 
     /// Returns the number of array slots in the builder
     fn len(&self) -> usize {
-        self.bitmap_builder.len()
+        self.null_buffer_builder.len()
     }
 
     /// Returns whether the number of array slots is zero
     fn is_empty(&self) -> bool {
-        self.bitmap_builder.is_empty()
+        self.null_buffer_builder.is_empty()
     }
 
     /// Builds the array and reset this builder.
@@ -113,7 +113,7 @@ where
     pub fn append(&mut self, is_valid: bool) {
         self.offsets_builder
             
.append(OffsetSize::from_usize(self.values_builder.len()).unwrap());
-        self.bitmap_builder.append(is_valid);
+        self.null_buffer_builder.append(is_valid);
     }
 
     /// Builds the [`GenericListArray`] and reset this builder.
@@ -128,7 +128,7 @@ where
         let values_data = values_arr.data();
 
         let offset_buffer = self.offsets_builder.finish();
-        let null_bit_buffer = self.bitmap_builder.finish();
+        let null_bit_buffer = self.null_buffer_builder.finish();
         self.offsets_builder.append(OffsetSize::zero());
         let field = Box::new(Field::new(
             "item",
@@ -144,7 +144,7 @@ where
             .len(len)
             .add_buffer(offset_buffer)
             .add_child_data(values_data.clone())
-            .null_bit_buffer(Some(null_bit_buffer));
+            .null_bit_buffer(null_bit_buffer);
 
         let array_data = unsafe { array_data_builder.build_unchecked() };
 
diff --git a/arrow/src/array/builder/map_builder.rs 
b/arrow/src/array/builder/map_builder.rs
index 6078f8f22..7e68abd5f 100644
--- a/arrow/src/array/builder/map_builder.rs
+++ b/arrow/src/array/builder/map_builder.rs
@@ -18,7 +18,7 @@
 use std::any::Any;
 use std::sync::Arc;
 
-use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder};
+use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder};
 use crate::array::array::Array;
 use crate::array::ArrayData;
 use crate::array::ArrayRef;
@@ -32,7 +32,7 @@ use crate::error::Result;
 #[derive(Debug)]
 pub struct MapBuilder<K: ArrayBuilder, V: ArrayBuilder> {
     offsets_builder: BufferBuilder<i32>,
-    bitmap_builder: BooleanBufferBuilder,
+    null_buffer_builder: NullBufferBuilder,
     field_names: MapFieldNames,
     key_builder: K,
     value_builder: V,
@@ -78,7 +78,7 @@ impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
         offsets_builder.append(len);
         Self {
             offsets_builder,
-            bitmap_builder: BooleanBufferBuilder::new(capacity),
+            null_buffer_builder: NullBufferBuilder::new(capacity),
             field_names: field_names.unwrap_or_default(),
             key_builder,
             value_builder,
@@ -107,7 +107,7 @@ impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
             )));
         }
         self.offsets_builder.append(self.key_builder.len() as i32);
-        self.bitmap_builder.append(is_valid);
+        self.null_buffer_builder.append(is_valid);
         self.len += 1;
         Ok(())
     }
@@ -145,7 +145,7 @@ impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
             StructArray::from(vec![(keys_field, keys_arr), (values_field, 
values_arr)]);
 
         let offset_buffer = self.offsets_builder.finish();
-        let null_bit_buffer = self.bitmap_builder.finish();
+        let null_bit_buffer = self.null_buffer_builder.finish();
         self.offsets_builder.append(self.len);
         let map_field = Box::new(Field::new(
             self.field_names.entry.as_str(),
@@ -156,7 +156,7 @@ impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
             .len(len)
             .add_buffer(offset_buffer)
             .add_child_data(struct_array.into_data())
-            .null_bit_buffer(Some(null_bit_buffer));
+            .null_bit_buffer(null_bit_buffer);
 
         let array_data = unsafe { array_data.build_unchecked() };
 
diff --git a/arrow/src/array/builder/mod.rs b/arrow/src/array/builder/mod.rs
index f97ff4cac..77dd907f6 100644
--- a/arrow/src/array/builder/mod.rs
+++ b/arrow/src/array/builder/mod.rs
@@ -30,6 +30,7 @@ mod generic_binary_builder;
 mod generic_list_builder;
 mod generic_string_builder;
 mod map_builder;
+mod null_buffer_builder;
 mod primitive_builder;
 mod primitive_dictionary_builder;
 mod string_dictionary_builder;
@@ -53,6 +54,7 @@ pub use generic_binary_builder::GenericBinaryBuilder;
 pub use generic_list_builder::GenericListBuilder;
 pub use generic_string_builder::GenericStringBuilder;
 pub use map_builder::{MapBuilder, MapFieldNames};
+use null_buffer_builder::NullBufferBuilder;
 pub use primitive_builder::PrimitiveBuilder;
 pub use primitive_dictionary_builder::PrimitiveDictionaryBuilder;
 pub use string_dictionary_builder::StringDictionaryBuilder;
diff --git a/arrow/src/array/builder/null_buffer_builder.rs 
b/arrow/src/array/builder/null_buffer_builder.rs
new file mode 100644
index 000000000..ef2e4c50a
--- /dev/null
+++ b/arrow/src/array/builder/null_buffer_builder.rs
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::buffer::Buffer;
+
+use super::BooleanBufferBuilder;
+
+/// Builder for creating the null bit buffer.
+/// This builder only materializes the buffer when we append `false`.
+/// If you only append `true`s to the builder, what you get will be
+/// `None` when calling [`finish`](#method.finish).
+/// This optimization is **very** important for the performance.
+#[derive(Debug)]
+pub(super) struct NullBufferBuilder {
+    bitmap_builder: Option<BooleanBufferBuilder>,
+    /// Store the length of the buffer before materializing.
+    len: usize,
+    capacity: usize,
+}
+
+impl NullBufferBuilder {
+    /// Creates a new empty builder.
+    /// `capacity` is the number of bits in the null buffer.
+    pub fn new(capacity: usize) -> Self {
+        Self {
+            bitmap_builder: None,
+            len: 0,
+            capacity,
+        }
+    }
+
+    /// Appends `n` `true`s into the builder
+    /// to indicate that these `n` items are not nulls.
+    #[inline]
+    pub fn append_n_non_nulls(&mut self, n: usize) {
+        if let Some(buf) = self.bitmap_builder.as_mut() {
+            buf.append_n(n, true)
+        } else {
+            self.len += n;
+        }
+    }
+
+    /// Appends a `true` into the builder
+    /// to indicate that this item is not null.
+    #[inline]
+    pub fn append_non_null(&mut self) {
+        if let Some(buf) = self.bitmap_builder.as_mut() {
+            buf.append(true)
+        } else {
+            self.len += 1;
+        }
+    }
+
+    /// Appends `n` `false`s into the builder
+    /// to indicate that these `n` items are nulls.
+    #[inline]
+    pub fn append_n_nulls(&mut self, n: usize) {
+        self.materialize_if_needed();
+        self.bitmap_builder.as_mut().unwrap().append_n(n, false);
+    }
+
+    /// Appends a `false` into the builder
+    /// to indicate that this item is null.
+    #[inline]
+    pub fn append_null(&mut self) {
+        self.materialize_if_needed();
+        self.bitmap_builder.as_mut().unwrap().append(false);
+    }
+
+    /// Appends a boolean value into the builder.
+    #[inline]
+    pub fn append(&mut self, not_null: bool) {
+        if not_null {
+            self.append_non_null()
+        } else {
+            self.append_null()
+        }
+    }
+
+    /// Appends a boolean slice into the builder
+    /// to indicate the validations of these items.
+    pub fn append_slice(&mut self, slice: &[bool]) {
+        if slice.iter().any(|v| !v) {
+            self.materialize_if_needed()
+        }
+        if let Some(buf) = self.bitmap_builder.as_mut() {
+            buf.append_slice(slice)
+        } else {
+            self.len += slice.len();
+        }
+    }
+
+    /// Builds the null buffer and resets the builder.
+    /// Returns `None` if the builder only contains `true`s.
+    pub fn finish(&mut self) -> Option<Buffer> {
+        let buf = self.bitmap_builder.as_mut().map(|b| b.finish());
+        self.bitmap_builder = None;
+        self.len = 0;
+        buf
+    }
+
+    #[inline]
+    fn materialize_if_needed(&mut self) {
+        if self.bitmap_builder.is_none() {
+            self.materialize()
+        }
+    }
+
+    #[cold]
+    fn materialize(&mut self) {
+        if self.bitmap_builder.is_none() {
+            let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity));
+            b.append_n(self.len, true);
+            self.bitmap_builder = Some(b);
+        }
+    }
+}
+
+impl NullBufferBuilder {
+    pub fn len(&self) -> usize {
+        if let Some(b) = &self.bitmap_builder {
+            b.len()
+        } else {
+            self.len
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_null_buffer_builder() {
+        let mut builder = NullBufferBuilder::new(0);
+        builder.append_null();
+        builder.append_non_null();
+        builder.append_n_nulls(2);
+        builder.append_n_non_nulls(2);
+        assert_eq!(6, builder.len());
+
+        let buf = builder.finish().unwrap();
+        assert_eq!(Buffer::from(&[0b110010_u8]), buf);
+    }
+
+    #[test]
+    fn test_null_buffer_builder_all_nulls() {
+        let mut builder = NullBufferBuilder::new(0);
+        builder.append_null();
+        builder.append_n_nulls(2);
+        builder.append_slice(&[false, false, false]);
+        assert_eq!(6, builder.len());
+
+        let buf = builder.finish().unwrap();
+        assert_eq!(Buffer::from(&[0b0_u8]), buf);
+    }
+
+    #[test]
+    fn test_null_buffer_builder_no_null() {
+        let mut builder = NullBufferBuilder::new(0);
+        builder.append_non_null();
+        builder.append_n_non_nulls(2);
+        builder.append_slice(&[true, true, true]);
+        assert_eq!(6, builder.len());
+
+        let buf = builder.finish();
+        assert!(buf.is_none());
+    }
+
+    #[test]
+    fn test_null_buffer_builder_reset() {
+        let mut builder = NullBufferBuilder::new(0);
+        builder.append_slice(&[true, false, true]);
+        builder.finish();
+        assert!(builder.is_empty());
+
+        builder.append_slice(&[true, true, true]);
+        assert!(builder.finish().is_none());
+        assert!(builder.is_empty());
+
+        builder.append_slice(&[true, true, false, true]);
+
+        let buf = builder.finish().unwrap();
+        assert_eq!(Buffer::from(&[0b1011_u8]), buf);
+    }
+}
diff --git a/arrow/src/array/builder/primitive_builder.rs 
b/arrow/src/array/builder/primitive_builder.rs
index 36f4ae2b7..3b9db1f01 100644
--- a/arrow/src/array/builder/primitive_builder.rs
+++ b/arrow/src/array/builder/primitive_builder.rs
@@ -23,15 +23,13 @@ use crate::array::ArrayRef;
 use crate::array::PrimitiveArray;
 use crate::datatypes::ArrowPrimitiveType;
 
-use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder};
+use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder};
 
 ///  Array builder for fixed-width primitive types
 #[derive(Debug)]
 pub struct PrimitiveBuilder<T: ArrowPrimitiveType> {
     values_builder: BufferBuilder<T::Native>,
-    /// We only materialize the builder when we add `false`.
-    /// This optimization is **very** important for performance of 
`StringBuilder`.
-    bitmap_builder: Option<BooleanBufferBuilder>,
+    null_buffer_builder: NullBufferBuilder,
 }
 
 impl<T: ArrowPrimitiveType> ArrayBuilder for PrimitiveBuilder<T> {
@@ -71,7 +69,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
     pub fn new(capacity: usize) -> Self {
         Self {
             values_builder: BufferBuilder::<T::Native>::new(capacity),
-            bitmap_builder: None,
+            null_buffer_builder: NullBufferBuilder::new(capacity),
         }
     }
 
@@ -83,24 +81,20 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
     /// Appends a value of type `T` into the builder
     #[inline]
     pub fn append_value(&mut self, v: T::Native) {
-        if let Some(b) = self.bitmap_builder.as_mut() {
-            b.append(true);
-        }
+        self.null_buffer_builder.append_non_null();
         self.values_builder.append(v);
     }
 
     /// Appends a null slot into the builder
     #[inline]
     pub fn append_null(&mut self) {
-        self.materialize_bitmap_builder_if_needed();
-        self.bitmap_builder.as_mut().unwrap().append(false);
+        self.null_buffer_builder.append_null();
         self.values_builder.advance(1);
     }
 
     #[inline]
     pub fn append_nulls(&mut self, n: usize) {
-        self.materialize_bitmap_builder_if_needed();
-        self.bitmap_builder.as_mut().unwrap().append_n(n, false);
+        self.null_buffer_builder.append_n_nulls(n);
         self.values_builder.advance(n);
     }
 
@@ -116,9 +110,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
     /// Appends a slice of type `T` into the builder
     #[inline]
     pub fn append_slice(&mut self, v: &[T::Native]) {
-        if let Some(b) = self.bitmap_builder.as_mut() {
-            b.append_n(v.len(), true);
-        }
+        self.null_buffer_builder.append_n_non_nulls(v.len());
         self.values_builder.append_slice(v);
     }
 
@@ -130,12 +122,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
             is_valid.len(),
             "Value and validity lengths must be equal"
         );
-        if is_valid.iter().any(|v| !*v) {
-            self.materialize_bitmap_builder_if_needed();
-        }
-        if let Some(b) = self.bitmap_builder.as_mut() {
-            b.append_slice(is_valid);
-        }
+        self.null_buffer_builder.append_slice(is_valid);
         self.values_builder.append_slice(values);
     }
 
@@ -155,50 +142,23 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
             .1
             .expect("append_trusted_len_iter requires an upper bound");
 
-        if let Some(b) = self.bitmap_builder.as_mut() {
-            b.append_n(len, true);
-        }
+        self.null_buffer_builder.append_n_non_nulls(len);
         self.values_builder.append_trusted_len_iter(iter);
     }
 
-    /// Builds the `PrimitiveArray` and reset this builder.
+    /// Builds the [`PrimitiveArray`] and reset this builder.
     pub fn finish(&mut self) -> PrimitiveArray<T> {
         let len = self.len();
-        let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish());
-        let null_count = len
-            - null_bit_buffer
-                .as_ref()
-                .map(|b| b.count_set_bits())
-                .unwrap_or(len);
+        let null_bit_buffer = self.null_buffer_builder.finish();
         let builder = ArrayData::builder(T::DATA_TYPE)
             .len(len)
             .add_buffer(self.values_builder.finish())
-            .null_bit_buffer(if null_count > 0 {
-                null_bit_buffer
-            } else {
-                None
-            });
+            .null_bit_buffer(null_bit_buffer);
 
         let array_data = unsafe { builder.build_unchecked() };
         PrimitiveArray::<T>::from(array_data)
     }
 
-    #[inline]
-    fn materialize_bitmap_builder_if_needed(&mut self) {
-        if self.bitmap_builder.is_some() {
-            return;
-        }
-        self.materialize_bitmap_builder()
-    }
-
-    #[cold]
-    fn materialize_bitmap_builder(&mut self) {
-        let mut b = BooleanBufferBuilder::new(0);
-        b.reserve(self.values_builder.capacity());
-        b.append_n(self.values_builder.len(), true);
-        self.bitmap_builder = Some(b);
-    }
-
     /// Returns the current values buffer as a slice
     pub fn values_slice(&self) -> &[T::Native] {
         self.values_builder.as_slice()
diff --git a/arrow/src/array/builder/struct_builder.rs 
b/arrow/src/array/builder/struct_builder.rs
index c679dd621..373a84582 100644
--- a/arrow/src/array/builder/struct_builder.rs
+++ b/arrow/src/array/builder/struct_builder.rs
@@ -24,6 +24,8 @@ use crate::array::*;
 use crate::datatypes::DataType;
 use crate::datatypes::Field;
 
+use super::NullBufferBuilder;
+
 /// Array builder for Struct types.
 ///
 /// Note that callers should make sure that methods of all the child field 
builders are
@@ -31,7 +33,7 @@ use crate::datatypes::Field;
 pub struct StructBuilder {
     fields: Vec<Field>,
     field_builders: Vec<Box<dyn ArrayBuilder>>,
-    bitmap_builder: BooleanBufferBuilder,
+    null_buffer_builder: NullBufferBuilder,
     len: usize,
 }
 
@@ -39,7 +41,7 @@ impl fmt::Debug for StructBuilder {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("StructBuilder")
             .field("fields", &self.fields)
-            .field("bitmap_builder", &self.bitmap_builder)
+            .field("bitmap_builder", &self.null_buffer_builder)
             .field("len", &self.len)
             .finish()
     }
@@ -173,7 +175,7 @@ impl StructBuilder {
         Self {
             fields,
             field_builders,
-            bitmap_builder: BooleanBufferBuilder::new(0),
+            null_buffer_builder: NullBufferBuilder::new(0),
             len: 0,
         }
     }
@@ -202,7 +204,7 @@ impl StructBuilder {
     /// should be appended for each child sub-array in a consistent way.
     #[inline]
     pub fn append(&mut self, is_valid: bool) {
-        self.bitmap_builder.append(is_valid);
+        self.null_buffer_builder.append(is_valid);
         self.len += 1;
     }
 
@@ -220,14 +222,11 @@ impl StructBuilder {
             child_data.push(arr.into_data());
         }
 
-        let null_bit_buffer = self.bitmap_builder.finish();
-        let null_count = self.len - null_bit_buffer.count_set_bits();
-        let mut builder = 
ArrayData::builder(DataType::Struct(self.fields.clone()))
+        let null_bit_buffer = self.null_buffer_builder.finish();
+        let builder = ArrayData::builder(DataType::Struct(self.fields.clone()))
             .len(self.len)
-            .child_data(child_data);
-        if null_count > 0 {
-            builder = builder.null_bit_buffer(Some(null_bit_buffer));
-        }
+            .child_data(child_data)
+            .null_bit_buffer(null_bit_buffer);
 
         self.len = 0;
 
diff --git a/arrow/src/array/builder/union_builder.rs 
b/arrow/src/array/builder/union_builder.rs
index 95d9ea40a..8ef5028af 100644
--- a/arrow/src/array/builder/union_builder.rs
+++ b/arrow/src/array/builder/union_builder.rs
@@ -29,7 +29,7 @@ use crate::datatypes::Field;
 use crate::datatypes::{ArrowNativeType, ArrowPrimitiveType};
 use crate::error::{ArrowError, Result};
 
-use super::{BooleanBufferBuilder, BufferBuilder};
+use super::{BufferBuilder, NullBufferBuilder};
 
 use crate::array::make_array;
 
@@ -45,7 +45,7 @@ struct FieldData {
     ///  The number of array slots represented by the buffer
     slots: usize,
     /// A builder for the null bitmap
-    bitmap_builder: BooleanBufferBuilder,
+    null_buffer_builder: NullBufferBuilder,
 }
 
 /// A type-erased [`BufferBuilder`] used by [`FieldData`]
@@ -79,7 +79,7 @@ impl FieldData {
             data_type,
             slots: 0,
             values_buffer: Box::new(BufferBuilder::<T::Native>::new(1)),
-            bitmap_builder: BooleanBufferBuilder::new(1),
+            null_buffer_builder: NullBufferBuilder::new(1),
         }
     }
 
@@ -91,14 +91,14 @@ impl FieldData {
             .expect("Tried to append unexpected type")
             .append(v);
 
-        self.bitmap_builder.append(true);
+        self.null_buffer_builder.append(true);
         self.slots += 1;
     }
 
     /// Appends a null to this `FieldData`.
     fn append_null(&mut self) {
         self.values_buffer.append_null();
-        self.bitmap_builder.append(false);
+        self.null_buffer_builder.append(false);
         self.slots += 1;
     }
 }
@@ -264,7 +264,7 @@ impl UnionBuilder {
                 data_type,
                 mut values_buffer,
                 slots,
-                mut bitmap_builder,
+                null_buffer_builder: mut bitmap_builder,
             },
         ) in self.fields.into_iter()
         {
@@ -272,7 +272,7 @@ impl UnionBuilder {
             let arr_data_builder = ArrayDataBuilder::new(data_type.clone())
                 .add_buffer(buffer)
                 .len(slots)
-                .null_bit_buffer(Some(bitmap_builder.finish()));
+                .null_bit_buffer(bitmap_builder.finish());
 
             let arr_data_ref = unsafe { arr_data_builder.build_unchecked() };
             let array_ref = make_array(arr_data_ref);

Reply via email to