This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 9c70e4acd Lazily materialize the null buffer builder for all array
builders. (#2127)
9c70e4acd is described below
commit 9c70e4acd3a872c472a11bc12ae76837c1d4aaaa
Author: Remzi Yang <[email protected]>
AuthorDate: Tue Jul 26 23:18:30 2022 +0800
Lazily materialize the null buffer builder for all array builders. (#2127)
* create null buffer builder
Signed-off-by: remzi <[email protected]>
* use null buffer builder in boolean array builder
Signed-off-by: remzi <[email protected]>
* add capacity field and update docs
Signed-off-by: remzi <[email protected]>
* update fixed size binary builder
Signed-off-by: remzi <[email protected]>
* update fixed size list builder
Signed-off-by: remzi <[email protected]>
* update binary builder
Signed-off-by: remzi <[email protected]>
* update list builder
Signed-off-by: remzi <[email protected]>
* update map builder
Signed-off-by: remzi <[email protected]>
* update struct builder
Signed-off-by: remzi <[email protected]>
* update union builder
Signed-off-by: remzi <[email protected]>
* rename
Signed-off-by: remzi <[email protected]>
* add tests
Signed-off-by: remzi <[email protected]>
* optimize
Signed-off-by: remzi <[email protected]>
* expose null buffer builder in builder mod
Signed-off-by: remzi <[email protected]>
* add docs
Signed-off-by: remzi <[email protected]>
* reduce redundant computation
Signed-off-by: remzi <[email protected]>
* inline methods to achieve better performance
Signed-off-by: remzi <[email protected]>
* use instead of pub(self) use
Signed-off-by: remzi <[email protected]>
* Update arrow/src/array/builder/null_buffer_builder.rs
fix spelling mistake
Co-authored-by: Jörn Horstmann <[email protected]>
* reorder
Signed-off-by: remzi <[email protected]>
* rename methods and update docs
Signed-off-by: remzi <[email protected]>
Co-authored-by: Jörn Horstmann <[email protected]>
---
arrow/src/array/builder/boolean_builder.rs | 44 ++---
.../src/array/builder/fixed_size_binary_builder.rs | 16 +-
arrow/src/array/builder/fixed_size_list_builder.rs | 16 +-
arrow/src/array/builder/generic_binary_builder.rs | 8 +-
arrow/src/array/builder/generic_list_builder.rs | 16 +-
arrow/src/array/builder/map_builder.rs | 12 +-
arrow/src/array/builder/mod.rs | 2 +
arrow/src/array/builder/null_buffer_builder.rs | 204 +++++++++++++++++++++
arrow/src/array/builder/primitive_builder.rs | 64 ++-----
arrow/src/array/builder/struct_builder.rs | 21 +--
arrow/src/array/builder/union_builder.rs | 14 +-
11 files changed, 284 insertions(+), 133 deletions(-)
diff --git a/arrow/src/array/builder/boolean_builder.rs
b/arrow/src/array/builder/boolean_builder.rs
index 1e052b644..e28e37bc9 100644
--- a/arrow/src/array/builder/boolean_builder.rs
+++ b/arrow/src/array/builder/boolean_builder.rs
@@ -28,6 +28,7 @@ use crate::error::ArrowError;
use crate::error::Result;
use super::BooleanBufferBuilder;
+use super::NullBufferBuilder;
/// Array builder for fixed-width primitive types
///
@@ -62,9 +63,7 @@ use super::BooleanBufferBuilder;
#[derive(Debug)]
pub struct BooleanBuilder {
values_builder: BooleanBufferBuilder,
- /// We only materialize the builder when we add `false`.
- /// This optimization is **very** important for the performance.
- bitmap_builder: Option<BooleanBufferBuilder>,
+ null_buffer_builder: NullBufferBuilder,
}
impl BooleanBuilder {
@@ -72,7 +71,7 @@ impl BooleanBuilder {
pub fn new(capacity: usize) -> Self {
Self {
values_builder: BooleanBufferBuilder::new(capacity),
- bitmap_builder: None,
+ null_buffer_builder: NullBufferBuilder::new(capacity),
}
}
@@ -85,19 +84,23 @@ impl BooleanBuilder {
#[inline]
pub fn append_value(&mut self, v: bool) {
self.values_builder.append(v);
- if let Some(b) = self.bitmap_builder.as_mut() {
- b.append(true)
- }
+ self.null_buffer_builder.append_non_null();
}
/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
- self.materialize_bitmap_builder();
- self.bitmap_builder.as_mut().unwrap().append(false);
+ self.null_buffer_builder.append_null();
self.values_builder.advance(1);
}
+ /// Appends `n` `null`s into the builder.
+ #[inline]
+ pub fn append_nulls(&mut self, n: usize) {
+ self.null_buffer_builder.append_n_nulls(n);
+ self.values_builder.advance(n);
+ }
+
/// Appends an `Option<T>` into the builder
#[inline]
pub fn append_option(&mut self, v: Option<bool>) {
@@ -110,10 +113,8 @@ impl BooleanBuilder {
/// Appends a slice of type `T` into the builder
#[inline]
pub fn append_slice(&mut self, v: &[bool]) {
- if let Some(b) = self.bitmap_builder.as_mut() {
- b.append_n(v.len(), true)
- }
self.values_builder.append_slice(v);
+ self.null_buffer_builder.append_n_non_nulls(v.len());
}
/// Appends values from a slice of type `T` and a validity boolean slice.
@@ -126,13 +127,7 @@ impl BooleanBuilder {
"Value and validity lengths must be equal".to_string(),
))
} else {
- is_valid
- .iter()
- .any(|v| !*v)
- .then(|| self.materialize_bitmap_builder());
- if let Some(b) = self.bitmap_builder.as_mut() {
- b.append_slice(is_valid)
- }
+ self.null_buffer_builder.append_slice(is_valid);
self.values_builder.append_slice(values);
Ok(())
}
@@ -141,7 +136,7 @@ impl BooleanBuilder {
/// Builds the [BooleanArray] and reset this builder.
pub fn finish(&mut self) -> BooleanArray {
let len = self.len();
- let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish());
+ let null_bit_buffer = self.null_buffer_builder.finish();
let builder = ArrayData::builder(DataType::Boolean)
.len(len)
.add_buffer(self.values_builder.finish())
@@ -150,15 +145,6 @@ impl BooleanBuilder {
let array_data = unsafe { builder.build_unchecked() };
BooleanArray::from(array_data)
}
-
- fn materialize_bitmap_builder(&mut self) {
- if self.bitmap_builder.is_none() {
- let mut b = BooleanBufferBuilder::new(0);
- b.reserve(self.values_builder.capacity());
- b.append_n(self.values_builder.len(), true);
- self.bitmap_builder = Some(b);
- }
- }
}
impl ArrayBuilder for BooleanBuilder {
diff --git a/arrow/src/array/builder/fixed_size_binary_builder.rs
b/arrow/src/array/builder/fixed_size_binary_builder.rs
index 5ef89d8f4..96f90c4ca 100644
--- a/arrow/src/array/builder/fixed_size_binary_builder.rs
+++ b/arrow/src/array/builder/fixed_size_binary_builder.rs
@@ -23,12 +23,12 @@ use crate::error::{ArrowError, Result};
use std::any::Any;
use std::sync::Arc;
-use super::BooleanBufferBuilder;
+use super::NullBufferBuilder;
#[derive(Debug)]
pub struct FixedSizeBinaryBuilder {
values_builder: UInt8BufferBuilder,
- bitmap_builder: BooleanBufferBuilder,
+ null_buffer_builder: NullBufferBuilder,
value_length: i32,
}
@@ -43,7 +43,7 @@ impl FixedSizeBinaryBuilder {
);
Self {
values_builder: UInt8BufferBuilder::new(capacity),
- bitmap_builder: BooleanBufferBuilder::new(if byte_width > 0 {
+ null_buffer_builder: NullBufferBuilder::new(if byte_width > 0 {
capacity / byte_width as usize
} else {
0
@@ -64,7 +64,7 @@ impl FixedSizeBinaryBuilder {
))
} else {
self.values_builder.append_slice(value.as_ref());
- self.bitmap_builder.append(true);
+ self.null_buffer_builder.append_non_null();
Ok(())
}
}
@@ -74,7 +74,7 @@ impl FixedSizeBinaryBuilder {
pub fn append_null(&mut self) {
self.values_builder
.append_slice(&vec![0u8; self.value_length as usize][..]);
- self.bitmap_builder.append(false);
+ self.null_buffer_builder.append_null();
}
/// Builds the [`FixedSizeBinaryArray`] and reset this builder.
@@ -83,7 +83,7 @@ impl FixedSizeBinaryBuilder {
let array_data_builder =
ArrayData::builder(DataType::FixedSizeBinary(self.value_length))
.add_buffer(self.values_builder.finish())
- .null_bit_buffer(Some(self.bitmap_builder.finish()))
+ .null_bit_buffer(self.null_buffer_builder.finish())
.len(array_length);
let array_data = unsafe { array_data_builder.build_unchecked() };
FixedSizeBinaryArray::from(array_data)
@@ -108,12 +108,12 @@ impl ArrayBuilder for FixedSizeBinaryBuilder {
/// Returns the number of array slots in the builder
fn len(&self) -> usize {
- self.bitmap_builder.len()
+ self.null_buffer_builder.len()
}
/// Returns whether the number of array slots is zero
fn is_empty(&self) -> bool {
- self.bitmap_builder.is_empty()
+ self.null_buffer_builder.is_empty()
}
/// Builds the array and reset this builder.
diff --git a/arrow/src/array/builder/fixed_size_list_builder.rs
b/arrow/src/array/builder/fixed_size_list_builder.rs
index 4e80e585a..343ce3657 100644
--- a/arrow/src/array/builder/fixed_size_list_builder.rs
+++ b/arrow/src/array/builder/fixed_size_list_builder.rs
@@ -25,12 +25,12 @@ use crate::datatypes::DataType;
use crate::datatypes::Field;
use super::ArrayBuilder;
-use super::BooleanBufferBuilder;
+use super::NullBufferBuilder;
/// Array builder for [`FixedSizeListArray`]
#[derive(Debug)]
pub struct FixedSizeListBuilder<T: ArrayBuilder> {
- bitmap_builder: BooleanBufferBuilder,
+ null_buffer_builder: NullBufferBuilder,
values_builder: T,
list_len: i32,
}
@@ -48,7 +48,7 @@ impl<T: ArrayBuilder> FixedSizeListBuilder<T> {
/// `capacity` is the number of items to pre-allocate space for in this
builder
pub fn with_capacity(values_builder: T, value_length: i32, capacity:
usize) -> Self {
Self {
- bitmap_builder: BooleanBufferBuilder::new(capacity),
+ null_buffer_builder: NullBufferBuilder::new(capacity),
values_builder,
list_len: value_length,
}
@@ -76,12 +76,12 @@ where
/// Returns the number of array slots in the builder
fn len(&self) -> usize {
- self.bitmap_builder.len()
+ self.null_buffer_builder.len()
}
/// Returns whether the number of array slots is zero
fn is_empty(&self) -> bool {
- self.bitmap_builder.is_empty()
+ self.null_buffer_builder.is_empty()
}
/// Builds the array and reset this builder.
@@ -109,7 +109,7 @@ where
/// Finish the current fixed-length list array slot
#[inline]
pub fn append(&mut self, is_valid: bool) {
- self.bitmap_builder.append(is_valid);
+ self.null_buffer_builder.append(is_valid);
}
/// Builds the [`FixedSizeListBuilder`] and reset this builder.
@@ -131,14 +131,14 @@ where
len,
);
- let null_bit_buffer = self.bitmap_builder.finish();
+ let null_bit_buffer = self.null_buffer_builder.finish();
let array_data = ArrayData::builder(DataType::FixedSizeList(
Box::new(Field::new("item", values_data.data_type().clone(),
true)),
self.list_len,
))
.len(len)
.add_child_data(values_data.clone())
- .null_bit_buffer(Some(null_bit_buffer));
+ .null_bit_buffer(null_bit_buffer);
let array_data = unsafe { array_data.build_unchecked() };
diff --git a/arrow/src/array/builder/generic_binary_builder.rs
b/arrow/src/array/builder/generic_binary_builder.rs
index 54c1855a1..1e29c470c 100644
--- a/arrow/src/array/builder/generic_binary_builder.rs
+++ b/arrow/src/array/builder/generic_binary_builder.rs
@@ -25,14 +25,14 @@ use crate::{
use std::any::Any;
use std::sync::Arc;
-use super::{BooleanBufferBuilder, BufferBuilder};
+use super::{BufferBuilder, NullBufferBuilder};
/// Array builder for [`GenericBinaryArray`]
#[derive(Debug)]
pub struct GenericBinaryBuilder<OffsetSize: OffsetSizeTrait> {
value_builder: UInt8BufferBuilder,
offsets_builder: BufferBuilder<OffsetSize>,
- null_buffer_builder: BooleanBufferBuilder,
+ null_buffer_builder: NullBufferBuilder,
}
impl<OffsetSize: OffsetSizeTrait> GenericBinaryBuilder<OffsetSize> {
@@ -44,7 +44,7 @@ impl<OffsetSize: OffsetSizeTrait>
GenericBinaryBuilder<OffsetSize> {
Self {
value_builder: UInt8BufferBuilder::new(capacity),
offsets_builder,
- null_buffer_builder: BooleanBufferBuilder::new(1024),
+ null_buffer_builder: NullBufferBuilder::new(1024),
}
}
@@ -76,7 +76,7 @@ impl<OffsetSize: OffsetSizeTrait>
GenericBinaryBuilder<OffsetSize> {
.len(self.len())
.add_buffer(self.offsets_builder.finish())
.add_buffer(self.value_builder.finish())
- .null_bit_buffer(Some(self.null_buffer_builder.finish()));
+ .null_bit_buffer(self.null_buffer_builder.finish());
self.offsets_builder.append(OffsetSize::zero());
let array_data = unsafe { array_builder.build_unchecked() };
diff --git a/arrow/src/array/builder/generic_list_builder.rs
b/arrow/src/array/builder/generic_list_builder.rs
index e478cc7a3..911182f65 100644
--- a/arrow/src/array/builder/generic_list_builder.rs
+++ b/arrow/src/array/builder/generic_list_builder.rs
@@ -25,13 +25,13 @@ use crate::array::OffsetSizeTrait;
use crate::datatypes::DataType;
use crate::datatypes::Field;
-use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder};
+use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder};
/// Array builder for [`GenericListArray`]
#[derive(Debug)]
pub struct GenericListBuilder<OffsetSize: OffsetSizeTrait, T: ArrayBuilder> {
offsets_builder: BufferBuilder<OffsetSize>,
- bitmap_builder: BooleanBufferBuilder,
+ null_buffer_builder: NullBufferBuilder,
values_builder: T,
}
@@ -49,7 +49,7 @@ impl<OffsetSize: OffsetSizeTrait, T: ArrayBuilder>
GenericListBuilder<OffsetSize
offsets_builder.append(OffsetSize::zero());
Self {
offsets_builder,
- bitmap_builder: BooleanBufferBuilder::new(capacity),
+ null_buffer_builder: NullBufferBuilder::new(capacity),
values_builder,
}
}
@@ -77,12 +77,12 @@ where
/// Returns the number of array slots in the builder
fn len(&self) -> usize {
- self.bitmap_builder.len()
+ self.null_buffer_builder.len()
}
/// Returns whether the number of array slots is zero
fn is_empty(&self) -> bool {
- self.bitmap_builder.is_empty()
+ self.null_buffer_builder.is_empty()
}
/// Builds the array and reset this builder.
@@ -113,7 +113,7 @@ where
pub fn append(&mut self, is_valid: bool) {
self.offsets_builder
.append(OffsetSize::from_usize(self.values_builder.len()).unwrap());
- self.bitmap_builder.append(is_valid);
+ self.null_buffer_builder.append(is_valid);
}
/// Builds the [`GenericListArray`] and reset this builder.
@@ -128,7 +128,7 @@ where
let values_data = values_arr.data();
let offset_buffer = self.offsets_builder.finish();
- let null_bit_buffer = self.bitmap_builder.finish();
+ let null_bit_buffer = self.null_buffer_builder.finish();
self.offsets_builder.append(OffsetSize::zero());
let field = Box::new(Field::new(
"item",
@@ -144,7 +144,7 @@ where
.len(len)
.add_buffer(offset_buffer)
.add_child_data(values_data.clone())
- .null_bit_buffer(Some(null_bit_buffer));
+ .null_bit_buffer(null_bit_buffer);
let array_data = unsafe { array_data_builder.build_unchecked() };
diff --git a/arrow/src/array/builder/map_builder.rs
b/arrow/src/array/builder/map_builder.rs
index 6078f8f22..7e68abd5f 100644
--- a/arrow/src/array/builder/map_builder.rs
+++ b/arrow/src/array/builder/map_builder.rs
@@ -18,7 +18,7 @@
use std::any::Any;
use std::sync::Arc;
-use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder};
+use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder};
use crate::array::array::Array;
use crate::array::ArrayData;
use crate::array::ArrayRef;
@@ -32,7 +32,7 @@ use crate::error::Result;
#[derive(Debug)]
pub struct MapBuilder<K: ArrayBuilder, V: ArrayBuilder> {
offsets_builder: BufferBuilder<i32>,
- bitmap_builder: BooleanBufferBuilder,
+ null_buffer_builder: NullBufferBuilder,
field_names: MapFieldNames,
key_builder: K,
value_builder: V,
@@ -78,7 +78,7 @@ impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
offsets_builder.append(len);
Self {
offsets_builder,
- bitmap_builder: BooleanBufferBuilder::new(capacity),
+ null_buffer_builder: NullBufferBuilder::new(capacity),
field_names: field_names.unwrap_or_default(),
key_builder,
value_builder,
@@ -107,7 +107,7 @@ impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
)));
}
self.offsets_builder.append(self.key_builder.len() as i32);
- self.bitmap_builder.append(is_valid);
+ self.null_buffer_builder.append(is_valid);
self.len += 1;
Ok(())
}
@@ -145,7 +145,7 @@ impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
StructArray::from(vec![(keys_field, keys_arr), (values_field,
values_arr)]);
let offset_buffer = self.offsets_builder.finish();
- let null_bit_buffer = self.bitmap_builder.finish();
+ let null_bit_buffer = self.null_buffer_builder.finish();
self.offsets_builder.append(self.len);
let map_field = Box::new(Field::new(
self.field_names.entry.as_str(),
@@ -156,7 +156,7 @@ impl<K: ArrayBuilder, V: ArrayBuilder> MapBuilder<K, V> {
.len(len)
.add_buffer(offset_buffer)
.add_child_data(struct_array.into_data())
- .null_bit_buffer(Some(null_bit_buffer));
+ .null_bit_buffer(null_bit_buffer);
let array_data = unsafe { array_data.build_unchecked() };
diff --git a/arrow/src/array/builder/mod.rs b/arrow/src/array/builder/mod.rs
index f97ff4cac..77dd907f6 100644
--- a/arrow/src/array/builder/mod.rs
+++ b/arrow/src/array/builder/mod.rs
@@ -30,6 +30,7 @@ mod generic_binary_builder;
mod generic_list_builder;
mod generic_string_builder;
mod map_builder;
+mod null_buffer_builder;
mod primitive_builder;
mod primitive_dictionary_builder;
mod string_dictionary_builder;
@@ -53,6 +54,7 @@ pub use generic_binary_builder::GenericBinaryBuilder;
pub use generic_list_builder::GenericListBuilder;
pub use generic_string_builder::GenericStringBuilder;
pub use map_builder::{MapBuilder, MapFieldNames};
+use null_buffer_builder::NullBufferBuilder;
pub use primitive_builder::PrimitiveBuilder;
pub use primitive_dictionary_builder::PrimitiveDictionaryBuilder;
pub use string_dictionary_builder::StringDictionaryBuilder;
diff --git a/arrow/src/array/builder/null_buffer_builder.rs
b/arrow/src/array/builder/null_buffer_builder.rs
new file mode 100644
index 000000000..ef2e4c50a
--- /dev/null
+++ b/arrow/src/array/builder/null_buffer_builder.rs
@@ -0,0 +1,204 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::buffer::Buffer;
+
+use super::BooleanBufferBuilder;
+
+/// Builder for creating the null bit buffer.
+/// This builder only materializes the buffer when we append `false`.
+/// If you only append `true`s to the builder, what you get will be
+/// `None` when calling [`finish`](#method.finish).
+/// This optimization is **very** important for the performance.
+#[derive(Debug)]
+pub(super) struct NullBufferBuilder {
+ bitmap_builder: Option<BooleanBufferBuilder>,
+ /// Store the length of the buffer before materializing.
+ len: usize,
+ capacity: usize,
+}
+
+impl NullBufferBuilder {
+ /// Creates a new empty builder.
+ /// `capacity` is the number of bits in the null buffer.
+ pub fn new(capacity: usize) -> Self {
+ Self {
+ bitmap_builder: None,
+ len: 0,
+ capacity,
+ }
+ }
+
+ /// Appends `n` `true`s into the builder
+ /// to indicate that these `n` items are not nulls.
+ #[inline]
+ pub fn append_n_non_nulls(&mut self, n: usize) {
+ if let Some(buf) = self.bitmap_builder.as_mut() {
+ buf.append_n(n, true)
+ } else {
+ self.len += n;
+ }
+ }
+
+ /// Appends a `true` into the builder
+ /// to indicate that this item is not null.
+ #[inline]
+ pub fn append_non_null(&mut self) {
+ if let Some(buf) = self.bitmap_builder.as_mut() {
+ buf.append(true)
+ } else {
+ self.len += 1;
+ }
+ }
+
+ /// Appends `n` `false`s into the builder
+ /// to indicate that these `n` items are nulls.
+ #[inline]
+ pub fn append_n_nulls(&mut self, n: usize) {
+ self.materialize_if_needed();
+ self.bitmap_builder.as_mut().unwrap().append_n(n, false);
+ }
+
+ /// Appends a `false` into the builder
+ /// to indicate that this item is null.
+ #[inline]
+ pub fn append_null(&mut self) {
+ self.materialize_if_needed();
+ self.bitmap_builder.as_mut().unwrap().append(false);
+ }
+
+ /// Appends a boolean value into the builder.
+ #[inline]
+ pub fn append(&mut self, not_null: bool) {
+ if not_null {
+ self.append_non_null()
+ } else {
+ self.append_null()
+ }
+ }
+
+ /// Appends a boolean slice into the builder
+ /// to indicate the validations of these items.
+ pub fn append_slice(&mut self, slice: &[bool]) {
+ if slice.iter().any(|v| !v) {
+ self.materialize_if_needed()
+ }
+ if let Some(buf) = self.bitmap_builder.as_mut() {
+ buf.append_slice(slice)
+ } else {
+ self.len += slice.len();
+ }
+ }
+
+ /// Builds the null buffer and resets the builder.
+ /// Returns `None` if the builder only contains `true`s.
+ pub fn finish(&mut self) -> Option<Buffer> {
+ let buf = self.bitmap_builder.as_mut().map(|b| b.finish());
+ self.bitmap_builder = None;
+ self.len = 0;
+ buf
+ }
+
+ #[inline]
+ fn materialize_if_needed(&mut self) {
+ if self.bitmap_builder.is_none() {
+ self.materialize()
+ }
+ }
+
+ #[cold]
+ fn materialize(&mut self) {
+ if self.bitmap_builder.is_none() {
+ let mut b = BooleanBufferBuilder::new(self.len.max(self.capacity));
+ b.append_n(self.len, true);
+ self.bitmap_builder = Some(b);
+ }
+ }
+}
+
+impl NullBufferBuilder {
+ pub fn len(&self) -> usize {
+ if let Some(b) = &self.bitmap_builder {
+ b.len()
+ } else {
+ self.len
+ }
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_null_buffer_builder() {
+ let mut builder = NullBufferBuilder::new(0);
+ builder.append_null();
+ builder.append_non_null();
+ builder.append_n_nulls(2);
+ builder.append_n_non_nulls(2);
+ assert_eq!(6, builder.len());
+
+ let buf = builder.finish().unwrap();
+ assert_eq!(Buffer::from(&[0b110010_u8]), buf);
+ }
+
+ #[test]
+ fn test_null_buffer_builder_all_nulls() {
+ let mut builder = NullBufferBuilder::new(0);
+ builder.append_null();
+ builder.append_n_nulls(2);
+ builder.append_slice(&[false, false, false]);
+ assert_eq!(6, builder.len());
+
+ let buf = builder.finish().unwrap();
+ assert_eq!(Buffer::from(&[0b0_u8]), buf);
+ }
+
+ #[test]
+ fn test_null_buffer_builder_no_null() {
+ let mut builder = NullBufferBuilder::new(0);
+ builder.append_non_null();
+ builder.append_n_non_nulls(2);
+ builder.append_slice(&[true, true, true]);
+ assert_eq!(6, builder.len());
+
+ let buf = builder.finish();
+ assert!(buf.is_none());
+ }
+
+ #[test]
+ fn test_null_buffer_builder_reset() {
+ let mut builder = NullBufferBuilder::new(0);
+ builder.append_slice(&[true, false, true]);
+ builder.finish();
+ assert!(builder.is_empty());
+
+ builder.append_slice(&[true, true, true]);
+ assert!(builder.finish().is_none());
+ assert!(builder.is_empty());
+
+ builder.append_slice(&[true, true, false, true]);
+
+ let buf = builder.finish().unwrap();
+ assert_eq!(Buffer::from(&[0b1011_u8]), buf);
+ }
+}
diff --git a/arrow/src/array/builder/primitive_builder.rs
b/arrow/src/array/builder/primitive_builder.rs
index 36f4ae2b7..3b9db1f01 100644
--- a/arrow/src/array/builder/primitive_builder.rs
+++ b/arrow/src/array/builder/primitive_builder.rs
@@ -23,15 +23,13 @@ use crate::array::ArrayRef;
use crate::array::PrimitiveArray;
use crate::datatypes::ArrowPrimitiveType;
-use super::{ArrayBuilder, BooleanBufferBuilder, BufferBuilder};
+use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder};
/// Array builder for fixed-width primitive types
#[derive(Debug)]
pub struct PrimitiveBuilder<T: ArrowPrimitiveType> {
values_builder: BufferBuilder<T::Native>,
- /// We only materialize the builder when we add `false`.
- /// This optimization is **very** important for performance of
`StringBuilder`.
- bitmap_builder: Option<BooleanBufferBuilder>,
+ null_buffer_builder: NullBufferBuilder,
}
impl<T: ArrowPrimitiveType> ArrayBuilder for PrimitiveBuilder<T> {
@@ -71,7 +69,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
pub fn new(capacity: usize) -> Self {
Self {
values_builder: BufferBuilder::<T::Native>::new(capacity),
- bitmap_builder: None,
+ null_buffer_builder: NullBufferBuilder::new(capacity),
}
}
@@ -83,24 +81,20 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
/// Appends a value of type `T` into the builder
#[inline]
pub fn append_value(&mut self, v: T::Native) {
- if let Some(b) = self.bitmap_builder.as_mut() {
- b.append(true);
- }
+ self.null_buffer_builder.append_non_null();
self.values_builder.append(v);
}
/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
- self.materialize_bitmap_builder_if_needed();
- self.bitmap_builder.as_mut().unwrap().append(false);
+ self.null_buffer_builder.append_null();
self.values_builder.advance(1);
}
#[inline]
pub fn append_nulls(&mut self, n: usize) {
- self.materialize_bitmap_builder_if_needed();
- self.bitmap_builder.as_mut().unwrap().append_n(n, false);
+ self.null_buffer_builder.append_n_nulls(n);
self.values_builder.advance(n);
}
@@ -116,9 +110,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
/// Appends a slice of type `T` into the builder
#[inline]
pub fn append_slice(&mut self, v: &[T::Native]) {
- if let Some(b) = self.bitmap_builder.as_mut() {
- b.append_n(v.len(), true);
- }
+ self.null_buffer_builder.append_n_non_nulls(v.len());
self.values_builder.append_slice(v);
}
@@ -130,12 +122,7 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
is_valid.len(),
"Value and validity lengths must be equal"
);
- if is_valid.iter().any(|v| !*v) {
- self.materialize_bitmap_builder_if_needed();
- }
- if let Some(b) = self.bitmap_builder.as_mut() {
- b.append_slice(is_valid);
- }
+ self.null_buffer_builder.append_slice(is_valid);
self.values_builder.append_slice(values);
}
@@ -155,50 +142,23 @@ impl<T: ArrowPrimitiveType> PrimitiveBuilder<T> {
.1
.expect("append_trusted_len_iter requires an upper bound");
- if let Some(b) = self.bitmap_builder.as_mut() {
- b.append_n(len, true);
- }
+ self.null_buffer_builder.append_n_non_nulls(len);
self.values_builder.append_trusted_len_iter(iter);
}
- /// Builds the `PrimitiveArray` and reset this builder.
+ /// Builds the [`PrimitiveArray`] and reset this builder.
pub fn finish(&mut self) -> PrimitiveArray<T> {
let len = self.len();
- let null_bit_buffer = self.bitmap_builder.as_mut().map(|b| b.finish());
- let null_count = len
- - null_bit_buffer
- .as_ref()
- .map(|b| b.count_set_bits())
- .unwrap_or(len);
+ let null_bit_buffer = self.null_buffer_builder.finish();
let builder = ArrayData::builder(T::DATA_TYPE)
.len(len)
.add_buffer(self.values_builder.finish())
- .null_bit_buffer(if null_count > 0 {
- null_bit_buffer
- } else {
- None
- });
+ .null_bit_buffer(null_bit_buffer);
let array_data = unsafe { builder.build_unchecked() };
PrimitiveArray::<T>::from(array_data)
}
- #[inline]
- fn materialize_bitmap_builder_if_needed(&mut self) {
- if self.bitmap_builder.is_some() {
- return;
- }
- self.materialize_bitmap_builder()
- }
-
- #[cold]
- fn materialize_bitmap_builder(&mut self) {
- let mut b = BooleanBufferBuilder::new(0);
- b.reserve(self.values_builder.capacity());
- b.append_n(self.values_builder.len(), true);
- self.bitmap_builder = Some(b);
- }
-
/// Returns the current values buffer as a slice
pub fn values_slice(&self) -> &[T::Native] {
self.values_builder.as_slice()
diff --git a/arrow/src/array/builder/struct_builder.rs
b/arrow/src/array/builder/struct_builder.rs
index c679dd621..373a84582 100644
--- a/arrow/src/array/builder/struct_builder.rs
+++ b/arrow/src/array/builder/struct_builder.rs
@@ -24,6 +24,8 @@ use crate::array::*;
use crate::datatypes::DataType;
use crate::datatypes::Field;
+use super::NullBufferBuilder;
+
/// Array builder for Struct types.
///
/// Note that callers should make sure that methods of all the child field
builders are
@@ -31,7 +33,7 @@ use crate::datatypes::Field;
pub struct StructBuilder {
fields: Vec<Field>,
field_builders: Vec<Box<dyn ArrayBuilder>>,
- bitmap_builder: BooleanBufferBuilder,
+ null_buffer_builder: NullBufferBuilder,
len: usize,
}
@@ -39,7 +41,7 @@ impl fmt::Debug for StructBuilder {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("StructBuilder")
.field("fields", &self.fields)
- .field("bitmap_builder", &self.bitmap_builder)
+ .field("bitmap_builder", &self.null_buffer_builder)
.field("len", &self.len)
.finish()
}
@@ -173,7 +175,7 @@ impl StructBuilder {
Self {
fields,
field_builders,
- bitmap_builder: BooleanBufferBuilder::new(0),
+ null_buffer_builder: NullBufferBuilder::new(0),
len: 0,
}
}
@@ -202,7 +204,7 @@ impl StructBuilder {
/// should be appended for each child sub-array in a consistent way.
#[inline]
pub fn append(&mut self, is_valid: bool) {
- self.bitmap_builder.append(is_valid);
+ self.null_buffer_builder.append(is_valid);
self.len += 1;
}
@@ -220,14 +222,11 @@ impl StructBuilder {
child_data.push(arr.into_data());
}
- let null_bit_buffer = self.bitmap_builder.finish();
- let null_count = self.len - null_bit_buffer.count_set_bits();
- let mut builder =
ArrayData::builder(DataType::Struct(self.fields.clone()))
+ let null_bit_buffer = self.null_buffer_builder.finish();
+ let builder = ArrayData::builder(DataType::Struct(self.fields.clone()))
.len(self.len)
- .child_data(child_data);
- if null_count > 0 {
- builder = builder.null_bit_buffer(Some(null_bit_buffer));
- }
+ .child_data(child_data)
+ .null_bit_buffer(null_bit_buffer);
self.len = 0;
diff --git a/arrow/src/array/builder/union_builder.rs
b/arrow/src/array/builder/union_builder.rs
index 95d9ea40a..8ef5028af 100644
--- a/arrow/src/array/builder/union_builder.rs
+++ b/arrow/src/array/builder/union_builder.rs
@@ -29,7 +29,7 @@ use crate::datatypes::Field;
use crate::datatypes::{ArrowNativeType, ArrowPrimitiveType};
use crate::error::{ArrowError, Result};
-use super::{BooleanBufferBuilder, BufferBuilder};
+use super::{BufferBuilder, NullBufferBuilder};
use crate::array::make_array;
@@ -45,7 +45,7 @@ struct FieldData {
/// The number of array slots represented by the buffer
slots: usize,
/// A builder for the null bitmap
- bitmap_builder: BooleanBufferBuilder,
+ null_buffer_builder: NullBufferBuilder,
}
/// A type-erased [`BufferBuilder`] used by [`FieldData`]
@@ -79,7 +79,7 @@ impl FieldData {
data_type,
slots: 0,
values_buffer: Box::new(BufferBuilder::<T::Native>::new(1)),
- bitmap_builder: BooleanBufferBuilder::new(1),
+ null_buffer_builder: NullBufferBuilder::new(1),
}
}
@@ -91,14 +91,14 @@ impl FieldData {
.expect("Tried to append unexpected type")
.append(v);
- self.bitmap_builder.append(true);
+ self.null_buffer_builder.append(true);
self.slots += 1;
}
/// Appends a null to this `FieldData`.
fn append_null(&mut self) {
self.values_buffer.append_null();
- self.bitmap_builder.append(false);
+ self.null_buffer_builder.append(false);
self.slots += 1;
}
}
@@ -264,7 +264,7 @@ impl UnionBuilder {
data_type,
mut values_buffer,
slots,
- mut bitmap_builder,
+ null_buffer_builder: mut bitmap_builder,
},
) in self.fields.into_iter()
{
@@ -272,7 +272,7 @@ impl UnionBuilder {
let arr_data_builder = ArrayDataBuilder::new(data_type.clone())
.add_buffer(buffer)
.len(slots)
- .null_bit_buffer(Some(bitmap_builder.finish()));
+ .null_bit_buffer(bitmap_builder.finish());
let arr_data_ref = unsafe { arr_data_builder.build_unchecked() };
let array_ref = make_array(arr_data_ref);