This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 713bd6d Truncate bitmask on split (#1183)
713bd6d is described below
commit 713bd6dca996417f1deccc5b130eeb4e53c8d89a
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Mon Jan 17 15:51:21 2022 +0000
Truncate bitmask on split (#1183)
* Truncate bitmask on split
* Fix BooleanBufferBuilder::resize
* Format
---
arrow/src/array/builder.rs | 32 ++++++++++++++
.../src/arrow/record_reader/definition_levels.rs | 51 +++++++++++++++++-----
2 files changed, 72 insertions(+), 11 deletions(-)
diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs
index 85c013c..18deac3 100644
--- a/arrow/src/array/builder.rs
+++ b/arrow/src/array/builder.rs
@@ -367,6 +367,15 @@ impl BooleanBufferBuilder {
}
}
+ /// Resizes the buffer, either truncating its contents (with no change in
capacity), or
+ /// growing it (potentially reallocating it) and writing `false` in the
newly available bits.
+ #[inline]
+ pub fn resize(&mut self, len: usize) {
+ let len_bytes = bit_util::ceil(len, 8);
+ self.buffer.resize(len_bytes, 0);
+ self.len = len;
+ }
+
#[inline]
pub fn append(&mut self, v: bool) {
self.advance(1);
@@ -2932,6 +2941,29 @@ mod tests {
}
#[test]
+ fn test_boolean_array_builder_resize() {
+ let mut builder = BooleanBufferBuilder::new(20);
+ builder.append_n(4, true);
+ builder.append_n(7, false);
+ builder.append_n(2, true);
+ builder.resize(20);
+
+ assert_eq!(builder.len, 20);
+ assert_eq!(
+ builder.buffer.as_slice(),
+ &[0b00001111, 0b00011000, 0b00000000]
+ );
+
+ builder.resize(5);
+ assert_eq!(builder.len, 5);
+ assert_eq!(builder.buffer.as_slice(), &[0b00001111]);
+
+ builder.append_n(4, true);
+ assert_eq!(builder.len, 9);
+ assert_eq!(builder.buffer.as_slice(), &[0b11101111, 0b00000001]);
+ }
+
+ #[test]
fn test_boolean_builder_increases_buffer_len() {
// 00000010 01001000
let buf = Buffer::from([72_u8, 2_u8]);
diff --git a/parquet/src/arrow/record_reader/definition_levels.rs
b/parquet/src/arrow/record_reader/definition_levels.rs
index c38505a..750e118 100644
--- a/parquet/src/arrow/record_reader/definition_levels.rs
+++ b/parquet/src/arrow/record_reader/definition_levels.rs
@@ -105,25 +105,25 @@ impl DefinitionLevelBuffer {
/// Split `len` levels out of `self`
pub fn split_bitmask(&mut self, len: usize) -> Bitmap {
- let builder = match &mut self.inner {
+ let old_builder = match &mut self.inner {
BufferInner::Full { nulls, .. } => nulls,
BufferInner::Mask { nulls } => nulls,
};
- let old_len = builder.len();
- let num_left_values = old_len - len;
- let new_bitmap_builder =
+ // Compute the number of values left behind
+ let num_left_values = old_builder.len() - len;
+ let mut new_builder =
BooleanBufferBuilder::new(MIN_BATCH_SIZE.max(num_left_values));
- let old_bitmap = std::mem::replace(builder,
new_bitmap_builder).finish();
- let old_bitmap = Bitmap::from(old_bitmap);
+ // Copy across remaining values
+ new_builder.append_packed_range(len..old_builder.len(),
old_builder.as_slice());
- for i in len..old_len {
- builder.append(old_bitmap.is_set(i));
- }
+ // Truncate buffer
+ old_builder.resize(len);
- self.len = builder.len();
- old_bitmap
+ // Swap into self
+ self.len = new_builder.len();
+ Bitmap::from(std::mem::replace(old_builder, new_builder).finish())
}
/// Returns an iterator of the valid positions in `range` in descending
order
@@ -391,8 +391,11 @@ fn iter_set_bits_rev(bytes: &[u8]) -> impl Iterator<Item =
usize> + '_ {
#[cfg(test)]
mod tests {
use super::*;
+ use std::sync::Arc;
+ use crate::basic::Type as PhysicalType;
use crate::encodings::rle::RleEncoder;
+ use crate::schema::types::{ColumnDescriptor, ColumnPath, Type};
use rand::{thread_rng, Rng, RngCore};
#[test]
@@ -462,4 +465,30 @@ mod tests {
assert_eq!(actual, expected);
}
}
+
+ #[test]
+ fn test_split_off() {
+ let t = Type::primitive_type_builder("col", PhysicalType::INT32)
+ .build()
+ .unwrap();
+
+ let descriptor = Arc::new(ColumnDescriptor::new(
+ Arc::new(t),
+ 1,
+ 0,
+ ColumnPath::new(vec![]),
+ ));
+
+ let mut buffer = DefinitionLevelBuffer::new(&descriptor, true);
+ match &mut buffer.inner {
+ BufferInner::Mask { nulls } => nulls.append_n(100, false),
+ _ => unreachable!(),
+ };
+
+ let bitmap = buffer.split_bitmask(19);
+
+ // Should have split off 19 records leaving, 81 behind
+ assert_eq!(bitmap.len(), 3 * 8); // Note: bitmask only tracks bytes
not bits
+ assert_eq!(buffer.nulls().len(), 81);
+ }
}