This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 713bd6d  Truncate bitmask on split (#1183)
713bd6d is described below

commit 713bd6dca996417f1deccc5b130eeb4e53c8d89a
Author: Raphael Taylor-Davies <1781103+tustv...@users.noreply.github.com>
AuthorDate: Mon Jan 17 15:51:21 2022 +0000

    Truncate bitmask on split (#1183)
    
    * Truncate bitmask on split
    
    * Fix BooleanBufferBuilder::resize
    
    * Format
---
 arrow/src/array/builder.rs                         | 32 ++++++++++++++
 .../src/arrow/record_reader/definition_levels.rs   | 51 +++++++++++++++++-----
 2 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs
index 85c013c..18deac3 100644
--- a/arrow/src/array/builder.rs
+++ b/arrow/src/array/builder.rs
@@ -367,6 +367,15 @@ impl BooleanBufferBuilder {
         }
     }
 
+    /// Resizes the buffer, either truncating its contents (with no change in 
capacity), or
+    /// growing it (potentially reallocating it) and writing `false` in the 
newly available bits.
+    #[inline]
+    pub fn resize(&mut self, len: usize) {
+        let len_bytes = bit_util::ceil(len, 8);
+        self.buffer.resize(len_bytes, 0);
+        self.len = len;
+    }
+
     #[inline]
     pub fn append(&mut self, v: bool) {
         self.advance(1);
@@ -2932,6 +2941,29 @@ mod tests {
     }
 
     #[test]
+    fn test_boolean_array_builder_resize() {
+        let mut builder = BooleanBufferBuilder::new(20);
+        builder.append_n(4, true);
+        builder.append_n(7, false);
+        builder.append_n(2, true);
+        builder.resize(20);
+
+        assert_eq!(builder.len, 20);
+        assert_eq!(
+            builder.buffer.as_slice(),
+            &[0b00001111, 0b00011000, 0b00000000]
+        );
+
+        builder.resize(5);
+        assert_eq!(builder.len, 5);
+        assert_eq!(builder.buffer.as_slice(), &[0b00001111]);
+
+        builder.append_n(4, true);
+        assert_eq!(builder.len, 9);
+        assert_eq!(builder.buffer.as_slice(), &[0b11101111, 0b00000001]);
+    }
+
+    #[test]
     fn test_boolean_builder_increases_buffer_len() {
         // 00000010 01001000
         let buf = Buffer::from([72_u8, 2_u8]);
diff --git a/parquet/src/arrow/record_reader/definition_levels.rs 
b/parquet/src/arrow/record_reader/definition_levels.rs
index c38505a..750e118 100644
--- a/parquet/src/arrow/record_reader/definition_levels.rs
+++ b/parquet/src/arrow/record_reader/definition_levels.rs
@@ -105,25 +105,25 @@ impl DefinitionLevelBuffer {
 
     /// Split `len` levels out of `self`
     pub fn split_bitmask(&mut self, len: usize) -> Bitmap {
-        let builder = match &mut self.inner {
+        let old_builder = match &mut self.inner {
             BufferInner::Full { nulls, .. } => nulls,
             BufferInner::Mask { nulls } => nulls,
         };
 
-        let old_len = builder.len();
-        let num_left_values = old_len - len;
-        let new_bitmap_builder =
+        // Compute the number of values left behind
+        let num_left_values = old_builder.len() - len;
+        let mut new_builder =
             BooleanBufferBuilder::new(MIN_BATCH_SIZE.max(num_left_values));
 
-        let old_bitmap = std::mem::replace(builder, 
new_bitmap_builder).finish();
-        let old_bitmap = Bitmap::from(old_bitmap);
+        // Copy across remaining values
+        new_builder.append_packed_range(len..old_builder.len(), 
old_builder.as_slice());
 
-        for i in len..old_len {
-            builder.append(old_bitmap.is_set(i));
-        }
+        // Truncate buffer
+        old_builder.resize(len);
 
-        self.len = builder.len();
-        old_bitmap
+        // Swap into self
+        self.len = new_builder.len();
+        Bitmap::from(std::mem::replace(old_builder, new_builder).finish())
     }
 
     /// Returns an iterator of the valid positions in `range` in descending 
order
@@ -391,8 +391,11 @@ fn iter_set_bits_rev(bytes: &[u8]) -> impl Iterator<Item = 
usize> + '_ {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::sync::Arc;
 
+    use crate::basic::Type as PhysicalType;
     use crate::encodings::rle::RleEncoder;
+    use crate::schema::types::{ColumnDescriptor, ColumnPath, Type};
     use rand::{thread_rng, Rng, RngCore};
 
     #[test]
@@ -462,4 +465,30 @@ mod tests {
             assert_eq!(actual, expected);
         }
     }
+
+    #[test]
+    fn test_split_off() {
+        let t = Type::primitive_type_builder("col", PhysicalType::INT32)
+            .build()
+            .unwrap();
+
+        let descriptor = Arc::new(ColumnDescriptor::new(
+            Arc::new(t),
+            1,
+            0,
+            ColumnPath::new(vec![]),
+        ));
+
+        let mut buffer = DefinitionLevelBuffer::new(&descriptor, true);
+        match &mut buffer.inner {
+            BufferInner::Mask { nulls } => nulls.append_n(100, false),
+            _ => unreachable!(),
+        };
+
+        let bitmap = buffer.split_bitmask(19);
+
+        // Should have split off 19 records leaving, 81 behind
+        assert_eq!(bitmap.len(), 3 * 8); // Note: bitmask only tracks bytes 
not bits
+        assert_eq!(buffer.nulls().len(), 81);
+    }
 }

Reply via email to