This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 2b986dfd5d Deduplicate strings/binarys when building view types (#6005)
2b986dfd5d is described below

commit 2b986dfd5d2d104eb1719e608b5adc8000507f17
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jul 8 09:37:19 2024 -0400

    Deduplicate strings/binarys when building view types (#6005)
    
    * implement string view deduplication in builder
    
    * make clippy happy
    
    * Apply suggestions from code review
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * better coding style
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-array/src/array/byte_view_array.rs           |   3 +-
 .../src/builder/generic_bytes_view_builder.rs      | 110 +++++++++++++++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/arrow-array/src/array/byte_view_array.rs 
b/arrow-array/src/array/byte_view_array.rs
index dc4cbe6834..a00bf7271b 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -325,7 +325,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
     /// Use with caution as this can be an expensive operation, only use it 
when you are sure that the view
     /// array is significantly smaller than when it is originally created, 
e.g., after filtering or slicing.
     pub fn gc(&self) -> Self {
-        let mut builder = 
GenericByteViewBuilder::<T>::with_capacity(self.len());
+        let mut builder =
+            
GenericByteViewBuilder::<T>::with_capacity(self.len()).with_deduplicate_strings();
 
         for v in self.iter() {
             builder.append_option(v);
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs 
b/arrow-array/src/builder/generic_bytes_view_builder.rs
index 2bcc5a3f30..dda5535456 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -22,6 +22,8 @@ use std::sync::Arc;
 use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
 use arrow_data::ByteView;
 use arrow_schema::ArrowError;
+use hashbrown::hash_table::Entry;
+use hashbrown::HashTable;
 
 use crate::builder::ArrayBuilder;
 use crate::types::bytes::ByteArrayNativeType;
@@ -57,6 +59,9 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
     completed: Vec<Buffer>,
     in_progress: Vec<u8>,
     block_size: u32,
+    /// Some if deduplicating strings
+    /// map `<string hash> -> <index to the views>`
+    string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
     phantom: PhantomData<T>,
 }
 
@@ -74,6 +79,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
             completed: vec![],
             in_progress: vec![],
             block_size: DEFAULT_BLOCK_SIZE,
+            string_tracker: None,
             phantom: Default::default(),
         }
     }
@@ -83,6 +89,20 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
         Self { block_size, ..self }
     }
 
+    /// Deduplicate strings while building the array
+    ///
+    /// This will potentially decrease the memory usage if the array have 
repeated strings
+    /// It will also increase the time to build the array as it needs to hash 
the strings
+    pub fn with_deduplicate_strings(self) -> Self {
+        Self {
+            string_tracker: Some((
+                HashTable::with_capacity(self.views_builder.capacity()),
+                Default::default(),
+            )),
+            ..self
+        }
+    }
+
     /// Append a new data block returning the new block offset
     ///
     /// Note: this will first flush any in-progress block
@@ -179,6 +199,26 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
         self.completed.push(block);
     }
 
+    /// Returns the value at the given index
+    /// Useful if we want to know what value has been inserted to the builder
+    fn get_value(&self, index: usize) -> &[u8] {
+        let view = self.views_builder.as_slice().get(index).unwrap();
+        let len = *view as u32;
+        if len <= 12 {
+            // # Safety
+            // The view is valid from the builder
+            unsafe { GenericByteViewArray::<T>::inline_value(view, len as 
usize) }
+        } else {
+            let view = ByteView::from(*view);
+            if view.buffer_index < self.completed.len() as u32 {
+                let block = &self.completed[view.buffer_index as usize];
+                &block[view.offset as usize..view.offset as usize + 
view.length as usize]
+            } else {
+                &self.in_progress[view.offset as usize..view.offset as usize + 
view.length as usize]
+            }
+        }
+    }
+
     /// Appends a value into the builder
     ///
     /// # Panics
@@ -199,6 +239,40 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
             return;
         }
 
+        // Deduplication if:
+        // (1) deduplication is enabled.
+        // (2) len > 12
+        if let Some((mut ht, hasher)) = self.string_tracker.take() {
+            let hash_val = hasher.hash_one(v);
+            let hasher_fn = |v: &_| hasher.hash_one(v);
+
+            let entry = ht.entry(
+                hash_val,
+                |idx| {
+                    let stored_value = self.get_value(*idx);
+                    v == stored_value
+                },
+                hasher_fn,
+            );
+            match entry {
+                Entry::Occupied(occupied) => {
+                    // If the string already exists, we will directly use the 
view
+                    let idx = occupied.get();
+                    self.views_builder
+                        .append(self.views_builder.as_slice()[*idx]);
+                    self.null_buffer_builder.append_non_null();
+                    self.string_tracker = Some((ht, hasher));
+                    return;
+                }
+                Entry::Vacant(vacant) => {
+                    // o.w. we insert the (string hash -> view index)
+                    // the idx is current length of views_builder, as we are 
inserting a new view
+                    vacant.insert(self.views_builder.len());
+                }
+            }
+            self.string_tracker = Some((ht, hasher));
+        }
+
         let required_cap = self.in_progress.len() + v.len();
         if self.in_progress.capacity() < required_cap {
             self.flush_in_progress();
@@ -357,6 +431,42 @@ mod tests {
     use super::*;
     use crate::Array;
 
+    #[test]
+    fn test_string_view_deduplicate() {
+        let value_1 = "long string to test string view";
+        let value_2 = "not so similar string but long";
+
+        let mut builder = StringViewBuilder::new()
+            .with_deduplicate_strings()
+            .with_block_size(value_1.len() as u32 * 2); // so that we will 
have multiple buffers
+
+        let values = vec![
+            Some(value_1),
+            Some(value_2),
+            Some("short"),
+            Some(value_1),
+            None,
+            Some(value_2),
+            Some(value_1),
+        ];
+        builder.extend(values.clone());
+
+        let array = builder.finish_cloned();
+        array.to_data().validate_full().unwrap();
+        assert_eq!(array.data_buffers().len(), 1); // without duplication we 
would need 3 buffers.
+        let actual: Vec<_> = array.iter().collect();
+        assert_eq!(actual, values);
+
+        let view0 = array.views().first().unwrap();
+        let view3 = array.views().get(3).unwrap();
+        let view6 = array.views().get(6).unwrap();
+
+        assert_eq!(view0, view3);
+        assert_eq!(view0, view6);
+
+        assert_eq!(array.views().get(1), array.views().get(5));
+    }
+
     #[test]
     fn test_string_view() {
         let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81");

Reply via email to