This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 956fe76731b Add simple GC for view array types (#5885)
956fe76731b is described below

commit 956fe76731b80f62559e6290cfe6fb360794c3ce
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Fri Jun 14 14:47:15 2024 -0400

    Add simple GC for view array types (#5885)
    
    * feat: support gc for view_arrays
    
    part 1: implement checker to check if current buffer is compact
    
    Signed-off-by: 蔡略 <[email protected]>
    
    * feat: add gc to view-arrays
    
    part2: implement actual gc algorithm
    
    Signed-off-by: 蔡略 <[email protected]>
    
    * patch: add more tests
    
    Signed-off-by: 蔡略 <[email protected]>
    
    * patch: improve robustness
    
    Signed-off-by: 蔡略 <[email protected]>
    
    * simplify gc
    
    * make fmt happy
    
    * Update arrow-array/src/array/byte_view_array.rs
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    * improve testing
    
    * check in a benchmark
    
    * actually have multiple buffers in test
    
    ---------
    
    Signed-off-by: 蔡略 <[email protected]>
    Co-authored-by: 蔡略 <[email protected]>
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-array/Cargo.toml                   |  4 ++
 arrow-array/benches/gc_view_types.rs     | 48 ++++++++++++++++++
 arrow-array/src/array/byte_view_array.rs | 85 ++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+)

diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml
index b00d2c88e1a..a8dbeded9ce 100644
--- a/arrow-array/Cargo.toml
+++ b/arrow-array/Cargo.toml
@@ -62,3 +62,7 @@ criterion = { version = "0.5", default-features = false }
 [[bench]]
 name = "occupancy"
 harness = false
+
+[[bench]]
+name = "gc_view_types"
+harness = false
diff --git a/arrow-array/benches/gc_view_types.rs 
b/arrow-array/benches/gc_view_types.rs
new file mode 100644
index 00000000000..4b74a8f60b0
--- /dev/null
+++ b/arrow-array/benches/gc_view_types.rs
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::StringViewArray;
+use criterion::*;
+
+fn gen_view_array(size: usize) -> StringViewArray {
+    StringViewArray::from_iter((0..size).map(|v| match v % 3 {
+        0 => Some("small"),
+        1 => Some("larger than 12 bytes array"),
+        2 => None,
+        _ => unreachable!("unreachable"),
+    }))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let array = gen_view_array(100_000);
+
+    c.bench_function("gc view types all", |b| {
+        b.iter(|| {
+            black_box(array.gc());
+        });
+    });
+
+    let sliced = array.slice(0, 100_000 / 2);
+    c.bench_function("gc view types slice half", |b| {
+        b.iter(|| {
+            black_box(sliced.gc());
+        });
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/arrow-array/src/array/byte_view_array.rs 
b/arrow-array/src/array/byte_view_array.rs
index 79f2d47587a..88033b83cbe 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -265,6 +265,56 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
             phantom: Default::default(),
         }
     }
+
+    /// Returns a "compacted" version of this array
+    ///
+    /// The original array will *not* be modified
+    ///
+    /// # Garbage Collection
+    ///
+    /// Before GC:
+    /// ```text
+    ///                                        ┌──────┐                 
+    ///                                        │......│                 
+    ///                                        │......│                 
+    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer  
+    /// │       View 1       │─ ─ ─ ─          │......│  with data that
+    /// ├────────────────────┤                 │......│ is not referred
+    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
+    /// └────────────────────┘                 │......│      View 2     
+    ///                                        │......│                 
+    ///    2 views, refer to                   │......│                 
+    ///   small portions of a                  └──────┘                 
+    ///      large buffer                                               
+    /// ```
+    ///                                                                
+    /// After GC:
+    ///
+    /// ```text
+    /// ┌────────────────────┐                 ┌─────┐    After gc, only
+    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is  
+    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by  
+    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is  
+    /// └────────────────────┘                                 left      
+    ///                                                                  
+    ///                                                                  
+    ///         2 views                                                  
+    /// ```
+    /// This method will compact the data buffers by recreating the view array 
and only include the data
+    /// that is pointed to by the views.
+    ///
+    /// Note that it will copy the array regardless of whether the original 
array is compact.
+    /// Use with caution as this can be an expensive operation, only use it 
when you are sure that the view
+    /// array is significantly smaller than when it is originally created, 
e.g., after filtering or slicing.
+    pub fn gc(&self) -> Self {
+        let mut builder = 
GenericByteViewBuilder::<T>::with_capacity(self.len());
+
+        for v in self.iter() {
+            builder.append_option(v);
+        }
+
+        builder.finish()
+    }
 }
 
 impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
@@ -645,4 +695,39 @@ mod tests {
 
         StringViewArray::new(views, buffers, None);
     }
+
+    #[test]
+    fn test_gc() {
+        let test_data = [
+            Some("longer than 12 bytes"),
+            Some("short"),
+            Some("t"),
+            Some("longer than 12 bytes"),
+            None,
+            Some("short"),
+        ];
+
+        let array = {
+            let mut builder = StringViewBuilder::new().with_block_size(8); // 
create multiple buffers
+            test_data.into_iter().for_each(|v| builder.append_option(v));
+            builder.finish()
+        };
+        assert!(array.buffers.len() > 1);
+
+        fn check_gc(to_test: &StringViewArray) {
+            let gc = to_test.gc();
+            assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
+
+            to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
+                assert_eq!(a, b);
+            });
+            assert_eq!(to_test.len(), gc.len());
+        }
+
+        check_gc(&array);
+        check_gc(&array.slice(1, 3));
+        check_gc(&array.slice(2, 1));
+        check_gc(&array.slice(2, 2));
+        check_gc(&array.slice(3, 1));
+    }
 }

Reply via email to