This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 956fe76731b Add simple GC for view array types (#5885)
956fe76731b is described below
commit 956fe76731b80f62559e6290cfe6fb360794c3ce
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Fri Jun 14 14:47:15 2024 -0400
Add simple GC for view array types (#5885)
* feat: support gc for view_arrays
part 1: implement checker to check if current buffer is compact
Signed-off-by: 蔡略 <[email protected]>
* feat: add gc to view-arrays
part2: implement actual gc algorithm
Signed-off-by: 蔡略 <[email protected]>
* patch: add more tests
Signed-off-by: 蔡略 <[email protected]>
* patch: improve robustness
Signed-off-by: 蔡略 <[email protected]>
* simplify gc
* make fmt happy
* Update arrow-array/src/array/byte_view_array.rs
Co-authored-by: Andrew Lamb <[email protected]>
* improve testing
* check in a benchmark
* actually have multiple buffers in test
---------
Signed-off-by: 蔡略 <[email protected]>
Co-authored-by: 蔡略 <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/Cargo.toml | 4 ++
arrow-array/benches/gc_view_types.rs | 48 ++++++++++++++++++
arrow-array/src/array/byte_view_array.rs | 85 ++++++++++++++++++++++++++++++++
3 files changed, 137 insertions(+)
diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml
index b00d2c88e1a..a8dbeded9ce 100644
--- a/arrow-array/Cargo.toml
+++ b/arrow-array/Cargo.toml
@@ -62,3 +62,7 @@ criterion = { version = "0.5", default-features = false }
[[bench]]
name = "occupancy"
harness = false
+
+[[bench]]
+name = "gc_view_types"
+harness = false
diff --git a/arrow-array/benches/gc_view_types.rs
b/arrow-array/benches/gc_view_types.rs
new file mode 100644
index 00000000000..4b74a8f60b0
--- /dev/null
+++ b/arrow-array/benches/gc_view_types.rs
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::StringViewArray;
+use criterion::*;
+
+fn gen_view_array(size: usize) -> StringViewArray {
+ StringViewArray::from_iter((0..size).map(|v| match v % 3 {
+ 0 => Some("small"),
+ 1 => Some("larger than 12 bytes array"),
+ 2 => None,
+ _ => unreachable!("unreachable"),
+ }))
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+ let array = gen_view_array(100_000);
+
+ c.bench_function("gc view types all", |b| {
+ b.iter(|| {
+ black_box(array.gc());
+ });
+ });
+
+ let sliced = array.slice(0, 100_000 / 2);
+ c.bench_function("gc view types slice half", |b| {
+ b.iter(|| {
+ black_box(sliced.gc());
+ });
+ });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/arrow-array/src/array/byte_view_array.rs
b/arrow-array/src/array/byte_view_array.rs
index 79f2d47587a..88033b83cbe 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -265,6 +265,56 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
phantom: Default::default(),
}
}
+
+ /// Returns a "compacted" version of this array
+ ///
+ /// The original array will *not* be modified
+ ///
+ /// # Garbage Collection
+ ///
+ /// Before GC:
+ /// ```text
+ /// ┌──────┐
+ /// │......│
+ /// │......│
+ /// ┌────────────────────┐ ┌ ─ ─ ─ ▶ │Data1 │ Large buffer
+ /// │ View 1 │─ ─ ─ ─ │......│ with data that
+ /// ├────────────────────┤ │......│ is not referred
+ /// │ View 2 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
+ /// └────────────────────┘ │......│ View 2
+ /// │......│
+ /// 2 views, refer to │......│
+ /// small portions of a └──────┘
+ /// large buffer
+ /// ```
+ ///
+ /// After GC:
+ ///
+ /// ```text
+ /// ┌────────────────────┐ ┌─────┐ After gc, only
+ /// │ View 1 │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│ data that is
+ /// ├────────────────────┤ ┌ ─ ─ ─ ▶ │Data2│ pointed to by
+ /// │ View 2 │─ ─ ─ ─ └─────┘ the views is
+ /// └────────────────────┘ left
+ ///
+ ///
+ /// 2 views
+ /// ```
+ /// This method will compact the data buffers by recreating the view array
and only include the data
+ /// that is pointed to by the views.
+ ///
+ /// Note that it will copy the array regardless of whether the original
array is compact.
+ /// Use with caution as this can be an expensive operation, only use it
when you are sure that the view
+ /// array is significantly smaller than when it is originally created,
e.g., after filtering or slicing.
+ pub fn gc(&self) -> Self {
+ let mut builder =
GenericByteViewBuilder::<T>::with_capacity(self.len());
+
+ for v in self.iter() {
+ builder.append_option(v);
+ }
+
+ builder.finish()
+ }
}
impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
@@ -645,4 +695,39 @@ mod tests {
StringViewArray::new(views, buffers, None);
}
+
+ #[test]
+ fn test_gc() {
+ let test_data = [
+ Some("longer than 12 bytes"),
+ Some("short"),
+ Some("t"),
+ Some("longer than 12 bytes"),
+ None,
+ Some("short"),
+ ];
+
+ let array = {
+ let mut builder = StringViewBuilder::new().with_block_size(8); //
create multiple buffers
+ test_data.into_iter().for_each(|v| builder.append_option(v));
+ builder.finish()
+ };
+ assert!(array.buffers.len() > 1);
+
+ fn check_gc(to_test: &StringViewArray) {
+ let gc = to_test.gc();
+ assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());
+
+ to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
+ assert_eq!(a, b);
+ });
+ assert_eq!(to_test.len(), gc.len());
+ }
+
+ check_gc(&array);
+ check_gc(&array.slice(1, 3));
+ check_gc(&array.slice(2, 1));
+ check_gc(&array.slice(2, 2));
+ check_gc(&array.slice(3, 1));
+ }
}