This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 116ae12f2a [Arrow]Configure max deduplication length for `StringView`
(#8990)
116ae12f2a is described below
commit 116ae12f2a200579025400c0f1d871d0e3fb0d47
Author: lichuang <[email protected]>
AuthorDate: Thu Dec 18 04:30:15 2025 +0800
[Arrow]Configure max deduplication length for `StringView` (#8990)
Configure max deduplication length when deduplicating strings while
building the array
# Which issue does this PR close?
Configure max deduplication length when deduplicating strings while
building the array
- Closes #7187.
# Rationale for this change
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
# What changes are included in this PR?
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
# Are these changes tested?
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
# Are there any user-facing changes?
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
---
.../src/builder/generic_bytes_view_builder.rs | 125 ++++++++++++++++-----
1 file changed, 98 insertions(+), 27 deletions(-)
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs
b/arrow-array/src/builder/generic_bytes_view_builder.rs
index 35f5684cc7..d7ff7ce051 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
/// Some if deduplicating strings
/// map `<string hash> -> <index to the views>`
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
+ max_deduplication_len: Option<u32>,
phantom: PhantomData<T>,
}
@@ -107,10 +108,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
current_size: STARTING_BLOCK_SIZE,
},
string_tracker: None,
+ max_deduplication_len: None,
phantom: Default::default(),
}
}
+ /// Configure max deduplication length when deduplicating strings while
building the array.
+ /// Default is None.
+ ///
+ /// When [`Self::with_deduplicate_strings`] is enabled, the builder
attempts to deduplicate
+ /// any strings longer than 12 bytes. However, since it takes time
proportional to the length
+ /// of the string to deduplicate, setting this option limits the CPU
overhead for this option.
+ pub fn with_max_deduplication_len(self, max_deduplication_len: u32) ->
Self {
+ debug_assert!(
+ max_deduplication_len > 0,
+ "max_deduplication_len must be greater than 0"
+ );
+ Self {
+ max_deduplication_len: Some(max_deduplication_len),
+ ..self
+ }
+ }
+
/// Set a fixed buffer size for variable length strings
///
/// The block size is the size of the buffer used to store values greater
@@ -334,35 +353,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
// Deduplication if:
// (1) deduplication is enabled.
- // (2) len > 12
- if let Some((mut ht, hasher)) = self.string_tracker.take() {
- let hash_val = hasher.hash_one(v);
- let hasher_fn = |v: &_| hasher.hash_one(v);
-
- let entry = ht.entry(
- hash_val,
- |idx| {
- let stored_value = self.get_value(*idx);
- v == stored_value
- },
- hasher_fn,
- );
- match entry {
- Entry::Occupied(occupied) => {
- // If the string already exists, we will directly use the
view
- let idx = occupied.get();
- self.views_buffer.push(self.views_buffer[*idx]);
- self.null_buffer_builder.append_non_null();
- self.string_tracker = Some((ht, hasher));
- return Ok(());
- }
- Entry::Vacant(vacant) => {
- // o.w. we insert the (string hash -> view index)
- // the idx is current length of views_builder, as we are
inserting a new view
- vacant.insert(self.views_buffer.len());
+ // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
+ let can_deduplicate = self.string_tracker.is_some()
+ && self
+ .max_deduplication_len
+ .map(|max_length| length <= max_length)
+ .unwrap_or(true);
+ if can_deduplicate {
+ if let Some((mut ht, hasher)) = self.string_tracker.take() {
+ let hash_val = hasher.hash_one(v);
+ let hasher_fn = |v: &_| hasher.hash_one(v);
+
+ let entry = ht.entry(
+ hash_val,
+ |idx| {
+ let stored_value = self.get_value(*idx);
+ v == stored_value
+ },
+ hasher_fn,
+ );
+ match entry {
+ Entry::Occupied(occupied) => {
+ // If the string already exists, we will directly use
the view
+ let idx = occupied.get();
+ self.views_buffer.push(self.views_buffer[*idx]);
+ self.null_buffer_builder.append_non_null();
+ self.string_tracker = Some((ht, hasher));
+ return Ok(());
+ }
+ Entry::Vacant(vacant) => {
+ // o.w. we insert the (string hash -> view index)
+ // the idx is current length of views_builder, as we
are inserting a new view
+ vacant.insert(self.views_buffer.len());
+ }
}
+ self.string_tracker = Some((ht, hasher));
}
- self.string_tracker = Some((ht, hasher));
}
let required_cap = self.in_progress.len() + v.len();
@@ -636,8 +662,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32)
-> u128 {
mod tests {
use core::str;
+ use arrow_buffer::ArrowNativeType;
+
use super::*;
+ #[test]
+ fn test_string_max_deduplication_len() {
+ let value_1 = "short";
+ let value_2 = "not so similar string but long";
+ let value_3 = "1234567890123";
+
+ let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;
+
+ let mut builder = StringViewBuilder::new()
+ .with_deduplicate_strings()
+ .with_max_deduplication_len(max_deduplication_len);
+
+ assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
+ assert!(value_2.len() > max_deduplication_len.as_usize());
+ assert!(
+ value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
+ && value_3.len() < max_deduplication_len.as_usize()
+ );
+
+ // append value1 (short), expect it is inlined and not deduplicated
+ builder.append_value(value_1); // view 0
+ builder.append_value(value_1); // view 1
+ // append value2, expect second copy is not deduplicated as it exceeds
max_deduplication_len
+ builder.append_value(value_2); // view 2
+ builder.append_value(value_2); // view 3
+ // append value3, expect second copy is deduplicated
+ builder.append_value(value_3); // view 4
+ builder.append_value(value_3); // view 5
+
+ let array = builder.finish();
+
+ // verify
+ let v2 = ByteView::from(array.views()[2]);
+ let v3 = ByteView::from(array.views()[3]);
+ assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer
+ assert_ne!(v2.offset, v3.offset); // different offsets --> not
deduplicated
+
+ let v4 = ByteView::from(array.views()[4]);
+ let v5 = ByteView::from(array.views()[5]);
+ assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer
+ assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated
+ }
+
#[test]
fn test_string_view_deduplicate() {
let value_1 = "long string to test string view";