alamb commented on code in PR #8990:
URL: https://github.com/apache/arrow-rs/pull/8990#discussion_r2624469559


##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -334,35 +353,45 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
 
         // Deduplication if:
         // (1) deduplication is enabled.
-        // (2) len > 12
-        if let Some((mut ht, hasher)) = self.string_tracker.take() {
-            let hash_val = hasher.hash_one(v);
-            let hasher_fn = |v: &_| hasher.hash_one(v);
-
-            let entry = ht.entry(
-                hash_val,
-                |idx| {
-                    let stored_value = self.get_value(*idx);
-                    v == stored_value
-                },
-                hasher_fn,
-            );
-            match entry {
-                Entry::Occupied(occupied) => {
-                    // If the string already exists, we will directly use the 
view
-                    let idx = occupied.get();
-                    self.views_buffer.push(self.views_buffer[*idx]);
-                    self.null_buffer_builder.append_non_null();
-                    self.string_tracker = Some((ht, hasher));
-                    return Ok(());
-                }
-                Entry::Vacant(vacant) => {
-                    // o.w. we insert the (string hash -> view index)
-                    // the idx is current length of views_builder, as we are 
inserting a new view
-                    vacant.insert(self.views_buffer.len());
+        // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
+        let can_deduplicate = if self.string_tracker.is_some() {
+            match self.max_deduplication_len {
+                Some(max_deduplication_len) => length <= max_deduplication_len,
+                None => true,
+            }
+        } else {
+            false
+        };

Review Comment:
   Here is another possibly clearer way to express this same logic:
   
   ```rust
           let can_deduplicate = self.string_tracker.is_some()
               && self
                   .max_deduplication_len
                   .map(|max_length| length <= max_length)
                   .unwrap_or_default();
   ```



##########
arrow-array/src/builder/generic_bytes_view_builder.rs:
##########
@@ -334,35 +353,45 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
 
         // Deduplication if:
         // (1) deduplication is enabled.
-        // (2) len > 12
-        if let Some((mut ht, hasher)) = self.string_tracker.take() {
-            let hash_val = hasher.hash_one(v);
-            let hasher_fn = |v: &_| hasher.hash_one(v);
-
-            let entry = ht.entry(
-                hash_val,
-                |idx| {
-                    let stored_value = self.get_value(*idx);
-                    v == stored_value
-                },
-                hasher_fn,
-            );
-            match entry {
-                Entry::Occupied(occupied) => {
-                    // If the string already exists, we will directly use the 
view
-                    let idx = occupied.get();
-                    self.views_buffer.push(self.views_buffer[*idx]);
-                    self.null_buffer_builder.append_non_null();
-                    self.string_tracker = Some((ht, hasher));
-                    return Ok(());
-                }
-                Entry::Vacant(vacant) => {
-                    // o.w. we insert the (string hash -> view index)
-                    // the idx is current length of views_builder, as we are 
inserting a new view
-                    vacant.insert(self.views_buffer.len());
+        // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
+        let can_deduplicate = if self.string_tracker.is_some() {
+            match self.max_deduplication_len {
+                Some(max_deduplication_len) => length <= max_deduplication_len,
+                None => true,
+            }
+        } else {
+            false
+        };
+        if can_deduplicate {

Review Comment:
   You can avoid the extra indent layer by combining the statements together I 
think:
   
   Instead of
   ```rust
           if can_deduplicate {
               if let Some((mut ht, hasher)) = self.string_tracker.take() {
   ```
   
   This:
   
   ```rust
           if can_deduplicate && let Some((mut ht, hasher)) = 
self.string_tracker.take() {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to