Dandandan commented on code in PR #6913:
URL: https://github.com/apache/arrow-datafusion/pull/6913#discussion_r1259506459


##########
datafusion/core/src/physical_plan/joins/hash_join_utils.rs:
##########
@@ -87,28 +87,77 @@ use datafusion_common::Result;
 // | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices 
values 4,3,1)
 // ---------------------
 
-// TODO: speed up collision checks
-// https://github.com/apache/arrow-datafusion/issues/50
+const MIN_JOIN_HASH_MAP_LEN: usize = 1024;
+
 pub struct JoinHashMap {
-    // Stores hash value to first index
-    pub map: RawTable<(u64, u64)>,
+    // Stores the first index for a bucket
+    buckets: Vec<u64>,
     // Stores indices in chained list data structure
     pub next: Vec<u64>,
+    // Stores the bucket mask for quickly finding a bucket for a hash value
+    bucket_mask: usize,
 }
 
-/// SymmetricJoinHashMap is similar to JoinHashMap, except that it stores the 
indices inline, allowing it to mutate
-/// and shrink the indices.
-pub struct SymmetricJoinHashMap(pub RawTable<(u64, SmallVec<[u64; 1]>)>);
-
 impl JoinHashMap {
     pub(crate) fn with_capacity(capacity: usize) -> Self {
+        let mut bucket_capacity = capacity.next_power_of_two();
+        if bucket_capacity < MIN_JOIN_HASH_MAP_LEN {
+            bucket_capacity = MIN_JOIN_HASH_MAP_LEN;
+        }
+        let bucket_mask = bucket_capacity - 1;
         JoinHashMap {
-            map: RawTable::with_capacity(capacity),
+            buckets: vec![0; bucket_capacity],
             next: vec![0; capacity],
+            bucket_mask,
         }
     }
+
+    /// Only used for testing
+    pub(crate) fn with_bucket_capacity(bucket_capacity: usize, capacity: 
usize) -> Self {
+        assert!(bucket_capacity > 0);
+        let bucket_capacity = bucket_capacity.next_power_of_two();
+        let bucket_mask = bucket_capacity - 1;
+        JoinHashMap {
+            buckets: vec![0; bucket_capacity],
+            next: vec![0; capacity],
+            bucket_mask,
+        }
+    }
+
+    #[inline]
+    pub(crate) fn insert(&mut self, hash_value: u64, index: usize) {
+        let bucket_index = self.bucket_mask & (hash_value as usize);
+        // We are sure the `bucket_index` is legal
+        unsafe {
+            let index_ref = self.buckets.get_unchecked_mut(bucket_index);

Review Comment:
   I think we can limit the unsafe to:
   ```suggestion
               let index_ref = unsafe 
{self.buckets.get_unchecked_mut(bucket_index); }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to