Dandandan commented on a change in pull request #808:
URL: https://github.com/apache/arrow-datafusion/pull/808#discussion_r683752478



##########
File path: datafusion/src/physical_plan/hash_aggregate.rs
##########
@@ -363,55 +348,74 @@ fn group_aggregate_batch(
 
     let mut group_by_values = group_by_values.into_boxed_slice();
 
-    let mut key = Vec::with_capacity(group_values.len());
-
     // 1.1 construct the key from the group values
     // 1.2 construct the mapping key if it does not exist
     // 1.3 add the row' index to `indices`
 
     // Make sure we can create the accumulators or otherwise return an error
     
create_accumulators(aggr_expr).map_err(DataFusionError::into_arrow_external_error)?;
 
-    // Keys received in this batch
-    let mut batch_keys = vec![];
+    // track which entries in `accumulators` have rows in this batch to 
aggregate
+    let mut groups_with_rows = vec![];
 
-    for row in 0..batch.num_rows() {
-        // 1.1
-        create_key(&group_values, row, &mut key)
-            .map_err(DataFusionError::into_arrow_external_error)?;
+    // 1.1 Calculate the group keys for the group values
+    let mut batch_hashes = vec![0; batch.num_rows()];
+    create_hashes(&group_values, random_state, &mut batch_hashes)?;
 
-        accumulators
-            .raw_entry_mut()
-            .from_key(&key)
-            // 1.3
-            .and_modify(|_, (_, _, v)| {
-                if v.is_empty() {
-                    batch_keys.push(key.clone())
+    for (row, hash) in batch_hashes.into_iter().enumerate() {
+        let Accumulators { map, group_states } = &mut accumulators;
+
+        let entry = map.get_mut(hash, |(_hash, group_idx)| {
+            // verify that a group that we are inserting with hash is

Review comment:
       I wonder whether it might be faster to also check for the hashes being 
equal. `get_mut` returns a potential match based on the values being in the 
same bucket (which might have a different hash).
   Checking that first before checking the actual values might save some time 
if there is a higher percentage of collisions, and the check itself is 
relatively slow.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to