Dandandan commented on a change in pull request #812:
URL: https://github.com/apache/arrow-datafusion/pull/812#discussion_r682319823



##########
File path: datafusion/src/physical_plan/hash_utils.rs
##########
@@ -245,9 +249,60 @@ macro_rules! hash_array_float {
     };
 }
 
-/// Creates hash values for every row, based on the values in the columns
+/// Hash the values in a dictionary array
+fn create_hashes_dictionary<K: ArrowDictionaryKeyType>(
+    array: &ArrayRef,
+    random_state: &RandomState,
+    hashes_buffer: &mut Vec<u64>,
+    multi_col: bool,
+) -> Result<()> {
+    let dict_array = 
array.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
+
+    // Hash each dictionary value once, and then use that computed
+    // hash for each key value to avoid a potentially expensive
+    // redundant hashing for large dictionary elements (e.g. strings)
+    let dict_values = Arc::clone(dict_array.values());
+    let mut dict_hashes = vec![0; dict_values.len()];
+    create_hashes(&[dict_values], random_state, &mut dict_hashes)?;
+
+    // combine hash for each index in values
+    if multi_col {
+        for (hash, key) in 
hashes_buffer.iter_mut().zip(dict_array.keys().iter()) {
+            if let Some(key) = key {
+                let idx = key
+                    .to_usize()
+                    .ok_or_else(|| {
+                        DataFusionError::Internal(format!(
+                            "Can not convert key value {:?} to usize in 
dictionary of type {:?}",
+                            key, dict_array.data_type()
+                        ))
+                    })?;
+                *hash = combine_hashes(dict_hashes[idx], *hash)
+            } // no update for Null, consistent with other hashes
+        }
+    } else {
+        for (hash, key) in 
hashes_buffer.iter_mut().zip(dict_array.keys().iter()) {
+            if let Some(key) = key {
+                let idx = key
+                    .to_usize()
+                    .ok_or_else(|| {
+                        DataFusionError::Internal(format!(
+                            "Can not convert key value {:?} to usize in 
dictionary of type {:?}",
+                            key, dict_array.data_type()
+                        ))
+                    })?;
+                *hash = dict_hashes[idx]
+            } // no update for Null, consistent with other hashes

Review comment:
       Wondering now whether this is actually good for some edge cases, as it 
might make the hashes of values from two columns, for example `NULL,1` and 
`1,NULL`, the same regardless of order => probably better to map NULL to some 
fixed value and let it participate in hashing.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to