alamb commented on a change in pull request #9233:
URL: https://github.com/apache/arrow/pull/9233#discussion_r575154628



##########
File path: rust/datafusion/src/physical_plan/hash_aggregate.rs
##########
@@ -398,97 +405,165 @@ fn group_aggregate_batch(
     Ok(accumulators)
 }
 
-/// Create a key `Vec<u8>` that is used as key for the hashmap
-pub(crate) fn create_key(
-    group_by_keys: &[ArrayRef],
+/// Appends a sequence of [u8] bytes for the value in `col[row]` to
+/// `vec` to be used as a key into the hash map for a dictionary type
+///
+/// Note that ideally, for dictionary encoded columns, we would be
+/// able to simply use the dictionary indices themselves (no need to
+/// look up values) or possibly simply build the hash table entirely
+/// on the dictionary indexes.
+///
+/// This approach would likely work (very) well for the common case,
+/// but it also has to handle the case where the dictionary itself
+/// is not the same across all record batches (and thus indexes in one
+/// record batch may not correspond to the same index in another)
+fn dictionary_create_key_for_col<K: ArrowDictionaryKeyType>(
+    col: &ArrayRef,
     row: usize,
     vec: &mut Vec<u8>,
 ) -> Result<()> {
-    vec.clear();
-    for col in group_by_keys {
-        match col.data_type() {
-            DataType::Boolean => {
-                let array = 
col.as_any().downcast_ref::<BooleanArray>().unwrap();
-                vec.extend_from_slice(&[array.value(row) as u8]);
-            }
-            DataType::Float32 => {
-                let array = 
col.as_any().downcast_ref::<Float32Array>().unwrap();
-                vec.extend_from_slice(&array.value(row).to_le_bytes());
-            }
-            DataType::Float64 => {
-                let array = 
col.as_any().downcast_ref::<Float64Array>().unwrap();
-                vec.extend_from_slice(&array.value(row).to_le_bytes());
-            }
-            DataType::UInt8 => {
-                let array = col.as_any().downcast_ref::<UInt8Array>().unwrap();
-                vec.extend_from_slice(&array.value(row).to_le_bytes());
-            }
-            DataType::UInt16 => {
-                let array = 
col.as_any().downcast_ref::<UInt16Array>().unwrap();
-                vec.extend_from_slice(&array.value(row).to_le_bytes());
-            }
-            DataType::UInt32 => {
-                let array = 
col.as_any().downcast_ref::<UInt32Array>().unwrap();
-                vec.extend_from_slice(&array.value(row).to_le_bytes());
-            }
-            DataType::UInt64 => {
-                let array = 
col.as_any().downcast_ref::<UInt64Array>().unwrap();
-                vec.extend_from_slice(&array.value(row).to_le_bytes());
-            }
+    let dict_col = col.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
+
+    // look up the index in the values dictionary
+    let keys_col = dict_col.keys_array();
+    let values_index = keys_col.value(row).to_usize().ok_or_else(|| {
+        DataFusionError::Internal(format!(
+            "Can not convert index to usize in dictionary of type creating 
group by value {:?}",
+            keys_col.data_type()
+        ))
+    })?;
+
+    create_key_for_col(&dict_col.values(), values_index, vec)
+}
+
+/// Appends a sequence of [u8] bytes for the value in `col[row]` to
+/// `vec` to be used as a key into the hash map
+fn create_key_for_col(col: &ArrayRef, row: usize, vec: &mut Vec<u8>) -> 
Result<()> {

Review comment:
       This PR looks larger than it really is -- all it does is lift the 
per-column treatment of arrays into its own function (so it can be called 
recursively) and then add handling for dictionary support. 
   
   So while GitHub renders the diff as a large change, it is a very small 
logic change: 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to