jonathanc-n commented on code in PR #20179:
URL: https://github.com/apache/datafusion/pull/20179#discussion_r2777827880


##########
datafusion/common/src/hash_utils.rs:
##########
@@ -630,6 +679,69 @@ fn hash_union_array(
     Ok(())
 }
 
+/// Hash a sparse union array.
+/// Sparse unions have child arrays with the same length as the union array.
+/// For 3+ types, we optimize by only hashing the N elements that are actually 
used
+/// (via take/scatter), instead of hashing all K*N elements.
+///
+/// For 1-2 types, the overhead of take/scatter outweighs the benefit, so we 
use
+/// the default approach of hashing all children (same as dense unions).
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_sparse_union_array(
+    array: &UnionArray,
+    union_fields: &UnionFields,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()> {
+    use std::collections::HashMap;
+
+    // For 1-2 types, the take/scatter overhead isn't worth it.
+    // Fall back to the default approach (same as dense union).
+    if union_fields.len() <= 2 {
+        return hash_union_array_default(
+            array,
+            union_fields,
+            random_state,
+            hashes_buffer,
+        );
+    }
+
+    let type_ids = array.type_ids();
+
+    // Group indices by type_id
+    let mut indices_by_type: HashMap<i8, Vec<u32>> = HashMap::new();
+    for (i, &type_id) in type_ids.iter().enumerate() {
+        indices_by_type.entry(type_id).or_default().push(i as u32);
+    }
+
+    // For each type, extract only the needed elements, hash them, and scatter 
back
+    for (type_id, _field) in union_fields.iter() {
+        if let Some(indices) = indices_by_type.get(&type_id) {
+            if indices.is_empty() {
+                continue;
+            }
+
+            let child = array.child(type_id);
+            let indices_array = UInt32Array::from(indices.clone());
+
+            // Extract only the elements we need using take()
+            let filtered = take(child.as_ref(), &indices_array, None)?;

Review Comment:
   This is a bit awkward to test due to the run array. I think they're good 
ideas, I'll open an issue for it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to