alamb commented on code in PR #18448:
URL: https://github.com/apache/datafusion/pull/18448#discussion_r2502900983


##########
datafusion/common/src/hash_utils.rs:
##########
@@ -366,83 +362,113 @@ fn hash_fixed_list_array(
     Ok(())
 }
 
-/// Test version of `create_hashes` that produces the same value for
-/// all hashes (to test collisions)
-///
-/// See comments on `hashes_buffer` for more details
+/// Internal helper function that hashes a single array and either initializes 
or combines
+/// the hash values in the buffer.
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_single_array(
+    array: &dyn Array,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+    rehash: bool,
+) -> Result<()> {
+    downcast_primitive_array! {
+        array => hash_array_primitive(array, random_state, hashes_buffer, 
rehash),
+        DataType::Null => hash_null(random_state, hashes_buffer, rehash),
+        DataType::Boolean => hash_array(as_boolean_array(array)?, 
random_state, hashes_buffer, rehash),
+        DataType::Utf8 => hash_array(as_string_array(array)?, random_state, 
hashes_buffer, rehash),
+        DataType::Utf8View => hash_array(as_string_view_array(array)?, 
random_state, hashes_buffer, rehash),
+        DataType::LargeUtf8 => hash_array(as_largestring_array(array), 
random_state, hashes_buffer, rehash),
+        DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, 
random_state, hashes_buffer, rehash),
+        DataType::BinaryView => hash_array(as_binary_view_array(array)?, 
random_state, hashes_buffer, rehash),
+        DataType::LargeBinary => 
hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, 
rehash),
+        DataType::FixedSizeBinary(_) => {
+            let array: &FixedSizeBinaryArray = 
array.as_any().downcast_ref().unwrap();
+            hash_array(array, random_state, hashes_buffer, rehash)
+        }
+        DataType::Dictionary(_, _) => downcast_dictionary_array! {
+            array => hash_dictionary(array, random_state, hashes_buffer, 
rehash)?,
+            _ => unreachable!()
+        }
+        DataType::Struct(_) => {
+            let array = as_struct_array(array)?;
+            hash_struct_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::List(_) => {
+            let array = as_list_array(array)?;
+            hash_list_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::LargeList(_) => {
+            let array = as_large_list_array(array)?;
+            hash_list_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::Map(_, _) => {
+            let array = as_map_array(array)?;
+            hash_map_array(array, random_state, hashes_buffer)?;
+        }
+        DataType::FixedSizeList(_,_) => {
+            let array = as_fixed_size_list_array(array)?;
+            hash_fixed_list_array(array, random_state, hashes_buffer)?;
+        }
+        _ => {
+            // This is internal because we should have caught this before.
+            return _internal_err!(
+                "Unsupported data type in hasher: {}",
+                array.data_type()
+            );
+        }
+    }
+    Ok(())
+}
+
+/// Test version of `hash_single_array` that forces all hashes to collide to 
zero.
 #[cfg(feature = "force_hash_collisions")]
-pub fn create_hashes<'a>(
-    _arrays: &[ArrayRef],
+fn hash_single_array(
+    _array: &dyn Array,
     _random_state: &RandomState,
-    hashes_buffer: &'a mut Vec<u64>,
-) -> Result<&'a mut Vec<u64>> {
+    hashes_buffer: &mut [u64],
+    _rehash: bool,
+) -> Result<()> {
     for hash in hashes_buffer.iter_mut() {
         *hash = 0
     }
-    Ok(hashes_buffer)
+    Ok(())
 }
 
-/// Creates hash values for every row, based on the values in the
-/// columns.
+/// Creates hash values for every row, based on the values in the columns.
 ///
 /// The number of rows to hash is determined by `hashes_buffer.len()`.
 /// `hashes_buffer` should be pre-sized appropriately
-#[cfg(not(feature = "force_hash_collisions"))]
+///
+/// This is the same as [`create_hashes`] but accepts `&dyn Array`s instead of 
requiring
+/// `ArrayRef`s.
+pub fn create_hashes_from_arrays<'a>(

Review Comment:
   I think we can avoid this new function and instead update `create_hashes` 
with a little generics magic so it can take array references
   
   Here is a PR that shows how it works
   - https://github.com/pydantic/datafusion/pull/42



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to