alamb commented on code in PR #11517: URL: https://github.com/apache/datafusion/pull/11517#discussion_r1683489582
########## datafusion/physical-expr/src/aggregate/min_max.rs: ########## @@ -453,6 +454,14 @@ fn min_batch(values: &ArrayRef) -> Result<ScalarValue> { DataType::LargeUtf8 => { typed_min_max_batch_string!(values, LargeStringArray, LargeUtf8, min_string) } + DataType::Utf8View => { + typed_min_max_batch_string!( Review Comment: Presumably this is using the fast kernel from https://github.com/apache/arrow-rs/pull/6053 ########## datafusion/common/src/hash_utils.rs: ########## @@ -486,22 +490,57 @@ mod tests { Ok(()) } - #[test] - fn create_hashes_binary() -> Result<()> { - let byte_array = Arc::new(BinaryArray::from_vec(vec![ - &[4, 3, 2], - &[4, 3, 2], - &[1, 2, 3], - ])); + macro_rules! create_hash_binary { + ($NAME:ident, $ARRAY:ty) => { + #[cfg(not(feature = "force_hash_collisions"))] + #[test] + fn $NAME() { + let binary = [ + Some(b"short".to_byte_slice()), + None, + Some(b"long but different 12 bytes string"), + Some(b"short2"), + Some(b"Longer than 12 bytes string"), + Some(b"short"), + Some(b"Longer than 12 bytes string"), + ]; + + let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>()); + let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>()); + + let random_state = RandomState::with_seeds(0, 0, 0, 0); + + let mut binary_hashes = vec![0; binary.len()]; + create_hashes(&[binary_array], &random_state, &mut binary_hashes) + .unwrap(); + + let mut ref_hashes = vec![0; binary.len()]; + create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap(); + + // Null values result in a zero hash, + for (val, hash) in binary.iter().zip(binary_hashes.iter()) { + match val { + Some(_) => assert_ne!(*hash, 0), + None => assert_eq!(*hash, 0), + } + } - let random_state = RandomState::with_seeds(0, 0, 0, 0); - let hashes_buff = &mut vec![0; byte_array.len()]; - let hashes = create_hashes(&[byte_array], &random_state, hashes_buff)?; - assert_eq!(hashes.len(), 3,); + // same logical values should hash to the same hash value + assert_eq!(binary_hashes, ref_hashes); - Ok(()) + // Same values should map to same hash values + assert_eq!(binary[0], binary[5]); Review Comment: 👍 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org