This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new e358c16 fix: Remove wrong calculation for Murmur3Hash for float with
null input (#245)
e358c16 is described below
commit e358c161d1ce50f11f6e18db2494a1f8b3cb2719
Author: advancedxy <[email protected]>
AuthorDate: Mon Apr 8 02:05:52 2024 +0800
fix: Remove wrong calculation for Murmur3Hash for float with null input
(#245)
* fix: Remove extra & wrong calculation for Murmur3Hash with float with
null input
* chore: address comments
* address comments
---
core/src/execution/datafusion/spark_hash.rs | 206 +++++++++++++++++-----------
1 file changed, 126 insertions(+), 80 deletions(-)
diff --git a/core/src/execution/datafusion/spark_hash.rs
b/core/src/execution/datafusion/spark_hash.rs
index 1d8d1f2..aa4269d 100644
--- a/core/src/execution/datafusion/spark_hash.rs
+++ b/core/src/execution/datafusion/spark_hash.rs
@@ -165,7 +165,6 @@ macro_rules! hash_array_primitive_float {
} else {
*hash = spark_compatible_murmur3_hash((*value as
$ty).to_le_bytes(), *hash);
}
- *hash = spark_compatible_murmur3_hash((*value as
$ty).to_le_bytes(), *hash);
}
}
}
@@ -364,107 +363,154 @@ mod tests {
use crate::execution::datafusion::spark_hash::{create_hashes, pmod};
use datafusion::arrow::array::{ArrayRef, Int32Array, Int64Array,
Int8Array, StringArray};
+ macro_rules! test_hashes {
+ ($ty:ty, $values:expr, $expected:expr) => {
+ let i = Arc::new(<$ty>::from($values)) as ArrayRef;
+ let mut hashes = vec![42; $values.len()];
+ create_hashes(&[i], &mut hashes).unwrap();
+ assert_eq!(hashes, $expected);
+ };
+ }
+
#[test]
fn test_i8() {
- let i = Arc::new(Int8Array::from(vec![
- Some(1),
- Some(0),
- Some(-1),
- Some(i8::MAX),
- Some(i8::MIN),
- ])) as ArrayRef;
- let mut hashes = vec![42; 5];
- create_hashes(&[i], &mut hashes).unwrap();
-
- // generated with Spark Murmur3_x86_32
- let expected = vec![0xdea578e3, 0x379fae8f, 0xa0590e3d, 0x43b4d8ed,
0x422a1365];
- assert_eq!(hashes, expected);
+ test_hashes!(
+ Int8Array,
+ vec![Some(1), Some(0), Some(-1), Some(i8::MAX), Some(i8::MIN)],
+ vec![0xdea578e3, 0x379fae8f, 0xa0590e3d, 0x43b4d8ed, 0x422a1365]
+ );
+ // with null input
+ test_hashes!(
+ Int8Array,
+ vec![Some(1), None, Some(-1), Some(i8::MAX), Some(i8::MIN)],
+ vec![0xdea578e3, 42, 0xa0590e3d, 0x43b4d8ed, 0x422a1365]
+ );
}
#[test]
fn test_i32() {
- let i = Arc::new(Int32Array::from(vec![
- Some(1),
- Some(0),
- Some(-1),
- Some(i32::MAX),
- Some(i32::MIN),
- ])) as ArrayRef;
- let mut hashes = vec![42; 5];
- create_hashes(&[i], &mut hashes).unwrap();
-
- // generated with Spark Murmur3_x86_32
- let expected = vec![0xdea578e3, 0x379fae8f, 0xa0590e3d, 0x07fb67e7,
0x2b1f0fc6];
- assert_eq!(hashes, expected);
+ test_hashes!(
+ Int32Array,
+ vec![Some(1), Some(0), Some(-1), Some(i32::MAX), Some(i32::MIN)],
+ vec![0xdea578e3, 0x379fae8f, 0xa0590e3d, 0x07fb67e7, 0x2b1f0fc6]
+ );
+ // with null input
+ test_hashes!(
+ Int32Array,
+ vec![
+ Some(1),
+ Some(0),
+ Some(-1),
+ None,
+ Some(i32::MAX),
+ Some(i32::MIN)
+ ],
+ vec![0xdea578e3, 0x379fae8f, 0xa0590e3d, 42, 0x07fb67e7,
0x2b1f0fc6]
+ );
}
#[test]
fn test_i64() {
- let i = Arc::new(Int64Array::from(vec![
- Some(1),
- Some(0),
- Some(-1),
- Some(i64::MAX),
- Some(i64::MIN),
- ])) as ArrayRef;
- let mut hashes = vec![42; 5];
- create_hashes(&[i], &mut hashes).unwrap();
-
- // generated with Spark Murmur3_x86_32
- let expected = vec![0x99f0149d, 0x9c67b85d, 0xc8008529, 0xa05b5d7b,
0xcd1e64fb];
- assert_eq!(hashes, expected);
+ test_hashes!(
+ Int64Array,
+ vec![Some(1), Some(0), Some(-1), Some(i64::MAX), Some(i64::MIN)],
+ vec![0x99f0149d, 0x9c67b85d, 0xc8008529, 0xa05b5d7b, 0xcd1e64fb]
+ );
+ // with null input
+ test_hashes!(
+ Int64Array,
+ vec![
+ Some(1),
+ Some(0),
+ Some(-1),
+ None,
+ Some(i64::MAX),
+ Some(i64::MIN)
+ ],
+ vec![0x99f0149d, 0x9c67b85d, 0xc8008529, 42, 0xa05b5d7b,
0xcd1e64fb]
+ );
}
#[test]
fn test_f32() {
- let i = Arc::new(Float32Array::from(vec![
- Some(1.0),
- Some(0.0),
- Some(-0.0),
- Some(-1.0),
- Some(99999999999.99999999999),
- Some(-99999999999.99999999999),
- ])) as ArrayRef;
- let mut hashes = vec![42; 6];
- create_hashes(&[i], &mut hashes).unwrap();
-
- // generated with Spark Murmur3_x86_32
- let expected = vec![
- 0xe434cc39, 0x379fae8f, 0x379fae8f, 0xdc0da8eb, 0xcbdc340f,
0xc0361c86,
- ];
- assert_eq!(hashes, expected);
+ test_hashes!(
+ Float32Array,
+ vec![
+ Some(1.0),
+ Some(0.0),
+ Some(-0.0),
+ Some(-1.0),
+ Some(99999999999.99999999999),
+ Some(-99999999999.99999999999),
+ ],
+ vec![0xe434cc39, 0x379fae8f, 0x379fae8f, 0xdc0da8eb, 0xcbdc340f,
0xc0361c86]
+ );
+ // with null input
+ test_hashes!(
+ Float32Array,
+ vec![
+ Some(1.0),
+ Some(0.0),
+ Some(-0.0),
+ Some(-1.0),
+ None,
+ Some(99999999999.99999999999),
+ Some(-99999999999.99999999999)
+ ],
+ vec![0xe434cc39, 0x379fae8f, 0x379fae8f, 0xdc0da8eb, 42,
0xcbdc340f, 0xc0361c86]
+ );
}
#[test]
fn test_f64() {
- let i = Arc::new(Float64Array::from(vec![
- Some(1.0),
- Some(0.0),
- Some(-0.0),
- Some(-1.0),
- Some(99999999999.99999999999),
- Some(-99999999999.99999999999),
- ])) as ArrayRef;
- let mut hashes = vec![42; 6];
- create_hashes(&[i], &mut hashes).unwrap();
-
- // generated with Spark Murmur3_x86_32
- let expected = vec![
- 0xe4876492, 0x9c67b85d, 0x9c67b85d, 0x13d81357, 0xb87e1595,
0xa0eef9f9,
- ];
- assert_eq!(hashes, expected);
+ test_hashes!(
+ Float64Array,
+ vec![
+ Some(1.0),
+ Some(0.0),
+ Some(-0.0),
+ Some(-1.0),
+ Some(99999999999.99999999999),
+ Some(-99999999999.99999999999),
+ ],
+ vec![0xe4876492, 0x9c67b85d, 0x9c67b85d, 0x13d81357, 0xb87e1595,
0xa0eef9f9]
+ );
+ // with null input
+ test_hashes!(
+ Float64Array,
+ vec![
+ Some(1.0),
+ Some(0.0),
+ Some(-0.0),
+ Some(-1.0),
+ None,
+ Some(99999999999.99999999999),
+ Some(-99999999999.99999999999)
+ ],
+ vec![0xe4876492, 0x9c67b85d, 0x9c67b85d, 0x13d81357, 42,
0xb87e1595, 0xa0eef9f9]
+ );
}
#[test]
fn test_str() {
- let i = Arc::new(StringArray::from(vec!["hello", "bar", "", "😁",
"天地"]));
- let mut hashes = vec![42; 5];
- create_hashes(&[i], &mut hashes).unwrap();
-
- // generated with Murmur3Hash(Seq(Literal("")), 42).eval() since Spark
is tested against
- // this as well
- let expected = vec![3286402344, 2486176763, 142593372, 885025535,
2395000894];
- assert_eq!(hashes, expected);
+ test_hashes!(
+ StringArray,
+ vec!["hello", "bar", "", "😁", "天地"],
+ vec![3286402344, 2486176763, 142593372, 885025535, 2395000894]
+ );
+ // test with null input
+ test_hashes!(
+ StringArray,
+ vec![
+ Some("hello"),
+ Some("bar"),
+ None,
+ Some(""),
+ Some("😁"),
+ Some("天地")
+ ],
+ vec![3286402344, 2486176763, 42, 142593372, 885025535, 2395000894]
+ );
}
#[test]