This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 2588e1366 fix: partially fix consistency issue of hash functions with decimal input (#1295)
2588e1366 is described below
commit 2588e1366fbff6541853ebfb164260cd2684b7d6
Author: Zhen Wang <[email protected]>
AuthorDate: Sun Jan 19 21:51:27 2025 +0800
fix: partially fix consistency issue of hash functions with decimal input (#1295)
---
native/spark-expr/src/hash_funcs/utils.rs | 25 ++++++++++++++++++++++
.../org/apache/comet/CometExpressionSuite.scala | 17 +++++++++++++++
2 files changed, 42 insertions(+)
diff --git a/native/spark-expr/src/hash_funcs/utils.rs
b/native/spark-expr/src/hash_funcs/utils.rs
index 07ba1952d..ede89c0ba 100644
--- a/native/spark-expr/src/hash_funcs/utils.rs
+++ b/native/spark-expr/src/hash_funcs/utils.rs
@@ -104,6 +104,26 @@ macro_rules! hash_array_primitive_float {
};
}
+#[macro_export]
+macro_rules! hash_array_small_decimal {
+    ($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => {
+ let array = $column.as_any().downcast_ref::<$array_type>().unwrap();
+
+ if array.null_count() == 0 {
+ for (i, hash) in $hashes.iter_mut().enumerate() {
+            *hash = $hash_method(i64::try_from(array.value(i)).unwrap().to_le_bytes(), *hash);
+        }
+ } else {
+ for (i, hash) in $hashes.iter_mut().enumerate() {
+ if !array.is_null(i) {
+                *hash =
+                    $hash_method(i64::try_from(array.value(i)).unwrap().to_le_bytes(), *hash);
+            }
+        }
+ }
+ };
+}
+
#[macro_export]
macro_rules! hash_array_decimal {
    ($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => {
@@ -274,6 +294,11 @@ macro_rules! create_hashes_internal {
DataType::FixedSizeBinary(_) => {
$crate::hash_array!(FixedSizeBinaryArray, col,
$hashes_buffer, $hash_method);
}
+                // Apache Spark: if it's a small decimal, i.e. precision <= 18, turn it into long and hash it.
+ // Else, turn it into bytes and hash it.
+ DataType::Decimal128(precision, _) if *precision <= 18 => {
+                    $crate::hash_array_small_decimal!(Decimal128Array, col, $hashes_buffer, $hash_method);
+ }
DataType::Decimal128(_, _) => {
$crate::hash_array_decimal!(Decimal128Array, col,
$hashes_buffer, $hash_method);
}
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
index 039ad1f20..058eb7a13 100644
--- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
@@ -1928,6 +1928,23 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
}
}
}
+
+ test("hash functions with decimal input") {
+ withTable("t1", "t2") {
+      // Apache Spark: if it's a small decimal, i.e. precision <= 18, turn it into long and hash it.
+      // Else, turn it into bytes and hash it.
+ sql("create table t1(c1 decimal(18, 2)) using parquet")
+ sql("insert into t1 values(1.23), (-1.23), (0.0), (null)")
+      checkSparkAnswerAndOperator("select c1, hash(c1), xxhash64(c1) from t1 order by c1")
+
+      // TODO: comet hash function is not compatible with spark for decimal with precision greater than 18.
+      // https://github.com/apache/datafusion-comet/issues/1294
+//      sql("create table t2(c1 decimal(20, 2)) using parquet")
+//      sql("insert into t2 values(1.23), (-1.23), (0.0), (null)")
+//      checkSparkAnswerAndOperator("select c1, hash(c1), xxhash64(c1) from t2 order by c1")
+ }
+ }
+
test("unary negative integer overflow test") {
def withAnsiMode(enabled: Boolean)(f: => Unit): Unit = {
withSQLConf(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]