This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 2588e1366 fix: partially fix consistency issue of hash functions with decimal input (#1295)
2588e1366 is described below
commit 2588e1366fbff6541853ebfb164260cd2684b7d6
Author: Zhen Wang <[email protected]>
AuthorDate: Sun Jan 19 21:51:27 2025 +0800
fix: partially fix consistency issue of hash functions with decimal input (#1295)
---
native/spark-expr/src/hash_funcs/utils.rs | 25 ++++++++++++++++++++++
.../org/apache/comet/CometExpressionSuite.scala | 17 +++++++++++++++
2 files changed, 42 insertions(+)
diff --git a/native/spark-expr/src/hash_funcs/utils.rs
b/native/spark-expr/src/hash_funcs/utils.rs
index 07ba1952d..ede89c0ba 100644
--- a/native/spark-expr/src/hash_funcs/utils.rs
+++ b/native/spark-expr/src/hash_funcs/utils.rs
@@ -104,6 +104,26 @@ macro_rules! hash_array_primitive_float {
};
}
+#[macro_export]
+macro_rules! hash_array_small_decimal {
+    ($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => {
+ let array = $column.as_any().downcast_ref::<$array_type>().unwrap();
+
+ if array.null_count() == 0 {
+ for (i, hash) in $hashes.iter_mut().enumerate() {
+            *hash = $hash_method(i64::try_from(array.value(i)).unwrap().to_le_bytes(), *hash);
+        }
+ } else {
+ for (i, hash) in $hashes.iter_mut().enumerate() {
+ if !array.is_null(i) {
+                *hash =
+                    $hash_method(i64::try_from(array.value(i)).unwrap().to_le_bytes(), *hash);
+            }
+        }
+ }
+ };
+}
+
#[macro_export]
macro_rules! hash_array_decimal {
    ($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => {
@@ -274,6 +294,11 @@ macro_rules! create_hashes_internal {
DataType::FixedSizeBinary(_) => {
$crate::hash_array!(FixedSizeBinaryArray, col,
$hashes_buffer, $hash_method);
}
+                // Apache Spark: if it's a small decimal, i.e. precision <= 18, turn it into long and hash it.
+ // Else, turn it into bytes and hash it.
+ DataType::Decimal128(precision, _) if *precision <= 18 => {
+                    $crate::hash_array_small_decimal!(Decimal128Array, col, $hashes_buffer, $hash_method);
+ }
DataType::Decimal128(_, _) => {
$crate::hash_array_decimal!(Decimal128Array, col,
$hashes_buffer, $hash_method);
}
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
index 039ad1f20..058eb7a13 100644
--- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
@@ -1928,6 +1928,23 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
}
}
}
+
+ test("hash functions with decimal input") {
+ withTable("t1", "t2") {
+      // Apache Spark: if it's a small decimal, i.e. precision <= 18, turn it into long and hash it.
+      // Else, turn it into bytes and hash it.
+ sql("create table t1(c1 decimal(18, 2)) using parquet")
+ sql("insert into t1 values(1.23), (-1.23), (0.0), (null)")
+      checkSparkAnswerAndOperator("select c1, hash(c1), xxhash64(c1) from t1 order by c1")
+
+      // TODO: comet hash function is not compatible with spark for decimal with precision greater than 18.
+      // https://github.com/apache/datafusion-comet/issues/1294
+//      sql("create table t2(c1 decimal(20, 2)) using parquet")
+//      sql("insert into t2 values(1.23), (-1.23), (0.0), (null)")
+//      checkSparkAnswerAndOperator("select c1, hash(c1), xxhash64(c1) from t2 order by c1")
+ }
+ }
+
test("unary negative integer overflow test") {
def withAnsiMode(enabled: Boolean)(f: => Unit): Unit = {
withSQLConf(
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]