(incubator-gluten) branch main updated: [VL] Support kurtosis aggregate function (#5151)

philo Fri, 29 Mar 2024 02:18:27 -0700

This is an automated email from the ASF dual-hosted git repository.

philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git



The following commit(s) were added to refs/heads/main by this push:
     new 3b9a3ea09 [VL] Support kurtosis aggregate function (#5151)
3b9a3ea09 is described below

commit 3b9a3ea093e64e6b4a3b208dc7f805292ecf117b
Author: Joey <[email protected]>
AuthorDate: Fri Mar 29 17:18:18 2024 +0800

    [VL] Support kurtosis aggregate function (#5151)
---
 .../io/glutenproject/utils/CHExpressionUtil.scala      |  3 ++-
 .../io/glutenproject/utils/VeloxIntermediateData.scala | 17 +++++++++--------
 .../execution/VeloxAggregateFunctionsSuite.scala       | 18 ++++++++++++++++++
 cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc   |  3 ++-
 docs/velox-backend-support-progress.md                 |  6 +++---
 .../glutenproject/expression/ExpressionMappings.scala  |  1 +
 .../test/resources/sql-tests/results/group-by.sql.out  |  2 +-
 .../utils/velox/VeloxSQLQueryTestSettings.scala        |  4 +++-
 .../test/resources/sql-tests/results/group-by.sql.out  |  2 +-
 .../sql-tests/results/udf/udf-group-by.sql.out         |  2 +-
 .../utils/velox/VeloxSQLQueryTestSettings.scala        |  4 +++-
 .../test/resources/sql-tests/results/group-by.sql.out  |  2 +-
 .../sql-tests/results/udf/udf-group-by.sql.out         |  2 +-
 .../utils/velox/VeloxSQLQueryTestSettings.scala        |  4 +++-
 .../io/glutenproject/expression/ExpressionNames.scala  |  3 ++-
 15 files changed, 51 insertions(+), 22 deletions(-)

diff --git 
a/backends-clickhouse/src/main/scala/io/glutenproject/utils/CHExpressionUtil.scala
 
b/backends-clickhouse/src/main/scala/io/glutenproject/utils/CHExpressionUtil.scala
index 6dfed9dd6..d431d0c87 100644
--- 
a/backends-clickhouse/src/main/scala/io/glutenproject/utils/CHExpressionUtil.scala
+++ 
b/backends-clickhouse/src/main/scala/io/glutenproject/utils/CHExpressionUtil.scala
@@ -180,6 +180,7 @@ object CHExpressionUtil {
     URL_DECODE -> DefaultValidator(),
     SKEWNESS -> DefaultValidator(),
     BIT_LENGTH -> DefaultValidator(),
-    MAKE_YM_INTERVAL -> DefaultValidator()
+    MAKE_YM_INTERVAL -> DefaultValidator(),
+    KURTOSIS -> DefaultValidator()
   )
 }
diff --git 
a/backends-velox/src/main/scala/io/glutenproject/utils/VeloxIntermediateData.scala
 
b/backends-velox/src/main/scala/io/glutenproject/utils/VeloxIntermediateData.scala
index 773fedfe9..0c3508b5d 100644
--- 
a/backends-velox/src/main/scala/io/glutenproject/utils/VeloxIntermediateData.scala
+++ 
b/backends-velox/src/main/scala/io/glutenproject/utils/VeloxIntermediateData.scala
@@ -31,8 +31,9 @@ object VeloxIntermediateData {
     Seq("ck", "n", "xMk", "yMk", "xAvg", "yAvg")
   // CovPopulation, CovSample
   private val veloxCovarIntermediateDataOrder: Seq[String] = Seq("ck", "n", 
"xAvg", "yAvg")
-  // Skewness
-  private val veloxSkewnessIntermediateDataOrder: Seq[String] = Seq("n", 
"avg", "m2", "m3", "m4")
+  // Skewness, Kurtosis
+  private val veloxCentralMomentAggIntermediateDataOrder: Seq[String] =
+    Seq("n", "avg", "m2", "m3", "m4")
 
   // Agg functions with inconsistent types of intermediate data between Velox 
and Spark.
   // StddevSamp, StddevPop, VarianceSamp, VariancePop
@@ -43,8 +44,8 @@ object VeloxIntermediateData {
   // Corr
   private val veloxCorrIntermediateTypes: Seq[DataType] =
     Seq(DoubleType, LongType, DoubleType, DoubleType, DoubleType, DoubleType)
-  // Skewness
-  private val veloxSkewnessIntermediateTypes: Seq[DataType] =
+  // Skewness, Kurtosis
+  private val veloxCentralMomentAggIntermediateTypes: Seq[DataType] =
     Seq(LongType, DoubleType, DoubleType, DoubleType, DoubleType)
 
   /**
@@ -62,8 +63,8 @@ object VeloxIntermediateData {
         veloxCorrIntermediateDataOrder
       case _: CovPopulation | _: CovSample =>
         veloxCovarIntermediateDataOrder
-      case _: Skewness =>
-        veloxSkewnessIntermediateDataOrder
+      case _: Skewness | _: Kurtosis =>
+        veloxCentralMomentAggIntermediateDataOrder
       case _ =>
         aggFunc.aggBufferAttributes.map(_.name)
     }
@@ -143,8 +144,8 @@ object VeloxIntermediateData {
           Some(veloxCovarIntermediateTypes)
         case _: StddevSamp | _: StddevPop | _: VarianceSamp | _: VariancePop =>
           Some(veloxVarianceIntermediateTypes)
-        case _: Skewness =>
-          Some(veloxSkewnessIntermediateTypes)
+        case _: Skewness | _: Kurtosis =>
+          Some(veloxCentralMomentAggIntermediateTypes)
         case _ if aggFunc.aggBufferAttributes.size > 1 =>
           Some(aggFunc.aggBufferAttributes.map(_.dataType))
         case _ => None
diff --git 
a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxAggregateFunctionsSuite.scala
 
b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxAggregateFunctionsSuite.scala
index c306f4585..abeadaadf 100644
--- 
a/backends-velox/src/test/scala/io/glutenproject/execution/VeloxAggregateFunctionsSuite.scala
+++ 
b/backends-velox/src/test/scala/io/glutenproject/execution/VeloxAggregateFunctionsSuite.scala
@@ -901,6 +901,24 @@ abstract class VeloxAggregateFunctionsSuite extends 
VeloxWholeStageTransformerSu
         }
     }
   }
+
+  test("kurtosis") {
+    runQueryAndCompare("""
+                         |select kurtosis(l_partkey) from lineitem;
+                         |""".stripMargin) {
+      checkOperatorMatch[HashAggregateExecTransformer]
+    }
+    runQueryAndCompare("select kurtosis(l_partkey), count(distinct l_orderkey) 
from lineitem") {
+      df =>
+        {
+          assert(
+            getExecutedPlan(df).count(
+              plan => {
+                plan.isInstanceOf[HashAggregateExecTransformer]
+              }) == 4)
+        }
+    }
+  }
 }
 
 class VeloxAggregateFunctionsDefaultSuite extends VeloxAggregateFunctionsSuite 
{
diff --git a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc 
b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
index f04dcda7c..264379716 100644
--- a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
+++ b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
@@ -1144,7 +1144,8 @@ bool SubstraitToVeloxPlanValidator::validate(const 
::substrait::AggregateRel& ag
       "covar_pop",
       "covar_samp",
       "approx_distinct",
-      "skewness"};
+      "skewness",
+      "kurtosis"};
 
   for (const auto& funcSpec : funcSpecs) {
     auto funcName = SubstraitParser::getNameBeforeDelimiter(funcSpec);
diff --git a/docs/velox-backend-support-progress.md 
b/docs/velox-backend-support-progress.md
index 56cb95c1b..c08a2e6aa 100644
--- a/docs/velox-backend-support-progress.md
+++ b/docs/velox-backend-support-progress.md
@@ -364,7 +364,7 @@ Gluten supports 199 functions. (Draw to right to see all 
data types)
 | first_value                   |                        | first_value         
  | S      |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | grouping                      |                        |                     
  |        |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | grouping_id                   |                        |                     
  |        |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
-| kurtosis                      |                        |                     
  |        |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
+| kurtosis                      | kurtosis               | kurtosis            
  | S      |                        |         |      | S     | S   | S    | S   
  | S      |      |           |        |         |      |        |          |   
    |      |        |      |
 | last                          |                        | last                
  | S      |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | last_value                    |                        | last_value          
  | S      |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | max                           | max                    |                     
  | S      |                        |         |      | S     | S   | S    | S   
  | S      |      |           |        |         |      |        |          |   
    |      |        |      |
@@ -416,7 +416,7 @@ Gluten supports 199 functions. (Draw to right to see all 
data types)
 | java_method                   |                        |                     
  |        |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | least                         | least                  | least               
  | S      |                        |         |      |       |     | S    | S   
  | S      | S    | S         |        |         |      |        |          |   
    |      |        |      |
 | md5                           | md5                    |                     
  | S      |                        |         | S    |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
-| monotonically_increasing_id   |                        |                     
  |   S     |                        |         |      |       |     |      |    
   |        |      |           |        |         |      |        |          |  
     |      |        |      |
+| monotonically_increasing_id   |                        |                     
  | S      |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | nanvl                         |                        |                     
  | S      |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | nvl                           |                        |                     
  |        |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | nvl2                          |                        |                     
  |        |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
@@ -428,4 +428,4 @@ Gluten supports 199 functions. (Draw to right to see all 
data types)
 | spark_partition_id            |                        |                     
  | S      |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | stack                         |                        |                     
  |        |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
 | xxhash64                      | xxhash64               | xxhash64            
  |        |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
-| uuid                      | uuid               | uuid              | S       
|                        |         |      |       |     |      |       |        
|      |           |        |         |      |        |          |       |      
|        |      |
+| uuid                          | uuid                   | uuid                
  | S      |                        |         |      |       |     |      |     
  |        |      |           |        |         |      |        |          |   
    |      |        |      |
diff --git 
a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionMappings.scala
 
b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionMappings.scala
index e9133b6f2..b3b1ab12b 100644
--- 
a/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionMappings.scala
+++ 
b/gluten-core/src/main/scala/io/glutenproject/expression/ExpressionMappings.scala
@@ -277,6 +277,7 @@ object ExpressionMappings {
     Sig[Last](LAST),
     Sig[First](FIRST),
     Sig[Skewness](SKEWNESS),
+    Sig[Kurtosis](KURTOSIS),
     Sig[ApproximatePercentile](APPROX_PERCENTILE)
   ) ++ SparkShimLoader.getSparkShims.aggregateExpressionMappings
 
diff --git 
a/gluten-ut/spark32/src/test/resources/sql-tests/results/group-by.sql.out 
b/gluten-ut/spark32/src/test/resources/sql-tests/results/group-by.sql.out
index 79e6f72df..a12e830c1 100644
--- a/gluten-ut/spark32/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/gluten-ut/spark32/src/test/resources/sql-tests/results/group-by.sql.out
@@ -130,7 +130,7 @@ FROM testData
 -- !query schema
 
struct<skewness(a):double,kurtosis(a):double,min(a):int,max(a):int,avg(a):double,variance(a):double,stddev(a):double,sum(a):bigint,count(a):bigint>
 -- !query output
--0.2723801058145729    -1.5069204152249134     1       3       
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
+-0.27238010581457284   -1.5069204152249138     1       3       
2.142857142857143       0.8095238095238096      0.8997354108424375      15      
7
 
 
 -- !query
diff --git 
a/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
 
b/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
index 9ec55f015..dd63b1e4c 100644
--- 
a/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
+++ 
b/gluten-ut/spark32/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
@@ -229,9 +229,11 @@ object VeloxSQLQueryTestSettings extends 
SQLQueryTestSettings {
     )
 
   val OVERWRITE_SQL_QUERY_LIST: Set[String] = Set(
-    // Velox corr has better computation logic but it fails Spark's precision 
check.
+    // The calculation formulas for corr, skewness, kurtosis, variance, and 
stddev in Velox differ
+    // slightly from those in Spark, resulting in some differences in the 
final results.
     // Overwrite below test cases.
     // -- SPARK-24369 multiple distinct aggregations having the same argument 
set
+    // -- Aggregate with nulls.
     "group-by.sql",
     "udf/udf-group-by.sql"
   )
diff --git 
a/gluten-ut/spark33/src/test/resources/sql-tests/results/group-by.sql.out 
b/gluten-ut/spark33/src/test/resources/sql-tests/results/group-by.sql.out
index ffaa2d511..982278fa5 100644
--- a/gluten-ut/spark33/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/gluten-ut/spark33/src/test/resources/sql-tests/results/group-by.sql.out
@@ -150,7 +150,7 @@ FROM testData
 -- !query schema
 
struct<skewness(a):double,kurtosis(a):double,min(a):int,max(a):int,avg(a):double,variance(a):double,stddev(a):double,sum(a):bigint,count(a):bigint>
 -- !query output
--0.2723801058145729    -1.5069204152249134     1       3       
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
+-0.27238010581457284   -1.5069204152249138     1       3       
2.142857142857143       0.8095238095238096      0.8997354108424375      15      
7
 
 
 -- !query
diff --git 
a/gluten-ut/spark33/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
 
b/gluten-ut/spark33/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
index 90272001f..ea088f8e8 100644
--- 
a/gluten-ut/spark33/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
+++ 
b/gluten-ut/spark33/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
@@ -130,7 +130,7 @@ FROM testData
 -- !query schema
 
struct<skewness(udf(a)):double,udf(kurtosis(a)):double,udf(min(a)):int,max(udf(a)):int,udf(avg(udf(a))):double,udf(variance(a)):double,stddev(udf(a)):double,udf(sum(a)):bigint,udf(count(a)):bigint>
 -- !query output
--0.2723801058145729    -1.5069204152249134     1       3       
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
+-0.27238010581457284   -1.5069204152249138     1       3       
2.142857142857143       0.8095238095238096      0.8997354108424375      15      
7
 
 
 -- !query
diff --git 
a/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
 
b/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
index f773e78e8..1d00c739c 100644
--- 
a/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
+++ 
b/gluten-ut/spark33/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
@@ -232,9 +232,11 @@ object VeloxSQLQueryTestSettings extends 
SQLQueryTestSettings {
   )
 
   private val OVERWRITE_SQL_QUERY_LIST: Set[String] = Set(
-    // Velox corr has better computation logic but it fails Spark's precision 
check.
+    // The calculation formulas for corr, skewness, kurtosis, variance, and 
stddev in Velox differ
+    // slightly from those in Spark, resulting in some differences in the 
final results.
     // Overwrite below test cases.
     // -- SPARK-24369 multiple distinct aggregations having the same argument 
set
+    // -- Aggregate with nulls.
     "group-by.sql",
     "udf/udf-group-by.sql"
   )
diff --git 
a/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out 
b/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out
index a92a58efb..f56420926 100644
--- a/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out
+++ b/gluten-ut/spark34/src/test/resources/sql-tests/results/group-by.sql.out
@@ -162,7 +162,7 @@ FROM testData
 -- !query schema
 
struct<skewness(a):double,kurtosis(a):double,min(a):int,max(a):int,avg(a):double,variance(a):double,stddev(a):double,sum(a):bigint,count(a):bigint>
 -- !query output
--0.2723801058145729    -1.5069204152249134     1       3       
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
+-0.27238010581457284   -1.5069204152249138     1       3       
2.142857142857143       0.8095238095238096      0.8997354108424375      15      
7
 
 
 -- !query
diff --git 
a/gluten-ut/spark34/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
 
b/gluten-ut/spark34/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
index 35f91a7c4..d3735acf0 100644
--- 
a/gluten-ut/spark34/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
+++ 
b/gluten-ut/spark34/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out
@@ -151,7 +151,7 @@ FROM testData
 -- !query schema
 
struct<skewness(udf(a)):double,udf(kurtosis(a)):double,udf(min(a)):int,max(udf(a)):int,udf(avg(udf(a))):double,udf(variance(a)):double,stddev(udf(a)):double,udf(sum(a)):bigint,udf(count(a)):bigint>
 -- !query output
--0.2723801058145729    -1.5069204152249134     1       3       
2.142857142857143       0.8095238095238094      0.8997354108424372      15      
7
+-0.27238010581457284   -1.5069204152249138     1       3       
2.142857142857143       0.8095238095238096      0.8997354108424375      15      
7
 
 
 -- !query
diff --git 
a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
 
b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
index a7f190c0d..1421a1260 100644
--- 
a/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
+++ 
b/gluten-ut/spark34/src/test/scala/io/glutenproject/utils/velox/VeloxSQLQueryTestSettings.scala
@@ -233,9 +233,11 @@ object VeloxSQLQueryTestSettings extends 
SQLQueryTestSettings {
   )
 
   val OVERWRITE_SQL_QUERY_LIST: Set[String] = Set(
-    // Velox corr has better computation logic but it fails Spark's precision 
check.
+    // The calculation formulas for corr, skewness, kurtosis, variance, and 
stddev in Velox differ
+    // slightly from those in Spark, resulting in some differences in the 
final results.
     // Overwrite below test cases.
     // -- SPARK-24369 multiple distinct aggregations having the same argument 
set
+    // -- Aggregate with nulls.
     "group-by.sql",
     "udf/udf-group-by.sql",
     // Exception string doesn't match for
diff --git 
a/shims/common/src/main/scala/io/glutenproject/expression/ExpressionNames.scala 
b/shims/common/src/main/scala/io/glutenproject/expression/ExpressionNames.scala
index f61aa3161..8717be50d 100644
--- 
a/shims/common/src/main/scala/io/glutenproject/expression/ExpressionNames.scala
+++ 
b/shims/common/src/main/scala/io/glutenproject/expression/ExpressionNames.scala
@@ -45,8 +45,9 @@ object ExpressionNames {
   final val FIRST = "first"
   final val FIRST_IGNORE_NULL = "first_ignore_null"
   final val APPROX_DISTINCT = "approx_distinct"
-  final val SKEWNESS = "skewness"
   final val APPROX_PERCENTILE = "approx_percentile"
+  final val SKEWNESS = "skewness"
+  final val KURTOSIS = "kurtosis"
 
   // Function names used by Substrait plan.
   final val ADD = "add"


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(incubator-gluten) branch main updated: [VL] Support kurtosis aggregate function (#5151)

Reply via email to