This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 7e5b4fc7a [VL] Support regr_intercept aggregate function (#5273)
7e5b4fc7a is described below
commit 7e5b4fc7a1ec495e516add2d2b89fe9a1a1af5e4
Author: Joey <[email protected]>
AuthorDate: Wed Apr 3 17:22:33 2024 +0800
[VL] Support regr_intercept aggregate function (#5273)
---
.../org/apache/gluten/utils/CHExpressionUtil.scala | 1 +
.../apache/gluten/utils/VeloxIntermediateData.scala | 20 ++++++++++++--------
.../execution/VeloxAggregateFunctionsSuite.scala | 19 +++++++++++++++++++
cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc | 3 ++-
docs/velox-backend-support-progress.md | 1 +
.../apache/gluten/expression/ExpressionNames.scala | 1 +
.../gluten/sql/shims/spark34/Spark34Shims.scala | 3 ++-
7 files changed, 38 insertions(+), 10 deletions(-)
diff --git
a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
index d169d9ec5..f0f17b172 100644
---
a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
+++
b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala
@@ -186,6 +186,7 @@ object CHExpressionUtil {
KURTOSIS -> DefaultValidator(),
REGR_R2 -> DefaultValidator(),
REGR_SLOPE -> DefaultValidator(),
+ REGR_INTERCEPT -> DefaultValidator(),
TO_UTC_TIMESTAMP -> DefaultValidator(),
FROM_UTC_TIMESTAMP -> DefaultValidator()
)
diff --git
a/backends-velox/src/main/scala/org/apache/gluten/utils/VeloxIntermediateData.scala
b/backends-velox/src/main/scala/org/apache/gluten/utils/VeloxIntermediateData.scala
index 149634a47..b3bb62ec6 100644
---
a/backends-velox/src/main/scala/org/apache/gluten/utils/VeloxIntermediateData.scala
+++
b/backends-velox/src/main/scala/org/apache/gluten/utils/VeloxIntermediateData.scala
@@ -42,8 +42,8 @@ object VeloxIntermediateData {
// Skewness, Kurtosis
private val veloxCentralMomentAggIntermediateDataOrder: Seq[Seq[String]] =
Seq("n", "avg", "m2", "m3", "m4").map(Seq(_))
- // RegrSlope
- private val veloxRegrSlopeIntermediateDataOrder: Seq[Seq[String]] =
+ // RegrSlope, RegrIntercept
+ private val veloxRegrIntermediateDataOrder: Seq[Seq[String]] =
Seq("ck", "n", "m2", "xAvg:avg", "yAvg").map(attr => attr.split(":").toSeq)
// Agg functions with inconsistent types of intermediate data between Velox and Spark.
@@ -58,8 +58,8 @@ object VeloxIntermediateData {
// Skewness, Kurtosis
private val veloxCentralMomentAggIntermediateTypes: Seq[DataType] =
Seq(LongType, DoubleType, DoubleType, DoubleType, DoubleType)
- // RegrSlope
- private val veloxRegrSlopeIntermediateTypes: Seq[DataType] =
+ // RegrSlope, RegrIntercept
+ private val veloxRegrIntermediateTypes: Seq[DataType] =
Seq(DoubleType, LongType, DoubleType, DoubleType, DoubleType)
def getAttrIndex(intermediateDataOrder: Seq[Seq[String]], attr: String): Int
=
@@ -89,8 +89,10 @@ object VeloxIntermediateData {
// certain versions of Spark, and SparkShim is not dependent on the backend-velox module. It
// is not convenient to include Velox-specific logic in SparkShim. Using class names to match
// aggFunc is reliable in this case, as there are no cases of duplicate names.
- case _ if aggFunc.getClass.getSimpleName.equals("RegrSlope") =>
- veloxRegrSlopeIntermediateDataOrder
+ case _
+ if aggFunc.getClass.getSimpleName.equals("RegrSlope") ||
+ aggFunc.getClass.getSimpleName.equals("RegrIntercept") =>
+ veloxRegrIntermediateDataOrder
case _ =>
aggFunc.aggBufferAttributes.map(_.name).map(Seq(_))
}
@@ -172,8 +174,10 @@ object VeloxIntermediateData {
Some(veloxVarianceIntermediateTypes)
case _: Skewness | _: Kurtosis =>
Some(veloxCentralMomentAggIntermediateTypes)
- case _ if aggFunc.getClass.getSimpleName.equals("RegrSlope") =>
- Some(veloxRegrSlopeIntermediateTypes)
+ case _
+ if aggFunc.getClass.getSimpleName.equals("RegrSlope") ||
+ aggFunc.getClass.getSimpleName.equals("RegrIntercept") =>
+ Some(veloxRegrIntermediateTypes)
case _ if aggFunc.aggBufferAttributes.size > 1 =>
Some(aggFunc.aggBufferAttributes.map(_.dataType))
case _ => None
diff --git
a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala
b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala
index 6d84d622f..3a181cfdd 100644
---
a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala
+++
b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala
@@ -409,6 +409,25 @@ abstract class VeloxAggregateFunctionsSuite extends
VeloxWholeStageTransformerSu
}
}
+ testWithSpecifiedSparkVersion("regr_intercept", Some("3.4")) {
+ runQueryAndCompare("""
+ |select regr_intercept(l_partkey, l_suppkey) from
lineitem;
+ |""".stripMargin) {
+ checkGlutenOperatorMatch[HashAggregateExecTransformer]
+ }
+ runQueryAndCompare(
+ "select regr_intercept(l_partkey, l_suppkey), count(distinct l_orderkey)
from lineitem") {
+ df =>
+ {
+ assert(
+ getExecutedPlan(df).count(
+ plan => {
+ plan.isInstanceOf[HashAggregateExecTransformer]
+ }) == 4)
+ }
+ }
+ }
+
test("first") {
runQueryAndCompare(s"""
|select first(l_linenumber), first(l_linenumber,
true) from lineitem;
diff --git a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
index a302701b4..43f808066 100644
--- a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
+++ b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
@@ -1081,7 +1081,8 @@ bool SubstraitToVeloxPlanValidator::validate(const
::substrait::AggregateRel& ag
"approx_distinct",
"skewness",
"kurtosis",
- "regr_slope"};
+ "regr_slope",
+ "regr_intercept"};
auto udfFuncs = UdfLoader::getInstance()->getRegisteredUdafNames();
diff --git a/docs/velox-backend-support-progress.md
b/docs/velox-backend-support-progress.md
index 28ac7218a..0b5015bc5 100644
--- a/docs/velox-backend-support-progress.md
+++ b/docs/velox-backend-support-progress.md
@@ -379,6 +379,7 @@ Gluten supports 199 functions. (Drag to right to see all
data types)
| min | min |
| S | | | | S | S | S | S
| S | | | | | | | |
| | | |
| min_by | |
| S | | | | | | |
| | | | | | | | |
| | | |
| regr_r2 | regr_r2 | regr_r2
| S | | | | S | S | S | S
| S | | | | | | | |
| | | |
+| regr_intercept | regr_intercept | regr_intercept
| S | | | | S | S | S | S
| S | | | | | | | |
| | | |
| regr_slope | regr_slope | regr_slope
| S | | | | S | S | S | S
| S | | | | | | | |
| | | |
| skewness | skewness | skewness
| S | | | | S | S | S | S
| S | | | | | | | |
| | | |
| some | |
| | | | | | | |
| | | | | | | | |
| | | |
diff --git
a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala
b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala
index 6206db3f2..aeaff9e53 100644
---
a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala
+++
b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala
@@ -50,6 +50,7 @@ object ExpressionNames {
final val SKEWNESS = "skewness"
final val KURTOSIS = "kurtosis"
final val REGR_SLOPE = "regr_slope"
+ final val REGR_INTERCEPT = "regr_intercept"
// Function names used by Substrait plan.
final val ADD = "add"
diff --git
a/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala
b/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala
index 2178b1d17..2604ad929 100644
---
a/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala
+++
b/shims/spark34/src/main/scala/org/apache/gluten/sql/shims/spark34/Spark34Shims.scala
@@ -81,7 +81,8 @@ class Spark34Shims extends SparkShims {
override def aggregateExpressionMappings: Seq[Sig] = {
Seq(
Sig[RegrR2](ExpressionNames.REGR_R2),
- Sig[RegrSlope](ExpressionNames.REGR_SLOPE)
+ Sig[RegrSlope](ExpressionNames.REGR_SLOPE),
+ Sig[RegrIntercept](ExpressionNames.REGR_INTERCEPT)
)
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]