This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0fbd34c45b58 [SPARK-50377][SQL] Allow to evaluate foldable
RuntimeReplaceable
0fbd34c45b58 is described below
commit 0fbd34c45b5818400f1993314acc571690daaf9a
Author: Wenchen Fan <[email protected]>
AuthorDate: Fri Nov 22 13:20:37 2024 +0800
[SPARK-50377][SQL] Allow to evaluate foldable RuntimeReplaceable
### What changes were proposed in this pull request?
This is to fix a regression caused by
https://github.com/apache/spark/pull/47143 . The problem is, in some places, we
want to get a constant from a foldable expression before the query execution
starts. https://github.com/apache/spark/pull/47143 brings two problems:
1. `UnaryPositive` is no longer a `UnaryExpression`, which means it's not
foldable anymore even if its child is foldable.
2. `UnaryPositive` is no longer evaluable.
`Lag` is such a place. It may evaluate the `inputOffset` parameter eagerly.
`lag(..., +1)` no longer works after https://github.com/apache/spark/pull/47143
.
Instead of fixing `Lag`, this PR makes two changes and hopefully we can
avoid all similar problems:
1. Make `UnaryPositive` extend `UnaryExpression` again. We need follow-up
PRs to check other `RuntimeReplaceable` expressions and see if they should
extend `UnaryExpression` or `BinaryExpression`, etc.
2. Implement `RuntimeReplaceable#eval` so that we can evaluate folding
`RuntimeReplaceable` eagerly when needed.
### Why are the changes needed?
Fix the regression on `lag` and avoid similar issues in the future.
### Does this PR introduce _any_ user-facing change?
No, the regression is not released yet.
### How was this patch tested?
new test
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #48912 from cloud-fan/replace.
Lead-authored-by: Wenchen Fan <[email protected]>
Co-authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../sql/catalyst/expressions/Expression.scala | 8 ++++++--
.../sql/catalyst/expressions/arithmetic.scala | 10 +++-------
.../sql-tests/analyzer-results/window.sql.out | 7 ++++---
.../src/test/resources/sql-tests/inputs/window.sql | 1 +
.../resources/sql-tests/results/window.sql.out | 23 +++++++++++-----------
5 files changed, 26 insertions(+), 23 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
index f0f94f088138..c45479985282 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala
@@ -440,8 +440,12 @@ trait RuntimeReplaceable extends Expression {
// are semantically equal.
override lazy val canonicalized: Expression = replacement.canonicalized
- final override def eval(input: InternalRow = null): Any =
- throw QueryExecutionErrors.cannotEvaluateExpressionError(this)
+ final override def eval(input: InternalRow = null): Any = {
+ // For convenience, we allow to evaluate `RuntimeReplaceable` expressions,
in case we need to
+ // get a constant from foldable expression before the query execution
starts.
+ assert(input == null)
+ replacement.eval()
+ }
final override protected def doGenCode(ctx: CodegenContext, ev: ExprCode):
ExprCode =
throw QueryExecutionErrors.cannotGenerateCodeForExpressionError(this)
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
index 015240472cf5..f9e8b6a17896 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala
@@ -115,8 +115,7 @@ case class UnaryMinus(
since = "1.5.0",
group = "math_funcs")
case class UnaryPositive(child: Expression)
- extends RuntimeReplaceable with ImplicitCastInputTypes {
- override def nullIntolerant: Boolean = true
+ extends UnaryExpression with RuntimeReplaceable with ImplicitCastInputTypes {
override def prettyName: String = "positive"
@@ -128,11 +127,8 @@ case class UnaryPositive(child: Expression)
override lazy val replacement: Expression = child
- override protected def withNewChildrenInternal(
- newChildren: IndexedSeq[Expression]): UnaryPositive =
- copy(newChildren.head)
-
- override def children: Seq[Expression] = child :: Nil
+ override protected def withNewChildInternal(newChild: Expression):
Expression =
+ copy(child = newChild)
}
/**
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out
index db49b1bfd39d..a1a2b7600637 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/window.sql.out
@@ -995,6 +995,7 @@ SELECT
lag(v, 1) IGNORE NULLS OVER w lag_1,
lag(v, 2) IGNORE NULLS OVER w lag_2,
lag(v, 3) IGNORE NULLS OVER w lag_3,
+ lag(v, +3) IGNORE NULLS OVER w lag_plus_3,
nth_value(v, 1) IGNORE NULLS OVER w nth_value_1,
nth_value(v, 2) IGNORE NULLS OVER w nth_value_2,
nth_value(v, 3) IGNORE NULLS OVER w nth_value_3,
@@ -1007,9 +1008,9 @@ WINDOW w AS (ORDER BY id)
ORDER BY id
-- !query analysis
Sort [id#x ASC NULLS FIRST], true
-+- Project [content#x, id#x, v#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x,
lag_0#x, lag_1#x, lag_2#x, lag_3#x, nth_value_1#x, nth_value_2#x,
nth_value_3#x, first_value#x, any_value#x, last_value#x]
- +- Project [content#x, id#x, v#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x,
lag_0#x, lag_1#x, lag_2#x, lag_3#x, nth_value_1#x, nth_value_2#x,
nth_value_3#x, first_value#x, any_value#x, last_value#x, lead_0#x, lead_1#x,
lead_2#x, lead_3#x, lag_0#x, lag_1#x, lag_2#x, ... 7 more fields]
- +- Window [lead(v#x, 0, null) windowspecdefinition(id#x ASC NULLS FIRST,
specifiedwindowframe(RowFrame, 0, 0)) AS lead_0#x, lead(v#x, 1, null)
windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 1,
1)) AS lead_1#x, lead(v#x, 2, null) windowspecdefinition(id#x ASC NULLS FIRST,
specifiedwindowframe(RowFrame, 2, 2)) AS lead_2#x, lead(v#x, 3, null)
windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 3,
3)) AS lead_3#x, lag(v#x, 0, null) windo [...]
++- Project [content#x, id#x, v#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x,
lag_0#x, lag_1#x, lag_2#x, lag_3#x, lag_plus_3#x, nth_value_1#x, nth_value_2#x,
nth_value_3#x, first_value#x, any_value#x, last_value#x]
+ +- Project [content#x, id#x, v#x, lead_0#x, lead_1#x, lead_2#x, lead_3#x,
lag_0#x, lag_1#x, lag_2#x, lag_3#x, lag_plus_3#x, nth_value_1#x, nth_value_2#x,
nth_value_3#x, first_value#x, any_value#x, last_value#x, lead_0#x, lead_1#x,
lead_2#x, lead_3#x, lag_0#x, lag_1#x, ... 9 more fields]
+ +- Window [lead(v#x, 0, null) windowspecdefinition(id#x ASC NULLS FIRST,
specifiedwindowframe(RowFrame, 0, 0)) AS lead_0#x, lead(v#x, 1, null)
windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 1,
1)) AS lead_1#x, lead(v#x, 2, null) windowspecdefinition(id#x ASC NULLS FIRST,
specifiedwindowframe(RowFrame, 2, 2)) AS lead_2#x, lead(v#x, 3, null)
windowspecdefinition(id#x ASC NULLS FIRST, specifiedwindowframe(RowFrame, 3,
3)) AS lead_3#x, lag(v#x, 0, null) windo [...]
+- Project [content#x, id#x, v#x]
+- SubqueryAlias test_ignore_null
+- View (`test_ignore_null`, [content#x, id#x, v#x])
diff --git a/sql/core/src/test/resources/sql-tests/inputs/window.sql
b/sql/core/src/test/resources/sql-tests/inputs/window.sql
index f3cbf6ef1ccb..bec79247f9a6 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/window.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/window.sql
@@ -327,6 +327,7 @@ SELECT
lag(v, 1) IGNORE NULLS OVER w lag_1,
lag(v, 2) IGNORE NULLS OVER w lag_2,
lag(v, 3) IGNORE NULLS OVER w lag_3,
+ lag(v, +3) IGNORE NULLS OVER w lag_plus_3,
nth_value(v, 1) IGNORE NULLS OVER w nth_value_1,
nth_value(v, 2) IGNORE NULLS OVER w nth_value_2,
nth_value(v, 3) IGNORE NULLS OVER w nth_value_3,
diff --git a/sql/core/src/test/resources/sql-tests/results/window.sql.out
b/sql/core/src/test/resources/sql-tests/results/window.sql.out
index 87381b64638b..ce88fb57f8aa 100644
--- a/sql/core/src/test/resources/sql-tests/results/window.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/window.sql.out
@@ -1060,6 +1060,7 @@ SELECT
lag(v, 1) IGNORE NULLS OVER w lag_1,
lag(v, 2) IGNORE NULLS OVER w lag_2,
lag(v, 3) IGNORE NULLS OVER w lag_3,
+ lag(v, +3) IGNORE NULLS OVER w lag_plus_3,
nth_value(v, 1) IGNORE NULLS OVER w nth_value_1,
nth_value(v, 2) IGNORE NULLS OVER w nth_value_2,
nth_value(v, 3) IGNORE NULLS OVER w nth_value_3,
@@ -1071,17 +1072,17 @@ FROM
WINDOW w AS (ORDER BY id)
ORDER BY id
-- !query schema
-struct<content:string,id:int,v:string,lead_0:string,lead_1:string,lead_2:string,lead_3:string,lag_0:string,lag_1:string,lag_2:string,lag_3:string,nth_value_1:string,nth_value_2:string,nth_value_3:string,first_value:string,any_value:string,last_value:string>
--- !query output
-a 0 NULL NULL x y z NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL
-a 1 x x y z v x NULL NULL
NULL x NULL NULL x x x
-b 2 NULL NULL y z v NULL x NULL
NULL x NULL NULL x x x
-c 3 NULL NULL y z v NULL x NULL
NULL x NULL NULL x x x
-a 4 y y z v NULL y x NULL
NULL x y NULL x x y
-b 5 NULL NULL z v NULL NULL y x
NULL x y NULL x x y
-a 6 z z v NULL NULL z y x
NULL x y z x x z
-a 7 v v NULL NULL NULL v z y
x x y z x x v
-a 8 NULL NULL NULL NULL NULL NULL v z
y x y z x x v
+struct<content:string,id:int,v:string,lead_0:string,lead_1:string,lead_2:string,lead_3:string,lag_0:string,lag_1:string,lag_2:string,lag_3:string,lag_plus_3:string,nth_value_1:string,nth_value_2:string,nth_value_3:string,first_value:string,any_value:string,last_value:string>
+-- !query output
+a 0 NULL NULL x y z NULL NULL NULL
NULL NULL NULL NULL NULL NULL NULL NULL
+a 1 x x y z v x NULL NULL
NULL NULL x NULL NULL x x x
+b 2 NULL NULL y z v NULL x NULL
NULL NULL x NULL NULL x x x
+c 3 NULL NULL y z v NULL x NULL
NULL NULL x NULL NULL x x x
+a 4 y y z v NULL y x NULL
NULL NULL x y NULL x x y
+b 5 NULL NULL z v NULL NULL y x
NULL NULL x y NULL x x y
+a 6 z z v NULL NULL z y x
NULL NULL x y z x x z
+a 7 v v NULL NULL NULL v z y
x x x y z x x v
+a 8 NULL NULL NULL NULL NULL NULL v z
y y x y z x x v
-- !query
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]