[
https://issues.apache.org/jira/browse/SPARK-33910?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Yuming Wang updated SPARK-33910:
--------------------------------
Description:
1. Push down the foldable expressions through CaseWhen/If
2. Simplify conditional in predicate
3. Push the UnaryExpression into (if / case) branches
4. Simplify CaseWhen if elseValue is None
5. Simplify conditional if all branches are foldable boolean type
Common use cases are:
{code:sql}
create table t1 using parquet as select * from range(100);
create table t2 using parquet as select * from range(200);
create temp view v1 as
select 'a' as event_type, * from t1
union all
select CASE WHEN id = 1 THEN 'b' ELSE 'c' end as event_type, * from t2
{code}
1. Reduce read the whole table.
{noformat}
explain select * from v1 where event_type = 'a';
Before simplify:
== Physical Plan ==
Union
:- *(1) Project [a AS event_type#7, id#9L]
: +- *(1) ColumnarToRow
: +- FileScan parquet default.t1[id#9L] Batched: true, DataFilters: [],
Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema:
struct<id:bigint>
+- *(2) Project [CASE WHEN (id#10L = 1) THEN b ELSE c END AS event_type#8,
id#10L]
+- *(2) Filter (CASE WHEN (id#10L = 1) THEN b ELSE c END = a)
+- *(2) ColumnarToRow
+- FileScan parquet default.t2[id#10L] Batched: true, DataFilters:
[(CASE WHEN (id#10L = 1) THEN b ELSE c END = a)], Format: Parquet,
PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint>
After simplify:
== Physical Plan ==
*(1) Project [a AS event_type#8, id#4L]
+- *(1) ColumnarToRow
+- FileScan parquet default.t1[id#4L] Batched: true, DataFilters: [],
Format: Parquet
{noformat}
2. Push down the conditional expressions to data source.
{noformat}
explain select * from v1 where event_type = 'b';
Before simplify:
== Physical Plan ==
Union
:- LocalTableScan <empty>, [event_type#7, id#9L]
+- *(1) Project [CASE WHEN (id#10L = 1) THEN b ELSE c END AS event_type#8,
id#10L]
+- *(1) Filter (CASE WHEN (id#10L = 1) THEN b ELSE c END = b)
+- *(1) ColumnarToRow
+- FileScan parquet default.t2[id#10L] Batched: true, DataFilters:
[(CASE WHEN (id#10L = 1) THEN b ELSE c END = b)], Format: Parquet,
PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint>
After simplify:
== Physical Plan ==
*(1) Project [CASE WHEN (id#5L = 1) THEN b ELSE c END AS event_type#8, id#5L AS
id#4L]
+- *(1) Filter (isnotnull(id#5L) AND (id#5L = 1))
+- *(1) ColumnarToRow
+- FileScan parquet default.t2[id#5L] Batched: true, DataFilters:
[isnotnull(id#5L), (id#5L = 1)], Format: Parquet, PartitionFilters: [],
PushedFilters: [IsNotNull(id), EqualTo(id,1)], ReadSchema: struct<id:bigint>
{noformat}
3. Reduce the amount of calculation.
{noformat}
Before simplify:
explain select event_type = 'e' from v1;
== Physical Plan ==
Union
:- *(1) Project [false AS (event_type = e)#37]
: +- *(1) ColumnarToRow
: +- FileScan parquet default.t1[] Batched: true, DataFilters: [], Format:
Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>
+- *(2) Project [(CASE WHEN (id#21L = 1) THEN b ELSE c END = e) AS (event_type
= e)#38]
+- *(2) ColumnarToRow
+- FileScan parquet default.t2[id#21L] Batched: true, DataFilters: [],
Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema:
struct<id:bigint>
After simplify:
== Physical Plan ==
Union
:- *(1) Project [false AS (event_type = e)#10]
: +- *(1) ColumnarToRow
: +- FileScan parquet default.t1[] Batched: true, DataFilters: [], Format:
Parquet,
+- *(2) Project [false AS (event_type = e)#14]
+- *(2) ColumnarToRow
+- FileScan parquet default.t2[] Batched: true, DataFilters: [], Format:
Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>
{noformat}
was:
1. Push down the foldable expressions through CaseWhen/If
2. Simplify conditional in predicate
3. Push the UnaryExpression into (if / case) branches
4. Simplify CaseWhen if elseValue is None
5. Simplify conditional if all branches are foldable boolean type
Common use cases are:
{code:sql}
create table t1 using parquet as select * from range(100);
create table t2 using parquet as select * from range(200);
create temp view v1 as
select 'a' as event_type, * from t1
union all
select CASE WHEN id = 1 THEN 'b' ELSE 'c' end as event_type, * from t2
{code}
1. Reduce read the whole table.
{noformat}
explain select * from v1 where event_type = 'a';
Before simplify:
== Physical Plan ==
Union
:- *(1) Project [a AS event_type#7, id#9L]
: +- *(1) ColumnarToRow
: +- FileScan parquet default.t1[id#9L] Batched: true, DataFilters: [],
Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema:
struct<id:bigint>
+- *(2) Project [CASE WHEN (id#10L = 1) THEN b ELSE c END AS event_type#8,
id#10L]
+- *(2) Filter (CASE WHEN (id#10L = 1) THEN b ELSE c END = a)
+- *(2) ColumnarToRow
+- FileScan parquet default.t2[id#10L] Batched: true, DataFilters:
[(CASE WHEN (id#10L = 1) THEN b ELSE c END = a)], Format: Parquet,
PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint>
After simplify:
== Physical Plan ==
*(1) Project [a AS event_type#8, id#4L]
+- *(1) ColumnarToRow
+- FileScan parquet default.t1[id#4L] Batched: true, DataFilters: [],
Format: Parquet
{noformat}
2. Push down the conditional expressions to data source.
{noformat}
explain select * from v1 where event_type = 'b';
Before simplify:
== Physical Plan ==
Union
:- LocalTableScan <empty>, [event_type#7, id#9L]
+- *(1) Project [CASE WHEN (id#10L = 1) THEN b ELSE c END AS event_type#8,
id#10L]
+- *(1) Filter (CASE WHEN (id#10L = 1) THEN b ELSE c END = b)
+- *(1) ColumnarToRow
+- FileScan parquet default.t2[id#10L] Batched: true, DataFilters:
[(CASE WHEN (id#10L = 1) THEN b ELSE c END = b)], Format: Parquet,
PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint>
After simplify:
== Physical Plan ==
*(1) Project [CASE WHEN (id#5L = 1) THEN b ELSE c END AS event_type#8, id#5L AS
id#4L]
+- *(1) Filter (isnotnull(id#5L) AND (id#5L = 1))
+- *(1) ColumnarToRow
+- FileScan parquet default.t2[id#5L] Batched: true, DataFilters:
[isnotnull(id#5L), (id#5L = 1)], Format: Parquet, PartitionFilters: [],
PushedFilters: [IsNotNull(id), EqualTo(id,1)], ReadSchema: struct<id:bigint>
{noformat}
3. Reduce the amount of calculation.
{noformat}
Before simplify:
spark-sql> explain select event_type = 'e' from v1;
== Physical Plan ==
Union
:- *(1) Project [false AS (event_type = e)#37]
: +- *(1) ColumnarToRow
: +- FileScan parquet default.t1[] Batched: true, DataFilters: [], Format:
Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>
+- *(2) Project [(CASE WHEN (id#21L = 1) THEN b ELSE c END = e) AS (event_type
= e)#38]
+- *(2) ColumnarToRow
+- FileScan parquet default.t2[id#21L] Batched: true, DataFilters: [],
Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema:
struct<id:bigint>
After simplify:
== Physical Plan ==
Union
:- *(1) Project [false AS (event_type = e)#10]
: +- *(1) ColumnarToRow
: +- FileScan parquet default.t1[] Batched: true, DataFilters: [], Format:
Parquet,
+- *(2) Project [false AS (event_type = e)#14]
+- *(2) ColumnarToRow
+- FileScan parquet default.t2[] Batched: true, DataFilters: [], Format:
Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>
{noformat}
> Simplify/Optimize conditional expressions
> ------------------------------------------
>
> Key: SPARK-33910
> URL: https://issues.apache.org/jira/browse/SPARK-33910
> Project: Spark
> Issue Type: Umbrella
> Components: SQL
> Affects Versions: 3.2.0
> Reporter: Yuming Wang
> Priority: Major
>
> 1. Push down the foldable expressions through CaseWhen/If
> 2. Simplify conditional in predicate
> 3. Push the UnaryExpression into (if / case) branches
> 4. Simplify CaseWhen if elseValue is None
> 5. Simplify conditional if all branches are foldable boolean type
> Common use cases are:
> {code:sql}
> create table t1 using parquet as select * from range(100);
> create table t2 using parquet as select * from range(200);
> create temp view v1 as
> select 'a' as event_type, * from t1
> union all
> select CASE WHEN id = 1 THEN 'b' ELSE 'c' end as event_type, * from t2
> {code}
> 1. Reduce read the whole table.
> {noformat}
> explain select * from v1 where event_type = 'a';
> Before simplify:
> == Physical Plan ==
> Union
> :- *(1) Project [a AS event_type#7, id#9L]
> : +- *(1) ColumnarToRow
> : +- FileScan parquet default.t1[id#9L] Batched: true, DataFilters: [],
> Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema:
> struct<id:bigint>
> +- *(2) Project [CASE WHEN (id#10L = 1) THEN b ELSE c END AS event_type#8,
> id#10L]
> +- *(2) Filter (CASE WHEN (id#10L = 1) THEN b ELSE c END = a)
> +- *(2) ColumnarToRow
> +- FileScan parquet default.t2[id#10L] Batched: true, DataFilters:
> [(CASE WHEN (id#10L = 1) THEN b ELSE c END = a)], Format: Parquet,
> PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint>
> After simplify:
> == Physical Plan ==
> *(1) Project [a AS event_type#8, id#4L]
> +- *(1) ColumnarToRow
> +- FileScan parquet default.t1[id#4L] Batched: true, DataFilters: [],
> Format: Parquet
> {noformat}
> 2. Push down the conditional expressions to data source.
> {noformat}
> explain select * from v1 where event_type = 'b';
> Before simplify:
> == Physical Plan ==
> Union
> :- LocalTableScan <empty>, [event_type#7, id#9L]
> +- *(1) Project [CASE WHEN (id#10L = 1) THEN b ELSE c END AS event_type#8,
> id#10L]
> +- *(1) Filter (CASE WHEN (id#10L = 1) THEN b ELSE c END = b)
> +- *(1) ColumnarToRow
> +- FileScan parquet default.t2[id#10L] Batched: true, DataFilters:
> [(CASE WHEN (id#10L = 1) THEN b ELSE c END = b)], Format: Parquet,
> PartitionFilters: [], PushedFilters: [], ReadSchema: struct<id:bigint>
> After simplify:
> == Physical Plan ==
> *(1) Project [CASE WHEN (id#5L = 1) THEN b ELSE c END AS event_type#8, id#5L
> AS id#4L]
> +- *(1) Filter (isnotnull(id#5L) AND (id#5L = 1))
> +- *(1) ColumnarToRow
> +- FileScan parquet default.t2[id#5L] Batched: true, DataFilters:
> [isnotnull(id#5L), (id#5L = 1)], Format: Parquet, PartitionFilters: [],
> PushedFilters: [IsNotNull(id), EqualTo(id,1)], ReadSchema: struct<id:bigint>
> {noformat}
> 3. Reduce the amount of calculation.
> {noformat}
> Before simplify:
> explain select event_type = 'e' from v1;
> == Physical Plan ==
> Union
> :- *(1) Project [false AS (event_type = e)#37]
> : +- *(1) ColumnarToRow
> : +- FileScan parquet default.t1[] Batched: true, DataFilters: [],
> Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>
> +- *(2) Project [(CASE WHEN (id#21L = 1) THEN b ELSE c END = e) AS
> (event_type = e)#38]
> +- *(2) ColumnarToRow
> +- FileScan parquet default.t2[id#21L] Batched: true, DataFilters: [],
> Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema:
> struct<id:bigint>
> After simplify:
> == Physical Plan ==
> Union
> :- *(1) Project [false AS (event_type = e)#10]
> : +- *(1) ColumnarToRow
> : +- FileScan parquet default.t1[] Batched: true, DataFilters: [],
> Format: Parquet,
> +- *(2) Project [false AS (event_type = e)#14]
> +- *(2) ColumnarToRow
> +- FileScan parquet default.t2[] Batched: true, DataFilters: [],
> Format: Parquet, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<>
> {noformat}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]