[
https://issues.apache.org/jira/browse/SPARK-48660?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Yuming Wang updated SPARK-48660:
--------------------------------
Description:
How to reproduce:
{code:sql}
CREATE TABLE order_history_version_audit_rno (
eventid STRING,
id STRING,
referenceid STRING,
type STRING,
referencetype STRING,
sellerid BIGINT,
buyerid BIGINT,
producerid STRING,
versionid INT,
changedocuments ARRAY<STRUCT<clientId: STRING, type: STRING, timestamp:
BIGINT, changeDetails: STRING>>,
dt STRING,
hr STRING)
USING parquet
PARTITIONED BY (dt, hr);
explain cost
CREATE TABLE order_history_version_audit_rno
USING parquet
PARTITIONED BY (dt)
CLUSTERED BY (id) INTO 1000 buckets
AS SELECT * FROM order_history_version_audit_rno
WHERE dt >= '2023-11-29';
{code}
{noformat}
spark-sql (default)>
> explain cost
> CREATE TABLE order_history_version_audit_rno
> USING parquet
> PARTITIONED BY (dt)
> CLUSTERED BY (id) INTO 1000 buckets
> AS SELECT * FROM order_history_version_audit_rno
> WHERE dt >= '2023-11-29';
== Optimized Logical Plan ==
CreateDataSourceTableAsSelectCommand
`spark_catalog`.`default`.`order_history_version_audit_rno`, ErrorIfExists,
[eventid, id, referenceid, type, referencetype, sellerid, buyerid, producerid,
versionid, changedocuments, hr, dt]
+- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
hr#16, dt#15]
+- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
dt#15, hr#16]
+- Filter (dt#15 >= 2023-11-29)
+- SubqueryAlias
spark_catalog.default.order_history_version_audit_rno
+- Relation
spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
parquet
== Physical Plan ==
Execute CreateDataSourceTableAsSelectCommand
+- CreateDataSourceTableAsSelectCommand
`spark_catalog`.`default`.`order_history_version_audit_rno`, ErrorIfExists,
[eventid, id, referenceid, type, referencetype, sellerid, buyerid, producerid,
versionid, changedocuments, hr, dt]
+- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
hr#16, dt#15]
+- Project [eventid#5, id#6, referenceid#7, type#8,
referencetype#9, sellerid#10L, buyerid#11L, producerid#12, versionid#13,
changedocuments#14, dt#15, hr#16]
+- Filter (dt#15 >= 2023-11-29)
+- SubqueryAlias
spark_catalog.default.order_history_version_audit_rno
+- Relation
spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
parquet
{noformat}
If remove create table:
{noformat}
> explain cost
> SELECT * FROM order_history_version_audit_rno
> WHERE dt >= '2023-11-29';
== Optimized Logical Plan ==
Filter (isnotnull(dt#15) AND (dt#15 >= 2023-11-29)), Statistics(sizeInBytes=1.0
B)
+- Relation
spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
parquet, Statistics(sizeInBytes=0.0 B)
== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet
spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(0
paths)[], PartitionFilters: [isnotnull(dt#15), (dt#15 >= 2023-11-29)],
PushedFilters: [], ReadSchema:
struct<eventid:string,id:string,referenceid:string,type:string,referencetype:string,sellerid:bigi...
{noformat}
was:
How to reproduce:
{code:sql}
CREATE TABLE order_history_version_audit_rno (
eventid STRING,
id STRING,
referenceid STRING,
type STRING,
referencetype STRING,
sellerid BIGINT,
buyerid BIGINT,
producerid STRING,
versionid INT,
changedocuments ARRAY<STRUCT<clientId: STRING, type: STRING, timestamp:
BIGINT, changeDetails: STRING>>,
dt STRING,
hr STRING)
USING parquet
PARTITIONED BY (dt, hr);
explain cost
CREATE TABLE order_history_version_audit_rno
USING parquet
PARTITIONED BY (dt)
CLUSTERED BY (id) INTO 1000 buckets
AS SELECT * FROM order_history_version_audit_rno
WHERE dt >= '2023-11-29';
{code}
{noformat}
spark-sql (default)>
> explain cost
> CREATE TABLE order_history_version_audit_rno
> USING parquet
> PARTITIONED BY (dt)
> CLUSTERED BY (id) INTO 1000 buckets
> AS SELECT * FROM order_history_version_audit_rno
> WHERE dt >= '2023-11-29';
== Optimized Logical Plan ==
CreateDataSourceTableAsSelectCommand
`spark_catalog`.`default`.`order_history_version_audit_rno`, ErrorIfExists,
[eventid, id, referenceid, type, referencetype, sellerid, buyerid, producerid,
versionid, changedocuments, hr, dt]
+- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
hr#16, dt#15]
+- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
dt#15, hr#16]
+- Filter (dt#15 >= 2023-11-29)
+- SubqueryAlias
spark_catalog.default.order_history_version_audit_rno
+- Relation
spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
parquet
== Physical Plan ==
Execute CreateDataSourceTableAsSelectCommand
+- CreateDataSourceTableAsSelectCommand
`spark_catalog`.`default`.`order_history_version_audit_rno`, ErrorIfExists,
[eventid, id, referenceid, type, referencetype, sellerid, buyerid, producerid,
versionid, changedocuments, hr, dt]
+- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
hr#16, dt#15]
+- Project [eventid#5, id#6, referenceid#7, type#8,
referencetype#9, sellerid#10L, buyerid#11L, producerid#12, versionid#13,
changedocuments#14, dt#15, hr#16]
+- Filter (dt#15 >= 2023-11-29)
+- SubqueryAlias
spark_catalog.default.order_history_version_audit_rno
+- Relation
spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
parquet
{noformat}
> The result of explain is incorrect for CreateTableAsSelect
> ----------------------------------------------------------
>
> Key: SPARK-48660
> URL: https://issues.apache.org/jira/browse/SPARK-48660
> Project: Spark
> Issue Type: Improvement
> Components: SQL
> Affects Versions: 3.5.0, 4.0.0, 3.5.1
> Reporter: Yuming Wang
> Priority: Major
>
> How to reproduce:
> {code:sql}
> CREATE TABLE order_history_version_audit_rno (
> eventid STRING,
> id STRING,
> referenceid STRING,
> type STRING,
> referencetype STRING,
> sellerid BIGINT,
> buyerid BIGINT,
> producerid STRING,
> versionid INT,
> changedocuments ARRAY<STRUCT<clientId: STRING, type: STRING, timestamp:
> BIGINT, changeDetails: STRING>>,
> dt STRING,
> hr STRING)
> USING parquet
> PARTITIONED BY (dt, hr);
> explain cost
> CREATE TABLE order_history_version_audit_rno
> USING parquet
> PARTITIONED BY (dt)
> CLUSTERED BY (id) INTO 1000 buckets
> AS SELECT * FROM order_history_version_audit_rno
> WHERE dt >= '2023-11-29';
> {code}
> {noformat}
> spark-sql (default)>
> > explain cost
> > CREATE TABLE order_history_version_audit_rno
> > USING parquet
> > PARTITIONED BY (dt)
> > CLUSTERED BY (id) INTO 1000 buckets
> > AS SELECT * FROM order_history_version_audit_rno
> > WHERE dt >= '2023-11-29';
> == Optimized Logical Plan ==
> CreateDataSourceTableAsSelectCommand
> `spark_catalog`.`default`.`order_history_version_audit_rno`, ErrorIfExists,
> [eventid, id, referenceid, type, referencetype, sellerid, buyerid,
> producerid, versionid, changedocuments, hr, dt]
> +- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
> sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
> hr#16, dt#15]
> +- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
> sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
> dt#15, hr#16]
> +- Filter (dt#15 >= 2023-11-29)
> +- SubqueryAlias
> spark_catalog.default.order_history_version_audit_rno
> +- Relation
> spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
> parquet
> == Physical Plan ==
> Execute CreateDataSourceTableAsSelectCommand
> +- CreateDataSourceTableAsSelectCommand
> `spark_catalog`.`default`.`order_history_version_audit_rno`, ErrorIfExists,
> [eventid, id, referenceid, type, referencetype, sellerid, buyerid,
> producerid, versionid, changedocuments, hr, dt]
> +- Project [eventid#5, id#6, referenceid#7, type#8, referencetype#9,
> sellerid#10L, buyerid#11L, producerid#12, versionid#13, changedocuments#14,
> hr#16, dt#15]
> +- Project [eventid#5, id#6, referenceid#7, type#8,
> referencetype#9, sellerid#10L, buyerid#11L, producerid#12, versionid#13,
> changedocuments#14, dt#15, hr#16]
> +- Filter (dt#15 >= 2023-11-29)
> +- SubqueryAlias
> spark_catalog.default.order_history_version_audit_rno
> +- Relation
> spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
> parquet
> {noformat}
> If remove create table:
> {noformat}
> > explain cost
> > SELECT * FROM order_history_version_audit_rno
> > WHERE dt >= '2023-11-29';
> == Optimized Logical Plan ==
> Filter (isnotnull(dt#15) AND (dt#15 >= 2023-11-29)),
> Statistics(sizeInBytes=1.0 B)
> +- Relation
> spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
> parquet, Statistics(sizeInBytes=0.0 B)
> == Physical Plan ==
> *(1) ColumnarToRow
> +- FileScan parquet
> spark_catalog.default.order_history_version_audit_rno[eventid#5,id#6,referenceid#7,type#8,referencetype#9,sellerid#10L,buyerid#11L,producerid#12,versionid#13,changedocuments#14,dt#15,hr#16]
> Batched: true, DataFilters: [], Format: Parquet, Location:
> InMemoryFileIndex(0 paths)[], PartitionFilters: [isnotnull(dt#15), (dt#15 >=
> 2023-11-29)], PushedFilters: [], ReadSchema:
> struct<eventid:string,id:string,referenceid:string,type:string,referencetype:string,sellerid:bigi...
> {noformat}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]