[
https://issues.apache.org/jira/browse/SPARK-14616?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
JESSE CHEN updated SPARK-14616:
-------------------------------
Description:
{code:title=tpcds q44}
select asceding.rnk, i1.i_product_name best_performing, i2.i_product_name
worst_performing
from(select *
from (select item_sk,rank() over (order by rank_col asc) rnk
from (select ss_item_sk item_sk,avg(ss_net_profit) rank_col
from store_sales ss1
where ss_store_sk = 4
group by ss_item_sk
having avg(ss_net_profit) > 0.9*(select avg(ss_net_profit)
rank_col
from store_sales
where ss_store_sk = 4
and ss_addr_sk is null
group by ss_store_sk))V1)V11
where rnk < 11) asceding,
(select *
from (select item_sk,rank() over (order by rank_col desc) rnk
from (select ss_item_sk item_sk,avg(ss_net_profit) rank_col
from store_sales ss1
where ss_store_sk = 4
group by ss_item_sk
having avg(ss_net_profit) > 0.9*(select avg(ss_net_profit)
rank_col
from store_sales
where ss_store_sk = 4
and ss_addr_sk is null
group by ss_store_sk))V2)V21
where rnk < 11) descending,
item i1,
item i2
where asceding.rnk = descending.rnk
and i1.i_item_sk=asceding.item_sk
and i2.i_item_sk=descending.item_sk
order by asceding.rnk
limit 100;
{code}
{noformat}
bin/spark-sql --driver-memory 10g --verbose --master yarn-client --packages
com.databricks:spark-csv_2.10:1.3.0 --executor-memory 4g --num-executors 80
--executor-cores 2 --database hadoopds1g -f q44.sql
{noformat}
{noformat}
org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange SinglePartition, None
+- WholeStageCodegen
: +- Project [item_sk#0,rank_col#1]
: +- Filter havingCondition#219: boolean
: +- TungstenAggregate(key=[ss_item_sk#12],
functions=[(avg(ss_net_profit#32),mode=Final,isDistinct=false)],
output=[havingCondition#219,item_sk#0,rank_col#1])
: +- INPUT
+- Exchange hashpartitioning(ss_item_sk#12,200), None
+- WholeStageCodegen
: +- TungstenAggregate(key=[ss_item_sk#12],
functions=[(avg(ss_net_profit#32),mode=Partial,isDistinct=false)],
output=[ss_item_sk#12,sum#612,count#613L])
: +- Project [ss_item_sk#12,ss_net_profit#32]
: +- Filter (ss_store_sk#17 = 4)
: +- INPUT
+- Scan ParquetRelation:
hadoopds1g.store_sales[ss_item_sk#12,ss_net_profit#32,ss_store_sk#17]
InputPaths:
hdfs://bigaperf116.svl.ibm.com:8020/apps/hive/warehouse/hadoopds1g.db/store_sales,
PushedFilters: [EqualTo(ss_store_sk,4)]
at
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:47)
at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:105)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:116)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at org.apache.spark.sql.execution.Sort.doExecute(Sort.scala:60)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:116)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at org.apache.spark.sql.execution.Window.doExecute(Window.scala:288)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:116)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at
org.apache.spark.sql.execution.InputAdapter.upstream(WholeStageCodegen.scala:176)
at
org.apache.spark.sql.execution.Filter.upstream(basicOperators.scala:73)
at
org.apache.spark.sql.execution.Project.upstream(basicOperators.scala:35)
at
org.apache.spark.sql.execution.WholeStageCodegen.doExecute(WholeStageCodegen.scala:279)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:116)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at org.apache.spark.sql.execution.Sort.doExecute(Sort.scala:60)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:116)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:116)
at
org.apache.spark.sql.execution.joins.SortMergeJoin.doExecute(SortMergeJoin.scala:63)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:118)
{noformat}
Note that this happens only when using Parquet tables. Flat files (CSV) work fine
and return correct query results.
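For reference, here is a minimal PySpark sketch of the pattern that fails here, distilled from q44: an aggregate with a scalar subquery in HAVING feeding an unpartitioned window function, read back from Parquet. This is a hypothetical, untested sketch; the table contents, column subset, and path below are stand-ins, not the actual benchmark data.
{code:title=hypothetical minimal repro (untested sketch)}
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlc = SQLContext(sc)

# Tiny stand-in for store_sales with only the columns q44 touches.
rows = [(1, 4, 7, 2.0), (2, 4, None, -1.0), (3, 4, 7, 3.0)]
cols = ['ss_item_sk', 'ss_store_sk', 'ss_addr_sk', 'ss_net_profit']
sqlc.createDataFrame(rows, cols).write.parquet('/tmp/store_sales.parq')

# Reading the data back from Parquet is what appears to matter; the same
# query over a CSV-backed table reportedly succeeds.
sqlc.read.parquet('/tmp/store_sales.parq').registerTempTable('store_sales')

# Inner block of q44: HAVING with a scalar subquery, then rank() over an
# unpartitioned window (the Exchange SinglePartition in the failing plan).
sqlc.sql("""
select item_sk, rank() over (order by rank_col) rnk
from (select ss_item_sk item_sk, avg(ss_net_profit) rank_col
      from store_sales
      where ss_store_sk = 4
      group by ss_item_sk
      having avg(ss_net_profit) > 0.9 * (select avg(ss_net_profit)
                                         from store_sales
                                         where ss_store_sk = 4
                                           and ss_addr_sk is null
                                         group by ss_store_sk)) v
""").show()
{code}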
Additionally, query 58 encounters the same exception. The query is as
follows:
{noformat}
with ss_items as
(select i_item_id item_id
,sum(ss_ext_sales_price) ss_item_rev
from store_sales
,item
,date_dim
where ss_item_sk = i_item_sk
and d_date in (select d_date
from date_dim
where d_week_seq = (select d_week_seq
from date_dim
where d_date = '2000-01-03'))
and ss_sold_date_sk = d_date_sk
group by i_item_id),
cs_items as
(select i_item_id item_id
,sum(cs_ext_sales_price) cs_item_rev
from catalog_sales
,item
,date_dim
where cs_item_sk = i_item_sk
and d_date in (select d_date
from date_dim
where d_week_seq = (select d_week_seq
from date_dim
where d_date = '2000-01-03'))
and cs_sold_date_sk = d_date_sk
group by i_item_id),
ws_items as
(select i_item_id item_id
,sum(ws_ext_sales_price) ws_item_rev
from web_sales
,item
,date_dim
where ws_item_sk = i_item_sk
and d_date in (select d_date
from date_dim
where d_week_seq =(select d_week_seq
from date_dim
where d_date = '2000-01-03'))
and ws_sold_date_sk = d_date_sk
group by i_item_id)
select ss_items.item_id
,ss_item_rev
,ss_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 ss_dev
,cs_item_rev
,cs_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 cs_dev
,ws_item_rev
,ws_item_rev/((ss_item_rev+cs_item_rev+ws_item_rev)/3) * 100 ws_dev
,(ss_item_rev+cs_item_rev+ws_item_rev)/3 average
from ss_items,cs_items,ws_items
where ss_items.item_id=cs_items.item_id
and ss_items.item_id=ws_items.item_id
and ss_item_rev between 0.9 * cs_item_rev and 1.1 * cs_item_rev
and ss_item_rev between 0.9 * ws_item_rev and 1.1 * ws_item_rev
and cs_item_rev between 0.9 * ss_item_rev and 1.1 * ss_item_rev
and cs_item_rev between 0.9 * ws_item_rev and 1.1 * ws_item_rev
and ws_item_rev between 0.9 * ss_item_rev and 1.1 * ss_item_rev
and ws_item_rev between 0.9 * cs_item_rev and 1.1 * cs_item_rev
order by item_id
,ss_item_rev
limit 100;
{noformat}
Not certain if this is related, but it appears that a bad plan was created.
Here are the logical, analyzed, optimized, and physical plans created for query 44:
{noformat}
== Logical Plan ==
'Limit 100
+- 'Sort ['asceding.rnk ASC], true
+- 'Project
[unresolvedalias('asceding.rnk,None),unresolvedalias('i1.i_product_name AS
best_performing#433,None),unresolvedalias('i2.i_product_name AS
worst_performing#434,None)]
+- 'Filter ((('asceding.rnk = 'descending.rnk) && ('i1.i_item_sk =
'asceding.item_sk)) && ('i2.i_item_sk = 'descending.item_sk))
+- 'Join Inner, None
:- 'Join Inner, None
: :- 'Join Inner, None
: : :- 'Subquery asceding
: : : +- 'Project [unresolvedalias(*,None)]
: : : +- 'Filter ('rnk < 11)
: : : +- 'Subquery V11
: : : +- 'Project
[unresolvedalias('item_sk,None),unresolvedalias('rank()
windowspecdefinition('rank_col ASC,UnspecifiedFrame) AS rnk#428,None)]
: : : +- 'Subquery V1
: : : +- 'Filter cast(('avg('ss_net_profit) >
(0.9 * scalarsubquery('Aggregate ['ss_store_sk],
[unresolvedalias('avg('ss_net_profit) AS rank_col#427,None)]
+- 'Filter (('ss_store_sk = 4) && isnull('ss_addr_sk))
+- 'UnresolvedRelation `store_sales`, None
))) as boolean)
: : : +- 'Aggregate ['ss_item_sk],
[unresolvedalias('ss_item_sk AS
item_sk#425,None),unresolvedalias('avg('ss_net_profit) AS rank_col#426,None)]
: : : +- 'Filter ('ss_store_sk = 4)
: : : +- 'UnresolvedRelation
`store_sales`, Some(ss1)
: : +- 'Subquery descending
: : +- 'Project [unresolvedalias(*,None)]
: : +- 'Filter ('rnk < 11)
: : +- 'Subquery V21
: : +- 'Project
[unresolvedalias('item_sk,None),unresolvedalias('rank()
windowspecdefinition('rank_col DESC,UnspecifiedFrame) AS rnk#432,None)]
: : +- 'Subquery V2
: : +- 'Filter cast(('avg('ss_net_profit) >
(0.9 * scalarsubquery('Aggregate ['ss_store_sk],
[unresolvedalias('avg('ss_net_profit) AS rank_col#431,None)]
+- 'Filter (('ss_store_sk = 4) && isnull('ss_addr_sk))
+- 'UnresolvedRelation `store_sales`, None
))) as boolean)
: : +- 'Aggregate ['ss_item_sk],
[unresolvedalias('ss_item_sk AS
item_sk#429,None),unresolvedalias('avg('ss_net_profit) AS rank_col#430,None)]
: : +- 'Filter ('ss_store_sk = 4)
: : +- 'UnresolvedRelation
`store_sales`, Some(ss1)
: +- 'UnresolvedRelation `item`, Some(i1)
+- 'UnresolvedRelation `item`, Some(i2)
== Analyzed Plan ==
Limit 100
+- Sort [rnk#428 ASC], true
+- Project [rnk#428,i_product_name#202 AS
best_performing#433,i_product_name#478 AS worst_performing#434]
+- Filter (((rnk#428 = rnk#432) && (i_item_sk#181 = item_sk#425)) &&
(i_item_sk#457 = item_sk#429))
+- Join Inner, None
:- Join Inner, None
: :- Join Inner, None
: : :- Subquery asceding
: : : +- Project [item_sk#425,rnk#428]
: : : +- Filter (rnk#428 < 11)
: : : +- Subquery V11
: : : +- Project [item_sk#425,rnk#428]
: : : +- Project
[item_sk#425,rank_col#426,rnk#428,rnk#428]
: : : +- Window [item_sk#425,rank_col#426],
[rank(rank_col#426) windowspecdefinition(rank_col#426 ASC,ROWS BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW) AS rnk#428], [rank_col#426 ASC]
: : : +- Project [item_sk#425,rank_col#426]
: : : +- Subquery V1
: : : +- Project
[item_sk#425,rank_col#426]
: : : +- Filter havingCondition#439:
boolean
: : : +- Aggregate
[ss_item_sk#282], [cast(((avg(cast(ss_net_profit#302 as
double)),mode=Complete,isDistinct=false) > (cast(0.9 as double) *
scalarsubquery(Aggregate [ss_store_sk#287], [(avg(cast(ss_net_profit#302 as
double)),mode=Complete,isDistinct=false) AS rank_col#427]
+- Filter ((ss_store_sk#287 = 4) && isnull(ss_addr_sk#286))
+- Subquery store_sales
+-
Relation[ss_sold_date_sk#280,ss_sold_time_sk#281,ss_item_sk#282,ss_customer_sk#283,ss_cdemo_sk#284,ss_hdemo_sk#285,ss_addr_sk#286,ss_store_sk#287,ss_promo_sk#288,ss_ticket_number#289L,ss_quantity#290L,ss_wholesale_cost#291,ss_list_price#292,ss_sales_price#293,ss_ext_discount_amt#294,ss_ext_sales_price#295,ss_ext_wholesale_cost#296,ss_ext_list_price#297,ss_ext_tax#298,ss_coupon_amt#299,ss_net_paid#300,ss_net_paid_inc_tax#301,ss_net_profit#302]
ParquetRelation
))) as boolean) AS havingCondition#439,ss_item_sk#282 AS
item_sk#425,(avg(cast(ss_net_profit#302 as
double)),mode=Complete,isDistinct=false) AS rank_col#426]
: : : +- Filter
(ss_store_sk#287 = 4)
: : : +- Subquery ss1
: : : +- Subquery
store_sales
: : : +-
Relation[ss_sold_date_sk#280,ss_sold_time_sk#281,ss_item_sk#282,ss_customer_sk#283,ss_cdemo_sk#284,ss_hdemo_sk#285,ss_addr_sk#286,ss_store_sk#287,ss_promo_sk#288,ss_ticket_number#289L,ss_quantity#290L,ss_wholesale_cost#291,ss_list_price#292,ss_sales_price#293,ss_ext_discount_amt#294,ss_ext_sales_price#295,ss_ext_wholesale_cost#296,ss_ext_list_price#297,ss_ext_tax#298,ss_coupon_amt#299,ss_net_paid#300,ss_net_paid_inc_tax#301,ss_net_profit#302]
ParquetRelation
: : +- Subquery descending
: : +- Project [item_sk#429,rnk#432]
: : +- Filter (rnk#432 < 11)
: : +- Subquery V21
: : +- Project [item_sk#429,rnk#432]
: : +- Project
[item_sk#429,rank_col#430,rnk#432,rnk#432]
: : +- Window [item_sk#429,rank_col#430],
[rank(rank_col#430) windowspecdefinition(rank_col#430 DESC,ROWS BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW) AS rnk#432], [rank_col#430 DESC]
: : +- Project [item_sk#429,rank_col#430]
: : +- Subquery V2
: : +- Project
[item_sk#429,rank_col#430]
: : +- Filter havingCondition#440:
boolean
: : +- Aggregate
[ss_item_sk#282], [cast(((avg(cast(ss_net_profit#302 as
double)),mode=Complete,isDistinct=false) > (cast(0.9 as double) *
scalarsubquery(Aggregate [ss_store_sk#287], [(avg(cast(ss_net_profit#302 as
double)),mode=Complete,isDistinct=false) AS rank_col#431]
+- Filter ((ss_store_sk#287 = 4) && isnull(ss_addr_sk#286))
+- Subquery store_sales
+-
Relation[ss_sold_date_sk#280,ss_sold_time_sk#281,ss_item_sk#282,ss_customer_sk#283,ss_cdemo_sk#284,ss_hdemo_sk#285,ss_addr_sk#286,ss_store_sk#287,ss_promo_sk#288,ss_ticket_number#289L,ss_quantity#290L,ss_wholesale_cost#291,ss_list_price#292,ss_sales_price#293,ss_ext_discount_amt#294,ss_ext_sales_price#295,ss_ext_wholesale_cost#296,ss_ext_list_price#297,ss_ext_tax#298,ss_coupon_amt#299,ss_net_paid#300,ss_net_paid_inc_tax#301,ss_net_profit#302]
ParquetRelation
))) as boolean) AS havingCondition#440,ss_item_sk#282 AS
item_sk#429,(avg(cast(ss_net_profit#302 as
double)),mode=Complete,isDistinct=false) AS rank_col#430]
: : +- Filter
(ss_store_sk#287 = 4)
: : +- Subquery ss1
: : +- Subquery
store_sales
: : +-
Relation[ss_sold_date_sk#280,ss_sold_time_sk#281,ss_item_sk#282,ss_customer_sk#283,ss_cdemo_sk#284,ss_hdemo_sk#285,ss_addr_sk#286,ss_store_sk#287,ss_promo_sk#288,ss_ticket_number#289L,ss_quantity#290L,ss_wholesale_cost#291,ss_list_price#292,ss_sales_price#293,ss_ext_discount_amt#294,ss_ext_sales_price#295,ss_ext_wholesale_cost#296,ss_ext_list_price#297,ss_ext_tax#298,ss_coupon_amt#299,ss_net_paid#300,ss_net_paid_inc_tax#301,ss_net_profit#302]
ParquetRelation
: +- Subquery i1
: +- Subquery item
: +-
Relation[i_item_sk#181,i_item_id#182,i_rec_start_date#183,i_rec_end_date#184,i_item_desc#185,i_current_price#186,i_wholesale_cost#187,i_brand_id#188L,i_brand#189,i_class_id#190L,i_class#191,i_category_id#192L,i_category#193,i_manufact_id#194L,i_manufact#195,i_size#196,i_formulation#197,i_color#198,i_units#199,i_container#200,i_manager_id#201L,i_product_name#202]
ParquetRelation
+- Subquery i2
+- Subquery item
+-
Relation[i_item_sk#457,i_item_id#458,i_rec_start_date#459,i_rec_end_date#460,i_item_desc#461,i_current_price#462,i_wholesale_cost#463,i_brand_id#464L,i_brand#465,i_class_id#466L,i_class#467,i_category_id#468L,i_category#469,i_manufact_id#470L,i_manufact#471,i_size#472,i_formulation#473,i_color#474,i_units#475,i_container#476,i_manager_id#477L,i_product_name#478]
ParquetRelation
== Optimized Plan ==
Limit 100
+- Sort [rnk#428 ASC], true
+- Project [rnk#428,i_product_name#202 AS
best_performing#433,i_product_name#478 AS worst_performing#434]
+- Join Inner, Some((i_item_sk#457 = item_sk#429))
:- Project [rnk#428,item_sk#429,i_product_name#202]
: +- Join Inner, Some((i_item_sk#181 = item_sk#425))
: :- Project [rnk#428,item_sk#429,item_sk#425]
: : +- Join Inner, Some((rnk#428 = rnk#432))
: : :- Project [item_sk#425,rnk#428]
: : : +- Filter (rnk#428 < 11)
: : : +- Window [item_sk#425,rank_col#426],
[rank(rank_col#426) windowspecdefinition(rank_col#426 ASC,ROWS BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW) AS rnk#428], [rank_col#426 ASC]
: : : +- Project [item_sk#425,rank_col#426]
: : : +- Filter havingCondition#439: boolean
: : : +- Aggregate [ss_item_sk#282],
[((avg(cast(ss_net_profit#302 as double)),mode=Complete,isDistinct=false) >
(0.9 * scalarsubquery(Aggregate [ss_store_sk#287], [(avg(cast(ss_net_profit#302
as double)),mode=Complete,isDistinct=false) AS rank_col#427]
+- Filter ((ss_store_sk#287 = 4) && isnull(ss_addr_sk#286))
+- Subquery store_sales
+-
Relation[ss_sold_date_sk#280,ss_sold_time_sk#281,ss_item_sk#282,ss_customer_sk#283,ss_cdemo_sk#284,ss_hdemo_sk#285,ss_addr_sk#286,ss_store_sk#287,ss_promo_sk#288,ss_ticket_number#289L,ss_quantity#290L,ss_wholesale_cost#291,ss_list_price#292,ss_sales_price#293,ss_ext_discount_amt#294,ss_ext_sales_price#295,ss_ext_wholesale_cost#296,ss_ext_list_price#297,ss_ext_tax#298,ss_coupon_amt#299,ss_net_paid#300,ss_net_paid_inc_tax#301,ss_net_profit#302]
ParquetRelation
))) AS havingCondition#439,ss_item_sk#282 AS
item_sk#425,(avg(cast(ss_net_profit#302 as
double)),mode=Complete,isDistinct=false) AS rank_col#426]
: : : +- Project
[ss_item_sk#282,ss_net_profit#302]
: : : +- Filter (ss_store_sk#287 = 4)
: : : +-
Relation[ss_sold_date_sk#280,ss_sold_time_sk#281,ss_item_sk#282,ss_customer_sk#283,ss_cdemo_sk#284,ss_hdemo_sk#285,ss_addr_sk#286,ss_store_sk#287,ss_promo_sk#288,ss_ticket_number#289L,ss_quantity#290L,ss_wholesale_cost#291,ss_list_price#292,ss_sales_price#293,ss_ext_discount_amt#294,ss_ext_sales_price#295,ss_ext_wholesale_cost#296,ss_ext_list_price#297,ss_ext_tax#298,ss_coupon_amt#299,ss_net_paid#300,ss_net_paid_inc_tax#301,ss_net_profit#302]
ParquetRelation
: : +- Project [item_sk#429,rnk#432]
: : +- Filter (rnk#432 < 11)
: : +- Window [item_sk#429,rank_col#430],
[rank(rank_col#430) windowspecdefinition(rank_col#430 DESC,ROWS BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW) AS rnk#432], [rank_col#430 DESC]
: : +- Project [item_sk#429,rank_col#430]
: : +- Filter havingCondition#440: boolean
: : +- Aggregate [ss_item_sk#282],
[((avg(cast(ss_net_profit#302 as double)),mode=Complete,isDistinct=false) >
(0.9 * scalarsubquery(Aggregate [ss_store_sk#287], [(avg(cast(ss_net_profit#302
as double)),mode=Complete,isDistinct=false) AS rank_col#431]
+- Filter ((ss_store_sk#287 = 4) && isnull(ss_addr_sk#286))
+- Subquery store_sales
+-
Relation[ss_sold_date_sk#280,ss_sold_time_sk#281,ss_item_sk#282,ss_customer_sk#283,ss_cdemo_sk#284,ss_hdemo_sk#285,ss_addr_sk#286,ss_store_sk#287,ss_promo_sk#288,ss_ticket_number#289L,ss_quantity#290L,ss_wholesale_cost#291,ss_list_price#292,ss_sales_price#293,ss_ext_discount_amt#294,ss_ext_sales_price#295,ss_ext_wholesale_cost#296,ss_ext_list_price#297,ss_ext_tax#298,ss_coupon_amt#299,ss_net_paid#300,ss_net_paid_inc_tax#301,ss_net_profit#302]
ParquetRelation
))) AS havingCondition#440,ss_item_sk#282 AS
item_sk#429,(avg(cast(ss_net_profit#302 as
double)),mode=Complete,isDistinct=false) AS rank_col#430]
: : +- Project
[ss_item_sk#282,ss_net_profit#302]
: : +- Filter (ss_store_sk#287 = 4)
: : +-
Relation[ss_sold_date_sk#280,ss_sold_time_sk#281,ss_item_sk#282,ss_customer_sk#283,ss_cdemo_sk#284,ss_hdemo_sk#285,ss_addr_sk#286,ss_store_sk#287,ss_promo_sk#288,ss_ticket_number#289L,ss_quantity#290L,ss_wholesale_cost#291,ss_list_price#292,ss_sales_price#293,ss_ext_discount_amt#294,ss_ext_sales_price#295,ss_ext_wholesale_cost#296,ss_ext_list_price#297,ss_ext_tax#298,ss_coupon_amt#299,ss_net_paid#300,ss_net_paid_inc_tax#301,ss_net_profit#302]
ParquetRelation
: +- Project [i_item_sk#181,i_product_name#202]
: +-
Relation[i_item_sk#181,i_item_id#182,i_rec_start_date#183,i_rec_end_date#184,i_item_desc#185,i_current_price#186,i_wholesale_cost#187,i_brand_id#188L,i_brand#189,i_class_id#190L,i_class#191,i_category_id#192L,i_category#193,i_manufact_id#194L,i_manufact#195,i_size#196,i_formulation#197,i_color#198,i_units#199,i_container#200,i_manager_id#201L,i_product_name#202]
ParquetRelation
+- Project [i_item_sk#457,i_product_name#478]
+-
Relation[i_item_sk#457,i_item_id#458,i_rec_start_date#459,i_rec_end_date#460,i_item_desc#461,i_current_price#462,i_wholesale_cost#463,i_brand_id#464L,i_brand#465,i_class_id#466L,i_class#467,i_category_id#468L,i_category#469,i_manufact_id#470L,i_manufact#471,i_size#472,i_formulation#473,i_color#474,i_units#475,i_container#476,i_manager_id#477L,i_product_name#478]
ParquetRelation
== Physical Plan ==
TakeOrderedAndProject(limit=100, orderBy=[rnk#428 ASC], output=[rnk#428,best_performing#433,worst_performing#434])
+- WholeStageCodegen
: +- Project [rnk#428,i_product_name#202 AS
best_performing#433,i_product_name#478 AS worst_performing#434]
: +- BroadcastHashJoin [item_sk#429], [i_item_sk#457], BuildRight, None
: :- Project [rnk#428,item_sk#429,i_product_name#202]
: : +- BroadcastHashJoin [item_sk#425], [i_item_sk#181], BuildRight,
None
: : :- Project [rnk#428,item_sk#429,item_sk#425]
: : : +- INPUT
: : +- INPUT
: +- INPUT
:- SortMergeJoin [rnk#428], [rnk#432], None
: :- Sort [rnk#428 ASC], false, 0
: : +- WholeStageCodegen
: : : +- Project [item_sk#425,rnk#428]
: : : +- Filter (rnk#428 < 11)
: : : +- INPUT
: : +- Window [item_sk#425,rank_col#426], [rank(rank_col#426)
windowspecdefinition(rank_col#426 ASC,ROWS BETWEEN UNBOUNDED PRECEDING AND
CURRENT ROW) AS rnk#428], [rank_col#426 ASC]
: : +- Sort [rank_col#426 ASC], false, 0
: : +- Exchange SinglePartition, None
: : +- WholeStageCodegen
: : : +- Project [item_sk#425,rank_col#426]
: : : +- Filter havingCondition#439: boolean
: : : +- TungstenAggregate(key=[ss_item_sk#282],
functions=[(avg(cast(ss_net_profit#302 as
double)),mode=Final,isDistinct=false)],
output=[havingCondition#439,item_sk#425,rank_col#426])
: : : +- INPUT
: : +- Exchange hashpartitioning(ss_item_sk#282,200), None
: : +- WholeStageCodegen
: : : +- TungstenAggregate(key=[ss_item_sk#282],
functions=[(avg(cast(ss_net_profit#302 as
double)),mode=Partial,isDistinct=false)],
output=[ss_item_sk#282,sum#506,count#507L])
: : : +- Project
[ss_item_sk#282,ss_net_profit#302]
: : : +- Filter (ss_store_sk#287 = 4)
: : : +- INPUT
: : +- Scan
ParquetRelation[ss_item_sk#282,ss_net_profit#302,ss_store_sk#287] InputPaths:
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/_SUCCESS,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/_common_metadata,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/_metadata,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/part-r-00000-2245f9f3-e00c-4c6c-b9cd-62179efcb22c.gz.parquet,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/part-r-00001-2245f9f3-e00c-4c6c-b9cd-62179efcb22c.gz.parquet,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/part-r-00002-2245f9f3-e00c-4c6c-b9cd-62179efcb22c.gz.parquet,
PushedFilters: [EqualTo(ss_store_sk,4)]
: +- Sort [rnk#432 ASC], false, 0
: +- WholeStageCodegen
: : +- Project [item_sk#429,rnk#432]
: : +- Filter (rnk#432 < 11)
: : +- INPUT
: +- Window [item_sk#429,rank_col#430], [rank(rank_col#430)
windowspecdefinition(rank_col#430 DESC,ROWS BETWEEN UNBOUNDED PRECEDING AND
CURRENT ROW) AS rnk#432], [rank_col#430 DESC]
: +- Sort [rank_col#430 DESC], false, 0
: +- Exchange SinglePartition, None
: +- WholeStageCodegen
: : +- Project [item_sk#429,rank_col#430]
: : +- Filter havingCondition#440: boolean
: : +- TungstenAggregate(key=[ss_item_sk#282],
functions=[(avg(cast(ss_net_profit#302 as
double)),mode=Final,isDistinct=false)],
output=[havingCondition#440,item_sk#429,rank_col#430])
: : +- INPUT
: +- Exchange hashpartitioning(ss_item_sk#282,200), None
: +- WholeStageCodegen
: : +- TungstenAggregate(key=[ss_item_sk#282],
functions=[(avg(cast(ss_net_profit#302 as
double)),mode=Partial,isDistinct=false)],
output=[ss_item_sk#282,sum#511,count#512L])
: : +- Project
[ss_item_sk#282,ss_net_profit#302]
: : +- Filter (ss_store_sk#287 = 4)
: : +- INPUT
: +- Scan
ParquetRelation[ss_item_sk#282,ss_net_profit#302,ss_store_sk#287] InputPaths:
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/_SUCCESS,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/_common_metadata,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/_metadata,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/part-r-00000-2245f9f3-e00c-4c6c-b9cd-62179efcb22c.gz.parquet,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/part-r-00001-2245f9f3-e00c-4c6c-b9cd-62179efcb22c.gz.parquet,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/store_sales/part-r-00002-2245f9f3-e00c-4c6c-b9cd-62179efcb22c.gz.parquet,
PushedFilters: [EqualTo(ss_store_sk,4)]
:- Scan ParquetRelation[i_item_sk#181,i_product_name#202] InputPaths:
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/_SUCCESS,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/_common_metadata,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/_metadata,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/part-r-00000-09ab6229-4b98-42a8-988e-2f1706436007.gz.parquet,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/part-r-00001-09ab6229-4b98-42a8-988e-2f1706436007.gz.parquet
+- Scan ParquetRelation[i_item_sk#457,i_product_name#478] InputPaths:
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/_SUCCESS,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/_common_metadata,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/_metadata,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/part-r-00000-09ab6229-4b98-42a8-988e-2f1706436007.gz.parquet,
hdfs://bigaperf116.svl.ibm.com:8020/user/spark/hadoopds1gbparquet/item/part-r-00001-09ab6229-4b98-42a8-988e-2f1706436007.gz.parquet
{noformat}
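For completeness, the plan dump above can be regenerated without executing the failing query. A sketch, assuming the q44.sql file and hadoopds1g database from the spark-sql invocation above; explain(extended=True) is standard PySpark, the rest is assumption:
{code:title=dumping the plans (sketch)}
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext()
sqlc = HiveContext(sc)
sqlc.sql('use hadoopds1g')

# sqlc.sql() takes a single statement, so drop the trailing semicolon.
with open('q44.sql') as f:
    query = f.read().strip().rstrip(';')

# explain(extended=True) prints the logical, analyzed, optimized, and
# physical plans without running the query.
sqlc.sql(query).explain(extended=True)
{code}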
was:
{code:title=/tmp/bug.py}
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
sc = SparkContext()
sqlc = SQLContext(sc)
R = Row('id', 'foo')
r = sqlc.createDataFrame(sc.parallelize([R('abc', 'foo')]))
q = sqlc.createDataFrame(sc.parallelize([R('aaaaaaaaaaaaaaaaaaaaaaaa', 'bar')]))
q.write.parquet('/tmp/1.parq')
q = sqlc.read.parquet('/tmp/1.parq')
j = r.join(q, r.id == q.id)
print j.count()
{code}
{noformat}
[user@sandbox test]$ spark-submit --executor-memory=32g /tmp/bug.py
[user@sandbox test]$ hadoop fs -rmr /tmp/1.parq
{noformat}
{noformat}
15/11/04 04:28:38 INFO codegen.GenerateUnsafeProjection: Code generated in
119.90324 ms
Traceback (most recent call last):
File "/tmp/bug.py", line 13, in <module>
print j.count()
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/dataframe.py", line
268, in count
File "/usr/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py",
line 538, in __call__
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 36,
in deco
File "/usr/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py", line
300, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o148.count.
: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
TungstenAggregate(key=[], functions=[(count(1),mode=Final,isDistinct=false)],
output=[count#10L])
TungstenExchange SinglePartition
TungstenAggregate(key=[],
functions=[(count(1),mode=Partial,isDistinct=false)], output=[currentCount#13L])
TungstenProject
BroadcastHashJoin [id#0], [id#8], BuildRight
TungstenProject [id#0]
Scan PhysicalRDD[id#0,foo#1]
ConvertToUnsafe
Scan ParquetRelation[hdfs:///tmp/1.parq][id#8]
at
org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
at
org.apache.spark.sql.execution.aggregate.TungstenAggregate.doExecute(TungstenAggregate.scala:69)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:140)
at
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:138)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:138)
at
org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:174)
at
org.apache.spark.sql.DataFrame$$anonfun$collect$1.apply(DataFrame.scala:1385)
at
org.apache.spark.sql.DataFrame$$anonfun$collect$1.apply(DataFrame.scala:1385)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
at
org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:1903)
at org.apache.spark.sql.DataFrame.collect(DataFrame.scala:1384)
at org.apache.spark.sql.DataFrame.count(DataFrame.scala:1402)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
{noformat}
Note this happens only under the following conditions:
# executor memory >= 32 GB (doesn't fail with up to 31 GB)
# the ID in the q dataframe has exactly 24 chars (doesn't fail with fewer or
more than 24 chars)
# q is read from parquet
> TreeNodeException running Q44 and 58 on Parquet tables
> ------------------------------------------------------
>
> Key: SPARK-14616
> URL: https://issues.apache.org/jira/browse/SPARK-14616
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 2.0.0
> Reporter: JESSE CHEN
>