[ https://issues.apache.org/jira/browse/SPARK-34145?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17696545#comment-17696545 ]
Sujith Chacko commented on SPARK-34145: --------------------------------------- [~yumwang] Is anyone working on this feature? > Combine scalar subqueries > ------------------------- > > Key: SPARK-34145 > URL: https://issues.apache.org/jira/browse/SPARK-34145 > Project: Spark > Issue Type: Improvement > Components: SQL > Affects Versions: 3.2.0 > Reporter: Yuming Wang > Priority: Major > > We can add a rule to combine scalar subqueries if it from same table to > improve query performance. for example: > {code:sql} > -- TPC-DS q9.sql > SELECT > CASE WHEN (SELECT count(*) > FROM store_sales > WHERE ss_quantity BETWEEN 1 AND 20) > 62316685 > THEN (SELECT avg(ss_ext_discount_amt) > FROM store_sales > WHERE ss_quantity BETWEEN 1 AND 20) > ELSE (SELECT avg(ss_net_paid) > FROM store_sales > WHERE ss_quantity BETWEEN 1 AND 20) END bucket1, > CASE WHEN (SELECT count(*) > FROM store_sales > WHERE ss_quantity BETWEEN 21 AND 40) > 19045798 > THEN (SELECT avg(ss_ext_discount_amt) > FROM store_sales > WHERE ss_quantity BETWEEN 21 AND 40) > ELSE (SELECT avg(ss_net_paid) > FROM store_sales > WHERE ss_quantity BETWEEN 21 AND 40) END bucket2, > CASE WHEN (SELECT count(*) > FROM store_sales > WHERE ss_quantity BETWEEN 41 AND 60) > 365541424 > THEN (SELECT avg(ss_ext_discount_amt) > FROM store_sales > WHERE ss_quantity BETWEEN 41 AND 60) > ELSE (SELECT avg(ss_net_paid) > FROM store_sales > WHERE ss_quantity BETWEEN 41 AND 60) END bucket3, > CASE WHEN (SELECT count(*) > FROM store_sales > WHERE ss_quantity BETWEEN 61 AND 80) > 216357808 > THEN (SELECT avg(ss_ext_discount_amt) > FROM store_sales > WHERE ss_quantity BETWEEN 61 AND 80) > ELSE (SELECT avg(ss_net_paid) > FROM store_sales > WHERE ss_quantity BETWEEN 61 AND 80) END bucket4, > CASE WHEN (SELECT count(*) > FROM store_sales > WHERE ss_quantity BETWEEN 81 AND 100) > 184483884 > THEN (SELECT avg(ss_ext_discount_amt) > FROM store_sales > WHERE ss_quantity BETWEEN 81 AND 100) > ELSE (SELECT avg(ss_net_paid) > FROM store_sales > WHERE ss_quantity BETWEEN 81 AND 100) END bucket5 > FROM reason > WHERE r_reason_sk = 1 > {code} > We can rewrite it to: > {code:sql} > WITH bucket_result AS ( > SELECT > CASE WHEN (count(ss_quantity) FILTER (WHERE ss_quantity BETWEEN 1 AND > 20)) > 62316685 > THEN (avg(ss_ext_discount_amt) FILTER (WHERE ss_quantity BETWEEN 1 AND > 20)) > ELSE (avg(ss_net_paid) FILTER (WHERE ss_quantity BETWEEN 1 AND 20)) END > bucket1, > CASE WHEN (count(ss_quantity) FILTER (WHERE ss_quantity BETWEEN 21 AND > 40)) > 62316685 > THEN (avg(ss_ext_discount_amt) FILTER (WHERE ss_quantity BETWEEN 21 AND > 40)) > ELSE (avg(ss_net_paid) FILTER (WHERE ss_quantity BETWEEN 21 AND 40)) END > bucket2, > CASE WHEN (count(ss_quantity) FILTER (WHERE ss_quantity BETWEEN 41 AND > 60)) > 62316685 > THEN (avg(ss_ext_discount_amt) FILTER (WHERE ss_quantity BETWEEN 41 AND > 60)) > ELSE (avg(ss_net_paid) FILTER (WHERE ss_quantity BETWEEN 41 AND 60)) END > bucket3, > CASE WHEN (count(ss_quantity) FILTER (WHERE ss_quantity BETWEEN 61 AND > 80)) > 62316685 > THEN (avg(ss_ext_discount_amt) FILTER (WHERE ss_quantity BETWEEN 61 AND > 80)) > ELSE (avg(ss_net_paid) FILTER (WHERE ss_quantity BETWEEN 61 AND 80)) END > bucket4, > CASE WHEN (count(ss_quantity) FILTER (WHERE ss_quantity BETWEEN 81 AND > 100)) > 62316685 > THEN (avg(ss_ext_discount_amt) FILTER (WHERE ss_quantity BETWEEN 81 AND > 100)) > ELSE (avg(ss_net_paid) FILTER (WHERE ss_quantity BETWEEN 81 AND 100)) END > bucket5 > FROM store_sales > ) > SELECT > (SELECT bucket1 FROM bucket_result) as bucket1, > (SELECT bucket2 FROM bucket_result) as bucket2, > (SELECT bucket3 FROM bucket_result) as bucket3, > (SELECT bucket4 FROM bucket_result) as bucket4, > (SELECT bucket5 FROM bucket_result) as bucket5 > FROM reason > WHERE r_reason_sk = 1; > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org