[
https://issues.apache.org/jira/browse/HIVE-10686?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14539639#comment-14539639
]
Jesus Camacho Rodriguez commented on HIVE-10686:
------------------------------------------------
The problem is in {{fixTopOBSchema}}, which will not update the column references
for the OrderBy operator in nested RexCall operands.
> java.lang.IndexOutOfBoundsException for query with rank() over(partition ...)
> -----------------------------------------------------------------------------
>
> Key: HIVE-10686
> URL: https://issues.apache.org/jira/browse/HIVE-10686
> Project: Hive
> Issue Type: Bug
> Reporter: Jesus Camacho Rodriguez
> Assignee: Jesus Camacho Rodriguez
>
> CBO throws Index out of bound exception for TPC-DS Q70.
> Query
> {code}
> explain
> select
> sum(ss_net_profit) as total_sum
> ,s_state
> ,s_county
> ,grouping__id as lochierarchy
> , rank() over(partition by grouping__id, case when grouping__id == 2 then
> s_state end order by sum(ss_net_profit)) as rank_within_parent
> from
> store_sales ss join date_dim d1 on d1.d_date_sk = ss.ss_sold_date_sk
> join store s on s.s_store_sk = ss.ss_store_sk
> where
> d1.d_month_seq between 1193 and 1193+11
> and s.s_state in
> ( select s_state
> from (select s_state as s_state, sum(ss_net_profit),
> rank() over ( partition by s_state order by
> sum(ss_net_profit) desc) as ranking
> from store_sales, store, date_dim
> where d_month_seq between 1193 and 1193+11
> and date_dim.d_date_sk =
> store_sales.ss_sold_date_sk
> and store.s_store_sk = store_sales.ss_store_sk
> group by s_state
> ) tmp1
> where ranking <= 5
> )
> group by s_state,s_county with rollup
> order by
> lochierarchy desc
> ,case when lochierarchy = 0 then s_state end
> ,rank_within_parent
> limit 100
> {code}
> Original plan (correct)
> {code}
> HiveSort(fetch=[100])
> HiveSort(sort0=[$3], sort1=[$5], sort2=[$4], dir0=[DESC], dir1=[ASC],
> dir2=[ASC])
> HiveProject(total_sum=[$4], s_state=[$0], s_county=[$1],
> lochierarchy=[$5], rank_within_parent=[rank() OVER (PARTITION BY $5,
> when(==($5, 2), $0) ORDER BY $4 ROWS BETWEEN 2147483647 FOLLOWING AND
> 2147483647 PRECEDING)], (tok_function when (= (tok_table_or_col lochierarchy)
> 0) (tok_table_or_col s_state))=[when(=($5, 0), $0)])
> HiveAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}, {}]],
> indicator=[true], agg#0=[sum($2)], GROUPING__ID=[GROUPING__ID()])
> HiveProject($f0=[$7], $f1=[$6], $f2=[$1])
> HiveJoin(condition=[=($5, $2)], joinType=[inner], algorithm=[none],
> cost=[{1177.2086187101072 rows, 0.0 cpu, 0.0 io}])
> HiveJoin(condition=[=($3, $0)], joinType=[inner],
> algorithm=[none], cost=[{2880430.428726483 rows, 0.0 cpu, 0.0 io}])
> HiveProject(ss_sold_date_sk=[$0], ss_net_profit=[$21],
> ss_store_sk=[$22])
> HiveTableScan(table=[[tpcds.store_sales]])
> HiveProject(d_date_sk=[$0], d_month_seq=[$3])
> HiveFilter(condition=[between(false, $3, 1193, +(1193, 11))])
> HiveTableScan(table=[[tpcds.date_dim]])
> HiveProject(s_store_sk=[$0], s_county=[$1], s_state=[$2])
> SemiJoin(condition=[=($2, $3)], joinType=[inner])
> HiveProject(s_store_sk=[$0], s_county=[$23], s_state=[$24])
> HiveTableScan(table=[[tpcds.store]])
> HiveProject(s_state=[$0])
> HiveFilter(condition=[<=($1, 5)])
> HiveProject((tok_table_or_col s_state)=[$0],
> rank_window_0=[rank() OVER (PARTITION BY $0 ORDER BY $1 DESC ROWS BETWEEN
> 2147483647 FOLLOWING AND 2147483647 PRECEDING)])
> HiveAggregate(group=[{0}], agg#0=[sum($1)])
> HiveProject($f0=[$6], $f1=[$1])
> HiveJoin(condition=[=($5, $2)], joinType=[inner],
> algorithm=[none], cost=[{1177.2086187101072 rows, 0.0 cpu, 0.0 io}])
> HiveJoin(condition=[=($3, $0)], joinType=[inner],
> algorithm=[none], cost=[{2880430.428726483 rows, 0.0 cpu, 0.0 io}])
> HiveProject(ss_sold_date_sk=[$0],
> ss_net_profit=[$21], ss_store_sk=[$22])
> HiveTableScan(table=[[tpcds.store_sales]])
> HiveProject(d_date_sk=[$0], d_month_seq=[$3])
> HiveFilter(condition=[between(false, $3,
> 1193, +(1193, 11))])
> HiveTableScan(table=[[tpcds.date_dim]])
> HiveProject(s_store_sk=[$0], s_state=[$24])
> HiveTableScan(table=[[tpcds.store]])
> {code}
> Plan after fixTopOBSchema (incorrect)
> {code}
> HiveSort(fetch=[100])
> HiveSort(sort0=[$3], sort1=[$5], sort2=[$4], dir0=[DESC], dir1=[ASC],
> dir2=[ASC])
> HiveProject(total_sum=[$4], s_state=[$0], s_county=[$1],
> lochierarchy=[$5], rank_within_parent=[rank() OVER (PARTITION BY $5,
> when(==($5, 2), $0) ORDER BY $4 ROWS BETWEEN 2147483647 FOLLOWING AND
> 2147483647 PRECEDING)])
> HiveAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}, {}]],
> indicator=[true], agg#0=[sum($2)], GROUPING__ID=[GROUPING__ID()])
> HiveProject($f0=[$7], $f1=[$6], $f2=[$1])
> HiveJoin(condition=[=($5, $2)], joinType=[inner], algorithm=[none],
> cost=[{1177.2086187101072 rows, 0.0 cpu, 0.0 io}])
> HiveJoin(condition=[=($3, $0)], joinType=[inner],
> algorithm=[none], cost=[{2880430.428726483 rows, 0.0 cpu, 0.0 io}])
> HiveProject(ss_sold_date_sk=[$0], ss_net_profit=[$21],
> ss_store_sk=[$22])
> HiveTableScan(table=[[tpcds.store_sales]])
> HiveProject(d_date_sk=[$0], d_month_seq=[$3])
> HiveFilter(condition=[between(false, $3, 1193, +(1193, 11))])
> HiveTableScan(table=[[tpcds.date_dim]])
> HiveProject(s_store_sk=[$0], s_county=[$1], s_state=[$2])
> SemiJoin(condition=[=($2, $3)], joinType=[inner])
> HiveProject(s_store_sk=[$0], s_county=[$23], s_state=[$24])
> HiveTableScan(table=[[tpcds.store]])
> HiveProject(s_state=[$0])
> HiveFilter(condition=[<=($1, 5)])
> HiveProject((tok_table_or_col s_state)=[$0],
> rank_window_0=[rank() OVER (PARTITION BY $0 ORDER BY $1 DESC ROWS BETWEEN
> 2147483647 FOLLOWING AND 2147483647 PRECEDING)])
> HiveAggregate(group=[{0}], agg#0=[sum($1)])
> HiveProject($f0=[$6], $f1=[$1])
> HiveJoin(condition=[=($5, $2)], joinType=[inner],
> algorithm=[none], cost=[{1177.2086187101072 rows, 0.0 cpu, 0.0 io}])
> HiveJoin(condition=[=($3, $0)], joinType=[inner],
> algorithm=[none], cost=[{2880430.428726483 rows, 0.0 cpu, 0.0 io}])
> HiveProject(ss_sold_date_sk=[$0],
> ss_net_profit=[$21], ss_store_sk=[$22])
> HiveTableScan(table=[[tpcds.store_sales]])
> HiveProject(d_date_sk=[$0], d_month_seq=[$3])
> HiveFilter(condition=[between(false, $3,
> 1193, +(1193, 11))])
> HiveTableScan(table=[[tpcds.date_dim]])
> HiveProject(s_store_sk=[$0], s_state=[$24])
> HiveTableScan(table=[[tpcds.store]])
> {code}
> Exception
> {code}
> 15/04/14 02:42:52 [main]: ERROR parse.CalcitePlanner: CBO failed, skipping
> CBO.
> java.lang.IndexOutOfBoundsException: Index: 5, Size: 5
> at java.util.ArrayList.rangeCheck(ArrayList.java:635)
> at java.util.ArrayList.get(ArrayList.java:411)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitInputRef(ASTConverter.java:395)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitInputRef(ASTConverter.java:372)
> at org.apache.calcite.rex.RexInputRef.accept(RexInputRef.java:112)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:543)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:372)
> at org.apache.calcite.rex.RexCall.accept(RexCall.java:107)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:543)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:372)
> at org.apache.calcite.rex.RexCall.accept(RexCall.java:107)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convertOBToASTNode(ASTConverter.java:252)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convert(ASTConverter.java:208)
> at
> org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convert(ASTConverter.java:98)
> at
> org.apache.hadoop.hive.ql.parse.CalcitePlanner.getOptimizedAST(CalcitePlanner.java:607)
> at
> org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:239)
> at
> org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10003)
> at
> org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:202)
> at
> org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:224)
> at
> org.apache.hadoop.hive.ql.parse.ExplainSemanticAnalyzer.analyzeInternal(ExplainSemanticAnalyzer.java:74)
> at
> org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:224)
> at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:424)
> at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:308)
> at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1122)
> at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1170)
> at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1059)
> at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1049)
> at
> org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:213)
> at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:165)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:311)
> at
> org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:409)
> at org.apache.hadoop.hive.cli.CliDriver.processFile(CliDriver.java:425)
> at
> org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:714)
> at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:681)
> at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:621)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
> at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:606)
> at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
> at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
> {code}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)