okumin commented on code in PR #5707: URL: https://github.com/apache/hive/pull/5707#discussion_r2007001086
########## ql/src/test/queries/clientpositive/bucketmapjoin_auto_reduce_parallel.q: ########## @@ -0,0 +1,24 @@ +create table source_table2(date_col date, string_col string, decimal_col decimal(38,0)) clustered by (decimal_col) into 7 buckets; +insert into table source_table2 values +('2022-08-30', 'pipeline', '50000000000000000005905545593'), ('2022-08-16', 'pipeline', '50000000000000000005905545593'), ('2022-09-01', 'pipeline', '50000000000000000006008686831'), ('2022-08-30', 'pipeline', '50000000000000000005992620837'), ('2022-09-01', 'pipeline', '50000000000000000005992620837'), ('2022-09-01', 'pipeline', '50000000000000000005992621067'), +('2022-08-30', 'pipeline', '50000000000000000005992621067'); + +create table target_table2(date_col date, string_col string, decimal_col decimal(38,0)) clustered by (decimal_col) into 7 buckets; +insert into table target_table2 values +('2017-05-17', 'pipeline', '50000000000000000000441610525'), ('2018-12-20', 'pipeline', '50000000000000000001048981030'), ('2020-06-30', 'pipeline', '50000000000000000002332575516'), ('2021-08-16', 'pipeline', '50000000000000000003897973989'), ('2017-06-06', 'pipeline', '50000000000000000000449148729'), ('2017-09-08', 'pipeline', '50000000000000000000525378314'), +('2022-08-30', 'pipeline', '50000000000000000005905545593'), ('2022-08-16', 'pipeline', '50000000000000000005905545593'), ('2018-05-03', 'pipeline', '50000000000000000000750826355'), ('2020-01-10', 'pipeline', '50000000000000000001816579677'), ('2021-11-01', 'pipeline', '50000000000000000004269423714'), ('2017-11-07', 'pipeline', '50000000000000000000585901787'), +('2019-10-15', 'pipeline', '50000000000000000001598843430'), ('2020-04-01', 'pipeline', '50000000000000000002035795461'), ('2020-02-24', 'pipeline', '50000000000000000001932600185'), ('2020-04-27', 'pipeline', '50000000000000000002108160849'), ('2016-07-05', 'pipeline', '50000000000000000000054405114'), ('2020-06-02', 'pipeline', '50000000000000000002234387967'), 
+('2020-08-21', 'pipeline', '50000000000000000002529168758'), ('2021-02-17', 'pipeline', '50000000000000000003158511687'); + +set hive.auto.convert.join=true; +set hive.optimize.dynamic.partition.hashjoin=false; +set hive.convert.join.bucket.mapjoin.tez=true; +set hive.vectorized.execution.enabled=false; Review Comment: We might not need `set hive.optimize.dynamic.partition.hashjoin=false;` and `set hive.convert.join.bucket.mapjoin.tez=true;` because [these are already the default values](https://github.com/apache/hive/blob/master/data/conf/llap/hive-site.xml). Why do we have to disable `hive.vectorized.execution.enabled`? ########## ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConvertJoinMapJoin.java: ########## @@ -701,6 +701,8 @@ private boolean convertJoinBucketMapJoin(JoinOperator joinOp, OptimizeTezProcCon } MapJoinDesc joinDesc = mapJoinOp.getConf(); joinDesc.setBucketMapJoin(true); + // Disabling TEZ_AUTO_REDUCER_PARALLELISM until TEZ-4603 is fixed. + context.conf.setBoolVar(ConfVars.TEZ_AUTO_REDUCER_PARALLELISM, false); Review Comment: How about removing the AUTOPARALLEL trait? Auto-reducer parallelism is enabled [only when a ReduceSinkOperator has AUTOPARALLEL](https://github.com/apache/hive/blob/2d528556b69c1ec011e83f69d2a2fdcfb78b356e/ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java#L125). So we could instead remove the trait from all parent ReduceSinkOperators (RSOs) of the MapJoinOperator. The current approach will likely work. However, it might be surprising to mutate a configuration here, and it will likely disable auto-reducer parallelism on all vertices. 
########## ql/src/test/queries/clientpositive/bucketmapjoin_auto_reduce_parallel.q: ########## @@ -0,0 +1,24 @@ +create table source_table2(date_col date, string_col string, decimal_col decimal(38,0)) clustered by (decimal_col) into 7 buckets; +insert into table source_table2 values +('2022-08-30', 'pipeline', '50000000000000000005905545593'), ('2022-08-16', 'pipeline', '50000000000000000005905545593'), ('2022-09-01', 'pipeline', '50000000000000000006008686831'), ('2022-08-30', 'pipeline', '50000000000000000005992620837'), ('2022-09-01', 'pipeline', '50000000000000000005992620837'), ('2022-09-01', 'pipeline', '50000000000000000005992621067'), +('2022-08-30', 'pipeline', '50000000000000000005992621067'); + +create table target_table2(date_col date, string_col string, decimal_col decimal(38,0)) clustered by (decimal_col) into 7 buckets; +insert into table target_table2 values +('2017-05-17', 'pipeline', '50000000000000000000441610525'), ('2018-12-20', 'pipeline', '50000000000000000001048981030'), ('2020-06-30', 'pipeline', '50000000000000000002332575516'), ('2021-08-16', 'pipeline', '50000000000000000003897973989'), ('2017-06-06', 'pipeline', '50000000000000000000449148729'), ('2017-09-08', 'pipeline', '50000000000000000000525378314'), +('2022-08-30', 'pipeline', '50000000000000000005905545593'), ('2022-08-16', 'pipeline', '50000000000000000005905545593'), ('2018-05-03', 'pipeline', '50000000000000000000750826355'), ('2020-01-10', 'pipeline', '50000000000000000001816579677'), ('2021-11-01', 'pipeline', '50000000000000000004269423714'), ('2017-11-07', 'pipeline', '50000000000000000000585901787'), +('2019-10-15', 'pipeline', '50000000000000000001598843430'), ('2020-04-01', 'pipeline', '50000000000000000002035795461'), ('2020-02-24', 'pipeline', '50000000000000000001932600185'), ('2020-04-27', 'pipeline', '50000000000000000002108160849'), ('2016-07-05', 'pipeline', '50000000000000000000054405114'), ('2020-06-02', 'pipeline', '50000000000000000002234387967'), 
+('2020-08-21', 'pipeline', '50000000000000000002529168758'), ('2021-02-17', 'pipeline', '50000000000000000003158511687'); + +set hive.auto.convert.join=true; +set hive.optimize.dynamic.partition.hashjoin=false; +set hive.convert.join.bucket.mapjoin.tez=true; +set hive.vectorized.execution.enabled=false; + +set hive.optimize.bucketmapjoin=true; Review Comment: I think we can remove this line because only Hive on MR (MapReduce) uses this parameter. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For additional commands, e-mail: gitbox-h...@hive.apache.org