Re: [PR] HIVE-27078: Bucket Map Join can hang if the source vertex parallelism is changed by reducer autoparallelism [hive]

via GitHub Thu, 27 Mar 2025 02:04:08 -0700


ngsg commented on code in PR #5707:
URL: https://github.com/apache/hive/pull/5707#discussion_r2016005808



##########
ql/src/java/org/apache/hadoop/hive/ql/parse/GenTezUtils.java:
##########
@@ -182,6 +187,30 @@ public static ReduceWork createReduceWork(
     return reduceWork;
   }
 
+  private static boolean hasBucketMapJoin(Operator<? extends OperatorDesc> 
operator) {
+    if (operator == null) {
+      return false;
+    }
+
+    // Iterate over child operators
+    for (Operator<? extends OperatorDesc> childOp : 
operator.getChildOperators()) {
+      // Check if this is a MapJoinOperator and is a Bucket Map Join
+      if (childOp instanceof MapJoinOperator) {
+        MapJoinOperator mjOp = (MapJoinOperator) childOp;
+        if (mjOp.getConf().isBucketMapJoin()) {
+          return true; // Found BMJ, no need to check further
+        }
+      }
+
+      // Recursively check children
+      if (hasBucketMapJoin(childOp)) {
+        return true;
+      }

Review Comment:
   I was able to generate the above DAG using the following query. Could you 
try this with `TestMiniLlapLocalCliDriver`?
   ``` sql
   -- Repro fixture, source side of the join: a table bucketed on decimal_col into
   -- 7 buckets (same column and bucket count as target_table2 in this script).
   create table source_table2(date_col date, string_col string, decimal_col 
decimal(38,0)) clustered by (decimal_col) into 7 buckets;
   -- 7 rows, all with string_col = 'pipeline'. Two of them (2022-08-30 and
   -- 2022-08-16) carry decimal_col 50000000000000000005905545593, the value the
   -- query in this script filters on. decimal_col values are written as string literals.
   insert into table source_table2 values
   ('2022-08-30', 'pipeline', '50000000000000000005905545593'), ('2022-08-16', 
'pipeline', '50000000000000000005905545593'), ('2022-09-01', 'pipeline', 
'50000000000000000006008686831'), ('2022-08-30', 'pipeline', 
'50000000000000000005992620837'), ('2022-09-01', 'pipeline', 
'50000000000000000005992620837'), ('2022-09-01', 'pipeline', 
'50000000000000000005992621067'),
   ('2022-08-30', 'pipeline', '50000000000000000005992621067');
   
   -- Repro fixture, target side of the join: same column layout and bucketing as
   -- source_table2 (clustered by decimal_col into 7 buckets).
   create table target_table2(date_col date, string_col string, decimal_col 
decimal(38,0)) clustered by (decimal_col) into 7 buckets;
   -- 20 rows, all with string_col = 'pipeline'. Only the two rows with decimal_col
   -- 50000000000000000005905545593 (dates 2022-08-30 and 2022-08-16) share that
   -- decimal value with rows inserted into source_table2.
   insert into table target_table2 values
   ('2017-05-17', 'pipeline', '50000000000000000000441610525'), ('2018-12-20', 
'pipeline', '50000000000000000001048981030'), ('2020-06-30', 'pipeline', 
'50000000000000000002332575516'), ('2021-08-16', 'pipeline', 
'50000000000000000003897973989'), ('2017-06-06', 'pipeline', 
'50000000000000000000449148729'), ('2017-09-08', 'pipeline', 
'50000000000000000000525378314'),
   ('2022-08-30', 'pipeline', '50000000000000000005905545593'), ('2022-08-16', 
'pipeline', '50000000000000000005905545593'), ('2018-05-03', 'pipeline', 
'50000000000000000000750826355'), ('2020-01-10', 'pipeline', 
'50000000000000000001816579677'), ('2021-11-01', 'pipeline', 
'50000000000000000004269423714'), ('2017-11-07', 'pipeline', 
'50000000000000000000585901787'),
   ('2019-10-15', 'pipeline', '50000000000000000001598843430'), ('2020-04-01', 
'pipeline', '50000000000000000002035795461'), ('2020-02-24', 'pipeline', 
'50000000000000000001932600185'), ('2020-04-27', 'pipeline', 
'50000000000000000002108160849'), ('2016-07-05', 'pipeline', 
'50000000000000000000054405114'), ('2020-06-02', 'pipeline', 
'50000000000000000002234387967'),
   ('2020-08-21', 'pipeline', '50000000000000000002529168758'), ('2021-02-17', 
'pipeline', '50000000000000000003158511687');
   
    -- Session settings for the repro.
    -- hive.auto.convert.join: per the Hive configuration docs, lets Hive convert a
    -- common join into a map join.
    set hive.auto.convert.join=true;
   
   -- Enable Tez reducer auto-parallelism, the feature the PR title names as the
   -- trigger for the Bucket Map Join hang.
   set hive.tez.auto.reducer.parallelism=true;
   -- NOTE(review): both factors are far above the documented defaults (0.25 and 2.0);
   -- presumably chosen so the reducer parallelism really is adjusted on this tiny
   -- data set -- confirm with the comment author.
   set hive.tez.min.partition.factor=12;
   set hive.tez.max.partition.factor=50;
   
   -- Prints the extended plan for the query (explain only; the query itself is not run).
   -- Query shape:
   --   x: rows of source_table2 matching the decimal_col filter, grouped by
   --      (date_col, string_col); the select list emits the constant 'pipeline' as
   --      string_col and min(decimal_col) as decimal_col.
   --   s: x regrouped by (string_col, decimal_col), keeping min(date_col).
   --   result: s inner-joined to target_table2 on date_col, string_col and
   --      decimal_col, then counted per s.string_col.
   -- NOTE(review): coalesce(decimal_col,'') mixes a decimal column with a string
   -- literal and compares against a string, relying on implicit conversion. Left
   -- untouched because the reviewer reports that this exact query generates the
   -- DAG under discussion.
   explain extended
   select s.string_col, count(*)
   from target_table2 t
   inner join (
     select min(date_col) date_col, string_col, decimal_col
     from (
       select date_col, 'pipeline' string_col, min(decimal_col) decimal_col
       from source_table2
       where coalesce(decimal_col,'') = '50000000000000000005905545593'
       group by date_col, string_col
     ) x
     group by string_col, decimal_col
   ) s
   on s.date_col = t.date_col AND s.string_col = t.string_col AND s.decimal_col 
= t.decimal_col
   group by s.string_col;
   
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] HIVE-27078: Bucket Map Join can hang if the source vertex parallelism is changed by reducer autoparallelism [hive]

Reply via email to