[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=511300=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-511300 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 13/Nov/20 10:26 Start Date: 13/Nov/20 10:26 Worklog Time Spent: 10m Work Description: kgyrtkirk merged pull request #1562: URL: https://github.com/apache/hive/pull/1562 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 511300) Time Spent: 2h 20m (was: 2h 10m) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Sub-task >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 2h 20m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=508082=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-508082 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 05/Nov/20 10:52 Start Date: 05/Nov/20 10:52 Worklog Time Spent: 10m Work Description: kgyrtkirk commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r517960036 ## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/OperatorGraph.java ## @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.File; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePointLookupOptimizerRule.DiGraph; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo; +import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; + +import com.google.common.collect.Sets; + +public class OperatorGraph { + + /** + * A directed graph extended with support to check dag property. + */ + static class DagGraph extends DiGraph { + +static class DagNode extends Node { + int dagIdx = 0; + public DagNode(V v) { +super(v); + } + + @Override + public void addEdge(Edge edge) { +if (edge.s == this) { + DagNode t = (DagNode) edge.t; + ensureDagIdxAtLeast(t.dagIdx + 1); + if (t.dagIdx > dagIdx) { +throw new IllegalArgumentException("adding this edge would violate dag properties"); + } +} +super.addEdge(edge); + } + + void ensureDagIdxAtLeast(int min) { +if (dagIdx >= min) { + return; +} +dagIdx = min; +for (Edge e : edges) { + if(e.t == this) { +DagNode s = (DagNode) e.s; +s.ensureDagIdxAtLeast(min + 1); + } +} + } +} + +@Override +protected Node newNode(V s) { + return new DagNode(s); +} + } + + DagGraph, OpEdge> g; + + enum EdgeType { +FLOW, SEMIJOIN, DPP, TEST, + } + + static class OpEdge { + +private final EdgeType et; +private final int index; + +public OpEdge(EdgeType et) { + this(et, 0); +} + +public OpEdge(EdgeType et, int index) { + this.et = et; + this.index = index; +} + + } + + + Map, Cluster> nodeCluster = new HashMap<>(); + + public class 
Cluster { + +Set> members = new LinkedHashSet<>(); + +public void merge(Cluster o) { + for (Operator node : o.members) { +add(node); + } + o.members.clear(); +} + +public void add(Operator curr) { + nodeCluster.put(curr, this); + members.add(curr); +} + + } + + + public OperatorGraph(ParseContext pctx) { +g = new DagGraph, OperatorGraph.OpEdge>(); +Set> visited = Sets.newIdentityHashSet(); +Set> seen = Sets.newIdentityHashSet(); + +seen.addAll(pctx.getTopOps().values()); +while (!seen.isEmpty()) { + Operator curr = seen.iterator().next(); + seen.remove(curr); + if (visited.contains(curr)) { +continue; + } + + visited.add(curr); + + Cluster currentCluster = nodeCluster.get(curr); + if (currentCluster == null) { +currentCluster=new Cluster(); +currentCluster.add(curr); + } + List> parents = curr.getParentOperators(); + for (int i = 0; i < parents.size(); i++) { +Operator p = parents.get(i); +g.putEdgeValue(p, curr, new OpEdge(EdgeType.FLOW, i)); +if (p instanceof ReduceSinkOperator) { + // ignore cluster of parent RS +
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=508080=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-508080 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 05/Nov/20 10:43 Start Date: 05/Nov/20 10:43 Worklog Time Spent: 10m Work Description: kgyrtkirk commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r517954820 ## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/OperatorGraph.java ## @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.File; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePointLookupOptimizerRule.DiGraph; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo; +import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; + +import com.google.common.collect.Sets; + +public class OperatorGraph { + + /** + * A directed graph extended with support to check dag property. + */ + static class DagGraph extends DiGraph { Review comment: jgrapht has quite a few graph algorithms as well - guava would be a viable candidate if we don't yet have a graph class; but I think it doesn't give much more than that. I've moved these classes around - let's see how far we can get with these! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 508080) Time Spent: 2h (was: 1h 50m) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 2h > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=506526=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-506526 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 30/Oct/20 04:09 Start Date: 30/Oct/20 04:09 Worklog Time Spent: 10m Work Description: jcamachor commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r514803487 ## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/OperatorGraph.java ## @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.File; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePointLookupOptimizerRule.DiGraph; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo; +import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; + +import com.google.common.collect.Sets; + +public class OperatorGraph { + + /** + * A directed graph extended with support to check dag property. + */ + static class DagGraph extends DiGraph { Review comment: In the meantime, maybe DiGraph could be made a top class. ## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePointLookupOptimizerRule.java ## @@ -189,115 +189,121 @@ public RexNode analyzeRexNode(RexBuilder rexBuilder, RexNode condition) { return newCondition; } - /** - * Transforms inequality candidates into [NOT] BETWEEN calls. - * - */ - protected static class RexTransformIntoBetween extends RexShuttle { -private final RexBuilder rexBuilder; + public static class DiGraph { Review comment: Left the comment in other class but I was thinking that it may be a good idea to promote this to top class (at least until we replace it by any other library version as we were discussing). 
## File path: ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out ## @@ -4317,7 +4301,7 @@ STAGE PLANS: outputColumnNames: ds Statistics: Num rows: 2000 Data size: 389248 Basic stats: COMPLETE Column stats: COMPLETE Group By Operator - aggregations: max(ds) + aggregations: min(ds) Review comment: Any idea why this is happening? ## File path: ql/src/test/results/clientpositive/llap/dynamic_partition_pruning.q.out ## @@ -4277,37 +4277,21 @@ STAGE PLANS: alias: srcpart filterExpr: ds is not null (type: boolean) Statistics: Num rows: 2000 Data size: 389248 Basic stats: COMPLETE Column stats: COMPLETE - Filter Operator -predicate: ds is not null (type: boolean) Review comment: Note that the filter operator is removed. We need to be careful here because not all input formats guarantee that the filter expression is being applied / does not return false positives. I would expect the Filter remains but only a single time? ## File path: ql/src/test/results/clientpositive/perf/tez/query95.q.out ## @@ -128,7 +128,7 @@ Stage-0 Select Operator [SEL_235] (rows=144002668 width=7) Output:["_col0","_col1"] Filter Operator [FIL_234] (rows=144002668 width=7) - predicate:(ws_order_number is not null and (ws_order_number is not null or ws_order_number is not null)) +
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=506040=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-506040 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 29/Oct/20 04:04 Start Date: 29/Oct/20 04:04 Worklog Time Spent: 10m Work Description: dengzhhu653 commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r513942354 ## File path: ql/src/test/results/clientpositive/llap/sharedwork_semi.q.out ## @@ -541,7 +541,7 @@ STAGE PLANS: Map Operator Tree: TableScan alias: s - filterExpr: (ss_sold_date_sk is not null and ((ss_sold_date_sk BETWEEN DynamicValue(RS_7_d_d_date_sk_min) AND DynamicValue(RS_7_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_d_d_date_sk_bloom_filter))) or (ss_sold_date_sk BETWEEN DynamicValue(RS_21_d_d_date_sk_min) AND DynamicValue(RS_21_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_21_d_d_date_sk_bloom_filter) (type: boolean) + filterExpr: (((ss_sold_date_sk BETWEEN DynamicValue(RS_7_d_d_date_sk_min) AND DynamicValue(RS_7_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_d_d_date_sk_bloom_filter))) or (ss_sold_date_sk BETWEEN DynamicValue(RS_21_d_d_date_sk_min) AND DynamicValue(RS_21_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_21_d_d_date_sk_bloom_filter and ss_sold_date_sk is not null) (type: boolean) Review comment: we see a case when NonBlockingOpDeDupProc merges FIL-FIL, the conditionals may be reordered. https://github.com/apache/hive/pull/1308 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 506040) Time Spent: 1h 40m (was: 1.5h) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 1h 40m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=505853=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-505853 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 28/Oct/20 18:38 Start Date: 28/Oct/20 18:38 Worklog Time Spent: 10m Work Description: kgyrtkirk commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r513671527 ## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/OperatorGraph.java ## @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.File; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePointLookupOptimizerRule.DiGraph; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo; +import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; + +import com.google.common.collect.Sets; + +public class OperatorGraph { Review comment: @jcamachor this is the checker class I was talking about - right now it builds on top of the basic `digraph` class I've introduce some time ago in `PointLookupOptimizer` ## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/OperatorGraph.java ## @@ -0,0 +1,231 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.ql.optimizer; + +import java.io.File; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator; +import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HivePointLookupOptimizerRule.DiGraph; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo; +import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; + +import com.google.common.collect.Sets; + +public class OperatorGraph { + + /** + * A directed graph extended with support to check dag property. + */ + static class DagGraph extends DiGraph { Review comment: we can definitely roll our own graph representation; however sometimes I would feel that it would make things easier to have access to basic graph algorithms (for example to do a topological order walk/etc) there is a small library called [jgrapht](https://jgrapht.org/) (EPL 2.0 license - I think it will be okay) which could be utilized for these kinds of things @jcamachor what do you think about pulling in the jgrapht lib and removing the makeshift digraph classes? ## File path:
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=505163=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-505163 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 27/Oct/20 10:59 Start Date: 27/Oct/20 10:59 Worklog Time Spent: 10m Work Description: kgyrtkirk commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r512594710 ## File path: ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out ## @@ -164,7 +164,7 @@ Stage-0 Select Operator [SEL_115] (rows=143966864 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_113] (rows=143966864 width=119) -predicate:(ws_sold_date_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) Review comment: changes are gone This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 505163) Time Spent: 1h 20m (was: 1h 10m) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 1h 20m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=505161=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-505161 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 27/Oct/20 10:58 Start Date: 27/Oct/20 10:58 Worklog Time Spent: 10m Work Description: kgyrtkirk commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r512594208 ## File path: ql/src/test/results/clientpositive/perf/tez/constraints/query54.q.out ## @@ -202,156 +202,154 @@ Stage-0 predicate:(_col1 <= _col3) Merge Join Operator [MERGEJOIN_294] (rows=15218525 width=12) Conds:(Inner),Output:["_col0","_col1","_col3"] - <-Reducer 15 [CUSTOM_SIMPLE_EDGE] + <-Reducer 20 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_99] Filter Operator [FIL_98] (rows=608741 width=12) predicate:(_col2 <= _col1) Merge Join Operator [MERGEJOIN_291] (rows=1826225 width=12) Conds:(Inner),Output:["_col0","_col1","_col2"] <-Map 9 [CUSTOM_SIMPLE_EDGE] vectorized - PARTITION_ONLY_SHUFFLE [RS_327] Review comment: this is highly unfortunate: the jsonexplain api "tells" the vertex about the outgoing edge type by calling [this method](https://github.com/apache/hive/blob/db895f374bf63b77b683574fdf678bfac91a5ac6/common/src/java/org/apache/hadoop/hive/common/jsonexplain/Vertex.java#L308) from [here](https://github.com/apache/hive/blob/db895f374bf63b77b683574fdf678bfac91a5ac6/common/src/java/org/apache/hadoop/hive/common/jsonexplain/Stage.java#L115) since a single vertex can have multiple outgoing edges - setting the type of one-of-them is problematic - I think we may want to consider simply removing this tagging of vertices instead...we should consider renaming some of the edge types...like `CUSTOM_SIMPLE_EDGE` to `PARTITION_ONLY` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 505161) Time Spent: 1h 10m (was: 1h) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 1h 10m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=505156=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-505156 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 27/Oct/20 10:52 Start Date: 27/Oct/20 10:52 Worklog Time Spent: 10m Work Description: kgyrtkirk commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r512589390 ## File path: ql/src/test/results/clientpositive/perf/tez/constraints/query1b.q.out ## @@ -210,7 +210,7 @@ STAGE PLANS: Statistics: Num rows: 16855704 Data size: 2008197920 Basic stats: COMPLETE Column stats: COMPLETE value expressions: _col2 (type: decimal(17,2)) Filter Operator -predicate: (sr_store_sk is not null and sr_returned_date_sk is not null and sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter))) (type: boolean) +predicate: (sr_store_sk is not null and sr_returned_date_sk is not null) (type: boolean) Review comment: changes are gone in this file This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 505156) Time Spent: 1h (was: 50m) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 1h > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=505155=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-505155 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 27/Oct/20 10:51 Start Date: 27/Oct/20 10:51 Worklog Time Spent: 10m Work Description: kgyrtkirk commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r512588759 ## File path: ql/src/test/results/clientpositive/llap/sharedwork_semi.q.out ## @@ -541,7 +541,7 @@ STAGE PLANS: Map Operator Tree: TableScan alias: s - filterExpr: (ss_sold_date_sk is not null and ((ss_sold_date_sk BETWEEN DynamicValue(RS_7_d_d_date_sk_min) AND DynamicValue(RS_7_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_d_d_date_sk_bloom_filter))) or (ss_sold_date_sk BETWEEN DynamicValue(RS_21_d_d_date_sk_min) AND DynamicValue(RS_21_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_21_d_d_date_sk_bloom_filter) (type: boolean) + filterExpr: (((ss_sold_date_sk BETWEEN DynamicValue(RS_7_d_d_date_sk_min) AND DynamicValue(RS_7_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_d_d_date_sk_bloom_filter))) or (ss_sold_date_sk BETWEEN DynamicValue(RS_21_d_d_date_sk_min) AND DynamicValue(RS_21_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_21_d_d_date_sk_bloom_filter and ss_sold_date_sk is not null) (type: boolean) Review comment: I've tried to retain the order - which has placed the bloom related checks at the end. I recall that there was a ticket about ordering conditionals - but I can't find the related ticket; do I remember incorrectly? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 505155) Time Spent: 50m (was: 40m) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 50m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=505154=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-505154 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 27/Oct/20 10:45 Start Date: 27/Oct/20 10:45 Worklog Time Spent: 10m Work Description: kgyrtkirk commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r512584825 ## File path: ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out ## @@ -160,7 +160,7 @@ Stage-0 Select Operator [SEL_115] (rows=286549727 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_113] (rows=286549727 width=119) -predicate:(cs_sold_date_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) Review comment: conditional was not reconstructed properly during filter creation - fixed This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 505154) Time Spent: 40m (was: 0.5h) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 40m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=504632=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-504632 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 26/Oct/20 05:54 Start Date: 26/Oct/20 05:54 Worklog Time Spent: 10m Work Description: jcamachor commented on a change in pull request #1562: URL: https://github.com/apache/hive/pull/1562#discussion_r511717213 ## File path: ql/src/test/results/clientpositive/perf/tez/constraints/query32.q.out ## @@ -160,7 +160,7 @@ Stage-0 Select Operator [SEL_115] (rows=286549727 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_113] (rows=286549727 width=119) -predicate:(cs_sold_date_sk is not null and cs_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(cs_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) Review comment: SJ is gone. Is this expected? ## File path: ql/src/test/results/clientpositive/perf/tez/constraints/query92.q.out ## @@ -164,7 +164,7 @@ Stage-0 Select Operator [SEL_115] (rows=143966864 width=119) Output:["_col0","_col1","_col2"] Filter Operator [FIL_113] (rows=143966864 width=119) -predicate:(ws_sold_date_sk is not null and ws_item_sk BETWEEN DynamicValue(RS_28_item_i_item_sk_min) AND DynamicValue(RS_28_item_i_item_sk_max) and in_bloom_filter(ws_item_sk, DynamicValue(RS_28_item_i_item_sk_bloom_filter))) Review comment: SJ got removed. Is this expected? ## File path: ql/src/java/org/apache/hadoop/hive/ql/plan/ExprNodeDescUtils.java ## @@ -1136,4 +1173,30 @@ public static boolean isOr(ExprNodeDesc expr) { return false; } + public static boolean isAnd(ExprNodeDesc expr) { +if (expr instanceof ExprNodeGenericFuncDesc) { Review comment: I think you could use `ExprNodeDescExprFactory.isANDFuncCallExpr` or `FunctionRegistry.isOpAnd(expr)`? 
## File path: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ## @@ -2595,6 +2595,8 @@ private static void populateLlapDaemonVarsSet(Set llapDaemonVarsSetLocal HIVE_SHARED_WORK_DPPUNION_OPTIMIZATION("hive.optimize.shared.work.dppunion", true, "Enables dppops unioning. This optimization will enable to merge multiple tablescans with different " + "dynamic filters into a single one (with a more complex filter)"), + HIVE_SHARED_WORK_DOWNSTREAM_MERGE("hive.optimize.shared.work.downstream.merge", true, +"Analyzes and merges equiv downstream operators after a successfull shared work optimization step."), Review comment: nit. typo 'successfull' ## File path: ql/src/test/results/clientpositive/perf/tez/constraints/query1b.q.out ## @@ -176,7 +176,7 @@ STAGE PLANS: Map Operator Tree: TableScan alias: store_returns - filterExpr: (((sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null) or (sr_store_sk is not null and sr_returned_date_sk is not null)) and sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter))) (type: boolean) + filterExpr: (sr_store_sk BETWEEN DynamicValue(RS_40_store_s_store_sk_min) AND DynamicValue(RS_40_store_s_store_sk_max) and in_bloom_filter(sr_store_sk, DynamicValue(RS_40_store_s_store_sk_bloom_filter)) and ((sr_customer_sk is not null and sr_store_sk is not null and sr_returned_date_sk is not null) or (sr_store_sk is not null and sr_returned_date_sk is not null))) (type: boolean) Review comment: Same as above. 
Filter exprs order ## File path: ql/src/test/results/clientpositive/perf/tez/constraints/query54.q.out ## @@ -202,156 +202,154 @@ Stage-0 predicate:(_col1 <= _col3) Merge Join Operator [MERGEJOIN_294] (rows=15218525 width=12) Conds:(Inner),Output:["_col0","_col1","_col3"] - <-Reducer 15 [CUSTOM_SIMPLE_EDGE] + <-Reducer 20 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_99] Filter Operator [FIL_98] (rows=608741 width=12)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=498437&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-498437 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 09/Oct/20 14:07 Start Date: 09/Oct/20 14:07 Worklog Time Spent: 10m Work Description: kgyrtkirk opened a new pull request #1562: URL: https://github.com/apache/hive/pull/1562 ### What changes were proposed in this pull request? ### Why are the changes needed? ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 498437) Time Spent: 20m (was: 10m) > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Labels: pull-request-available > Time Spent: 20m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)
[jira] [Work logged] (HIVE-24241) Enable SharedWorkOptimizer to merge downstream operators after an optimization step
[ https://issues.apache.org/jira/browse/HIVE-24241?focusedWorklogId=497409&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-497409 ] ASF GitHub Bot logged work on HIVE-24241: - Author: ASF GitHub Bot Created on: 08/Oct/20 15:27 Start Date: 08/Oct/20 15:27 Worklog Time Spent: 10m Work Description: kgyrtkirk opened a new pull request #1562: URL: https://github.com/apache/hive/pull/1562 ### What changes were proposed in this pull request? ### Why are the changes needed? ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking --- Worklog Id: (was: 497409) Remaining Estimate: 0h Time Spent: 10m > Enable SharedWorkOptimizer to merge downstream operators after an > optimization step > --- > > Key: HIVE-24241 > URL: https://issues.apache.org/jira/browse/HIVE-24241 > Project: Hive > Issue Type: Improvement >Reporter: Zoltan Haindrich >Assignee: Zoltan Haindrich >Priority: Major > Time Spent: 10m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)