This is an automated email from the ASF dual-hosted git repository. hashutosh pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 7e90ecf HIVE-22982 : TopN Key efficiency check might disable filter too soon (Attila Magyar via Ashutosh Chauhan) 7e90ecf is described below commit 7e90ecf480673364eb5de182f17f2c02eb144315 Author: Attila Magyar <amag...@cloudera.com> AuthorDate: Mon Mar 9 08:26:35 2020 -0700 HIVE-22982 : TopN Key efficiency check might disable filter too soon (Attila Magyar via Ashutosh Chauhan) Signed-off-by: Ashutosh Chauhan <hashut...@apache.org> --- .../java/org/apache/hadoop/hive/conf/HiveConf.java | 4 ++-- .../hadoop/hive/ql/exec/TopNKeyOperator.java | 3 ++- .../hive/ql/exec/vector/VectorTopNKeyOperator.java | 11 ++++----- .../vector/wrapper/VectorHashKeyWrapperBatch.java | 4 ++-- .../VectorHashKeyWrapperGeneralComparator.java | 13 +++++------ .../apache/hadoop/hive/ql/plan/TopNKeyDesc.java | 26 ++++++++++++++++++++++ .../hadoop/hive/ql/exec/TestTopNKeyFilter.java | 2 +- 7 files changed, 44 insertions(+), 19 deletions(-) diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 12f4822..a18a6d7 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2417,8 +2417,8 @@ public class HiveConf extends Configuration { HIVE_OPTIMIZE_TOPNKEY("hive.optimize.topnkey", true, "Whether to enable top n key optimizer."), HIVE_MAX_TOPN_ALLOWED("hive.optimize.topnkey.max", 128, "Maximum topN value allowed by top n key optimizer.\n" + "If the LIMIT is greater than this value then top n key optimization won't be used."), - HIVE_TOPN_EFFICIENCY_THRESHOLD("hive.optimize.topnkey.efficiency.threshold", 0.6f, "Disable topN key filter if the ratio between forwarded and total rows reaches this limit."), - HIVE_TOPN_EFFICIENCY_CHECK_BATCHES("hive.optimize.topnkey.efficiency.check.nbatches", 8, "Check topN key filter efficiency after a specific number of batches."), + HIVE_TOPN_EFFICIENCY_THRESHOLD("hive.optimize.topnkey.efficiency.threshold", 0.8f, "Disable topN key filter if the ratio between forwarded and total rows reaches this limit."), + HIVE_TOPN_EFFICIENCY_CHECK_BATCHES("hive.optimize.topnkey.efficiency.check.nbatches", 10000, "Check topN key filter efficiency after a specific number of batches."), HIVE_TOPN_MAX_NUMBER_OF_PARTITIONS("hive.optimize.topnkey.partitions.max", 64, "Limit the maximum number of partitions used by the top N key operator."), HIVE_SHARED_WORK_OPTIMIZATION("hive.optimize.shared.work", true, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java index f09867b..e95d779 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java @@ -139,7 +139,8 @@ public class TopNKeyOperator extends Operator<TopNKeyDesc> implements Serializab } if (runTimeNumRows % conf.getCheckEfficiencyNumRows() == 0) { // check the efficiency at every nth rows - checkTopNFilterEfficiency(topNKeyFilters, disabledPartitions, conf.getEfficiencyThreshold(), LOG); + checkTopNFilterEfficiency( + topNKeyFilters, disabledPartitions, conf.getEfficiencyThreshold(), LOG, conf.getCheckEfficiencyNumRows()); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java index 0f8eb17..10567c7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java @@ -178,14 +178,15 @@ public class VectorTopNKeyOperator extends Operator<TopNKeyDesc> implements Vect batch.selectedInUse = selectedInUseBackup; if (incomingBatches % conf.getCheckEfficiencyNumBatches() == 0) { - checkTopNFilterEfficiency(topNKeyFilters, disabledPartitions, conf.getEfficiencyThreshold(), LOG); + checkTopNFilterEfficiency( + topNKeyFilters, disabledPartitions, conf.getEfficiencyThreshold(), LOG, conf.getCheckEfficiencyNumRows()); } } public static void checkTopNFilterEfficiency(Map<KeyWrapper, TopNKeyFilter> filters, - Set<KeyWrapper> disabledPartitions, - float efficiencyThreshold, - Logger log) + Set<KeyWrapper> disabledPartitions, + float efficiencyThreshold, + Logger log, long checkEfficiencyNumRows) { Iterator<Map.Entry<KeyWrapper, TopNKeyFilter>> iterator = filters.entrySet().iterator(); while (iterator.hasNext()) { @@ -193,7 +194,7 @@ public class VectorTopNKeyOperator extends Operator<TopNKeyDesc> implements Vect KeyWrapper partitionKey = each.getKey(); TopNKeyFilter filter = each.getValue(); log.debug("Checking TopN Filter efficiency {}, threshold: {}", filter, efficiencyThreshold); - if (filter.forwardingRatio() >= efficiencyThreshold) { + if (filter.getTotal() >= checkEfficiencyNumRows && filter.forwardingRatio() >= efficiencyThreshold) { log.info("Disabling TopN Filter {}", filter); disabledPartitions.add(partitionKey); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java index b487480..d020756 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java @@ -1084,8 +1084,8 @@ public class VectorHashKeyWrapperBatch extends VectorColumnSetInfo { comparator.addColumnComparator( i, columnTypeSpecificIndex, columnVectorType, columnSortOrder.charAt(i), nullOrder.charAt(i)); } - if (comparator.getComparators().size() == 1) { // don't use the composite comparator for n=1 - return comparator.getComparators().get(0); + if (comparator.getComparators().length == 1) { // don't use the composite comparator for n=1 + return comparator.getComparators()[0]; } return comparator; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java index 06ac661..d08910b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java @@ -18,9 +18,7 @@ package org.apache.hadoop.hive.ql.exec.vector.wrapper; import java.io.Serializable; -import java.util.ArrayList; import java.util.Comparator; -import java.util.List; import org.apache.hadoop.hive.ql.exec.vector.ColumnVector; import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr; @@ -35,7 +33,7 @@ public class VectorHashKeyWrapperGeneralComparator /** * Compare {@link VectorHashKeyWrapperBase} instances only by one column. */ - private static class VectorHashKeyWrapperBaseComparator + static class VectorHashKeyWrapperBaseComparator implements Comparator<VectorHashKeyWrapperBase>, Serializable { private final int keyIndex; @@ -72,10 +70,10 @@ public class VectorHashKeyWrapperGeneralComparator } } - private final List<VectorHashKeyWrapperBaseComparator> comparators; + private final VectorHashKeyWrapperBaseComparator[] comparators; public VectorHashKeyWrapperGeneralComparator(int numberOfColumns) { - this.comparators = new ArrayList<>(numberOfColumns); + this.comparators = new VectorHashKeyWrapperBaseComparator[numberOfColumns]; } public void addColumnComparator(int keyIndex, int columnTypeSpecificIndex, ColumnVector.Type columnVectorType, @@ -115,8 +113,7 @@ public class VectorHashKeyWrapperGeneralComparator default: throw new RuntimeException("Unexpected column vector columnVectorType " + columnVectorType); } - - comparators.add( + comparators[keyIndex] = ( new VectorHashKeyWrapperBaseComparator( keyIndex, sortOrder == '-' ? comparator.reversed() : comparator, @@ -134,7 +131,7 @@ public class VectorHashKeyWrapperGeneralComparator return 0; } - public List<VectorHashKeyWrapperBaseComparator> getComparators() { + public VectorHashKeyWrapperBaseComparator[] getComparators() { return comparators; } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java index ddd657e..5d6d749 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java @@ -181,6 +181,11 @@ public class TopNKeyDesc extends AbstractOperatorDesc { if (getClass().getName().equals(other.getClass().getName())) { TopNKeyDesc otherDesc = (TopNKeyDesc) other; return getTopN() == otherDesc.getTopN() && + getEfficiencyThreshold() == otherDesc.getEfficiencyThreshold() && + getCheckEfficiencyNumRows() == otherDesc.getCheckEfficiencyNumRows() && + getCheckEfficiencyNumBatches() == otherDesc.getCheckEfficiencyNumBatches() && + getMaxNumberOfPartitions() == otherDesc.getMaxNumberOfPartitions() && + ExprNodeDescUtils.isSame(partitionKeyColumns, otherDesc.partitionKeyColumns) && Objects.equals(columnSortOrder, otherDesc.columnSortOrder) && Objects.equals(nullOrder, otherDesc.nullOrder) && ExprNodeDescUtils.isSame(keyColumns, otherDesc.keyColumns); @@ -195,9 +200,30 @@ public class TopNKeyDesc extends AbstractOperatorDesc { ret.setColumnSortOrder(columnSortOrder); ret.setNullOrder(nullOrder); ret.setKeyColumns(getKeyColumns() == null ? null : new ArrayList<>(getKeyColumns())); + ret.setPartitionKeyColumns(getPartitionKeyColumns() == null ? null : new ArrayList<>(getPartitionKeyColumns())); + ret.setCheckEfficiencyNumRows(checkEfficiencyNumRows); + ret.setCheckEfficiencyNumBatches(checkEfficiencyNumBatches); + ret.setEfficiencyThreshold(efficiencyThreshold); + ret.setMaxNumberOfPartitions(maxNumberOfPartitions); return ret; } + public void setEfficiencyThreshold(float efficiencyThreshold) { + this.efficiencyThreshold = efficiencyThreshold; + } + + public void setCheckEfficiencyNumBatches(long checkEfficiencyNumBatches) { + this.checkEfficiencyNumBatches = checkEfficiencyNumBatches; + } + + public void setCheckEfficiencyNumRows(long checkEfficiencyNumRows) { + this.checkEfficiencyNumRows = checkEfficiencyNumRows; + } + + public void setMaxNumberOfPartitions(int maxNumberOfPartitions) { + this.maxNumberOfPartitions = maxNumberOfPartitions; + } + public class TopNKeyDescExplainVectorization extends OperatorExplainVectorization { private final TopNKeyDesc topNKeyDesc; private final VectorTopNKeyDesc vectorTopNKeyDesc; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestTopNKeyFilter.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestTopNKeyFilter.java index a91bc73..0ee5c8d 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestTopNKeyFilter.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestTopNKeyFilter.java @@ -118,7 +118,7 @@ public class TestTopNKeyFilter { }}; Set<KeyWrapper> disabled = new HashSet<>(); - checkTopNFilterEfficiency(filters, disabled, 0.6f, LOG); + checkTopNFilterEfficiency(filters, disabled, 0.6f, LOG, 1); assertThat(disabled, hasSize(1)); assertThat(disabled, hasItem(new TestKeyWrapper(200))); }