This is an automated email from the ASF dual-hosted git repository.
hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 7e90ecf HIVE-22982 : TopN Key efficiency check might disable filter
too soon (Attila Magyar via Ashutosh Chauhan)
7e90ecf is described below
commit 7e90ecf480673364eb5de182f17f2c02eb144315
Author: Attila Magyar <[email protected]>
AuthorDate: Mon Mar 9 08:26:35 2020 -0700
HIVE-22982 : TopN Key efficiency check might disable filter too soon
(Attila Magyar via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <[email protected]>
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 4 ++--
.../hadoop/hive/ql/exec/TopNKeyOperator.java | 3 ++-
.../hive/ql/exec/vector/VectorTopNKeyOperator.java | 11 ++++-----
.../vector/wrapper/VectorHashKeyWrapperBatch.java | 4 ++--
.../VectorHashKeyWrapperGeneralComparator.java | 13 +++++------
.../apache/hadoop/hive/ql/plan/TopNKeyDesc.java | 26 ++++++++++++++++++++++
.../hadoop/hive/ql/exec/TestTopNKeyFilter.java | 2 +-
7 files changed, 44 insertions(+), 19 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 12f4822..a18a6d7 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2417,8 +2417,8 @@ public class HiveConf extends Configuration {
HIVE_OPTIMIZE_TOPNKEY("hive.optimize.topnkey", true, "Whether to enable
top n key optimizer."),
HIVE_MAX_TOPN_ALLOWED("hive.optimize.topnkey.max", 128, "Maximum topN
value allowed by top n key optimizer.\n" +
"If the LIMIT is greater than this value then top n key optimization
won't be used."),
-
HIVE_TOPN_EFFICIENCY_THRESHOLD("hive.optimize.topnkey.efficiency.threshold",
0.6f, "Disable topN key filter if the ratio between forwarded and total rows
reaches this limit."),
-
HIVE_TOPN_EFFICIENCY_CHECK_BATCHES("hive.optimize.topnkey.efficiency.check.nbatches",
8, "Check topN key filter efficiency after a specific number of batches."),
+
HIVE_TOPN_EFFICIENCY_THRESHOLD("hive.optimize.topnkey.efficiency.threshold",
0.8f, "Disable topN key filter if the ratio between forwarded and total rows
reaches this limit."),
+
HIVE_TOPN_EFFICIENCY_CHECK_BATCHES("hive.optimize.topnkey.efficiency.check.nbatches",
10000, "Check topN key filter efficiency after a specific number of batches."),
HIVE_TOPN_MAX_NUMBER_OF_PARTITIONS("hive.optimize.topnkey.partitions.max",
64, "Limit the maximum number of partitions used by the top N key operator."),
HIVE_SHARED_WORK_OPTIMIZATION("hive.optimize.shared.work", true,
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
index f09867b..e95d779 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TopNKeyOperator.java
@@ -139,7 +139,8 @@ public class TopNKeyOperator extends Operator<TopNKeyDesc>
implements Serializab
}
if (runTimeNumRows % conf.getCheckEfficiencyNumRows() == 0) { // check the
efficiency at every nth rows
- checkTopNFilterEfficiency(topNKeyFilters, disabledPartitions,
conf.getEfficiencyThreshold(), LOG);
+ checkTopNFilterEfficiency(
+ topNKeyFilters, disabledPartitions, conf.getEfficiencyThreshold(),
LOG, conf.getCheckEfficiencyNumRows());
}
}
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
index 0f8eb17..10567c7 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorTopNKeyOperator.java
@@ -178,14 +178,15 @@ public class VectorTopNKeyOperator extends
Operator<TopNKeyDesc> implements Vect
batch.selectedInUse = selectedInUseBackup;
if (incomingBatches % conf.getCheckEfficiencyNumBatches() == 0) {
- checkTopNFilterEfficiency(topNKeyFilters, disabledPartitions,
conf.getEfficiencyThreshold(), LOG);
+ checkTopNFilterEfficiency(
+ topNKeyFilters, disabledPartitions, conf.getEfficiencyThreshold(),
LOG, conf.getCheckEfficiencyNumRows());
}
}
public static void checkTopNFilterEfficiency(Map<KeyWrapper, TopNKeyFilter>
filters,
- Set<KeyWrapper>
disabledPartitions,
- float efficiencyThreshold,
- Logger log)
+ Set<KeyWrapper>
disabledPartitions,
+ float efficiencyThreshold,
+ Logger log, long
checkEfficiencyNumRows)
{
Iterator<Map.Entry<KeyWrapper, TopNKeyFilter>> iterator =
filters.entrySet().iterator();
while (iterator.hasNext()) {
@@ -193,7 +194,7 @@ public class VectorTopNKeyOperator extends
Operator<TopNKeyDesc> implements Vect
KeyWrapper partitionKey = each.getKey();
TopNKeyFilter filter = each.getValue();
log.debug("Checking TopN Filter efficiency {}, threshold: {}", filter,
efficiencyThreshold);
- if (filter.forwardingRatio() >= efficiencyThreshold) {
+ if (filter.getTotal() >= checkEfficiencyNumRows &&
filter.forwardingRatio() >= efficiencyThreshold) {
log.info("Disabling TopN Filter {}", filter);
disabledPartitions.add(partitionKey);
}
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java
index b487480..d020756 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperBatch.java
@@ -1084,8 +1084,8 @@ public class VectorHashKeyWrapperBatch extends
VectorColumnSetInfo {
comparator.addColumnComparator(
i, columnTypeSpecificIndex, columnVectorType,
columnSortOrder.charAt(i), nullOrder.charAt(i));
}
- if (comparator.getComparators().size() == 1) { // don't use the composite
comparator for n=1
- return comparator.getComparators().get(0);
+ if (comparator.getComparators().length == 1) { // don't use the composite
comparator for n=1
+ return comparator.getComparators()[0];
}
return comparator;
}
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java
index 06ac661..d08910b 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/wrapper/VectorHashKeyWrapperGeneralComparator.java
@@ -18,9 +18,7 @@
package org.apache.hadoop.hive.ql.exec.vector.wrapper;
import java.io.Serializable;
-import java.util.ArrayList;
import java.util.Comparator;
-import java.util.List;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr;
@@ -35,7 +33,7 @@ public class VectorHashKeyWrapperGeneralComparator
/**
* Compare {@link VectorHashKeyWrapperBase} instances only by one column.
*/
- private static class VectorHashKeyWrapperBaseComparator
+ static class VectorHashKeyWrapperBaseComparator
implements Comparator<VectorHashKeyWrapperBase>, Serializable {
private final int keyIndex;
@@ -72,10 +70,10 @@ public class VectorHashKeyWrapperGeneralComparator
}
}
- private final List<VectorHashKeyWrapperBaseComparator> comparators;
+ private final VectorHashKeyWrapperBaseComparator[] comparators;
public VectorHashKeyWrapperGeneralComparator(int numberOfColumns) {
- this.comparators = new ArrayList<>(numberOfColumns);
+ this.comparators = new VectorHashKeyWrapperBaseComparator[numberOfColumns];
}
public void addColumnComparator(int keyIndex, int columnTypeSpecificIndex,
ColumnVector.Type columnVectorType,
@@ -115,8 +113,7 @@ public class VectorHashKeyWrapperGeneralComparator
default:
throw new RuntimeException("Unexpected column vector columnVectorType "
+ columnVectorType);
}
-
- comparators.add(
+ comparators[keyIndex] = (
new VectorHashKeyWrapperBaseComparator(
keyIndex,
sortOrder == '-' ? comparator.reversed() : comparator,
@@ -134,7 +131,7 @@ public class VectorHashKeyWrapperGeneralComparator
return 0;
}
- public List<VectorHashKeyWrapperBaseComparator> getComparators() {
+ public VectorHashKeyWrapperBaseComparator[] getComparators() {
return comparators;
}
}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
b/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
index ddd657e..5d6d749 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TopNKeyDesc.java
@@ -181,6 +181,11 @@ public class TopNKeyDesc extends AbstractOperatorDesc {
if (getClass().getName().equals(other.getClass().getName())) {
TopNKeyDesc otherDesc = (TopNKeyDesc) other;
return getTopN() == otherDesc.getTopN() &&
+ getEfficiencyThreshold() == otherDesc.getEfficiencyThreshold() &&
+ getCheckEfficiencyNumRows() == otherDesc.getCheckEfficiencyNumRows()
&&
+ getCheckEfficiencyNumBatches() ==
otherDesc.getCheckEfficiencyNumBatches() &&
+ getMaxNumberOfPartitions() == otherDesc.getMaxNumberOfPartitions() &&
+ ExprNodeDescUtils.isSame(partitionKeyColumns,
otherDesc.partitionKeyColumns) &&
Objects.equals(columnSortOrder, otherDesc.columnSortOrder) &&
Objects.equals(nullOrder, otherDesc.nullOrder) &&
ExprNodeDescUtils.isSame(keyColumns, otherDesc.keyColumns);
@@ -195,9 +200,30 @@ public class TopNKeyDesc extends AbstractOperatorDesc {
ret.setColumnSortOrder(columnSortOrder);
ret.setNullOrder(nullOrder);
ret.setKeyColumns(getKeyColumns() == null ? null : new
ArrayList<>(getKeyColumns()));
+ ret.setPartitionKeyColumns(getPartitionKeyColumns() == null ? null : new
ArrayList<>(getPartitionKeyColumns()));
+ ret.setCheckEfficiencyNumRows(checkEfficiencyNumRows);
+ ret.setCheckEfficiencyNumBatches(checkEfficiencyNumBatches);
+ ret.setEfficiencyThreshold(efficiencyThreshold);
+ ret.setMaxNumberOfPartitions(maxNumberOfPartitions);
return ret;
}
+ public void setEfficiencyThreshold(float efficiencyThreshold) {
+ this.efficiencyThreshold = efficiencyThreshold;
+ }
+
+ public void setCheckEfficiencyNumBatches(long checkEfficiencyNumBatches) {
+ this.checkEfficiencyNumBatches = checkEfficiencyNumBatches;
+ }
+
+ public void setCheckEfficiencyNumRows(long checkEfficiencyNumRows) {
+ this.checkEfficiencyNumRows = checkEfficiencyNumRows;
+ }
+
+ public void setMaxNumberOfPartitions(int maxNumberOfPartitions) {
+ this.maxNumberOfPartitions = maxNumberOfPartitions;
+ }
+
public class TopNKeyDescExplainVectorization extends
OperatorExplainVectorization {
private final TopNKeyDesc topNKeyDesc;
private final VectorTopNKeyDesc vectorTopNKeyDesc;
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestTopNKeyFilter.java
b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestTopNKeyFilter.java
index a91bc73..0ee5c8d 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestTopNKeyFilter.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestTopNKeyFilter.java
@@ -118,7 +118,7 @@ public class TestTopNKeyFilter {
}};
Set<KeyWrapper> disabled = new HashSet<>();
- checkTopNFilterEfficiency(filters, disabled, 0.6f, LOG);
+ checkTopNFilterEfficiency(filters, disabled, 0.6f, LOG, 1);
assertThat(disabled, hasSize(1));
assertThat(disabled, hasItem(new TestKeyWrapper(200)));
}