Repository: hive Updated Branches: refs/heads/master 07492e0d2 -> bf93128e7
HIVE-18611 : Avoid memory allocation of aggregation buffer during stats computation (Ashutosh Chauhan via Gopal V) Signed-off-by: Ashutosh Chauhan <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/bf93128e Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/bf93128e Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/bf93128e Branch: refs/heads/master Commit: bf93128e7fef309bed324a493a83c060220a9743 Parents: 07492e0 Author: Ashutosh Chauhan <[email protected]> Authored: Thu Feb 1 23:37:51 2018 -0800 Committer: Ashutosh Chauhan <[email protected]> Committed: Wed Feb 7 08:55:55 2018 -0800 ---------------------------------------------------------------------- .../stats/annotation/StatsRulesProcFactory.java | 7 ++++++- .../hive/ql/udf/generic/GenericUDAFBloomFilter.java | 12 +++++++++++- .../hive/ql/udf/generic/GenericUDAFEvaluator.java | 11 +++++++++++ .../java/org/apache/hive/common/util/BloomKFilter.java | 6 +++--- 4 files changed, 31 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/bf93128e/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index cbadfa4..9a3f81c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -488,7 +488,7 @@ public class StatsRulesProcFactory { factor *= columnFactor > 1d ? 1d : columnFactor; } float inFactor = HiveConf.getFloatVar(aspCtx.getConf(), HiveConf.ConfVars.HIVE_STATS_IN_CLAUSE_FACTOR); - return Math.round( (double) numRows * factor * inFactor); + return Math.round( numRows * factor * inFactor); } private long evaluateBetweenExpr(Statistics stats, ExprNodeDesc pred, long currNumRows, AnnotateStatsProcCtx aspCtx, @@ -1313,6 +1313,11 @@ public class StatsRulesProcFactory { // each evaluator has constant java object overhead avgValSize += gop.javaObjectOverHead; GenericUDAFEvaluator.AggregationBuffer agg = null; + int evaluatorEstimate = aggregationEvaluators[i].estimate(); + if (evaluatorEstimate > 0) { + avgValSize += evaluatorEstimate; + continue; + } try { agg = aggregationEvaluators[i].getNewAggregationBuffer(); } catch (HiveException e) { http://git-wip-us.apache.org/repos/asf/hive/blob/bf93128e/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java index 0c92d2a..ca8bc8f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFBloomFilter.java @@ -82,7 +82,7 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 { private PrimitiveObjectInspector inputOI; // Bloom filter rest - private ByteArrayOutputStream result = new ByteArrayOutputStream(); + private final ByteArrayOutputStream result = new ByteArrayOutputStream(); private transient byte[] scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES]; @@ -102,6 +102,16 @@ public class GenericUDAFBloomFilter implements GenericUDAFResolver2 { return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; } + @Override + public int estimate() { + long entries = Math.min(getExpectedEntries(), maxEntries); + long numBits = (long) (-entries * Math.log(BloomKFilter.DEFAULT_FPP) / (Math.log(2) * Math.log(2))); + int nLongs = (int) Math.ceil((double) numBits / (double) Long.SIZE); + // additional bits to pad long array to block size + int padLongs = 8 - nLongs % 8; + return (nLongs + padLongs) * Long.SIZE / 8; + } + /** * Class for storing the BloomFilter */ http://git-wip-us.apache.org/repos/asf/hive/blob/bf93128e/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java index c3498b7..3a3e4b6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFEvaluator.java @@ -70,6 +70,17 @@ public abstract class GenericUDAFEvaluator implements Closeable { } /** + * Although similar to AbstractAggregationBuffer::estimate(), it differs from it in 2 aspects + * 1) This avoids creation of AggregationBuffer which may result in large memory allocation + * 2) This is used only while compiling query as oppose to AbstractAggregationBuffer version + * which may be used in both runtime as well as compile time. + * @return + */ + public int estimate() { + return -1; + } + + /** * Mode. * */ http://git-wip-us.apache.org/repos/asf/hive/blob/bf93128e/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java ---------------------------------------------------------------------- diff --git a/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java b/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java index 9ecc2ba..6ccf5ab 100644 --- a/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java +++ b/storage-api/src/java/org/apache/hive/common/util/BloomKFilter.java @@ -36,15 +36,15 @@ import java.util.Arrays; * This implementation has much lesser L1 data cache misses than {@link BloomFilter}. */ public class BloomKFilter { - private byte[] BYTE_ARRAY_4 = new byte[4]; - private byte[] BYTE_ARRAY_8 = new byte[8]; + private final byte[] BYTE_ARRAY_4 = new byte[4]; + private final byte[] BYTE_ARRAY_8 = new byte[8]; public static final float DEFAULT_FPP = 0.05f; private static final int DEFAULT_BLOCK_SIZE = 8; private static final int DEFAULT_BLOCK_SIZE_BITS = (int) (Math.log(DEFAULT_BLOCK_SIZE) / Math.log(2)); private static final int DEFAULT_BLOCK_OFFSET_MASK = DEFAULT_BLOCK_SIZE - 1; private static final int DEFAULT_BIT_OFFSET_MASK = Long.SIZE - 1; private final long[] masks = new long[DEFAULT_BLOCK_SIZE]; - private BitSet bitSet; + private final BitSet bitSet; private final int m; private final int k; // spread k-1 bits to adjacent longs, default is 8
