This is an automated email from the ASF dual-hosted git repository.

prasanthj pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new a003428 HIVE-22979: Support total file size in statistics annotation (Prasanth Jayachandran reviewed by Jesus Camacho Rodriguez) a003428 is described below commit a0034284fe02a5012f883704fcd57652519a4cd5 Author: Prasanth Jayachandran <prasan...@apache.org> AuthorDate: Mon Mar 9 10:39:42 2020 -0700 HIVE-22979: Support total file size in statistics annotation (Prasanth Jayachandran reviewed by Jesus Camacho Rodriguez) --- .../hive/ql/optimizer/spark/SparkMapJoinOptimizer.java | 2 +- .../stats/annotation/StatsRulesProcFactory.java | 2 +- .../org/apache/hadoop/hive/ql/plan/Statistics.java | 18 +++++++++++++++--- .../org/apache/hadoop/hive/ql/stats/BasicStats.java | 15 ++++++++++++--- .../org/apache/hadoop/hive/ql/stats/StatsUtils.java | 6 ++++-- .../ql/exec/tez/TestVectorMapJoinFastHashTable.java | 2 +- 6 files changed, 34 insertions(+), 11 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java index 5dcd49b..0638caf 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java @@ -215,7 +215,7 @@ public class SparkMapJoinOptimizer implements SemanticNodeProcessor { LOG.debug("Found a big table branch with parent operator {} and position {}", parentOp, pos); bigTablePosition = pos; bigTableFound = true; - bigInputStat = new Statistics(0, Long.MAX_VALUE, 0); + bigInputStat = new Statistics(0, Long.MAX_VALUE, Long.MAX_VALUE, 0); } else { // Either we've found multiple big table branches, or the current branch cannot // be a big table branch. Disable mapjoin for these cases. 
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java index 0ada066..43fc449 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java @@ -2116,7 +2116,7 @@ public class StatsRulesProcFactory { } } - Statistics wcStats = new Statistics(newNumRows, newDataSize, 0); + Statistics wcStats = new Statistics(newNumRows, newDataSize, 0, 0); wcStats.setBasicStatsState(statsState); // evaluate filter expression and update statistics diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java index bc5f9d9..a4cb841 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java @@ -52,7 +52,10 @@ public class Statistics implements Serializable { private long numRows; private long runTimeNumRows; + // dataSize represents raw data size (estimated in-memory size based on row schema) after decompression and decoding. private long dataSize; + // totalFileSize represents on-disk size. 
+ private long totalFileSize; private long numErasureCodedFiles; private State basicStatsState; private Map<String, ColStatistics> columnStats; @@ -60,12 +63,13 @@ public class Statistics implements Serializable { private boolean runtimeStats; public Statistics() { - this(0, 0, 0); + this(0, 0, 0, 0); } - public Statistics(long nr, long ds, long numEcFiles) { + public Statistics(long nr, long ds, long fs, long numEcFiles) { numRows = nr; dataSize = ds; + totalFileSize = fs; numErasureCodedFiles = numEcFiles; runTimeNumRows = -1; columnStats = null; @@ -74,6 +78,14 @@ public class Statistics implements Serializable { updateBasicStatsState(); } + public void setTotalFileSize(final long totalFileSize) { + this.totalFileSize = totalFileSize; + } + + public long getTotalFileSize() { + return totalFileSize; + } + public long getNumRows() { return numRows; } @@ -191,7 +203,7 @@ public class Statistics implements Serializable { @Override public Statistics clone() { - Statistics clone = new Statistics(numRows, dataSize, numErasureCodedFiles); + Statistics clone = new Statistics(numRows, dataSize, totalFileSize, numErasureCodedFiles); clone.setRunTimeNumRows(runTimeNumRows); clone.setBasicStatsState(basicStatsState); clone.setColumnStatsState(columnStatsState); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java index 02a9f15..b352ae4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java @@ -196,7 +196,7 @@ public class BasicStats { public void apply(BasicStats stats) { long ds = stats.getRawDataSize(); if (ds <= 0) { - ds = stats.getTotalSize(); + ds = stats.getTotalFileSize(); // if data size is still 0 then get file size if (ds <= 0) { @@ -229,6 +229,7 @@ public class BasicStats { private long currentNumRows; private long currentDataSize; + private long currentFileSize; private Statistics.State state; 
public BasicStats(Partish p) { @@ -240,6 +241,7 @@ public class BasicStats { currentNumRows = rowCount; currentDataSize = rawDataSize; + currentFileSize = totalSize; if (currentNumRows > 0) { state = State.COMPLETE; @@ -253,10 +255,12 @@ public class BasicStats { partish = null; List<Long> nrIn = Lists.newArrayList(); List<Long> dsIn = Lists.newArrayList(); + List<Long> fsIn = Lists.newArrayList(); state = (partStats.size() == 0) ? State.COMPLETE : null; for (BasicStats ps : partStats) { nrIn.add(ps.getNumRows()); dsIn.add(ps.getDataSize()); + fsIn.add(ps.getTotalFileSize()); if (state == null) { state = ps.getState(); @@ -266,6 +270,7 @@ public class BasicStats { } currentNumRows = StatsUtils.getSumIgnoreNegatives(nrIn); currentDataSize = StatsUtils.getSumIgnoreNegatives(dsIn); + currentFileSize = StatsUtils.getSumIgnoreNegatives(fsIn); } @@ -293,8 +298,12 @@ public class BasicStats { currentDataSize = ds; } - protected long getTotalSize() { - return totalSize; + protected long getTotalFileSize() { + return currentFileSize; + } + + public void setTotalFileSize(final long totalFileSize) { + this.currentFileSize = totalFileSize; } protected long getRawDataSize() { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java index bd4a4f6..73a8d80 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java @@ -276,6 +276,7 @@ public class StatsUtils { // long nr = getNumRows(conf, schema, neededColumns, table, ds); long ds = basicStats.getDataSize(); long nr = basicStats.getNumRows(); + long fs = basicStats.getTotalFileSize(); List<ColStatistics> colStats = Collections.emptyList(); long numErasureCodedFiles = getErasureCodedFiles(table); @@ -292,7 +293,7 @@ public class StatsUtils { } } - stats = new Statistics(nr, ds, numErasureCodedFiles); + stats = new Statistics(nr, ds, fs, numErasureCodedFiles); // infer if any 
column can be primary key based on column statistics inferAndSetPrimaryKey(stats.getNumRows(), colStats); @@ -321,6 +322,7 @@ public class StatsUtils { long nr = bbs.getNumRows(); long ds = bbs.getDataSize(); + long fs = bbs.getTotalFileSize(); List<Long> erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.NUM_ERASURE_CODED_FILES); @@ -329,7 +331,7 @@ public class StatsUtils { if (nr == 0) { nr = 1; } - stats = new Statistics(nr, ds, numErasureCodedFiles); + stats = new Statistics(nr, ds, fs, numErasureCodedFiles); stats.setBasicStatsState(bbs.getState()); if (nr > 0) { // FIXME: this promotion process should be removed later diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java index c2a1823..cb8ac38 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java @@ -96,7 +96,7 @@ public class TestVectorMapJoinFastHashTable { dataSize += 8; } - Statistics stat = new Statistics(keyCount, dataSize, 0); + Statistics stat = new Statistics(keyCount, dataSize, 0, 0); Long realObjectSize = getObjectSize(container); Long executionEstimate = container.getEstimatedMemorySize();