This is an automated email from the ASF dual-hosted git repository.
prasanthj pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new a003428 HIVE-22979: Support total file size in statistics annotation
(Prasanth Jayachandran reviewed by Jesus Camacho Rodriguez)
a003428 is described below
commit a0034284fe02a5012f883704fcd57652519a4cd5
Author: Prasanth Jayachandran <[email protected]>
AuthorDate: Mon Mar 9 10:39:42 2020 -0700
HIVE-22979: Support total file size in statistics annotation (Prasanth
Jayachandran reviewed by Jesus Camacho Rodriguez)
---
.../hive/ql/optimizer/spark/SparkMapJoinOptimizer.java | 2 +-
.../stats/annotation/StatsRulesProcFactory.java | 2 +-
.../org/apache/hadoop/hive/ql/plan/Statistics.java | 18 +++++++++++++++---
.../org/apache/hadoop/hive/ql/stats/BasicStats.java | 15 ++++++++++++---
.../org/apache/hadoop/hive/ql/stats/StatsUtils.java | 6 ++++--
.../ql/exec/tez/TestVectorMapJoinFastHashTable.java | 2 +-
6 files changed, 34 insertions(+), 11 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
index 5dcd49b..0638caf 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/spark/SparkMapJoinOptimizer.java
@@ -215,7 +215,7 @@ public class SparkMapJoinOptimizer implements SemanticNodeProcessor {
LOG.debug("Found a big table branch with parent operator {} and position {}", parentOp, pos);
bigTablePosition = pos;
bigTableFound = true;
- bigInputStat = new Statistics(0, Long.MAX_VALUE, 0);
+ bigInputStat = new Statistics(0, Long.MAX_VALUE, Long.MAX_VALUE, 0);
} else {
// Either we've found multiple big table branches, or the current branch cannot
// be a big table branch. Disable mapjoin for these cases.
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
index 0ada066..43fc449 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java
@@ -2116,7 +2116,7 @@ public class StatsRulesProcFactory {
}
}
- Statistics wcStats = new Statistics(newNumRows, newDataSize, 0);
+ Statistics wcStats = new Statistics(newNumRows, newDataSize, 0, 0);
wcStats.setBasicStatsState(statsState);
// evaluate filter expression and update statistics
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
index bc5f9d9..a4cb841 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/Statistics.java
@@ -52,7 +52,10 @@ public class Statistics implements Serializable {
private long numRows;
private long runTimeNumRows;
+ // dataSize represents raw data size (estimated in-memory size based on row schema) after decompression and decoding.
private long dataSize;
+ // totalFileSize represents on-disk size.
+ private long totalFileSize;
private long numErasureCodedFiles;
private State basicStatsState;
private Map<String, ColStatistics> columnStats;
@@ -60,12 +63,13 @@ public class Statistics implements Serializable {
private boolean runtimeStats;
public Statistics() {
- this(0, 0, 0);
+ this(0, 0, 0, 0);
}
- public Statistics(long nr, long ds, long numEcFiles) {
+ public Statistics(long nr, long ds, long fs, long numEcFiles) {
numRows = nr;
dataSize = ds;
+ totalFileSize = fs;
numErasureCodedFiles = numEcFiles;
runTimeNumRows = -1;
columnStats = null;
@@ -74,6 +78,14 @@ public class Statistics implements Serializable {
updateBasicStatsState();
}
+ public void setTotalFileSize(final long totalFileSize) {
+ this.totalFileSize = totalFileSize;
+ }
+
+ public long getTotalFileSize() {
+ return totalFileSize;
+ }
+
public long getNumRows() {
return numRows;
}
@@ -191,7 +203,7 @@ public class Statistics implements Serializable {
@Override
public Statistics clone() {
- Statistics clone = new Statistics(numRows, dataSize, numErasureCodedFiles);
+ Statistics clone = new Statistics(numRows, dataSize, totalFileSize, numErasureCodedFiles);
clone.setRunTimeNumRows(runTimeNumRows);
clone.setBasicStatsState(basicStatsState);
clone.setColumnStatsState(columnStatsState);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java
index 02a9f15..b352ae4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/BasicStats.java
@@ -196,7 +196,7 @@ public class BasicStats {
public void apply(BasicStats stats) {
long ds = stats.getRawDataSize();
if (ds <= 0) {
- ds = stats.getTotalSize();
+ ds = stats.getTotalFileSize();
// if data size is still 0 then get file size
if (ds <= 0) {
@@ -229,6 +229,7 @@ public class BasicStats {
private long currentNumRows;
private long currentDataSize;
+ private long currentFileSize;
private Statistics.State state;
public BasicStats(Partish p) {
@@ -240,6 +241,7 @@ public class BasicStats {
currentNumRows = rowCount;
currentDataSize = rawDataSize;
+ currentFileSize = totalSize;
if (currentNumRows > 0) {
state = State.COMPLETE;
@@ -253,10 +255,12 @@ public class BasicStats {
partish = null;
List<Long> nrIn = Lists.newArrayList();
List<Long> dsIn = Lists.newArrayList();
+ List<Long> fsIn = Lists.newArrayList();
state = (partStats.size() == 0) ? State.COMPLETE : null;
for (BasicStats ps : partStats) {
nrIn.add(ps.getNumRows());
dsIn.add(ps.getDataSize());
+ fsIn.add(ps.getTotalFileSize());
if (state == null) {
state = ps.getState();
@@ -266,6 +270,7 @@ public class BasicStats {
}
currentNumRows = StatsUtils.getSumIgnoreNegatives(nrIn);
currentDataSize = StatsUtils.getSumIgnoreNegatives(dsIn);
+ currentFileSize = StatsUtils.getSumIgnoreNegatives(fsIn);
}
@@ -293,8 +298,12 @@ public class BasicStats {
currentDataSize = ds;
}
- protected long getTotalSize() {
- return totalSize;
+ protected long getTotalFileSize() {
+ return currentFileSize;
+ }
+
+ public void setTotalFileSize(final long totalFileSize) {
+ this.currentFileSize = totalFileSize;
}
protected long getRawDataSize() {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index bd4a4f6..73a8d80 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -276,6 +276,7 @@ public class StatsUtils {
// long nr = getNumRows(conf, schema, neededColumns, table, ds);
long ds = basicStats.getDataSize();
long nr = basicStats.getNumRows();
+ long fs = basicStats.getTotalFileSize();
List<ColStatistics> colStats = Collections.emptyList();
long numErasureCodedFiles = getErasureCodedFiles(table);
@@ -292,7 +293,7 @@ public class StatsUtils {
}
}
- stats = new Statistics(nr, ds, numErasureCodedFiles);
+ stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
// infer if any column can be primary key based on column statistics
inferAndSetPrimaryKey(stats.getNumRows(), colStats);
@@ -321,6 +322,7 @@ public class StatsUtils {
long nr = bbs.getNumRows();
long ds = bbs.getDataSize();
+ long fs = bbs.getTotalFileSize();
List<Long> erasureCodedFiles = getBasicStatForPartitions(table,
partList.getNotDeniedPartns(),
StatsSetupConst.NUM_ERASURE_CODED_FILES);
@@ -329,7 +331,7 @@ public class StatsUtils {
if (nr == 0) {
nr = 1;
}
- stats = new Statistics(nr, ds, numErasureCodedFiles);
+ stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
stats.setBasicStatsState(bbs.getState());
if (nr > 0) {
// FIXME: this promotion process should be removed later
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java
index c2a1823..cb8ac38 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/tez/TestVectorMapJoinFastHashTable.java
@@ -96,7 +96,7 @@ public class TestVectorMapJoinFastHashTable {
dataSize += 8;
}
- Statistics stat = new Statistics(keyCount, dataSize, 0);
+ Statistics stat = new Statistics(keyCount, dataSize, 0, 0);
Long realObjectSize = getObjectSize(container);
Long executionEstimate = container.getEstimatedMemorySize();