[30/55] [abbrv] hive git commit: HIVE-11540 - Too many delta files during Compaction - OOM (Eugene Koifman, reviewed by Alan Gates)
HIVE-11540 - Too many delta files during Compaction - OOM (Eugene Koifman, reviewed by Alan Gates) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e3ef96f2 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e3ef96f2 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e3ef96f2 Branch: refs/heads/spark Commit: e3ef96f2b83ffa932dd59fc3df79dff8747309ba Parents: 24ec6be Author: Eugene Koifman Authored: Sat Oct 24 18:44:05 2015 -0700 Committer: Eugene Koifman Committed: Sat Oct 24 18:44:05 2015 -0700 -- .../org/apache/hadoop/hive/conf/HiveConf.java | 2 + .../org/apache/hadoop/hive/ql/io/AcidUtils.java | 15 ++- .../hive/ql/txn/compactor/CompactorMR.java | 96 ++- .../hadoop/hive/ql/txn/compactor/Worker.java | 6 +- .../hive/ql/txn/compactor/CompactorTest.java | 4 + .../hive/ql/txn/compactor/TestWorker.java | 120 +-- 6 files changed, 201 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hive/blob/e3ef96f2/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java -- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index f065048..dc79415 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1551,6 +1551,8 @@ public class HiveConf extends Configuration { HIVE_COMPACTOR_DELTA_PCT_THRESHOLD("hive.compactor.delta.pct.threshold", 0.1f, "Percentage (fractional) size of the delta files relative to the base that will trigger\n" + "a major compaction. 
(1.0 = 100%, so the default 0.1 = 10%.)"), +COMPACTOR_MAX_NUM_DELTA("hive.compactor.max.num.delta", 500, "Maximum number of delta files that " + + "the compactor will attempt to handle in a single job."), HIVE_COMPACTOR_ABORTEDTXN_THRESHOLD("hive.compactor.abortedtxn.threshold", 1000, "Number of aborted transactions involving a given table or partition that will trigger\n" + http://git-wip-us.apache.org/repos/asf/hive/blob/e3ef96f2/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java -- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index 30db513..e8d070c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -132,6 +132,9 @@ public class AcidUtils { return deltaSubdir(min, max) + "_" + String.format(STATEMENT_DIGITS, statementId); } + public static String baseDir(long txnId) { +return BASE_PREFIX + String.format(DELTA_DIGITS, txnId); + } /** * Create a filename for a bucket file. * @param directory the partition directory @@ -221,14 +224,16 @@ public class AcidUtils { Path getBaseDirectory(); /** - * Get the list of original files. + * Get the list of original files. Not {@code null}. * @return the list of original files (eg. 00_0) */ List getOriginalFiles(); /** * Get the list of base and delta directories that are valid and not - * obsolete. + * obsolete. Not {@code null}. List must be sorted in a specific way. + * See {@link org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta#compareTo(org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta)} + * for details. * @return the minimal list of current directories */ List getCurrentDirectories(); @@ -237,7 +242,7 @@ public class AcidUtils { * Get the list of obsolete directories. 
After filtering out bases and * deltas that are not selected by the valid transaction list, return the * list of original files, bases, and deltas that have been replaced by - * more up to date ones. + * more up to date ones. Not {@code null}. */ List getObsolete(); } @@ -284,6 +289,7 @@ public class AcidUtils { * happens in a different process; thus it's possible to have bases/deltas with * overlapping txnId boundaries. The sort order helps figure out the "best" set of files * to use to get data. + * This sorts "wider" delta before "narrower" i.e. delta_5_20 sorts before delta_5_10 (and delta_11_20) */ @Override public int compareTo(ParsedDelta parsedDelta) { @@ -499,6 +505,9 @@ public class AcidUtils { } Collections.sort(working); +//so now, 'working' should be sorted like delta_5_20 delta_5_10 delta_11_20 delta_51_60 for example +//and we want to end up with the best set
hive git commit: HIVE-11540 - Too many delta files during Compaction - OOM (Eugene Koifman, reviewed by Alan Gates)
Repository: hive Updated Branches: refs/heads/master 24ec6beda -> e3ef96f2b HIVE-11540 - Too many delta files during Compaction - OOM (Eugene Koifman, reviewed by Alan Gates) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e3ef96f2 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e3ef96f2 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e3ef96f2 Branch: refs/heads/master Commit: e3ef96f2b83ffa932dd59fc3df79dff8747309ba Parents: 24ec6be Author: Eugene Koifman Authored: Sat Oct 24 18:44:05 2015 -0700 Committer: Eugene Koifman Committed: Sat Oct 24 18:44:05 2015 -0700 -- .../org/apache/hadoop/hive/conf/HiveConf.java | 2 + .../org/apache/hadoop/hive/ql/io/AcidUtils.java | 15 ++- .../hive/ql/txn/compactor/CompactorMR.java | 96 ++- .../hadoop/hive/ql/txn/compactor/Worker.java | 6 +- .../hive/ql/txn/compactor/CompactorTest.java | 4 + .../hive/ql/txn/compactor/TestWorker.java | 120 +-- 6 files changed, 201 insertions(+), 42 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hive/blob/e3ef96f2/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java -- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index f065048..dc79415 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1551,6 +1551,8 @@ public class HiveConf extends Configuration { HIVE_COMPACTOR_DELTA_PCT_THRESHOLD("hive.compactor.delta.pct.threshold", 0.1f, "Percentage (fractional) size of the delta files relative to the base that will trigger\n" + "a major compaction. 
(1.0 = 100%, so the default 0.1 = 10%.)"), +COMPACTOR_MAX_NUM_DELTA("hive.compactor.max.num.delta", 500, "Maximum number of delta files that " + + "the compactor will attempt to handle in a single job."), HIVE_COMPACTOR_ABORTEDTXN_THRESHOLD("hive.compactor.abortedtxn.threshold", 1000, "Number of aborted transactions involving a given table or partition that will trigger\n" + http://git-wip-us.apache.org/repos/asf/hive/blob/e3ef96f2/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java -- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java index 30db513..e8d070c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java @@ -132,6 +132,9 @@ public class AcidUtils { return deltaSubdir(min, max) + "_" + String.format(STATEMENT_DIGITS, statementId); } + public static String baseDir(long txnId) { +return BASE_PREFIX + String.format(DELTA_DIGITS, txnId); + } /** * Create a filename for a bucket file. * @param directory the partition directory @@ -221,14 +224,16 @@ public class AcidUtils { Path getBaseDirectory(); /** - * Get the list of original files. + * Get the list of original files. Not {@code null}. * @return the list of original files (eg. 00_0) */ List getOriginalFiles(); /** * Get the list of base and delta directories that are valid and not - * obsolete. + * obsolete. Not {@code null}. List must be sorted in a specific way. + * See {@link org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta#compareTo(org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta)} + * for details. * @return the minimal list of current directories */ List getCurrentDirectories(); @@ -237,7 +242,7 @@ public class AcidUtils { * Get the list of obsolete directories. 
After filtering out bases and * deltas that are not selected by the valid transaction list, return the * list of original files, bases, and deltas that have been replaced by - * more up to date ones. + * more up to date ones. Not {@code null}. */ List getObsolete(); } @@ -284,6 +289,7 @@ public class AcidUtils { * happens in a different process; thus it's possible to have bases/deltas with * overlapping txnId boundaries. The sort order helps figure out the "best" set of files * to use to get data. + * This sorts "wider" delta before "narrower" i.e. delta_5_20 sorts before delta_5_10 (and delta_11_20) */ @Override public int compareTo(ParsedDelta parsedDelta) { @@ -499,6 +505,9 @@ public class AcidUtils { } Collections.sort(working); +//so now, 'working' should be sorted like delta_5_20 delta_5_10