[30/55] [abbrv] hive git commit: HIVE-11540 - Too many delta files during Compaction - OOM (Eugene Koifman, reviewed by Alan Gates)

2015-10-28 Thread xuefu
HIVE-11540 - Too many delta files during Compaction - OOM (Eugene Koifman, 
reviewed by Alan Gates)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e3ef96f2
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e3ef96f2
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e3ef96f2

Branch: refs/heads/spark
Commit: e3ef96f2b83ffa932dd59fc3df79dff8747309ba
Parents: 24ec6be
Author: Eugene Koifman 
Authored: Sat Oct 24 18:44:05 2015 -0700
Committer: Eugene Koifman 
Committed: Sat Oct 24 18:44:05 2015 -0700

--
 .../org/apache/hadoop/hive/conf/HiveConf.java   |   2 +
 .../org/apache/hadoop/hive/ql/io/AcidUtils.java |  15 ++-
 .../hive/ql/txn/compactor/CompactorMR.java  |  96 ++-
 .../hadoop/hive/ql/txn/compactor/Worker.java|   6 +-
 .../hive/ql/txn/compactor/CompactorTest.java|   4 +
 .../hive/ql/txn/compactor/TestWorker.java   | 120 +--
 6 files changed, 201 insertions(+), 42 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hive/blob/e3ef96f2/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
--
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index f065048..dc79415 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1551,6 +1551,8 @@ public class HiveConf extends Configuration {
 HIVE_COMPACTOR_DELTA_PCT_THRESHOLD("hive.compactor.delta.pct.threshold", 0.1f,
 "Percentage (fractional) size of the delta files relative to the base that will trigger\n" +
 "a major compaction. (1.0 = 100%, so the default 0.1 = 10%.)"),
+COMPACTOR_MAX_NUM_DELTA("hive.compactor.max.num.delta", 500, "Maximum number of delta files that " +
+  "the compactor will attempt to handle in a single job."),
 
 HIVE_COMPACTOR_ABORTEDTXN_THRESHOLD("hive.compactor.abortedtxn.threshold", 1000,
 "Number of aborted transactions involving a given table or partition that will trigger\n" +

http://git-wip-us.apache.org/repos/asf/hive/blob/e3ef96f2/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
--
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 30db513..e8d070c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -132,6 +132,9 @@ public class AcidUtils {
 return deltaSubdir(min, max) + "_" + String.format(STATEMENT_DIGITS, statementId);
   }
 
+  public static String baseDir(long txnId) {
+return BASE_PREFIX + String.format(DELTA_DIGITS, txnId);
+  }
   /**
* Create a filename for a bucket file.
* @param directory the partition directory
@@ -221,14 +224,16 @@ public class AcidUtils {
 Path getBaseDirectory();
 
 /**
- * Get the list of original files.
+ * Get the list of original files.  Not {@code null}.
  * @return the list of original files (eg. 00_0)
  */
 List getOriginalFiles();
 
 /**
  * Get the list of base and delta directories that are valid and not
- * obsolete.
+ * obsolete.  Not {@code null}.  List must be sorted in a specific way.
+ * See {@link org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta#compareTo(org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta)}
+ * for details.
  * @return the minimal list of current directories
  */
 List getCurrentDirectories();
@@ -237,7 +242,7 @@ public class AcidUtils {
  * Get the list of obsolete directories. After filtering out bases and
  * deltas that are not selected by the valid transaction list, return the
  * list of original files, bases, and deltas that have been replaced by
- * more up to date ones.
+ * more up to date ones.  Not {@code null}.
  */
 List getObsolete();
   }
@@ -284,6 +289,7 @@ public class AcidUtils {
  * happens in a different process; thus it's possible to have bases/deltas with
  * overlapping txnId boundaries.  The sort order helps figure out the "best" set of files
  * to use to get data.
+ * This sorts "wider" delta before "narrower" i.e. delta_5_20 sorts before delta_5_10 (and delta_11_20)
  */
 @Override
 public int compareTo(ParsedDelta parsedDelta) {
@@ -499,6 +505,9 @@ public class AcidUtils {
 }
 
 Collections.sort(working);
+//so now, 'working' should be sorted like delta_5_20 delta_5_10 delta_11_20 delta_51_60 for example
+//and we want to end up with the best set 

hive git commit: HIVE-11540 - Too many delta files during Compaction - OOM (Eugene Koifman, reviewed by Alan Gates)

2015-10-24 Thread ekoifman
Repository: hive
Updated Branches:
  refs/heads/master 24ec6beda -> e3ef96f2b


HIVE-11540 - Too many delta files during Compaction - OOM (Eugene Koifman, 
reviewed by Alan Gates)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e3ef96f2
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e3ef96f2
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e3ef96f2

Branch: refs/heads/master
Commit: e3ef96f2b83ffa932dd59fc3df79dff8747309ba
Parents: 24ec6be
Author: Eugene Koifman 
Authored: Sat Oct 24 18:44:05 2015 -0700
Committer: Eugene Koifman 
Committed: Sat Oct 24 18:44:05 2015 -0700

--
 .../org/apache/hadoop/hive/conf/HiveConf.java   |   2 +
 .../org/apache/hadoop/hive/ql/io/AcidUtils.java |  15 ++-
 .../hive/ql/txn/compactor/CompactorMR.java  |  96 ++-
 .../hadoop/hive/ql/txn/compactor/Worker.java|   6 +-
 .../hive/ql/txn/compactor/CompactorTest.java|   4 +
 .../hive/ql/txn/compactor/TestWorker.java   | 120 +--
 6 files changed, 201 insertions(+), 42 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/hive/blob/e3ef96f2/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
--
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index f065048..dc79415 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1551,6 +1551,8 @@ public class HiveConf extends Configuration {
 HIVE_COMPACTOR_DELTA_PCT_THRESHOLD("hive.compactor.delta.pct.threshold", 0.1f,
 "Percentage (fractional) size of the delta files relative to the base that will trigger\n" +
 "a major compaction. (1.0 = 100%, so the default 0.1 = 10%.)"),
+COMPACTOR_MAX_NUM_DELTA("hive.compactor.max.num.delta", 500, "Maximum number of delta files that " +
+  "the compactor will attempt to handle in a single job."),
 
 HIVE_COMPACTOR_ABORTEDTXN_THRESHOLD("hive.compactor.abortedtxn.threshold", 1000,
 "Number of aborted transactions involving a given table or partition that will trigger\n" +

http://git-wip-us.apache.org/repos/asf/hive/blob/e3ef96f2/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
--
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
index 30db513..e8d070c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/AcidUtils.java
@@ -132,6 +132,9 @@ public class AcidUtils {
 return deltaSubdir(min, max) + "_" + String.format(STATEMENT_DIGITS, statementId);
   }
 
+  public static String baseDir(long txnId) {
+return BASE_PREFIX + String.format(DELTA_DIGITS, txnId);
+  }
   /**
* Create a filename for a bucket file.
* @param directory the partition directory
@@ -221,14 +224,16 @@ public class AcidUtils {
 Path getBaseDirectory();
 
 /**
- * Get the list of original files.
+ * Get the list of original files.  Not {@code null}.
  * @return the list of original files (eg. 00_0)
  */
 List getOriginalFiles();
 
 /**
  * Get the list of base and delta directories that are valid and not
- * obsolete.
+ * obsolete.  Not {@code null}.  List must be sorted in a specific way.
+ * See {@link org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta#compareTo(org.apache.hadoop.hive.ql.io.AcidUtils.ParsedDelta)}
+ * for details.
  * @return the minimal list of current directories
  */
 List getCurrentDirectories();
@@ -237,7 +242,7 @@ public class AcidUtils {
  * Get the list of obsolete directories. After filtering out bases and
  * deltas that are not selected by the valid transaction list, return the
  * list of original files, bases, and deltas that have been replaced by
- * more up to date ones.
+ * more up to date ones.  Not {@code null}.
  */
 List getObsolete();
   }
@@ -284,6 +289,7 @@ public class AcidUtils {
  * happens in a different process; thus it's possible to have bases/deltas with
  * overlapping txnId boundaries.  The sort order helps figure out the "best" set of files
  * to use to get data.
+ * This sorts "wider" delta before "narrower" i.e. delta_5_20 sorts before delta_5_10 (and delta_11_20)
  */
 @Override
 public int compareTo(ParsedDelta parsedDelta) {
@@ -499,6 +505,9 @@ public class AcidUtils {
 }
 
 Collections.sort(working);
+//so now, 'working' should be sorted like delta_5_20 delta_5_10