[ https://issues.apache.org/jira/browse/HIVE-25345?focusedWorklogId=624787&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-624787 ]

ASF GitHub Bot logged work on HIVE-25345:
-----------------------------------------

                Author: ASF GitHub Bot
            Created on: 20/Jul/21 07:51
            Start Date: 20/Jul/21 07:51
    Worklog Time Spent: 10m 
      Work Description: klcopp commented on a change in pull request #2493:
URL: https://github.com/apache/hive/pull/2493#discussion_r672868711



##########
File path: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
##########
@@ -3194,6 +3194,15 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal
         "Age of table/partition's oldest aborted transaction when compaction will be triggered. " +
         "Default time unit is: hours. Set to a negative number to disable."),
 
+    HIVE_COMPACTOR_ACTIVE_DELTA_DIR_THRESHOLD("hive.compactor.active.delta.dir.threshold", 200,
+        "Number if active delta directories under a given table/partition."),

Review comment:
       I think the descriptions here should reflect that these are thresholds, and that logging will happen if they are exceeded.
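
   For illustration, one way the description could read if the suggestion is adopted (a sketch only, not the wording used in the PR):

       HIVE_COMPACTOR_ACTIVE_DELTA_DIR_THRESHOLD("hive.compactor.active.delta.dir.threshold", 200,
           "Threshold on the number of active delta directories under a given table/partition. A warning is " +
               "logged when the threshold is exceeded."),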

##########
File path: standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java
##########
@@ -432,6 +432,88 @@ public static ConfVars getMetaConf(String name) {
     COMPACTOR_RUN_AS_USER("metastore.compactor.run.as.user", "hive.compactor.run.as.user", "",
         "Specify the user to run compactor Initiator and Worker as. If empty string, defaults to table/partition " +
         "directory owner."),
+    COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING(

Review comment:
       For all of these: instead of "after which a warning should be raised", I think it would be clearer to say "after which a warning will be logged".

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HMSMetricsListener.java
##########
@@ -101,7 +101,14 @@ public void onAllocWriteId(AllocWriteIdEvent allocWriteIdEvent, Connection dbCon
       Table table = getTable(allocWriteIdEvent);
 
       if (MetaStoreUtils.isNoAutoCompactSet(table.getParameters())) {
-        Metrics.getOrCreateGauge(MetricsConstants.WRITES_TO_DISABLED_COMPACTION_TABLE).incrementAndGet();
+        int noAutoCompactSet =
+            Metrics.getOrCreateGauge(MetricsConstants.WRITES_TO_DISABLED_COMPACTION_TABLE).incrementAndGet();
+        if (noAutoCompactSet >=
+            MetastoreConf.getIntVar(getConf(),
+                MetastoreConf.ConfVars.COMPACTOR_NUMBER_OF_DISABLED_COMPACTION_TABLES_THRESHOLD)) {
+          LOGGER.warn("Number of tables where the compaction is turned off is: " + noAutoCompactSet);

Review comment:
       This might be clearer: "There has been a write to a table where 
auto-compaction is disabled (tblproperties ("no_auto_compact"="true"))...
   And definitely log the db and table name, so users can find it and re-enable 
auto-compaction.
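
   A minimal sketch of what that could look like, reusing the Table object already available in onAllocWriteId (the exact wording is only illustrative):

       LOGGER.warn("There has been a write to table " + table.getDbName() + "." + table.getTableName() +
           " where auto-compaction is disabled (tblproperties (\"no_auto_compact\"=\"true\")). " +
           "Total writes to tables with auto-compaction disabled: " + noAutoCompactSet);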

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java
##########
@@ -85,36 +85,113 @@ public void run() {
 
   private void collectMetrics() throws MetaException {
     ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest());
-    updateMetricsFromShowCompact(currentCompactions);
+    updateMetricsFromShowCompact(currentCompactions, conf);
     updateDBMetrics();
   }
 
   private void updateDBMetrics() throws MetaException {
     MetricsInfo metrics = txnHandler.getMetricsInfo();
     Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount());
+    if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) &&
+        metrics.getTxnToWriteIdCount() <
+            MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    } else if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount());
-
+    if (metrics.getCompletedTxnsCount() >=
+        MetastoreConf.getIntVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) &&
+        metrics.getCompletedTxnsCount() <
+            MetastoreConf.getIntVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf,
+        MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge());
+    if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    } else if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+        TimeUnit.SECONDS)) {
+      LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge());
+    if (metrics.getOldestOpenNonReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS)
+        && metrics.getOldestOpenNonReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds.");

Review comment:
       open non-replication transaction*
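
   I.e., something along these lines (sketch of the reworded message only):

       LOG.warn("Found an open non-replication transaction with an age of " +
           metrics.getOldestOpenNonReplTxnAge() + " seconds.");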

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java
##########
@@ -85,36 +85,113 @@ public void run() {
 
   private void collectMetrics() throws MetaException {
     ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest());
-    updateMetricsFromShowCompact(currentCompactions);
+    updateMetricsFromShowCompact(currentCompactions, conf);
     updateDBMetrics();
   }
 
   private void updateDBMetrics() throws MetaException {
     MetricsInfo metrics = txnHandler.getMetricsInfo();
     Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount());
+    if (metrics.getTxnToWriteIdCount() >=

Review comment:
       It would be more readable to separate all the logging out into a new method (you can just pass in the MetricsInfo object).
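
   A rough sketch of such an extraction, assuming the existing conf field; the method name logDbMetrics is hypothetical. Checking the error threshold first also removes the need for the double-bounded warning conditions:

       private void logDbMetrics(MetricsInfo metrics) {
         int txnToWriteIdCount = metrics.getTxnToWriteIdCount();
         if (txnToWriteIdCount >= MetastoreConf.getIntVar(conf,
             MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
           LOG.error("An excessive amount of (" + txnToWriteIdCount + ") Hive ACID metadata found in " +
               "TXN_TO_WRITEID table, which can cause serious performance degradation.");
         } else if (txnToWriteIdCount >= MetastoreConf.getIntVar(conf,
             MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING)) {
           LOG.warn("An excessive amount of (" + txnToWriteIdCount + ") Hive ACID metadata found in " +
               "TXN_TO_WRITEID table, which can cause serious performance degradation.");
         }
         // ... the remaining threshold checks (COMPLETED_TXN_COMPONENTS, open/aborted transactions, ...)
         // follow the same error-first pattern
       }

   updateDBMetrics() would then end with a single logDbMetrics(metrics) call after the gauges are set.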

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java
##########
@@ -85,36 +85,113 @@ public void run() {
 
   private void collectMetrics() throws MetaException {
     ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest());
-    updateMetricsFromShowCompact(currentCompactions);
+    updateMetricsFromShowCompact(currentCompactions, conf);
     updateDBMetrics();
   }
 
   private void updateDBMetrics() throws MetaException {
     MetricsInfo metrics = txnHandler.getMetricsInfo();
     Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount());
+    if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) &&
+        metrics.getTxnToWriteIdCount() <
+            MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    } else if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount());
-
+    if (metrics.getCompletedTxnsCount() >=
+        MetastoreConf.getIntVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) &&
+        metrics.getCompletedTxnsCount() <
+            MetastoreConf.getIntVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf,
+        MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge());
+    if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    } else if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+        TimeUnit.SECONDS)) {
+      LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge());
+    if (metrics.getOldestOpenNonReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS)
+        && metrics.getOldestOpenNonReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds.");
+    } else if (metrics.getOldestOpenNonReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.error("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds.");
+    }
 
     Metrics.getOrCreateGauge(NUM_ABORTED_TXNS).set(metrics.getAbortedTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_ID).set(metrics.getOldestAbortedTxnId());
     Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_AGE).set(metrics.getOldestAbortedTxnAge());
+    if (metrics.getOldestAbortedTxnAge() >=
+        MetastoreConf.getTimeVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) &&
+        metrics.getOldestAbortedTxnAge() <
+            MetastoreConf.getTimeVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR,
+                TimeUnit.SECONDS)) {
+      LOG.warn("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds.");
+    } else if (metrics.getOldestAbortedTxnAge() >=
+        MetastoreConf.getTimeVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.error("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds.");
+    }
 
     Metrics.getOrCreateGauge(NUM_LOCKS).set(metrics.getLocksCount());
     Metrics.getOrCreateGauge(OLDEST_LOCK_AGE).set(metrics.getOldestLockAge());
 
     Metrics.getOrCreateGauge(TABLES_WITH_X_ABORTED_TXNS).set(metrics.getTablesWithXAbortedTxns());
+    if (metrics.getOldestAbortedTxnAge() >

Review comment:
       What do you think about saving the list of these tables in MetricsInfo, and logging that list here, so users know where to clean? It's just an idea.
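
   A possible shape for that idea, assuming MetricsInfo were extended with a hypothetical getter for the table names (not part of the PR):

       // getTablesWithXAbortedTxnsNames() is hypothetical; it would have to be added to MetricsInfo
       Set<String> abortedTables = metrics.getTablesWithXAbortedTxnsNames();
       if (!abortedTables.isEmpty()) {
         LOG.warn("Tables/partitions with a high number of aborted transactions: " +
             String.join(", ", abortedTables));
       }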

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java
##########
@@ -143,11 +220,33 @@ public static void updateMetricsFromShowCompact(ShowCompactResponse showCompactR
         Metrics.getOrCreateGauge(key).set(0);
       }
     }
+
+    Long numFailedComp = counts.get(TxnStore.FAILED_RESPONSE);
+    Long numNotInitiatedComp = counts.get(TxnStore.DID_NOT_INITIATE_RESPONSE);
+    Long numSucceededComp = counts.get(TxnStore.SUCCEEDED_RESPONSE);
+    if (numFailedComp != null && numNotInitiatedComp != null && numSucceededComp != null &&
+        ((numFailedComp + numNotInitiatedComp) / (numFailedComp + numNotInitiatedComp + numSucceededComp) >
+      MetastoreConf.getDoubleVar(conf, MetastoreConf.ConfVars.COMPACTOR_FAILED_COMPACTION_RATIO_THRESHOLD))) {
+      LOG.warn("Many compactions are failing. Check root cause of failed/not initiated compactions.");
+    }
+
     if (oldestEnqueueTime == Long.MAX_VALUE) {
       Metrics.getOrCreateGauge(COMPACTION_OLDEST_ENQUEUE_AGE).set(0);
     } else {
+      int oldestEnqueueAge = (int) ((System.currentTimeMillis() - oldestEnqueueTime) / 1000L);
       Metrics.getOrCreateGauge(COMPACTION_OLDEST_ENQUEUE_AGE)
-          .set((int) ((System.currentTimeMillis() - oldestEnqueueTime) / 1000L));
+          .set(oldestEnqueueAge);
+      if (oldestEnqueueAge >= MetastoreConf.getTimeVar(conf,
+          MetastoreConf.ConfVars.COMPACTOR_OLDEST_INITIATED_COMPACTION_TIME_THRESHOLD_WARNING, TimeUnit.SECONDS) &&
+          oldestEnqueueAge < MetastoreConf.getTimeVar(conf,
+              MetastoreConf.ConfVars.COMPACTOR_OLDEST_INITIATED_COMPACTION_TIME_THRESHOLD_ERROR, TimeUnit.SECONDS)) {
+        LOG.warn("Found compaction entry in compaction queue with an age of " + oldestEnqueueAge + " seconds. " +
+            "Check the time of last successful compaction and number of worker threads.");

Review comment:
       "Check the time of last successful compaction" -> I know this is part of the alert description, but maybe it doesn't make sense here. Maybe just leave it at "Check the number of worker threads", or "Consider increasing the number of worker threads".
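
   For example (sketch of the reworded message only, reusing the oldestEnqueueAge variable from the PR):

       LOG.warn("Found compaction entry in compaction queue with an age of " + oldestEnqueueAge + " seconds. " +
           "Consider increasing the number of worker threads.");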

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java
##########
@@ -85,36 +85,113 @@ public void run() {
 
   private void collectMetrics() throws MetaException {
     ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest());
-    updateMetricsFromShowCompact(currentCompactions);
+    updateMetricsFromShowCompact(currentCompactions, conf);
     updateDBMetrics();
   }
 
   private void updateDBMetrics() throws MetaException {
     MetricsInfo metrics = txnHandler.getMetricsInfo();
     Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount());
+    if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) &&
+        metrics.getTxnToWriteIdCount() <
+            MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    } else if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount());
-
+    if (metrics.getCompletedTxnsCount() >=
+        MetastoreConf.getIntVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) &&
+        metrics.getCompletedTxnsCount() <
+            MetastoreConf.getIntVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf,
+        MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge());
+    if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    } else if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+        TimeUnit.SECONDS)) {
+      LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge());
+    if (metrics.getOldestOpenNonReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS)
+        && metrics.getOldestOpenNonReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds.");
+    } else if (metrics.getOldestOpenNonReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.error("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds.");
+    }
 
     Metrics.getOrCreateGauge(NUM_ABORTED_TXNS).set(metrics.getAbortedTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_ID).set(metrics.getOldestAbortedTxnId());
     Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_AGE).set(metrics.getOldestAbortedTxnAge());
+    if (metrics.getOldestAbortedTxnAge() >=
+        MetastoreConf.getTimeVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) &&
+        metrics.getOldestAbortedTxnAge() <
+            MetastoreConf.getTimeVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR,
+                TimeUnit.SECONDS)) {
+      LOG.warn("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds.");
+    } else if (metrics.getOldestAbortedTxnAge() >=
+        MetastoreConf.getTimeVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.error("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds.");
+    }
 
     Metrics.getOrCreateGauge(NUM_LOCKS).set(metrics.getLocksCount());
     Metrics.getOrCreateGauge(OLDEST_LOCK_AGE).set(metrics.getOldestLockAge());
 
     Metrics.getOrCreateGauge(TABLES_WITH_X_ABORTED_TXNS).set(metrics.getTablesWithXAbortedTxns());
+    if (metrics.getOldestAbortedTxnAge() >
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TABLES_WITH_ABORTEDTXN_THRESHOLD)) {
+      LOG.error("Found " + metrics.getOldestAbortedTxnAge() + " tables/partitions with more than " +

Review comment:
       getTablesWithXAbortedTxns() instead of getOldestAbortedTxnAge()
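
   I.e., both the check and the logged count presumably should use the table count (sketch; the rest of the message would stay as in the PR):

       if (metrics.getTablesWithXAbortedTxns() >
           MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TABLES_WITH_ABORTEDTXN_THRESHOLD)) {
         // log metrics.getTablesWithXAbortedTxns() here instead of metrics.getOldestAbortedTxnAge();
         // the message text itself is unchanged from the PR
       }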
   

##########
File path: standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java
##########
@@ -432,6 +432,88 @@ public static ConfVars getMetaConf(String name) {
     COMPACTOR_RUN_AS_USER("metastore.compactor.run.as.user", "hive.compactor.run.as.user", "",
         "Specify the user to run compactor Initiator and Worker as. If empty string, defaults to table/partition " +
         "directory owner."),
+    COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING(
+        "metastore.compactor.oldest.replication.open.txn.threshold.warning",
+        "hive.compactor.oldest.replication.open.txn.threshold.warning",
+        "14d", new TimeValidator(TimeUnit.DAYS),
+        "Age of open replication transaction after a warning should be raised. Default time unit: days"),
+    COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR(
+        "metastore.compactor.oldest.replication.open.txn.threshold.error",
+        "hive.compactor.oldest.replication.open.txn.threshold.error",
+        "21d", new TimeValidator(TimeUnit.DAYS),
+        "Age of open replication transaction after an error should be raised. Default time unit: days"),
+    COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING(

Review comment:
       *oldest non-replication transaction

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java
##########
@@ -85,36 +85,113 @@ public void run() {
 
   private void collectMetrics() throws MetaException {
     ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest());
-    updateMetricsFromShowCompact(currentCompactions);
+    updateMetricsFromShowCompact(currentCompactions, conf);
     updateDBMetrics();
   }
 
   private void updateDBMetrics() throws MetaException {
     MetricsInfo metrics = txnHandler.getMetricsInfo();
     Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount());
+    if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) &&
+        metrics.getTxnToWriteIdCount() <
+            MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    } else if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount());
-
+    if (metrics.getCompletedTxnsCount() >=
+        MetastoreConf.getIntVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) &&
+        metrics.getCompletedTxnsCount() <
+            MetastoreConf.getIntVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf,
+        MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge());
+    if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    } else if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+        TimeUnit.SECONDS)) {
+      LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge());
+    if (metrics.getOldestOpenNonReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS)
+        && metrics.getOldestOpenNonReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds.");
+    } else if (metrics.getOldestOpenNonReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.error("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds.");
+    }
 
     Metrics.getOrCreateGauge(NUM_ABORTED_TXNS).set(metrics.getAbortedTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_ID).set(metrics.getOldestAbortedTxnId());
     Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_AGE).set(metrics.getOldestAbortedTxnAge());
+    if (metrics.getOldestAbortedTxnAge() >=
+        MetastoreConf.getTimeVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) &&
+        metrics.getOldestAbortedTxnAge() <
+            MetastoreConf.getTimeVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR,
+                TimeUnit.SECONDS)) {
+      LOG.warn("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds.");

Review comment:
       Same as above: can log the txnid

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java
##########
@@ -85,36 +85,113 @@ public void run() {
 
   private void collectMetrics() throws MetaException {
     ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest());
-    updateMetricsFromShowCompact(currentCompactions);
+    updateMetricsFromShowCompact(currentCompactions, conf);
     updateDBMetrics();
   }
 
   private void updateDBMetrics() throws MetaException {
     MetricsInfo metrics = txnHandler.getMetricsInfo();
     Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount());
+    if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) &&
+        metrics.getTxnToWriteIdCount() <
+            MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    } else if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount());
-
+    if (metrics.getCompletedTxnsCount() >=
+        MetastoreConf.getIntVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) &&
+        metrics.getCompletedTxnsCount() <
+            MetastoreConf.getIntVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf,
+        MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge());
+    if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +

Review comment:
       Here you can log the id too (OLDEST_OPEN_REPL_TXN_ID)
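
   Sketch of what that could look like, using the id that is already fetched for the OLDEST_OPEN_REPL_TXN_ID gauge (wording is illustrative only):

       LOG.warn("Replication transaction with id " + metrics.getOldestOpenReplTxnId() +
           " has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
           "Before you abort a transaction that was created by replication, and which has been open a long time, " +
           "make sure that the hive.repl.txn.timeout threshold has expired.");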

##########
File path: ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/metrics/DeltaFilesMetricReporter.java
##########
@@ -212,6 +213,22 @@ public static void mergeDeltaFilesStats(AcidDirectory dir, long checkThresholdIn
         }
       }
     }
+

Review comment:
       It would be better to separate this block out into a new method, for 
better readability

##########
File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java
##########
@@ -85,36 +85,113 @@ public void run() {
 
   private void collectMetrics() throws MetaException {
     ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest());
-    updateMetricsFromShowCompact(currentCompactions);
+    updateMetricsFromShowCompact(currentCompactions, conf);
     updateDBMetrics();
   }
 
   private void updateDBMetrics() throws MetaException {
     MetricsInfo metrics = txnHandler.getMetricsInfo();
     Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount());
+    if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) &&
+        metrics.getTxnToWriteIdCount() <
+            MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    } else if (metrics.getTxnToWriteIdCount() >=
+        MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " +
+          "TXN_TO_WRITEID table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount());
-
+    if (metrics.getCompletedTxnsCount() >=
+        MetastoreConf.getIntVar(conf,
+            MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) &&
+        metrics.getCompletedTxnsCount() <
+            MetastoreConf.getIntVar(conf,
+                MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf,
+        MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) {
+      LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " +
+          "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge());
+    if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    } else if (metrics.getOldestOpenReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR,
+        TimeUnit.SECONDS)) {
+      LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " +
+          "Before you abort a transaction that was created by replication, and which has been open a long time, " +
+          "make sure that the hive.repl.txn.timeout threshold has expired.");
+    }
     Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId());
     Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge());
+    if (metrics.getOldestOpenNonReplTxnAge() >=
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING,
+            TimeUnit.SECONDS)
+        && metrics.getOldestOpenNonReplTxnAge() <
+        MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR,
+            TimeUnit.SECONDS)) {
+      LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds.");

Review comment:
       Same as above: can log the txn id (OLDEST_OPEN_NON_REPL_TXN_ID)

##########
File path: standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java
##########
@@ -432,6 +432,88 @@ public static ConfVars getMetaConf(String name) {
     COMPACTOR_RUN_AS_USER("metastore.compactor.run.as.user", "hive.compactor.run.as.user", "",
         "Specify the user to run compactor Initiator and Worker as. If empty string, defaults to table/partition " +
         "directory owner."),
+    COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING(
+        "metastore.compactor.oldest.replication.open.txn.threshold.warning",
+        "hive.compactor.oldest.replication.open.txn.threshold.warning",
+        "14d", new TimeValidator(TimeUnit.DAYS),
+        "Age of open replication transaction after a warning should be raised. Default time unit: days"),
+    COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR(
+        "metastore.compactor.oldest.replication.open.txn.threshold.error",
+        "hive.compactor.oldest.replication.open.txn.threshold.error",
+        "21d", new TimeValidator(TimeUnit.DAYS),
+        "Age of open replication transaction after an error should be raised. Default time unit: days"),
+    COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING(
+        "metastore.compactor.oldest.open.txn.threshold.warning",
+        "hive.compactor.oldest.open.txn.threshold.warning", "24h",
+        new TimeValidator(TimeUnit.HOURS), "Age of oldest open transaction after a warning should be raised. " +
+        "Default time unit: hours"),
+    COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR(
+        "metastore.compactor.oldest.open.txn.threshold.error",
+        "hive.compactor.oldest.open.txn.threshold.error", "72h",
+        new TimeValidator(TimeUnit.HOURS), "Age of oldest open transaction after an error should be raised. " +
+        "Default time unit: hours"),
+    COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING(
+        "metastore.compactor.oldest.uncleaned.aborted.txn.time.threshold.warning",
+        "hive.compactor.oldest.uncleaned.aborted.txn.time.threshold.warning",
+        "24h", new TimeValidator(TimeUnit.HOURS),
+        "Age of oldest aborted transaction after a warning should be raised. Default time unit: hours"),
+    COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR(
+        "metastore.compactor.oldest.uncleaned.aborted.txn.time.threshold.error",
+        "hive.compactor.oldest.uncleaned.aborted.txn.time.threshold.error",
+        "48h", new TimeValidator(TimeUnit.HOURS),
+        "Age of oldest aborted transaction after an error should be raised. Default time unit: hours"),
+    COMPACTOR_TABLES_WITH_ABORTEDTXN_THRESHOLD(
+        "metastore.compactor.tables.with.aborted.txn.threshold",
+        "hive.compactor.tables.with.aborted.txn.threshold", 1,
+        "Number of tables has not been compacted and have more than " +
+            "hive.metastore.acidmetrics.table.aborted.txns.threshold (default 1500) aborted transactions."),
+    COMPACTOR_OLDEST_UNCLEANED_COMPACTION_TIME_THRESHOLD(
+        "metastore.compactor.oldest.uncleaned.compaction.time.threshold",
+        "hive.compactor.oldest.uncleaned.compaction.time.threshold",
+        "24h", new TimeValidator(TimeUnit.HOURS),
+        "Age of oldest ready for cleaning compaction in the compaction queue. Default time unit is: hours"),
+    COMPACTOR_FAILED_COMPACTION_RATIO_THRESHOLD(
+        "metastore.compactor.failed.compaction.ratio.threshold",
+        "hive.compactor.failed.compaction.ratio.threshold", .01,
+        "Ratio between the number of failed compactions + not initiated compactions and number of failed " +
+            "compactions + not initiated compactions + succeeded compactions."),
+    COMPACTOR_OLDEST_INITIATED_COMPACTION_TIME_THRESHOLD_WARNING(
+        "metastore.compactor.oldest.initiated.compaction.time.threshold.warning",
+        "hive.compactor.oldest.initiated.compaction.time.threshold.warning",
+        "1h", new TimeValidator(TimeUnit.HOURS),
+        "Age of oldest initiated compaction in the compaction queue when a warning should be raised. " +
+            "Default time unit is: hours"),
+    COMPACTOR_OLDEST_INITIATED_COMPACTION_TIME_THRESHOLD_ERROR(
+        "metastore.compactor.oldest.initiated.compaction.time.threshold.error",
+        "hive.compactor.oldest.initiated.compaction.time.threshold.error",
+        "12h", new TimeValidator(TimeUnit.HOURS),
+        "Age of oldest initiated compaction in the compaction queue when an error should be raised. " +
+            "Default time unit is: hours"),
+    COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING(
+        "metastore.compactor.completed.txn.components.record.threshold.warning",
+        "hive.compactor.completed.txn.components.record.threshold.warning",
+        500000,
+        "Number of records in COMPLETED_TXN_COMPONENTS table, after a warning should be raised."),
+    COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR(
+        "metastore.compactor.completed.txn.components.record.threshold.error",
+        "hive.compactor.completed.txn.components.record.threshold.error",
+        1000000,
+        "Number of records in COMPLETED_TXN_COMPONENTS table, after an error should be raised."),
+    COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING(
+        "metastore.compactor.txn.to.writeid.record.threshold.warning",
+        "hive.compactor.txn.to.writeid.record.threshold.warning",
+        500000,
+        "Number of records in TXN_TO_WRITEID table, after a warning should be raised."),
+    COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR(
+        "metastore.compactor.txn.to.writeid.record.threshold.error",
+        "hive.compactor.txn.to.writeid.record.threshold.error",
+        1000000,
+        "Number of records in TXN_TO_WRITEID table, after a warning should be error."),
+    COMPACTOR_NUMBER_OF_DISABLED_COMPACTION_TABLES_THRESHOLD(
+        "metastore.compactor.number.of.disabled.compaction.tables.threshold",
+        "hive.compactor.number.of.disabled.compaction.tables.threshold",
+        1,
+        "Number of tables where the compaction is disabled"),

Review comment:
       Something like this might be clearer: If the number of writes to tables 
where auto-compaction is disabled reaches this threshold, a warning will be 
logged after every subsequent write to any table where auto-compaction is 
disabled.
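
   Put into the config entry, that could look like (sketch of the suggested wording only):

       COMPACTOR_NUMBER_OF_DISABLED_COMPACTION_TABLES_THRESHOLD(
           "metastore.compactor.number.of.disabled.compaction.tables.threshold",
           "hive.compactor.number.of.disabled.compaction.tables.threshold",
           1,
           "If the number of writes to tables where auto-compaction is disabled reaches this threshold, a warning " +
               "will be logged after every subsequent write to any table where auto-compaction is disabled."),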




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Issue Time Tracking
-------------------

    Worklog Id:     (was: 624787)
    Time Spent: 20m  (was: 10m)

> Add logging based on new compaction metrics
> -------------------------------------------
>
>                 Key: HIVE-25345
>                 URL: https://issues.apache.org/jira/browse/HIVE-25345
>             Project: Hive
>          Issue Type: Improvement
>            Reporter: László Pintér
>            Assignee: László Pintér
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 20m
>  Remaining Estimate: 0h
>




--
This message was sent by Atlassian Jira
(v8.3.4#803005)
