[ 
https://issues.apache.org/jira/browse/CASSANDRA-9625?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Ruoran Wang updated CASSANDRA-9625:
-----------------------------------
    Attachment: Screen Shot 2016-04-13 at 10.40.58 AM.png

I tried the following dumb fix, and I applied a similar change to 
ColumnFamilyMetrics where 
cfs.getCompactionStrategy().getEstimatedRemainingTasks(); is called. 
I hard-coded it to return 21 when getEstimatedRemainingTasks is taking too long. 
The graph shows that when it's busy, pendingCompaction shows 21, but now the 
graphite-reporter will continue to collect other metrics instead of being blocked.

{noformat}
diff --git a/src/java/org/apache/cassandra/metrics/CompactionMetrics.java 
b/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
index f7a99e1..e2ac22b 100644
--- a/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
@@ -18,8 +18,13 @@
 package org.apache.cassandra.metrics;
 
 import java.util.*;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 import com.yammer.metrics.Metrics;
 import com.yammer.metrics.core.Counter;
@@ -31,12 +36,17 @@ import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Metrics for compaction.
  */
 public class CompactionMetrics implements 
CompactionManager.CompactionExecutorStatsCollector
 {
+
+    private static final Logger logger = 
LoggerFactory.getLogger(CompactionMetrics.class);
+
     public static final MetricNameFactory factory = new 
DefaultNameFactory("Compaction");
 
     // a synchronized identity set of running tasks to their compaction info
@@ -57,15 +67,36 @@ public class CompactionMetrics implements 
CompactionManager.CompactionExecutorSt
         {
             public Integer value()
             {
-                int n = 0;
-                // add estimate number of compactions need to be done
-                for (String keyspaceName : Schema.instance.getKeyspaces())
-                {
-                    for (ColumnFamilyStore cfs : 
Keyspace.open(keyspaceName).getColumnFamilyStores())
-                        n += 
cfs.getCompactionStrategy().getEstimatedRemainingTasks();
+                // The collector thread is likely to be blocked by compactions
+                // This is a quick fix to avoid losing metrics
+                ExecutorService executor = Executors.newSingleThreadExecutor();
+
+                final Future<Integer> future = executor.submit(new Callable() {
+                    @Override
+                    public Integer call() throws Exception {
+                        int n = 0;
+                        // add estimate number of compactions need to be done
+                        for (String keyspaceName : 
Schema.instance.getKeyspaces())
+                        {
+                            for (ColumnFamilyStore cfs : 
Keyspace.open(keyspaceName).getColumnFamilyStores())
+                                n += 
cfs.getCompactionStrategy().getEstimatedRemainingTasks();
+                        }
+                        // add number of currently running compactions
+                        return n + compactions.size();
+                    }
+                });
+
+                try {
+                    return future.get(20, TimeUnit.SECONDS);
+                } catch (TimeoutException e) {
+                    future.cancel(true);
+                    logger.error("Skipping PendingTasks because some cfs is 
busy");
+                } catch (Exception othere) {
+                    logger.error("Skipping PendingTasks because an unexpected 
exception", othere);
                 }
-                // add number of currently running compactions
-                return n + compactions.size();
+
+                executor.shutdownNow();
+                return 21;
             }
         });
         completedTasks = 
Metrics.newGauge(factory.createMetricName("CompletedTasks"), new Gauge<Long>()
{noformat}

> GraphiteReporter not reporting
> ------------------------------
>
>                 Key: CASSANDRA-9625
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-9625
>             Project: Cassandra
>          Issue Type: Bug
>         Environment: Debian Jessie, 7u79-2.5.5-1~deb8u1, Cassandra 2.1.3
>            Reporter: Eric Evans
>            Assignee: T Jake Luciani
>         Attachments: Screen Shot 2016-04-13 at 10.40.58 AM.png, metrics.yaml, 
> thread-dump.log
>
>
> When upgrading from 2.1.3 to 2.1.6, the Graphite metrics reporter stops 
> working.  The usual startup is logged, and one batch of samples is sent, but 
> the reporting interval comes and goes, and no other samples are ever sent.  
> The logs are free from errors.
> Frustratingly, metrics reporting works in our smaller (staging) environment 
> on 2.1.6; we are able to reproduce this on all 6 of our production nodes, but not 
> on a 3 node (otherwise identical) staging cluster (maybe it takes a certain 
> level of concurrency?).
> Attached is a thread dump, and our metrics.yaml.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to