This is an automated email from the ASF dual-hosted git repository.

alberto pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/develop by this push:
     new acdc18e31e GEODE-10344: Send alert when thread stuck for long (#7747)
acdc18e31e is described below

commit acdc18e31e5ea6e7e34d0d3475ea17e8efc5b65d
Author: Alberto Gomez <[email protected]>
AuthorDate: Fri Jul 15 08:29:01 2022 +0200

    GEODE-10344: Send alert when thread stuck for long (#7747)
    
    This is the implementation of the feature
    described in RFC:
    
https://cwiki.apache.org/confluence/display/GEODE/Management+of+threads+stuck+for+a+long+time+in+Geode
---
 .../ClusterDistributionManagerDUnitTest.java       | 35 ++++++++++++++++++++++
 .../monitoring/executor/AbstractExecutor.java      | 21 +++++++++++++
 .../diagnosing_system_probs.html.md.erb            | 12 ++++++++
 3 files changed, 68 insertions(+)

diff --git 
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
 
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
index 1480628b00..60a33467bf 100644
--- 
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
+++ 
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
@@ -103,6 +103,8 @@ public class ClusterDistributionManagerDUnitTest extends 
CacheTestCase {
   private VM vm1;
   private VM vm2;
 
+  private static volatile Future<?> sleepingFuture;
+
   @Rule
   public DistributedRestoreSystemProperties restoreSystemProperties =
       new DistributedRestoreSystemProperties();
@@ -471,6 +473,39 @@ public class ClusterDistributionManagerDUnitTest extends 
CacheTestCase {
     assertThat(member).isNotNull();
   }
 
+  /**
+   * Tests that a thread stuck alert is generated if a thread is stuck for 
longer
+   * than the configured value.
+   */
+  @Test
+  public void testThreadStuckAlert() throws Exception {
+    Properties config = getDistributedSystemProperties();
+    config.setProperty(MCAST_PORT, "0");
+    getSystem(config);
+    createAlertListener();
+
+    vm1.invoke("Connect to distributed system", () -> {
+      addIgnoredException("has been stuck for");
+      config.setProperty(MCAST_PORT, "0");
+      System.setProperty(GEMFIRE_PREFIX + "max-thread-stuck-time-minutes", 
"1");
+      config.setProperty(NAME, "sleeper");
+      getSystem(config);
+
+      ExecutorService executor =
+          
(getCache(config)).getDistributionManager().getExecutors().getThreadPool();
+
+      sleepingFuture = executor.submit(() -> {
+        try {
+          TimeUnit.MINUTES.sleep(2);
+        } catch (InterruptedException e) {
+        }
+      });
+    });
+
+    await().untilAsserted(() -> assertThat(alertReceived).isTrue());
+    vm1.invoke(() -> sleepingFuture.cancel(true));
+  }
+
   private CacheListener<String, String> getSleepingListener(final boolean 
playDead) {
     regionDestroyedInvoked = false;
 
diff --git 
a/geode-core/src/main/java/org/apache/geode/internal/monitoring/executor/AbstractExecutor.java
 
b/geode-core/src/main/java/org/apache/geode/internal/monitoring/executor/AbstractExecutor.java
index 6e3a36b862..cadd2f735f 100644
--- 
a/geode-core/src/main/java/org/apache/geode/internal/monitoring/executor/AbstractExecutor.java
+++ 
b/geode-core/src/main/java/org/apache/geode/internal/monitoring/executor/AbstractExecutor.java
@@ -26,6 +26,7 @@ import java.util.Map;
 import org.apache.logging.log4j.Logger;
 
 import org.apache.geode.annotations.Immutable;
+import org.apache.geode.internal.lang.SystemProperty;
 import org.apache.geode.logging.internal.log4j.api.LogService;
 
 public abstract class AbstractExecutor {
@@ -37,6 +38,10 @@ public abstract class AbstractExecutor {
   private final String groupName;
   private short numIterationsStuck;
   private volatile long startTime;
+  private final int maxThreadStuckTime = 
SystemProperty.getProductIntegerProperty(
+      "max-thread-stuck-time-minutes").orElse(0) * 1000 * 60;
+
+  private volatile boolean stuckForGood = false;
 
   public AbstractExecutor(String groupName) {
     this(groupName, Thread.currentThread().getId());
@@ -51,9 +56,25 @@ public abstract class AbstractExecutor {
 
   public void handleExpiry(long stuckTime, Map<Long, ThreadInfo> 
threadInfoMap) {
     incNumIterationsStuck();
+    sendAlertForThreadStuckForLong(stuckTime, threadInfoMap);
     logger.warn(createThreadReport(stuckTime, threadInfoMap));
   }
 
+  private void sendAlertForThreadStuckForLong(long stuckTime, Map<Long, 
ThreadInfo> threadInfoMap) {
+    if (maxThreadStuckTime <= 0) {
+      return;
+    }
+    if (threadInfoMap.get(threadID) == null) {
+      return;
+    }
+    if (stuckForGood) {
+      return;
+    }
+    if (stuckTime > maxThreadStuckTime) {
+      stuckForGood = true;
+      logger.fatal(createThreadReport(stuckTime, threadInfoMap));
+    }
+  }
 
   String createThreadReport(long stuckTime, Map<Long, ThreadInfo> 
threadInfoMap) {
 
diff --git 
a/geode-docs/managing/troubleshooting/diagnosing_system_probs.html.md.erb 
b/geode-docs/managing/troubleshooting/diagnosing_system_probs.html.md.erb
index 35799fdc48..4113aede21 100644
--- a/geode-docs/managing/troubleshooting/diagnosing_system_probs.html.md.erb
+++ b/geode-docs/managing/troubleshooting/diagnosing_system_probs.html.md.erb
@@ -37,6 +37,7 @@ This section provides possible causes and suggested responses 
for system problem
 -   
[PartitionedRegionStorageException](diagnosing_system_probs.html#diagnosing_system_probs__section_7DE15A6C99974821B6CA418BC2AF98F1)
 -   [Application crashes without producing an 
exception](diagnosing_system_probs.html#diagnosing_system_probs__section_AFA1D06BC3AA44A4AB0593FD1EF0B0B7)
 -   [Timeout 
alert](diagnosing_system_probs.html#diagnosing_system_probs__section_06C68EA0DACC46C58AA88E98C19AD2D8)
+-   [Thread stuck 
alert](diagnosing_system_probs.html#diagnosing_system_probs__section_06C68EA0DACC46C58AA88E98C19AD2D81)
 -   [Member produces 
SocketTimeoutException](diagnosing_system_probs.html#diagnosing_system_probs__section_66D11C8E84F941B58800EDB52194B087)
 -   [Member logs ForcedDisconnectException, Cache and DistributedSystem 
forcibly 
closed](diagnosing_system_probs.html#diagnosing_system_probs__section_8C7CB2EA0A274DAF90083FECE0BF3B1F)
 -   [Members cannot see each 
other](diagnosing_system_probs.html#diagnosing_system_probs__section_778D150443044847B1C73B9E02BE247B)
@@ -310,6 +311,17 @@ Response:
 -   If you’re seeing a lot of timeouts and you haven’t seen them before, check 
whether your network is flooded.
 -   If you see these alerts constantly during normal operation, consider 
raising the ack-wait-threshold above the default 15 seconds.
 
+
+## <a id="diagnosing_system_probs__section_06C68EA0DACC46C58AA88E98C19AD2D81" 
class="no-quick-link"></a>Thread stuck alert
+
+If a thread in a member has been stuck for longer than the configured time 
(max-thread-stuck-time-minutes system property), the member sends an alert to 
signal that something might be wrong with the member or with some other member. 
The alert is logged in the member’s log as fatal.
+
+A thread stuck alert warns that a thread in a member is stuck and will 
probably never make progress. A possible cause is a bug in the code.
+
+Response:
+
+-   If you see these alerts, consider bouncing the member at a convenient time 
to release the stuck thread.
+
 ## <a id="diagnosing_system_probs__section_66D11C8E84F941B58800EDB52194B087" 
class="no-quick-link"></a>Member produces SocketTimeoutException
 
 A client and server produces a SocketTimeoutException when it stops waiting 
for a response from the other side of the connection and closes the socket. 
This exception typically happens on the handshake or when establishing a 
callback connection.

Reply via email to