This is an automated email from the ASF dual-hosted git repository.
alberto pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/develop by this push:
new acdc18e31e GEODE-10344: Send alert when thread stuck for long (#7747)
acdc18e31e is described below
commit acdc18e31e5ea6e7e34d0d3475ea17e8efc5b65d
Author: Alberto Gomez <[email protected]>
AuthorDate: Fri Jul 15 08:29:01 2022 +0200
GEODE-10344: Send alert when thread stuck for long (#7747)
This is the implementation of the feature
described in RFC:
https://cwiki.apache.org/confluence/display/GEODE/Management+of+threads+stuck+for+a+long+time+in+Geode
---
.../ClusterDistributionManagerDUnitTest.java | 35 ++++++++++++++++++++++
.../monitoring/executor/AbstractExecutor.java | 21 +++++++++++++
.../diagnosing_system_probs.html.md.erb | 12 ++++++++
3 files changed, 68 insertions(+)
diff --git
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
index 1480628b00..60a33467bf 100644
---
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
+++
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
@@ -103,6 +103,8 @@ public class ClusterDistributionManagerDUnitTest extends
CacheTestCase {
private VM vm1;
private VM vm2;
+ private static volatile Future<?> sleepingFuture; // assigned inside vm1 by testThreadStuckAlert so the sleeping task can be cancelled later
+
@Rule
public DistributedRestoreSystemProperties restoreSystemProperties =
new DistributedRestoreSystemProperties();
@@ -471,6 +473,39 @@ public class ClusterDistributionManagerDUnitTest extends
CacheTestCase {
assertThat(member).isNotNull();
}
+ /**
+  * Tests that a thread stuck alert is generated if a thread is stuck for longer than the
+  * configured value.
+  *
+  * <p>
+  * vm1 sets the maximum stuck time to one minute and parks a thread of the distribution
+  * manager's thread pool for two minutes. The test then waits until {@code alertReceived}
+  * becomes true. The sleeping task is always cancelled afterwards, even when the wait fails.
+  */
+ @Test
+ public void testThreadStuckAlert() throws Exception {
+   Properties config = getDistributedSystemProperties();
+   config.setProperty(MCAST_PORT, "0");
+   getSystem(config);
+   createAlertListener();
+
+   try {
+     vm1.invoke("Connect to distributed system", () -> {
+       addIgnoredException("has been stuck for");
+       // Set the limit before connecting: AbstractExecutor reads the property when an
+       // executor instance is constructed.
+       System.setProperty(GEMFIRE_PREFIX + "max-thread-stuck-time-minutes", "1");
+       config.setProperty(NAME, "sleeper");
+       getSystem(config);
+
+       ExecutorService executor =
+           (getCache(config)).getDistributionManager().getExecutors().getThreadPool();
+
+       sleepingFuture = executor.submit(() -> {
+         try {
+           TimeUnit.MINUTES.sleep(2);
+         } catch (InterruptedException e) {
+           // Expected when the test cancels the task; restore the interrupt status
+           // instead of silently dropping it.
+           Thread.currentThread().interrupt();
+         }
+       });
+     });
+
+     await().untilAsserted(() -> assertThat(alertReceived).isTrue());
+   } finally {
+     // Release the parked pool thread even if the alert never arrived.
+     vm1.invoke("Cancel sleeping task", () -> {
+       if (sleepingFuture != null) {
+         sleepingFuture.cancel(true);
+         sleepingFuture = null;
+       }
+     });
+   }
+ }
+
private CacheListener<String, String> getSleepingListener(final boolean
playDead) {
regionDestroyedInvoked = false;
diff --git
a/geode-core/src/main/java/org/apache/geode/internal/monitoring/executor/AbstractExecutor.java
b/geode-core/src/main/java/org/apache/geode/internal/monitoring/executor/AbstractExecutor.java
index 6e3a36b862..cadd2f735f 100644
---
a/geode-core/src/main/java/org/apache/geode/internal/monitoring/executor/AbstractExecutor.java
+++
b/geode-core/src/main/java/org/apache/geode/internal/monitoring/executor/AbstractExecutor.java
@@ -26,6 +26,7 @@ import java.util.Map;
import org.apache.logging.log4j.Logger;
import org.apache.geode.annotations.Immutable;
+import org.apache.geode.internal.lang.SystemProperty;
import org.apache.geode.logging.internal.log4j.api.LogService;
public abstract class AbstractExecutor {
@@ -37,6 +38,10 @@ public abstract class AbstractExecutor {
private final String groupName;
private short numIterationsStuck;
private volatile long startTime;
+ private final int maxThreadStuckTime =
SystemProperty.getProductIntegerProperty(
+ "max-thread-stuck-time-minutes").orElse(0) * 1000 * 60;
+
+ private volatile boolean stuckForGood = false;
public AbstractExecutor(String groupName) {
this(groupName, Thread.currentThread().getId());
@@ -51,9 +56,25 @@ public abstract class AbstractExecutor {
public void handleExpiry(long stuckTime, Map<Long, ThreadInfo>
threadInfoMap) {
incNumIterationsStuck();
+ sendAlertForThreadStuckForLong(stuckTime, threadInfoMap); // logs the report at fatal level when stuckTime exceeds the configured maximum
logger.warn(createThreadReport(stuckTime, threadInfoMap));
}
+ private void sendAlertForThreadStuckForLong(long stuckTime, Map<Long,
ThreadInfo> threadInfoMap) {
+ if (maxThreadStuckTime <= 0) {
+ return;
+ }
+ if (threadInfoMap.get(threadID) == null) {
+ return;
+ }
+ if (stuckForGood) {
+ return;
+ }
+ if (stuckTime > maxThreadStuckTime) {
+ stuckForGood = true;
+ logger.fatal(createThreadReport(stuckTime, threadInfoMap));
+ }
+ }
String createThreadReport(long stuckTime, Map<Long, ThreadInfo>
threadInfoMap) {
diff --git
a/geode-docs/managing/troubleshooting/diagnosing_system_probs.html.md.erb
b/geode-docs/managing/troubleshooting/diagnosing_system_probs.html.md.erb
index 35799fdc48..4113aede21 100644
--- a/geode-docs/managing/troubleshooting/diagnosing_system_probs.html.md.erb
+++ b/geode-docs/managing/troubleshooting/diagnosing_system_probs.html.md.erb
@@ -37,6 +37,7 @@ This section provides possible causes and suggested responses
for system problem
-
[PartitionedRegionStorageException](diagnosing_system_probs.html#diagnosing_system_probs__section_7DE15A6C99974821B6CA418BC2AF98F1)
- [Application crashes without producing an
exception](diagnosing_system_probs.html#diagnosing_system_probs__section_AFA1D06BC3AA44A4AB0593FD1EF0B0B7)
- [Timeout
alert](diagnosing_system_probs.html#diagnosing_system_probs__section_06C68EA0DACC46C58AA88E98C19AD2D8)
+- [Thread stuck
alert](diagnosing_system_probs.html#diagnosing_system_probs__section_06C68EA0DACC46C58AA88E98C19AD2D81)
- [Member produces
SocketTimeoutException](diagnosing_system_probs.html#diagnosing_system_probs__section_66D11C8E84F941B58800EDB52194B087)
- [Member logs ForcedDisconnectException, Cache and DistributedSystem
forcibly
closed](diagnosing_system_probs.html#diagnosing_system_probs__section_8C7CB2EA0A274DAF90083FECE0BF3B1F)
- [Members cannot see each
other](diagnosing_system_probs.html#diagnosing_system_probs__section_778D150443044847B1C73B9E02BE247B)
@@ -310,6 +311,17 @@ Response:
- If you’re seeing a lot of timeouts and you haven’t seen them before, check
whether your network is flooded.
- If you see these alerts constantly during normal operation, consider
raising the ack-wait-threshold above the default 15 seconds.
+
+## <a id="diagnosing_system_probs__section_06C68EA0DACC46C58AA88E98C19AD2D81"
class="no-quick-link"></a>Thread stuck alert
+
+If a thread in a member has been stuck for longer than the configured time
(`max-thread-stuck-time-minutes` system property), the member sends an alert to signal that
something might be wrong with that member or with some other member. The alert
is logged in the member’s log as fatal.
+
+A thread stuck alert warns about a thread in a member that is stuck and will
probably never make progress. A possible cause is a bug in the code.
+
+Response:
+
+- If you see these alerts, consider bouncing the member at a convenient time
to release the stuck thread.
+
## <a id="diagnosing_system_probs__section_66D11C8E84F941B58800EDB52194B087"
class="no-quick-link"></a>Member produces SocketTimeoutException
A client and server produces a SocketTimeoutException when it stops waiting
for a response from the other side of the connection and closes the socket.
This exception typically happens on the handshake or when establishing a
callback connection.