GEODE-952: Sometimes many unit tests fail because the locator was forced to disconnect (ForcedDisconnect)
We're seeing a number of similar failures that all seem to be caused by JVMs pausing and being kicked out of the distributed system. This change-set enables creation of a heap dump if a member is forced out of the system and JVM pauses have been detected. This will give us artifacts that we can analyze to help determine what's going on. Project: http://git-wip-us.apache.org/repos/asf/incubator-geode/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-geode/commit/445efdb9 Tree: http://git-wip-us.apache.org/repos/asf/incubator-geode/tree/445efdb9 Diff: http://git-wip-us.apache.org/repos/asf/incubator-geode/diff/445efdb9 Branch: refs/heads/feature/GEODE-949-2 Commit: 445efdb9e80326d4de4cc6b71e32dd80e04543b8 Parents: 70ca921 Author: Bruce Schuchardt <[email protected]> Authored: Wed Mar 9 11:36:13 2016 -0800 Committer: Bruce Schuchardt <[email protected]> Committed: Wed Mar 9 14:31:30 2016 -0800 ---------------------------------------------------------------------- .../internal/InternalDistributedSystem.java | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-geode/blob/445efdb9/geode-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java ---------------------------------------------------------------------- diff --git a/geode-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java b/geode-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java index a193699..92cb9f8 100644 --- a/geode-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java +++ b/geode-core/src/main/java/com/gemstone/gemfire/distributed/internal/InternalDistributedSystem.java @@ -21,6 +21,7 @@ import java.io.File; import java.io.IOException; import java.io.Reader; import java.lang.reflect.Array; +import 
java.lang.reflect.Method; import java.net.InetAddress; import java.util.ArrayList; import java.util.Date; @@ -76,6 +77,7 @@ import com.gemstone.gemfire.internal.InternalDataSerializer; import com.gemstone.gemfire.internal.InternalInstantiator; import com.gemstone.gemfire.internal.LinuxProcFsStatistics; import com.gemstone.gemfire.internal.LocalStatisticsImpl; +import com.gemstone.gemfire.internal.OSProcess; import com.gemstone.gemfire.internal.OsStatisticsFactory; import com.gemstone.gemfire.internal.SocketCreator; import com.gemstone.gemfire.internal.StatisticsImpl; @@ -83,8 +85,8 @@ import com.gemstone.gemfire.internal.StatisticsManager; import com.gemstone.gemfire.internal.StatisticsTypeFactoryImpl; import com.gemstone.gemfire.internal.SystemTimer; import com.gemstone.gemfire.internal.admin.remote.DistributionLocatorId; -import com.gemstone.gemfire.internal.cache.CacheServerImpl; import com.gemstone.gemfire.internal.cache.CacheConfig; +import com.gemstone.gemfire.internal.cache.CacheServerImpl; import com.gemstone.gemfire.internal.cache.EventID; import com.gemstone.gemfire.internal.cache.GemFireCacheImpl; import com.gemstone.gemfire.internal.cache.execute.FunctionServiceStats; @@ -944,6 +946,22 @@ public class InternalDistributedSystem if (isForcedDisconnect) { this.forcedDisconnect = true; resetReconnectAttemptCounter(); + if (sampler.isSamplingEnabled()) { + if (sampler.getStatSamplerStats().getJvmPauses() > 0) { + try { + // if running tests then create a heap dump + Class.forName("com.gemstone.gemfire.test.dunit.standalone.DUnitLauncher"); + Class<?> jmapClass = Class.forName("sun.tools.jmap.JMap"); + logger.info("This member of the distributed system has been forced to disconnect. 
JVM pauses have been detected - dumping heap"); + String pid = String.valueOf(OSProcess.getId()); + String fileName = "java"+pid+".hprof"; + Object parameters = new String[]{"-dump:format=b,file="+fileName, pid}; + Method main = jmapClass.getDeclaredMethod("main", String[].class); + main.invoke(null, parameters); + } catch (Exception e) { + } + } + } reconnected = tryReconnect(true, reason, GemFireCacheImpl.getInstance()); }
