This is an automated email from the ASF dual-hosted git repository. agura pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/ignite.git
The following commit(s) were added to refs/heads/master by this push: new 594061d IGNITE-12523 Added throttling for thread dumps generation on system failure. 594061d is described below commit 594061d906749c38b718763c5753d99281b36728 Author: Andrey Gura <ag...@apache.org> AuthorDate: Thu Dec 19 20:20:12 2019 +0300 IGNITE-12523 Added throttling for thread dumps generation on system failure. --- .../org/apache/ignite/IgniteSystemProperties.java | 8 + .../processors/failure/FailureProcessor.java | 63 ++++++- .../FailureProcessorThreadDumpThrottlingTest.java | 203 +++++++++++++++++++++ .../ignite/testsuites/IgniteBasicTestSuite.java | 2 + 4 files changed, 273 insertions(+), 3 deletions(-) diff --git a/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java b/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java index 53b7a9a..3388071 100644 --- a/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java +++ b/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java @@ -987,6 +987,14 @@ public final class IgniteSystemProperties { public static final String IGNITE_DUMP_THREADS_ON_FAILURE = "IGNITE_DUMP_THREADS_ON_FAILURE"; /** + * Throttling time out for thread dump generation during failure handling. + * + * Default is failure detection timeout. {@code 0} or negative value - throttling is disabled. + */ + public static final String IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT = + "IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT"; + + /** * Throttling timeout in millis which avoid excessive PendingTree access on unwind if there is nothing to clean yet. * * Default is 500 ms. 
diff --git a/modules/core/src/main/java/org/apache/ignite/internal/processors/failure/FailureProcessor.java b/modules/core/src/main/java/org/apache/ignite/internal/processors/failure/FailureProcessor.java index 19495eb..7980a4f 100644 --- a/modules/core/src/main/java/org/apache/ignite/internal/processors/failure/FailureProcessor.java +++ b/modules/core/src/main/java/org/apache/ignite/internal/processors/failure/FailureProcessor.java @@ -17,6 +17,8 @@ package org.apache.ignite.internal.processors.failure; +import java.util.EnumMap; +import java.util.Map; import org.apache.ignite.Ignite; import org.apache.ignite.IgniteCheckedException; import org.apache.ignite.IgniteSystemProperties; @@ -24,6 +26,7 @@ import org.apache.ignite.configuration.IgniteConfiguration; import org.apache.ignite.failure.AbstractFailureHandler; import org.apache.ignite.failure.FailureContext; import org.apache.ignite.failure.FailureHandler; +import org.apache.ignite.failure.FailureType; import org.apache.ignite.failure.NoOpFailureHandler; import org.apache.ignite.failure.StopNodeOrHaltFailureHandler; import org.apache.ignite.internal.GridKernalContext; @@ -33,13 +36,19 @@ import org.apache.ignite.internal.processors.diagnostic.DiagnosticProcessor; import org.apache.ignite.internal.util.typedef.X; import org.apache.ignite.internal.util.typedef.internal.U; +import static org.apache.ignite.IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE; +import static org.apache.ignite.IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT; + /** * General failure processing API */ public class FailureProcessor extends GridProcessorAdapter { /** Value of the system property that enables threads dumping on failure. 
*/ private final boolean igniteDumpThreadsOnFailure = - IgniteSystemProperties.getBoolean(IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE, false); + IgniteSystemProperties.getBoolean(IGNITE_DUMP_THREADS_ON_FAILURE, false); + + /** Timeout for throttling of thread dumps generation. */ + long dumpThreadsTrottlingTimeout; /** Ignored failure log message. */ static final String IGNORED_FAILURE_LOG_MSG = "Possible failure suppressed accordingly to a configured handler "; @@ -48,6 +57,9 @@ public class FailureProcessor extends GridProcessorAdapter { static final String FAILURE_LOG_MSG = "Critical system error detected. " + "Will be handled accordingly to configured handler "; + /** Thread dump per failure type timestamps. */ + private Map<FailureType, Long> threadDumpPerFailureTypeTime; + /** Ignite. */ private final Ignite ignite; @@ -66,7 +78,22 @@ public class FailureProcessor extends GridProcessorAdapter { public FailureProcessor(GridKernalContext ctx) { super(ctx); - this.ignite = ctx.grid(); + ignite = ctx.grid(); + + if (igniteDumpThreadsOnFailure) { + dumpThreadsTrottlingTimeout = + IgniteSystemProperties.getLong( + IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT, + ctx.config().getFailureDetectionTimeout() + ); + + if (dumpThreadsTrottlingTimeout > 0) { + threadDumpPerFailureTypeTime = new EnumMap<>(FailureType.class); + + for (FailureType type : FailureType.values()) + threadDumpPerFailureTypeTime.put(type, 0L); + } + } } /** {@inheritDoc} */ @@ -152,7 +179,7 @@ public class FailureProcessor extends GridProcessorAdapter { " WAL path: " + ctx.config().getDataStorageConfiguration().getWalPath() + " WAL archive path: " + ctx.config().getDataStorageConfiguration().getWalArchivePath()); - if (igniteDumpThreadsOnFailure) + if (igniteDumpThreadsOnFailure && !throttleThreadDump(failureCtx.type())) U.dumpThreads(log, !failureTypeIgnored(failureCtx, hnd)); DiagnosticProcessor diagnosticProcessor = ctx.diagnostic(); @@ -172,6 +199,36 @@ public class 
FailureProcessor extends GridProcessorAdapter { } /** + * Defines whether thread dump should be throttled for given failure type or not. + * + * @param type Failure type. + * @return {@code True} if thread dump generation should be throttled for given failure type. + */ + private boolean throttleThreadDump(FailureType type) { + if (dumpThreadsTrottlingTimeout <= 0) + return false; + + long curr = U.currentTimeMillis(); + + Long last = threadDumpPerFailureTypeTime.get(type); + + assert last != null : "Unknown failure type " + type; + + boolean throttle = curr - last < dumpThreadsTrottlingTimeout; + + if (!throttle) + threadDumpPerFailureTypeTime.put(type, curr); + else { + if (log.isInfoEnabled()) { + log.info("Thread dump is hidden due to throttling settings. " + + "Set IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT property to 0 to see all thread dumps."); + } + } + + return throttle; + } + + /** * @param failureCtx Failure context. * @param hnd Handler. */ diff --git a/modules/core/src/test/java/org/apache/ignite/internal/processors/failure/FailureProcessorThreadDumpThrottlingTest.java b/modules/core/src/test/java/org/apache/ignite/internal/processors/failure/FailureProcessorThreadDumpThrottlingTest.java new file mode 100644 index 0000000..9f85ae6 --- /dev/null +++ b/modules/core/src/test/java/org/apache/ignite/internal/processors/failure/FailureProcessorThreadDumpThrottlingTest.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.ignite.internal.processors.failure; + +import com.google.common.collect.ImmutableSet; +import org.apache.ignite.IgniteSystemProperties; +import org.apache.ignite.configuration.IgniteConfiguration; +import org.apache.ignite.failure.FailureContext; +import org.apache.ignite.failure.FailureType; +import org.apache.ignite.failure.TestFailureHandler; +import org.apache.ignite.internal.IgniteEx; +import org.apache.ignite.internal.util.typedef.internal.U; +import org.apache.ignite.testframework.ListeningTestLogger; +import org.apache.ignite.testframework.LogListener; +import org.apache.ignite.testframework.junits.WithSystemProperty; +import org.apache.ignite.testframework.junits.common.GridCommonAbstractTest; +import org.junit.Test; + +import static org.apache.ignite.failure.FailureType.SYSTEM_CRITICAL_OPERATION_TIMEOUT; +import static org.apache.ignite.failure.FailureType.SYSTEM_WORKER_BLOCKED; +import static org.apache.ignite.internal.util.IgniteUtils.THREAD_DUMP_MSG; + +/** + * Tests for throttling thread dumps during handling failures. + */ +@WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE, value = "true") +public class FailureProcessorThreadDumpThrottlingTest extends GridCommonAbstractTest { + /** Test logger. 
*/ + private final ListeningTestLogger testLog = new ListeningTestLogger(true, log); + + /** {@inheritDoc} */ + @Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception { + IgniteConfiguration cfg = super.getConfiguration(igniteInstanceName); + + TestFailureHandler hnd = new TestFailureHandler(false); + + hnd.setIgnoredFailureTypes(ImmutableSet.of(FailureType.SYSTEM_CRITICAL_OPERATION_TIMEOUT, SYSTEM_WORKER_BLOCKED)); + + cfg.setFailureHandler(hnd); + + cfg.setGridLogger(testLog); + + return cfg; + } + + /** {@inheritDoc} */ + @Override protected void beforeTest() throws Exception { + super.beforeTest(); + + startGrid(0); + } + + /** {@inheritDoc} */ + @Override protected void afterTest() throws Exception { + testLog.clearListeners(); + + stopAllGrids(); + + super.afterTest(); + } + + /** + * Tests that thread dumps will not get if {@code IGNITE_DUMP_THREADS_ON_FAILURE == false}. + */ + @Test + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE, value = "false") + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT, value = "0") + public void testNoThreadDumps() throws Exception { + LogListener lsnr = LogListener.matches(THREAD_DUMP_MSG).times(0).build(); + + testLog.registerListener(lsnr); + + IgniteEx ignite = ignite(0); + + FailureContext failureCtx = + new FailureContext(SYSTEM_WORKER_BLOCKED, new Throwable("Failure context error")); + + for (int i = 0; i < 3; i++) + ignite.context().failure().process(failureCtx); + + assertTrue(lsnr.check()); + } + + /** + * Tests that thread dumps will get for every failure for disabled throttling. 
+ */ + @Test + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE, value = "true") + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT, value = "0") + public void testNoThrottling() throws Exception { + LogListener lsnr = LogListener.matches(THREAD_DUMP_MSG).times(3).build(); + + testLog.registerListener(lsnr); + + IgniteEx ignite = ignite(0); + + FailureContext failureCtx = + new FailureContext(SYSTEM_WORKER_BLOCKED, new Throwable("Failure context error")); + + for (int i = 0; i < 3; i++) + ignite.context().failure().process(failureCtx); + + assertTrue(lsnr.check()); + } + + /** + * Tests that thread dumps will be throttled and will be generated again after timeout exceeded. + */ + @Test + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE, value = "true") + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT, value = "1000") + public void testThrottling() throws Exception { + LogListener dumpLsnr = LogListener.matches(THREAD_DUMP_MSG).times(2).build(); + LogListener throttledLsnr = LogListener.matches("Thread dump is hidden").times(4).build(); + + testLog.registerListener(dumpLsnr); + testLog.registerListener(throttledLsnr); + + IgniteEx ignite = ignite(0); + + FailureContext failureCtx = + new FailureContext(SYSTEM_WORKER_BLOCKED, new Throwable("Failure context error")); + + for (int i = 0; i < 3; i++) + ignite.context().failure().process(failureCtx); + + U.sleep(1000); + + for (int i = 0; i < 3; i++) + ignite.context().failure().process(failureCtx); + + assertTrue(dumpLsnr.check()); + assertTrue(throttledLsnr.check()); + } + + /** + * Tests that thread dumps will be throttled per failure type and will be generated again after timeout exceeded. 
+ */ + @Test + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE, value = "true") + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE_THROTTLING_TIMEOUT, value = "1000") + public void testThrottlingPerFailureType() throws Exception { + LogListener dumpLsnr = LogListener.matches(THREAD_DUMP_MSG).times(4).build(); + LogListener throttledLsnr = LogListener.matches("Thread dump is hidden").times(8).build(); + + testLog.registerListener(dumpLsnr); + testLog.registerListener(throttledLsnr); + + IgniteEx ignite = ignite(0); + + FailureContext workerBlockedFailureCtx = + new FailureContext(SYSTEM_WORKER_BLOCKED, new Throwable("Failure context error")); + + FailureContext opTimeoutFailureCtx = + new FailureContext(SYSTEM_CRITICAL_OPERATION_TIMEOUT, new Throwable("Failure context error")); + + for (int i = 0; i < 3; i++) { + ignite.context().failure().process(workerBlockedFailureCtx); + + ignite.context().failure().process(opTimeoutFailureCtx); + } + + U.sleep(1000); + + for (int i = 0; i < 3; i++) { + ignite.context().failure().process(workerBlockedFailureCtx); + + ignite.context().failure().process(opTimeoutFailureCtx); + } + + assertTrue(dumpLsnr.check()); + assertTrue(throttledLsnr.check()); + } + + /** + * Tests that default thread dump throttling timeout equals failure detection timeout. 
+ */ + @Test + @WithSystemProperty(key = IgniteSystemProperties.IGNITE_DUMP_THREADS_ON_FAILURE, value = "true") + public void testDefaultThrottlingTimeout() throws Exception { + IgniteEx ignite = ignite(0); + + assertEquals( + ignite.context().failure().dumpThreadsTrottlingTimeout, + ignite.configuration().getFailureDetectionTimeout().longValue() + ); + } +} diff --git a/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteBasicTestSuite.java b/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteBasicTestSuite.java index 927dd71..5c6be99 100644 --- a/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteBasicTestSuite.java +++ b/modules/core/src/test/java/org/apache/ignite/testsuites/IgniteBasicTestSuite.java @@ -90,6 +90,7 @@ import org.apache.ignite.internal.processors.database.CacheFreeListSelfTest; import org.apache.ignite.internal.processors.database.DataRegionMetricsSelfTest; import org.apache.ignite.internal.processors.database.IndexStorageSelfTest; import org.apache.ignite.internal.processors.database.SwapPathConstructionSelfTest; +import org.apache.ignite.internal.processors.failure.FailureProcessorThreadDumpThrottlingTest; import org.apache.ignite.internal.processors.metastorage.DistributedMetaStorageTest; import org.apache.ignite.internal.processors.metastorage.persistence.DistributedMetaStorageHistoryCacheTest; import org.apache.ignite.internal.processors.metastorage.persistence.DmsDataWriterWorkerTest; @@ -240,6 +241,7 @@ import org.junit.runners.Suite; OomFailureHandlerTest.class, TransactionIntegrityWithSystemWorkerDeathTest.class, FailureProcessorLoggingTest.class, + FailureProcessorThreadDumpThrottlingTest.class, AtomicOperationsInTxTest.class,