Repository: ignite Updated Branches: refs/heads/ignite-3212 [created] a05b55a4e
ignite-3212 Dump debug info on exchange timeout Project: http://git-wip-us.apache.org/repos/asf/ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/a05b55a4 Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/a05b55a4 Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/a05b55a4 Branch: refs/heads/ignite-3212 Commit: a05b55a4e52ecbc24fefabaf710d854215ac7f7f Parents: 3c3ed05 Author: sboikov <[email protected]> Authored: Wed Jun 1 10:33:01 2016 +0300 Committer: sboikov <[email protected]> Committed: Wed Jun 1 10:40:43 2016 +0300 ---------------------------------------------------------------------- .../apache/ignite/IgniteSystemProperties.java | 3 ++ .../GridCachePartitionExchangeManager.java | 43 +++++++++++++++++--- .../GridDhtPartitionsExchangeFuture.java | 42 +++++++++++++++++-- 3 files changed, 79 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ignite/blob/a05b55a4/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java b/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java index 5f761a2..ab77378 100644 --- a/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java +++ b/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java @@ -377,6 +377,9 @@ public final class IgniteSystemProperties { /** Number of times pending cache objects will be dumped to the log in case of partition exchange timeout. */ public static final String IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD = "IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD"; + /** */ + public static final String IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT = "IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD"; + /** JDBC driver cursor remove delay. */ public static final String IGNITE_JDBC_DRIVER_CURSOR_REMOVE_DELAY = "IGNITE_JDBC_DRIVER_CURSOR_RMV_DELAY"; http://git-wip-us.apache.org/repos/asf/ignite/blob/a05b55a4/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java index 78a8d6c..0c80423 100644 --- a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java +++ b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java @@ -33,12 +33,14 @@ import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.ignite.IgniteCheckedException; +import org.apache.ignite.IgniteSystemProperties; import org.apache.ignite.cluster.ClusterNode; import org.apache.ignite.events.DiscoveryEvent; import org.apache.ignite.events.Event; @@ -90,6 +92,7 @@ import org.jsr166.ConcurrentLinkedDeque8; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static org.apache.ignite.IgniteSystemProperties.IGNITE_PRELOAD_RESEND_TIMEOUT; +import static org.apache.ignite.IgniteSystemProperties.IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT; import static org.apache.ignite.IgniteSystemProperties.getLong; import static org.apache.ignite.events.EventType.EVT_CACHE_REBALANCE_STARTED; import static org.apache.ignite.events.EventType.EVT_NODE_FAILED; @@ -1106,16 +1109,17 @@ public class GridCachePartitionExchangeManager<K, V> extends GridCacheSharedMana } /** - * + * @throws Exception If failed. */ - public void dumpDebugInfo() { + public void dumpDebugInfo() throws Exception { dumpDebugInfo(null); } /** - * + * @param exchTopVer Optional current exchange topology version. + * @throws Exception If failed. */ - public void dumpDebugInfo(@Nullable AffinityTopologyVersion exchTopVer) { + public void dumpDebugInfo(@Nullable AffinityTopologyVersion exchTopVer) throws Exception { U.warn(log, "Ready affinity version: " + readyTopVer.get()); U.warn(log, "Last exchange future: " + lastInitializedFut); @@ -1332,7 +1336,36 @@ public class GridCachePartitionExchangeManager<K, V> extends GridCacheSharedMana exchFut.init(); - exchFut.get(); + int dumpedObjects = 0; + + while (true) { + try { + exchFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); + + break; + } + catch (IgniteFutureTimeoutCheckedException ignored) { + U.warn(log, "Failed to wait for partition map exchange [" + + "topVer=" + exchFut.topologyVersion() + + ", node=" + cctx.localNodeId() + "]. " + + "Dumping pending objects that might be the cause: "); + + if (dumpedObjects < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { + try { + dumpDebugInfo(exchFut.topologyVersion()); + } + catch (Exception e) { + U.error(log, "Failed to dump debug information: " + e, e); + } + + if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, true)) + U.dumpThreads(log); + + dumpedObjects++; + } + } + } + if (log.isDebugEnabled()) log.debug("After waiting for exchange future [exchFut=" + exchFut + ", worker=" + http://git-wip-us.apache.org/repos/asf/ignite/blob/a05b55a4/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java index b497f58..5e81c22 100644 --- a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java +++ b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java @@ -73,6 +73,7 @@ import org.apache.ignite.lang.IgniteRunnable; import org.jetbrains.annotations.Nullable; import org.jsr166.ConcurrentHashMap8; +import static org.apache.ignite.IgniteSystemProperties.IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT; import static org.apache.ignite.events.EventType.EVT_NODE_FAILED; import static org.apache.ignite.events.EventType.EVT_NODE_JOINED; import static org.apache.ignite.events.EventType.EVT_NODE_LEFT; @@ -85,7 +86,7 @@ import static org.apache.ignite.internal.managers.communication.GridIoPolicy.SYS public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityTopologyVersion> implements Comparable<GridDhtPartitionsExchangeFuture>, GridDhtTopologyFuture { /** */ - private static final int DUMP_PENDING_OBJECTS_THRESHOLD = + public static final int DUMP_PENDING_OBJECTS_THRESHOLD = IgniteSystemProperties.getInteger(IgniteSystemProperties.IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD, 10); /** */ @@ -891,7 +892,15 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT U.warn(log, "Failed to wait for partition release future [topVer=" + topologyVersion() + ", node=" + cctx.localNodeId() + "]. Dumping pending objects that might be the cause: "); - cctx.exchange().dumpDebugInfo(topologyVersion()); + try { + cctx.exchange().dumpDebugInfo(topologyVersion()); + } + catch (Exception e) { + U.error(log, "Failed to dump debug information: " + e, e); + } + + if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, true)) + U.dumpThreads(log); } /** @@ -1541,6 +1550,7 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT try { boolean crdChanged = false; boolean allReceived = false; + Set<UUID> reqFrom = null; ClusterNode crd0; @@ -1556,8 +1566,13 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT crd = srvNodes.size() > 0 ? srvNodes.get(0) : null; } - if (crd != null && crd.isLocal() && rmvd) - allReceived = remaining.isEmpty(); + if (crd != null && crd.isLocal()) { + if (rmvd) + allReceived = remaining.isEmpty(); + + if (crdChanged && !remaining.isEmpty()) + reqFrom = new HashSet<>(remaining); + } crd0 = crd; } @@ -1588,6 +1603,25 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT return; } + if (crdChanged && reqFrom != null) { + GridDhtPartitionsSingleRequest req = new GridDhtPartitionsSingleRequest(exchId); + + for (UUID nodeId : reqFrom) { + try { + // It is possible that some nodes finished exchange with previous coordinator. + cctx.io().send(nodeId, req, SYSTEM_POOL); + } + catch (ClusterTopologyCheckedException e) { + if (log.isDebugEnabled()) + log.debug("Node left during partition exchange [nodeId=" + nodeId + + ", exchId=" + exchId + ']'); + } + catch (IgniteCheckedException e) { + U.error(log, "Failed to request partitions from node: " + nodeId, e); + } + } + } + for (Map.Entry<ClusterNode, GridDhtPartitionsSingleMessage> m : singleMsgs.entrySet()) processMessage(m.getKey(), m.getValue()); }
