This is an automated email from the ASF dual-hosted git repository. bschuchardt pushed a commit to branch develop in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/develop by this push: new 08316aa GEODE-8385: hang recovering from disk with cyclic dependencies (#5403) 08316aa is described below commit 08316aa05198704d96aefc5497e483052c27a378 Author: Bruce Schuchardt <bschucha...@pivotal.io> AuthorDate: Wed Jul 29 07:58:39 2020 -0700 GEODE-8385: hang recovering from disk with cyclic dependencies (#5403) * GEODE-8385: hang recovering from disk with cyclic dependencies This restores the point at which we notify membership listeners of departures. We used to do this (in 1.12 and earlier) when a ShutdownMessage is received instead of waiting for a new membership view announcing the departure. Membership views can take some time to form and install, which can cause persistent (disk store) views to be updated later than they used to be. In the case of this ticket the disk store of one member was being closed while another was shutting down. The member closing its disk store did not see the view announcing that shutdown until most of its disk store regions had closed their persistence advisors. This left the disk store thinking that the other member was still up at the time it was closed. --- .../ClusterDistributionManagerDUnitTest.java | 28 ++++++++ .../internal/ClusterDistributionManager.java | 76 ++++++++++------------ .../distributed/internal/StartupOperation.java | 1 - .../internal/membership/gms/GMSMemberData.java | 4 +- .../internal/membership/gms/GMSMembership.java | 6 +- 5 files changed, 69 insertions(+), 46 deletions(-) diff --git a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java index 9d3bbd9..e28f15ab 100644 --- a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java +++ b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java @@ -211,6 +211,34 @@ public class ClusterDistributionManagerDUnitTest extends CacheTestCase { .until(() -> !membershipManager.isSurpriseMember(member)); } + @Test + public void shutdownMessageCausesListenerInvocation() { + final AtomicBoolean listenerInvoked = new AtomicBoolean(); + vm1.invoke("join the cluster", () -> getSystem().getDistributedMember()); // lead member + system = getSystem(); // non-lead member + // this membership listener will be invoked when the shutdown message is received + system.getDistributionManager().addMembershipListener(new MembershipListener() { + @Override + public void memberDeparted(DistributionManager distributionManager, + InternalDistributedMember id, boolean crashed) { + assertThat(crashed).isFalse(); + listenerInvoked.set(Boolean.TRUE); + } + }); + final InternalDistributedMember memberID = system.getDistributedMember(); + locatorvm.invoke("send a shutdown message", () -> { + final DistributionManager distributionManager = + ((InternalDistributedSystem) Locator.getLocator().getDistributedSystem()) + .getDistributionManager(); + final ShutdownMessage shutdownMessage = new ShutdownMessage(); + shutdownMessage.setRecipient(memberID); + shutdownMessage.setDistributionManagerId(distributionManager.getDistributionManagerId()); + distributionManager.putOutgoing(shutdownMessage); + }); + await().until(() -> listenerInvoked.get()); + } + + /** * Tests that a severe-level alert is generated if a member does not respond with an ack quickly * enough. vm0 and vm1 create a region and set ack-severe-alert-threshold. vm1 has a cache diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java index 9a52f60..1f58344 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java @@ -1782,24 +1782,6 @@ public class ClusterDistributionManager implements DistributionManager { } /** - * Returns true if id was removed. Returns false if it was not in the list of managers. - */ - private boolean removeManager(InternalDistributedMember theId, boolean crashed, String p_reason) { - String reason = p_reason; - - reason = prettifyReason(reason); - if (logger.isDebugEnabled()) { - logger.debug("DistributionManager: removing member <{}>; crashed {}; reason = {}", theId, - crashed, reason); - } - removeHostedLocators(theId); - - redundancyZones.remove(theId); - - return true; - } - - /** * Makes note of a new distribution manager that has started up in the distributed cache. Invokes * the appropriately listeners. * @@ -1866,41 +1848,51 @@ public class ClusterDistributionManager implements DistributionManager { void shutdownMessageReceived(InternalDistributedMember theId, String reason) { removeHostedLocators(theId); distribution.shutdownMessageReceived(theId, reason); + handleManagerDeparture(theId, false, reason); } + /* + * handleManagerDeparted may be invoked multiple times for a member identifier. + * We allow this and inform listeners on each invocation, but only perform some + * actions (such as decrementing the node count) if the change came from a + * membership view. + */ @Override - public void handleManagerDeparture(InternalDistributedMember theId, boolean p_crashed, - String p_reason) { - + public void handleManagerDeparture(InternalDistributedMember theId, boolean memberCrashed, + String reason) { alertingService.removeAlertListener(theId); + removeUnfinishedStartup(theId, true); + int vmType = theId.getVmKind(); if (vmType == ADMIN_ONLY_DM_TYPE) { - removeUnfinishedStartup(theId, true); - handleConsoleShutdown(theId, p_crashed, p_reason); + handleConsoleShutdown(theId, memberCrashed, reason); return; } - removeUnfinishedStartup(theId, true); - - if (removeManager(theId, p_crashed, p_reason)) { - if (theId.getVmKind() != ClusterDistributionManager.LOCATOR_DM_TYPE) { - stats.incNodes(-1); - } - String msg; - if (p_crashed && !shouldInhibitMembershipWarnings()) { - msg = - "Member at {} unexpectedly left the distributed cache: {}"; - addMemberEvent(new MemberCrashedEvent(theId, p_reason)); - } else { - msg = - "Member at {} gracefully left the distributed cache: {}"; - addMemberEvent(new MemberDepartedEvent(theId, p_reason)); - } - logger.info(msg, new Object[] {theId, prettifyReason(p_reason)}); + if (logger.isDebugEnabled()) { + logger.debug( + "DistributionManager: removing member <{}>; crashed {}; reason = {}", theId, + memberCrashed, prettifyReason(reason)); + } + removeHostedLocators(theId); + redundancyZones.remove(theId); - executors.handleManagerDeparture(theId); + if (theId.getVmKind() != ClusterDistributionManager.LOCATOR_DM_TYPE) { + stats.incNodes(-1); + } + String msg; + if (memberCrashed && !shouldInhibitMembershipWarnings()) { + msg = + "Member at {} unexpectedly left the distributed cache: {}"; + addMemberEvent(new MemberCrashedEvent(theId, reason)); + } else { + msg = + "Member at {} gracefully left the distributed cache: {}"; + addMemberEvent(new MemberDepartedEvent(theId, reason)); } + logger.info(msg, new Object[] {theId, prettifyReason(reason)}); + executors.handleManagerDeparture(theId); } private void handleManagerSuspect(InternalDistributedMember suspect, @@ -2368,7 +2360,7 @@ public class ClusterDistributionManager implements DistributionManager { } dm.handleManagerDeparture(theId, crashed, reason); } catch (DistributedSystemDisconnectedException se) { - // let's not get huffy about it + // ignored } } diff --git a/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java b/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java index 5eef068..fb28123 100644 --- a/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java +++ b/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java @@ -75,7 +75,6 @@ public class StartupOperation { if (this.newlyDeparted != null && !this.newlyDeparted.isEmpty()) { // tell the reply processor not to wait for the recipients that didn't // get the message - // Vector viewMembers = dm.getViewMembers(); for (Iterator it = this.newlyDeparted.iterator(); it.hasNext();) { InternalDistributedMember id = (InternalDistributedMember) it.next(); this.dm.handleManagerDeparture(id, false, diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java index b0b759e..2ab55c4 100644 --- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java +++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java @@ -401,9 +401,9 @@ public class GMSMemberData implements MemberData, Comparable<GMSMemberData> { sb.append("MemberData["); if (name != null && name.length() > 0) { - sb.append("name=").append(name); + sb.append("name=").append(name).append(';'); } - sb.append(";addr=").append(inetAddr).append(";port=").append(udpPort) + sb.append("addr=").append(inetAddr).append(";port=").append(udpPort) .append(";kind=").append(vmKind).append(";processId=").append(processId) .append(";viewId=").append(vmViewId); if (getVersionOrdinal() != Version.CURRENT_ORDINAL) { diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java index a791e46..db2ecae 100644 --- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java +++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java @@ -688,7 +688,11 @@ public class GMSMembership<ID extends MemberIdentifier> implements Membership<ID return; // Explicit deletion, no upcall. } - listener.memberDeparted(dm, crashed, reason); + if (!shutdownMembers.containsKey(dm)) { + // if we've received a shutdown message then DistributionManager will already have + // notified listeners + listener.memberDeparted(dm, crashed, reason); + } } /**