This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch support/1.13
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/support/1.13 by this push:
new 2eb66cc GEODE-8385: hang recovering from disk with cyclic
dependencies (#5403)
2eb66cc is described below
commit 2eb66cc795e3d08218a4cd32f96a4b47d64d116e
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Wed Jul 29 07:58:39 2020 -0700
GEODE-8385: hang recovering from disk with cyclic dependencies (#5403)
* GEODE-8385: hang recovering from disk with cyclic dependencies
This restores the point at which we notify membership listeners of
departures. We used to do this (in 1.12 and earlier) when a ShutdownMessage
is received instead of waiting for a new membership view announcing the
departure.
Membership views can take some time to form and install, which can cause
persistent (disk store) views to be updated later than they used to be.
In the case of this ticket the disk store of one member was being
closed while another was shutting down. The member closing its disk
store did not see the view announcing that shutdown until most of its
disk store regions had closed their persistence advisors. This left the
disk store thinking that the other member was still up at the time it
was closed.
(cherry picked from commit 08316aa05198704d96aefc5497e483052c27a378)
---
.../ClusterDistributionManagerDUnitTest.java | 28 ++++++++
.../internal/ClusterDistributionManager.java | 76 ++++++++++------------
.../distributed/internal/StartupOperation.java | 1 -
.../internal/membership/gms/GMSMemberData.java | 4 +-
.../internal/membership/gms/GMSMembership.java | 6 +-
5 files changed, 69 insertions(+), 46 deletions(-)
diff --git
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
index 5cef2bb..a85d82f 100644
---
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
+++
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
@@ -204,6 +204,34 @@ public class ClusterDistributionManagerDUnitTest extends
CacheTestCase {
.until(() -> !membershipManager.isSurpriseMember(member));
}
+ @Test
+ public void shutdownMessageCausesListenerInvocation() {
+ final AtomicBoolean listenerInvoked = new AtomicBoolean();
+ vm1.invoke("join the cluster", () -> getSystem().getDistributedMember());
// lead member
+ system = getSystem(); // non-lead member
+ // this membership listener will be invoked when the shutdown message is
received
+ system.getDistributionManager().addMembershipListener(new
MembershipListener() {
+ @Override
+ public void memberDeparted(DistributionManager distributionManager,
+ InternalDistributedMember id, boolean crashed) {
+ assertThat(crashed).isFalse();
+ listenerInvoked.set(Boolean.TRUE);
+ }
+ });
+ final InternalDistributedMember memberID = system.getDistributedMember();
+ locatorvm.invoke("send a shutdown message", () -> {
+ final DistributionManager distributionManager =
+ ((InternalDistributedSystem)
Locator.getLocator().getDistributedSystem())
+ .getDistributionManager();
+ final ShutdownMessage shutdownMessage = new ShutdownMessage();
+ shutdownMessage.setRecipient(memberID);
+
shutdownMessage.setDistributionManagerId(distributionManager.getDistributionManagerId());
+ distributionManager.putOutgoing(shutdownMessage);
+ });
+ await().until(() -> listenerInvoked.get());
+ }
+
+
/**
* Tests that a severe-level alert is generated if a member does not respond
with an ack quickly
* enough. vm0 and vm1 create a region and set ack-severe-alert-threshold.
vm1 has a cache
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java
index a5e363c..d17ae80 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java
@@ -1782,24 +1782,6 @@ public class ClusterDistributionManager implements
DistributionManager {
}
/**
- * Returns true if id was removed. Returns false if it was not in the list
of managers.
- */
- private boolean removeManager(InternalDistributedMember theId, boolean
crashed, String p_reason) {
- String reason = p_reason;
-
- reason = prettifyReason(reason);
- if (logger.isDebugEnabled()) {
- logger.debug("DistributionManager: removing member <{}>; crashed {};
reason = {}", theId,
- crashed, reason);
- }
- removeHostedLocators(theId);
-
- redundancyZones.remove(theId);
-
- return true;
- }
-
- /**
* Makes note of a new distribution manager that has started up in the
distributed cache. Invokes
* the appropriately listeners.
*
@@ -1866,41 +1848,51 @@ public class ClusterDistributionManager implements
DistributionManager {
void shutdownMessageReceived(InternalDistributedMember theId, String reason)
{
removeHostedLocators(theId);
distribution.shutdownMessageReceived(theId, reason);
+ handleManagerDeparture(theId, false, reason);
}
+ /*
+ * handleManagerDeparted may be invoked multiple times for a member
identifier.
+ * We allow this and inform listeners on each invocation, but only perform
some
+ * actions (such as decrementing the node count) if the change came from a
+ * membership view.
+ */
@Override
- public void handleManagerDeparture(InternalDistributedMember theId, boolean
p_crashed,
- String p_reason) {
-
+ public void handleManagerDeparture(InternalDistributedMember theId, boolean
memberCrashed,
+ String reason) {
alertingService.removeAlertListener(theId);
+ removeUnfinishedStartup(theId, true);
+
int vmType = theId.getVmKind();
if (vmType == ADMIN_ONLY_DM_TYPE) {
- removeUnfinishedStartup(theId, true);
- handleConsoleShutdown(theId, p_crashed, p_reason);
+ handleConsoleShutdown(theId, memberCrashed, reason);
return;
}
- removeUnfinishedStartup(theId, true);
-
- if (removeManager(theId, p_crashed, p_reason)) {
- if (theId.getVmKind() != ClusterDistributionManager.LOCATOR_DM_TYPE) {
- stats.incNodes(-1);
- }
- String msg;
- if (p_crashed && !shouldInhibitMembershipWarnings()) {
- msg =
- "Member at {} unexpectedly left the distributed cache: {}";
- addMemberEvent(new MemberCrashedEvent(theId, p_reason));
- } else {
- msg =
- "Member at {} gracefully left the distributed cache: {}";
- addMemberEvent(new MemberDepartedEvent(theId, p_reason));
- }
- logger.info(msg, new Object[] {theId, prettifyReason(p_reason)});
+ if (logger.isDebugEnabled()) {
+ logger.debug(
+ "DistributionManager: removing member <{}>; crashed {}; reason =
{}", theId,
+ memberCrashed, prettifyReason(reason));
+ }
+ removeHostedLocators(theId);
+ redundancyZones.remove(theId);
- executors.handleManagerDeparture(theId);
+ if (theId.getVmKind() != ClusterDistributionManager.LOCATOR_DM_TYPE) {
+ stats.incNodes(-1);
+ }
+ String msg;
+ if (memberCrashed && !shouldInhibitMembershipWarnings()) {
+ msg =
+ "Member at {} unexpectedly left the distributed cache: {}";
+ addMemberEvent(new MemberCrashedEvent(theId, reason));
+ } else {
+ msg =
+ "Member at {} gracefully left the distributed cache: {}";
+ addMemberEvent(new MemberDepartedEvent(theId, reason));
}
+ logger.info(msg, new Object[] {theId, prettifyReason(reason)});
+ executors.handleManagerDeparture(theId);
}
private void handleManagerSuspect(InternalDistributedMember suspect,
@@ -2368,7 +2360,7 @@ public class ClusterDistributionManager implements
DistributionManager {
}
dm.handleManagerDeparture(theId, crashed, reason);
} catch (DistributedSystemDisconnectedException se) {
- // let's not get huffy about it
+ // ignored
}
}
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java
index 5eef068..fb28123 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java
@@ -75,7 +75,6 @@ public class StartupOperation {
if (this.newlyDeparted != null && !this.newlyDeparted.isEmpty()) {
// tell the reply processor not to wait for the recipients that didn't
// get the message
- // Vector viewMembers = dm.getViewMembers();
for (Iterator it = this.newlyDeparted.iterator(); it.hasNext();) {
InternalDistributedMember id = (InternalDistributedMember) it.next();
this.dm.handleManagerDeparture(id, false,
diff --git
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java
index 85087c8..4120395 100644
---
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java
+++
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java
@@ -401,9 +401,9 @@ public class GMSMemberData implements MemberData,
Comparable<GMSMemberData> {
sb.append("MemberData[");
if (name != null && name.length() > 0) {
- sb.append("name=").append(name);
+ sb.append("name=").append(name).append(';');
}
- sb.append(";addr=").append(inetAddr).append(";port=").append(udpPort)
+ sb.append("addr=").append(inetAddr).append(";port=").append(udpPort)
.append(";kind=").append(vmKind).append(";processId=").append(processId)
.append(";viewId=").append(vmViewId);
if (getVersionOrdinal() != Version.CURRENT_ORDINAL) {
diff --git
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
index a791e46..db2ecae 100644
---
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
+++
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
@@ -688,7 +688,11 @@ public class GMSMembership<ID extends MemberIdentifier>
implements Membership<ID
return; // Explicit deletion, no upcall.
}
- listener.memberDeparted(dm, crashed, reason);
+ if (!shutdownMembers.containsKey(dm)) {
+ // if we've received a shutdown message then DistributionManager will
already have
+ // notified listeners
+ listener.memberDeparted(dm, crashed, reason);
+ }
}
/**