This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/develop by this push:
     new 08316aa  GEODE-8385: hang recovering from disk with cyclic 
dependencies (#5403)
08316aa is described below

commit 08316aa05198704d96aefc5497e483052c27a378
Author: Bruce Schuchardt <bschucha...@pivotal.io>
AuthorDate: Wed Jul 29 07:58:39 2020 -0700

    GEODE-8385: hang recovering from disk with cyclic dependencies (#5403)
    
    * GEODE-8385: hang recovering from disk with cyclic dependencies
    
    This restores the point at which we notify membership listeners of
    departures.  We used to do this (in 1.12 and earlier) when a ShutdownMessage
    is received instead of waiting for a new membership view announcing the 
departure.
    Membership views can take some time to form and install, which can cause
    persistent (disk store) views to be updated later than they used to be.
    
    In the case of this ticket the disk store of one member was being
    closed while another was shutting down.  The member closing its disk
    store did not see the view announcing that shutdown until most of its
    disk store regions had closed their persistence advisors.  This left the
    disk store thinking that the other member was still up at the time it
    was closed.
---
 .../ClusterDistributionManagerDUnitTest.java       | 28 ++++++++
 .../internal/ClusterDistributionManager.java       | 76 ++++++++++------------
 .../distributed/internal/StartupOperation.java     |  1 -
 .../internal/membership/gms/GMSMemberData.java     |  4 +-
 .../internal/membership/gms/GMSMembership.java     |  6 +-
 5 files changed, 69 insertions(+), 46 deletions(-)

diff --git 
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
 
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
index 9d3bbd9..e28f15ab 100644
--- 
a/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
+++ 
b/geode-core/src/distributedTest/java/org/apache/geode/distributed/internal/ClusterDistributionManagerDUnitTest.java
@@ -211,6 +211,34 @@ public class ClusterDistributionManagerDUnitTest extends 
CacheTestCase {
         .until(() -> !membershipManager.isSurpriseMember(member));
   }
 
+  @Test
+  public void shutdownMessageCausesListenerInvocation() {
+    final AtomicBoolean listenerInvoked = new AtomicBoolean();
+    vm1.invoke("join the cluster", () -> getSystem().getDistributedMember()); 
// lead member
+    system = getSystem(); // non-lead member
+    // this membership listener will be invoked when the shutdown message is 
received
+    system.getDistributionManager().addMembershipListener(new 
MembershipListener() {
+      @Override
+      public void memberDeparted(DistributionManager distributionManager,
+          InternalDistributedMember id, boolean crashed) {
+        assertThat(crashed).isFalse();
+        listenerInvoked.set(Boolean.TRUE);
+      }
+    });
+    final InternalDistributedMember memberID = system.getDistributedMember();
+    locatorvm.invoke("send a shutdown message", () -> {
+      final DistributionManager distributionManager =
+          ((InternalDistributedSystem) 
Locator.getLocator().getDistributedSystem())
+              .getDistributionManager();
+      final ShutdownMessage shutdownMessage = new ShutdownMessage();
+      shutdownMessage.setRecipient(memberID);
+      
shutdownMessage.setDistributionManagerId(distributionManager.getDistributionManagerId());
+      distributionManager.putOutgoing(shutdownMessage);
+    });
+    await().until(() -> listenerInvoked.get());
+  }
+
+
   /**
    * Tests that a severe-level alert is generated if a member does not respond 
with an ack quickly
    * enough. vm0 and vm1 create a region and set ack-severe-alert-threshold. 
vm1 has a cache
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java
index 9a52f60..1f58344 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/ClusterDistributionManager.java
@@ -1782,24 +1782,6 @@ public class ClusterDistributionManager implements 
DistributionManager {
   }
 
   /**
-   * Returns true if id was removed. Returns false if it was not in the list 
of managers.
-   */
-  private boolean removeManager(InternalDistributedMember theId, boolean 
crashed, String p_reason) {
-    String reason = p_reason;
-
-    reason = prettifyReason(reason);
-    if (logger.isDebugEnabled()) {
-      logger.debug("DistributionManager: removing member <{}>; crashed {}; 
reason = {}", theId,
-          crashed, reason);
-    }
-    removeHostedLocators(theId);
-
-    redundancyZones.remove(theId);
-
-    return true;
-  }
-
-  /**
    * Makes note of a new distribution manager that has started up in the 
distributed cache. Invokes
    * the appropriately listeners.
    *
@@ -1866,41 +1848,51 @@ public class ClusterDistributionManager implements 
DistributionManager {
   void shutdownMessageReceived(InternalDistributedMember theId, String reason) 
{
     removeHostedLocators(theId);
     distribution.shutdownMessageReceived(theId, reason);
+    handleManagerDeparture(theId, false, reason);
   }
 
+  /*
+   * handleManagerDeparted may be invoked multiple times for a member 
identifier.
+   * We allow this and inform listeners on each invocation, but only perform 
some
+   * actions (such as decrementing the node count) if the change came from a
+   * membership view.
+   */
   @Override
-  public void handleManagerDeparture(InternalDistributedMember theId, boolean 
p_crashed,
-      String p_reason) {
-
+  public void handleManagerDeparture(InternalDistributedMember theId, boolean 
memberCrashed,
+      String reason) {
     alertingService.removeAlertListener(theId);
 
+    removeUnfinishedStartup(theId, true);
+
     int vmType = theId.getVmKind();
     if (vmType == ADMIN_ONLY_DM_TYPE) {
-      removeUnfinishedStartup(theId, true);
-      handleConsoleShutdown(theId, p_crashed, p_reason);
+      handleConsoleShutdown(theId, memberCrashed, reason);
       return;
     }
 
-    removeUnfinishedStartup(theId, true);
-
-    if (removeManager(theId, p_crashed, p_reason)) {
-      if (theId.getVmKind() != ClusterDistributionManager.LOCATOR_DM_TYPE) {
-        stats.incNodes(-1);
-      }
-      String msg;
-      if (p_crashed && !shouldInhibitMembershipWarnings()) {
-        msg =
-            "Member at {} unexpectedly left the distributed cache: {}";
-        addMemberEvent(new MemberCrashedEvent(theId, p_reason));
-      } else {
-        msg =
-            "Member at {} gracefully left the distributed cache: {}";
-        addMemberEvent(new MemberDepartedEvent(theId, p_reason));
-      }
-      logger.info(msg, new Object[] {theId, prettifyReason(p_reason)});
+    if (logger.isDebugEnabled()) {
+      logger.debug(
+          "DistributionManager: removing member <{}>; crashed {}; reason = 
{}", theId,
+          memberCrashed, prettifyReason(reason));
+    }
+    removeHostedLocators(theId);
+    redundancyZones.remove(theId);
 
-      executors.handleManagerDeparture(theId);
+    if (theId.getVmKind() != ClusterDistributionManager.LOCATOR_DM_TYPE) {
+      stats.incNodes(-1);
+    }
+    String msg;
+    if (memberCrashed && !shouldInhibitMembershipWarnings()) {
+      msg =
+          "Member at {} unexpectedly left the distributed cache: {}";
+      addMemberEvent(new MemberCrashedEvent(theId, reason));
+    } else {
+      msg =
+          "Member at {} gracefully left the distributed cache: {}";
+      addMemberEvent(new MemberDepartedEvent(theId, reason));
     }
+    logger.info(msg, new Object[] {theId, prettifyReason(reason)});
+    executors.handleManagerDeparture(theId);
   }
 
   private void handleManagerSuspect(InternalDistributedMember suspect,
@@ -2368,7 +2360,7 @@ public class ClusterDistributionManager implements 
DistributionManager {
         }
         dm.handleManagerDeparture(theId, crashed, reason);
       } catch (DistributedSystemDisconnectedException se) {
-        // let's not get huffy about it
+        // ignored
       }
     }
 
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java
index 5eef068..fb28123 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/StartupOperation.java
@@ -75,7 +75,6 @@ public class StartupOperation {
     if (this.newlyDeparted != null && !this.newlyDeparted.isEmpty()) {
       // tell the reply processor not to wait for the recipients that didn't
       // get the message
-      // Vector viewMembers = dm.getViewMembers();
       for (Iterator it = this.newlyDeparted.iterator(); it.hasNext();) {
         InternalDistributedMember id = (InternalDistributedMember) it.next();
         this.dm.handleManagerDeparture(id, false,
diff --git 
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java
 
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java
index b0b759e..2ab55c4 100644
--- 
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java
+++ 
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMemberData.java
@@ -401,9 +401,9 @@ public class GMSMemberData implements MemberData, 
Comparable<GMSMemberData> {
 
     sb.append("MemberData[");
     if (name != null && name.length() > 0) {
-      sb.append("name=").append(name);
+      sb.append("name=").append(name).append(';');
     }
-    sb.append(";addr=").append(inetAddr).append(";port=").append(udpPort)
+    sb.append("addr=").append(inetAddr).append(";port=").append(udpPort)
         
.append(";kind=").append(vmKind).append(";processId=").append(processId)
         .append(";viewId=").append(vmViewId);
     if (getVersionOrdinal() != Version.CURRENT_ORDINAL) {
diff --git 
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
 
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
index a791e46..db2ecae 100644
--- 
a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
+++ 
b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
@@ -688,7 +688,11 @@ public class GMSMembership<ID extends MemberIdentifier> 
implements Membership<ID
       return; // Explicit deletion, no upcall.
     }
 
-    listener.memberDeparted(dm, crashed, reason);
+    if (!shutdownMembers.containsKey(dm)) {
+      // if we've received a shutdown message then DistributionManager will 
already have
+      // notified listeners
+      listener.memberDeparted(dm, crashed, reason);
+    }
   }
 
   /**

Reply via email to