This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch release/1.4.0
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/release/1.4.0 by this push:
     new 2675247  Squashed commit of the following:
2675247 is described below

commit 267524720d8a94145dc3fdf3a50b6fd161a80004
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Tue Jan 9 08:39:48 2018 -0800

    Squashed commit of the following:
    
    commit 70a76592a58379bdd9b53433877b64831fc7432e
    Author: Bruce Schuchardt <[email protected]>
    Date:   Tue Jan 9 08:37:45 2018 -0800
    
        GEODE-3588 2 restarts of Locator results in split brain
    
        removed thread dump in new test
    
    commit 89bf34c39f3df4ed7b16d6c9a256e2d26b9d2267
    Author: Bruce Schuchardt <[email protected]>
    Date:   Mon Jan 8 15:57:09 2018 -0800
    
        GEODE-3588 2 restarts of Locator results in split brain
    
        Udo's fix for GEODE-870 added a new boolean instance variable to
        GMSJoinLeave to tell its ViewCreator thread to shut down.  This works
        but the state was never being reset after its first use.  This caused
        Subsequent ViewCreator threads to shut down immediately.  The only
        way to fix this condition without a patch is to restart the coordinator 
node.
    
        The patch moves this boolean variable to the ViewCreator thread so that
        it is automatically reset when a new ViewCreator is instantiated.
    
        I also did a little code cleanup, moving GMSJoinLeave methods from the
        end of the file to where its other methods are located and adding
        a setShutdownFlag() method during debugging so I could isolate what
        was happening.
    
    Sarge reviewed the changes for me so this closes #1255
    
    (cherry picked from commit 3cf7caab3c3726dfb47e12a900240e377e035594)
---
 .../membership/gms/membership/GMSJoinLeave.java    | 140 +++++++++++----------
 .../membership/gms/messenger/JGroupsMessenger.java |   2 +-
 .../apache/geode/distributed/LocatorDUnitTest.java |  50 ++++++++
 .../gms/membership/GMSJoinLeaveJUnitTest.java      |   4 +-
 4 files changed, 130 insertions(+), 66 deletions(-)

diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index f90329c..c6b4e1d 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -255,11 +255,6 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
    */
   NetView quorumLostView;
 
-  /**
-   * a flag to mark a coordinator's viewCreator for shutdown
-   */
-  private boolean markViewCreatorForShutdown = false;
-
   static class SearchState {
     Set<InternalDistributedMember> alreadyTried = new HashSet<>();
     Set<InternalDistributedMember> registrants = new HashSet<>();
@@ -533,7 +528,7 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
    */
   private void processJoinRequest(JoinRequestMessage incomingRequest) {
 
-    logger.info("received join request from {}", 
incomingRequest.getMemberID());
+    logger.info("Received a join request from {}", 
incomingRequest.getMemberID());
 
     if (!ALLOW_OLD_VERSION_FOR_TESTING
         && 
incomingRequest.getMemberID().getVersionObject().compareTo(Version.CURRENT) < 
0) {
@@ -570,7 +565,6 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
     // services.getMessenger().send(joinResponseMessage);
     // return;
     // }
-    logger.info("Received a join request from " + incomingRequest.getSender());
 
     recordViewRequest(incomingRequest);
   }
@@ -930,7 +924,7 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
       // if another server is the coordinator - GEODE-870
       if (isCoordinator && !localAddress.equals(view.getCoordinator())
           && getViewCreator() != null) {
-        markViewCreatorForShutdown = true;
+        getViewCreator().markViewCreatorForShutdown();
         this.isCoordinator = false;
       }
       installView(view);
@@ -1686,6 +1680,50 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
     processLeaveRequest(msg);
   }
 
+  boolean checkIfAvailable(InternalDistributedMember fmbr) {
+    // return the member id if it fails health checks
+    logger.info("checking state of member " + fmbr);
+    if (services.getHealthMonitor().checkIfAvailable(fmbr,
+        "Member failed to acknowledge a membership view", false)) {
+      logger.info("member " + fmbr + " passed availability check");
+      return true;
+    }
+    logger.info("member " + fmbr + " failed availability check");
+    return false;
+  }
+
+  private InternalDistributedMember getMemId(NetMember jgId,
+      List<InternalDistributedMember> members) {
+    for (InternalDistributedMember m : members) {
+      if (((GMSMember) m.getNetMember()).equals(jgId)) {
+        return m;
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public InternalDistributedMember getMemberID(NetMember jgId) {
+    NetView v = currentView;
+    InternalDistributedMember ret = null;
+    if (v != null) {
+      ret = getMemId(jgId, v.getMembers());
+    }
+
+    if (ret == null) {
+      v = preparedView;
+      if (v != null) {
+        ret = getMemId(jgId, v.getMembers());
+      }
+    }
+
+    if (ret == null) {
+      return new InternalDistributedMember(jgId);
+    }
+
+    return ret;
+  }
+
   @Override
   public void disableDisconnectOnQuorumLossForTesting() {
     this.quorumRequired = false;
@@ -2004,6 +2042,8 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
     volatile boolean testFlagForRemovalRequest = false;
     // count of number of views abandoned due to conflicts
     volatile int abandonedViews = 0;
+    private boolean markViewCreatorForShutdown = false; // see GEODE-870
+
 
     /**
      * initial view to install. guarded by synch on ViewCreator
@@ -2027,7 +2067,7 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
     }
 
     void shutdown() {
-      shutdown = true;
+      setShutdownFlag();
       synchronized (viewRequests) {
         viewRequests.notifyAll();
         interrupt();
@@ -2097,18 +2137,34 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
           try {
             sleep(services.getConfig().getMemberTimeout());
           } catch (InterruptedException e2) {
-            shutdown = true;
+            setShutdownFlag();
             retry = false;
           }
         } catch (InterruptedException e) {
-          shutdown = true;
+          setShutdownFlag();
         } catch (DistributedSystemDisconnectedException e) {
-          shutdown = true;
+          setShutdownFlag();
         }
       } while (retry);
     }
 
     /**
+     * marks this ViewCreator as being shut down. It may be some short amount 
of time before the
+     * ViewCreator thread exits.
+     */
+    private void setShutdownFlag() {
+      shutdown = true;
+    }
+
+    /**
+     * This allows GMSJoinLeave to tell the ViewCreator to shut down after 
finishing its current
+     * task. See GEODE-870.
+     */
+    private void markViewCreatorForShutdown() {
+      this.markViewCreatorForShutdown = true;
+    }
+
+    /**
      * During initial view processing a prepared view was discovered. This 
method will extract its
      * new members and create a new initial view containing them.
      *
@@ -2214,19 +2270,19 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
               try {
                 sleep(services.getConfig().getMemberTimeout());
               } catch (InterruptedException e2) {
-                shutdown = true;
+                setShutdownFlag();
               }
             } catch (DistributedSystemDisconnectedException e) {
-              shutdown = true;
+              setShutdownFlag();
             } catch (InterruptedException e) {
               logger.info("View Creator thread interrupted");
-              shutdown = true;
+              setShutdownFlag();
             }
             requests = null;
           }
         }
       } finally {
-        shutdown = true;
+        setShutdownFlag();
         informToPendingJoinRequests();
         
org.apache.geode.distributed.internal.membership.gms.interfaces.Locator locator 
=
             services.getLocator();
@@ -2443,7 +2499,7 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
           Set<InternalDistributedMember> crashes = 
newView.getActualCrashedMembers(currentView);
           forceDisconnect(LocalizedStrings.Network_partition_detected
               .toLocalizedString(crashes.size(), crashes));
-          shutdown = true;
+          setShutdownFlag();
           return;
         }
 
@@ -2497,7 +2553,7 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
             // this member may have been kicked out of the conflicting view
             if (failures.contains(localAddress)) {
               forceDisconnect("I am no longer a member of the distributed 
system");
-              shutdown = true;
+              setShutdownFlag();
               return;
             }
             List<InternalDistributedMember> newMembers = 
conflictingView.getNewMembers();
@@ -2567,13 +2623,13 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
       // can be transmitted to the new members w/o including it in the view 
message
 
       if (markViewCreatorForShutdown && getViewCreator() != null) {
-        shutdown = true;
+        setShutdownFlag();
       }
 
       // after sending a final view we need to stop this thread if
       // the GMS is shutting down
       if (isStopping()) {
-        shutdown = true;
+        setShutdownFlag();
       }
     }
 
@@ -2704,54 +2760,12 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
       return result;
     }
 
-    boolean getTestFlageForRemovalRequest() {
+    boolean getTestFlagForRemovalRequest() {
       return testFlagForRemovalRequest;
     }
-  }
 
-  boolean checkIfAvailable(InternalDistributedMember fmbr) {
-    // return the member id if it fails health checks
-    logger.info("checking state of member " + fmbr);
-    if (services.getHealthMonitor().checkIfAvailable(fmbr,
-        "Member failed to acknowledge a membership view", false)) {
-      logger.info("member " + fmbr + " passed availability check");
-      return true;
-    }
-    logger.info("member " + fmbr + " failed availability check");
-    return false;
-  }
-
-  private InternalDistributedMember getMemId(NetMember jgId,
-      List<InternalDistributedMember> members) {
-    for (InternalDistributedMember m : members) {
-      if (((GMSMember) m.getNetMember()).equals(jgId)) {
-        return m;
-      }
-    }
-    return null;
   }
 
-  @Override
-  public InternalDistributedMember getMemberID(NetMember jgId) {
-    NetView v = currentView;
-    InternalDistributedMember ret = null;
-    if (v != null) {
-      ret = getMemId(jgId, v.getMembers());
-    }
-
-    if (ret == null) {
-      v = preparedView;
-      if (v != null) {
-        ret = getMemId(jgId, v.getMembers());
-      }
-    }
-
-    if (ret == null) {
-      return new InternalDistributedMember(jgId);
-    }
-
-    return ret;
-  }
 
   static class ViewAbandonedException extends Exception {
   }
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index 46142c4..d074940 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -409,7 +409,7 @@ public class JGroupsMessenger implements Messenger {
     
mbrs.addAll(v.getMembers().stream().map(JGAddress::new).collect(Collectors.toList()));
     ViewId vid = new ViewId(new JGAddress(v.getCoordinator()), v.getViewId());
     View jgv = new View(vid, new ArrayList<>(mbrs));
-    logger.trace("installing JGroups view: {}", jgv);
+    logger.trace("installing view into JGroups stack: {}", jgv);
     this.myChannel.down(new Event(Event.VIEW_CHANGE, jgv));
 
     addressesWithIoExceptionsProcessed.clear();
diff --git 
a/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java 
b/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
index 446385d..577e089 100644
--- 
a/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++ 
b/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -79,6 +79,7 @@ import 
org.apache.geode.distributed.internal.membership.gms.membership.GMSJoinLe
 import org.apache.geode.internal.Assert;
 import org.apache.geode.internal.AvailablePort;
 import org.apache.geode.internal.AvailablePortHelper;
+import org.apache.geode.internal.OSProcess;
 import org.apache.geode.internal.cache.GemFireCacheImpl;
 import org.apache.geode.internal.logging.InternalLogWriter;
 import org.apache.geode.internal.logging.LocalLogWriter;
@@ -2022,6 +2023,55 @@ public class LocatorDUnitTest extends 
JUnit4DistributedTestCase {
   }
 
   /**
+   * See GEODE-3588 - a locator is restarted twice with a server and ends up 
in a split-brain
+   */
+  @Test
+  public void testRestartLocatorMultipleTimes() throws Exception {
+    disconnectAllFromDS();
+    port1 = AvailablePort.getRandomAvailablePort(AvailablePort.SOCKET);
+    DistributedTestUtils.deleteLocatorStateFile(port1);
+    File logFile = new File("");
+    File stateFile = new File("locator" + port1 + "state.dat");
+    VM vm0 = Host.getHost(0).getVM(0);
+    final Properties p = new Properties();
+    p.setProperty(LOCATORS, Host.getHost(0).getHostName() + "[" + port1 + "]");
+    p.setProperty(MCAST_PORT, "0");
+    p.setProperty(ENABLE_CLUSTER_CONFIGURATION, "false");
+    p.setProperty(LOG_LEVEL, "finest");
+    addDSProps(p);
+    if (stateFile.exists()) {
+      stateFile.delete();
+    }
+
+    Locator locator = Locator.startLocatorAndDS(port1, logFile, p);
+
+    vm0.invoke(() -> {
+      DistributedSystem.connect(p);
+      return null;
+    });
+
+    try {
+      locator.stop();
+      locator = Locator.startLocatorAndDS(port1, logFile, p);
+      assertEquals(2, ((InternalDistributedSystem) 
locator.getDistributedSystem()).getDM()
+          .getViewMembers().size());
+
+      locator.stop();
+      locator = Locator.startLocatorAndDS(port1, logFile, p);
+      assertEquals(2, ((InternalDistributedSystem) 
locator.getDistributedSystem()).getDM()
+          .getViewMembers().size());
+
+    } finally {
+      vm0.invoke("disconnect", () -> {
+        DistributedSystem.connect(p).disconnect();
+        return null;
+      });
+      locator.stop();
+    }
+
+  }
+
+  /**
    * return the distributed member id for the ds on this vm
    */
   public static DistributedMember getDistributedMember(Properties props) {
diff --git 
a/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
 
b/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index 7389465..1b6cdec 100644
--- 
a/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ 
b/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -1331,7 +1331,7 @@ public class GMSJoinLeaveJUnitTest {
     this.removeMember = null;
 
     assertTrue("testFlagForRemovalRequest should be true",
-        gmsJoinLeave.getViewCreator().getTestFlageForRemovalRequest());
+        gmsJoinLeave.getViewCreator().getTestFlagForRemovalRequest());
   }
 
   @Test
@@ -1354,7 +1354,7 @@ public class GMSJoinLeaveJUnitTest {
     this.leaveMember = null;
 
     assertTrue("testFlagForRemovalRequest should be true",
-        gmsJoinLeave.getViewCreator().getTestFlageForRemovalRequest());
+        gmsJoinLeave.getViewCreator().getTestFlagForRemovalRequest());
   }
 
   private void installView() throws Exception {

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to