This is an automated email from the ASF dual-hosted git repository.

bschuchardt pushed a commit to branch release/1.9.0
in repository https://gitbox.apache.org/repos/asf/geode.git


The following commit(s) were added to refs/heads/release/1.9.0 by this push:
     new cfe9fa1  GEODE-6423 availability checks sometimes immediately initiate 
removal
cfe9fa1 is described below

commit cfe9fa13cf325792616d386075f2dfd9410cd2b2
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Fri Feb 22 13:55:44 2019 -0800

    GEODE-6423 availability checks sometimes immediately initiate removal
    
    Ensure that the availability check is performed for the contracted
    member-timeout period.  This allows a suspect to survive the check if
    it's having a momentary glitch like a brief garbage-collection, or if
    there is short network outage.
    
    This change caused some "reconnect" tests to fail due to short
    auto-reconnect intervals letting disconnected nodes start reconnecting
    before suspect processing completed on the force-disconnected nodes.
    I've fixed this by reinitializing the UUID part of the membership ID in
    JGroupsMessenger during reconnect attempts.
    
    (cherry picked from commit 8b29d9eb6d759435d8d9e39575f2f0edff8e81c1)
---
 .../cache/PersistentRegionRecoveryDUnitTest.java   |  2 +
 .../gms/fd/GMSHealthMonitorJUnitTest.java          | 41 +++++++++++++-
 .../membership/gms/fd/GMSHealthMonitor.java        | 63 ++++++++++++++--------
 .../membership/gms/membership/GMSJoinLeave.java    |  1 +
 .../membership/gms/messenger/JGroupsMessenger.java | 15 ++++--
 .../geode/test/dunit/rules/ClusterStartupRule.java |  2 +-
 6 files changed, 95 insertions(+), 29 deletions(-)

diff --git 
a/geode-core/src/distributedTest/java/org/apache/geode/internal/cache/PersistentRegionRecoveryDUnitTest.java
 
b/geode-core/src/distributedTest/java/org/apache/geode/internal/cache/PersistentRegionRecoveryDUnitTest.java
index 01d3f4b..fcd4b8a 100644
--- 
a/geode-core/src/distributedTest/java/org/apache/geode/internal/cache/PersistentRegionRecoveryDUnitTest.java
+++ 
b/geode-core/src/distributedTest/java/org/apache/geode/internal/cache/PersistentRegionRecoveryDUnitTest.java
@@ -39,6 +39,7 @@ import 
org.apache.geode.distributed.internal.DistributionMessageObserver;
 import org.apache.geode.internal.cache.backup.BackupOperation;
 import org.apache.geode.internal.logging.LogService;
 import org.apache.geode.test.dunit.AsyncInvocation;
+import org.apache.geode.test.dunit.IgnoredException;
 import org.apache.geode.test.dunit.VM;
 import org.apache.geode.test.dunit.internal.JUnit4DistributedTestCase;
 import org.apache.geode.test.dunit.rules.CacheRule;
@@ -73,6 +74,7 @@ public class PersistentRegionRecoveryDUnitTest extends 
JUnit4DistributedTestCase
     vm0 = getVM(0);
     vm1 = getVM(1);
     regionName = getClass().getSimpleName() + "-" + testName.getMethodName();
+    IgnoredException.addIgnoredException("Possible loss of quorum");
   }
 
   @Test
diff --git 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index f8f8f7f..dbbd332 100644
--- 
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++ 
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -24,6 +24,7 @@ import static 
org.apache.geode.distributed.ConfigurationProperties.MCAST_PORT;
 import static org.apache.geode.distributed.ConfigurationProperties.MCAST_TTL;
 import static 
org.apache.geode.distributed.ConfigurationProperties.MEMBER_TIMEOUT;
 import static org.apache.geode.test.awaitility.GeodeAwaitility.await;
+import static org.apache.geode.test.awaitility.GeodeAwaitility.getTimeout;
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -130,7 +131,7 @@ public class GMSHealthMonitorJUnitTest {
     nonDefault.put(MCAST_TTL, "0");
     nonDefault.put(LOG_FILE, "");
     nonDefault.put(LOG_LEVEL, "fine");
-    nonDefault.put(MEMBER_TIMEOUT, "2000");
+    nonDefault.put(MEMBER_TIMEOUT, "" + memberTimeout);
     nonDefault.put(LOCATORS, "localhost[10344]");
     DistributionManager dm = mock(DistributionManager.class);
     SocketCreatorFactory.setDistributionConfig(new DistributionConfigImpl(new 
Properties()));
@@ -785,6 +786,44 @@ public class GMSHealthMonitorJUnitTest {
   }
 
   @Test
+  public void testTcpCheckMemberTriesUntilTimeout() throws Exception {
+    ServerSocket mySocket = new ServerSocket(0);
+    Thread serverThread = new Thread() {
+      public void run() {
+        long giveupTime = System.currentTimeMillis() + (5 * memberTimeout);
+        while (System.currentTimeMillis() < giveupTime) {
+          try {
+            Socket acceptedSocket = mySocket.accept();
+            try {
+              Thread.sleep(200);
+            } catch (InterruptedException e) {
+              e.printStackTrace();
+              return;
+            }
+            acceptedSocket.close();
+          } catch (IOException e) {
+            if (!mySocket.isClosed()) {
+              System.err.println("Test failed with unexpected IOException");
+              e.printStackTrace(System.err);
+            }
+            return;
+          }
+        }
+      }
+    };
+    serverThread.setDaemon(true);
+    serverThread.start();
+    InternalDistributedMember otherMember =
+        createInternalDistributedMember(Version.CURRENT_ORDINAL, 0, 1, 1);
+    long startTime = System.currentTimeMillis();
+    gmsHealthMonitor.doTCPCheckMember(otherMember, mySocket.getLocalPort());
+    mySocket.close();
+    serverThread.interrupt();
+    serverThread.join(getTimeout().getValueInMS());
+    assertThat(System.currentTimeMillis()).isGreaterThanOrEqualTo(startTime + 
memberTimeout);
+  }
+
+  @Test
   public void testDoTCPCheckMemberWithOkStatus() throws Exception {
     executeTestDoTCPCheck(GMSHealthMonitor.OK, true);
   }
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index ab72a07..05c0a34 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -513,33 +513,50 @@ public class GMSHealthMonitor implements HealthMonitor, 
MessageHandler {
    */
   boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
     Socket clientSocket = null;
-    try {
-      logger.debug("Checking member {} with TCP socket connection {}:{}.", 
suspectMember,
-          suspectMember.getInetAddress(), port);
-      clientSocket =
-          
SocketCreatorFactory.getSocketCreatorForComponent(SecurableCommunicationChannel.CLUSTER)
-              .connect(suspectMember.getInetAddress(), port, (int) 
memberTimeout,
-                  new ConnectTimeoutTask(services.getTimer(), memberTimeout), 
false, -1, false);
-      clientSocket.setTcpNoDelay(true);
-      return doTCPCheckMember(suspectMember, clientSocket);
-    } catch (IOException e) {
-      // this is expected if it is a connection-timeout or other failure
-      // to connect
-    } catch (IllegalStateException e) {
-      if (!isStopping) {
-        logger.trace("Unexpected exception", e);
+    // make sure we try to check on the member for the contracted 
memberTimeout period
+    // in case a timed socket.connect() returns immediately
+    long giveupTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(
+        services.getConfig().getMemberTimeout(), TimeUnit.MILLISECONDS);
+    boolean passed = false;
+    int iteration = 0;
+    do {
+      iteration++;
+      if (iteration > 1) {
+        try {
+          Thread.sleep(100);
+        } catch (InterruptedException e) {
+          Thread.currentThread().interrupt();
+          return false;
+        }
       }
-    } finally {
       try {
-        if (clientSocket != null) {
-          clientSocket.setSoLinger(true, 0); // abort the connection
-          clientSocket.close();
-        }
+        logger.debug("Checking member {} with TCP socket connection {}:{}.", 
suspectMember,
+            suspectMember.getInetAddress(), port);
+        clientSocket =
+            
SocketCreatorFactory.getSocketCreatorForComponent(SecurableCommunicationChannel.CLUSTER)
+                .connect(suspectMember.getInetAddress(), port, (int) 
memberTimeout,
+                    new ConnectTimeoutTask(services.getTimer(), 
memberTimeout), false, -1, false);
+        clientSocket.setTcpNoDelay(true);
+        passed = doTCPCheckMember(suspectMember, clientSocket);
       } catch (IOException e) {
-        // expected
+        // this is expected if it is a connection-timeout or other failure
+        // to connect
+      } catch (IllegalStateException | GemFireConfigException e) {
+        if (!isStopping) {
+          logger.trace("Unexpected exception", e);
+        }
+      } finally {
+        try {
+          if (clientSocket != null) {
+            clientSocket.setSoLinger(true, 0); // abort the connection
+            clientSocket.close();
+          }
+        } catch (IOException e) {
+          // expected
+        }
       }
-    }
-    return false;
+    } while (!passed && !this.isShutdown() && System.nanoTime() < giveupTime);
+    return passed;
   }
 
   // Package protected for testing purposes
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 5ed5836..a816e78 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -1040,6 +1040,7 @@ public class GMSJoinLeave implements JoinLeave, 
MessageHandler {
             this.preparedView.getCreator().equals(view.getCreator())) {
           // this can happen if we received two prepares during auto-reconnect
         } else {
+          // send the conflicting view to the creator of this new view
           services.getMessenger()
               .send(new ViewAckMessage(view.getViewId(), m.getSender(), 
this.preparedView));
         }
diff --git 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index ccca881..9085121 100644
--- 
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++ 
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -330,9 +330,16 @@ public class JGroupsMessenger implements Messenger {
         members.add(new UUID(0, 0));// TODO open a JGroups JIRA for GEODE-3034
         View jgv = new View(vid, members);
         this.myChannel.down(new Event(Event.VIEW_CHANGE, jgv));
-        UUID logicalAddress = (UUID) myChannel.getAddress();
-        if (logicalAddress instanceof JGAddress) {
-          ((JGAddress) logicalAddress).setVmViewId(-1);
+        // attempt to establish a new UUID in the jgroups channel so the 
member address will be
+        // different
+        try {
+          Method setAddressMethod = 
JChannel.class.getDeclaredMethod("setAddress");
+          setAddressMethod.setAccessible(true);
+          setAddressMethod.invoke(myChannel);
+        } catch (SecurityException | NoSuchMethodException e) {
+          logger.warn("Unable to establish a new JGroups address.  "
+              + "My address will be exactly the same as last time. 
Exception={}",
+              e.getMessage());
         }
         reconnecting = true;
       } else {
@@ -362,7 +369,7 @@ public class JGroupsMessenger implements Messenger {
       jgroupsReceiver = new JGroupsReceiver();
       myChannel.setReceiver(jgroupsReceiver);
       if (!reconnecting) {
-        myChannel.connect("AG"); // apache g***** (whatever we end up calling 
it)
+        myChannel.connect("AG"); // Apache Geode
       }
     } catch (Exception e) {
       myChannel.close();
diff --git 
a/geode-dunit/src/main/java/org/apache/geode/test/dunit/rules/ClusterStartupRule.java
 
b/geode-dunit/src/main/java/org/apache/geode/test/dunit/rules/ClusterStartupRule.java
index cd87286..3d329aa 100644
--- 
a/geode-dunit/src/main/java/org/apache/geode/test/dunit/rules/ClusterStartupRule.java
+++ 
b/geode-dunit/src/main/java/org/apache/geode/test/dunit/rules/ClusterStartupRule.java
@@ -250,7 +250,7 @@ public class ClusterStartupRule implements 
SerializableTestRule {
       SerializableFunction<ServerStarterRule> ruleOperator) {
     final String defaultName = "server-" + index;
     VM serverVM = getVM(index, version);
-    Server server = serverVM.invoke(() -> {
+    Server server = serverVM.invoke("startServerVM", () -> {
       memberStarter = new ServerStarterRule();
       ServerStarterRule serverStarter = (ServerStarterRule) memberStarter;
       if (logFile) {

Reply via email to