This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch release/1.9.0
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/release/1.9.0 by this push:
new cfe9fa1 GEODE-6423 availability checks sometimes immediately initiate
removal
cfe9fa1 is described below
commit cfe9fa13cf325792616d386075f2dfd9410cd2b2
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Fri Feb 22 13:55:44 2019 -0800
GEODE-6423 availability checks sometimes immediately initiate removal
Ensure that the availability check is performed for the contracted
member-timeout period. This allows a suspect to survive the check if
it's having a momentary glitch like a brief garbage-collection, or if
there is short network outage.
This change caused some "reconnect" tests to fail due to short
auto-reconnect intervals letting disconnected nodes start reconnecting
before suspect processing completed on the force-disconnected nodes.
I've fixed this by reinitializing the UUID part of the membership ID in
JGroupsMessenger during reconnect attempts.
(cherry picked from commit 8b29d9eb6d759435d8d9e39575f2f0edff8e81c1)
---
.../cache/PersistentRegionRecoveryDUnitTest.java | 2 +
.../gms/fd/GMSHealthMonitorJUnitTest.java | 41 +++++++++++++-
.../membership/gms/fd/GMSHealthMonitor.java | 63 ++++++++++++++--------
.../membership/gms/membership/GMSJoinLeave.java | 1 +
.../membership/gms/messenger/JGroupsMessenger.java | 15 ++++--
.../geode/test/dunit/rules/ClusterStartupRule.java | 2 +-
6 files changed, 95 insertions(+), 29 deletions(-)
diff --git
a/geode-core/src/distributedTest/java/org/apache/geode/internal/cache/PersistentRegionRecoveryDUnitTest.java
b/geode-core/src/distributedTest/java/org/apache/geode/internal/cache/PersistentRegionRecoveryDUnitTest.java
index 01d3f4b..fcd4b8a 100644
---
a/geode-core/src/distributedTest/java/org/apache/geode/internal/cache/PersistentRegionRecoveryDUnitTest.java
+++
b/geode-core/src/distributedTest/java/org/apache/geode/internal/cache/PersistentRegionRecoveryDUnitTest.java
@@ -39,6 +39,7 @@ import
org.apache.geode.distributed.internal.DistributionMessageObserver;
import org.apache.geode.internal.cache.backup.BackupOperation;
import org.apache.geode.internal.logging.LogService;
import org.apache.geode.test.dunit.AsyncInvocation;
+import org.apache.geode.test.dunit.IgnoredException;
import org.apache.geode.test.dunit.VM;
import org.apache.geode.test.dunit.internal.JUnit4DistributedTestCase;
import org.apache.geode.test.dunit.rules.CacheRule;
@@ -73,6 +74,7 @@ public class PersistentRegionRecoveryDUnitTest extends
JUnit4DistributedTestCase
vm0 = getVM(0);
vm1 = getVM(1);
regionName = getClass().getSimpleName() + "-" + testName.getMethodName();
+ IgnoredException.addIgnoredException("Possible loss of quorum");
}
@Test
diff --git
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
index f8f8f7f..dbbd332 100644
---
a/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
+++
b/geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java
@@ -24,6 +24,7 @@ import static
org.apache.geode.distributed.ConfigurationProperties.MCAST_PORT;
import static org.apache.geode.distributed.ConfigurationProperties.MCAST_TTL;
import static
org.apache.geode.distributed.ConfigurationProperties.MEMBER_TIMEOUT;
import static org.apache.geode.test.awaitility.GeodeAwaitility.await;
+import static org.apache.geode.test.awaitility.GeodeAwaitility.getTimeout;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -130,7 +131,7 @@ public class GMSHealthMonitorJUnitTest {
nonDefault.put(MCAST_TTL, "0");
nonDefault.put(LOG_FILE, "");
nonDefault.put(LOG_LEVEL, "fine");
- nonDefault.put(MEMBER_TIMEOUT, "2000");
+ nonDefault.put(MEMBER_TIMEOUT, "" + memberTimeout);
nonDefault.put(LOCATORS, "localhost[10344]");
DistributionManager dm = mock(DistributionManager.class);
SocketCreatorFactory.setDistributionConfig(new DistributionConfigImpl(new
Properties()));
@@ -785,6 +786,44 @@ public class GMSHealthMonitorJUnitTest {
}
@Test
+ public void testTcpCheckMemberTriesUntilTimeout() throws Exception {
+ ServerSocket mySocket = new ServerSocket(0);
+ Thread serverThread = new Thread() {
+ public void run() {
+ long giveupTime = System.currentTimeMillis() + (5 * memberTimeout);
+ while (System.currentTimeMillis() < giveupTime) {
+ try {
+ Socket acceptedSocket = mySocket.accept();
+ try {
+ Thread.sleep(200);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ return;
+ }
+ acceptedSocket.close();
+ } catch (IOException e) {
+ if (!mySocket.isClosed()) {
+ System.err.println("Test failed with unexpected IOException");
+ e.printStackTrace(System.err);
+ }
+ return;
+ }
+ }
+ }
+ };
+ serverThread.setDaemon(true);
+ serverThread.start();
+ InternalDistributedMember otherMember =
+ createInternalDistributedMember(Version.CURRENT_ORDINAL, 0, 1, 1);
+ long startTime = System.currentTimeMillis();
+ gmsHealthMonitor.doTCPCheckMember(otherMember, mySocket.getLocalPort());
+ mySocket.close();
+ serverThread.interrupt();
+ serverThread.join(getTimeout().getValueInMS());
+ assertThat(System.currentTimeMillis()).isGreaterThanOrEqualTo(startTime +
memberTimeout);
+ }
+
+ @Test
public void testDoTCPCheckMemberWithOkStatus() throws Exception {
executeTestDoTCPCheck(GMSHealthMonitor.OK, true);
}
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
index ab72a07..05c0a34 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java
@@ -513,33 +513,50 @@ public class GMSHealthMonitor implements HealthMonitor,
MessageHandler {
*/
boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
Socket clientSocket = null;
- try {
- logger.debug("Checking member {} with TCP socket connection {}:{}.",
suspectMember,
- suspectMember.getInetAddress(), port);
- clientSocket =
-
SocketCreatorFactory.getSocketCreatorForComponent(SecurableCommunicationChannel.CLUSTER)
- .connect(suspectMember.getInetAddress(), port, (int)
memberTimeout,
- new ConnectTimeoutTask(services.getTimer(), memberTimeout),
false, -1, false);
- clientSocket.setTcpNoDelay(true);
- return doTCPCheckMember(suspectMember, clientSocket);
- } catch (IOException e) {
- // this is expected if it is a connection-timeout or other failure
- // to connect
- } catch (IllegalStateException e) {
- if (!isStopping) {
- logger.trace("Unexpected exception", e);
+ // make sure we try to check on the member for the contracted
memberTimeout period
+ // in case a timed socket.connect() returns immediately
+ long giveupTime = System.nanoTime() + TimeUnit.NANOSECONDS.convert(
+ services.getConfig().getMemberTimeout(), TimeUnit.MILLISECONDS);
+ boolean passed = false;
+ int iteration = 0;
+ do {
+ iteration++;
+ if (iteration > 1) {
+ try {
+ Thread.sleep(100);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ return false;
+ }
}
- } finally {
try {
- if (clientSocket != null) {
- clientSocket.setSoLinger(true, 0); // abort the connection
- clientSocket.close();
- }
+ logger.debug("Checking member {} with TCP socket connection {}:{}.",
suspectMember,
+ suspectMember.getInetAddress(), port);
+ clientSocket =
+
SocketCreatorFactory.getSocketCreatorForComponent(SecurableCommunicationChannel.CLUSTER)
+ .connect(suspectMember.getInetAddress(), port, (int)
memberTimeout,
+ new ConnectTimeoutTask(services.getTimer(),
memberTimeout), false, -1, false);
+ clientSocket.setTcpNoDelay(true);
+ passed = doTCPCheckMember(suspectMember, clientSocket);
} catch (IOException e) {
- // expected
+ // this is expected if it is a connection-timeout or other failure
+ // to connect
+ } catch (IllegalStateException | GemFireConfigException e) {
+ if (!isStopping) {
+ logger.trace("Unexpected exception", e);
+ }
+ } finally {
+ try {
+ if (clientSocket != null) {
+ clientSocket.setSoLinger(true, 0); // abort the connection
+ clientSocket.close();
+ }
+ } catch (IOException e) {
+ // expected
+ }
}
- }
- return false;
+ } while (!passed && !this.isShutdown() && System.nanoTime() < giveupTime);
+ return passed;
}
// Package protected for testing purposes
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 5ed5836..a816e78 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -1040,6 +1040,7 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
this.preparedView.getCreator().equals(view.getCreator())) {
// this can happen if we received two prepares during auto-reconnect
} else {
+ // send the conflicting view to the creator of this new view
services.getMessenger()
.send(new ViewAckMessage(view.getViewId(), m.getSender(),
this.preparedView));
}
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index ccca881..9085121 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -330,9 +330,16 @@ public class JGroupsMessenger implements Messenger {
members.add(new UUID(0, 0));// TODO open a JGroups JIRA for GEODE-3034
View jgv = new View(vid, members);
this.myChannel.down(new Event(Event.VIEW_CHANGE, jgv));
- UUID logicalAddress = (UUID) myChannel.getAddress();
- if (logicalAddress instanceof JGAddress) {
- ((JGAddress) logicalAddress).setVmViewId(-1);
+ // attempt to establish a new UUID in the jgroups channel so the
member address will be
+ // different
+ try {
+ Method setAddressMethod =
JChannel.class.getDeclaredMethod("setAddress");
+ setAddressMethod.setAccessible(true);
+ setAddressMethod.invoke(myChannel);
+ } catch (SecurityException | NoSuchMethodException e) {
+ logger.warn("Unable to establish a new JGroups address. "
+ + "My address will be exactly the same as last time.
Exception={}",
+ e.getMessage());
}
reconnecting = true;
} else {
@@ -362,7 +369,7 @@ public class JGroupsMessenger implements Messenger {
jgroupsReceiver = new JGroupsReceiver();
myChannel.setReceiver(jgroupsReceiver);
if (!reconnecting) {
- myChannel.connect("AG"); // apache g***** (whatever we end up calling
it)
+ myChannel.connect("AG"); // Apache Geode
}
} catch (Exception e) {
myChannel.close();
diff --git
a/geode-dunit/src/main/java/org/apache/geode/test/dunit/rules/ClusterStartupRule.java
b/geode-dunit/src/main/java/org/apache/geode/test/dunit/rules/ClusterStartupRule.java
index cd87286..3d329aa 100644
---
a/geode-dunit/src/main/java/org/apache/geode/test/dunit/rules/ClusterStartupRule.java
+++
b/geode-dunit/src/main/java/org/apache/geode/test/dunit/rules/ClusterStartupRule.java
@@ -250,7 +250,7 @@ public class ClusterStartupRule implements
SerializableTestRule {
SerializableFunction<ServerStarterRule> ruleOperator) {
final String defaultName = "server-" + index;
VM serverVM = getVM(index, version);
- Server server = serverVM.invoke(() -> {
+ Server server = serverVM.invoke("startServerVM", () -> {
memberStarter = new ServerStarterRule();
ServerStarterRule serverStarter = (ServerStarterRule) memberStarter;
if (logFile) {