This is an automated email from the ASF dual-hosted git repository.
bschuchardt pushed a commit to branch release/1.4.0
in repository https://gitbox.apache.org/repos/asf/geode.git
The following commit(s) were added to refs/heads/release/1.4.0 by this push:
new 2675247 Squashed commit of the following:
2675247 is described below
commit 267524720d8a94145dc3fdf3a50b6fd161a80004
Author: Bruce Schuchardt <[email protected]>
AuthorDate: Tue Jan 9 08:39:48 2018 -0800
Squashed commit of the following:
commit 70a76592a58379bdd9b53433877b64831fc7432e
Author: Bruce Schuchardt <[email protected]>
Date: Tue Jan 9 08:37:45 2018 -0800
GEODE-3588 2 restarts of Locator results in split brain
removed thread dump in new test
commit 89bf34c39f3df4ed7b16d6c9a256e2d26b9d2267
Author: Bruce Schuchardt <[email protected]>
Date: Mon Jan 8 15:57:09 2018 -0800
GEODE-3588 2 restarts of Locator results in split brain
Udo's fix for GEODE-870 added a new boolean instance variable to
GMSJoinLeave to tell its ViewCreator thread to shut down. This works
but the state was never being reset after its first use. This caused
Subsequent ViewCreator threads to shut down immediately. The only
way to fix this condition without a patch is to restart the coordinator
node.
The patch moves this boolean variable to the ViewCreator thread so that
it is automatically reset when a new ViewCreator is instantiated.
I also did a little code cleanup, moving GMSJoinLeave methods from the
end of the file to where its other methods are located and adding
a setShutdownFlag() method during debugging so I could isolate what
was happening.
Sarge reviewed the changes for me so this closes #1255
(cherry picked from commit 3cf7caab3c3726dfb47e12a900240e377e035594)
---
.../membership/gms/membership/GMSJoinLeave.java | 140 +++++++++++----------
.../membership/gms/messenger/JGroupsMessenger.java | 2 +-
.../apache/geode/distributed/LocatorDUnitTest.java | 50 ++++++++
.../gms/membership/GMSJoinLeaveJUnitTest.java | 4 +-
4 files changed, 130 insertions(+), 66 deletions(-)
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index f90329c..c6b4e1d 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -255,11 +255,6 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
*/
NetView quorumLostView;
- /**
- * a flag to mark a coordinator's viewCreator for shutdown
- */
- private boolean markViewCreatorForShutdown = false;
-
static class SearchState {
Set<InternalDistributedMember> alreadyTried = new HashSet<>();
Set<InternalDistributedMember> registrants = new HashSet<>();
@@ -533,7 +528,7 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
*/
private void processJoinRequest(JoinRequestMessage incomingRequest) {
- logger.info("received join request from {}",
incomingRequest.getMemberID());
+ logger.info("Received a join request from {}",
incomingRequest.getMemberID());
if (!ALLOW_OLD_VERSION_FOR_TESTING
&&
incomingRequest.getMemberID().getVersionObject().compareTo(Version.CURRENT) <
0) {
@@ -570,7 +565,6 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
// services.getMessenger().send(joinResponseMessage);
// return;
// }
- logger.info("Received a join request from " + incomingRequest.getSender());
recordViewRequest(incomingRequest);
}
@@ -930,7 +924,7 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
// if another server is the coordinator - GEODE-870
if (isCoordinator && !localAddress.equals(view.getCoordinator())
&& getViewCreator() != null) {
- markViewCreatorForShutdown = true;
+ getViewCreator().markViewCreatorForShutdown();
this.isCoordinator = false;
}
installView(view);
@@ -1686,6 +1680,50 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
processLeaveRequest(msg);
}
+ boolean checkIfAvailable(InternalDistributedMember fmbr) {
+ // return the member id if it fails health checks
+ logger.info("checking state of member " + fmbr);
+ if (services.getHealthMonitor().checkIfAvailable(fmbr,
+ "Member failed to acknowledge a membership view", false)) {
+ logger.info("member " + fmbr + " passed availability check");
+ return true;
+ }
+ logger.info("member " + fmbr + " failed availability check");
+ return false;
+ }
+
+ private InternalDistributedMember getMemId(NetMember jgId,
+ List<InternalDistributedMember> members) {
+ for (InternalDistributedMember m : members) {
+ if (((GMSMember) m.getNetMember()).equals(jgId)) {
+ return m;
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public InternalDistributedMember getMemberID(NetMember jgId) {
+ NetView v = currentView;
+ InternalDistributedMember ret = null;
+ if (v != null) {
+ ret = getMemId(jgId, v.getMembers());
+ }
+
+ if (ret == null) {
+ v = preparedView;
+ if (v != null) {
+ ret = getMemId(jgId, v.getMembers());
+ }
+ }
+
+ if (ret == null) {
+ return new InternalDistributedMember(jgId);
+ }
+
+ return ret;
+ }
+
@Override
public void disableDisconnectOnQuorumLossForTesting() {
this.quorumRequired = false;
@@ -2004,6 +2042,8 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
volatile boolean testFlagForRemovalRequest = false;
// count of number of views abandoned due to conflicts
volatile int abandonedViews = 0;
+ private boolean markViewCreatorForShutdown = false; // see GEODE-870
+
/**
* initial view to install. guarded by synch on ViewCreator
@@ -2027,7 +2067,7 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
}
void shutdown() {
- shutdown = true;
+ setShutdownFlag();
synchronized (viewRequests) {
viewRequests.notifyAll();
interrupt();
@@ -2097,18 +2137,34 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
try {
sleep(services.getConfig().getMemberTimeout());
} catch (InterruptedException e2) {
- shutdown = true;
+ setShutdownFlag();
retry = false;
}
} catch (InterruptedException e) {
- shutdown = true;
+ setShutdownFlag();
} catch (DistributedSystemDisconnectedException e) {
- shutdown = true;
+ setShutdownFlag();
}
} while (retry);
}
/**
+ * marks this ViewCreator as being shut down. It may be some short amount
of time before the
+ * ViewCreator thread exits.
+ */
+ private void setShutdownFlag() {
+ shutdown = true;
+ }
+
+ /**
+ * This allows GMSJoinLeave to tell the ViewCreator to shut down after
finishing its current
+ * task. See GEODE-870.
+ */
+ private void markViewCreatorForShutdown() {
+ this.markViewCreatorForShutdown = true;
+ }
+
+ /**
* During initial view processing a prepared view was discovered. This
method will extract its
* new members and create a new initial view containing them.
*
@@ -2214,19 +2270,19 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
try {
sleep(services.getConfig().getMemberTimeout());
} catch (InterruptedException e2) {
- shutdown = true;
+ setShutdownFlag();
}
} catch (DistributedSystemDisconnectedException e) {
- shutdown = true;
+ setShutdownFlag();
} catch (InterruptedException e) {
logger.info("View Creator thread interrupted");
- shutdown = true;
+ setShutdownFlag();
}
requests = null;
}
}
} finally {
- shutdown = true;
+ setShutdownFlag();
informToPendingJoinRequests();
org.apache.geode.distributed.internal.membership.gms.interfaces.Locator locator
=
services.getLocator();
@@ -2443,7 +2499,7 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
Set<InternalDistributedMember> crashes =
newView.getActualCrashedMembers(currentView);
forceDisconnect(LocalizedStrings.Network_partition_detected
.toLocalizedString(crashes.size(), crashes));
- shutdown = true;
+ setShutdownFlag();
return;
}
@@ -2497,7 +2553,7 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
// this member may have been kicked out of the conflicting view
if (failures.contains(localAddress)) {
forceDisconnect("I am no longer a member of the distributed
system");
- shutdown = true;
+ setShutdownFlag();
return;
}
List<InternalDistributedMember> newMembers =
conflictingView.getNewMembers();
@@ -2567,13 +2623,13 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
// can be transmitted to the new members w/o including it in the view
message
if (markViewCreatorForShutdown && getViewCreator() != null) {
- shutdown = true;
+ setShutdownFlag();
}
// after sending a final view we need to stop this thread if
// the GMS is shutting down
if (isStopping()) {
- shutdown = true;
+ setShutdownFlag();
}
}
@@ -2704,54 +2760,12 @@ public class GMSJoinLeave implements JoinLeave,
MessageHandler {
return result;
}
- boolean getTestFlageForRemovalRequest() {
+ boolean getTestFlagForRemovalRequest() {
return testFlagForRemovalRequest;
}
- }
- boolean checkIfAvailable(InternalDistributedMember fmbr) {
- // return the member id if it fails health checks
- logger.info("checking state of member " + fmbr);
- if (services.getHealthMonitor().checkIfAvailable(fmbr,
- "Member failed to acknowledge a membership view", false)) {
- logger.info("member " + fmbr + " passed availability check");
- return true;
- }
- logger.info("member " + fmbr + " failed availability check");
- return false;
- }
-
- private InternalDistributedMember getMemId(NetMember jgId,
- List<InternalDistributedMember> members) {
- for (InternalDistributedMember m : members) {
- if (((GMSMember) m.getNetMember()).equals(jgId)) {
- return m;
- }
- }
- return null;
}
- @Override
- public InternalDistributedMember getMemberID(NetMember jgId) {
- NetView v = currentView;
- InternalDistributedMember ret = null;
- if (v != null) {
- ret = getMemId(jgId, v.getMembers());
- }
-
- if (ret == null) {
- v = preparedView;
- if (v != null) {
- ret = getMemId(jgId, v.getMembers());
- }
- }
-
- if (ret == null) {
- return new InternalDistributedMember(jgId);
- }
-
- return ret;
- }
static class ViewAbandonedException extends Exception {
}
diff --git
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
index 46142c4..d074940 100644
---
a/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
+++
b/geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/messenger/JGroupsMessenger.java
@@ -409,7 +409,7 @@ public class JGroupsMessenger implements Messenger {
mbrs.addAll(v.getMembers().stream().map(JGAddress::new).collect(Collectors.toList()));
ViewId vid = new ViewId(new JGAddress(v.getCoordinator()), v.getViewId());
View jgv = new View(vid, new ArrayList<>(mbrs));
- logger.trace("installing JGroups view: {}", jgv);
+ logger.trace("installing view into JGroups stack: {}", jgv);
this.myChannel.down(new Event(Event.VIEW_CHANGE, jgv));
addressesWithIoExceptionsProcessed.clear();
diff --git
a/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
b/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
index 446385d..577e089 100644
---
a/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++
b/geode-core/src/test/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -79,6 +79,7 @@ import
org.apache.geode.distributed.internal.membership.gms.membership.GMSJoinLe
import org.apache.geode.internal.Assert;
import org.apache.geode.internal.AvailablePort;
import org.apache.geode.internal.AvailablePortHelper;
+import org.apache.geode.internal.OSProcess;
import org.apache.geode.internal.cache.GemFireCacheImpl;
import org.apache.geode.internal.logging.InternalLogWriter;
import org.apache.geode.internal.logging.LocalLogWriter;
@@ -2022,6 +2023,55 @@ public class LocatorDUnitTest extends
JUnit4DistributedTestCase {
}
/**
+ * See GEODE-3588 - a locator is restarted twice with a server and ends up
in a split-brain
+ */
+ @Test
+ public void testRestartLocatorMultipleTimes() throws Exception {
+ disconnectAllFromDS();
+ port1 = AvailablePort.getRandomAvailablePort(AvailablePort.SOCKET);
+ DistributedTestUtils.deleteLocatorStateFile(port1);
+ File logFile = new File("");
+ File stateFile = new File("locator" + port1 + "state.dat");
+ VM vm0 = Host.getHost(0).getVM(0);
+ final Properties p = new Properties();
+ p.setProperty(LOCATORS, Host.getHost(0).getHostName() + "[" + port1 + "]");
+ p.setProperty(MCAST_PORT, "0");
+ p.setProperty(ENABLE_CLUSTER_CONFIGURATION, "false");
+ p.setProperty(LOG_LEVEL, "finest");
+ addDSProps(p);
+ if (stateFile.exists()) {
+ stateFile.delete();
+ }
+
+ Locator locator = Locator.startLocatorAndDS(port1, logFile, p);
+
+ vm0.invoke(() -> {
+ DistributedSystem.connect(p);
+ return null;
+ });
+
+ try {
+ locator.stop();
+ locator = Locator.startLocatorAndDS(port1, logFile, p);
+ assertEquals(2, ((InternalDistributedSystem)
locator.getDistributedSystem()).getDM()
+ .getViewMembers().size());
+
+ locator.stop();
+ locator = Locator.startLocatorAndDS(port1, logFile, p);
+ assertEquals(2, ((InternalDistributedSystem)
locator.getDistributedSystem()).getDM()
+ .getViewMembers().size());
+
+ } finally {
+ vm0.invoke("disconnect", () -> {
+ DistributedSystem.connect(p).disconnect();
+ return null;
+ });
+ locator.stop();
+ }
+
+ }
+
+ /**
* return the distributed member id for the ds on this vm
*/
public static DistributedMember getDistributedMember(Properties props) {
diff --git
a/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
b/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index 7389465..1b6cdec 100644
---
a/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++
b/geode-core/src/test/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -1331,7 +1331,7 @@ public class GMSJoinLeaveJUnitTest {
this.removeMember = null;
assertTrue("testFlagForRemovalRequest should be true",
- gmsJoinLeave.getViewCreator().getTestFlageForRemovalRequest());
+ gmsJoinLeave.getViewCreator().getTestFlagForRemovalRequest());
}
@Test
@@ -1354,7 +1354,7 @@ public class GMSJoinLeaveJUnitTest {
this.leaveMember = null;
assertTrue("testFlagForRemovalRequest should be true",
- gmsJoinLeave.getViewCreator().getTestFlageForRemovalRequest());
+ gmsJoinLeave.getViewCreator().getTestFlagForRemovalRequest());
}
private void installView() throws Exception {
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].