Author: fpj Date: Mon Oct 21 21:36:55 2013 New Revision: 1534390 URL: http://svn.apache.org/r1534390 Log: ZOOKEEPER-1732. ZooKeeper server unable to join established ensemble (German Blanco via fpj)
Modified: zookeeper/trunk/CHANGES.txt zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Learner.java zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java zookeeper/trunk/src/java/test/org/apache/zookeeper/test/FLETest.java Modified: zookeeper/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/zookeeper/trunk/CHANGES.txt?rev=1534390&r1=1534389&r2=1534390&view=diff ============================================================================== --- zookeeper/trunk/CHANGES.txt (original) +++ zookeeper/trunk/CHANGES.txt Mon Oct 21 21:36:55 2013 @@ -450,6 +450,9 @@ BUGFIXES: ZOOKEEPER-1646. mt c client tests fail on Ubuntu Raring (phunt) + ZOOKEEPER-1732. ZooKeeper server unable to join established + ensemble (German Blanco via fpj) + IMPROVEMENTS: ZOOKEEPER-1170. Fix compiler (eclipse) warnings: unused imports, Modified: zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java URL: http://svn.apache.org/viewvc/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java?rev=1534390&r1=1534389&r2=1534390&view=diff ============================================================================== --- zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java (original) +++ zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java Mon Oct 21 21:36:55 2013 @@ -67,6 +67,14 @@ public class FastLeaderElection implemen */ final static int maxNotificationInterval = 60000; + + /** + * This value is passed to the methods that check the quorum + * majority of an established ensemble for those values that + * should not be taken into account in the comparison + * (electionEpoch and zxid). + */ + final static int IGNOREVALUE = -1; /** * Connection manager. Fast leader election uses TCP for @@ -382,7 +390,7 @@ public class FastLeaderElection implemen ToSend.mType.notification, current.getId(), current.getZxid(), - logicalclock, + current.getElectionEpoch(), self.getPeerState(), response.sid, current.getPeerEpoch(), @@ -919,15 +927,25 @@ public class FastLeaderElection implemen } } - /** + /* * Before joining an established ensemble, verify that * a majority are following the same leader. + * Only peer epoch is used to check that the votes come + * from the same ensemble. This is because there is at + * least one corner case in which the ensemble can be + * created with inconsistent zxid and election epoch + * info. However, given that only one ensemble can be + * running at a single point in time and that each + * epoch is used only once, using only the epoch to + * compare the votes is sufficient. + * + * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 */ - outofelection.put(n.sid, new Vote(n.leader, n.zxid, - n.electionEpoch, n.peerEpoch, n.state)); + outofelection.put(n.sid, new Vote(n.leader, + IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state)); if (termPredicate(outofelection, new Vote(n.leader, - n.zxid, n.electionEpoch, n.peerEpoch, n.state)) - && checkLeader(outofelection, n.leader, n.electionEpoch)) { + IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state)) + && checkLeader(outofelection, n.leader, IGNOREVALUE)) { synchronized(this){ logicalclock = n.electionEpoch; self.setPeerState((n.leader == self.getId()) ? Modified: zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java URL: http://svn.apache.org/viewvc/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java?rev=1534390&r1=1534389&r2=1534390&view=diff ============================================================================== --- zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java (original) +++ zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Leader.java Mon Oct 21 21:36:55 2013 @@ -1242,6 +1242,15 @@ public class Leader { } zk.startup(); + /* + * Update the election vote here to ensure that all members of the + * ensemble report the same vote to new servers that start up and + * send leader election notifications to the ensemble. + * + * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 + */ + self.updateElectionVote(getEpoch()); + zk.getZKDatabase().setlastProcessedZxid(zk.getZxid()); } Modified: zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Learner.java URL: http://svn.apache.org/viewvc/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Learner.java?rev=1534390&r1=1534389&r2=1534390&view=diff ============================================================================== --- zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Learner.java (original) +++ zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/Learner.java Mon Oct 21 21:36:55 2013 @@ -495,6 +495,15 @@ public class Learner { writePacket(ack, true); sock.setSoTimeout(self.tickTime * self.syncLimit); zk.startup(); + /* + * Update the election vote here to ensure that all members of the + * ensemble report the same vote to new servers that start up and + * send leader election notifications to the ensemble. + * + * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 + */ + self.updateElectionVote(newEpoch); + // We need to log the stuff that came in between the snapshot and the uptodate if (zk instanceof FollowerZooKeeperServer) { FollowerZooKeeperServer fzk = (FollowerZooKeeperServer)zk; Modified: zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java URL: http://svn.apache.org/viewvc/zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java?rev=1534390&r1=1534389&r2=1534390&view=diff ============================================================================== --- zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java (original) +++ zookeeper/trunk/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java Mon Oct 21 21:36:55 2013 @@ -1605,4 +1605,22 @@ public class QuorumPeer extends Thread i } return false; } + + /** + * Updates leader election info to avoid inconsistencies when + * a new server tries to join the ensemble. + * + * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 + */ + protected void updateElectionVote(long newEpoch) { + Vote currentVote = getCurrentVote(); + if (currentVote != null) { + setCurrentVote(new Vote(currentVote.getId(), + currentVote.getZxid(), + currentVote.getElectionEpoch(), + newEpoch, + currentVote.getState())); + } + } + } Modified: zookeeper/trunk/src/java/test/org/apache/zookeeper/test/FLETest.java URL: http://svn.apache.org/viewvc/zookeeper/trunk/src/java/test/org/apache/zookeeper/test/FLETest.java?rev=1534390&r1=1534389&r2=1534390&view=diff ============================================================================== --- zookeeper/trunk/src/java/test/org/apache/zookeeper/test/FLETest.java (original) +++ zookeeper/trunk/src/java/test/org/apache/zookeeper/test/FLETest.java Mon Oct 21 21:36:55 2013 @@ -454,4 +454,67 @@ public class FLETest extends ZKTestCase } } } + + /* + * For ZOOKEEPER-1732 verify that it is possible to join an ensemble with + * inconsistent election round information. + */ + @Test + public void testJoinInconsistentEnsemble() throws Exception { + int sid; + QuorumPeer peer; + int waitTime = 10 * 1000; + ArrayList<QuorumPeer> peerList = new ArrayList<QuorumPeer>(); + for(sid = 0; sid < 3; sid++) { + peers.put(Long.valueOf(sid), + new QuorumServer(sid, + new InetSocketAddress(PortAssignment.unique()), + new InetSocketAddress(PortAssignment.unique()))); + tmpdir[sid] = ClientBase.createTmpDir(); + port[sid] = PortAssignment.unique(); + } + // start 2 peers and verify if they form the cluster + for (sid = 0; sid < 2; sid++) { + peer = new QuorumPeer(peers, tmpdir[sid], tmpdir[sid], + port[sid], 3, sid, 2000, 2, 2); + LOG.info("Starting peer " + peer.getId()); + peer.start(); + peerList.add(sid, peer); + } + peer = peerList.get(0); + VerifyState v1 = new VerifyState(peerList.get(0)); + v1.start(); + v1.join(waitTime); + Assert.assertFalse("Unable to form cluster in " + + waitTime + " ms", + !v1.isSuccess()); + // Change the election round for one of the members of the ensemble + long leaderSid = peer.getCurrentVote().getId(); + long zxid = peer.getCurrentVote().getZxid(); + long electionEpoch = peer.getCurrentVote().getElectionEpoch(); + ServerState state = peer.getCurrentVote().getState(); + long peerEpoch = peer.getCurrentVote().getPeerEpoch(); + Vote newVote = new Vote(leaderSid, zxid+100, electionEpoch+100, peerEpoch, state); + peer.setCurrentVote(newVote); + // Start 3rd peer and check if it joins the quorum + peer = new QuorumPeer(peers, tmpdir[2], tmpdir[2], + port[2], 3, 2, 2000, 2, 2); + LOG.info("Starting peer " + peer.getId()); + peer.start(); + peerList.add(sid, peer); + v1 = new VerifyState(peer); + v1.start(); + v1.join(waitTime); + if (v1.isAlive()) { + Assert.fail("Peer " + peer.getId() + " failed to join the cluster " + + "within " + waitTime + " ms"); + } + // cleanup + for (int id = 0; id < 3; id++) { + peer = peerList.get(id); + if (peer != null) { + peer.shutdown(); + } + } + } }