Author: fpj Date: Mon Oct 21 21:35:12 2013 New Revision: 1534388 URL: http://svn.apache.org/r1534388 Log: ZOOKEEPER-1732. ZooKeeper server unable to join established ensemble (German Blanco via fpj)
Modified: zookeeper/branches/branch-3.4/CHANGES.txt zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java Modified: zookeeper/branches/branch-3.4/CHANGES.txt URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/CHANGES.txt?rev=1534388&r1=1534387&r2=1534388&view=diff ============================================================================== --- zookeeper/branches/branch-3.4/CHANGES.txt (original) +++ zookeeper/branches/branch-3.4/CHANGES.txt Mon Oct 21 21:35:12 2013 @@ -137,6 +137,8 @@ BUGFIXES: ZOOKEEPER-1558. Leader should not snapshot uncommitted state (fpj) + ZOOKEEPER-1732. ZooKeeper server unable to join established ensemble (German Blanco via fpj) + IMPROVEMENTS: ZOOKEEPER-1564. Allow JUnit test build with IBM Java Modified: zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java?rev=1534388&r1=1534387&r2=1534388&view=diff ============================================================================== --- zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java (original) +++ zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java Mon Oct 21 21:35:12 2013 @@ -66,6 +66,14 @@ public class FastLeaderElection implemen */ final static int maxNotificationInterval = 60000; + + /** + * This value is passed to the methods that check the quorum + * majority of an established ensemble for those values that + * should not be taken into account in the comparison + * (electionEpoch and zxid). + */ + final static int IGNOREVALUE = -1; /** * Connection manager. Fast leader election uses TCP for @@ -324,7 +332,7 @@ public class FastLeaderElection implemen ToSend.mType.notification, current.getId(), current.getZxid(), - logicalclock, + current.getElectionEpoch(), self.getPeerState(), response.sid, current.getPeerEpoch()); @@ -867,15 +875,25 @@ public class FastLeaderElection implemen } } - /** + /* * Before joining an established ensemble, verify that * a majority are following the same leader. + * Only peer epoch is used to check that the votes come + * from the same ensemble. This is because there is at + * least one corner case in which the ensemble can be + * created with inconsistent zxid and election epoch + * info. However, given that only one ensemble can be + * running at a single point in time and that each + * epoch is used only once, using only the epoch to + * compare the votes is sufficient. + * + * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 */ - outofelection.put(n.sid, new Vote(n.leader, n.zxid, - n.electionEpoch, n.peerEpoch, n.state)); + outofelection.put(n.sid, new Vote(n.leader, + IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state)); if (termPredicate(outofelection, new Vote(n.leader, - n.zxid, n.electionEpoch, n.peerEpoch, n.state)) - && checkLeader(outofelection, n.leader, n.electionEpoch)) { + IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state)) + && checkLeader(outofelection, n.leader, IGNOREVALUE)) { synchronized(this){ logicalclock = n.electionEpoch; self.setPeerState((n.leader == self.getId()) ? Modified: zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java?rev=1534388&r1=1534387&r2=1534388&view=diff ============================================================================== --- zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java (original) +++ zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Leader.java Mon Oct 21 21:35:12 2013 @@ -945,6 +945,15 @@ public class Leader { + " ]; starting up and setting last processed zxid: 0x{}", Long.toHexString(zk.getZxid())); zk.startup(); + /* + * Update the election vote here to ensure that all members of the + * ensemble report the same vote to new servers that start up and + * send leader election notifications to the ensemble. + * + * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 + */ + self.updateElectionVote(getEpoch()); + zk.getZKDatabase().setlastProcessedZxid(zk.getZxid()); } Modified: zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java?rev=1534388&r1=1534387&r2=1534388&view=diff ============================================================================== --- zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java (original) +++ zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/Learner.java Mon Oct 21 21:35:12 2013 @@ -435,6 +435,15 @@ public class Learner { writePacket(ack, true); sock.setSoTimeout(self.tickTime * self.syncLimit); zk.startup(); + /* + * Update the election vote here to ensure that all members of the + * ensemble report the same vote to new servers that start up and + * send leader election notifications to the ensemble. + * + * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732 + */ + self.updateElectionVote(newEpoch); + // We need to log the stuff that came in between the snapshot and the uptodate if (zk instanceof FollowerZooKeeperServer) { FollowerZooKeeperServer fzk = (FollowerZooKeeperServer)zk; Modified: zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java?rev=1534388&r1=1534387&r2=1534388&view=diff ============================================================================== --- zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java (original) +++ zookeeper/branches/branch-3.4/src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java Mon Oct 21 21:35:12 2013 @@ -1194,4 +1194,21 @@ public class QuorumPeer extends Thread i acceptedEpoch = e; writeLongToFile(ACCEPTED_EPOCH_FILENAME, e); } + + /** + * Updates leader election info to avoid inconsistencies when + * a new server tries to join the ensemble. + * See ZOOKEEPER-1732 for more info. + */ + protected void updateElectionVote(long newEpoch) { + Vote currentVote = getCurrentVote(); + if (currentVote != null) { + setCurrentVote(new Vote(currentVote.getId(), + currentVote.getZxid(), + currentVote.getElectionEpoch(), + newEpoch, + currentVote.getState())); + } + } + } Modified: zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java URL: http://svn.apache.org/viewvc/zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java?rev=1534388&r1=1534387&r2=1534388&view=diff ============================================================================== --- zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java (original) +++ zookeeper/branches/branch-3.4/src/java/test/org/apache/zookeeper/test/FLETest.java Mon Oct 21 21:35:12 2013 @@ -403,4 +403,67 @@ public class FLETest extends ZKTestCase } } } + + /* + * For ZOOKEEPER-1732 verify that it is possible to join an ensemble with + * inconsistent election round information. + */ + @Test + public void testJoinInconsistentEnsemble() throws Exception { + int sid; + QuorumPeer peer; + int waitTime = 10 * 1000; + ArrayList<QuorumPeer> peerList = new ArrayList<QuorumPeer>(); + for(sid = 0; sid < 3; sid++) { + peers.put(Long.valueOf(sid), + new QuorumServer(sid, + new InetSocketAddress(PortAssignment.unique()), + new InetSocketAddress(PortAssignment.unique()))); + tmpdir[sid] = ClientBase.createTmpDir(); + port[sid] = PortAssignment.unique(); + } + // start 2 peers and verify if they form the cluster + for (sid = 0; sid < 2; sid++) { + peer = new QuorumPeer(peers, tmpdir[sid], tmpdir[sid], + port[sid], 3, sid, 2000, 2, 2); + LOG.info("Starting peer " + peer.getId()); + peer.start(); + peerList.add(sid, peer); + } + peer = peerList.get(0); + VerifyState v1 = new VerifyState(peerList.get(0)); + v1.start(); + v1.join(waitTime); + Assert.assertFalse("Unable to form cluster in " + + waitTime + " ms", + !v1.isSuccess()); + // Change the election round for one of the members of the ensemble + long leaderSid = peer.getCurrentVote().getId(); + long zxid = peer.getCurrentVote().getZxid(); + long electionEpoch = peer.getCurrentVote().getElectionEpoch(); + ServerState state = peer.getCurrentVote().getState(); + long peerEpoch = peer.getCurrentVote().getPeerEpoch(); + Vote newVote = new Vote(leaderSid, zxid+100, electionEpoch+100, peerEpoch, state); + peer.setCurrentVote(newVote); + // Start 3rd peer and check if it joins the quorum + peer = new QuorumPeer(peers, tmpdir[2], tmpdir[2], + port[2], 3, 2, 2000, 2, 2); + LOG.info("Starting peer " + peer.getId()); + peer.start(); + peerList.add(sid, peer); + v1 = new VerifyState(peer); + v1.start(); + v1.join(waitTime); + if (v1.isAlive()) { + Assert.fail("Peer " + peer.getId() + " failed to join the cluster " + + "within " + waitTime + " ms"); + } + // cleanup + for (int id = 0; id < 3; id++) { + peer = peerList.get(id); + if (peer != null) { + peer.shutdown(); + } + } + } }