[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user asfgit closed the pull request at: https://github.com/apache/zookeeper/pull/453 ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r169662234 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +923,103 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testFailedTxnAsPartOfQuorumLoss() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +servers = LaunchServers(SERVER_COUNT); + +waitForAll(servers, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +servers.shutDownAllServers(); +waitForAll(servers, States.CONNECTING); +servers.restartAllServersAndClients(this); +waitForAll(servers, States.CONNECTED); + +// 2. kill all followers +int leader = servers.findLeader(); +Map outstanding = servers.mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +servers.mt[leader].main.quorumPeer.tickTime = 1; +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +servers.mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +servers.mt[i].start(); +} +} + +// 4. wait one of the follower to be the new leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +servers.restartClient(i, this); +waitForOne(servers.zk[i], States.CONNECTED); +} +} + +// 5. send a create request to old leader and make sure it's synced to disk, +//which means it acked from itself +try { +servers.zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertEquals(1, outstanding.size()); +Proposal p = outstanding.values().iterator().next(); +Assert.assertEquals(OpCode.create, p.request.getHdr().getType()); + +// make sure it has a chance to write it to disk +int sleepTime = 0; +Long longLeader = new Long(leader); +while (!p.qvAcksetPairs.get(0).getAckset().contains(longLeader)) { +if (sleepTime > 2000) { +Assert.fail("Transaction not synced to disk within 1 second " + p.qvAcksetPairs.get(0).getAckset() ++ " expected " + leader); +} +Thread.sleep(100); +sleepTime += 100; +} + +// 6. wait for the leader to quit due to not enough followers and come back up as a part of the new quorum +sleepTime = 0; +Follower f = servers.mt[leader].main.quorumPeer.follower; +while (f == null || !f.isRunning()) { +if (sleepTime > 10_000) { +Assert.fail("Took too long for old leader to time out " + servers.mt[leader].main.quorumPeer.getPeerState()); +} +Thread.sleep(100); +sleepTime += 100; +f = servers.mt[leader].main.quorumPeer.follower; +} +servers.mt[leader].shutdown(); --- End diff -- It is a lot of very specific steps that make the data inconsistency show up. This is needed to force the transaction log to be replayed which has an edit in it that wasn't considered as a part of leader election. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168884569 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -465,6 +470,37 @@ private void waitForAll(ZooKeeper[] zks, States state) throws InterruptedExcepti private static class Servers { MainThread mt[]; ZooKeeper zk[]; +int[] clientPorts; + +public void shutDownAllServers() throws InterruptedException { +for (MainThread t: mt) { +t.shutdown(); +} +} + +public void restartAllServersAndClients(Watcher watcher) throws IOException { +for (MainThread t : mt) { +if (!t.isAlive()) { +t.start(); +} +} +for (int i = 0; i < zk.length; i++) { +restartClient(i, watcher); +} +} + +public void restartClient(int i, Watcher watcher) throws IOException { --- End diff -- annoying nitpick: let's use a better argument name than `i` ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168886064 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +923,103 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testFailedTxnAsPartOfQuorumLoss() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +servers = LaunchServers(SERVER_COUNT); + +waitForAll(servers, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +servers.shutDownAllServers(); +waitForAll(servers, States.CONNECTING); +servers.restartAllServersAndClients(this); +waitForAll(servers, States.CONNECTED); + +// 2. kill all followers +int leader = servers.findLeader(); +Map outstanding = servers.mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +servers.mt[leader].main.quorumPeer.tickTime = 1; +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +servers.mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +servers.mt[i].start(); +} +} + +// 4. wait one of the follower to be the new leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +servers.restartClient(i, this); +waitForOne(servers.zk[i], States.CONNECTED); +} +} + +// 5. send a create request to old leader and make sure it's synced to disk, +//which means it acked from itself +try { +servers.zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertEquals(1, outstanding.size()); +Proposal p = outstanding.values().iterator().next(); +Assert.assertEquals(OpCode.create, p.request.getHdr().getType()); + +// make sure it has a chance to write it to disk +int sleepTime = 0; +Long longLeader = new Long(leader); +while (!p.qvAcksetPairs.get(0).getAckset().contains(longLeader)) { +if (sleepTime > 2000) { +Assert.fail("Transaction not synced to disk within 1 second " + p.qvAcksetPairs.get(0).getAckset() ++ " expected " + leader); +} +Thread.sleep(100); +sleepTime += 100; +} + +// 6. wait for the leader to quit due to not enough followers and come back up as a part of the new quorum +sleepTime = 0; +Follower f = servers.mt[leader].main.quorumPeer.follower; +while (f == null || !f.isRunning()) { +if (sleepTime > 10_000) { --- End diff -- nitpick: can we reuse the ticktime here to make the relationship more obvious? ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168887935 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +923,103 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testFailedTxnAsPartOfQuorumLoss() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +servers = LaunchServers(SERVER_COUNT); + +waitForAll(servers, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +servers.shutDownAllServers(); +waitForAll(servers, States.CONNECTING); +servers.restartAllServersAndClients(this); +waitForAll(servers, States.CONNECTED); + +// 2. kill all followers +int leader = servers.findLeader(); +Map outstanding = servers.mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +servers.mt[leader].main.quorumPeer.tickTime = 1; +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +servers.mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +servers.mt[i].start(); +} +} + +// 4. wait one of the follower to be the new leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +servers.restartClient(i, this); +waitForOne(servers.zk[i], States.CONNECTED); +} +} + +// 5. send a create request to old leader and make sure it's synced to disk, +//which means it acked from itself +try { +servers.zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertEquals(1, outstanding.size()); +Proposal p = outstanding.values().iterator().next(); +Assert.assertEquals(OpCode.create, p.request.getHdr().getType()); + +// make sure it has a chance to write it to disk +int sleepTime = 0; +Long longLeader = new Long(leader); +while (!p.qvAcksetPairs.get(0).getAckset().contains(longLeader)) { +if (sleepTime > 2000) { +Assert.fail("Transaction not synced to disk within 1 second " + p.qvAcksetPairs.get(0).getAckset() ++ " expected " + leader); +} +Thread.sleep(100); +sleepTime += 100; +} + +// 6. wait for the leader to quit due to not enough followers and come back up as a part of the new quorum +sleepTime = 0; +Follower f = servers.mt[leader].main.quorumPeer.follower; +while (f == null || !f.isRunning()) { +if (sleepTime > 10_000) { +Assert.fail("Took too long for old leader to time out " + servers.mt[leader].main.quorumPeer.getPeerState()); +} +Thread.sleep(100); +sleepTime += 100; +f = servers.mt[leader].main.quorumPeer.follower; +} +servers.mt[leader].shutdown(); --- End diff -- why do we need this? ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168884819 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -465,6 +470,37 @@ private void waitForAll(ZooKeeper[] zks, States state) throws InterruptedExcepti private static class Servers { MainThread mt[]; ZooKeeper zk[]; +int[] clientPorts; + +public void shutDownAllServers() throws InterruptedException { +for (MainThread t: mt) { +t.shutdown(); +} +} + +public void restartAllServersAndClients(Watcher watcher) throws IOException { +for (MainThread t : mt) { +if (!t.isAlive()) { +t.start(); +} +} +for (int i = 0; i < zk.length; i++) { +restartClient(i, watcher); +} +} + +public void restartClient(int i, Watcher watcher) throws IOException { +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, watcher); +} + +public int findLeader() { --- End diff -- there are other places in this test class that benefit from this refactoring. Would you mind cleaning that up? ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168857757 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); +mt[i].start(); +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].shutdown(); +} + +waitForAll(zk, States.CONNECTING); + +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].start(); +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// 2. kill all followers +int leader = -1; +Map outstanding = null; +for (int i = 0; i < SERVER_COUNT; i++) { +if (mt[i].main.quorumPeer.leader != null) { +leader = i; +outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +mt[leader].main.quorumPeer.tickTime = 1; +} +} +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].start(); +} +} + +// 4. wait one of the follower to be the leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +waitForOne(zk[i], States.CONNECTED); +} +} + +// 5. send a create request to leader and make sure it's synced to disk, +//which means it acked from itself +try { +zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertTrue(outstanding.size() == 1); +Proposal p = (Proposal) outstanding.values().iterator().next(); +Assert.assertTrue(p.request.getHdr().getType() == OpCode.create); + +// make sure it has a chance to write it to disk +Thread.sleep(1000); --- End diff -- @revans2 take a look at `testElectionFraud`, specifically: https://github.com/apache/zookeeper/blob/master/src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java#L383 and https://github.com/apache/zookeeper/blob/master/src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java#L397 You can manually start and stop leader election, I think that may solve this problem. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168857052 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -435,7 +435,7 @@ private void waitForOne(ZooKeeper zk, States state) throws InterruptedException int iterations = ClientBase.CONNECTION_TIMEOUT / 500; while (zk.getState() != state) { if (iterations-- == 0) { -throw new RuntimeException("Waiting too long"); +throw new RuntimeException("Waiting too long " + zk.getState() + " != " + state); --- End diff -- Since @anmolnar thinks it is valuable, I think it is fine for it to be left in. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168807943 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); +mt[i].start(); +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].shutdown(); +} + +waitForAll(zk, States.CONNECTING); + +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].start(); +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// 2. kill all followers +int leader = -1; +Map outstanding = null; +for (int i = 0; i < SERVER_COUNT; i++) { +if (mt[i].main.quorumPeer.leader != null) { +leader = i; +outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +mt[leader].main.quorumPeer.tickTime = 1; +} +} +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].start(); +} +} + +// 4. wait one of the follower to be the leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +waitForOne(zk[i], States.CONNECTED); +} +} + +// 5. send a create request to leader and make sure it's synced to disk, +//which means it acked from itself +try { +zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertTrue(outstanding.size() == 1); +Proposal p = (Proposal) outstanding.values().iterator().next(); +Assert.assertTrue(p.request.getHdr().getType() == OpCode.create); + +// make sure it has a chance to write it to disk +Thread.sleep(1000); +p.qvAcksetPairs.get(0).getAckset().contains(leader); + +// 6. wait the leader to quit due to no enough followers +Thread.sleep(4000); +//waitForOne(zk[leader], States.CONNECTING); --- End diff -- done ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168807976 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { --- End diff -- done ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168807914 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); +mt[i].start(); +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].shutdown(); +} + +waitForAll(zk, States.CONNECTING); + +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].start(); +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// 2. kill all followers +int leader = -1; +Map outstanding = null; +for (int i = 0; i < SERVER_COUNT; i++) { +if (mt[i].main.quorumPeer.leader != null) { +leader = i; +outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +mt[leader].main.quorumPeer.tickTime = 1; +} +} +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].start(); +} +} + +// 4. wait one of the follower to be the leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +waitForOne(zk[i], States.CONNECTED); +} +} + +// 5. send a create request to leader and make sure it's synced to disk, +//which means it acked from itself +try { +zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertTrue(outstanding.size() == 1); +Proposal p = (Proposal) outstanding.values().iterator().next(); --- End diff -- removed the cast ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168807853 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); +mt[i].start(); +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].shutdown(); +} + +waitForAll(zk, States.CONNECTING); + +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].start(); +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// 2. kill all followers +int leader = -1; +Map outstanding = null; +for (int i = 0; i < SERVER_COUNT; i++) { +if (mt[i].main.quorumPeer.leader != null) { +leader = i; +outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +mt[leader].main.quorumPeer.tickTime = 1; +} +} +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].start(); +} +} + +// 4. wait one of the follower to be the leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +waitForOne(zk[i], States.CONNECTED); +} +} + +// 5. send a create request to leader and make sure it's synced to disk, +//which means it acked from itself +try { +zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertTrue(outstanding.size() == 1); +Proposal p = (Proposal) outstanding.values().iterator().next(); +Assert.assertTrue(p.request.getHdr().getType() == OpCode.create); + +// make sure it has a chance to write it to disk +Thread.sleep(1000); --- End diff -- I was able to do what you said and drop the 1 second sleep, but the sleep at step 6 I am going to need something else because the leader is only in the looking state for 2 ms. Leader election happens way too fast for us to be able to catch that by polling. If I remove the 4 second sleep it does not trigger the error case, I don't completely know why. I'll spend some time looking at it, but if you have
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168795646 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -435,7 +435,7 @@ private void waitForOne(ZooKeeper zk, States state) throws InterruptedException int iterations = ClientBase.CONNECTION_TIMEOUT / 500; while (zk.getState() != state) { if (iterations-- == 0) { -throw new RuntimeException("Waiting too long"); +throw new RuntimeException("Waiting too long " + zk.getState() + " != " + state); --- End diff -- @anmolnar and @afine I put this in for my own debugging and I forgot to remove it. If you want me to I am happy to either remove it or file a separate JIRA and put it up as a separate pull request, or just leave it. Either way is fine with me. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user anmolnar commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168795633 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); --- End diff -- Use `LaunchServers(numServers, tickTime)` method in this class. It gives you a collection of `MainThread` and `ZooKeeper` objects properly initialized. Test `tearDown()` will care about destroying it. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168794042 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); +mt[i].start(); +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].shutdown(); +} + +waitForAll(zk, States.CONNECTING); + +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].start(); +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// 2. kill all followers +int leader = -1; +Map outstanding = null; +for (int i = 0; i < SERVER_COUNT; i++) { +if (mt[i].main.quorumPeer.leader != null) { +leader = i; +outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +mt[leader].main.quorumPeer.tickTime = 1; +} +} +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].start(); +} +} + +// 4. wait one of the follower to be the leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +waitForOne(zk[i], States.CONNECTED); +} +} + +// 5. send a create request to leader and make sure it's synced to disk, +//which means it acked from itself +try { +zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertTrue(outstanding.size() == 1); +Proposal p = (Proposal) outstanding.values().iterator().next(); +Assert.assertTrue(p.request.getHdr().getType() == OpCode.create); + +// make sure it has a chance to write it to disk +Thread.sleep(1000); --- End diff -- I will see if I can make it work. I agree I would love to kill as many of the sleeps as possible. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168793764 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); --- End diff -- I am not super familiar with the test infrastructure. If you have a suggestion I would love it, otherwise I will look around and see what I can come up with. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user anmolnar commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168793569 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); --- End diff -- +1 As mentioned testElectionFraud() is a good example for that. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user anmolnar commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168793211 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -435,7 +435,7 @@ private void waitForOne(ZooKeeper zk, States state) throws InterruptedException int iterations = ClientBase.CONNECTION_TIMEOUT / 500; while (zk.getState() != state) { if (iterations-- == 0) { -throw new RuntimeException("Waiting too long"); +throw new RuntimeException("Waiting too long " + zk.getState() + " != " + state); --- End diff -- Although I agree with you in general, I think this one here is a good addition to test output. ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168651275 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); +mt[i].start(); +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].shutdown(); +} + +waitForAll(zk, States.CONNECTING); + +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].start(); +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// 2. kill all followers +int leader = -1; +Map outstanding = null; +for (int i = 0; i < SERVER_COUNT; i++) { +if (mt[i].main.quorumPeer.leader != null) { +leader = i; +outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +mt[leader].main.quorumPeer.tickTime = 1; +} +} +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].start(); +} +} + +// 4. wait one of the follower to be the leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +waitForOne(zk[i], States.CONNECTED); +} +} + +// 5. send a create request to leader and make sure it's synced to disk, +//which means it acked from itself +try { +zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertTrue(outstanding.size() == 1); +Proposal p = (Proposal) outstanding.values().iterator().next(); --- End diff -- Do we need this cast? ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168649459 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { --- End diff -- nit: I don't think we use the terminology "RetainDB" anywhere else. Perhaps we can get rid of "retain"? ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168649906 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); --- End diff -- is there any reason we can't use the existing test infra to clean this up a little ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168649723 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); +mt[i].start(); +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].shutdown(); +} + +waitForAll(zk, States.CONNECTING); + +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].start(); +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// 2. kill all followers +int leader = -1; +Map outstanding = null; +for (int i = 0; i < SERVER_COUNT; i++) { +if (mt[i].main.quorumPeer.leader != null) { +leader = i; +outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +mt[leader].main.quorumPeer.tickTime = 1; +} +} +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].start(); +} +} + +// 4. wait one of the follower to be the leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +waitForOne(zk[i], States.CONNECTED); +} +} + +// 5. send a create request to leader and make sure it's synced to disk, +//which means it acked from itself +try { +zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertTrue(outstanding.size() == 1); +Proposal p = (Proposal) outstanding.values().iterator().next(); +Assert.assertTrue(p.request.getHdr().getType() == OpCode.create); + +// make sure it has a chance to write it to disk +Thread.sleep(1000); +p.qvAcksetPairs.get(0).getAckset().contains(leader); + +// 6. wait the leader to quit due to no enough followers +Thread.sleep(4000); +//waitForOne(zk[leader], States.CONNECTING); --- End diff -- remove this ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168649080 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -435,7 +435,7 @@ private void waitForOne(ZooKeeper zk, States state) throws InterruptedException int iterations = ClientBase.CONNECTION_TIMEOUT / 500; while (zk.getState() != state) { if (iterations-- == 0) { -throw new RuntimeException("Waiting too long"); +throw new RuntimeException("Waiting too long " + zk.getState() + " != " + state); --- End diff -- nit: let's minimize unrelated test changes and whitespace changes ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user afine commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r168653437 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/QuorumPeerMainTest.java --- @@ -888,4 +888,127 @@ public void testWithOnlyMinSessionTimeout() throws Exception { maxSessionTimeOut, quorumPeer.getMaxSessionTimeout()); } +@Test +public void testTxnAheadSnapInRetainDB() throws Exception { +// 1. start up server and wait for leader election to finish +ClientBase.setupTestEnv(); +final int SERVER_COUNT = 3; +final int clientPorts[] = new int[SERVER_COUNT]; +StringBuilder sb = new StringBuilder(); +for (int i = 0; i < SERVER_COUNT; i++) { +clientPorts[i] = PortAssignment.unique(); +sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n"); +} +String quorumCfgSection = sb.toString(); + +MainThread mt[] = new MainThread[SERVER_COUNT]; +ZooKeeper zk[] = new ZooKeeper[SERVER_COUNT]; +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection); +mt[i].start(); +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// we need to shutdown and start back up to make sure that the create session isn't the first transaction since +// that is rather innocuous. +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].shutdown(); +} + +waitForAll(zk, States.CONNECTING); + +for (int i = 0; i < SERVER_COUNT; i++) { +mt[i].start(); +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +} + +waitForAll(zk, States.CONNECTED); + +// 2. kill all followers +int leader = -1; +Map outstanding = null; +for (int i = 0; i < SERVER_COUNT; i++) { +if (mt[i].main.quorumPeer.leader != null) { +leader = i; +outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals; +// increase the tick time to delay the leader going to looking +mt[leader].main.quorumPeer.tickTime = 1; +} +} +LOG.warn("LEADER {}", leader); + +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].shutdown(); +} +} + +// 3. start up the followers to form a new quorum +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +mt[i].start(); +} +} + +// 4. wait one of the follower to be the leader +for (int i = 0; i < SERVER_COUNT; i++) { +if (i != leader) { +// Recreate a client session since the previous session was not persisted. +zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this); +waitForOne(zk[i], States.CONNECTED); +} +} + +// 5. send a create request to leader and make sure it's synced to disk, +//which means it acked from itself +try { +zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, +CreateMode.PERSISTENT); +Assert.fail("create /zk" + leader + " should have failed"); +} catch (KeeperException e) { +} + +// just make sure that we actually did get it in process at the +// leader +Assert.assertTrue(outstanding.size() == 1); +Proposal p = (Proposal) outstanding.values().iterator().next(); +Assert.assertTrue(p.request.getHdr().getType() == OpCode.create); + +// make sure it has a chance to write it to disk +Thread.sleep(1000); --- End diff -- There is a lot of `Thread.sleep()` going on and I would like to find a way to minimize that. Apache infra can occasionally be quite slow (it can starve threads) and tests with many `Thread.sleep()`s in them have historically been quite flaky. So, to the extent that it is possible. I would like to minimize occurrences of `Thread.sleep()`, or at least those outside the context of retry logic. So perha
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user revans2 commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r167885280 --- Diff: src/java/main/org/apache/zookeeper/server/ZKDatabase.java --- @@ -233,14 +233,32 @@ public long getDataTreeLastProcessedZxid() { * @throws IOException */ public long loadDataBase() throws IOException { -PlayBackListener listener=new PlayBackListener(){ +PlayBackListener listener = new PlayBackListener(){ public void onTxnLoaded(TxnHeader hdr,Record txn){ Request r = new Request(0, hdr.getCxid(),hdr.getType(), hdr, txn, hdr.getZxid()); addCommittedProposal(r); } }; -long zxid = snapLog.restore(dataTree,sessionsWithTimeouts,listener); +long zxid = snapLog.restore(dataTree, sessionsWithTimeouts, listener); +initialized = true; +return zxid; +} + +/** + * Fast forward the database adding transactions from the committed log into memory. + * @return the last valid zxid. + * @throws IOException + */ +public long fastForwardDataBase() throws IOException { +PlayBackListener listener = new PlayBackListener(){ --- End diff -- Will do ---
[GitHub] zookeeper pull request #453: ZOOKEEPER-2845: Apply commit log when restartin...
Github user anmolnar commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/453#discussion_r167884587 --- Diff: src/java/main/org/apache/zookeeper/server/ZKDatabase.java --- @@ -233,14 +233,32 @@ public long getDataTreeLastProcessedZxid() { * @throws IOException */ public long loadDataBase() throws IOException { -PlayBackListener listener=new PlayBackListener(){ +PlayBackListener listener = new PlayBackListener(){ public void onTxnLoaded(TxnHeader hdr,Record txn){ Request r = new Request(0, hdr.getCxid(),hdr.getType(), hdr, txn, hdr.getZxid()); addCommittedProposal(r); } }; -long zxid = snapLog.restore(dataTree,sessionsWithTimeouts,listener); +long zxid = snapLog.restore(dataTree, sessionsWithTimeouts, listener); +initialized = true; +return zxid; +} + +/** + * Fast forward the database adding transactions from the committed log into memory. + * @return the last valid zxid. + * @throws IOException + */ +public long fastForwardDataBase() throws IOException { +PlayBackListener listener = new PlayBackListener(){ --- End diff -- I think it'd be nice to extract the common logic of these two methods into a operate one. ---