Github user hanm commented on a diff in the pull request: https://github.com/apache/zookeeper/pull/605#discussion_r219628119 --- Diff: src/java/test/org/apache/zookeeper/server/quorum/FuzzySnapshotRelatedTest.java --- @@ -162,6 +167,98 @@ public void process(String path) { new String(zk[followerA].getData(node2, null, null))); } + /** + * It's possible during SNAP sync, the parent is serialized before the + * child get deleted during sending the snapshot over. + * + * In which case, we need to make sure the pzxid get correctly updated + * when applying the txns received. + */ + @Test + public void testPZxidUpdatedDuringSnapSyncing() throws Exception { + LOG.info("Enable force snapshot sync"); + System.setProperty(LearnerHandler.FORCE_SNAP_SYNC, "true"); + + final String parent = "/testPZxidUpdatedWhenDeletingNonExistNode"; + final String child = parent + "/child"; + createEmptyNode(zk[leaderId], parent); + createEmptyNode(zk[leaderId], child); + + LOG.info("shutdown follower {}", followerA); + mt[followerA].shutdown(); + QuorumPeerMainTest.waitForOne(zk[followerA], States.CONNECTING); + + LOG.info("Set up ZKDatabase to catch the node serializing in DataTree"); + addSerializeListener(leaderId, parent, child); + + LOG.info("Restart follower A to trigger a SNAP sync with leader"); + mt[followerA].start(); + QuorumPeerMainTest.waitForOne(zk[followerA], States.CONNECTED); + + LOG.info("Check and make sure the pzxid of the parent is the same " + + "on leader and follower A"); + compareStat(parent, leaderId, followerA); + } + + /** + * It's possible during taking fuzzy snapshot, the parent is serialized + * before the child get deleted in the fuzzy range. + * + * In which case, we need to make sure the pzxid get correctly updated + * when replaying the txns. 
+ */ + @Test + public void testPZxidUpdatedWhenLoadingSnapshot() throws Exception { + + final String parent = "/testPZxidUpdatedDuringTakingSnapshot"; + final String child = parent + "/child"; + createEmptyNode(zk[followerA], parent); + createEmptyNode(zk[followerA], child); + + LOG.info("Set up ZKDatabase to catch the node serializing in DataTree"); + addSerializeListener(followerA, parent, child); + + LOG.info("Take snapshot on follower A"); + ZooKeeperServer zkServer = mt[followerA].main.quorumPeer.getActiveServer(); + zkServer.takeSnapshot(true); + + LOG.info("Restarting follower A to load snapshot"); + mt[followerA].shutdown(); + mt[followerA].start(); + QuorumPeerMainTest.waitForOne(zk[followerA], States.CONNECTED); + + LOG.info("Check and make sure the pzxid of the parent is the same " + + "on leader and follower A"); + compareStat(parent, leaderId, followerA); + } + + private void addSerializeListener(int sid, String parent, String child) { + final ZooKeeper zkClient = zk[followerA]; + CustomDataTree dt = + (CustomDataTree) mt[sid].main.quorumPeer.getZkDb().getDataTree(); + dt.addListener(parent, new NodeSerializeListener() { + @Override + public void nodeSerialized(String path) { + try { + zkClient.delete(child, -1); + LOG.info("Deleted the child node after the parent is serialized"); + } catch (Exception e) { + LOG.error("Error when deleting node {}", e); + } + } + }); + } + + private void compareStat(String path, int sid, int compareWithSid) throws Exception{ + Stat stat1 = new Stat(); + zk[sid].getData(path, null, stat1); + + Stat stat2 = new Stat(); + zk[compareWithSid].getData(path, null, stat2); --- End diff -- @lvfangmin we should wrap this call in a try/catch that retries on `ConnectionLossException`. I noticed this when checking the flaky test dashboard. See one failure [example](https://builds.apache.org/job/ZooKeeper-trunk/198/testReport/junit/org.apache.zookeeper.server.quorum/FuzzySnapshotRelatedTest/testPZxidUpdatedWhenLoadingSnapshot/)
---