[ https://issues.apache.org/jira/browse/ZOOKEEPER-3201?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16698526#comment-16698526 ]
Michael K. Edwards commented on ZOOKEEPER-3201: ----------------------------------------------- What seems to be happening here is that this "second chance" logic isn't sufficient to ensure that we don't hit {{ConnectionLossException}} again during the second attempt to test {{zk.exists()}}. {noformat} /** * Ensure the client is able to talk to the server. * * @param idx the idx of the server the client is talking to */ private void checkClientConnected(int idx) throws Exception { ZooKeeper zk = getClient(idx); if (zk == null) { return; } try { Assert.assertNull(zk.exists("/foofoofoo-connected", false)); } catch (ConnectionLossException e) { // second chance... // in some cases, leader change in particular, the timing is // very tricky to get right in order to assure that the client has // disconnected and reconnected. In some cases the client will // disconnect, then attempt to reconnect before the server is // back, in which case we'll see another connloss on the operation // in the try, this catches that case and waits for the server // to come back PeerStruct peer = qu.getPeer(idx); Assert.assertTrue("Waiting for server down", ClientBase.waitForServerUp( "127.0.0.1:" + peer.clientPort, ClientBase.CONNECTION_TIMEOUT)); Assert.assertNull(zk.exists("/foofoofoo-connected", false)); } } {noformat} > Flaky test: > org.apache.zookeeper.server.ZxidRolloverTest.testRolloverThenLeaderRestart > -------------------------------------------------------------------------------------- > > Key: ZOOKEEPER-3201 > URL: https://issues.apache.org/jira/browse/ZOOKEEPER-3201 > Project: ZooKeeper > Issue Type: Sub-task > Reporter: Michael K. Edwards > Priority: Major > > Encountered when running tests locally: > {noformat} > 64429 [junit] 2018-11-25 22:28:12,729 [myid:127.0.0.1:27389] - INFO > [main-SendThread(127.0.0.1:27389):ClientCnxn$SendThread@1108] - Opening > socket connection to server localhost/127.0.0 .1:27389. Will not attempt > to authenticate using SASL (unknown error) > 64430 [junit] 2018-11-25 22:28:12,730 [myid:127.0.0.1:27389] - INFO > [main-SendThread(127.0.0.1:27389):ClientCnxn$SendThread@955] - Socket > connection established, initiating session, cli ent: /127.0.0.1:47668, > server: localhost/127.0.0.1:27389 > 64431 [junit] 2018-11-25 22:28:12,734 [myid:] - INFO > [NIOWorkerThread-1:Learner@117] - Revalidating client: 0x10000a9cccf0000 > 64432 [junit] 2018-11-25 22:28:12,743 [myid:127.0.0.1:27389] - INFO > [main-SendThread(127.0.0.1:27389):ClientCnxn$SendThread@1390] - Session > establishment complete on server localhost/12 7.0.0.1:27389, sessionid = > 0x10000a9cccf0000, negotiated timeout = 30000 > 64433 [junit] 2018-11-25 22:28:13,009 [myid:127.0.0.1:27392] - INFO > [main-SendThread(127.0.0.1:27392):ClientCnxn$SendThread@1108] - Opening > socket connection to server localhost/127.0.0 .1:27392. Will not attempt > to authenticate using SASL (unknown error) > 64434 [junit] 2018-11-25 22:28:13,009 [myid:127.0.0.1:27392] - INFO > [main-SendThread(127.0.0.1:27392):ClientCnxn$SendThread@955] - Socket > connection established, initiating session, cli ent: /127.0.0.1:52160, > server: localhost/127.0.0.1:27392 > 64435 [junit] 2018-11-25 22:28:13,016 [myid:127.0.0.1:27395] - INFO > [main-SendThread(127.0.0.1:27395):ClientCnxn$SendThread@1108] - Opening > socket connection to server localhost/127.0.0 .1:27395. Will not attempt > to authenticate using SASL (unknown error) > 64436 [junit] 2018-11-25 22:28:13,016 [myid:127.0.0.1:27395] - INFO > [main-SendThread(127.0.0.1:27395):ClientCnxn$SendThread@955] - Socket > connection established, initiating session, cli ent: /127.0.0.1:47256, > server: localhost/127.0.0.1:27395 > 64437 [junit] 2018-11-25 22:28:13,017 [myid:] - INFO > [NIOWorkerThread-4:ZooKeeperServer@1030] - Refusing session request for > client /127.0.0.1:47256 as it has seen zxid 0x300000000 our last zxid > is 0x2fffffffe client must try another server > 64438 [junit] 2018-11-25 22:28:13,018 [myid:127.0.0.1:27395] - INFO > [main-SendThread(127.0.0.1:27395):ClientCnxn$SendThread@1236] - Unable to > read additional data from server sessionid 0x30000a9ccd20000, likely > server has closed socket, closing socket connection and attempting reconnect > 64439 [junit] 2018-11-25 22:28:13,023 [myid:127.0.0.1:27392] - INFO > [main-SendThread(127.0.0.1:27392):ClientCnxn$SendThread@1390] - Session > establishment complete on server localhost/12 7.0.0.1:27392, sessionid = > 0x20000a9d0940000, negotiated timeout = 30000 > 64440 [junit] 2018-11-25 22:28:13,119 [myid:] - INFO > [main:FourLetterWordMain@87] - connecting to 127.0.0.1 27395 > 64441 [junit] 2018-11-25 22:28:13,120 [myid:] - INFO > [NIOWorkerThread-1:NIOServerCnxn@518] - Processing stat command from > /127.0.0.1:47258 > 64442 [junit] 2018-11-25 22:28:13,121 [myid:] - INFO > [NIOWorkerThread-1:StatCommand@53] - Stat command output > 64443 [junit] 2018-11-25 22:28:14,134 [myid:127.0.0.1:27395] - INFO > [main-SendThread(127.0.0.1:27395):ClientCnxn$SendThread@1108] - Opening > socket connection to server localhost/127.0.0 .1:27395. Will not attempt > to authenticate using SASL (unknown error) > 64444 [junit] 2018-11-25 22:28:14,135 [myid:127.0.0.1:27395] - INFO > [main-SendThread(127.0.0.1:27395):ClientCnxn$SendThread@955] - Socket > connection established, initiating session, cli ent: /127.0.0.1:47312, > server: localhost/127.0.0.1:27395 > 64445 [junit] 2018-11-25 22:28:14,135 [myid:] - INFO > [NIOWorkerThread-2:ZooKeeperServer@1030] - Refusing session request for > client /127.0.0.1:47312 as it has seen zxid 0x300000000 our last zxid > is 0x2fffffffe client must try another server > 64446 [junit] 2018-11-25 22:28:14,137 [myid:127.0.0.1:27395] - INFO > [main-SendThread(127.0.0.1:27395):ClientCnxn$SendThread@1236] - Unable to > read additional data from server sessionid 0x30000a9ccd20000, likely > server has closed socket, closing socket connection and attempting reconnect > 64447 [junit] 2018-11-25 22:28:14,240 [myid:] - INFO > [main:JUnit4ZKTestRunner$LoggedInvokeMethod@98] - TEST METHOD FAILED > testRolloverThenLeaderRestart > 64448 [junit] > org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode > = ConnectionLoss for /foofoofoo-connected > 64449 [junit] at > org.apache.zookeeper.KeeperException.create(KeeperException.java:102) > 64450 [junit] at > org.apache.zookeeper.KeeperException.create(KeeperException.java:54) > 64451 [junit] at > org.apache.zookeeper.ZooKeeper.exists(ZooKeeper.java:1942) > 64452 [junit] at > org.apache.zookeeper.ZooKeeper.exists(ZooKeeper.java:1970) > 64453 [junit] at > org.apache.zookeeper.server.ZxidRolloverTest.checkClientConnected(ZxidRolloverTest.java:119) > 64454 [junit] at > org.apache.zookeeper.server.ZxidRolloverTest.checkClientsConnected(ZxidRolloverTest.java:90) > 64455 [junit] at > org.apache.zookeeper.server.ZxidRolloverTest.start(ZxidRolloverTest.java:165) > 64456 [junit] at > org.apache.zookeeper.server.ZxidRolloverTest.testRolloverThenLeaderRestart(ZxidRolloverTest.java:382){noformat} -- This message was sent by Atlassian JIRA (v7.6.3#76005)