[ https://issues.apache.org/jira/browse/ZOOKEEPER-2598?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Srinivas Neginhal updated ZOOKEEPER-2598: ----------------------------------------- Attachment: zk3.tgz zk2.tgz zk1.tgz Attaching gzipped tarballs (.tgz) of the ZooKeeper config and datadir. > Data Inconsistency after power off/on of some nodes > --------------------------------------------------- > > Key: ZOOKEEPER-2598 > URL: https://issues.apache.org/jira/browse/ZOOKEEPER-2598 > Project: ZooKeeper > Issue Type: Bug > Components: quorum > Affects Versions: 3.5.1 > Environment: ZK is running in a docker container on an Ubuntu 14.04 VM > Reporter: Srinivas Neginhal > Attachments: zk1.tgz, zk2.tgz, zk3.tgz > > > Steps to reproduce: > 1. Create a three node cluster: Node1, Node2 and Node3. > Each node is a VM that runs: > 1. ZK in a docker container > 2. Two clients, A and B, that use ZK for group membership and leader > election. The clients create sequential ephemeral nodes when they come up. > 2. The three ZK instances running in the containers form an ensemble. > 3. Power off/on Node 2 and Node 3 in a loop > 4. After a few iterations, the ephemeral nodes seen by the three nodes are > different. > Here is the output of some four-letter commands with the ensemble in this > state: > 1. 
conf: > ZK 1: > # echo conf| nc 10.0.0.1 1300 > clientPort=1300 > secureClientPort=-1 > dataDir=/moot/persistentStore/zkWorkspace/version-2 > dataDirSize=67293721 > dataLogDir=/moot/persistentStore/zkWorkspace/version-2 > dataLogSize=67293721 > tickTime=2000 > maxClientCnxns=60 > minSessionTimeout=4000 > maxSessionTimeout=40000 > serverId=1 > initLimit=100 > syncLimit=20 > electionAlg=3 > electionPort=1200 > quorumPort=1100 > peerType=0 > membership: > server.1=10.0.0.1:1100:1200:participant;10.0.0.1:1300;8e64c644-d0fa-414f-bab2-3c8c80364410 > server.2=10.0.0.2:1100:1200:participant;10.0.0.2:1300;38bf19b8-d4cb-4dac-b328-7bbf0ee1e2c4 > server.3=10.0.0.3:1100:1200:participant;10.0.0.3:1300;e1415d59-e857-43e6-ba9b-01daeb31a434 > ZK 2: > # echo conf| nc 10.0.0.2 1300 > clientPort=1300 > secureClientPort=-1 > dataDir=/moot/persistentStore/zkWorkspace/version-2 > dataDirSize=1409480873 > dataLogDir=/moot/persistentStore/zkWorkspace/version-2 > dataLogSize=1409480873 > tickTime=2000 > maxClientCnxns=60 > minSessionTimeout=4000 > maxSessionTimeout=40000 > serverId=2 > initLimit=100 > syncLimit=20 > electionAlg=3 > electionPort=1200 > quorumPort=1100 > peerType=0 > membership: > server.1=10.0.0.1:1100:1200:participant;10.0.0.1:1300;8e64c644-d0fa-414f-bab2-3c8c80364410 > server.2=10.0.0.2:1100:1200:participant;10.0.0.2:1300;38bf19b8-d4cb-4dac-b328-7bbf0ee1e2c4 > server.3=10.0.0.3:1100:1200:participant;10.0.0.3:1300;e1415d59-e857-43e6-ba9b-01daeb31a434 > ZK 3: > # echo conf| nc 10.0.0.3 1300 > clientPort=1300 > secureClientPort=-1 > dataDir=/moot/persistentStore/zkWorkspace/version-2 > dataDirSize=1409505467 > dataLogDir=/moot/persistentStore/zkWorkspace/version-2 > dataLogSize=1409505467 > tickTime=2000 > maxClientCnxns=60 > minSessionTimeout=4000 > maxSessionTimeout=40000 > serverId=3 > initLimit=100 > syncLimit=20 > electionAlg=3 > electionPort=1200 > quorumPort=1100 > peerType=0 > membership: > 
server.1=10.0.0.1:1100:1200:participant;10.0.0.1:1300;8e64c644-d0fa-414f-bab2-3c8c80364410 > server.2=10.0.0.2:1100:1200:participant;10.0.0.2:1300;38bf19b8-d4cb-4dac-b328-7bbf0ee1e2c4 > server.3=10.0.0.3:1100:1200:participant;10.0.0.3:1300;e1415d59-e857-43e6-ba9b-01daeb31a434 > 2. mntr: > ZK 1: > # echo mntr| nc 10.0.0.1 1300 > zk_version 3.5.1-alpha--1, built on 09/07/2016 00:34 GMT > zk_avg_latency 0 > zk_max_latency 471 > zk_min_latency 0 > zk_packets_received 32556 > zk_packets_sent 32564 > zk_num_alive_connections 7 > zk_outstanding_requests 0 > zk_server_state leader > zk_znode_count 58 > zk_watch_count 51 > zk_ephemerals_count 5 > zk_approximate_data_size 5251 > zk_open_file_descriptor_count 52 > zk_max_file_descriptor_count 1048576 > zk_followers 2 > zk_synced_followers 2 > zk_pending_syncs 0 > ZK 2: > # echo mntr| nc 10.0.0.2 1300 > zk_version 3.5.1-alpha--1, built on 09/07/2016 00:34 GMT > zk_avg_latency 1 > zk_max_latency 227 > zk_min_latency 0 > zk_packets_received 30905 > zk_packets_sent 30936 > zk_num_alive_connections 6 > zk_outstanding_requests 0 > zk_server_state follower > zk_znode_count 58 > zk_watch_count 82 > zk_ephemerals_count 5 > zk_approximate_data_size 5251 > zk_open_file_descriptor_count 49 > zk_max_file_descriptor_count 1048576 > ZK 3: > # echo mntr| nc 10.0.0.3 1300 > zk_version 3.5.1-alpha--1, built on 09/07/2016 00:34 GMT > zk_avg_latency 4 > zk_max_latency 590 > zk_min_latency 0 > zk_packets_received 6192 > zk_packets_sent 6191 > zk_num_alive_connections 2 > zk_outstanding_requests 0 > zk_server_state follower > zk_znode_count 64 > zk_watch_count 17 > zk_ephemerals_count 11 > zk_approximate_data_size 5806 > zk_open_file_descriptor_count 45 > zk_max_file_descriptor_count 1048576 > 3. 
dump showing the inconsistency: > ZK 1: > # echo dump| nc 10.0.0.1 1300 > SessionTracker dump: > Session Sets (17)/(12): > 0 expire at Tue Sep 20 18:22:35 UTC 2016: > 0 expire at Tue Sep 20 18:22:37 UTC 2016: > 0 expire at Tue Sep 20 18:22:39 UTC 2016: > 0 expire at Tue Sep 20 18:22:41 UTC 2016: > 0 expire at Tue Sep 20 18:22:43 UTC 2016: > 0 expire at Tue Sep 20 18:22:45 UTC 2016: > 0 expire at Tue Sep 20 18:22:49 UTC 2016: > 0 expire at Tue Sep 20 18:22:51 UTC 2016: > 0 expire at Tue Sep 20 18:22:53 UTC 2016: > 0 expire at Tue Sep 20 18:22:55 UTC 2016: > 0 expire at Tue Sep 20 18:22:57 UTC 2016: > 4 expire at Tue Sep 20 18:22:59 UTC 2016: > 0x100061435f7000d > 0x10000d9e4460004 > 0x100061435f70002 > 0x10000d9e4460003 > 4 expire at Tue Sep 20 18:23:03 UTC 2016: > 0x2000001141a0002 > 0x2000001141a0000 > 0x2000001141a0005 > 0x100061435f70010 > 1 expire at Tue Sep 20 18:23:07 UTC 2016: > 0x2000001141a0001 > 1 expire at Tue Sep 20 18:23:09 UTC 2016: > 0x100061435f70000 > 1 expire at Tue Sep 20 18:23:11 UTC 2016: > 0x2000001141a000f > 1 expire at Tue Sep 20 18:23:13 UTC 2016: > 0x300000188c30001 > ephemeral nodes dump: > Sessions with Ephemerals (5): > 0x100061435f70000: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000064 > 0x2000001141a000f: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000066 > 0x2000001141a0001: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000065 > 0x2000001141a0000: > /moot/gmle/ActiveControllerCluster/member0000000065 > 0x2000001141a0005: > /moot/gmle/ActiveControllerCluster/member0000000066 > Connections dump: > Connections Sets (5)/(10): > 0 expire at Tue Sep 20 18:22:35 UTC 2016: > 1 expire at Tue Sep 20 18:22:45 UTC 2016: > ip: /10.0.0.1:45591 sessionId: 0x0 > 0 expire at Tue Sep 20 18:22:55 UTC 2016: > 5 expire at Tue Sep 20 18:23:05 UTC 2016: > ip: /10.0.0.3:34734 sessionId: 0x100061435f7000d > ip: /10.0.0.1:42963 sessionId: 0x10000d9e4460003 > ip: /10.0.0.3:34739 sessionId: 0x100061435f70010 > ip: /10.0.0.2:45750 
sessionId: 0x100061435f70002 > ip: /10.0.0.1:42961 sessionId: 0x10000d9e4460004 > 1 expire at Tue Sep 20 18:23:15 UTC 2016: > ip: /10.0.0.1:42964 sessionId: 0x100061435f70000 > ZK 2: > # echo dump| nc 10.0.0.2 1300 > SessionTracker dump: > Global Sessions(13): > 0x10000d9e4460003 30000ms > 0x10000d9e4460004 30000ms > 0x100061435f70000 40000ms > 0x100061435f70002 30000ms > 0x100061435f7000d 30000ms > 0x100061435f70010 30000ms > 0x100061435f70584 4000ms > 0x2000001141a0000 40000ms > 0x2000001141a0001 40000ms > 0x2000001141a0002 30000ms > 0x2000001141a0005 40000ms > 0x2000001141a000f 40000ms > 0x300000188c30001 40000ms > ephemeral nodes dump: > Sessions with Ephemerals (5): > 0x100061435f70000: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000064 > 0x2000001141a000f: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000066 > 0x2000001141a0001: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000065 > 0x2000001141a0000: > /moot/gmle/ActiveControllerCluster/member0000000065 > 0x2000001141a0005: > /moot/gmle/ActiveControllerCluster/member0000000066 > Connections dump: > Connections Sets (4)/(6): > 0 expire at Tue Sep 20 18:25:13 UTC 2016: > 1 expire at Tue Sep 20 18:25:23 UTC 2016: > ip: /10.0.0.2:38021 sessionId: 0x0 > 1 expire at Tue Sep 20 18:25:33 UTC 2016: > ip: /10.0.0.2:35422 sessionId: 0x2000001141a0002 > 4 expire at Tue Sep 20 18:25:43 UTC 2016: > ip: /10.0.0.2:35419 sessionId: 0x2000001141a0001 > ip: /10.0.0.1:59025 sessionId: 0x2000001141a0000 > ip: /10.0.0.2:35427 sessionId: 0x2000001141a0005 > ip: /10.0.0.3:56967 sessionId: 0x2000001141a000f > ZK 3: > # echo dump| nc 10.0.0.3 1300 > SessionTracker dump: > Global Sessions(23): > 0x10000d9e4460003 30000ms > 0x10000d9e4460004 30000ms > 0x100055a50b00001 30000ms > 0x100055a50b00003 40000ms > 0x100055a50b0000c 40000ms > 0x100061435f70000 40000ms > 0x100061435f70002 30000ms > 0x100061435f7000d 30000ms > 0x100061435f70010 30000ms > 0x100061435f70585 4000ms > 0x2000001141a0000 40000ms > 0x2000001141a0001 
40000ms > 0x2000001141a0002 30000ms > 0x2000001141a0005 40000ms > 0x2000001141a000f 40000ms > 0x200000130750000 40000ms > 0x200000130750001 40000ms > 0x200000130750002 30000ms > 0x200000130750004 40000ms > 0x20000013075000d 30000ms > 0x3000000e4860000 30000ms > 0x3000000e4860002 40000ms > 0x300000188c30001 40000ms > ephemeral nodes dump: > Sessions with Ephemerals (11): > 0x100061435f70000: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000064 > 0x3000000e4860002: > /moot/gmle/ActiveControllerCluster/member0000000027 > 0x100055a50b0000c: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000027 > 0x100055a50b00003: > /moot/gmle/ActiveControllerCluster/member0000000025 > 0x200000130750004: > /moot/gmle/ActiveControllerCluster/member0000000026 > 0x200000130750000: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000026 > 0x2000001141a000f: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000066 > 0x200000130750001: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000025 > 0x2000001141a0001: > /moot/gmle/ServiceDirectory/ActiveNodes/member0000000065 > 0x2000001141a0000: > /moot/gmle/ActiveControllerCluster/member0000000065 > 0x2000001141a0005: > /moot/gmle/ActiveControllerCluster/member0000000066 > Connections dump: > Connections Sets (4)/(2): > 0 expire at Tue Sep 20 18:25:40 UTC 2016: > 1 expire at Tue Sep 20 18:25:50 UTC 2016: > ip: /10.0.0.3:52784 sessionId: 0x0 > 0 expire at Tue Sep 20 18:26:10 UTC 2016: > 1 expire at Tue Sep 20 18:26:20 UTC 2016: > ip: /10.0.0.3:50222 sessionId: 0x300000188c30001 -- This message was sent by Atlassian JIRA (v6.3.4#6332)