[ 
https://issues.apache.org/jira/browse/CASSANDRA-14155?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16727108#comment-16727108
 ] 

Ariel Weisberg commented on CASSANDRA-14155:
--------------------------------------------

I haven't quite nailed it down, but basically nodes are sending gossip syn acks 
to nodes that have just started that only contain information on the node 
sending the ack. They are in response to previously received gossip syn as I 
can see the message being created in the verb handler.

My guess is that since gossip doesn't do request/response correlation we are 
getting an ack for some request, like maybe before the node was upgraded, and 
the response from that looks like an ack to the shadow round.

I see stuff like
{noformat}
INFO  [GossipStage:1] 2018-12-21 16:30:41,202 
GossipDigestSynVerbHandler.java:104 - sending [] digests and 
{127.0.0.2:7000=EndpointState: HeartBeatState = HeartBeat: generation = 
1545427529, version = 2147483647, AppStateMap = {}} deltas
java.lang.Throwable: null
        at 
org.apache.cassandra.gms.GossipDigestSynVerbHandler.doVerb(GossipDigestSynVerbHandler.java:104)
        at 
org.apache.cassandra.net.MessageDeliveryTask.process(MessageDeliveryTask.java:92)
        at 
org.apache.cassandra.net.MessageDeliveryTask.run(MessageDeliveryTask.java:54)
        at 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
        at java.util.concurrent.FutureTask.run(FutureTask.java:266)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at 
org.apache.cassandra.concurrent.NamedThreadFactory.lambda$threadLocalDeallocator$0(NamedThreadFactory.java:81)
        at 
io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
        at java.lang.Thread.run(Thread.java:748)
DEBUG [GossipStage:1] 2018-12-21 16:30:41,202 
OutboundMessagingConnection.java:257 - connection attempt 4 to 127.0.0.2:7000 
(GOSSIP)
DEBUG [GossipStage:1] 2018-12-21 16:30:41,202 NettyFactory.java:326 - creating 
outbound bootstrap to peer 127.0.0.2:7000, compression: false, encryption: 
enabled (jdk), coalesce: DISABLED, protocolVersion: 11
{noformat}

and then later 
{noformat}
DEBUG [GossipStage:1] 2018-12-21 16:30:59,320 Gossiper.java:1390 - Shadow 
request received, adding all states, {127.0.0.3:7000=EndpointState: 
HeartBeatState = HeartBeat: generation = 1545427545, version = 360, AppStateMap 
= {STATUS=Value(NORMAL,3074457345618258602,18), LOAD=Value(1.1576835E7,349), 
SCHEMA=Value(e8a984be-18d3-3322-b20c-036da60b86c9,46), DC=Value(datacenter1,6), 
RACK=Value(rack1,8), RELEASE_VERSION=Value(3.11.3-SNAPSHOT,4), 
RPC_ADDRESS=Value(127.0.0.3,3), NET_VERSION=Value(11,1), 
HOST_ID=Value(e65cd54b-a568-42d9-ae25-70e566167fc1,2), 
TOKENS=Value(^@^@^@^H*ªªªªªªª^@^@^@^@,17), RPC_READY=Value(true,32)}, 
127.0.0.2:7000=EndpointState: HeartBeatState = HeartBeat: generation = 
1545427529, version = 2147483647, AppStateMap = 
{STATUS=Value(shutdown,true,132), LOAD=Value(1.1576468E7,347), 
SCHEMA=Value(e8a984be-18d3-3322-b20c-036da60b86c9,62), DC=Value(datacenter1,6), 
RACK=Value(rack1,8), RELEASE_VERSION=Value(3.11.3-SNAPSHOT,4), 
RPC_ADDRESS=Value(127.0.0.2,3), NET_VERSION=Value(11,1), 
HOST_ID=Value(b14298c2-4322-49de-9747-407dc5f16bb6,2), 
TOKENS=Value(^@^@^@^HÕUUUUUUU^@^@^@^@,17), RPC_READY=Value(false,133), 
STATUS_WITH_PORT=Value(shutdown,true,131)}, 127.0.0.1:7000=EndpointState: 
HeartBeatState = HeartBeat: generation = 1545427751, version = 152, AppStateMap 
= {STATUS=Value(NORMAL,-9223372036854775808,31), LOAD=Value(4494006.0,104), 
SCHEMA=Value(3b77100c-2a8c-318f-a9e0-c5bfc4a4bde4,42), DC=Value(datacenter1,7), 
RACK=Value(rack1,9), RELEASE_VERSION=Value(4.0-SNAPSHOT,5), 
RPC_ADDRESS=Value(127.0.0.1,4), NET_VERSION=Value(12,1), 
HOST_ID=Value(01625b7d-2315-47e8-b031-da3cd9382161,2), 
TOKENS=Value(^@^@^@^H<80>^@^@^@^@^@^@^@^@^@^@^@,29), RPC_READY=Value(true,54), 
NATIVE_ADDRESS_AND_PORT=Value(127.0.0.1:9042,3), 
STATUS_WITH_PORT=Value(NORMAL,-9223372036854775808,30)}}
INFO  [GossipStage:1] 2018-12-21 16:30:59,322 
GossipDigestSynVerbHandler.java:104 - sending [] digests and 
{127.0.0.3:7000=EndpointState: HeartBeatState = HeartBeat: generation = 
1545427545, version = 360, AppStateMap = 
{STATUS=Value(NORMAL,3074457345618258602,18), LOAD=Value(1.1576835E7,349), 
SCHEMA=Value(e8a984be-18d3-3322-b20c-036da60b86c9,46), DC=Value(datacenter1,6), 
RACK=Value(rack1,8), RELEASE_VERSION=Value(3.11.3-SNAPSHOT,4), 
RPC_ADDRESS=Value(127.0.0.3,3), NET_VERSION=Value(11,1), 
HOST_ID=Value(e65cd54b-a568-42d9-ae25-70e566167fc1,2), 
TOKENS=Value(^@^@^@^H*ªªªªªªª^@^@^@^@,17), RPC_READY=Value(true,32)}, 
127.0.0.2:7000=EndpointState: HeartBeatState = HeartBeat: generation = 
1545427529, version = 2147483647, AppStateMap = 
{STATUS=Value(shutdown,true,132), LOAD=Value(1.1576468E7,347), 
SCHEMA=Value(e8a984be-18d3-3322-b20c-036da60b86c9,62), DC=Value(datacenter1,6), 
RACK=Value(rack1,8), RELEASE_VERSION=Value(3.11.3-SNAPSHOT,4), 
RPC_ADDRESS=Value(127.0.0.2,3), NET_VERSION=Value(11,1), 
HOST_ID=Value(b14298c2-4322-49de-9747-407dc5f16bb6,2), 
TOKENS=Value(^@^@^@^HÕUUUUUUU^@^@^@^@,17), RPC_READY=Value(false,133), 
STATUS_WITH_PORT=Value(shutdown,true,131)}, 127.0.0.1:7000=EndpointState: 
HeartBeatState = HeartBeat: generation = 1545427751, version = 152, AppStateMap 
= {STATUS=Value(NORMAL,-9223372036854775808,31), LOAD=Value(4494006.0,104), 
SCHEMA=Value(3b77100c-2a8c-318f-a9e0-c5bfc4a4bde4,42), DC=Value(datacenter1,7), 
RACK=Value(rack1,9), RELEASE_VERSION=Value(4.0-SNAPSHOT,5), 
RPC_ADDRESS=Value(127.0.0.1,4), NET_VERSION=Value(12,1), 
HOST_ID=Value(01625b7d-2315-47e8-b031-da3cd9382161,2), 
TOKENS=Value(^@^@^@^H<80>^@^@^@^@^@^@^@^@^@^@^@,29), RPC_READY=Value(true,54), 
NATIVE_ADDRESS_AND_PORT=Value(127.0.0.1:9042,3), 
STATUS_WITH_PORT=Value(NORMAL,-9223372036854775808,30)}} deltas
{noformat}

> [TRUNK] Gossiper somewhat frequently hitting an NPE on node startup with 
> dtests at 
> org.apache.cassandra.gms.Gossiper.isSafeForStartup(Gossiper.java:769)
> --------------------------------------------------------------------------------------------------------------------------------------------------------
>
>                 Key: CASSANDRA-14155
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-14155
>             Project: Cassandra
>          Issue Type: Bug
>          Components: Lifecycle, Testing
>            Reporter: Michael Kjellman
>            Assignee: Jason Brown
>            Priority: Major
>              Labels: dtest
>
> Gossiper is somewhat frequently hitting an NPE on node startup with dtests at 
> org.apache.cassandra.gms.Gossiper.isSafeForStartup(Gossiper.java:769)
> {code}
> test teardown failure
> Unexpected error found in node logs (see stdout for full details). Errors: 
> [ERROR [main] 2018-01-08 21:41:01,832 CassandraDaemon.java:675 - Exception 
> encountered during startup
> java.lang.NullPointerException: null
>         at 
> org.apache.cassandra.gms.Gossiper.isSafeForStartup(Gossiper.java:769) 
> ~[main/:na]
>         at 
> org.apache.cassandra.service.StorageService.checkForEndpointCollision(StorageService.java:511)
>  ~[main/:na]
>         at 
> org.apache.cassandra.service.StorageService.prepareToJoin(StorageService.java:761)
>  ~[main/:na]
>         at 
> org.apache.cassandra.service.StorageService.initServer(StorageService.java:621)
>  ~[main/:na]
>         at 
> org.apache.cassandra.service.StorageService.initServer(StorageService.java:568)
>  ~[main/:na]
>         at 
> org.apache.cassandra.service.CassandraDaemon.setup(CassandraDaemon.java:360) 
> [main/:na]
>         at 
> org.apache.cassandra.service.CassandraDaemon.activate(CassandraDaemon.java:569)
>  [main/:na]
>         at 
> org.apache.cassandra.service.CassandraDaemon.main(CassandraDaemon.java:658) 
> [main/:na], ERROR [main] 2018-01-08 21:41:01,832 CassandraDaemon.java:675 - 
> Exception encountered during startup
> java.lang.NullPointerException: null
>         at 
> org.apache.cassandra.gms.Gossiper.isSafeForStartup(Gossiper.java:769) 
> ~[main/:na]
>         at 
> org.apache.cassandra.service.StorageService.checkForEndpointCollision(StorageService.java:511)
>  ~[main/:na]
>         at 
> org.apache.cassandra.service.StorageService.prepareToJoin(StorageService.java:761)
>  ~[main/:na]
>         at 
> org.apache.cassandra.service.StorageService.initServer(StorageService.java:621)
>  ~[main/:na]
>         at 
> org.apache.cassandra.service.StorageService.initServer(StorageService.java:568)
>  ~[main/:na]
>         at 
> org.apache.cassandra.service.CassandraDaemon.setup(CassandraDaemon.java:360) 
> [main/:na]
>         at 
> org.apache.cassandra.service.CassandraDaemon.activate(CassandraDaemon.java:569)
>  [main/:na]
>         at 
> org.apache.cassandra.service.CassandraDaemon.main(CassandraDaemon.java:658) 
> [main/:na]]
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to