This is an automated email from the ASF dual-hosted git repository.
dcapwell pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/cassandra.git
The following commit(s) were added to refs/heads/trunk by this push:
new 740cec41d2 When a node is bootstrapping it gets the whole gossip state
but applies in random order causing some cases where StorageService will fail
causing an instance to not show up in TokenMetadata
740cec41d2 is described below
commit 740cec41d2d67783a463bd18f70221de331928df
Author: David Capwell <[email protected]>
AuthorDate: Wed Jun 1 08:49:44 2022 -0700
When a node is bootstrapping it gets the whole gossip state but applies in
random order causing some cases where StorageService will fail causing an
instance to not show up in TokenMetadata
patch by David Capwell; reviewed by Blake Eggleston for CASSANDRA-17676
---
CHANGES.txt | 1 +
src/java/org/apache/cassandra/gms/Gossiper.java | 46 +++++++++++++++++++++-
.../org/apache/cassandra/gms/VersionedValue.java | 3 ++
3 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index fb75bc510d..30dfdf763c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
4.2
+ * When a node is bootstrapping it gets the whole gossip state but applies in
random order causing some cases where StorageService will fail causing an
instance to not show up in TokenMetadata (CASSANDRA-17676)
* Add CQLSH command SHOW REPLICAS (CASSANDRA-17577)
* Add guardrail to allow disabling of SimpleStrategy (CASSANDRA-17647)
* Change default directory permission to 750 in packaging (CASSANDRA-17470)
diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java
b/src/java/org/apache/cassandra/gms/Gossiper.java
index 4c72166a9d..8041eed034 100644
--- a/src/java/org/apache/cassandra/gms/Gossiper.java
+++ b/src/java/org/apache/cassandra/gms/Gossiper.java
@@ -85,6 +85,7 @@ import static org.apache.cassandra.net.Verb.GOSSIP_DIGEST_SYN;
import static
org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort;
import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis;
import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+import static org.apache.cassandra.gms.VersionedValue.BOOTSTRAPPING_STATUS;
/**
* This module is responsible for Gossiping information for the local
endpoint. This abstraction
@@ -1500,11 +1501,54 @@ public class Gossiper implements
IFailureDetectionEventListener, GossiperMBean
return pieces[0];
}
+ /**
+ * Gossip offers no happens-before relationship, but downstream
subscribers assume a happens-before relationship
+ * before being notified! To attempt to be nicer to subscribers, this
{@link Comparator} attempts to order EndpointState
+ * within a map based off a few heuristics:
+ * <ol>
+ * <li>STATUS - some STATUS depends on other instance STATUS, so make
sure they are last; eg. BOOT, and BOOT_REPLACE</li>
+ * <li>generation - normally defined as system clock millis, this can
be skewed and is a best effort</li>
+ * <li>address - tie breaker to make sure order is consistent</li>
+ * </ol>
+ * <p>
+ * Problems:
+ * Generation is normally defined as system clock millis, which can be
skewed and inconsistent across nodes
+ * (generations do not have a happens-before relationship, so ordering is
sketchy at best).
+ * <p>
+ * Motivations:
+ * {@link Map#entrySet()} returns data in effectively random order, so can
get into a situation such as the following example.
+ * {@code
+ * 3 node cluster: n1, n2, and n3
+ * n2 goes down and n4 does host replacement and fails before completion
+ * n5 tries to do a host replacement against n4 (ignore the fact this
doesn't make sense)
+ * }
+ * In that case above, the {@link Map#entrySet()} ordering can be random,
causing n4 to apply before n2, which will
+ * be rejected by subscribers (only after updating gossip causing zero
retries).
+ */
+ private static Comparator<Entry<InetAddressAndPort, EndpointState>>
STATE_MAP_ORDERING =
+ ((Comparator<Entry<InetAddressAndPort, EndpointState>>) (e1, e2) -> {
+ // check status first, make sure bootstrap status happens-after all
others
+ if (BOOTSTRAPPING_STATUS.contains(getGossipStatus(e1.getValue())))
+ return 1;
+ if (BOOTSTRAPPING_STATUS.contains(getGossipStatus(e2.getValue())))
+ return -1;
+ return 0;
+ })
+ .thenComparingInt((Entry<InetAddressAndPort, EndpointState> e) ->
e.getValue().getHeartBeatState().getGeneration())
+ .thenComparing(Entry::getKey);
+
+ /**
+  * Copies the entries of {@code epStateMap} into a new list and sorts that list
+  * with {@link #STATE_MAP_ORDERING}; the map passed in is not modified.
+  */
+ private static Iterable<Entry<InetAddressAndPort, EndpointState>> order(Map<InetAddressAndPort, EndpointState> epStateMap)
+ {
+     List<Entry<InetAddressAndPort, EndpointState>> sorted = new ArrayList<>(epStateMap.entrySet());
+     sorted.sort(STATE_MAP_ORDERING);
+     return sorted;
+ }
+
@VisibleForTesting
public void applyStateLocally(Map<InetAddressAndPort, EndpointState>
epStateMap)
{
checkProperThreadForStateMutation();
- for (Entry<InetAddressAndPort, EndpointState> entry :
epStateMap.entrySet())
+ for (Entry<InetAddressAndPort, EndpointState> entry :
order(epStateMap))
{
InetAddressAndPort ep = entry.getKey();
if (ep.equals(getBroadcastAddressAndPort()) && !isInShadowRound())
diff --git a/src/java/org/apache/cassandra/gms/VersionedValue.java
b/src/java/org/apache/cassandra/gms/VersionedValue.java
index 26644e17cc..519fffa9b8 100644
--- a/src/java/org/apache/cassandra/gms/VersionedValue.java
+++ b/src/java/org/apache/cassandra/gms/VersionedValue.java
@@ -27,6 +27,7 @@ import java.util.stream.Collectors;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import org.apache.cassandra.db.TypeSizes;
@@ -84,6 +85,8 @@ public class VersionedValue implements
Comparable<VersionedValue>
// values for ApplicationState.REMOVAL_COORDINATOR
public final static String REMOVAL_COORDINATOR = "REMOVER";
+ // STATUS values treated as "bootstrapping"; final so the shared set cannot be reassigned
+ public final static Set<String> BOOTSTRAPPING_STATUS = ImmutableSet.of(STATUS_BOOTSTRAPPING, STATUS_BOOTSTRAPPING_REPLACE);
+
public final int version;
public final String value;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]