This is an automated email from the ASF dual-hosted git repository.

asf-gitbox-commits pushed a commit to branch cassandra-6.0
in repository https://gitbox.apache.org/repos/asf/cassandra.git

commit ea495907e136c43dcc4d60ab6694f2d831dae325
Author: Marcus Eriksson <[email protected]>
AuthorDate: Mon Mar 16 15:04:42 2026 +0100

    Make nodetool abortbootstrap more robust
    
    Patch by marcuse; reviewed by Sam Tunnicliffe for CASSANDRA-21235
---
 CHANGES.txt                                        |   1 +
 src/java/org/apache/cassandra/gms/Gossiper.java    |   8 +-
 .../apache/cassandra/service/StorageService.java   |  55 +++++++++--
 .../cassandra/tcm/transformations/Register.java    |   3 +
 .../distributed/test/ring/BootstrapTest.java       |  24 ++++-
 .../ClusterMetadataAbortedBootstrapRejoinTest.java | 109 +++++++++++++++++++++
 6 files changed, 190 insertions(+), 10 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8d1e71aec1..698c97c150 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 6.0-alpha2
+ * Make nodetool abortbootstrap more robust (CASSANDRA-21235)
  * Don't clear prepared statement cache on nodetool cms initialize 
(CASSANDRA-21234)
  * Improve performance when deserializing cluster metadata  (CASSANDRA-21224)
  * Minor TokenMap performance improvement (CASSANDRA-21223)
diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java 
b/src/java/org/apache/cassandra/gms/Gossiper.java
index 728ae7de7c..3d6414913f 100644
--- a/src/java/org/apache/cassandra/gms/Gossiper.java
+++ b/src/java/org/apache/cassandra/gms/Gossiper.java
@@ -269,7 +269,13 @@ public class Gossiper implements 
IFailureDetectionEventListener, GossiperMBean,
                 taskLock.lock();
 
                 /* Update the local heartbeat counter. */
-                
endpointStateMap.get(getBroadcastAddressAndPort()).updateHeartBeat();
+                EndpointState epstate = 
endpointStateMap.get(getBroadcastAddressAndPort());
+                if (epstate == null)
+                {
+                    logger.warn("Node {} is not in gossip, not running 
GossipTask", getBroadcastAddressAndPort());
+                    return;
+                }
+                epstate.updateHeartBeat();
                 if (logger.isTraceEnabled())
                     logger.trace("My heartbeat is now {}", 
endpointStateMap.get(FBUtilities.getBroadcastAddressAndPort()).getHeartBeatState().getHeartBeatVersion());
                 final List<GossipDigest> gDigests = new ArrayList<>();
diff --git a/src/java/org/apache/cassandra/service/StorageService.java 
b/src/java/org/apache/cassandra/service/StorageService.java
index f99599c889..d32d19460d 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -1662,15 +1662,12 @@ public class StorageService extends 
NotificationBroadcasterSupport implements IE
 
     public void abortBootstrap(String nodeStr, String endpointStr)
     {
-        logger.debug("Aborting bootstrap for {}/{}", nodeStr, endpointStr);
+        logger.info("Aborting bootstrap for {}", StringUtils.isEmpty(nodeStr) 
? endpointStr : nodeStr);
         ClusterMetadata metadata = ClusterMetadata.current();
-        NodeId nodeId;
-        if (!StringUtils.isEmpty(nodeStr))
-            nodeId = NodeId.fromString(nodeStr);
-        else
-            nodeId = 
metadata.directory.peerId(InetAddressAndPort.getByNameUnchecked(endpointStr));
-
+        NodeId nodeId = parseNodeIdOrEndpoint(metadata, nodeStr, endpointStr);
         InetAddressAndPort endpoint = metadata.directory.endpoint(nodeId);
+        if (endpoint == null)
+            throw new IllegalArgumentException("Can't abort bootstrap for " + 
nodeId + " - it does not exist in cluster metadata");
         if (Gossiper.instance.isKnownEndpoint(endpoint) && 
FailureDetector.instance.isAlive(endpoint))
             throw new RuntimeException("Can't abort bootstrap for " + nodeId + 
" - it is alive");
         NodeState nodeState = metadata.directory.peerState(nodeId);
@@ -1693,6 +1690,47 @@ public class StorageService extends 
NotificationBroadcasterSupport implements IE
         }
     }
 
+    private static NodeId parseNodeIdOrEndpoint(ClusterMetadata metadata, 
String nodeStr, String endpointStr)
+    {
+        NodeId nodeId;
+        if (!StringUtils.isEmpty(nodeStr))
+        {
+            try
+            {
+                nodeId = NodeId.fromString(nodeStr);
+            }
+            catch (IllegalArgumentException | UnsupportedOperationException e)
+            {
+                String msg = "Unable to parse node id string " + nodeStr;
+                logger.warn("{}", msg, e);
+                throw new IllegalArgumentException(msg, e);
+            }
+        }
+        else
+        {
+            InetAddressAndPort endpoint;
+            try
+            {
+                endpoint = InetAddressAndPort.getByName(endpointStr);
+            }
+            catch (UnknownHostException e)
+            {
+                String msg = "Unable to look up endpoint " + endpointStr;
+                logger.warn("{}", msg, e);
+                throw new IllegalArgumentException(msg, e);
+            }
+
+            nodeId = metadata.directory.peerId(endpoint);
+            if (nodeId == null)
+            {
+                String msg = "Unknown endpoint: " + endpoint;
+                logger.warn(msg);
+                throw new IllegalArgumentException(msg);
+            }
+        }
+        return nodeId;
+    }
+
     @Override
     public void migrateConsensusProtocol(@Nonnull List<String> keyspaceNames,
                                          @Nullable List<String> 
maybeTableNames,
@@ -2009,7 +2047,8 @@ public class StorageService extends 
NotificationBroadcasterSupport implements IE
 
     public String getLocalHostId()
     {
-        return getLocalHostUUID().toString();
+        UUID localHostId = getLocalHostUUID();
+        return localHostId != null ? localHostId.toString() : "UNKNOWN";
     }
 
     public UUID getLocalHostUUID()
diff --git a/src/java/org/apache/cassandra/tcm/transformations/Register.java 
b/src/java/org/apache/cassandra/tcm/transformations/Register.java
index 1e31ff3a85..a7551c7069 100644
--- a/src/java/org/apache/cassandra/tcm/transformations/Register.java
+++ b/src/java/org/apache/cassandra/tcm/transformations/Register.java
@@ -176,6 +176,9 @@ public class Register implements Transformation
         else
         {
             NodeId nodeId = 
directory.peerId(FBUtilities.getBroadcastAddressAndPort());
+            if (nodeId == null)
+                throw new IllegalStateException("Node has host id 
"+localHostId+" in system.local, but is not present in cluster metadata - not 
allowing this node to register. " +
+                                                "If a bootstrap of this node 
failed and was aborted with `nodetool abortbootstrap` it should also have its 
data removed before trying to rebootstrap.");
             NodeVersion dirVersion = directory.version(nodeId);
 
             // If this is a node in the process of upgrading, update the host 
id in the system.local table
diff --git 
a/test/distributed/org/apache/cassandra/distributed/test/ring/BootstrapTest.java
 
b/test/distributed/org/apache/cassandra/distributed/test/ring/BootstrapTest.java
index bc7abb85ef..8012399afd 100644
--- 
a/test/distributed/org/apache/cassandra/distributed/test/ring/BootstrapTest.java
+++ 
b/test/distributed/org/apache/cassandra/distributed/test/ring/BootstrapTest.java
@@ -18,6 +18,8 @@
 
 package org.apache.cassandra.distributed.test.ring;
 
+import java.io.Closeable;
+import java.io.IOException;
 import java.util.Map;
 import java.util.Objects;
 import java.util.concurrent.Callable;
@@ -48,6 +50,7 @@ import org.apache.cassandra.distributed.api.ConsistencyLevel;
 import org.apache.cassandra.distributed.api.ICluster;
 import org.apache.cassandra.distributed.api.IInstanceConfig;
 import org.apache.cassandra.distributed.api.IInvokableInstance;
+import org.apache.cassandra.distributed.api.NodeToolResult;
 import org.apache.cassandra.distributed.api.TokenSupplier;
 import org.apache.cassandra.distributed.shared.JMXUtil;
 import org.apache.cassandra.distributed.shared.NetworkTopology;
@@ -55,7 +58,6 @@ import org.apache.cassandra.distributed.test.TestBaseImpl;
 import org.apache.cassandra.metrics.DefaultNameFactory;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.service.StorageServiceMBean;
-import org.apache.cassandra.utils.Closeable;
 import org.apache.cassandra.utils.concurrent.CountDownLatch;
 
 import static net.bytebuddy.matcher.ElementMatchers.named;
@@ -228,6 +230,26 @@ public class BootstrapTest extends TestBaseImpl
         }
     }
 
+    @Test
+    public void testAbortBootstrapBadIp() throws IOException
+    {
+        try (Cluster cluster = builder().withNodes(1).start())
+        {
+            NodeToolResult res = 
cluster.get(1).nodetoolResult("abortbootstrap", "--ip", "127.0.0.55");
+            res.asserts().failure();
+            assertTrue(res.getStdout().contains("Unknown endpoint"));
+            res = cluster.get(1).nodetoolResult("abortbootstrap", "--ip", 
"127.0.0.999");
+            res.asserts().failure();
+            assertTrue(res.getStdout().contains("Unable to look up endpoint"));
+            res = cluster.get(1).nodetoolResult("abortbootstrap", "--node", 
"999");
+            res.asserts().failure();
+            assertTrue(res.getStdout().contains("does not exist in cluster 
metadata"));
+            res = cluster.get(1).nodetoolResult("abortbootstrap", "--node", 
"hello");
+            res.asserts().failure();
+            assertTrue(res.getStdout().contains("Unable to parse node id"));
+        }
+    }
+
     public static void populate(ICluster cluster, int from, int to)
     {
         populate(cluster, from, to, 1, 3, ConsistencyLevel.QUORUM);
diff --git 
a/test/distributed/org/apache/cassandra/distributed/test/tcm/ClusterMetadataAbortedBootstrapRejoinTest.java
 
b/test/distributed/org/apache/cassandra/distributed/test/tcm/ClusterMetadataAbortedBootstrapRejoinTest.java
new file mode 100644
index 0000000000..5994fab07c
--- /dev/null
+++ 
b/test/distributed/org/apache/cassandra/distributed/test/tcm/ClusterMetadataAbortedBootstrapRejoinTest.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.tcm;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import com.google.common.util.concurrent.Uninterruptibles;
+
+import net.bytebuddy.ByteBuddy;
+import net.bytebuddy.dynamic.loading.ClassLoadingStrategy;
+import net.bytebuddy.implementation.MethodDelegation;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.Feature;
+import org.apache.cassandra.distributed.api.IInstanceConfig;
+import org.apache.cassandra.distributed.api.IInvokableInstance;
+import org.apache.cassandra.distributed.api.TokenSupplier;
+import org.apache.cassandra.distributed.shared.NetworkTopology;
+import org.apache.cassandra.distributed.test.TestBaseImpl;
+import org.apache.cassandra.gms.FailureDetector;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.service.StorageService;
+
+import static net.bytebuddy.matcher.ElementMatchers.named;
+import static net.bytebuddy.matcher.ElementMatchers.takesArguments;
+
+
+public class ClusterMetadataAbortedBootstrapRejoinTest extends TestBaseImpl
+{
+    @Test
+    public void testFailedBootstrapNotAllowedToJoin() throws IOException, 
TimeoutException, ExecutionException, InterruptedException
+    {
+        TokenSupplier even = TokenSupplier.evenlyDistributedTokens(3);
+        try (Cluster cluster = init(Cluster.build(2)
+                                           
.withInstanceInitializer(BBHelper::install)
+                                           .withConfig(c -> 
c.with(Feature.GOSSIP, Feature.NETWORK))
+                                           
.withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(3, "dc0", "rack0"))
+                                           .withTokenSupplier(even::token)
+                                           .start()))
+        {
+            IInstanceConfig config = cluster.newInstanceConfig()
+                                            .set("auto_bootstrap", true);
+            IInvokableInstance toBootstrap = cluster.bootstrap(config);
+            toBootstrap.startup(cluster);
+            toBootstrap.logs().watchFor(Duration.ofSeconds(60), 
BBHelper.FAILMESSAGE);
+            toBootstrap.shutdown().get();
+            cluster.get(1).runOnInstance(() -> {
+                int i = 0;
+                while 
(FailureDetector.instance.isAlive(InetAddressAndPort.getByNameUnchecked("127.0.0.3"))
 && i++ < 30)
+                    Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
+            });
+            cluster.get(1).nodetoolResult("abortbootstrap", "--ip", 
"127.0.0.3").asserts().success();
+            Assertions.assertThatThrownBy(toBootstrap::startup)
+                      .isInstanceOf(IllegalStateException.class)
+                      .hasMessageContaining("but is not present in cluster 
metadata");
+        }
+    }
+
+    public static class BBHelper
+    {
+        public static String FAILMESSAGE = "ARTIFICIALLY FAILING BOOTSTRAP";
+        public static AtomicBoolean enabled = new AtomicBoolean(true);
+        public static void install(ClassLoader cl, int i)
+        {
+            if (i == 3)
+            {
+                new ByteBuddy().rebase(StorageService.class)
+                               
.method(named("repairPaxosForTopologyChange").and(takesArguments(1)))
+                               .intercept(MethodDelegation.to(BBHelper.class))
+                               .make()
+                               .load(cl, 
ClassLoadingStrategy.Default.INJECTION);
+            }
+        }
+
+        public static void repairPaxosForTopologyChange(String reason)
+        {
+            if (enabled.get())
+            {
+                enabled.set(false);
+                throw new RuntimeException(FAILMESSAGE);
+            }
+        }
+
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to