[
https://issues.apache.org/jira/browse/CASSANDRA-19975?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17886261#comment-17886261
]
David Capwell commented on CASSANDRA-19975:
-------------------------------------------
I commented out "ClusterUtils.stopUnchecked(node1);" and the test passes... so
this does look to be due to the CMS having 1 down node.
> TCM unable to allow node to join when there is 1 down voting member
> -------------------------------------------------------------------
>
> Key: CASSANDRA-19975
> URL: https://issues.apache.org/jira/browse/CASSANDRA-19975
> Project: Cassandra
> Issue Type: Bug
> Components: Transactional Cluster Metadata
> Reporter: David Capwell
> Priority: Normal
> Fix For: 5.x
>
>
> This issue was found by the HarryTopologyMixupTest. In the cep-15-accord
> branch we added stopping nodes as well as restarting nodes (now that Accord
> supports it), and this looks to break TCM if the down node is a CMS voting
> member.
> Here is the test that reproduces it:
> {code}
> /*
> * Licensed to the Apache Software Foundation (ASF) under one
> * or more contributor license agreements. See the NOTICE file
> * distributed with this work for additional information
> * regarding copyright ownership. The ASF licenses this file
> * to you under the Apache License, Version 2.0 (the
> * "License"); you may not use this file except in compliance
> * with the License. You may obtain a copy of the License at
> *
> * http://www.apache.org/licenses/LICENSE-2.0
> *
> * Unless required by applicable law or agreed to in writing, software
> * distributed under the License is distributed on an "AS IS" BASIS,
> * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> * See the License for the specific language governing permissions and
> * limitations under the License.
> */
> package org.apache.cassandra.distributed.test.tcm;
> import accord.utils.Invariants;
> import accord.utils.async.TimeoutUtils;
> import org.agrona.collections.Long2LongHashMap;
> import org.apache.cassandra.distributed.Cluster;
> import org.apache.cassandra.distributed.api.Feature;
> import org.apache.cassandra.distributed.api.IInvokableInstance;
> import org.apache.cassandra.distributed.impl.INodeProvisionStrategy;
> import org.apache.cassandra.distributed.shared.ClusterUtils;
> import org.apache.cassandra.distributed.test.TestBaseImpl;
> import org.junit.Test;
> import java.io.IOException;
> import java.time.Duration;
> import java.util.concurrent.ExecutionException;
> import java.util.concurrent.TimeoutException;
> import java.util.concurrent.atomic.AtomicInteger;
> public class RepoTest extends TestBaseImpl
> {
> /**
> * This is the history reported from HarryTopologyMixupTest
> *
> History:
> 2: Add Node3; epoch=18, cms=[1, 2]
> // hidden - reconfigure to rf=3
> 3: Waiting for CMS to Quiesce; epoch=18, cms=[1, 2]
> 5: Harry Validate All; epoch=31, cms=[1, 2, 3]
> 6: Harry Insert; epoch=31, cms=[1, 2, 3]
> 8: Add Node4; epoch=31, cms=[1, 2, 3]
> 9: Waiting for CMS to Quiesce; epoch=31, cms=[1, 2, 3]
> 10: Harry Validate All; epoch=38, cms=[1, 2, 3]
> 11: nodetool repair harry tbl_0 from node2; epoch=38, cms=[1, 2, 3]
> 12: Stop Node3 for nodetool removenode; epoch=38, cms=[1, 2, 3]
> 13: nodetool removenode node3 from node1; epoch=38, cms=[1, 2, 3]
> 14: nodetool repair harry tbl_0 from node1; epoch=49, cms=[1, 2, 3]
> 15: Waiting for CMS to Quiesce; epoch=49, cms=[1, 2, 3]
> 16: Stop Node1 for Normal Stop; epoch=49, cms=[1, 2, 4]
> 18: Add Node5; epoch=49, cms=[1, 2, 4]
> */
> @Test
> public void test() throws IOException, ExecutionException,
> InterruptedException, TimeoutException
> {
> Long2LongHashMap nodeToToken = new Long2LongHashMap(-0);
> nodeToToken.put(1, -1799911656L);
> nodeToToken.put(2, -1005197310L);
> nodeToToken.put(3, -834315596L);
> nodeToToken.put(4, 335272232L);
> nodeToToken.put(5, -1829188286L);
> final AtomicInteger counter = new AtomicInteger(0);
> try (Cluster cluster = Cluster.build(2)
> .withTokenSupplier(i ->
> nodeToToken.get(i))
> .withConfig(c ->
> c.with(Feature.values()))
> .withNodeProvisionStrategy((subnet,
> portMap) -> new INodeProvisionStrategy.AbstractNodeProvisionStrategy(portMap)
> {
> {
> Invariants.checkArgument(subnet
> == 0, "Unexpected subnet detected: %d", subnet);
> }
> private final String ipPrefix =
> "127.0." + subnet + '.';
> @Override
> public int seedNodeNum()
> {
> switch
> (counter.getAndIncrement())
> {
> case 0:
> case 1:
> return 1;
> default:
> return 2;
> }
> }
> @Override
> public String ipAddress(int nodeNum)
> {
> return ipPrefix + nodeNum;
> }
> })
> .start())
> {
> fixDistributedSchemas(cluster);
> IInvokableInstance node1 = cluster.get(1);
> IInvokableInstance node2 = cluster.get(2);
> node1.nodetoolResult("cms", "reconfigure",
> "2").asserts().success();
> IInvokableInstance node3 = ClusterUtils.addInstance(cluster,
> node1.config(), c -> c.set("auto_bootstrap", true));
> node3.startup(cluster);
> node1.nodetoolResult("cms", "reconfigure",
> Integer.toString(3)).asserts().success();
> ClusterUtils.waitForCMSToQuiesce(cluster, new int[]{1, 2, 3});
> IInvokableInstance node4 = ClusterUtils.addInstance(cluster,
> node1.config(), c -> c.set("auto_bootstrap", true));
> node4.startup(cluster);
> ClusterUtils.stopUnchecked(node3);
> node1.nodetoolResult("removenode", "3").asserts().success();
> ClusterUtils.stopUnchecked(node1);
> // expected CMS Voting Group: [1, 2, 4]
> TimeoutUtils.runBlocking(Duration.ofMinutes(2), "node5 join", ()
> -> {
> IInvokableInstance node5 = ClusterUtils.addInstance(cluster,
> node1.config(), c -> c.set("auto_bootstrap", true));
> node5.startup(cluster);
> });
> }
> }
> }
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]