This is an automated email from the ASF dual-hosted git repository.

sanpwc pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/ignite-3.git


The following commit(s) were added to refs/heads/main by this push:
     new cdb99355351 IGNITE-25848 Fix several Critical system error caused by Meta Storage unavailability (#6243)
cdb99355351 is described below

commit cdb99355351de392221a006a6c11c96e51eecda8
Author: Mirza Aliev <[email protected]>
AuthorDate: Fri Jul 18 19:43:45 2025 +0400

    IGNITE-25848 Fix several Critical system error caused by Meta Storage unavailability (#6243)
---
 ...niteDistributionZoneManagerNodeRestartTest.java |   2 +
 .../apache/ignite/internal/ItIgniteStartTest.java  |  29 -----
 .../apache/ignite/internal/ItIgniteStopTest.java   |  71 ++++++++++++
 .../app/ItIgniteInMemoryNodeRestartTest.java       | 128 +++------------------
 .../ignite/internal/BaseIgniteRestartTest.java     |  87 ++++++++++++--
 5 files changed, 169 insertions(+), 148 deletions(-)

diff --git a/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItIgniteDistributionZoneManagerNodeRestartTest.java b/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItIgniteDistributionZoneManagerNodeRestartTest.java
index af9ded574e9..09409ee8b1c 100644
--- a/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItIgniteDistributionZoneManagerNodeRestartTest.java
+++ b/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItIgniteDistributionZoneManagerNodeRestartTest.java
@@ -252,6 +252,8 @@ public class ItIgniteDistributionZoneManagerNodeRestartTest extends BaseIgniteRe
         when(cmgManager.startAsync(any())).thenReturn(nullCompletedFuture());
         when(cmgManager.stopAsync(any())).thenReturn(nullCompletedFuture());
 
+        when(cmgManager.clusterState()).thenReturn(nullCompletedFuture());
+
         var readOperationForCompactionTracker = new ReadOperationForCompactionTracker();
 
         var storage = new RocksDbKeyValueStorage(
diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStartTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStartTest.java
index 9ce1b73a9cd..444d6d7f450 100644
--- a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStartTest.java
+++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStartTest.java
@@ -31,13 +31,11 @@ import com.typesafe.config.parser.ConfigDocumentFactory;
 import java.util.Set;
 import java.util.concurrent.CompletableFuture;
 import java.util.stream.IntStream;
-import org.apache.ignite.Ignite;
 import org.apache.ignite.internal.Cluster.ServerRegistration;
 import org.apache.ignite.internal.app.IgniteImpl;
 import org.apache.ignite.internal.app.IgniteServerImpl;
 import org.apache.ignite.lang.IgniteException;
 import org.apache.ignite.network.ClusterNode;
-import org.apache.ignite.table.KeyValueView;
 import org.junit.jupiter.api.Test;
 
 class ItIgniteStartTest extends ClusterPerTestIntegrationTest {
@@ -120,33 +118,6 @@ class ItIgniteStartTest extends ClusterPerTestIntegrationTest {
         assertEquals("IGN-NETWORK-2", exception.codeAsString());
     }
 
-    @Test
-    void testNodesCouldBeStoppedEvenIfMetastorageIsUnavailable() {
-        int nodeCount = 3;
-
-        cluster.startAndInit(nodeCount, builder -> {
-            builder.cmgNodeNames(cluster.nodeName(2));
-            builder.metaStorageNodeNames(cluster.nodeName(0));
-        });
-
-        Ignite node1 = cluster.node(1);
-
-        node1.sql().executeScript("CREATE TABLE TEST (ID INT PRIMARY KEY, VAL VARCHAR)");
-
-        KeyValueView<Integer, String> kvView = node1.tables().table("TEST").keyValueView(Integer.class, String.class);
-
-        kvView.put(null, 1, "one");
-
-        // Stop the single meta storage node.
-        cluster.stopNode(0);
-
-        // Imitate some activity on cluster
-        kvView.putAsync(null, 2, "two");
-
-        assertThat(cluster.stopNodeAsync(1), willCompleteSuccessfully());
-        assertThat(cluster.stopNodeAsync(2), willCompleteSuccessfully());
-    }
-
     private static void waitTill1NodeValidateItselfWithCmg(ServerRegistration registration) throws InterruptedException {
         IgniteImpl ignite = ((IgniteServerImpl) registration.server()).igniteImpl();
 
diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStopTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStopTest.java
new file mode 100644
index 00000000000..6bd68c38384
--- /dev/null
+++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStopTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal;
+
+import static org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willCompleteSuccessfully;
+import static org.hamcrest.MatcherAssert.assertThat;
+
+import com.typesafe.config.parser.ConfigDocument;
+import com.typesafe.config.parser.ConfigDocumentFactory;
+import org.apache.ignite.Ignite;
+import org.apache.ignite.table.KeyValueView;
+import org.junit.jupiter.api.Test;
+
+class ItIgniteStopTest extends ClusterPerTestIntegrationTest {
+    private static final long RAFT_RETRY_TIMEOUT_MILLIS = 15000;
+
+    @Override
+    protected int initialNodes() {
+        return 0;
+    }
+
+    @Override
+    protected String getNodeBootstrapConfigTemplate() {
+        ConfigDocument document = ConfigDocumentFactory.parseString(super.getNodeBootstrapConfigTemplate())
+                .withValueText("ignite.raft.retryTimeoutMillis", Long.toString(RAFT_RETRY_TIMEOUT_MILLIS));
+        return document.render();
+    }
+
+    @Test
+    void testNodesCouldBeStoppedEvenIfMetastorageIsUnavailable() throws InterruptedException {
+        int nodeCount = 3;
+
+        cluster.startAndInit(nodeCount, builder -> {
+            builder.cmgNodeNames(cluster.nodeName(2));
+            builder.metaStorageNodeNames(cluster.nodeName(0));
+        });
+
+        Ignite node1 = cluster.node(1);
+
+        node1.sql().executeScript("CREATE TABLE TEST (ID INT PRIMARY KEY, VAL VARCHAR)");
+
+        KeyValueView<Integer, String> kvView = node1.tables().table("TEST").keyValueView(Integer.class, String.class);
+
+        kvView.put(null, 1, "one");
+
+        // Stop the single meta storage node.
+        cluster.stopNode(0);
+
+        // Imitate some activity on cluster
+        kvView.putAsync(null, 2, "two");
+
+        assertThat(cluster.stopNodeAsync(1), willCompleteSuccessfully());
+
+        assertThat(cluster.stopNodeAsync(2), willCompleteSuccessfully());
+    }
+}
diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteInMemoryNodeRestartTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteInMemoryNodeRestartTest.java
index 17a58ab0172..ae955a394f5 100644
--- a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteInMemoryNodeRestartTest.java
+++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteInMemoryNodeRestartTest.java
@@ -21,19 +21,14 @@ import static org.apache.ignite.internal.TestDefaultProfilesNames.DEFAULT_AIMEM_
 import static org.apache.ignite.internal.TestWrappers.unwrapIgniteImpl;
 import static org.apache.ignite.internal.TestWrappers.unwrapTableViewInternal;
 import static 
org.apache.ignite.internal.lang.IgniteSystemProperties.colocationEnabled;
-import static 
org.apache.ignite.internal.testframework.IgniteTestUtils.testNodeName;
 import static 
org.apache.ignite.internal.testframework.IgniteTestUtils.waitForCondition;
-import static 
org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willCompleteSuccessfully;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.Matchers.is;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-import java.nio.file.Path;
-import java.util.ArrayList;
 import java.util.Collection;
-import java.util.List;
 import java.util.Objects;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutionException;
@@ -45,8 +40,6 @@ import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import java.util.stream.Stream;
 import org.apache.ignite.Ignite;
-import org.apache.ignite.IgniteServer;
-import org.apache.ignite.InitParameters;
 import org.apache.ignite.internal.BaseIgniteRestartTest;
 import org.apache.ignite.internal.app.IgniteImpl;
 import org.apache.ignite.internal.lang.IgniteBiTuple;
@@ -63,13 +56,9 @@ import org.apache.ignite.internal.storage.RowId;
 import org.apache.ignite.internal.table.TableViewInternal;
 import org.apache.ignite.internal.table.distributed.storage.InternalTableImpl;
 import org.apache.ignite.internal.testframework.IgniteTestUtils;
-import org.apache.ignite.internal.testframework.TestIgnitionManager;
-import org.apache.ignite.internal.util.IgniteUtils;
 import org.apache.ignite.sql.IgniteSql;
 import org.apache.ignite.table.Table;
 import org.apache.ignite.table.Tuple;
-import org.jetbrains.annotations.Nullable;
-import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.TestInfo;
@@ -85,99 +74,15 @@ public class ItIgniteInMemoryNodeRestartTest extends 
BaseIgniteRestartTest {
     /** Test table name. */
     private static final String TABLE_NAME = "Table1";
 
-    /** Cluster nodes. */
-    private static final List<Ignite> CLUSTER_NODES = new ArrayList<>();
-
-    /**
-     * Stops all started nodes.
-     */
-    @AfterEach
-    public void afterEach() throws Exception {
-        var closeables = new ArrayList<AutoCloseable>();
-
-        for (IgniteServer node : IGNITE_SERVERS) {
-            if (node != null) {
-                closeables.add(node::shutdown);
-            }
-        }
-
-        IgniteUtils.closeAll(closeables);
-
-        CLUSTER_NODES.clear();
-    }
-
-    /**
-     * Start node with the given parameters.
-     *
-     * @param idx Node index, is used to stop the node later, see {@link 
#stopNode(int)}.
-     * @param nodeName Node name.
-     * @param cfgString Configuration string.
-     * @param workDir Working directory.
-     * @return Created node instance.
-     */
-    private static IgniteImpl startNode(int idx, String nodeName, @Nullable 
String cfgString, Path workDir) {
-        assertTrue(CLUSTER_NODES.size() == idx || CLUSTER_NODES.get(idx) == 
null);
-        assertTrue(IGNITE_SERVERS.size() == idx || IGNITE_SERVERS.get(idx) == 
null);
-
-        IgniteServer node = TestIgnitionManager.start(nodeName, cfgString, 
workDir.resolve(nodeName));
-
-        IGNITE_SERVERS.add(idx, node);
-
-        if (CLUSTER_NODES.isEmpty()) {
-            InitParameters initParameters = InitParameters.builder()
-                    .metaStorageNodes(node)
-                    .clusterName("cluster")
-                    .build();
-
-            TestIgnitionManager.init(node, initParameters);
-        }
-
-        assertThat(node.waitForInitAsync(), willCompleteSuccessfully());
-
-        Ignite ignite = node.api();
-
-        CLUSTER_NODES.add(idx, ignite);
-
-        return unwrapIgniteImpl(ignite);
-    }
-
-    /**
-     * Start node with the given parameters.
-     *
-     * @param testInfo Test info.
-     * @param idx Node index, is used to stop the node later, see {@link 
#stopNode(int)}.
-     * @return Created node instance.
-     */
-    private IgniteImpl startNode(TestInfo testInfo, int idx) {
-        int port = DEFAULT_NODE_PORT + idx;
-        String nodeName = testNodeName(testInfo, port);
-        String cfgString = configurationString(idx);
-
-        return startNode(idx, nodeName, cfgString, workDir.resolve(nodeName));
-    }
-
-    /** {@inheritDoc} */
-    @Override
-    protected void stopNode(int idx) {
-        IgniteServer node = IGNITE_SERVERS.get(idx);
-
-        if (node != null) {
-            node.shutdown();
-
-            CLUSTER_NODES.set(idx, null);
-            IGNITE_SERVERS.set(idx, null);
-        }
-    }
-
     /**
      * Restarts an in-memory node that is not a leader of the table's 
partition.
      */
     @Test
     public void inMemoryNodeRestartNotLeader(TestInfo testInfo) throws 
Exception {
         // Start three nodes, the first one is going to be CMG and MetaStorage 
leader.
-        IgniteImpl ignite = startNode(testInfo, 0);
-        startNode(testInfo, 1);
-        startNode(testInfo, 2);
+        IgniteImpl ignite = startNode(0);
+        startNode(1);
+        startNode(2);
 
         // Create a table with replica on every node.
         createTableWithData(ignite, TABLE_NAME, 3, 1);
@@ -205,7 +110,7 @@ public class ItIgniteInMemoryNodeRestartTest extends 
BaseIgniteRestartTest {
         // Restart the node.
         stopNode(idxToStop);
 
-        IgniteImpl restartingNode = startNode(testInfo, idxToStop);
+        IgniteImpl restartingNode = startNode(idxToStop);
 
         log.info("Restarted node {}", restartingNode.name());
 
@@ -286,9 +191,9 @@ public class ItIgniteInMemoryNodeRestartTest extends 
BaseIgniteRestartTest {
     @Test
     public void inMemoryNodeRestartNoMajority(TestInfo testInfo) throws 
Exception {
         // Start three nodes, the first one is going to be CMG and MetaStorage 
leader.
-        IgniteImpl ignite0 = startNode(testInfo, 0);
-        startNode(testInfo, 1);
-        startNode(testInfo, 2);
+        IgniteImpl ignite0 = startNode(0);
+        startNode(1);
+        startNode(2);
 
         // Create a table with replica on every node.
         createTableWithData(ignite0, TABLE_NAME, 3, 1);
@@ -299,7 +204,7 @@ public class ItIgniteInMemoryNodeRestartTest extends 
BaseIgniteRestartTest {
         stopNode(1);
         stopNode(2);
 
-        IgniteImpl restartingNode = startNode(testInfo, 1);
+        IgniteImpl restartingNode = startNode(1);
 
         Loza loza = restartingNode.raftManager();
 
@@ -325,9 +230,9 @@ public class ItIgniteInMemoryNodeRestartTest extends 
BaseIgniteRestartTest {
     @Test
     public void inMemoryNodeFullPartitionRestart(TestInfo testInfo) throws 
Exception {
         // Start three nodes, the first one is going to be CMG and MetaStorage 
leader.
-        IgniteImpl ignite0 = startNode(testInfo, 0);
-        startNode(testInfo, 1);
-        startNode(testInfo, 2);
+        IgniteImpl ignite0 = startNode(0);
+        startNode(1);
+        startNode(2);
 
         // Create a table with replicas on every node.
         createTableWithData(ignite0, TABLE_NAME, 3, 1);
@@ -338,9 +243,9 @@ public class ItIgniteInMemoryNodeRestartTest extends 
BaseIgniteRestartTest {
         stopNode(1);
         stopNode(2);
 
-        startNode(testInfo, 0);
-        startNode(testInfo, 1);
-        startNode(testInfo, 2);
+        startNode(0);
+        startNode(1);
+        startNode(2);
 
         // Check that full partition restart happens.
         for (int i = 0; i < 3; i++) {
@@ -414,7 +319,8 @@ public class ItIgniteInMemoryNodeRestartTest extends 
BaseIgniteRestartTest {
     }
 
     private static boolean tableHasDataOnAllIgnites(String name, int 
partitions) {
-        return CLUSTER_NODES.stream()
+        return IGNITE_SERVERS.stream()
+                .map(s -> unwrapIgniteImpl(s.api()))
                 .allMatch(igniteNode -> 
tableHasAnyData(unwrapTableViewInternal(igniteNode.tables().table(name)), 
partitions));
     }
 
@@ -430,6 +336,6 @@ public class ItIgniteInMemoryNodeRestartTest extends 
BaseIgniteRestartTest {
     }
 
     private static IgniteImpl ignite(int idx) {
-        return unwrapIgniteImpl(CLUSTER_NODES.get(idx));
+        return unwrapIgniteImpl(IGNITE_SERVERS.get(idx).api());
     }
 }
diff --git a/modules/runner/src/testFixtures/java/org/apache/ignite/internal/BaseIgniteRestartTest.java b/modules/runner/src/testFixtures/java/org/apache/ignite/internal/BaseIgniteRestartTest.java
index 92362dc2219..0e3ca22d5f4 100644
--- a/modules/runner/src/testFixtures/java/org/apache/ignite/internal/BaseIgniteRestartTest.java
+++ b/modules/runner/src/testFixtures/java/org/apache/ignite/internal/BaseIgniteRestartTest.java
@@ -35,14 +35,21 @@ import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.stream.Collectors;
 import java.util.stream.IntStream;
+import java.util.stream.Stream;
 import org.apache.ignite.IgniteServer;
 import org.apache.ignite.InitParameters;
 import org.apache.ignite.configuration.ConfigurationModule;
 import org.apache.ignite.internal.app.IgniteImpl;
 import org.apache.ignite.internal.close.ManuallyCloseable;
+import org.apache.ignite.internal.cluster.management.ClusterManagementGroupManager;
+import org.apache.ignite.internal.cluster.management.ClusterState;
 import org.apache.ignite.internal.cluster.management.topology.LogicalTopology;
 import org.apache.ignite.internal.cluster.management.topology.LogicalTopologyImpl;
 import org.apache.ignite.internal.configuration.ConfigurationManager;
@@ -137,25 +144,89 @@ public abstract class BaseIgniteRestartTest extends IgniteAbstractTest {
      */
     @AfterEach
     public void afterEachTest() throws Exception {
-        var closeables = new ArrayList<AutoCloseable>();
+        var nonCmgMsNodesToStop = new ArrayList<AutoCloseable>();
+        var cmgMsNodesToStop = new ArrayList<AutoCloseable>();
 
-        for (IgniteServer node : IGNITE_SERVERS) {
-            if (node != null) {
-                closeables.add(node::shutdown);
+        List<String> serverNames = IGNITE_SERVERS.stream()
+                .filter(Objects::nonNull)
+                .map(IgniteServer::name)
+                .collect(toList());
+
+        List<String> partialNodeNames = this.partialNodes.stream()
+                .filter(Objects::nonNull)
+                .map(PartialNode::name)
+                .collect(toList());
+
+        log.info("Shutting the cluster down [serverNodes={}, partialNodes={}]", serverNames, partialNodeNames);
+
+        Optional<PartialNode> anyPartialNode = this.partialNodes.stream()
+                .filter(Objects::nonNull)
+                .findAny();
+
+        if (anyPartialNode.isPresent()) {
+            ClusterManagementGroupManager component = findComponent(
+                    anyPartialNode.get().startedComponents(),
+                    ClusterManagementGroupManager.class
+            );
+
+            Set<String> cmgMsPartialNodesNames = cmgMsNodes(component);
+
+            for (PartialNode partialNode : 
partialNodes.stream().filter(Objects::nonNull).collect(toList())) {
+                if (!cmgMsPartialNodesNames.contains(partialNode.name())) {
+                    nonCmgMsNodesToStop.add(partialNode::stop);
+                } else {
+                    cmgMsNodesToStop.add(partialNode::stop);
+                }
             }
         }
 
-        if (!partialNodes.isEmpty()) {
-            for (PartialNode partialNode : partialNodes) {
-                closeables.add(partialNode::stop);
+        Optional<IgniteServer> anyServerNode = IGNITE_SERVERS.stream()
+                .filter(Objects::nonNull)
+                .findAny();
+
+        if (anyServerNode.isPresent()) {
+            IgniteImpl ignite = unwrapIgniteImpl(anyServerNode.get().api());
+
+            Set<String> cmgMsNodesNames = 
cmgMsNodes(ignite.clusterManagementGroupManager());
+
+            for (IgniteServer node : 
IGNITE_SERVERS.stream().filter(Objects::nonNull).collect(toList())) {
+                if (!cmgMsNodesNames.contains(node.name())) {
+                    nonCmgMsNodesToStop.add(node::shutdown);
+                } else {
+                    cmgMsNodesToStop.add(node::shutdown);
+                }
             }
         }
 
-        closeAll(closeables);
+        closeAll(nonCmgMsNodesToStop);
+        closeAll(cmgMsNodesToStop);
 
+        partialNodes.clear();
         IGNITE_SERVERS.clear();
     }
 
+    /**
+     * Returns the set of nodes' names that host Meta Storage and CMG.
+     *
+     * @param cmgManager Cluster management group manager.
+     * @return Set of node names.
+     * @throws Exception If failed to get cluster state.
+     */
+    private static Set<String> cmgMsNodes(ClusterManagementGroupManager cmgManager) throws Exception {
+        CompletableFuture<ClusterState> stateFut = cmgManager.clusterState();
+
+        assertThat(stateFut, willCompleteSuccessfully());
+
+        if (stateFut.get() == null) {
+            return Set.of();
+        }
+
+        return Stream.concat(
+                stateFut.get().metaStorageNodes().stream(),
+                stateFut.get().cmgNodes().stream()
+        ).collect(Collectors.toSet());
+    }
+
     /**
      * Load configuration modules.
      *

Reply via email to