This is an automated email from the ASF dual-hosted git repository.
sanpwc pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/ignite-3.git
The following commit(s) were added to refs/heads/main by this push:
new cdb99355351 IGNITE-25848 Fix several Critical system error caused by Meta Storage unavailability (#6243)
cdb99355351 is described below
commit cdb99355351de392221a006a6c11c96e51eecda8
Author: Mirza Aliev <[email protected]>
AuthorDate: Fri Jul 18 19:43:45 2025 +0400
IGNITE-25848 Fix several Critical system error caused by Meta Storage unavailability (#6243)
---
...niteDistributionZoneManagerNodeRestartTest.java | 2 +
.../apache/ignite/internal/ItIgniteStartTest.java | 29 -----
.../apache/ignite/internal/ItIgniteStopTest.java | 71 ++++++++++++
.../app/ItIgniteInMemoryNodeRestartTest.java | 128 +++------------------
.../ignite/internal/BaseIgniteRestartTest.java | 87 ++++++++++++--
5 files changed, 169 insertions(+), 148 deletions(-)
diff --git a/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItIgniteDistributionZoneManagerNodeRestartTest.java b/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItIgniteDistributionZoneManagerNodeRestartTest.java
index af9ded574e9..09409ee8b1c 100644
--- a/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItIgniteDistributionZoneManagerNodeRestartTest.java
+++ b/modules/distribution-zones/src/integrationTest/java/org/apache/ignite/internal/distributionzones/ItIgniteDistributionZoneManagerNodeRestartTest.java
@@ -252,6 +252,8 @@ public class ItIgniteDistributionZoneManagerNodeRestartTest extends BaseIgniteRe
when(cmgManager.startAsync(any())).thenReturn(nullCompletedFuture());
when(cmgManager.stopAsync(any())).thenReturn(nullCompletedFuture());
+ when(cmgManager.clusterState()).thenReturn(nullCompletedFuture());
+
var readOperationForCompactionTracker = new
ReadOperationForCompactionTracker();
var storage = new RocksDbKeyValueStorage(
diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStartTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStartTest.java
index 9ce1b73a9cd..444d6d7f450 100644
--- a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStartTest.java
+++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStartTest.java
@@ -31,13 +31,11 @@ import com.typesafe.config.parser.ConfigDocumentFactory;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.stream.IntStream;
-import org.apache.ignite.Ignite;
import org.apache.ignite.internal.Cluster.ServerRegistration;
import org.apache.ignite.internal.app.IgniteImpl;
import org.apache.ignite.internal.app.IgniteServerImpl;
import org.apache.ignite.lang.IgniteException;
import org.apache.ignite.network.ClusterNode;
-import org.apache.ignite.table.KeyValueView;
import org.junit.jupiter.api.Test;
class ItIgniteStartTest extends ClusterPerTestIntegrationTest {
@@ -120,33 +118,6 @@ class ItIgniteStartTest extends ClusterPerTestIntegrationTest {
assertEquals("IGN-NETWORK-2", exception.codeAsString());
}
- @Test
- void testNodesCouldBeStoppedEvenIfMetastorageIsUnavailable() {
- int nodeCount = 3;
-
- cluster.startAndInit(nodeCount, builder -> {
- builder.cmgNodeNames(cluster.nodeName(2));
- builder.metaStorageNodeNames(cluster.nodeName(0));
- });
-
- Ignite node1 = cluster.node(1);
-
- node1.sql().executeScript("CREATE TABLE TEST (ID INT PRIMARY KEY, VAL
VARCHAR)");
-
- KeyValueView<Integer, String> kvView =
node1.tables().table("TEST").keyValueView(Integer.class, String.class);
-
- kvView.put(null, 1, "one");
-
- // Stop the single meta storage node.
- cluster.stopNode(0);
-
- // Imitate some activity on cluster
- kvView.putAsync(null, 2, "two");
-
- assertThat(cluster.stopNodeAsync(1), willCompleteSuccessfully());
- assertThat(cluster.stopNodeAsync(2), willCompleteSuccessfully());
- }
-
private static void waitTill1NodeValidateItselfWithCmg(ServerRegistration
registration) throws InterruptedException {
IgniteImpl ignite = ((IgniteServerImpl)
registration.server()).igniteImpl();
diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStopTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStopTest.java
new file mode 100644
index 00000000000..6bd68c38384
--- /dev/null
+++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/ItIgniteStopTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal;
+
+import static
org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willCompleteSuccessfully;
+import static org.hamcrest.MatcherAssert.assertThat;
+
+import com.typesafe.config.parser.ConfigDocument;
+import com.typesafe.config.parser.ConfigDocumentFactory;
+import org.apache.ignite.Ignite;
+import org.apache.ignite.table.KeyValueView;
+import org.junit.jupiter.api.Test;
+
+class ItIgniteStopTest extends ClusterPerTestIntegrationTest {
+ private static final long RAFT_RETRY_TIMEOUT_MILLIS = 15000;
+
+ @Override
+ protected int initialNodes() {
+ return 0;
+ }
+
+ @Override
+ protected String getNodeBootstrapConfigTemplate() {
+ ConfigDocument document =
ConfigDocumentFactory.parseString(super.getNodeBootstrapConfigTemplate())
+ .withValueText("ignite.raft.retryTimeoutMillis",
Long.toString(RAFT_RETRY_TIMEOUT_MILLIS));
+ return document.render();
+ }
+
+ @Test
+ void testNodesCouldBeStoppedEvenIfMetastorageIsUnavailable() throws
InterruptedException {
+ int nodeCount = 3;
+
+ cluster.startAndInit(nodeCount, builder -> {
+ builder.cmgNodeNames(cluster.nodeName(2));
+ builder.metaStorageNodeNames(cluster.nodeName(0));
+ });
+
+ Ignite node1 = cluster.node(1);
+
+ node1.sql().executeScript("CREATE TABLE TEST (ID INT PRIMARY KEY, VAL
VARCHAR)");
+
+ KeyValueView<Integer, String> kvView =
node1.tables().table("TEST").keyValueView(Integer.class, String.class);
+
+ kvView.put(null, 1, "one");
+
+ // Stop the single meta storage node.
+ cluster.stopNode(0);
+
+ // Imitate some activity on cluster
+ kvView.putAsync(null, 2, "two");
+
+ assertThat(cluster.stopNodeAsync(1), willCompleteSuccessfully());
+
+ assertThat(cluster.stopNodeAsync(2), willCompleteSuccessfully());
+ }
+}
diff --git a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteInMemoryNodeRestartTest.java b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteInMemoryNodeRestartTest.java
index 17a58ab0172..ae955a394f5 100644
--- a/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteInMemoryNodeRestartTest.java
+++ b/modules/runner/src/integrationTest/java/org/apache/ignite/internal/runner/app/ItIgniteInMemoryNodeRestartTest.java
@@ -21,19 +21,14 @@ import static org.apache.ignite.internal.TestDefaultProfilesNames.DEFAULT_AIMEM_
import static org.apache.ignite.internal.TestWrappers.unwrapIgniteImpl;
import static org.apache.ignite.internal.TestWrappers.unwrapTableViewInternal;
import static
org.apache.ignite.internal.lang.IgniteSystemProperties.colocationEnabled;
-import static
org.apache.ignite.internal.testframework.IgniteTestUtils.testNodeName;
import static
org.apache.ignite.internal.testframework.IgniteTestUtils.waitForCondition;
-import static
org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willCompleteSuccessfully;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.is;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
-import java.nio.file.Path;
-import java.util.ArrayList;
import java.util.Collection;
-import java.util.List;
import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
@@ -45,8 +40,6 @@ import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.ignite.Ignite;
-import org.apache.ignite.IgniteServer;
-import org.apache.ignite.InitParameters;
import org.apache.ignite.internal.BaseIgniteRestartTest;
import org.apache.ignite.internal.app.IgniteImpl;
import org.apache.ignite.internal.lang.IgniteBiTuple;
@@ -63,13 +56,9 @@ import org.apache.ignite.internal.storage.RowId;
import org.apache.ignite.internal.table.TableViewInternal;
import org.apache.ignite.internal.table.distributed.storage.InternalTableImpl;
import org.apache.ignite.internal.testframework.IgniteTestUtils;
-import org.apache.ignite.internal.testframework.TestIgnitionManager;
-import org.apache.ignite.internal.util.IgniteUtils;
import org.apache.ignite.sql.IgniteSql;
import org.apache.ignite.table.Table;
import org.apache.ignite.table.Tuple;
-import org.jetbrains.annotations.Nullable;
-import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInfo;
@@ -85,99 +74,15 @@ public class ItIgniteInMemoryNodeRestartTest extends BaseIgniteRestartTest {
/** Test table name. */
private static final String TABLE_NAME = "Table1";
- /** Cluster nodes. */
- private static final List<Ignite> CLUSTER_NODES = new ArrayList<>();
-
- /**
- * Stops all started nodes.
- */
- @AfterEach
- public void afterEach() throws Exception {
- var closeables = new ArrayList<AutoCloseable>();
-
- for (IgniteServer node : IGNITE_SERVERS) {
- if (node != null) {
- closeables.add(node::shutdown);
- }
- }
-
- IgniteUtils.closeAll(closeables);
-
- CLUSTER_NODES.clear();
- }
-
- /**
- * Start node with the given parameters.
- *
- * @param idx Node index, is used to stop the node later, see {@link
#stopNode(int)}.
- * @param nodeName Node name.
- * @param cfgString Configuration string.
- * @param workDir Working directory.
- * @return Created node instance.
- */
- private static IgniteImpl startNode(int idx, String nodeName, @Nullable
String cfgString, Path workDir) {
- assertTrue(CLUSTER_NODES.size() == idx || CLUSTER_NODES.get(idx) ==
null);
- assertTrue(IGNITE_SERVERS.size() == idx || IGNITE_SERVERS.get(idx) ==
null);
-
- IgniteServer node = TestIgnitionManager.start(nodeName, cfgString,
workDir.resolve(nodeName));
-
- IGNITE_SERVERS.add(idx, node);
-
- if (CLUSTER_NODES.isEmpty()) {
- InitParameters initParameters = InitParameters.builder()
- .metaStorageNodes(node)
- .clusterName("cluster")
- .build();
-
- TestIgnitionManager.init(node, initParameters);
- }
-
- assertThat(node.waitForInitAsync(), willCompleteSuccessfully());
-
- Ignite ignite = node.api();
-
- CLUSTER_NODES.add(idx, ignite);
-
- return unwrapIgniteImpl(ignite);
- }
-
- /**
- * Start node with the given parameters.
- *
- * @param testInfo Test info.
- * @param idx Node index, is used to stop the node later, see {@link
#stopNode(int)}.
- * @return Created node instance.
- */
- private IgniteImpl startNode(TestInfo testInfo, int idx) {
- int port = DEFAULT_NODE_PORT + idx;
- String nodeName = testNodeName(testInfo, port);
- String cfgString = configurationString(idx);
-
- return startNode(idx, nodeName, cfgString, workDir.resolve(nodeName));
- }
-
- /** {@inheritDoc} */
- @Override
- protected void stopNode(int idx) {
- IgniteServer node = IGNITE_SERVERS.get(idx);
-
- if (node != null) {
- node.shutdown();
-
- CLUSTER_NODES.set(idx, null);
- IGNITE_SERVERS.set(idx, null);
- }
- }
-
/**
* Restarts an in-memory node that is not a leader of the table's
partition.
*/
@Test
public void inMemoryNodeRestartNotLeader(TestInfo testInfo) throws
Exception {
// Start three nodes, the first one is going to be CMG and MetaStorage
leader.
- IgniteImpl ignite = startNode(testInfo, 0);
- startNode(testInfo, 1);
- startNode(testInfo, 2);
+ IgniteImpl ignite = startNode(0);
+ startNode(1);
+ startNode(2);
// Create a table with replica on every node.
createTableWithData(ignite, TABLE_NAME, 3, 1);
@@ -205,7 +110,7 @@ public class ItIgniteInMemoryNodeRestartTest extends BaseIgniteRestartTest {
// Restart the node.
stopNode(idxToStop);
- IgniteImpl restartingNode = startNode(testInfo, idxToStop);
+ IgniteImpl restartingNode = startNode(idxToStop);
log.info("Restarted node {}", restartingNode.name());
@@ -286,9 +191,9 @@ public class ItIgniteInMemoryNodeRestartTest extends BaseIgniteRestartTest {
@Test
public void inMemoryNodeRestartNoMajority(TestInfo testInfo) throws
Exception {
// Start three nodes, the first one is going to be CMG and MetaStorage
leader.
- IgniteImpl ignite0 = startNode(testInfo, 0);
- startNode(testInfo, 1);
- startNode(testInfo, 2);
+ IgniteImpl ignite0 = startNode(0);
+ startNode(1);
+ startNode(2);
// Create a table with replica on every node.
createTableWithData(ignite0, TABLE_NAME, 3, 1);
@@ -299,7 +204,7 @@ public class ItIgniteInMemoryNodeRestartTest extends BaseIgniteRestartTest {
stopNode(1);
stopNode(2);
- IgniteImpl restartingNode = startNode(testInfo, 1);
+ IgniteImpl restartingNode = startNode(1);
Loza loza = restartingNode.raftManager();
@@ -325,9 +230,9 @@ public class ItIgniteInMemoryNodeRestartTest extends BaseIgniteRestartTest {
@Test
public void inMemoryNodeFullPartitionRestart(TestInfo testInfo) throws
Exception {
// Start three nodes, the first one is going to be CMG and MetaStorage
leader.
- IgniteImpl ignite0 = startNode(testInfo, 0);
- startNode(testInfo, 1);
- startNode(testInfo, 2);
+ IgniteImpl ignite0 = startNode(0);
+ startNode(1);
+ startNode(2);
// Create a table with replicas on every node.
createTableWithData(ignite0, TABLE_NAME, 3, 1);
@@ -338,9 +243,9 @@ public class ItIgniteInMemoryNodeRestartTest extends BaseIgniteRestartTest {
stopNode(1);
stopNode(2);
- startNode(testInfo, 0);
- startNode(testInfo, 1);
- startNode(testInfo, 2);
+ startNode(0);
+ startNode(1);
+ startNode(2);
// Check that full partition restart happens.
for (int i = 0; i < 3; i++) {
@@ -414,7 +319,8 @@ public class ItIgniteInMemoryNodeRestartTest extends BaseIgniteRestartTest {
}
private static boolean tableHasDataOnAllIgnites(String name, int
partitions) {
- return CLUSTER_NODES.stream()
+ return IGNITE_SERVERS.stream()
+ .map(s -> unwrapIgniteImpl(s.api()))
.allMatch(igniteNode ->
tableHasAnyData(unwrapTableViewInternal(igniteNode.tables().table(name)),
partitions));
}
@@ -430,6 +336,6 @@ public class ItIgniteInMemoryNodeRestartTest extends BaseIgniteRestartTest {
}
private static IgniteImpl ignite(int idx) {
- return unwrapIgniteImpl(CLUSTER_NODES.get(idx));
+ return unwrapIgniteImpl(IGNITE_SERVERS.get(idx).api());
}
}
diff --git a/modules/runner/src/testFixtures/java/org/apache/ignite/internal/BaseIgniteRestartTest.java b/modules/runner/src/testFixtures/java/org/apache/ignite/internal/BaseIgniteRestartTest.java
index 92362dc2219..0e3ca22d5f4 100644
--- a/modules/runner/src/testFixtures/java/org/apache/ignite/internal/BaseIgniteRestartTest.java
+++ b/modules/runner/src/testFixtures/java/org/apache/ignite/internal/BaseIgniteRestartTest.java
@@ -35,14 +35,21 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.stream.Collectors;
import java.util.stream.IntStream;
+import java.util.stream.Stream;
import org.apache.ignite.IgniteServer;
import org.apache.ignite.InitParameters;
import org.apache.ignite.configuration.ConfigurationModule;
import org.apache.ignite.internal.app.IgniteImpl;
import org.apache.ignite.internal.close.ManuallyCloseable;
+import
org.apache.ignite.internal.cluster.management.ClusterManagementGroupManager;
+import org.apache.ignite.internal.cluster.management.ClusterState;
import org.apache.ignite.internal.cluster.management.topology.LogicalTopology;
import
org.apache.ignite.internal.cluster.management.topology.LogicalTopologyImpl;
import org.apache.ignite.internal.configuration.ConfigurationManager;
@@ -137,25 +144,89 @@ public abstract class BaseIgniteRestartTest extends IgniteAbstractTest {
*/
@AfterEach
public void afterEachTest() throws Exception {
- var closeables = new ArrayList<AutoCloseable>();
+ var nonCmgMsNodesToStop = new ArrayList<AutoCloseable>();
+ var cmgMsNodesToStop = new ArrayList<AutoCloseable>();
- for (IgniteServer node : IGNITE_SERVERS) {
- if (node != null) {
- closeables.add(node::shutdown);
+ List<String> serverNames = IGNITE_SERVERS.stream()
+ .filter(Objects::nonNull)
+ .map(IgniteServer::name)
+ .collect(toList());
+
+ List<String> partialNodeNames = this.partialNodes.stream()
+ .filter(Objects::nonNull)
+ .map(PartialNode::name)
+ .collect(toList());
+
+ log.info("Shutting the cluster down [serverNodes={},
partialNodes={}]", serverNames, partialNodeNames);
+
+ Optional<PartialNode> anyPartialNode = this.partialNodes.stream()
+ .filter(Objects::nonNull)
+ .findAny();
+
+ if (anyPartialNode.isPresent()) {
+ ClusterManagementGroupManager component = findComponent(
+ anyPartialNode.get().startedComponents(),
+ ClusterManagementGroupManager.class
+ );
+
+ Set<String> cmgMsPartialNodesNames = cmgMsNodes(component);
+
+ for (PartialNode partialNode :
partialNodes.stream().filter(Objects::nonNull).collect(toList())) {
+ if (!cmgMsPartialNodesNames.contains(partialNode.name())) {
+ nonCmgMsNodesToStop.add(partialNode::stop);
+ } else {
+ cmgMsNodesToStop.add(partialNode::stop);
+ }
}
}
- if (!partialNodes.isEmpty()) {
- for (PartialNode partialNode : partialNodes) {
- closeables.add(partialNode::stop);
+ Optional<IgniteServer> anyServerNode = IGNITE_SERVERS.stream()
+ .filter(Objects::nonNull)
+ .findAny();
+
+ if (anyServerNode.isPresent()) {
+ IgniteImpl ignite = unwrapIgniteImpl(anyServerNode.get().api());
+
+ Set<String> cmgMsNodesNames =
cmgMsNodes(ignite.clusterManagementGroupManager());
+
+ for (IgniteServer node :
IGNITE_SERVERS.stream().filter(Objects::nonNull).collect(toList())) {
+ if (!cmgMsNodesNames.contains(node.name())) {
+ nonCmgMsNodesToStop.add(node::shutdown);
+ } else {
+ cmgMsNodesToStop.add(node::shutdown);
+ }
}
}
- closeAll(closeables);
+ closeAll(nonCmgMsNodesToStop);
+ closeAll(cmgMsNodesToStop);
+ partialNodes.clear();
IGNITE_SERVERS.clear();
}
+ /**
+ * Returns the set of nodes' names that host Meta Storage and CMG.
+ *
+ * @param cmgManager Cluster management group manager.
+ * @return Set of node names.
+ * @throws Exception If failed to get cluster state.
+ */
+ private static Set<String> cmgMsNodes(ClusterManagementGroupManager
cmgManager) throws Exception {
+ CompletableFuture<ClusterState> stateFut = cmgManager.clusterState();
+
+ assertThat(stateFut, willCompleteSuccessfully());
+
+ if (stateFut.get() == null) {
+ return Set.of();
+ }
+
+ return Stream.concat(
+ stateFut.get().metaStorageNodes().stream(),
+ stateFut.get().cmgNodes().stream()
+ ).collect(Collectors.toSet());
+ }
+
/**
* Load configuration modules.
*