This is an automated email from the ASF dual-hosted git repository.
sanpwc pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/ignite-3.git
The following commit(s) were added to refs/heads/main by this push:
new da337a2b8a7 IGNITE-26421 Make node stop time unbounded (#6933)
da337a2b8a7 is described below
commit da337a2b8a78dca8995a2899603c316c410f3505
Author: Denis Chudov <[email protected]>
AuthorDate: Fri Nov 21 11:25:54 2025 +0400
IGNITE-26421 Make node stop time unbounded (#6933)
---
.../apache/ignite/internal/thread/ThreadUtils.java | 3 +-
.../internal/tostring/IgniteToStringBuilder.java | 2 +-
.../ignite/internal/failure/FailureManager.java | 4 +--
.../PartitionReplicaLifecycleManager.java | 38 ++++++++++++++++++----
.../ignite/internal/replicator/ReplicaManager.java | 1 +
5 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/modules/core/src/main/java/org/apache/ignite/internal/thread/ThreadUtils.java b/modules/core/src/main/java/org/apache/ignite/internal/thread/ThreadUtils.java
index 4a6e84417ae..f64f479a0dd 100644
--- a/modules/core/src/main/java/org/apache/ignite/internal/thread/ThreadUtils.java
+++ b/modules/core/src/main/java/org/apache/ignite/internal/thread/ThreadUtils.java
@@ -31,6 +31,7 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.ignite.internal.lang.IgniteStringFormatter;
import org.apache.ignite.internal.logger.IgniteLogger;
+import org.jetbrains.annotations.Nullable;
/**
* This class contains utility methods for working with threads.
@@ -61,7 +62,7 @@ public class ThreadUtils {
* @param isErrorLevel {@code true} if thread dump must be printed with ERROR logging level,
* {@code false} if thread dump must be printed with WARN logging level.
*/
- public static void dumpThreads(IgniteLogger log, String message, boolean isErrorLevel) {
+ public static void dumpThreads(IgniteLogger log, @Nullable String message, boolean isErrorLevel) {
ThreadMXBean mxBean = ManagementFactory.getThreadMXBean();
Set<Long> deadlockedThreadsIds = getDeadlockedThreadIds(mxBean);
diff --git a/modules/core/src/main/java/org/apache/ignite/internal/tostring/IgniteToStringBuilder.java b/modules/core/src/main/java/org/apache/ignite/internal/tostring/IgniteToStringBuilder.java
index 3a195678846..e86b1c95f83 100644
--- a/modules/core/src/main/java/org/apache/ignite/internal/tostring/IgniteToStringBuilder.java
+++ b/modules/core/src/main/java/org/apache/ignite/internal/tostring/IgniteToStringBuilder.java
@@ -83,7 +83,7 @@ public class IgniteToStringBuilder {
private static final Object[] EMPTY_ARRAY = new Object[0];
/** Max collection elements to be written. */
- private static final int COLLECTION_LIMIT = 100;
+ public static final int COLLECTION_LIMIT = 100;
/** {@link #sensitiveDataPolicy} var handle. */
private static final VarHandle SENSITIVE_DATA_POLICY;
diff --git a/modules/failure-handler/src/main/java/org/apache/ignite/internal/failure/FailureManager.java b/modules/failure-handler/src/main/java/org/apache/ignite/internal/failure/FailureManager.java
index 8d0b68a985e..1927a74c6ec 100644
--- a/modules/failure-handler/src/main/java/org/apache/ignite/internal/failure/FailureManager.java
+++ b/modules/failure-handler/src/main/java/org/apache/ignite/internal/failure/FailureManager.java
@@ -201,9 +201,9 @@ public class FailureManager implements FailureProcessor, IgniteComponent {
}
if (dumpThreadsOnFailure && !throttleThreadDump(failureCtx)) {
- String ctxId = format(" [failureCtxId={}]", failureCtx.id());
+ String ctxIdMsg = format(" [failureCtxId={}]", failureCtx.id());
- ThreadUtils.dumpThreads(LOG, ctxId, !handler.ignoredFailureTypes().contains(failureCtx.type()));
+ ThreadUtils.dumpThreads(LOG, ctxIdMsg, !handler.ignoredFailureTypes().contains(failureCtx.type()));
}
boolean invalidated = handler.onFailure(failureCtx);
diff --git a/modules/partition-replicator/src/main/java/org/apache/ignite/internal/partition/replicator/PartitionReplicaLifecycleManager.java b/modules/partition-replicator/src/main/java/org/apache/ignite/internal/partition/replicator/PartitionReplicaLifecycleManager.java
index c98a7a1ae03..f3fee7e95cc 100644
--- a/modules/partition-replicator/src/main/java/org/apache/ignite/internal/partition/replicator/PartitionReplicaLifecycleManager.java
+++ b/modules/partition-replicator/src/main/java/org/apache/ignite/internal/partition/replicator/PartitionReplicaLifecycleManager.java
@@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Collections.emptySet;
import static java.util.concurrent.CompletableFuture.allOf;
import static java.util.concurrent.CompletableFuture.completedFuture;
+import static java.util.concurrent.CompletableFuture.delayedExecutor;
import static java.util.concurrent.CompletableFuture.failedFuture;
import static java.util.concurrent.CompletableFuture.supplyAsync;
import static java.util.function.Function.identity;
@@ -44,6 +45,7 @@ import static org.apache.ignite.internal.distributionzones.rebalance.ZoneRebalan
import static org.apache.ignite.internal.hlc.HybridTimestamp.LOGICAL_TIME_BITS_SIZE;
import static org.apache.ignite.internal.hlc.HybridTimestamp.hybridTimestamp;
import static org.apache.ignite.internal.hlc.HybridTimestamp.nullableHybridTimestamp;
+import static org.apache.ignite.internal.lang.IgniteStringFormatter.format;
import static org.apache.ignite.internal.metastorage.dsl.Conditions.notExists;
import static org.apache.ignite.internal.metastorage.dsl.Operations.put;
import static org.apache.ignite.internal.partition.replicator.LocalPartitionReplicaEvent.AFTER_REPLICA_DESTROYED;
@@ -54,6 +56,7 @@ import static org.apache.ignite.internal.partitiondistribution.Assignments.assig
import static org.apache.ignite.internal.partitiondistribution.PartitionDistributionUtils.calculateAssignmentForPartition;
import static org.apache.ignite.internal.partitiondistribution.PartitionDistributionUtils.calculateAssignments;
import static org.apache.ignite.internal.raft.PeersAndLearners.fromAssignments;
+import static org.apache.ignite.internal.tostring.IgniteToStringBuilder.COLLECTION_LIMIT;
import static org.apache.ignite.internal.util.ByteUtils.toByteArray;
import static org.apache.ignite.internal.util.CompletableFutures.falseCompletedFuture;
import static org.apache.ignite.internal.util.CompletableFutures.nullCompletedFuture;
@@ -72,7 +75,6 @@ import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ScheduledExecutorService;
@@ -108,7 +110,6 @@ import org.apache.ignite.internal.hlc.HybridTimestamp;
import org.apache.ignite.internal.lang.ByteArray;
import org.apache.ignite.internal.lang.ComponentStoppingException;
import org.apache.ignite.internal.lang.IgniteInternalException;
-import org.apache.ignite.internal.lang.IgniteStringFormatter;
import org.apache.ignite.internal.lang.NodeStoppingException;
import org.apache.ignite.internal.logger.IgniteLogger;
import org.apache.ignite.internal.logger.Loggers;
@@ -154,6 +155,8 @@ import org.apache.ignite.internal.schema.SchemaManager;
import org.apache.ignite.internal.schema.SchemaSyncService;
import org.apache.ignite.internal.storage.DataStorageManager;
import org.apache.ignite.internal.storage.engine.StorageEngine;
+import org.apache.ignite.internal.thread.ThreadUtils;
+import org.apache.ignite.internal.tostring.S;
import org.apache.ignite.internal.tx.TxManager;
import org.apache.ignite.internal.tx.storage.state.TxStatePartitionStorage;
import org.apache.ignite.internal.tx.storage.state.rocksdb.TxStateRocksDbSharedStorage;
@@ -563,7 +566,7 @@ public class PartitionReplicaLifecycleManager extends
int partitionCount,
boolean onNodeRecovery
) {
- assert stableAssignmentsForZone != null : IgniteStringFormatter.format("Zone has empty assignments [id={}].", zoneId);
+ assert stableAssignmentsForZone != null : format("Zone has empty assignments [id={}].", zoneId);
Supplier<CompletableFuture<Void>> createZoneReplicationNodes = () -> inBusyLockAsync(busyLock, () -> {
var partitionsStartFutures = new CompletableFuture<?>[stableAssignmentsForZone.size()];
@@ -1678,12 +1681,35 @@ public class PartitionReplicaLifecycleManager extends
.toArray(CompletableFuture[]::new);
try {
- allOf(stopPartitionsFuture).get(30, TimeUnit.SECONDS);
- } catch (InterruptedException | ExecutionException | TimeoutException e) {
- LOG.error("Unable to clean up zones resources", e);
+ CompletableFuture<Void> fut = allOf(stopPartitionsFuture);
+
+ delayedExecutor(30, TimeUnit.SECONDS, Runnable::run)
+ .execute(() -> {
+ if (!fut.isDone()) {
+ printPartitionState(partitionIds);
+ }
+ });
+
+ fut.get();
+ } catch (Throwable e) {
+ failureProcessor.process(new FailureContext(e, "Unable to clean up zones resources"));
}
}
+ private void printPartitionState(Stream<ZonePartitionId> partitionIds) {
+ List<ZonePartitionId> nonStoppedPartitions = partitionIds
+ .filter(partId -> replicaMgr.replica(partId) != null)
+ .collect(toList());
+
+ int exceedLimit = nonStoppedPartitions.size() - COLLECTION_LIMIT;
+
+ String partitionsStr = "There are still some partitions that are being stopped: "
+ + S.toString(nonStoppedPartitions, (sb, e, i) -> sb.app(e).app(i < nonStoppedPartitions.size() - 1 ? ", " : ""))
+ + (exceedLimit > 0 ? format(" and {} more; ", exceedLimit) : "; ");
+
+ ThreadUtils.dumpThreads(LOG, partitionsStr, false);
+ }
+
/**
* Lock the zones replica list for any changes. {@link #hasLocalPartition(ZonePartitionId)} must be executed under this lock always.
*
diff --git a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java
index 1d92e8ff09f..ec8b2bbaeb8 100644
--- a/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java
+++ b/modules/replicator/src/main/java/org/apache/ignite/internal/replicator/ReplicaManager.java
@@ -784,6 +784,7 @@ public class ReplicaManager extends AbstractEventProducer<LocalReplicaEvent, Loc
* @param replicationGroupId Table-Partition identifier.
* @return replica if it was created or null otherwise.
*/
+ @Nullable
public CompletableFuture<Replica> replica(ReplicationGroupId replicationGroupId) {
return replicas.get(replicationGroupId);
}