This is an automated email from the ASF dual-hosted git repository.
psalagnac pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new a2b3f409e24 SOLR-18155: Abort shard leader election if container
shutdown has started (#4224)
a2b3f409e24 is described below
commit a2b3f409e24f88e85be01f745db086340fdd0895
Author: Pierre Salagnac <[email protected]>
AuthorDate: Wed Mar 18 10:27:55 2026 +0100
SOLR-18155: Abort shard leader election if container shutdown has started
(#4224)
This is mostly for tests. It ensures that a replica is not elected leader,
even for a very short time, while all nodes are shutting down.
(cherry picked from commit 8873cf1242be0787503530079be23f30c55916a0)
---
changelog/unreleased/SOLR-18155-election-leak.yml | 7 +++++++
.../apache/solr/cloud/ShardLeaderElectionContext.java | 18 +++++++++++++-----
2 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/changelog/unreleased/SOLR-18155-election-leak.yml
b/changelog/unreleased/SOLR-18155-election-leak.yml
new file mode 100644
index 00000000000..a43436d0980
--- /dev/null
+++ b/changelog/unreleased/SOLR-18155-election-leak.yml
@@ -0,0 +1,7 @@
+title: Abort shard leader election if container shutdown sequence has started,
so we don't have leaders elected very late and not properly closed.
+type: fixed
+authors:
+ - name: Pierre Salagnac
+links:
+ - name: SOLR-18155
+ url: https://issues.apache.org/jira/browse/SOLR-18155
diff --git
a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
index edee0f0c6e7..5d4a8a365d9 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
@@ -75,6 +75,14 @@ final class ShardLeaderElectionContext extends
ShardLeaderElectionContextBase {
syncStrategy.close();
}
+ /**
+ * Internally check whether we should abort the election process. This
returns true if either this
+ * context was explicitly closed, or the Solr server is being shut down.
+ */
+ private boolean shouldAbort() {
+ return isClosed || cc.isShutDown();
+ }
+
@Override
public void cancelElection() throws InterruptedException, KeeperException {
String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
@@ -151,7 +159,7 @@ final class ShardLeaderElectionContext extends
ShardLeaderElectionContextBase {
areAllReplicasParticipating();
}
- if (isClosed) {
+ if (shouldAbort()) {
// Solr is shutting down or the ZooKeeper session expired while
waiting for replicas. If the
// latter, we cannot be sure we are still the leader, so we should bail
out. The OnReconnect
// handler will re-register the cores and handle a new leadership
election.
@@ -182,7 +190,7 @@ final class ShardLeaderElectionContext extends
ShardLeaderElectionContextBase {
}
}
- if (isClosed) {
+ if (shouldAbort()) {
return;
}
@@ -264,7 +272,7 @@ final class ShardLeaderElectionContext extends
ShardLeaderElectionContextBase {
}
}
- if (!isClosed) {
+ if (!shouldAbort()) {
try {
if (replicaType == Replica.Type.TLOG) {
// stop replicate from old leader
@@ -356,7 +364,7 @@ final class ShardLeaderElectionContext extends
ShardLeaderElectionContextBase {
ZkShardTerms zkShardTerms, String coreNodeName, int timeout) throws
InterruptedException {
long timeoutAt =
System.nanoTime() + TimeUnit.NANOSECONDS.convert(timeout,
TimeUnit.MILLISECONDS);
- while (!isClosed && !cc.isShutDown()) {
+ while (!shouldAbort()) {
if (System.nanoTime() > timeoutAt) {
log.warn(
"After waiting for {}ms, no other potential leader was found, {}
try to become leader anyway (core_term:{}, highest_term:{})",
@@ -441,7 +449,7 @@ final class ShardLeaderElectionContext extends
ShardLeaderElectionContextBase {
DocCollection docCollection =
zkController.getClusterState().getCollectionOrNull(collection);
Slice slices = (docCollection == null) ? null :
docCollection.getSlice(shardId);
int cnt = 0;
- while (!isClosed && !cc.isShutDown()) {
+ while (!shouldAbort()) {
// wait for everyone to be up
if (slices != null) {
int found = 0;