This is an automated email from the ASF dual-hosted git repository.

dsmiley pushed a commit to branch branch_10x
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_10x by this push:
     new b6983892ba3 SOLR-18002: idle timeouts should cause servers to be added 
to the zombie list (#3891)
b6983892ba3 is described below

commit b6983892ba3b6d129a766f07b5377c24b523de56
Author: jvanneman <[email protected]>
AuthorDate: Sun Jan 25 10:05:32 2026 -0500

    SOLR-18002: idle timeouts should cause servers to be added to the zombie 
list (#3891)
    
    CloudSolrClient/LBSolrClient should consider a retry-able request that 
times out as another condition to internally mark that replica as a "zombie".
    
    Previously, unresponsive servers continued to receive traffic and high 
client latencies as the idle timeout is consistently triggered on every request 
to that replica.
---
 ...002-add-unresponsive-servers-to-zombie-list.yml |   8 +
 .../pages/configuring-solr-xml.adoc                |   5 +-
 .../query-guide/pages/common-query-parameters.adoc |   3 +-
 .../solr/client/solrj/impl/LBAsyncSolrClient.java  |  16 +-
 .../solr/client/solrj/impl/LBSolrClient.java       |  16 +-
 .../solr/client/solrj/impl/LB2SolrClientTest.java  | 188 +++++++++++++++++++++
 6 files changed, 230 insertions(+), 6 deletions(-)

diff --git 
a/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml 
b/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml
new file mode 100644
index 00000000000..b658a667c37
--- /dev/null
+++ 
b/changelog/unreleased/SOLR-18002-add-unresponsive-servers-to-zombie-list.yml
@@ -0,0 +1,8 @@
+# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
+title: CloudSolrClient/LBSolrClient should consider a retry-able request that 
times out as another condition to internally mark that replica as a "zombie".
+type: changed # added, changed, fixed, deprecated, removed, dependency_update, 
security, other
+authors:
+  - name: James Vanneman
+links:
+  - name: SOLR-18002
+    url: https://issues.apache.org/jira/browse/SOLR-18002
diff --git 
a/solr/solr-ref-guide/modules/configuration-guide/pages/configuring-solr-xml.adoc
 
b/solr/solr-ref-guide/modules/configuration-guide/pages/configuring-solr-xml.adoc
index a1032f5ecf7..2aed51b89e3 100644
--- 
a/solr/solr-ref-guide/modules/configuration-guide/pages/configuring-solr-xml.adoc
+++ 
b/solr/solr-ref-guide/modules/configuration-guide/pages/configuring-solr-xml.adoc
@@ -522,7 +522,7 @@ Custom shard handlers are also supported and should be 
referenced in `solr.xml`
 
 Sub-elements of `<shardHandlerFactory>` may vary in the case of custom shard 
handlers, but both `HttpShardHandlerFactory` and `ParallelShardHandlerFactory` 
support the following configuration options:
 
-`socketTimeout`::
+[[sockettimeout]]`socketTimeout`::
 +
 [%autowidth,frame=none]
 |===
@@ -531,6 +531,9 @@ Sub-elements of `<shardHandlerFactory>` may vary in the 
case of custom shard han
 +
 The read timeout for intra-cluster query and administrative requests.
 The default is the same as the `distribUpdateSoTimeout` specified in the 
`<solrcloud>` section.
++
+It is recommended to set this value to be larger than any 
xref:query-guide:common-query-parameters.adoc#timeallowed-parameter[`timeAllowed`]
 query parameter used to allow  
xref:query-guide:common-query-parameters.adoc#timeallowed-parameter[`timeAllowed`]
  to gracefully finish the request processing and return partial results before 
the coordinator gives up on the request.
+
 
 `connTimeout`::
 +
diff --git 
a/solr/solr-ref-guide/modules/query-guide/pages/common-query-parameters.adoc 
b/solr/solr-ref-guide/modules/query-guide/pages/common-query-parameters.adoc
index 49c418fd749..4e69ef4855f 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/common-query-parameters.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/common-query-parameters.adoc
@@ -350,7 +350,8 @@ This parameter specifies the amount of time, in 
milliseconds, allowed for a sear
 If this time expires before the search is complete, any partial results will 
be returned, but values such as `numFound`, xref:faceting.adoc[facet] counts, 
and result xref:stats-component.adoc[stats] may not be accurate for the entire 
result set.
 In case of expiration, if `omitHeader` isn't set to `true` the response header 
contains a special flag called `partialResults`.
 When using `timeAllowed` in combination with 
xref:pagination-of-results.adoc#using-cursors[`cursorMark`], and the 
`partialResults` flag is present, some matching documents may have been skipped 
in the result set.
-Additionally, if  the `partialResults` flag is present, `cursorMark` can match 
`nextCursorMark` even if there may be more results
+Additionally, if  the `partialResults` flag is present, `cursorMark` can match 
`nextCursorMark` even if there may be more results.
+It is recommended to set this value to be smaller than the 
xref:configuration-guide:configuring-solr-xml.adoc#sockettimeout[socketTimeout] 
configured in `solr.xml` to allow Solr to gracefully finish the request 
processing and return partial results before the coordinator gives up on the 
request.
 
 [source,json]
 ----
diff --git 
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBAsyncSolrClient.java 
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBAsyncSolrClient.java
index c02197c2d4d..48dab986d80 100644
--- 
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBAsyncSolrClient.java
+++ 
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBAsyncSolrClient.java
@@ -21,6 +21,8 @@ import java.net.ConnectException;
 import java.net.SocketException;
 import java.net.SocketTimeoutException;
 import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicReference;
 import org.apache.solr.client.solrj.RemoteSolrException;
 import org.apache.solr.client.solrj.SolrClient;
@@ -179,6 +181,9 @@ public abstract class LBAsyncSolrClient extends 
LBSolrClient {
       boolean isNonRetryable,
       boolean isZombie,
       RetryListener listener) {
+    if (oe instanceof CompletionException) {
+      oe = oe.getCause();
+    }
     try {
       throw (Exception) oe;
     } catch (SolrException e) {
@@ -210,9 +215,16 @@ public abstract class LBAsyncSolrClient extends 
LBSolrClient {
       }
     } catch (SolrServerException e) {
       Throwable rootCause = e.getRootCause();
-      if (!isNonRetryable && rootCause instanceof IOException) {
+      if (!isNonRetryable
+          && (rootCause instanceof IOException || rootCause instanceof 
TimeoutException)) {
+        listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, 
true);
+      } else if (isNonRetryable && isConnectException(rootCause)) {
         listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, 
true);
-      } else if (isNonRetryable && rootCause instanceof ConnectException) {
+      } else {
+        listener.onFailure(e, false);
+      }
+    } catch (IOException e) {
+      if (!isNonRetryable || isConnectException(e)) {
         listener.onFailure((!isZombie) ? makeServerAZombie(endpoint, e) : e, 
true);
       } else {
         listener.onFailure(e, false);
diff --git 
a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java 
b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
index fc0d06c7f29..74e55e9629d 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/LBSolrClient.java
@@ -23,6 +23,7 @@ import java.lang.ref.WeakReference;
 import java.net.ConnectException;
 import java.net.SocketException;
 import java.net.SocketTimeoutException;
+import java.net.http.HttpConnectTimeoutException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -38,6 +39,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
 import org.apache.solr.client.solrj.RemoteSolrException;
@@ -665,9 +667,10 @@ public abstract class LBSolrClient extends SolrClient {
       }
     } catch (SolrServerException e) {
       Throwable rootCause = e.getRootCause();
-      if (!isNonRetryable && rootCause instanceof IOException) {
+      if (!isNonRetryable
+          && (rootCause instanceof IOException || rootCause instanceof 
TimeoutException)) {
         ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
-      } else if (isNonRetryable && rootCause instanceof ConnectException) {
+      } else if (isNonRetryable && isConnectException(rootCause)) {
         ex = (!isZombie) ? makeServerAZombie(baseUrl, e) : e;
       } else {
         throw e;
@@ -679,6 +682,15 @@ public abstract class LBSolrClient extends SolrClient {
     return ex;
   }
 
+  protected boolean isConnectException(Throwable t) {
+    if (t instanceof ConnectException || t instanceof 
HttpConnectTimeoutException) {
+      return true;
+    }
+    // Check for common connection timeout exceptions by name to avoid hard 
dependencies on
+    // specific HTTP client libraries (e.g., Jetty or Apache HttpClient).
+    return t != null && 
t.getClass().getName().endsWith("ConnectTimeoutException");
+  }
+
   protected abstract SolrClient getClient(Endpoint endpoint);
 
   private void startAliveCheckExecutor() {
diff --git 
a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LB2SolrClientTest.java 
b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LB2SolrClientTest.java
index 79c24dfd62d..66e806e9538 100644
--- 
a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LB2SolrClientTest.java
+++ 
b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/LB2SolrClientTest.java
@@ -18,6 +18,8 @@ package org.apache.solr.client.solrj.impl;
 
 import java.io.IOException;
 import java.io.UncheckedIOException;
+import java.net.ServerSocket;
+import java.net.Socket;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
@@ -25,16 +27,22 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Properties;
 import java.util.Set;
+import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.TimeUnit;
 import org.apache.lucene.util.IOUtils;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.jetty.HttpJettySolrClient;
+import org.apache.solr.client.solrj.jetty.LBJettySolrClient;
+import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.request.SolrQuery;
+import org.apache.solr.client.solrj.request.UpdateRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.SolrResponseBase;
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.RetryUtil;
 import org.apache.solr.embedded.JettyConfig;
 import org.apache.solr.embedded.JettySolrRunner;
@@ -204,6 +212,62 @@ public class LB2SolrClientTest extends SolrTestCaseJ4 {
     }
   }
 
+  public void testTimeoutExceptionMarksServerAsZombie() throws Exception {
+    try (TimeoutZombieTestContext ctx = new TimeoutZombieTestContext()) {
+      LBSolrClient.Req lbReq = ctx.createQueryRequest();
+
+      try {
+        ctx.lbClient.request(lbReq);
+      } catch (Exception e) {
+      }
+
+      ctx.assertZombieState();
+    }
+  }
+
+  public void testTimeoutExceptionMarksServerAsZombieAsyncRequest() throws 
Exception {
+    try (TimeoutZombieTestContext ctx = new TimeoutZombieTestContext()) {
+      LBSolrClient.Req lbReq = ctx.createQueryRequest();
+
+      ctx.lbClient.requestAsync(lbReq).exceptionally(e -> null).get();
+
+      ctx.assertZombieState();
+    }
+  }
+
+  public void testConnectTimeoutExceptionMarksServerAsZombie() throws 
Exception {
+    try (ConnectTimeoutZombieTestContext ctx = new 
ConnectTimeoutZombieTestContext()) {
+      LBSolrClient.Req lbReq = ctx.createQueryRequest();
+
+      try {
+        ctx.lbClient.request(lbReq);
+      } catch (Exception e) {
+      }
+
+      ctx.assertZombieState();
+    }
+  }
+
+  public void testConnectTimeoutExceptionMarksServerAsZombieAsyncRequest() 
throws Exception {
+    try (ConnectTimeoutZombieTestContext ctx = new 
ConnectTimeoutZombieTestContext()) {
+      LBSolrClient.Req lbReq = ctx.createQueryRequest();
+
+      ctx.lbClient.requestAsync(lbReq).exceptionally(e -> null).get();
+
+      ctx.assertZombieState();
+    }
+  }
+
+  public void testConnectTimeoutExceptionMarksServerAsZombieAsyncUpdate() 
throws Exception {
+    try (ConnectTimeoutZombieTestContext ctx = new 
ConnectTimeoutZombieTestContext()) {
+      LBSolrClient.Req lbReq = ctx.createUpdateRequest();
+
+      ctx.lbClient.requestAsync(lbReq).exceptionally(e -> null).get();
+
+      ctx.assertZombieState();
+    }
+  }
+
   private LBSolrClient.Endpoint[] bootstrapBaseSolrEndpoints(int max) {
     LBSolrClient.Endpoint[] solrUrls = new LBSolrClient.Endpoint[max];
     for (int i = 0; i < max; i++) {
@@ -318,4 +382,128 @@ public class LB2SolrClientTest extends SolrTestCaseJ4 {
       }
     }
   }
+
+  private class TimeoutZombieTestContext implements AutoCloseable {
+    final ServerSocket blackhole;
+    final LBSolrClient.Endpoint nonRoutableEndpoint;
+    final HttpJettySolrClient delegateClient;
+    final LBAsyncSolrClient lbClient;
+
+    TimeoutZombieTestContext() throws Exception {
+      // create a socket that allows a client to connect but causes them to 
hang until idleTimeout
+      // is triggered
+      blackhole = new ServerSocket(0);
+      int blackholePort = blackhole.getLocalPort();
+      nonRoutableEndpoint =
+          new LBSolrClient.Endpoint("http://localhost:" + blackholePort + 
"/solr");
+
+      delegateClient =
+          new HttpJettySolrClient.Builder()
+              .withConnectionTimeout(1000, TimeUnit.MILLISECONDS)
+              .withIdleTimeout(1, TimeUnit.MILLISECONDS)
+              .build();
+
+      lbClient = new LBJettySolrClient.Builder(delegateClient, 
nonRoutableEndpoint).build();
+    }
+
+    LBSolrClient.Req createQueryRequest() {
+      SolrQuery solrQuery = new SolrQuery("*:*");
+      QueryRequest queryRequest = new QueryRequest(solrQuery);
+
+      List<LBSolrClient.Endpoint> endpoints =
+          List.of(
+              new LBSolrClient.Endpoint(
+                  nonRoutableEndpoint.getBaseUrl(), 
solr[0].getDefaultCollection()));
+      return new LBSolrClient.Req(queryRequest, endpoints);
+    }
+
+    void assertZombieState() {
+      assertTrue(
+          "Non-routable endpoint should be marked as zombie due to timeout",
+          lbClient.zombieServers.containsKey(
+              nonRoutableEndpoint.getBaseUrl() + "/" + 
solr[0].getDefaultCollection()));
+    }
+
+    @Override
+    public void close() {
+      lbClient.close();
+      delegateClient.close();
+      try {
+        blackhole.close();
+      } catch (IOException ioe) {
+
+      }
+    }
+  }
+
+  private class ConnectTimeoutZombieTestContext implements AutoCloseable {
+    final ServerSocket ss;
+    final Socket connector;
+    final LBSolrClient.Endpoint nonRoutableEndpoint;
+    final LBAsyncSolrClient lbClient;
+    final HttpJdkSolrClient delegateClient;
+
+    ConnectTimeoutZombieTestContext() throws Exception {
+      // Create a server socket with a backlog of 1 and occupy that slot to 
trigger a connect
+      // timeout.
+      ss = new ServerSocket(0, 1);
+      int port = ss.getLocalPort();
+      connector = new Socket("127.0.0.1", port);
+
+      nonRoutableEndpoint = new LBSolrClient.Endpoint("http://127.0.0.1:" + 
port + "/solr");
+      delegateClient =
+          new HttpJdkSolrClient.Builder(nonRoutableEndpoint.getBaseUrl())
+              .withConnectionTimeout(1, TimeUnit.MILLISECONDS)
+              .build();
+
+      lbClient =
+          new LBAsyncSolrClient(
+              new LBSolrClient.Builder<>(delegateClient, nonRoutableEndpoint)
+                  .withDefaultCollection(solr[0].getDefaultCollection())) {
+            @Override
+            protected CompletableFuture<NamedList<Object>> requestAsyncWithUrl(
+                SolrClient client, String baseUrl, SolrRequest<?> request)
+                throws SolrServerException, IOException {
+              return ((HttpJdkSolrClient) client).requestAsync(request, null);
+            }
+          };
+    }
+
+    LBSolrClient.Req createQueryRequest() {
+      SolrQuery solrQuery = new SolrQuery("*:*");
+      QueryRequest queryRequest = new QueryRequest(solrQuery);
+
+      List<LBSolrClient.Endpoint> endpoints =
+          List.of(
+              new LBSolrClient.Endpoint(
+                  nonRoutableEndpoint.getBaseUrl(), 
solr[0].getDefaultCollection()));
+      return new LBSolrClient.Req(queryRequest, endpoints);
+    }
+
+    LBSolrClient.Req createUpdateRequest() {
+      UpdateRequest updateRequest = new UpdateRequest();
+      updateRequest.add(new SolrInputDocument());
+
+      List<LBSolrClient.Endpoint> endpoints =
+          List.of(
+              new LBSolrClient.Endpoint(
+                  nonRoutableEndpoint.getBaseUrl(), 
solr[0].getDefaultCollection()));
+      return new LBSolrClient.Req(updateRequest, endpoints);
+    }
+
+    void assertZombieState() {
+      assertTrue(
+          "Endpoint should be marked as zombie due to connect timeout",
+          lbClient.zombieServers.containsKey(
+              nonRoutableEndpoint.getBaseUrl() + "/" + 
solr[0].getDefaultCollection()));
+    }
+
+    @Override
+    public void close() throws IOException {
+      lbClient.close();
+      delegateClient.close();
+      connector.close();
+      ss.close();
+    }
+  }
 }

Reply via email to