This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new 5d2eb05506a SOLR-16458: Migrate NodeHealthAPI from homegrown @EndPoint to JAX-RS (#4171)
5d2eb05506a is described below
commit 5d2eb05506af12759caa414ba7ac063e311a3247
Author: Eric Pugh <[email protected]>
AuthorDate: Sat Mar 21 14:21:30 2026 -0400
SOLR-16458: Migrate NodeHealthAPI from homegrown @EndPoint to JAX-RS (#4171)
Co-authored-by: copilot-swe-agent[bot]
<[email protected]>
Co-authored-by: epugh <[email protected]>
---
.../SOLR-16458-migrate-node-health-api.yml | 8 +
.../solr/client/api/endpoint/NodeHealthApi.java | 43 ++++
.../solr/client/api/model/NodeHealthResponse.java | 36 +++
.../solr/handler/admin/HealthCheckHandler.java | 264 ++------------------
.../apache/solr/handler/admin/api/NodeHealth.java | 277 +++++++++++++++++++++
.../solr/handler/admin/api/NodeHealthAPI.java | 48 ----
.../solr/handler/admin/HealthCheckHandlerTest.java | 94 ++++---
.../handler/admin/api/NodeHealthSolrCloudTest.java | 129 ++++++++++
.../admin/api/NodeHealthStandaloneTest.java | 66 +++++
.../handler/admin/api/V2NodeAPIMappingTest.java | 23 --
.../pages/implicit-requesthandlers.adoc | 14 +-
.../pages/user-managed-index-replication.adoc | 57 +++++
.../java/org/apache/solr/common/util/Utils.java | 6 +
13 files changed, 712 insertions(+), 353 deletions(-)
diff --git a/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml
b/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml
new file mode 100644
index 00000000000..59dc710934d
--- /dev/null
+++ b/changelog/unreleased/SOLR-16458-migrate-node-health-api.yml
@@ -0,0 +1,8 @@
+title: "SolrJ now offers a SolrRequest class allowing users to perform v2 single-node healthchecks: NodeApi.Healthcheck"
+type: added
+authors:
+ - name: Eric Pugh
+ - name: Jason Gerlowski
+links:
+ - name: SOLR-16458
+ url: https://issues.apache.org/jira/browse/SOLR-16458
diff --git
a/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java
b/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java
new file mode 100644
index 00000000000..38ce0a20c9b
--- /dev/null
+++ b/solr/api/src/java/org/apache/solr/client/api/endpoint/NodeHealthApi.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.client.api.endpoint;
+
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.Parameter;
+import jakarta.ws.rs.GET;
+import jakarta.ws.rs.Path;
+import jakarta.ws.rs.QueryParam;
+import org.apache.solr.client.api.model.NodeHealthResponse;
+
+/** V2 API definition for checking the health of a Solr node. */
+@Path("/node/health")
+public interface NodeHealthApi {
+
+ @GET
+ @Operation(
+ summary = "Determine the health of a Solr node.",
+ tags = {"node"})
+ NodeHealthResponse healthcheck(
+ @QueryParam("requireHealthyCores") Boolean requireHealthyCores,
+ @Parameter(
+ description =
+ "Maximum number of index generations a follower replica may
lag behind its"
+ + " leader before the health check reports FAILURE. Only
relevant when"
+ + " running in Standalone mode with leader/follower
replication.")
+ @QueryParam("maxGenerationLag")
+ Integer maxGenerationLag);
+}
diff --git
a/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java
b/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java
new file mode 100644
index 00000000000..a0be8723b98
--- /dev/null
+++ b/solr/api/src/java/org/apache/solr/client/api/model/NodeHealthResponse.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.client.api.model;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/** Response body for the '/api/node/health' endpoint. */
+public class NodeHealthResponse extends SolrJerseyResponse {
+
+ /** The possible health statuses for a Solr node. */
+ public enum NodeStatus {
+ OK,
+ FAILURE
+ }
+
+ @JsonProperty public NodeStatus status;
+
+ @JsonProperty public String message;
+
+ @JsonProperty("num_cores_unhealthy")
+ public Integer numCoresUnhealthy;
+}
diff --git
a/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java
b/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java
index 1ecf959e49e..1dab5d1d977 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/HealthCheckHandler.java
@@ -17,39 +17,21 @@
package org.apache.solr.handler.admin;
-import static org.apache.solr.common.params.CommonParams.FAILURE;
-import static org.apache.solr.common.params.CommonParams.OK;
-import static org.apache.solr.common.params.CommonParams.STATUS;
-import static org.apache.solr.handler.admin.api.ReplicationAPIBase.GENERATION;
-
-import java.lang.invoke.MethodHandles;
-import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collection;
import java.util.List;
-import java.util.Locale;
-import java.util.stream.Collectors;
-import org.apache.lucene.index.IndexCommit;
-import org.apache.solr.api.AnnotatedApi;
import org.apache.solr.api.Api;
+import org.apache.solr.api.JerseyResource;
+import org.apache.solr.client.api.model.NodeHealthResponse;
import org.apache.solr.client.solrj.request.HealthCheckRequest;
-import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.common.SolrException;
-import org.apache.solr.common.cloud.ClusterState;
-import org.apache.solr.common.cloud.Replica.State;
-import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CoreContainer;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.handler.IndexFetcher;
-import org.apache.solr.handler.ReplicationHandler;
import org.apache.solr.handler.RequestHandlerBase;
-import org.apache.solr.handler.admin.api.NodeHealthAPI;
+import org.apache.solr.handler.admin.api.NodeHealth;
+import org.apache.solr.handler.api.V2ApiUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.security.AuthorizationContext;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
/**
* Health Check Handler for reporting the health of a specific node.
@@ -77,12 +59,13 @@ import org.slf4j.LoggerFactory;
 * specify the acceptable generation lag follower should be with respect to its leader using the
 * <code>maxGenerationLag=&lt;max_generation_lag&gt;</code> request parameter. If <code>
 * maxGenerationLag</code> is not provided then health check would simply return OK.
+ *
+ * <p>All health-check logic lives in the v2 {@link NodeHealth}; this handler is a thin v1 bridge
+ * that extracts request parameters and delegates.
*/
public class HealthCheckHandler extends RequestHandlerBase {
- private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final String PARAM_REQUIRE_HEALTHY_CORES =
"requireHealthyCores";
- private static final List<State> UNHEALTHY_STATES =
Arrays.asList(State.DOWN, State.RECOVERING);
CoreContainer coreContainer;
@@ -100,224 +83,18 @@ public class HealthCheckHandler extends
RequestHandlerBase {
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
throws Exception {
rsp.setHttpCaching(false);
-
- // Core container should not be null and active (redundant check)
- if (coreContainer == null || coreContainer.isShutDown()) {
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVER_ERROR,
- "CoreContainer is either not initialized or shutting down"));
- return;
- }
- if (!coreContainer.isZooKeeperAware()) {
- if (log.isDebugEnabled()) {
- log.debug("Invoked HealthCheckHandler in legacy mode.");
- }
- healthCheckLegacyMode(req, rsp);
- } else {
- if (log.isDebugEnabled()) {
- log.debug(
- "Invoked HealthCheckHandler in cloud mode on [{}]",
- this.coreContainer.getZkController().getNodeName());
- }
- healthCheckCloudMode(req, rsp);
- }
- }
-
- private void healthCheckCloudMode(SolrQueryRequest req, SolrQueryResponse
rsp) {
- ZkStateReader zkStateReader =
coreContainer.getZkController().getZkStateReader();
- ClusterState clusterState = zkStateReader.getClusterState();
- // Check for isConnected and isClosed
- if (zkStateReader.getZkClient().isClosed() ||
!zkStateReader.getZkClient().isConnected()) {
- rsp.add(STATUS, FAILURE);
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVICE_UNAVAILABLE,
- "Host Unavailable: Not connected to zk"));
- return;
- }
-
- // Fail if not in live_nodes
- if
(!clusterState.getLiveNodes().contains(coreContainer.getZkController().getNodeName()))
{
- rsp.add(STATUS, FAILURE);
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVICE_UNAVAILABLE,
- "Host Unavailable: Not in live nodes as per zk"));
- return;
- }
-
- // Optionally require that all cores on this node are active if param
'requireHealthyCores=true'
- if (req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES, false)) {
- if (!coreContainer.isStatusLoadComplete()) {
- rsp.add(STATUS, FAILURE);
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVICE_UNAVAILABLE,
- "Host Unavailable: Core Loading not complete"));
- return;
- }
- Collection<CloudDescriptor> coreDescriptors =
- coreContainer.getCoreDescriptors().stream()
- .map(cd -> cd.getCloudDescriptor())
- .collect(Collectors.toList());
- long unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState);
- if (unhealthyCores > 0) {
- rsp.add(STATUS, FAILURE);
- rsp.add("num_cores_unhealthy", unhealthyCores);
- rsp.setException(
- new SolrException(
- SolrException.ErrorCode.SERVICE_UNAVAILABLE,
- unhealthyCores
- + " out of "
- + coreContainer.getNumAllCores()
- + " replicas are currently initializing or recovering"));
- return;
- }
- rsp.add("message", "All cores are healthy");
- }
-
- // All lights green, report healthy
- rsp.add(STATUS, OK);
- }
-
- private void healthCheckLegacyMode(SolrQueryRequest req, SolrQueryResponse
rsp) {
- Integer maxGenerationLag =
req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG);
- List<String> laggingCoresInfo = new ArrayList<>();
- boolean allCoresAreInSync = true;
-
- // check only if max generation lag is specified
- if (maxGenerationLag != null) {
- // if is not negative
- if (maxGenerationLag < 0) {
- log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag);
- rsp.add(
- "message",
- String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s",
maxGenerationLag));
- rsp.add(STATUS, FAILURE);
- } else {
- for (SolrCore core : coreContainer.getCores()) {
- ReplicationHandler replicationHandler =
- (ReplicationHandler)
core.getRequestHandler(ReplicationHandler.PATH);
- if (replicationHandler.isFollower()) {
- boolean isCoreInSync =
- isWithinGenerationLag(core, replicationHandler,
maxGenerationLag, laggingCoresInfo);
-
- allCoresAreInSync &= isCoreInSync;
- }
- }
- }
- if (allCoresAreInSync) {
- rsp.add(
- "message",
- String.format(
- Locale.ROOT,
- "All the followers are in sync with leader (within
maxGenerationLag: %d) "
- + "or the cores are acting as leader",
- maxGenerationLag));
- rsp.add(STATUS, OK);
- } else {
- rsp.add(
- "message",
- String.format(
- Locale.ROOT,
- "Cores violating maxGenerationLag:%d.%n%s",
- maxGenerationLag,
- String.join(",\n", laggingCoresInfo)));
- rsp.add(STATUS, FAILURE);
- }
- } else { // if maxGeneration lag is not specified (is null) we aren't
checking for lag
- rsp.add(
- "message",
- "maxGenerationLag isn't specified. Followers aren't "
- + "checking for the generation lag from the leaders");
- rsp.add(STATUS, OK);
- }
- }
-
- private boolean isWithinGenerationLag(
- final SolrCore core,
- ReplicationHandler replicationHandler,
- int maxGenerationLag,
- List<String> laggingCoresInfo) {
- IndexFetcher indexFetcher = null;
+ final Boolean requireHealthyCores =
req.getParams().getBool(PARAM_REQUIRE_HEALTHY_CORES);
+ final Integer maxGenerationLag =
+ req.getParams().getInt(HealthCheckRequest.PARAM_MAX_GENERATION_LAG);
try {
- // may not be the best way to get leader's replicableCommit
- NamedList<?> follower = (NamedList<?>)
replicationHandler.getInitArgs().get("follower");
-
- indexFetcher = new IndexFetcher(follower, replicationHandler, core);
-
- NamedList<?> replicableCommitOnLeader = indexFetcher.getLatestVersion();
- long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION);
-
- // Get our own commit and generation from the commit
- IndexCommit commit = core.getDeletionPolicy().getLatestCommit();
- if (commit != null) {
- long followerGeneration = commit.getGeneration();
- long generationDiff = leaderGeneration - followerGeneration;
-
- // generationDiff shouldn't be negative except for some edge cases,
log it. Some scenarios
- // are
- // 1) commit generation rolls over Long.MAX_VALUE (really unlikely)
- // 2) Leader's index is wiped clean and the follower is still showing
commit generation
- // from the old index
- if (generationDiff < 0) {
- log.warn("core:[{}], generation lag:[{}] is negative.");
- } else if (generationDiff < maxGenerationLag) {
- log.info(
- "core:[{}] generation lag is above acceptable threshold:[{}], "
- + "generation lag:[{}], leader generation:[{}], follower
generation:[{}]",
- core,
- maxGenerationLag,
- generationDiff,
- leaderGeneration,
- followerGeneration);
-
- laggingCoresInfo.add(
- String.format(
- Locale.ROOT,
- "Core %s is lagging by %d generations",
- core.getName(),
- generationDiff));
- return true;
- }
- }
- } catch (Exception e) {
- log.error("Failed to check if the follower is in sync with the leader",
e);
- } finally {
- if (indexFetcher != null) {
- indexFetcher.destroy();
- }
+ V2ApiUtils.squashIntoSolrResponseWithoutHeader(
+ rsp, new NodeHealth(coreContainer).healthcheck(requireHealthyCores,
maxGenerationLag));
+ } catch (SolrException e) {
+ final NodeHealthResponse failureResponse = new NodeHealthResponse();
+ failureResponse.status = NodeHealthResponse.NodeStatus.FAILURE;
+ V2ApiUtils.squashIntoSolrResponseWithoutHeader(rsp, failureResponse);
+ rsp.setException(e);
}
- return false;
- }
-
- /**
- * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not
exist on local node.
- * We first find local cores which are either not registered or unhealthy,
and check each of these
- * against the clusterstate, and return a count of unhealthy replicas
- *
- * @param cores list of core cloud descriptors to iterate
- * @param clusterState clusterstate from ZK
- * @return number of unhealthy cores, either in DOWN or RECOVERING state
- */
- static long findUnhealthyCores(Collection<CloudDescriptor> cores,
ClusterState clusterState) {
- return cores.stream()
- .filter(
- c ->
- !c.hasRegistered()
- || UNHEALTHY_STATES.contains(c.getLastPublished())) //
Find candidates locally
- .filter(
- c ->
- clusterState.hasCollection(
- c.getCollectionName())) // Only care about cores for
actual collections
- .filter(
- c ->
- clusterState
- .getCollection(c.getCollectionName())
- .getActiveSlicesMap()
- .containsKey(c.getShardId()))
- .count();
}
@Override
@@ -337,7 +114,12 @@ public class HealthCheckHandler extends RequestHandlerBase
{
@Override
public Collection<Api> getApis() {
- return AnnotatedApi.getApis(new NodeHealthAPI(this));
+ return List.of();
+ }
+
+ @Override
+ public Collection<Class<? extends JerseyResource>> getJerseyResources() {
+ return List.of(NodeHealth.class);
}
@Override
diff --git
a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java
b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java
new file mode 100644
index 00000000000..de207f334d1
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealth.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.admin.api;
+
+import static
org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.FAILURE;
+import static
org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK;
+import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
+import static
org.apache.solr.common.SolrException.ErrorCode.SERVICE_UNAVAILABLE;
+import static org.apache.solr.handler.admin.api.ReplicationAPIBase.GENERATION;
+import static org.apache.solr.security.PermissionNameProvider.Name.HEALTH_PERM;
+
+import jakarta.inject.Inject;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Locale;
+import java.util.stream.Collectors;
+import org.apache.lucene.index.IndexCommit;
+import org.apache.solr.api.JerseyResource;
+import org.apache.solr.client.api.endpoint.NodeHealthApi;
+import org.apache.solr.client.api.model.NodeHealthResponse;
+import org.apache.solr.cloud.CloudDescriptor;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.Replica.State;
+import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.CoreDescriptor;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.IndexFetcher;
+import org.apache.solr.handler.ReplicationHandler;
+import org.apache.solr.jersey.PermissionName;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * V2 API for checking the health of the receiving node.
+ *
+ * <p>This API (GET /v2/node/health) is analogous to the v1 /admin/info/health.
+ *
+ * <p>The v1 {@link org.apache.solr.handler.admin.HealthCheckHandler} delegates to this class.
+ */
+public class NodeHealth extends JerseyResource implements NodeHealthApi {
+
+ private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ private static final List<State> UNHEALTHY_STATES =
Arrays.asList(State.DOWN, State.RECOVERING);
+
+ private final CoreContainer coreContainer;
+
+ @Inject
+ public NodeHealth(CoreContainer coreContainer) {
+ this.coreContainer = coreContainer;
+ }
+
+ @Override
+ @PermissionName(HEALTH_PERM)
+ public NodeHealthResponse healthcheck(Boolean requireHealthyCores, Integer
maxGenerationLag) {
+ if (coreContainer == null || coreContainer.isShutDown()) {
+ throw new SolrException(
+ SERVER_ERROR, "CoreContainer is either not initialized or shutting
down");
+ }
+
+ final NodeHealthResponse response =
instantiateJerseyResponse(NodeHealthResponse.class);
+
+ if (!coreContainer.isZooKeeperAware()) {
+ if (log.isDebugEnabled()) {
+ log.debug("Invoked HealthCheckHandler in legacy mode.");
+ }
+ healthCheckStandaloneMode(response, maxGenerationLag);
+ } else {
+ if (log.isDebugEnabled()) {
+ log.debug(
+ "Invoked HealthCheckHandler in cloud mode on [{}]",
+ coreContainer.getZkController().getNodeName());
+ }
+ healthCheckCloudMode(response, requireHealthyCores);
+ }
+
+ return response;
+ }
+
+ private void healthCheckCloudMode(NodeHealthResponse response, Boolean
requireHealthyCores) {
+ ClusterState clusterState = getClusterState();
+
+ if (Boolean.TRUE.equals(requireHealthyCores)) {
+ if (!coreContainer.isStatusLoadComplete()) {
+ throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Core
Loading not complete");
+ }
+ Collection<CloudDescriptor> coreDescriptors =
+ coreContainer.getCoreDescriptors().stream()
+ .map(CoreDescriptor::getCloudDescriptor)
+ .collect(Collectors.toList());
+ int unhealthyCores = findUnhealthyCores(coreDescriptors, clusterState);
+ if (unhealthyCores > 0) {
+ response.numCoresUnhealthy = unhealthyCores;
+ throw new SolrException(
+ SERVICE_UNAVAILABLE,
+ unhealthyCores
+ + " out of "
+ + coreContainer.getNumAllCores()
+ + " replicas are currently initializing or recovering");
+ }
+ response.message = "All cores are healthy";
+ }
+
+ response.status = OK;
+ }
+
+ private ClusterState getClusterState() {
+ ZkStateReader zkStateReader =
coreContainer.getZkController().getZkStateReader();
+ ClusterState clusterState = zkStateReader.getClusterState();
+
+ if (zkStateReader.getZkClient().isClosed() ||
!zkStateReader.getZkClient().isConnected()) {
+ throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Not
connected to zk");
+ }
+
+ if
(!clusterState.getLiveNodes().contains(coreContainer.getZkController().getNodeName()))
{
+ throw new SolrException(SERVICE_UNAVAILABLE, "Host Unavailable: Not in
live nodes as per zk");
+ }
+ return clusterState;
+ }
+
+ private void healthCheckStandaloneMode(NodeHealthResponse response, Integer
maxGenerationLag) {
+ List<String> laggingCoresInfo = new ArrayList<>();
+ boolean allCoresAreInSync = true;
+
+ if (maxGenerationLag != null) {
+ if (maxGenerationLag < 0) {
+ log.error("Invalid value for maxGenerationLag:[{}]", maxGenerationLag);
+ response.message =
+ String.format(Locale.ROOT, "Invalid value of maxGenerationLag:%s",
maxGenerationLag);
+ response.status = FAILURE;
+ return;
+ }
+
+ for (SolrCore core : coreContainer.getCores()) {
+ ReplicationHandler replicationHandler =
+ (ReplicationHandler)
core.getRequestHandler(ReplicationHandler.PATH);
+ if (replicationHandler.isFollower()) {
+ boolean isCoreInSync =
+ isWithinGenerationLag(core, replicationHandler,
maxGenerationLag, laggingCoresInfo);
+ allCoresAreInSync &= isCoreInSync;
+ }
+ }
+
+ if (allCoresAreInSync) {
+ response.message =
+ String.format(
+ Locale.ROOT,
+ "All the followers are in sync with leader (within
maxGenerationLag: %d) "
+ + "or the cores are acting as leader",
+ maxGenerationLag);
+ response.status = OK;
+ } else {
+ response.message =
+ String.format(
+ Locale.ROOT,
+ "Cores violating maxGenerationLag:%d.%n%s",
+ maxGenerationLag,
+ String.join(",\n", laggingCoresInfo));
+ response.status = FAILURE;
+ }
+ } else {
+ response.message =
+ "maxGenerationLag isn't specified. Followers aren't "
+ + "checking for the generation lag from the leaders";
+ response.status = OK;
+ }
+ }
+
+ private boolean isWithinGenerationLag(
+ final SolrCore core,
+ ReplicationHandler replicationHandler,
+ int maxGenerationLag,
+ List<String> laggingCoresInfo) {
+ IndexFetcher indexFetcher = null;
+ try {
+ // may not be the best way to get leader's replicableCommit; NamedList is unavoidable here
+ // as it is the init-args format used by ReplicationHandler
+ NamedList<?> follower = (NamedList<?>)
replicationHandler.getInitArgs().get("follower");
+ indexFetcher = new IndexFetcher(follower, replicationHandler, core);
+ // getLatestVersion() returns a NamedList from the IndexFetcher network API
+ NamedList<?> replicableCommitOnLeader = indexFetcher.getLatestVersion();
+ long leaderGeneration = (Long) replicableCommitOnLeader.get(GENERATION);
+
+ // Get our own commit and generation from the commit
+ IndexCommit commit = core.getDeletionPolicy().getLatestCommit();
+ if (commit != null) {
+ long followerGeneration = commit.getGeneration();
+ long generationDiff = leaderGeneration - followerGeneration;
+
+ // generationDiff shouldn't be negative except for some edge cases, log it. Some scenarios
+ // are:
+ // 1) commit generation rolls over Long.MAX_VALUE (really unlikely)
+ // 2) Leader's index is wiped clean and the follower is still showing commit generation
+ // from the old index
+ if (generationDiff < 0) {
+ log.warn("core:[{}], generation lag:[{}] is negative.", core,
generationDiff);
+ return false;
+ } else if (generationDiff > maxGenerationLag) {
+ log.info(
+ "core:[{}] generation lag is above acceptable threshold:[{}], "
+ + "generation lag:[{}], leader generation:[{}], follower
generation:[{}]",
+ core,
+ maxGenerationLag,
+ generationDiff,
+ leaderGeneration,
+ followerGeneration);
+ laggingCoresInfo.add(
+ String.format(
+ Locale.ROOT,
+ "Core %s is lagging by %d generations",
+ core.getName(),
+ generationDiff));
+ return false;
+ }
+ }
+ } catch (Exception e) {
+ log.error("Failed to check if the follower is in sync with the leader",
e);
+ return false;
+ } finally {
+ if (indexFetcher != null) {
+ indexFetcher.destroy();
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Find replicas DOWN or RECOVERING, or replicas in clusterstate that do not exist on local node.
+ * We first find local cores which are either not registered or unhealthy, and check each of these
+ * against the clusterstate, and return a count of unhealthy replicas.
+ *
+ * @param cores list of core cloud descriptors to iterate
+ * @param clusterState clusterstate from ZK
+ * @return number of unhealthy cores, either in DOWN or RECOVERING state
+ */
+ public static int findUnhealthyCores(
+ Collection<CloudDescriptor> cores, ClusterState clusterState) {
+ return Math.toIntExact(
+ cores.stream()
+ .filter(
+ c ->
+ !c.hasRegistered()
+ || UNHEALTHY_STATES.contains(
+ c.getLastPublished())) // Find candidates locally
+ .filter(
+ c ->
+ clusterState.hasCollection(
+ c.getCollectionName())) // Only care about cores for
actual collections
+ .filter(
+ c ->
+ clusterState
+ .getCollection(c.getCollectionName())
+ .getActiveSlicesMap()
+ .containsKey(c.getShardId()))
+ .count());
+ }
+}
diff --git
a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java
b/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java
deleted file mode 100644
index df5f64900f0..00000000000
--- a/solr/core/src/java/org/apache/solr/handler/admin/api/NodeHealthAPI.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.solr.handler.admin.api;
-
-import static org.apache.solr.client.solrj.SolrRequest.METHOD.GET;
-import static org.apache.solr.security.PermissionNameProvider.Name.HEALTH_PERM;
-
-import org.apache.solr.api.EndPoint;
-import org.apache.solr.handler.admin.HealthCheckHandler;
-import org.apache.solr.request.SolrQueryRequest;
-import org.apache.solr.response.SolrQueryResponse;
-
-/**
- * V2 API for checking the health of the receiving node.
- *
- * <p>This API (GET /v2/node/health) is analogous to the v1 /admin/info/health.
- */
-public class NodeHealthAPI {
- private final HealthCheckHandler handler;
-
- public NodeHealthAPI(HealthCheckHandler handler) {
- this.handler = handler;
- }
-
- // TODO Update permission here once SOLR-11623 lands.
- @EndPoint(
- path = {"/node/health"},
- method = GET,
- permission = HEALTH_PERM)
- public void getSystemInformation(SolrQueryRequest req, SolrQueryResponse
rsp) throws Exception {
- handler.handleRequestBody(req, rsp);
- }
-}
diff --git
a/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java
b/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java
index 43838707d05..79036e5c16e 100644
---
a/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java
+++
b/solr/core/src/test/org/apache/solr/handler/admin/HealthCheckHandlerTest.java
@@ -18,11 +18,17 @@
package org.apache.solr.handler.admin;
import static
org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH;
+import static org.hamcrest.Matchers.containsString;
import java.io.IOException;
+import java.net.URI;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
import java.util.Arrays;
import java.util.Collection;
import java.util.Properties;
+import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.RemoteSolrException;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
@@ -30,10 +36,8 @@ import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.GenericSolrRequest;
import org.apache.solr.client.solrj.request.HealthCheckRequest;
-import org.apache.solr.client.solrj.request.V2Request;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.HealthCheckResponse;
-import org.apache.solr.client.solrj.response.V2Response;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.cloud.ClusterStateMockUtil;
import org.apache.solr.cloud.SolrCloudTestCase;
@@ -44,6 +48,7 @@ import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.embedded.JettySolrRunner;
+import org.apache.solr.handler.admin.api.NodeHealth;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -104,12 +109,8 @@ public class HealthCheckHandlerTest extends
SolrCloudTestCase {
// negative check of our (new) "broken" node that we deliberately put
into an unhealthy state
RemoteSolrException e =
- expectThrows(
- RemoteSolrException.class,
- () -> {
- runHealthcheckWithClient(solrClient);
- });
- assertTrue(e.getMessage(), e.getMessage().contains("Host Unavailable"));
+ expectThrows(RemoteSolrException.class, () ->
runHealthcheckWithClient(solrClient));
+ assertThat(e.getMessage(), containsString("Host Unavailable"));
assertEquals(SolrException.ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
} finally {
newJetty.stop();
@@ -135,37 +136,56 @@ public class HealthCheckHandlerTest extends
SolrCloudTestCase {
}
}
+ /**
+ * Verifies that the v1 health-check response body contains {@code
"status":"FAILURE"} when the
+ * node is absent from ZooKeeper's live-nodes set.
+ *
+ * <p>This is a regression test for the refactoring that delegated
health-check logic to {@link
+ * NodeHealth}: after that change, {@link SolrException} thrown by {@link
NodeHealth} would escape
+ * {@link HealthCheckHandler#handleRequestBody} before the {@code status}
field was written to the
+ * response, leaving callers without a machine-readable failure indicator in
the body.
+ *
+ * <p>The node's ZK session is kept alive so that only the live-nodes check
fires, not the "not
+ * connected to ZK" check, isolating the specific code path under test.
+ */
@Test
- public void testHealthCheckV2Api() throws Exception {
- V2Response res = new
V2Request.Builder("/node/health").build().process(cluster.getSolrClient());
- assertEquals(0, res.getStatus());
- assertEquals(CommonParams.OK, res.getResponse().get(CommonParams.STATUS));
-
- // add a new node for the purpose of negative testing
+ public void testV1FailureResponseIncludesStatusField() throws Exception {
JettySolrRunner newJetty = cluster.startJettySolrRunner();
try (SolrClient solrClient =
getHttpSolrClient(newJetty.getBaseUrl().toString())) {
+ // Sanity check: the new node is initially healthy.
+ assertEquals(CommonParams.OK,
runHealthcheckWithClient(solrClient).getNodeStatus());
- // positive check that our (new) "healthy" node works with direct http
client
- assertEquals(
- CommonParams.OK,
- new V2Request.Builder("/node/health")
- .build()
- .process(solrClient)
- .getResponse()
- .get(CommonParams.STATUS));
-
- // now "break" our (new) node
- newJetty.getCoreContainer().getZkController().getZkClient().close();
-
- // negative check of our (new) "broken" node that we deliberately put
into an unhealthy state
- RemoteSolrException e =
- expectThrows(
- RemoteSolrException.class,
- () -> {
- new
V2Request.Builder("/node/health").build().process(solrClient);
- });
- assertTrue(e.getMessage(), e.getMessage().contains("Host Unavailable"));
- assertEquals(SolrException.ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
+ String nodeName =
newJetty.getCoreContainer().getZkController().getNodeName();
+
+ // Remove the node from ZooKeeper's live_nodes without closing the ZK
session.
+ // This ensures the "ZK not connected" check passes and only the "not in
live nodes"
+ // check fires, exercising the specific failure branch we fixed.
+ newJetty.getCoreContainer().getZkController().removeEphemeralLiveNode();
+
+ // Wait for the node's own ZkStateReader to reflect the removal before
querying.
+ newJetty
+ .getCoreContainer()
+ .getZkController()
+ .getZkStateReader()
+ .waitForLiveNodes(10, TimeUnit.SECONDS, missingLiveNode(nodeName));
+
+ // Use a raw HTTP request so we can inspect the full response body.
+ // SolrJ's HealthCheckRequest throws RemoteSolrException on non-200
responses and does
+ // not expose the response body, so we go below SolrJ here.
+ try (HttpClient httpClient = HttpClient.newHttpClient()) {
+ HttpResponse<String> response =
+ httpClient.send(
+ HttpRequest.newBuilder()
+ .uri(URI.create(newJetty.getBaseUrl() +
HEALTH_CHECK_HANDLER_PATH))
+ .build(),
+ HttpResponse.BodyHandlers.ofString());
+
+ assertEquals("Expected 503 SERVICE_UNAVAILABLE", 503,
response.statusCode());
+ assertThat(
+ "v1 error response body must contain status=FAILURE so
body-inspecting clients get a clear signal",
+ response.body(),
+ containsString("FAILURE"));
+ }
} finally {
newJetty.stop();
}
@@ -193,7 +213,7 @@ public class HealthCheckHandlerTest extends
SolrCloudTestCase {
mockCD("invalid", "invalid", "slice1", false,
Replica.State.RECOVERING),
// A core for a slice that is not an active slice will not fail
the check
mockCD("collection1", "invalid_replica1", "invalid", true,
Replica.State.DOWN));
- long unhealthy1 = HealthCheckHandler.findUnhealthyCores(node1Cores,
clusterState);
+ long unhealthy1 = NodeHealth.findUnhealthyCores(node1Cores,
clusterState);
assertEquals(2, unhealthy1);
// Node 2
@@ -203,7 +223,7 @@ public class HealthCheckHandlerTest extends
SolrCloudTestCase {
mockCD("collection1", "slice1_replica4", "slice1", true,
Replica.State.DOWN),
mockCD(
"collection2", "slice1_replica1", "slice1", true,
Replica.State.RECOVERY_FAILED));
- long unhealthy2 = HealthCheckHandler.findUnhealthyCores(node2Cores,
clusterState);
+ long unhealthy2 = NodeHealth.findUnhealthyCores(node2Cores,
clusterState);
assertEquals(1, unhealthy2);
}
}
diff --git
a/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthSolrCloudTest.java
b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthSolrCloudTest.java
new file mode 100644
index 00000000000..61ab10b4acd
--- /dev/null
+++
b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthSolrCloudTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.admin.api;
+
+import static
org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK;
+import static org.hamcrest.Matchers.containsString;
+
+import java.util.concurrent.TimeUnit;
+import org.apache.solr.client.solrj.SolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.NodeApi;
+import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.embedded.JettySolrRunner;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Tests for the node-health API, on SolrCloud clusters
+ *
+ * @see NodeHealthStandaloneTest
+ */
+public class NodeHealthSolrCloudTest extends SolrCloudTestCase {
+
+ @BeforeClass
+ public static void setupCluster() throws Exception {
+ configureCluster(1).addConfig("conf",
configset("cloud-minimal")).configure();
+
+ CollectionAdminRequest.createCollection(DEFAULT_TEST_COLLECTION_NAME,
"conf", 1, 1)
+ .process(cluster.getSolrClient());
+ }
+
+ @Test
+ public void testHealthyNodeReturnsOkStatus() throws Exception {
+ final var request = new NodeApi.Healthcheck();
+ final var response = request.process(cluster.getSolrClient());
+
+ assertNotNull(response);
+ assertEquals(OK, response.status);
+ assertNull("Expected no error on a healthy node", response.error);
+ }
+
+ @Test
+ public void testRequireHealthyCoresReturnOkWhenAllCoresHealthy() throws
Exception {
+ final var request = new NodeApi.Healthcheck();
+ request.setRequireHealthyCores(true);
+ final var response = request.process(cluster.getSolrClient());
+
+ assertNotNull(response);
+ assertEquals(OK, response.status);
+ assertEquals("All cores are healthy", response.message);
+ }
+
+ @Test
+ public void testCloudMode_UnhealthyWhenZkClientClosed() throws Exception {
+ // Use a fresh node so closing its ZK client does not break the primary
cluster node
+ JettySolrRunner newJetty = cluster.startJettySolrRunner();
+ cluster.waitForNode(newJetty, 30);
+ try (SolrClient nodeClient = newJetty.newClient()) {
+ // Sanity check: the new node should start out healthy
+ assertEquals(OK, new NodeApi.Healthcheck().process(nodeClient).status);
+
+ // Break the ZK connection to put the node into an unhealthy state
+ newJetty.getCoreContainer().getZkController().getZkClient().close();
+
+ SolrException e =
+ assertThrows(SolrException.class, () -> new
NodeApi.Healthcheck().process(nodeClient));
+ assertEquals(ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
+ assertThat(e.getMessage(), containsString(("Host Unavailable")));
+ } finally {
+ newJetty.stop();
+ }
+ }
+
+ /**
+ * Verifies that when the node's name is absent from ZooKeeper's live-nodes
set (while the ZK
+ * session itself is still connected), the v2 health-check API throws a
{@code
+ * SERVICE_UNAVAILABLE} exception with a message identifying the live-nodes
check as the cause.
+ *
+ * <p>This specifically exercises the code path at
NodeHealth#getClusterState() that checks {@code
+ * clusterState.getLiveNodes().contains(nodeName)}.
+ */
+ @Test
+ public void testNotInLiveNodes_ThrowsServiceUnavailable() throws Exception {
+ JettySolrRunner newJetty = cluster.startJettySolrRunner();
+ cluster.waitForNode(newJetty, 30);
+ try (SolrClient nodeClient = newJetty.newClient()) {
+ // Sanity check: the new node should start out healthy
+ assertEquals(OK, new NodeApi.Healthcheck().process(nodeClient).status);
+
+ String nodeName =
newJetty.getCoreContainer().getZkController().getNodeName();
+
+ // Remove the node from ZooKeeper's live_nodes without closing the ZK
session.
+ // This ensures the "ZK not connected" check passes and only the "not in
live nodes"
+ // check fires, isolating the code path under test.
+ newJetty.getCoreContainer().getZkController().removeEphemeralLiveNode();
+
+ // Wait for the node's own ZkStateReader to reflect the removal before
querying it.
+ newJetty
+ .getCoreContainer()
+ .getZkController()
+ .getZkStateReader()
+ .waitForLiveNodes(10, TimeUnit.SECONDS, missingLiveNode(nodeName));
+
+ SolrException e =
+ assertThrows(SolrException.class, () -> new
NodeApi.Healthcheck().process(nodeClient));
+ assertEquals(ErrorCode.SERVICE_UNAVAILABLE.code, e.code());
+ assertThat(e.getMessage(), containsString("Not in live nodes"));
+ } finally {
+ newJetty.stop();
+ }
+ }
+}
diff --git
a/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthStandaloneTest.java
b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthStandaloneTest.java
new file mode 100644
index 00000000000..0e3c2765038
--- /dev/null
+++
b/solr/core/src/test/org/apache/solr/handler/admin/api/NodeHealthStandaloneTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.handler.admin.api;
+
+import static
org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.FAILURE;
+import static
org.apache.solr.client.api.model.NodeHealthResponse.NodeStatus.OK;
+import static org.hamcrest.Matchers.containsString;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.client.solrj.request.NodeApi;
+import org.apache.solr.util.SolrJettyTestRule;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.Test;
+
+/**
+ * Tests for the node-health API, on Standalone Solr
+ *
+ * @see NodeHealthSolrCloudTest
+ */
+public class NodeHealthStandaloneTest extends SolrTestCaseJ4 {
+
+ @ClassRule public static SolrJettyTestRule solrTestRule = new
SolrJettyTestRule();
+
+ @BeforeClass
+ public static void setupCluster() throws Exception {
+ solrTestRule.startSolr(createTempDir());
+ }
+
+ @Test
+ public void testWithoutMaxGenerationLagReturnsOk() throws Exception {
+
+ final var request = new NodeApi.Healthcheck();
+ final var response = request.process(solrTestRule.getAdminClient());
+
+ assertNotNull(response);
+ assertEquals(OK, response.status);
+ assertThat(response.message, containsString("maxGenerationLag isn't
specified"));
+ }
+
+ @Test
+ public void testWithNegativeMaxGenerationLagReturnsFailure() throws
Exception {
+ final var request = new NodeApi.Healthcheck();
+ request.setMaxGenerationLag(-1);
+ final var response = request.process(solrTestRule.getAdminClient());
+
+ assertNotNull(response);
+ assertEquals(FAILURE, response.status);
+ assertThat(response.message, containsString("Invalid value of
maxGenerationLag"));
+ }
+}
diff --git
a/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java
b/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java
index 18a09fc7568..6b3c63de45b 100644
---
a/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java
+++
b/solr/core/src/test/org/apache/solr/handler/admin/api/V2NodeAPIMappingTest.java
@@ -34,7 +34,6 @@ import org.apache.solr.common.util.CommandOperation;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.handler.admin.CoreAdminHandler;
-import org.apache.solr.handler.admin.HealthCheckHandler;
import org.apache.solr.handler.admin.InfoHandler;
import org.apache.solr.handler.admin.LoggingHandler;
import org.apache.solr.handler.admin.PropertiesRequestHandler;
@@ -55,7 +54,6 @@ public class V2NodeAPIMappingTest extends SolrTestCaseJ4 {
private InfoHandler infoHandler;
private LoggingHandler mockLoggingHandler;
private PropertiesRequestHandler mockPropertiesHandler;
- private HealthCheckHandler mockHealthCheckHandler;
private ThreadDumpHandler mockThreadDumpHandler;
@BeforeClass
@@ -69,13 +67,11 @@ public class V2NodeAPIMappingTest extends SolrTestCaseJ4 {
infoHandler = mock(InfoHandler.class);
mockLoggingHandler = mock(LoggingHandler.class);
mockPropertiesHandler = mock(PropertiesRequestHandler.class);
- mockHealthCheckHandler = mock(HealthCheckHandler.class);
mockThreadDumpHandler = mock(ThreadDumpHandler.class);
queryRequestCaptor = ArgumentCaptor.forClass(SolrQueryRequest.class);
when(infoHandler.getLoggingHandler()).thenReturn(mockLoggingHandler);
when(infoHandler.getPropertiesHandler()).thenReturn(mockPropertiesHandler);
-
when(infoHandler.getHealthCheckHandler()).thenReturn(mockHealthCheckHandler);
when(infoHandler.getThreadDumpHandler()).thenReturn(mockThreadDumpHandler);
apiBag = new ApiBag(false);
@@ -141,19 +137,6 @@ public class V2NodeAPIMappingTest extends SolrTestCaseJ4 {
assertEquals("anyParamValue", v1Params.get("anyParamName"));
}
- @Test
- public void testHealthCheckApiAllProperties() throws Exception {
- final ModifiableSolrParams solrParams = new ModifiableSolrParams();
- solrParams.add("requireHealthyCores", "true");
- solrParams.add("maxGenerationLag", "123");
- final SolrParams v1Params =
- captureConvertedHealthCheckV1Params("/node/health", "GET", solrParams);
-
- // All parameters are passed through to v1 API as-is.
- assertEquals(true, v1Params.getBool("requireHealthyCores"));
- assertEquals(123, v1Params.getPrimitiveInt("maxGenerationLag"));
- }
-
private SolrParams captureConvertedCoreV1Params(String path, String method,
String v2RequestBody)
throws Exception {
return doCaptureParams(
@@ -165,11 +148,6 @@ public class V2NodeAPIMappingTest extends SolrTestCaseJ4 {
return doCaptureParams(path, method, inputParams, null,
mockPropertiesHandler);
}
- private SolrParams captureConvertedHealthCheckV1Params(
- String path, String method, SolrParams inputParams) throws Exception {
- return doCaptureParams(path, method, inputParams, null,
mockHealthCheckHandler);
- }
-
private SolrParams captureConvertedThreadDumpV1Params(
String path, String method, SolrParams inputParams) throws Exception {
return doCaptureParams(path, method, inputParams, null,
mockThreadDumpHandler);
@@ -212,6 +190,5 @@ public class V2NodeAPIMappingTest extends SolrTestCaseJ4 {
apiBag.registerObject(new RejoinLeaderElectionAPI(coreHandler));
apiBag.registerObject(new
NodePropertiesAPI(infoHandler.getPropertiesHandler()));
apiBag.registerObject(new
NodeThreadsAPI(infoHandler.getThreadDumpHandler()));
- apiBag.registerObject(new
NodeHealthAPI(infoHandler.getHealthCheckHandler()));
}
}
diff --git
a/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc
b/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc
index 16b2f691281..4380337752c 100644
---
a/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc
+++
b/solr/solr-ref-guide/modules/configuration-guide/pages/implicit-requesthandlers.adoc
@@ -40,18 +40,24 @@ This handler must have a collection name in the path to the
endpoint.
|`solr/<collection>/admin/file`
|{solr-javadocs}/core/org/apache/solr/handler/admin/ShowFileRequestHandler.html[ShowFileRequestHandler]
|`_ADMIN_FILE`
|===
-Health:: Report the health of the node (_available only in SolrCloud mode_)
+Health:: Report the health of the node.
+
[cols="3*.",frame=none,grid=cols,options="header"]
|===
|API Endpoints |Class & Javadocs |Paramset
|v1: `solr/admin/info/health`
-v2: `api/node/health`
|{solr-javadocs}/core/org/apache/solr/handler/admin/HealthCheckHandler.html[HealthCheckHandler]
|
+v2: `api/node/health` |v1:
{solr-javadocs}/core/org/apache/solr/handler/admin/HealthCheckHandler.html[HealthCheckHandler]
+
+v2:
{solr-javadocs}/core/org/apache/solr/handler/admin/api/NodeHealth.html[NodeHealth]
|
|===
+
-This endpoint also accepts additional request parameters.
-Please see
{solr-javadocs}/core/org/apache/solr/handler/admin/HealthCheckHandler.html[Javadocs]
for details.
+In SolrCloud mode the handler checks that the node is connected to ZooKeeper
and is listed in live nodes.
+The optional `requireHealthyCores=true` parameter additionally requires that
all local replicas be in an active state, which is useful for rolling-restart
probes.
++
+In user-managed (leader-follower) mode the handler can check replication lag.
+The optional `maxGenerationLag=<n>` parameter specifies the maximum number of
Lucene commit generations by which a follower is allowed to trail its leader;
the endpoint returns HTTP 503 if any core exceeds this threshold.
+See
xref:deployment-guide:user-managed-index-replication.adoc#monitoring-follower-replication-lag[Monitoring
Follower Replication Lag] for details and examples.
Logging:: Retrieve and modify registered loggers.
+
diff --git
a/solr/solr-ref-guide/modules/deployment-guide/pages/user-managed-index-replication.adoc
b/solr/solr-ref-guide/modules/deployment-guide/pages/user-managed-index-replication.adoc
index ea3f0f37674..ff3e4421fbb 100644
---
a/solr/solr-ref-guide/modules/deployment-guide/pages/user-managed-index-replication.adoc
+++
b/solr/solr-ref-guide/modules/deployment-guide/pages/user-managed-index-replication.adoc
@@ -575,6 +575,63 @@ A snapshot with the name `snapshot._name_` must exist or
an error will be return
`location`::: The location where the snapshot is created.
+[[monitoring-follower-replication-lag]]
+== Monitoring Follower Replication Lag
+
+In a leader-follower deployment it is important to know whether followers are
keeping pace with the leader.
+Solr's health-check endpoint supports a `maxGenerationLag` request parameter
that lets you assert that each follower core is within a specified number of
Lucene commit generations of its leader.
+When a follower lags by more than the allowed number of generations, the
endpoint returns HTTP 503 (Service Unavailable), making it straightforward to
integrate into load-balancer health probes or monitoring systems.
+
+The `maxGenerationLag` parameter is an integer representing the maximum
acceptable number of commit generations by which a follower is allowed to trail
its leader.
+A value of `0` requires the follower to be fully up to date.
+If the parameter is omitted, the health check returns `OK` regardless of
replication lag.
+
+[WARNING]
+====
+Because a follower's generation can only increase when a replication from the
leader actually completes, `maxGenerationLag=0` may return `FAILURE`
immediately after a follower starts or after a period of network instability
even though the follower will catch up on the next poll cycle.
+Use a small positive value (for example `2`) for production monitoring unless
you require strict freshness guarantees.
+====
+
+Use the health endpoint as follows:
+
+====
+[.tab-label]*V1 API*
+
+[source,bash]
+----
+http://_follower_host:port_/solr/admin/info/health?maxGenerationLag=<_max_lag_>
+----
+====
+
+====
+[.tab-label]*V2 API*
+
+[source,bash]
+----
+http://_follower_host:port_/api/node/health?maxGenerationLag=<_max_lag_>
+----
+====
+
+A healthy response looks like:
+
+[source,json]
+----
+{
+ "status": "OK",
+ "message": "All the followers are in sync with leader (within
maxGenerationLag: 2) or the cores are acting as leader"
+}
+----
+
+When a follower is lagging too far behind, the response is HTTP 503 and the
body identifies the lagging cores:
+
+[source,json]
+----
+{
+ "status": "FAILURE",
+ "message": "Cores violating maxGenerationLag:2.\nCore collection1 is lagging
by 5 generations"
+}
+----
+
== Optimizing Distributed Indexes
Optimizing an index is not something most users should generally worry about -
but in particular users should be aware of the impacts of optimizing an index
when using the `ReplicationHandler`.
diff --git a/solr/solrj/src/java/org/apache/solr/common/util/Utils.java
b/solr/solrj/src/java/org/apache/solr/common/util/Utils.java
index 164ae8ae7b0..86c96944ace 100644
--- a/solr/solrj/src/java/org/apache/solr/common/util/Utils.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/Utils.java
@@ -845,6 +845,12 @@ public class Utils {
* @return a serializable version of the object
*/
public static Object getReflectWriter(Object o) {
+ // Enums are serialized as their declared name so that javabin/NamedList
consumers
+ // (e.g. HealthCheckHandlerTest comparing against CommonParams.OK == "OK")
see
+ // a plain string rather than "pkg.EnumClass:NAME".
+ if (o instanceof Enum<?> e) {
+ return e.name();
+ }
List<FieldWriter> fieldWriters = null;
try {
fieldWriters =