This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch branch_9_8
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9_8 by this push:
new 0cad3f7ebf8 SOLR-17306: fix replication problem on follower restart (#2918)
0cad3f7ebf8 is described below
commit 0cad3f7ebf881ea25becf0f759b3fe59626e41fa
Author: Martin Anzinger <[email protected]>
AuthorDate: Thu Dec 19 16:55:44 2024 +0100
SOLR-17306: fix replication problem on follower restart (#2918)
(cherry picked from commit 9cef6e390719cbd7b55085cfef98fcb053785f77)
---
solr/CHANGES.txt | 2 +
.../java/org/apache/solr/handler/IndexFetcher.java | 6 +
.../solr/handler/TestReplicationHandler.java | 135 ++++++++++++++++++++-
3 files changed, 142 insertions(+), 1 deletion(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index b6f30d7f998..08c21bfd7c0 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -117,6 +117,8 @@ Bug Fixes
* SOLR-17595: Fix two issues in Solr CLI that prevent Solr from starting with
the techproducts example and from
correctly parsing arguments on Windows that start with -D and have multiple
values separated by "," or spaces. (Christos Malliaridis)
+* SOLR-17306: fix replication problem on follower restart (Martin Anzinger and Peter Kroiss via Eric Pugh)
+
Dependency Upgrades
---------------------
* PR#2702: chore(deps): update io.netty:* to v4.1.114.final (solrbot)
diff --git a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
index 049af659b3c..061be7a9269 100644
--- a/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
+++ b/solr/core/src/java/org/apache/solr/handler/IndexFetcher.java
@@ -531,6 +531,12 @@ public class IndexFetcher {
IndexDeletionPolicyWrapper.getCommitTimestamp(commit)); // nowarn
}
+ // Leader's version is 0 and generation is 0 - not open for replication
+ if (latestVersion == 0L && latestGeneration == 0L) {
+ log.info("Leader's version is 0 and generation is 0 - not open for
replication");
+ return IndexFetchResult.LEADER_IS_NOT_ACTIVE;
+ }
+
if (latestVersion == 0L) {
if (IndexDeletionPolicyWrapper.getCommitTimestamp(commit) != 0L) {
// since we won't get the files for an empty index,
diff --git a/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java b/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java
index 930a5a2b11f..0625807fed8 100644
--- a/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java
+++ b/solr/core/src/test/org/apache/solr/handler/TestReplicationHandler.java
@@ -118,7 +118,7 @@ public class TestReplicationHandler extends SolrTestCaseJ4 {
public void setUp() throws Exception {
super.setUp();
systemSetPropertySolrDisableUrlAllowList("true");
- // System.setProperty("solr.directoryFactory",
"solr.StandardDirectoryFactory");
+ System.setProperty("solr.directoryFactory",
"solr.StandardDirectoryFactory");
// For manual testing only
// useFactory(null); // force an FS factory.
leader = new SolrInstance(createTempDir("solr-instance").toFile(),
"leader", null);
@@ -1800,6 +1800,139 @@ public class TestReplicationHandler extends SolrTestCaseJ4 {
}
}
+ @Test
+ public void doTestIndexFollowerAfterRestartWhenReplicationIsDisabled()
throws Exception {
+ // failed before changes to IndexFetcher
+ testReplicationRestartFollower("disablereplication");
+ }
+
+ @Test
+ public void doTestIndexFollowerAfterRestartWhenReplicationIsEnabled() throws
Exception {
+ testReplicationRestartFollower("enablereplication");
+ }
+
+ private void testReplicationRestartFollower(String replicationCmd) throws
Exception {
+ useFactory(null);
+ try {
+ clearIndexWithReplication();
+ // change solrconfig having 'replicateAfter startup' option on leader
+ leader.copyConfigFile(CONF_DIR + "solrconfig-leader2.xml",
"solrconfig.xml");
+
+ leaderJetty.stop();
+ final TimeOut waitForLeaderToShutdown =
+ new TimeOut(300, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+ waitForLeaderToShutdown.waitFor(
+ "Gave up after waiting an obscene amount of time for leader to shut
down",
+ () -> leaderJetty.isStopped());
+
+ leaderJetty.start();
+ final TimeOut waitForLeaderToStart = new TimeOut(30, TimeUnit.SECONDS,
TimeSource.NANO_TIME);
+ waitForLeaderToStart.waitFor(
+ "Gave up after waiting an obscene amount of time for leader to
start",
+ () -> leaderJetty.isRunning());
+
+ // close and re-create leader client because its connection pool has
stale connections
+ leaderClient.close();
+ leaderClient =
+ createNewSolrClient(buildUrl(leaderJetty.getLocalPort()),
DEFAULT_TEST_CORENAME);
+
+ NamedList<Object> leaderQueryRsp = rQuery(0, "*:*", leaderClient);
+ SolrDocumentList leaderQueryResult = (SolrDocumentList)
leaderQueryRsp.get("response");
+ assertEquals(0, numFound(leaderQueryRsp));
+
+ // get docs from follower and check if number is equal to leader
+ NamedList<Object> followerQueryRsp = rQuery(0, "*:*", followerClient);
+ SolrDocumentList followerQueryResult = (SolrDocumentList)
followerQueryRsp.get("response");
+ assertEquals(0, numFound(followerQueryRsp));
+
+ // compare results
+ String cmp =
+ BaseDistributedSearchTestCase.compare(leaderQueryResult,
followerQueryResult, 0, null);
+ assertNull(cmp);
+
+ nDocs--;
+ for (int i = 0; i < nDocs; i++) {
+ index(leaderClient, "id", i, "name", "name = " + i);
+ }
+
+ leaderClient.commit();
+
+ leaderQueryRsp = rQuery(nDocs, "*:*", leaderClient);
+ leaderQueryResult = (SolrDocumentList) leaderQueryRsp.get("response");
+ assertEquals(nDocs, numFound(leaderQueryRsp));
+
+ // get docs from follower and check if number is equal to leader
+ followerQueryRsp = rQuery(nDocs, "*:*", followerClient);
+ followerQueryResult = (SolrDocumentList)
followerQueryRsp.get("response");
+ assertEquals(nDocs, numFound(followerQueryRsp));
+
+ // compare results
+ cmp = BaseDistributedSearchTestCase.compare(leaderQueryResult,
followerQueryResult, 0, null);
+ assertNull(cmp);
+
+ String timesReplicatedString =
getFollowerDetails("timesIndexReplicated");
+ String timesFailed;
+ Integer previousTimesFailed = null;
+ if (timesReplicatedString == null) {
+ timesFailed = "0";
+ } else {
+ int timesReplicated = Integer.parseInt(timesReplicatedString);
+ timesFailed = getFollowerDetails("timesFailed");
+ if (null == timesFailed) {
+ timesFailed = "0";
+ }
+
+ previousTimesFailed = Integer.parseInt(timesFailed);
+ // Sometimes replication will fail because leader's core is still
loading; make sure there
+ // was one success
+ assertEquals(1, timesReplicated - previousTimesFailed);
+ }
+
+ followerJetty.stop();
+
+ invokeReplicationCommand(
+ buildUrl(leaderJetty.getLocalPort()) + "/" + DEFAULT_TEST_CORENAME,
replicationCmd);
+
+ final TimeOut waitForFollowerToShutdown =
+ new TimeOut(300, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+ waitForFollowerToShutdown.waitFor(
+ "Gave up after waiting an obscene amount of time for leader to shut
down",
+ () -> followerJetty.isStopped());
+
+ log.info("FOLLOWER START ********************************************");
+ followerJetty.start();
+
+ final TimeOut waitForFollowerToStart =
+ new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
+ waitForFollowerToStart.waitFor(
+ "Gave up after waiting an obscene amount of time for leader to
start",
+ () -> followerJetty.isRunning());
+
+ // poll interval on follower is 1 second, so we just sleep for a few
seconds
+ Thread.sleep(3000);
+ followerClient.close();
+ followerClient =
+ createNewSolrClient(buildUrl(followerJetty.getLocalPort()),
DEFAULT_TEST_CORENAME);
+ NamedList<Object> details = getDetails(followerClient);
+
+ leaderQueryRsp = rQuery(nDocs, "*:*", leaderClient);
+ leaderQueryResult = (SolrDocumentList) leaderQueryRsp.get("response");
+ assertEquals(nDocs, numFound(leaderQueryRsp));
+
+ // get docs from follower and check if number is equal to leader
+ followerQueryRsp = rQuery(nDocs, "*:*", followerClient);
+ followerQueryResult = (SolrDocumentList)
followerQueryRsp.get("response");
+ assertEquals(nDocs, numFound(followerQueryRsp));
+
+ // compare results again
+ cmp = BaseDistributedSearchTestCase.compare(leaderQueryResult,
followerQueryResult, 0, null);
+ assertNull(cmp);
+
+ } finally {
+ resetFactory();
+ }
+ }
+
private void assertReplicationResponseSucceeded(NamedList<?> response) {
assertNotNull("null response from server", response);
assertNotNull("Expected replication response to have 'status' field",
response.get("status"));