This is an automated email from the ASF dual-hosted git repository.
eolivelli pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git
The following commit(s) were added to refs/heads/master by this push:
new dc2eafa BookieAutoRecoveryTest.testEmptyLedgerLosesQuorumEventually
fix flaky test, ensure that the Auditor is alive (#3149)
dc2eafa is described below
commit dc2eafa680384c1fc653ad9fd83d4cd5a66d8af4
Author: Enrico Olivelli <[email protected]>
AuthorDate: Mon Mar 28 17:53:04 2022 +0200
BookieAutoRecoveryTest.testEmptyLedgerLosesQuorumEventually fix flaky test,
ensure that the Auditor is alive (#3149)
---
.../bookkeeper/replication/AutoRecoveryMain.java | 5 ++++
.../bookkeeper/replication/ReplicationWorker.java | 10 +++++++-
.../replication/BookieAutoRecoveryTest.java | 6 ++++-
.../bookkeeper/test/BookKeeperClusterTestCase.java | 29 +++++++++++++++++++++-
4 files changed, 47 insertions(+), 3 deletions(-)
diff --git
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java
index 39d1d74..0f2b2c0 100644
---
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java
+++
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/AutoRecoveryMain.java
@@ -185,6 +185,11 @@ public class AutoRecoveryMain {
return auditorElector.getAuditor();
}
+ @VisibleForTesting
+ public ReplicationWorker getReplicationWorker() {
+ return replicationWorker;
+ }
+
/** Is auto-recovery service running? */
public boolean isAutoRecoveryRunning() {
return running;
diff --git
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
index 6509cdd..d897e77 100644
---
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
+++
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/replication/ReplicationWorker.java
@@ -25,6 +25,8 @@ import static
org.apache.bookkeeper.replication.ReplicationStats.NUM_FULL_OR_PAR
import static
org.apache.bookkeeper.replication.ReplicationStats.REPLICATE_EXCEPTION;
import static
org.apache.bookkeeper.replication.ReplicationStats.REPLICATION_WORKER_SCOPE;
import static
org.apache.bookkeeper.replication.ReplicationStats.REREPLICATE_OP;
+
+import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Stopwatch;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
@@ -244,6 +246,11 @@ public class ReplicationWorker implements Runnable {
LOG.error("UnavailableException "
+ "while replicating fragments", e);
waitBackOffTime(rwRereplicateBackoffMs);
+ if (Thread.currentThread().isInterrupted()) {
+ LOG.error("Interrupted while replicating fragments");
+ shutdown();
+ return;
+ }
}
}
LOG.info("ReplicationWorker exited loop!");
@@ -646,7 +653,8 @@ public class ReplicationWorker implements Runnable {
/**
* Gives the running status of ReplicationWorker.
*/
- boolean isRunning() {
+ @VisibleForTesting
+ public boolean isRunning() {
return workerRunning && workerThread.isAlive();
}
diff --git
a/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java
index 05937fe..a8e89dc 100644
---
a/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java
+++
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/replication/BookieAutoRecoveryTest.java
@@ -403,8 +403,12 @@ public class BookieAutoRecoveryTest extends
BookKeeperClusterTestCase {
assertTrue("Should be marked as underreplicated", latch.await(5,
TimeUnit.SECONDS));
latch = new CountDownLatch(1);
s = watchUrLedgerNode(urZNode, latch); // should be marked as
replicated
+
+ startNewBookie();
+ getAuditor(10, TimeUnit.SECONDS).submitAuditTask().get(); // ensure
auditor runs
+
if (s != null) {
- assertTrue("Should be marked as replicated", latch.await(5,
TimeUnit.SECONDS));
+ assertTrue("Should be marked as replicated", latch.await(20,
TimeUnit.SECONDS));
}
// should be able to open ledger without issue
diff --git
a/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
index aa4b510..1b14f17 100644
---
a/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
+++
b/bookkeeper-server/src/test/java/org/apache/bookkeeper/test/BookKeeperClusterTestCase.java
@@ -73,6 +73,7 @@ import org.apache.bookkeeper.net.BookieSocketAddress;
import org.apache.bookkeeper.proto.BookieServer;
import org.apache.bookkeeper.replication.Auditor;
import org.apache.bookkeeper.replication.AutoRecoveryMain;
+import org.apache.bookkeeper.replication.ReplicationWorker;
import org.apache.bookkeeper.server.Main;
import org.apache.bookkeeper.stats.StatsLogger;
import org.apache.bookkeeper.util.DiskChecker;
@@ -426,6 +427,11 @@ public abstract class BookKeeperClusterTestCase {
public ServerConfiguration killBookie(BookieId addr) throws Exception {
Optional<ServerTester> tester = byAddress(addr);
if (tester.isPresent()) {
+ if (tester.get().autoRecovery != null
+ && tester.get().autoRecovery.getAuditor() != null
+ && tester.get().autoRecovery.getAuditor().isRunning()) {
+ LOG.warn("Killing bookie {} who is the current Auditor", addr);
+ }
servers.remove(tester.get());
tester.get().shutdown();
return tester.get().getConfiguration();
@@ -763,7 +769,20 @@ public abstract class BookKeeperClusterTestCase {
while (System.nanoTime() < timeoutAt) {
for (ServerTester t : servers) {
Auditor a = t.getAuditor();
- if (a != null) {
+ ReplicationWorker replicationWorker = t.getReplicationWorker();
+
+ // found a candidate Auditor + ReplicationWorker
+ if (a != null && a.isRunning()
+ && replicationWorker != null &&
replicationWorker.isRunning()) {
+ int deathWatchInterval =
t.getConfiguration().getDeathWatchInterval();
+ Thread.sleep(deathWatchInterval + 1000);
+ }
+
+ // double check, because in the meantime
AutoRecoveryDeathWatcher may have killed the
+ // AutoRecovery daemon
+ if (a != null && a.isRunning()
+ && replicationWorker != null &&
replicationWorker.isRunning()) {
+ LOG.info("Found Auditor Bookie {}",
t.server.getBookieId());
return a;
}
}
@@ -899,6 +918,14 @@ public abstract class BookKeeperClusterTestCase {
}
}
+ ReplicationWorker getReplicationWorker() {
+ if (autoRecovery != null) {
+ return autoRecovery.getReplicationWorker();
+ } else {
+ return null;
+ }
+ }
+
ServerConfiguration getConfiguration() {
return conf;
}