This is an automated email from the ASF dual-hosted git repository.
lhotari pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git
The following commit(s) were added to refs/heads/master by this push:
new 497aa4e53b Count the connection failure as the condition of quarantine
(#4727)
497aa4e53b is described below
commit 497aa4e53b9b10dad3c1e8800bcf417c9ef8014a
Author: Yong Zhang <[email protected]>
AuthorDate: Thu Mar 19 17:09:49 2026 +0800
Count the connection failure as the condition of quarantine (#4727)
* Count the connection failure as the condition of quarantine
---
### Motivation
Currently, the BookieClient quarantine mechanism primarily triggers based
on read and write error responses from Bookies. However, in multi-region
deployments, a common failure mode is a network partition or a DNS resolution
failure at the region level.
In such scenarios:
1. A Bookie remains registered in ZooKeeper (it can still heartbeat to its
   local ZK observer).
2. The Client (Broker) cannot resolve the Bookie's IP or establish a TCP
   connection.
3. The EnsemblePlacementPolicy (especially RegionAwareEnsemblePlacementPolicy)
   sees the Bookie as "Available" and repeatedly selects it to satisfy minRack
   or E/Qw constraints.
4. The LedgerHandle fails to write because it cannot initialize a connection
   handle, triggering an Ensemble Change.
5. Because the connection failure didn't trigger a quarantine, the placement
   policy picks the same problematic Bookie again in the next iteration.

This creates an infinite Ensemble Change loop, causing the Ledger write to
hang indefinitely and bloating the Ledger metadata in ZooKeeper with thousands
of segments.
* Add configuration to control the behavior
---
.../bookkeeper/conf/ClientConfiguration.java | 23 ++++++++++++++++++++++
.../bookkeeper/proto/PerChannelBookieClient.java | 3 +++
2 files changed, 26 insertions(+)
diff --git
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
index dde3f7e8f6..95cb43f06e 100644
---
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
+++
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/conf/ClientConfiguration.java
@@ -140,6 +140,7 @@ public class ClientConfiguration extends
AbstractConfiguration<ClientConfigurati
protected static final String BOOKIE_ERROR_THRESHOLD_PER_INTERVAL =
"bookieErrorThresholdPerInterval";
protected static final String BOOKIE_QUARANTINE_TIME_SECONDS =
"bookieQuarantineTimeSeconds";
protected static final String BOOKIE_QUARANTINE_RATIO =
"bookieQuarantineRatio";
+ protected static final String BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED =
"bookieConnectionErrorQuarantineEnabled";
// Bookie info poll interval
protected static final String DISK_WEIGHT_BASED_PLACEMENT_ENABLED =
"diskWeightBasedPlacementEnabled";
@@ -1456,6 +1457,28 @@ public class ClientConfiguration extends
AbstractConfiguration<ClientConfigurati
return this;
}
+
+ /**
+ * Set if count the bookie connecting error into the quarantine condition.
If this is enabled, the connection
+ * error will be counted into the BookieErrorThresholdPerInterval. So be
careful to set the quarantine time.
+ *
+ * @param enabled
+ * @return
+ */
+ public ClientConfiguration
setBookieConnectionErrorQuarantineEnabled(boolean enabled) {
+ setProperty(BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED, enabled);
+ return this;
+ }
+
+ /**
+ * Get if count the bookie connecting error into the quarantine condition.
+ *
+ * @return
+ */
+ public boolean getBookieConnectionErrorQuarantineEnabled() {
+ return getBoolean(BOOKIE_CONNECTION_ERROR_QUARANTINE_ENABLED, false);
+ }
+
/**
* Get the time for which a bookie will be quarantined.
*
diff --git
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
index 655e206bbc..892306797e 100644
---
a/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
+++
b/bookkeeper-server/src/main/java/org/apache/bookkeeper/proto/PerChannelBookieClient.java
@@ -1818,6 +1818,9 @@ public class PerChannelBookieClient extends
ChannelInboundHandlerAdapter {
if (state != ConnectionState.CLOSED) {
state = ConnectionState.DISCONNECTED;
}
+ if (conf.getBookieConnectionErrorQuarantineEnabled()) {
+ recordError();
+ }
failedConnectionCounter.inc();
}