This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 3083438b186 HDDS-15569. Speed up TestClientRetryTimeout by tightening
configurations (#10527)
3083438b186 is described below
commit 3083438b186aeb5e28b318f3aab3df9cd2368a00
Author: Rishabh Patel <[email protected]>
AuthorDate: Tue Jun 16 22:35:39 2026 -0700
HDDS-15569. Speed up TestClientRetryTimeout by tightening configurations
(#10527)
---
.../hdds/ratis/conf/TestRatisClientConfig.java | 70 ++++++++++
.../ozone/client/rpc/TestClientRetryTimeout.java | 144 +++++++++++----------
2 files changed, 145 insertions(+), 69 deletions(-)
diff --git
a/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java
b/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java
index c4d61aa4b29..3c9db4eade1 100644
---
a/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java
+++
b/hadoop-hdds/common/src/test/java/org/apache/hadoop/hdds/ratis/conf/TestRatisClientConfig.java
@@ -17,6 +17,7 @@
package org.apache.hadoop.hdds.ratis.conf;
+import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.time.Duration;
@@ -69,4 +70,73 @@ void setAndGet() {
assertEquals(maxRetry, subject.getExponentialPolicyMaxRetries());
}
+ /**
+ * Regression guard for HDDS-15444: the production defaults must keep the
+ * worst-case wall-clock of a single Ratis-client retry cycle bounded.
+ * Per cycle = write-rpc + max-retries × (backoff-sleep + write-rpc) +
+ * watch-rpc. With the post-HDDS-15444 defaults this is ~213 s; we assert
+ * it stays under 4 minutes so a future revert of any one knob is caught
+ * in a unit test rather than in a multi-minute integration test.
+ */
+ @Test
+ void defaultsBoundSingleCycleWallClock() {
+ RatisClientConfig subject = new OzoneConfiguration()
+ .getObject(RatisClientConfig.class);
+ RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
+ .getObject(RatisClientConfig.RaftConfig.class);
+
+ Duration writeRpc = raftSubject.getRpcRequestTimeout();
+ Duration watchRpc = raftSubject.getRpcWatchRequestTimeout();
+ int maxRetries = subject.getExponentialPolicyMaxRetries();
+ Duration maxBackoff = subject.getExponentialPolicyMaxSleep();
+
+ Duration perCycle = writeRpc
+ .plus(maxBackoff.plus(writeRpc).multipliedBy(maxRetries))
+ .plus(watchRpc);
+
+ assertThat(perCycle)
+ .as("Single Ratis-client retry cycle worst-case wall-clock with "
+ + "production defaults (writeRpc=%s, watchRpc=%s,
maxRetries=%d, "
+ + "maxBackoff=%s) must stay bounded; a regression here means "
+ + "client writes against a dead pipeline can hang for
minutes.",
+ writeRpc, watchRpc, maxRetries, maxBackoff)
+ .isLessThan(Duration.ofMinutes(4));
+ }
+
+ /**
+ * Regression guard for HDDS-15444: the bounded exponential backoff is
+ * what stops the Ratis client from retrying indefinitely. If this is
+ * ever set back to {@code Integer.MAX_VALUE} (the pre-HDDS-15444
+ * behaviour) write failures revert to multi-minute hangs.
+ */
+ @Test
+ void defaultsCapExponentialMaxRetries() {
+ RatisClientConfig subject = new OzoneConfiguration()
+ .getObject(RatisClientConfig.class);
+
+ assertThat(subject.getExponentialPolicyMaxRetries())
+ .as("hdds.ratis.client.exponential.backoff.max.retries must remain "
+ + "bounded; unbounded retries reintroduce the HDDS-15444 hang.")
+ .isPositive()
+ .isLessThanOrEqualTo(5);
+ }
+
+ /**
+ * Regression guard for HDDS-15444: the client-side watch RPC timeout
+ * must align with the server-side watch timeout (30 s by default).
+ * If the client waits longer than the server is willing to honour, the
+ * client hangs past the server-side abort.
+ */
+ @Test
+ void defaultsAlignWatchTimeoutWithServer() {
+ RatisClientConfig.RaftConfig raftSubject = new OzoneConfiguration()
+ .getObject(RatisClientConfig.RaftConfig.class);
+
+ assertThat(raftSubject.getRpcWatchRequestTimeout())
+ .as("hdds.ratis.raft.client.rpc.watch.request.timeout should be "
+ + "close to the server-side watch timeout (30 s); a much larger "
+ + "value lets the client hang past the server's abort.")
+ .isLessThanOrEqualTo(Duration.ofSeconds(60));
+ }
+
}
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestClientRetryTimeout.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestClientRetryTimeout.java
index fb4ac461087..1c08b169c8b 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestClientRetryTimeout.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestClientRetryTimeout.java
@@ -18,6 +18,8 @@
package org.apache.hadoop.ozone.client.rpc;
import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL;
+import static
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL;
import static
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
@@ -35,6 +37,7 @@
import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.ratis.conf.RatisClientConfig;
import org.apache.hadoop.hdds.scm.OzoneClientConfig;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.XceiverClientRatis;
@@ -63,9 +66,20 @@
import org.slf4j.LoggerFactory;
/**
- * Verifies that client write operations
- * fail within acceptable time bounds when pipelines/datanodes are down.
- *
+ * Verifies that client write operations fail within acceptable time bounds
+ * when pipelines/datanodes are down.
+ * <p>
+ * This class covers the <em>plumbing</em>: that the layered retry path
+ * (Ratis-client retries × {@code ozone.client.max.retries}) terminates in
+ * bounded time when the pipeline is unusable. To keep the suite under the
+ * per-test wall-clock budget the cluster is started with compressed retry
+ * and timeout values; the assertions are scaled accordingly. A regression
+ * that re-introduces unbounded retries here will still trip the bound.
+ * <p>
+ * The companion regression check on the <em>production defaults</em> lives
+ * in {@code TestRatisClientConfig} (hdds-common). That unit test is what
+ * catches a future revert of any of the HDDS-15444 default values without
+ * needing a mini-cluster.
*/
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@@ -83,37 +97,21 @@ public class TestClientRetryTimeout {
/**
* Maximum acceptable duration for a SINGLE retry cycle (write + watch)
* when the pipeline is completely dead.
- * <p>
- * With the proposed config fixes:
- * Write: RPC(60s) + retry1(1s+60s) + retry2(2s+60s) ≈ 183s
- * Watch: RPC(30s)
- * Total single cycle: ~213s
- * <p>
- * We use 4 minutes (240s) as the upper bound with some margin.
- * The OLD defaults would take ~8 minutes per cycle.
*/
private static final Duration MAX_SINGLE_CYCLE_DURATION =
- Duration.ofMinutes(4);
+ Duration.ofSeconds(90);
/**
* Maximum acceptable duration for the watch-for-commit operation alone.
- * <p>
- * With the proposed fix: watch RPC timeout is 30s, but the write timeout
- * (70s) may also be hit if majority is lost. We use 120s to accommodate
- * write(70s) + watch(30s) + overhead.
- * The OLD default would take 180s+ (3 minutes watch alone).
*/
- private static final Duration MAX_WATCH_DURATION = Duration.ofSeconds(120);
+ private static final Duration MAX_WATCH_DURATION = Duration.ofSeconds(75);
/**
* Maximum acceptable duration for the end-to-end write failure with
- * Ozone-level retries (ozone.client.max.retries = 5).
- * <p>
- * With proposed fixes: 5 × ~93s ≈ 465s ≈ 8 min.
- * The OLD defaults would take ~40 minutes.
+ * Ozone-level retries.
*/
private static final Duration MAX_TOTAL_WRITE_DURATION =
- Duration.ofMinutes(10);
+ Duration.ofSeconds(180);
private MiniOzoneCluster cluster;
private OzoneClient client;
@@ -135,15 +133,43 @@ public void init() throws Exception {
OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class);
clientConfig.setStreamBufferFlushDelay(false);
+ // Cap ozone-level retries at 1 (i.e. 2 attempts total): enough to
+ // exercise the compound retry multiplier without bloating wall-clock.
+ clientConfig.setMaxRetryCount(1);
conf.setFromObject(clientConfig);
+ // Compress Ratis client retry/timeout knobs for tests only. The test
+ // verifies that retry plumbing terminates in bounded time; the ratio
+ // holds at any scale, so we run with smaller absolute values. Values
+ // are kept conservative enough that a healthy round-trip on a loaded
+ // CI runner (multi-second GC pauses, JVM stalls) doesn't spuriously
+ // trip a per-RPC timeout.
+ RatisClientConfig ratisClient = conf.getObject(RatisClientConfig.class);
+ ratisClient.setWriteRequestTimeout(Duration.ofSeconds(15));
+ ratisClient.setWatchRequestTimeout(Duration.ofSeconds(10));
+ ratisClient.setExponentialPolicyBaseSleep(Duration.ofMillis(500));
+ ratisClient.setExponentialPolicyMaxSleep(Duration.ofSeconds(1));
+ ratisClient.setExponentialPolicyMaxRetries(1);
+ conf.setFromObject(ratisClient);
+
+ RatisClientConfig.RaftConfig raftClient =
+ conf.getObject(RatisClientConfig.RaftConfig.class);
+ raftClient.setRpcRequestTimeout(Duration.ofSeconds(10));
+ raftClient.setRpcWatchRequestTimeout(Duration.ofSeconds(10));
+ conf.setFromObject(raftClient);
+
// Fast leader election so new leader can be chosen quickly
conf.setTimeDuration(
OzoneConfigKeys.HDDS_RATIS_LEADER_ELECTION_MINIMUM_TIMEOUT_DURATION_KEY,
1, TimeUnit.SECONDS);
- // Prevent SCM from removing dead datanodes during the test
- conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 300, TimeUnit.SECONDS);
+ // Fast heartbeats so SCM detects restarted DNs quickly. Stale=60s and
+ // dead=300s are sized so that SCM does not transition killed DNs to
+ // DEAD inside a single test (which would change the failure mode from
+ // "RPC timeout" to "pipeline removed"); STALE during a test is fine.
+ conf.setTimeDuration(HDDS_HEARTBEAT_INTERVAL, 1, TimeUnit.SECONDS);
+ conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 60, TimeUnit.SECONDS);
+ conf.setTimeDuration(OZONE_SCM_DEADNODE_INTERVAL, 300, TimeUnit.SECONDS);
// Allow multiple pipelines per datanode to accommodate all tests
conf.setInt(ScmConfigKeys.OZONE_DATANODE_PIPELINE_LIMIT, 5);
@@ -175,15 +201,9 @@ public void shutdown() {
/**
* Test 1: Write to a pipeline where ALL datanodes are dead.
* <p>
- * Demonstrates the problem:
- * - Client sends WriteChunk to dead leader
- * - TimeoutIOException triggers exponential backoff
- * - OLD config: retries with unlimited max retries for up to 5 minutes
- * - FIXED config: retries at most 2 times, fails in ~63s
- * <p>
- * This test asserts that the write failure is detected within
- * MAX_SINGLE_CYCLE_DURATION (4 minutes), which would fail with the old
- * 5-minute write timeout + 3-minute watch timeout = 8 minutes.
+ * Verifies that when WriteChunk is sent to a dead leader, exponential
+ * backoff terminates after the configured retry count and surfaces the
+ * failure within {@link #MAX_SINGLE_CYCLE_DURATION}.
*/
@Test
@Order(1)
@@ -239,24 +259,23 @@ public void testWriteToDeadPipelineFailsFast() throws
Exception {
MAX_SINGLE_CYCLE_DURATION, elapsed)
.isLessThan(MAX_SINGLE_CYCLE_DURATION);
- // Restart the datanodes for subsequent tests
+ // Restart the datanodes and wait for SCM to have a writable pipeline
+ // before the next test runs. We don't need full cluster recovery
+ // (all 7 DNs HEALTHY), only one OPEN factor-THREE pipeline.
for (DatanodeDetails dn : nodes) {
cluster.restartHddsDatanode(dn, false);
}
- cluster.waitForClusterToBeReady();
+ cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
+ 60000);
}
/**
* Test 2: Watch-for-commit when follower datanodes are dead.
* <p>
- * Demonstrates the problem:
- * - Client writes data, leader commits but followers are dead
- * - Watch-ALL_COMMITTED is issued
- * - OLD config: watch RPC timeout is 180s, client waits 3 minutes
- * - FIXED config: watch RPC timeout is 30s, matches server timeout
- * <p>
- * This test asserts that the watch failure is detected within
- * MAX_WATCH_DURATION (60s), which would fail with the old 180s timeout.
+ * Verifies that {@code watch(ALL_COMMITTED)} terminates within
+ * {@link #MAX_WATCH_DURATION} when followers cannot ack — the client
+ * watch RPC timeout must align with the server-side watch timeout
+ * rather than running well past it.
*/
@Test
@Order(2)
@@ -328,21 +347,16 @@ public void
testWatchForCommitWithDeadFollowersFailsFast() throws Exception {
} catch (Exception e) {
// May already be running
}
- cluster.waitForClusterToBeReady();
+ cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
+ 60000);
}
/**
* Test 3: Write failure when the Raft leader is specifically killed.
* <p>
- * Demonstrates the problem:
- * - Client is writing to a pipeline
- * - The Raft leader is killed mid-write
- * - RaftClient still holds stale connection to dead leader
- * - OLD config: exponential backoff retries indefinitely for 5 min
- * - FIXED config: limited to 2 retries, fails in ~63s
- * <p>
- * This test asserts the entire operation (write + close) completes
- * within MAX_SINGLE_CYCLE_DURATION.
+ * Verifies that with the leader killed mid-write, the RaftClient does
+ * not retry forever against the stale leader; the write either recovers
+ * via re-election or fails within {@link #MAX_SINGLE_CYCLE_DURATION}.
*/
@Test
@Order(3)
@@ -404,21 +418,19 @@ public void testWriteWithLeaderFailureFailsFast() throws
Exception {
// Restart the leader
cluster.restartHddsDatanode(leader.getDatanodeDetails(), false);
- cluster.waitForClusterToBeReady();
+ cluster.waitForPipelineTobeReady(HddsProtos.ReplicationFactor.THREE,
+ 60000);
}
/**
* Test 4: End-to-end write with ALL datanodes killed, verifying the
* total time including Ozone-level retries.
* <p>
- * Demonstrates the compound problem:
- * - Ozone retries up to 5 times (ozone.client.max.retries)
- * - Each retry allocates a new pipeline, but ALL datanodes are dead
- * - OLD config: 5 × (5min write + 3min watch) = 40 minutes
- * - FIXED config: 5 × (~63s write + 30s watch) ≈ 7.7 minutes
- * <p>
- * This test asserts the TOTAL time is within MAX_TOTAL_WRITE_DURATION
- * (10 minutes), which would fail with the old 40-minute worst case.
+ * Verifies that the compound retry path
+ * ({@code ozone.client.max.retries} × per-cycle Ratis retries) does not
+ * multiply into an unbounded wait. This is the test that catches the
+ * interaction between the two retry layers; tests 1–3 only exercise
+ * each layer in isolation.
*/
@Test
@Order(4)
@@ -464,12 +476,6 @@ public void
testEndToEndWriteWithAllDatanodesDownFailsFast()
+ "is causing the client to hang.",
MAX_TOTAL_WRITE_DURATION, elapsed)
.isLessThan(MAX_TOTAL_WRITE_DURATION);
-
- // Restart all datanodes
- for (HddsDatanodeService dn : allDatanodes) {
- cluster.restartHddsDatanode(dn.getDatanodeDetails(), false);
- }
- cluster.waitForClusterToBeReady();
}
private String getKeyName() {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]