This is an automated email from the ASF dual-hosted git repository. dlych pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit 06f7c1852a14e7b33ff4034dbceb69a3534eaaae Author: Murtadha Hubail <[email protected]> AuthorDate: Thu Mar 24 01:43:28 2022 +0300 [NO ISSUE][REP] Increase replication ack timeout - user model changes: no - storage format changes: no - interface changes: no Details: - Increase replication ack timeout to 120 seconds. Change-Id: I228620af371d651a84160231cdd832ca1087e7f9 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/15843 Reviewed-by: Ali Alsuliman <[email protected]> Integration-Tests: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> --- .../results/api/cluster_state_1/cluster_state_1.1.regexadm | 2 +- .../api/cluster_state_1_full/cluster_state_1_full.1.regexadm | 2 +- .../api/cluster_state_1_less/cluster_state_1_less.1.regexadm | 2 +- .../org/apache/asterix/common/config/ReplicationProperties.java | 2 +- .../asterix/replication/messaging/MarkComponentValidTask.java | 7 ++++++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm index 1805e7a8f3..c55c0bcbee 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm @@ -43,7 +43,7 @@ "replication\.log\.buffer\.numpages" : 8, "replication\.log\.buffer\.pagesize" : 131072, "replication\.strategy" : "none", - "replication\.timeout" : 30, + "replication\.timeout" : 120, "ssl\.enabled" : false, "storage.compression.block" : "snappy", "storage.global.cleanup.timeout" : 600, diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_full/cluster_state_1_full.1.regexadm b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_full/cluster_state_1_full.1.regexadm index 743347a005..661daf3ed2 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_full/cluster_state_1_full.1.regexadm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_full/cluster_state_1_full.1.regexadm @@ -43,7 +43,7 @@ "replication\.log\.buffer\.numpages" : 8, "replication\.log\.buffer\.pagesize" : 131072, "replication\.strategy" : "none", - "replication\.timeout" : 30, + "replication\.timeout" : 120, "ssl\.enabled" : false, "storage.compression.block" : "snappy", "storage.global.cleanup.timeout" : 600, diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm index 4359bd9ff2..1f0e865dcf 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm @@ -43,7 +43,7 @@ "replication\.log\.buffer\.numpages" : 8, "replication\.log\.buffer\.pagesize" : 131072, "replication\.strategy" : "none", - "replication\.timeout" : 30, + "replication\.timeout" : 120, "ssl\.enabled" : false, "storage.compression.block" : "snappy", "storage.global.cleanup.timeout" : 600, diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/config/ReplicationProperties.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/config/ReplicationProperties.java index dd42936812..ada3875c10 100644 --- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/config/ReplicationProperties.java +++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/config/ReplicationProperties.java @@ -48,7 +48,7 @@ public class ReplicationProperties extends AbstractProperties { "The size in bytes to replicate in each batch"), REPLICATION_TIMEOUT( LONG, - TimeUnit.SECONDS.toSeconds(30), + TimeUnit.SECONDS.toSeconds(120), "The time in seconds to timeout waiting for master or replica to ack"), REPLICATION_ENABLED(BOOLEAN, false, "Whether or not data replication is enabled"), REPLICATION_FACTOR(NONNEGATIVE_INTEGER, 2, "Number of replicas (backups) to maintain per master replica"), diff --git a/asterixdb/asterix-replication/src/main/java/org/apache/asterix/replication/messaging/MarkComponentValidTask.java b/asterixdb/asterix-replication/src/main/java/org/apache/asterix/replication/messaging/MarkComponentValidTask.java index 1ea076d8fa..172bd59ca1 100644 --- a/asterixdb/asterix-replication/src/main/java/org/apache/asterix/replication/messaging/MarkComponentValidTask.java +++ b/asterixdb/asterix-replication/src/main/java/org/apache/asterix/replication/messaging/MarkComponentValidTask.java @@ -37,12 +37,15 @@ import org.apache.asterix.replication.api.IReplicationWorker; import org.apache.asterix.replication.sync.IndexSynchronizer; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.hyracks.storage.am.lsm.common.impls.IndexComponentFileReference; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; /** * A task to mark a replicated LSM component as valid */ public class MarkComponentValidTask implements IReplicaTask { + private static final Logger LOGGER = LogManager.getLogger(); private final long masterLsn; private final long lastComponentId; private final String file; @@ -90,7 +93,9 @@ public class MarkComponentValidTask implements IReplicaTask { // wait until the lsn mapping is flushed to disk while (!indexCheckpointManager.isFlushed(masterLsn)) { if (replicationTimeOut <= 0) { - throw new ReplicationException(new TimeoutException("Couldn't receive flush lsn from master")); + LOGGER.warn("{} seconds passed without receiving flush lsn {} from master for component {}", + appCtx.getReplicationProperties().getReplicationTimeOut(), masterLsn, file); + throw new ReplicationException(new TimeoutException("couldn't receive flush lsn from master")); } final long startTime = System.nanoTime(); indexCheckpointManager.wait(replicationTimeOut);
