This is an automated email from the ASF dual-hosted git repository.
tanxinyu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/master by this push:
new a54f7f7be51 Make DN wait when CN cluster's leader is not ready or
down. (#13236)
a54f7f7be51 is described below
commit a54f7f7be51a697eb50cda229a89000f648ce66d
Author: Peng Junzhi <[email protected]>
AuthorDate: Wed Aug 21 10:31:16 2024 +0800
Make DN wait when CN cluster's leader is not ready or down. (#13236)
* make dn wait when cn cluster's leader is not ready or down.
* fix
---
.../iotdb/db/protocol/client/ConfigNodeClient.java | 25 +++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java
index 04057ef3def..e987829a98a 100644
---
a/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java
+++
b/iotdb-core/datanode/src/main/java/org/apache/iotdb/db/protocol/client/ConfigNodeClient.java
@@ -173,14 +173,15 @@ public class ConfigNodeClient implements
IConfigNodeRPCService.Iface, ThriftClie
private static final Logger logger =
LoggerFactory.getLogger(ConfigNodeClient.class);
- private static final int RETRY_NUM = 10;
+ private static final int RETRY_NUM = 15;
public static final String MSG_RECONNECTION_FAIL =
"Fail to connect to any config node. Please check status of ConfigNodes
or logs of connected DataNode";
private static final String MSG_RECONNECTION_DATANODE_FAIL =
"Failed to connect to ConfigNode %s from DataNode %s when executing %s,
Exception:";
- private static final int RETRY_INTERVAL_MS = 1000;
+ private static final long RETRY_INTERVAL_MS = 1000L;
+ private static final long WAIT_CN_LEADER_ELECTION_INTERVAL_MS = 2000L;
private final ThriftClientProperty property;
@@ -262,7 +263,7 @@ public class ConfigNodeClient implements
IConfigNodeRPCService.Iface, ThriftClie
connect(configLeader, timeoutMs);
return;
} catch (TException ignore) {
- logger.warn("The current node may have been down {},try next node",
configLeader);
+ logger.warn("The current node leader may have been down {}, try next
node", configLeader);
configLeader = null;
}
} else {
@@ -358,12 +359,14 @@ public class ConfigNodeClient implements
IConfigNodeRPCService.Iface, ThriftClie
*/
private <T> T executeRemoteCallWithRetry(final Operation<T> call, final
Predicate<T> check)
throws TException {
+ int detectedNodeNum = 0;
for (int i = 0; i < RETRY_NUM; i++) {
try {
final T result = call.execute();
if (check.test(result)) {
return result;
}
+ detectedNodeNum++;
} catch (TException e) {
final String message =
String.format(
@@ -374,6 +377,22 @@ public class ConfigNodeClient implements
IConfigNodeRPCService.Iface, ThriftClie
logger.warn(message, e);
configLeader = null;
}
+
+ // If we have detected all configNodes and still not return
+ if (detectedNodeNum >= configNodes.size()) {
+ // Clear count
+ detectedNodeNum = 0;
+ // Wait to start the next try
+ try {
+ Thread.sleep(WAIT_CN_LEADER_ELECTION_INTERVAL_MS);
+ } catch (InterruptedException ignore) {
+ Thread.currentThread().interrupt();
+ logger.warn(
+ "Unexpected interruption when waiting to try to connect to
ConfigNode, may because current node has been down. Will break current
execution process to avoid meaningless wait.");
+ break;
+ }
+ }
+
connectAndSync();
}
throw new TException(MSG_RECONNECTION_FAIL);