This is an automated email from the ASF dual-hosted git repository.
hulk pushed a commit to branch unstable
in repository https://gitbox.apache.org/repos/asf/kvrocks-controller.git
The following commit(s) were added to refs/heads/unstable by this push:
new 54a7ac4 Improve the failover efficiency (#362)
54a7ac4 is described below
commit 54a7ac4eecfb77d68dcdd5753113f3977a88f1f1
Author: Raphael <[email protected]>
AuthorDate: Sun Oct 12 13:24:56 2025 +0800
Improve the failover efficiency (#362)
---
controller/cluster.go | 24 ++++++++++++++----------
store/cluster_node.go | 2 +-
2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/controller/cluster.go b/controller/cluster.go
index 752ec60..8d4e70d 100755
--- a/controller/cluster.go
+++ b/controller/cluster.go
@@ -138,27 +138,30 @@ func (c *ClusterChecker) increaseFailureCount(shardIndex int, node store.Node) i
}
log := logger.Get().With(
+ zap.String("cluster_name", c.clusterName),
zap.String("id", node.ID()),
zap.Bool("is_master", node.IsMaster()),
zap.String("addr", node.Addr()))
- if count%c.options.maxFailureCount == 0 {
+ if count%c.options.maxFailureCount == 0 || count > c.options.maxFailureCount {
cluster, err := c.clusterStore.GetCluster(c.ctx, c.namespace, c.clusterName)
if err != nil {
- log.Error("Failed to get the clusterName info", zap.Error(err))
+ log.Error("Failed to get the cluster info", zap.Error(err))
return count
}
newMasterID, err := cluster.PromoteNewMaster(c.ctx, shardIndex, node.ID(), "")
- if err == nil {
- // the node is normal if it can be elected as the new master,
- // because it requires the node is healthy.
- c.resetFailureCount(newMasterID)
- err = c.clusterStore.UpdateCluster(c.ctx, c.namespace, cluster)
- }
if err != nil {
log.Error("Failed to promote the new master", zap.Error(err))
- } else {
- log.With(zap.String("new_master_id", newMasterID)).Info("Promote the new master")
+ return count
+ }
+ err = c.clusterStore.UpdateCluster(c.ctx, c.namespace, cluster)
+ if err != nil {
+ log.Error("Failed to update the cluster", zap.Error(err))
+ return count
}
+ // the node is normal if it can be elected as the new master,
+ // because it requires the node is healthy.
+ c.resetFailureCount(newMasterID)
+ log.With(zap.String("new_master_id", newMasterID)).Info("Promote the new master")
}
return count
}
@@ -216,6 +219,7 @@ func (c *ClusterChecker) parallelProbeNodes(ctx context.Context, cluster *store.
go func(shardIdx int, n store.Node) {
defer wg.Done()
log := logger.Get().With(
+ zap.String("cluster_name", c.clusterName),
zap.String("id", n.ID()),
zap.Bool("is_master", n.IsMaster()),
zap.String("addr", n.Addr()),
diff --git a/store/cluster_node.go b/store/cluster_node.go
old mode 100644
new mode 100755
index 5882dff..8fcd106
--- a/store/cluster_node.go
+++ b/store/cluster_node.go
@@ -46,7 +46,7 @@ const (
dialTimeout = 3200 * time.Millisecond
readTimeout = 3 * time.Second
writeTimeout = 3 * time.Second
- minIdleConns = 3
+ minIdleConns = 10
)
var (