This is an automated email from the ASF dual-hosted git repository.

hulk pushed a commit to branch unstable
in repository https://gitbox.apache.org/repos/asf/kvrocks-controller.git


The following commit(s) were added to refs/heads/unstable by this push:
     new 54a7ac4  Improve the failover efficiency (#362)
54a7ac4 is described below

commit 54a7ac4eecfb77d68dcdd5753113f3977a88f1f1
Author: Raphael <[email protected]>
AuthorDate: Sun Oct 12 13:24:56 2025 +0800

    Improve the failover efficiency (#362)
---
 controller/cluster.go | 24 ++++++++++++++----------
 store/cluster_node.go |  2 +-
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/controller/cluster.go b/controller/cluster.go
index 752ec60..8d4e70d 100755
--- a/controller/cluster.go
+++ b/controller/cluster.go
@@ -138,27 +138,30 @@ func (c *ClusterChecker) increaseFailureCount(shardIndex int, node store.Node) i
        }
 
        log := logger.Get().With(
+               zap.String("cluster_name", c.clusterName),
                zap.String("id", node.ID()),
                zap.Bool("is_master", node.IsMaster()),
                zap.String("addr", node.Addr()))
-       if count%c.options.maxFailureCount == 0 {
+       if count%c.options.maxFailureCount == 0 || count > c.options.maxFailureCount {
                cluster, err := c.clusterStore.GetCluster(c.ctx, c.namespace, c.clusterName)
                if err != nil {
-                       log.Error("Failed to get the clusterName info", zap.Error(err))
+                       log.Error("Failed to get the cluster info", zap.Error(err))
                        return count
                }
                newMasterID, err := cluster.PromoteNewMaster(c.ctx, shardIndex, node.ID(), "")
-               if err == nil {
-                       // the node is normal if it can be elected as the new master,
-                       // because it requires the node is healthy.
-                       c.resetFailureCount(newMasterID)
-                       err = c.clusterStore.UpdateCluster(c.ctx, c.namespace, cluster)
-               }
                if err != nil {
                        log.Error("Failed to promote the new master", zap.Error(err))
-               } else {
-                       log.With(zap.String("new_master_id", newMasterID)).Info("Promote the new master")
+                       return count
+               }
+               err = c.clusterStore.UpdateCluster(c.ctx, c.namespace, cluster)
+               if err != nil {
+                       log.Error("Failed to update the cluster", zap.Error(err))
+                       return count
                }
+               // the node is normal if it can be elected as the new master,
+               // because it requires the node is healthy.
+               c.resetFailureCount(newMasterID)
+               log.With(zap.String("new_master_id", newMasterID)).Info("Promote the new master")
        }
        return count
 }
@@ -216,6 +219,7 @@ func (c *ClusterChecker) parallelProbeNodes(ctx context.Context, cluster *store.
                        go func(shardIdx int, n store.Node) {
                                defer wg.Done()
                                log := logger.Get().With(
+                                       zap.String("cluster_name", c.clusterName),
                                        zap.String("id", n.ID()),
                                        zap.Bool("is_master", n.IsMaster()),
                                        zap.String("addr", n.Addr()),
diff --git a/store/cluster_node.go b/store/cluster_node.go
old mode 100644
new mode 100755
index 5882dff..8fcd106
--- a/store/cluster_node.go
+++ b/store/cluster_node.go
@@ -46,7 +46,7 @@ const (
        dialTimeout  = 3200 * time.Millisecond
        readTimeout  = 3 * time.Second
        writeTimeout = 3 * time.Second
-       minIdleConns = 3
+       minIdleConns = 10
 )
 
 var (

Reply via email to