This is an automated email from the ASF dual-hosted git repository.

zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git


The following commit(s) were added to refs/heads/master by this push:
     new e7fe9683 [horus] fixed pod and node logic (#390)
e7fe9683 is described below

commit e7fe96834c500e58e8a5165d09934998b0e77580
Author: mfordjody <[email protected]>
AuthorDate: Fri Sep 27 15:33:48 2024 +0800

    [horus] fixed pod and node logic (#390)
---
 app/horus/cmd/main.go                   | 31 +++++++++++++++++++------------
 app/horus/core/horuser/node_downtime.go |  4 ++--
 app/horus/core/horuser/node_restart.go  |  7 ++++---
 app/horus/core/horuser/pod_abnormal.go  |  8 +++++---
 manifests/horus/horus.yaml              |  4 ++--
 5 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go
index f73b1b38..45032156 100644
--- a/app/horus/cmd/main.go
+++ b/app/horus/cmd/main.go
@@ -118,26 +118,33 @@ func main() {
                return nil
        })
        group.Add(func() error {
-               klog.Info("horus down time manager start success.")
-               err := horus.DownTimeManager(ctx)
-               if err != nil {
-               klog.Errorf("horus down time manager start failed error:%v", err)
+               if c.NodeDownTime.Enabled {
+                       klog.Info("horus down time manager start success.")
+                       err := horus.DownTimeManager(ctx)
+                       if err != nil {
+                               klog.Errorf("horus down time manager start failed error:%v", err)
+                       }
                }
                return nil
        })
        group.Add(func() error {
-               klog.Info("horus down time restart manager start success.")
-               err := horus.DowntimeRestartManager(ctx)
-               if err != nil {
-                       klog.Errorf("horus down time restart manager start failed error:%v", err)
+               if c.NodeDownTime.Enabled {
+                       klog.Info("horus down time restart manager start success.")
+                       err := horus.DowntimeRestartManager(ctx)
+                       if err != nil {
+                               klog.Errorf("horus down time restart manager start failed error:%v", err)
+                       }
                }
                return nil
+
        })
        group.Add(func() error {
-               klog.Info("horus pod abnormal clean manager start success.")
-               err := horus.PodAbnormalCleanManager(ctx)
-               if err != nil {
-                       klog.Errorf("horus pod abnormal clean manager start failed error:%v", err)
+               if c.PodAbnormal.Enabled {
+                       klog.Info("horus pod abnormal clean manager start success.")
+                       err := horus.PodAbnormalCleanManager(ctx)
+                       if err != nil {
+                               klog.Errorf("horus pod abnormal clean manager start failed error:%v", err)
+                       }
                }
                return nil
        })
diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go
index 199d7a92..9ffcaefc 100644
--- a/app/horus/core/horuser/node_downtime.go
+++ b/app/horus/core/horuser/node_downtime.go
@@ -28,7 +28,7 @@ import (
 
 const (
        NODE_DOWN        = "node_down"
-       NODE_DOWN_REASON = "node_down"
+       NODE_DOWN_REASON = "unknown"
 )
 
 func (h *Horuser) DownTimeManager(ctx context.Context) error {
@@ -129,7 +129,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
                }
                newfound++
                if newfound > 0 {
-                       klog.Infof("NodeDownTimeCheckOnCluster get toNodeNameips \n【集群:%v】\n 【总数:%v】\n 【细节:%v】\n", clusterName, len(nodeIP), nodeName)
+                       klog.Infof("DownTimeNodes get WithDownNodeIPs \n【集群:%v】\n 【节点:%v】\n【节点 IP 数:%v】\n", clusterName, nodeName, len(nodeIP))
                        alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, WithDownNodeIPsMsg)
                }
                WithDownNodeIPsMsg += fmt.Sprintf("node:%v ip:%v", nodeName, nodeIP)
diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index 507bf330..9ef064ce 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -52,9 +52,9 @@ func (h *Horuser) RestartOrRepair(ctx context.Context) {
 }
 
 func (h *Horuser) TryRestart(node db.NodeDataInfo) {
-       err := h.Drain(node.NodeName, node.ClusterName)
+       err := h.Drain(node.ClusterName, node.NodeName)
        if err != nil {
-               msg := fmt.Sprintf("\n【安全驱逐节点尝试重启就绪:%v】\n", err)
+               msg := fmt.Sprintf("\n【安全驱逐节点重启就绪:%v】\n", err)
                alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
                return
        }
@@ -69,7 +69,8 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
                klog.Infof("Node %v is already uncordoned.", node.NodeName)
        }
 
-       err = syscall.Reboot(syscall.LINUX_REBOOT_CMD_RESTART)
+       syscall.Reboot(syscall.LINUX_REBOOT_CMD_RESTART)
+
        if err != nil {
                msg += fmt.Sprintf("\n【节点重启失败:%v】\n", err)
        } else {
diff --git a/app/horus/core/horuser/pod_abnormal.go b/app/horus/core/horuser/pod_abnormal.go
index 6ad4015e..80ae9f0a 100644
--- a/app/horus/core/horuser/pod_abnormal.go
+++ b/app/horus/core/horuser/pod_abnormal.go
@@ -37,10 +37,12 @@ func (h *Horuser) PodAbnormalCleanManager(ctx context.Context) error {
 func (h *Horuser) PodAbnormalClean(ctx context.Context) {
        var wg sync.WaitGroup
        for cn := range h.cc.PodAbnormal.KubeMultiple {
+               cn := cn
                wg.Add(1)
-               go func(clusterName string) {
+               go func() {
                        defer wg.Done()
-               }(cn)
+                       h.PodsOnCluster(cn)
+               }()
        }
        wg.Wait()
 }
@@ -63,7 +65,7 @@ func (h *Horuser) PodsOnCluster(clusterName string) {
                if pod.Status.Phase == corev1.PodRunning || pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
                        continue
                }
-               msg := fmt.Sprintf("【集群:%v】【%d/%d】【Namespace:%v】【PodName:%v】【Phase:%v】【节点名:%v】", clusterName, index+1, count, pod.Namespace, pod.Name, pod.Status.Phase, pod.Spec.NodeName)
+               msg := fmt.Sprintf("\n【集群:%v】\n【%d/%d】\n【Namespace:%v】\n【PodName:%v】\n【Phase:%v】\n【节点名:%v】\n", clusterName, index+1, count, pod.Namespace, pod.Name, pod.Status.Phase, pod.Spec.NodeName)
                klog.Infof(msg)
 
                wp.Submit(func() {
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index 596954ab..3f5735e6 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -91,11 +91,11 @@ nodeDownTime:
       - 15000000
 
 podAbnormal:
-  enabled: false
+  enabled: true
   intervalSecond: 5
   doubleSecond: 10
   labelSelector: "app.kubernetes.io/name=horus"
-  fieldSelector: "status.phase != Running"
+  fieldSelector: "status.phase!=Running"
   kubeMultiple:
     cluster: config.1
   dingTalk:

Reply via email to