This is an automated email from the ASF dual-hosted git repository.

zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git


The following commit(s) were added to refs/heads/master by this push:
     new 9a059931 [horus] Fix restart sequence logic (#416)
9a059931 is described below

commit 9a059931e1ed280b311b19a428fc6a822d3dc877
Author: mfordjody <[email protected]>
AuthorDate: Wed Oct 2 13:26:31 2024 +0800

    [horus] Fix restart sequence logic (#416)
---
 app/horus/core/horuser/node_restart.go | 17 +++++++++++------
 app/horus/core/horuser/restart.sh      |  4 ++--
 manifests/horus/horus.yaml             | 12 +++++++-----
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index fe7c8549..06e1c069 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -56,23 +56,28 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
        if err != nil {
                klog.Errorf("Drain node err:%v", err)
                klog.Infof("clusterName:%v nodeName:%v", node.ClusterName, node.NodeName)
-               return
-       } else {
-               klog.Infof("Drain node Success clusterName:%v nodeName:%v", node.ClusterName, node.NodeName)
        }
+       klog.Infof("Drain node Success clusterName:%v nodeName:%v", node.ClusterName, node.NodeName)
 
        pass, err := node.RestartMarker()
-       klog.Infof("RestartMarker result pass:%v err:%v", pass, err)
+       if err != nil {
+               klog.Errorf("Error getting RestartMarker for node %v: %v", node.NodeName, err)
+               return
+       }
+       klog.Infof("RestartMarker result pass:%v", pass)
 
        if pass {
                msg := fmt.Sprintf("\n【等待宕机节点腾空后重启】\n【节点:%v】\n【日期:%v】\n【集群:%v】\n", node.NodeName, node.FirstDate, node.ClusterName)
                alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
-               cmd := exec.Command("/bin/bash", "./restart.sh", node.NodeIP, h.cc.NodeDownTime.AllSystemUser, h.cc.NodeDownTime.AllSystemPassword)
+
+               cmd := exec.Command("/bin/bash", "core/horuser/restart.sh", node.NodeIP, h.cc.NodeDownTime.AllSystemUser, h.cc.NodeDownTime.AllSystemPassword)
                output, err := cmd.CombinedOutput()
                if err != nil {
                        klog.Errorf("Failed restart for Output: %v node %v: %v", string(output), node.NodeName, err)
                        return
                }
-               klog.Infof("Successfully restart for node %v. Output: %v", node.NodeName, string(output))
+               klog.Infof("Successfully restarted node %v. Output: %v", node.NodeName, string(output))
+       } else {
+               klog.Infof("RestartMarker did not pass for node %v", node.NodeName)
        }
 }
diff --git a/app/horus/core/horuser/restart.sh b/app/horus/core/horuser/restart.sh
index cf4766e1..19784a24 100644
--- a/app/horus/core/horuser/restart.sh
+++ b/app/horus/core/horuser/restart.sh
@@ -25,5 +25,5 @@ if [ $# -lt 3 ]; then
 fi
 
 for i in $host_computer; do
-    sshpass -p$host_pass ssh -o "StrictHostKeyChecking=no" "$host_name"@$i "echo $host_pass | sudo -S reboot"
-done
\ No newline at end of file
+    sshpass -p "$host_pass" ssh "$host_name"@$i -o "StrictHostKeyChecking=no" "echo $host_pass | sudo -S reboot"
+done
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index 033fd82d..836a92b5 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -35,7 +35,7 @@ kubeMultiple:
   cluster: config.1
 
 promMultiple:
-  cluster: http://192.168.15.128:31019
+  cluster: http://192.168.15.133:31173
 
 nodeRecovery:
   enabled: false
@@ -56,7 +56,7 @@ customModular:
     node_cpu: 1
   abnormalityQL:
     node_cpu: |-
-      100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 30
+      100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 15
   recoveryQL:
     node_cpu: |-
       100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle",node="%s"}[5m])) * 100) < 10
@@ -74,14 +74,16 @@ customModular:
 
 nodeDownTime:
   enabled: true
-  intervalSecond: 15
+  intervalSecond: 5
   promQueryTimeSecond: 60
   abnormalityQL:
-    - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 50
+    - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 13
 #    - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 < 20
 #    - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
-  nodeNameToIPs:
+  abnormalInfoSystemQL:
     node_os_info{node="%s"}
+  allSystemUser: "zxj"
+  AllSystemPassword: "1"
   kubeMultiple:
     cluster: config.1
   dingTalk:

Reply via email to