This is an automated email from the ASF dual-hosted git repository.
zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git
The following commit(s) were added to refs/heads/master by this push:
new 9a059931 [horus] Fix restart sequence logic (#416)
9a059931 is described below
commit 9a059931e1ed280b311b19a428fc6a822d3dc877
Author: mfordjody <[email protected]>
AuthorDate: Wed Oct 2 13:26:31 2024 +0800
[horus] Fix restart sequence logic (#416)
---
app/horus/core/horuser/node_restart.go | 17 +++++++++++------
app/horus/core/horuser/restart.sh | 4 ++--
manifests/horus/horus.yaml | 12 +++++++-----
3 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index fe7c8549..06e1c069 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -56,23 +56,28 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
if err != nil {
klog.Errorf("Drain node err:%v", err)
klog.Infof("clusterName:%v nodeName:%v", node.ClusterName,
node.NodeName)
- return
- } else {
- klog.Infof("Drain node Success clusterName:%v nodeName:%v",
node.ClusterName, node.NodeName)
}
+ klog.Infof("Drain node Success clusterName:%v nodeName:%v",
node.ClusterName, node.NodeName)
pass, err := node.RestartMarker()
- klog.Infof("RestartMarker result pass:%v err:%v", pass, err)
+ if err != nil {
+ klog.Errorf("Error getting RestartMarker for node %v: %v",
node.NodeName, err)
+ return
+ }
+ klog.Infof("RestartMarker result pass:%v", pass)
if pass {
msg :=
fmt.Sprintf("\n【等待宕机节点腾空后重启】\n【节点:%v】\n【日期:%v】\n【集群:%v】\n", node.NodeName,
node.FirstDate, node.ClusterName)
alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
- cmd := exec.Command("/bin/bash", "./restart.sh", node.NodeIP,
h.cc.NodeDownTime.AllSystemUser, h.cc.NodeDownTime.AllSystemPassword)
+
+ cmd := exec.Command("/bin/bash", "core/horuser/restart.sh",
node.NodeIP, h.cc.NodeDownTime.AllSystemUser,
h.cc.NodeDownTime.AllSystemPassword)
output, err := cmd.CombinedOutput()
if err != nil {
klog.Errorf("Failed restart for Output: %v node %v: %v", string(output), node.NodeName, err)
return
}
- klog.Infof("Successfully restart for node %v. Output: %v",
node.NodeName, string(output))
+ klog.Infof("Successfully restarted node %v. Output: %v",
node.NodeName, string(output))
+ } else {
+ klog.Infof("RestartMarker did not pass for node %v",
node.NodeName)
}
}
diff --git a/app/horus/core/horuser/restart.sh b/app/horus/core/horuser/restart.sh
index cf4766e1..19784a24 100644
--- a/app/horus/core/horuser/restart.sh
+++ b/app/horus/core/horuser/restart.sh
@@ -25,5 +25,5 @@ if [ $# -lt 3 ]; then
fi
for i in $host_computer; do
- sshpass -p$host_pass ssh -o "StrictHostKeyChecking=no" "$host_name"@$i "echo $host_pass | sudo -S reboot"
-done
\ No newline at end of file
+ sshpass -p "$host_pass" ssh "$host_name"@$i -o "StrictHostKeyChecking=no" "echo $host_pass | sudo -S reboot"
+done
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index 033fd82d..836a92b5 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -35,7 +35,7 @@ kubeMultiple:
cluster: config.1
promMultiple:
- cluster: http://192.168.15.128:31019
+ cluster: http://192.168.15.133:31173
nodeRecovery:
enabled: false
@@ -56,7 +56,7 @@ customModular:
node_cpu: 1
abnormalityQL:
node_cpu: |-
- 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) *
100) > 30
+ 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) *
100) > 15
recoveryQL:
node_cpu: |-
100 - (avg by (node)
(rate(node_cpu_seconds_total{mode="idle",node="%s"}[5m])) * 100) < 10
@@ -74,14 +74,16 @@ customModular:
nodeDownTime:
enabled: true
- intervalSecond: 15
+ intervalSecond: 5
promQueryTimeSecond: 60
abnormalityQL:
- - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) *
100) > 50
+ - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) *
100) > 13
# - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes
)) * 100 < 20
# - node_filesystem_avail_bytes{mountpoint="/"} /
node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
- nodeNameToIPs:
+ abnormalInfoSystemQL:
node_os_info{node="%s"}
+ allSystemUser: "zxj"
+ AllSystemPassword: "1"
kubeMultiple:
cluster: config.1
dingTalk: