This is an automated email from the ASF dual-hosted git repository.
zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git
The following commit(s) were added to refs/heads/master by this push:
new e7fe9683 [horus] fixed pod and node logic (#390)
e7fe9683 is described below
commit e7fe96834c500e58e8a5165d09934998b0e77580
Author: mfordjody <[email protected]>
AuthorDate: Fri Sep 27 15:33:48 2024 +0800
[horus] fixed pod and node logic (#390)
---
app/horus/cmd/main.go | 31 +++++++++++++++++++------------
app/horus/core/horuser/node_downtime.go | 4 ++--
app/horus/core/horuser/node_restart.go | 7 ++++---
app/horus/core/horuser/pod_abnormal.go | 8 +++++---
manifests/horus/horus.yaml | 4 ++--
5 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go
index f73b1b38..45032156 100644
--- a/app/horus/cmd/main.go
+++ b/app/horus/cmd/main.go
@@ -118,26 +118,33 @@ func main() {
return nil
})
group.Add(func() error {
- klog.Info("horus down time manager start success.")
- err := horus.DownTimeManager(ctx)
- if err != nil {
- klog.Errorf("horus down time manager start failed error:%v", err)
+ if c.NodeDownTime.Enabled {
+ klog.Info("horus down time manager start success.")
+ err := horus.DownTimeManager(ctx)
+ if err != nil {
+ klog.Errorf("horus down time manager start failed error:%v", err)
+ }
}
return nil
})
group.Add(func() error {
- klog.Info("horus down time restart manager start success.")
- err := horus.DowntimeRestartManager(ctx)
- if err != nil {
- klog.Errorf("horus down time restart manager start failed error:%v", err)
+ if c.NodeDownTime.Enabled {
+ klog.Info("horus down time restart manager start success.")
+ err := horus.DowntimeRestartManager(ctx)
+ if err != nil {
+ klog.Errorf("horus down time restart manager start failed error:%v", err)
+ }
}
return nil
+
})
group.Add(func() error {
- klog.Info("horus pod abnormal clean manager start success.")
- err := horus.PodAbnormalCleanManager(ctx)
- if err != nil {
- klog.Errorf("horus pod abnormal clean manager start failed error:%v", err)
+ if c.PodAbnormal.Enabled {
+ klog.Info("horus pod abnormal clean manager start success.")
+ err := horus.PodAbnormalCleanManager(ctx)
+ if err != nil {
+ klog.Errorf("horus pod abnormal clean manager start failed error:%v", err)
+ }
}
return nil
})
diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go
index 199d7a92..9ffcaefc 100644
--- a/app/horus/core/horuser/node_downtime.go
+++ b/app/horus/core/horuser/node_downtime.go
@@ -28,7 +28,7 @@ import (
const (
NODE_DOWN = "node_down"
- NODE_DOWN_REASON = "node_down"
+ NODE_DOWN_REASON = "unknown"
)
func (h *Horuser) DownTimeManager(ctx context.Context) error {
@@ -129,7 +129,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
}
newfound++
if newfound > 0 {
- klog.Infof("NodeDownTimeCheckOnCluster get toNodeNameips \n【集群:%v】\n 【总数:%v】\n 【细节:%v】\n", clusterName, len(nodeIP), nodeName)
+ klog.Infof("DownTimeNodes get WithDownNodeIPs \n【集群:%v】\n 【节点:%v】\n【节点 IP 数:%v】\n", clusterName, nodeName, len(nodeIP))
alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, WithDownNodeIPsMsg)
}
WithDownNodeIPsMsg += fmt.Sprintf("node:%v ip:%v", nodeName, nodeIP)
diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index 507bf330..9ef064ce 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -52,9 +52,9 @@ func (h *Horuser) RestartOrRepair(ctx context.Context) {
}
func (h *Horuser) TryRestart(node db.NodeDataInfo) {
- err := h.Drain(node.NodeName, node.ClusterName)
+ err := h.Drain(node.ClusterName, node.NodeName)
if err != nil {
- msg := fmt.Sprintf("\n【安全驱逐节点尝试重启就绪:%v】\n", err)
+ msg := fmt.Sprintf("\n【安全驱逐节点重启就绪:%v】\n", err)
alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
return
}
@@ -69,7 +69,8 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
klog.Infof("Node %v is already uncordoned.", node.NodeName)
}
- err = syscall.Reboot(syscall.LINUX_REBOOT_CMD_RESTART)
+ syscall.Reboot(syscall.LINUX_REBOOT_CMD_RESTART)
+
if err != nil {
msg += fmt.Sprintf("\n【节点重启失败:%v】\n", err)
} else {
diff --git a/app/horus/core/horuser/pod_abnormal.go b/app/horus/core/horuser/pod_abnormal.go
index 6ad4015e..80ae9f0a 100644
--- a/app/horus/core/horuser/pod_abnormal.go
+++ b/app/horus/core/horuser/pod_abnormal.go
@@ -37,10 +37,12 @@ func (h *Horuser) PodAbnormalCleanManager(ctx context.Context) error {
func (h *Horuser) PodAbnormalClean(ctx context.Context) {
var wg sync.WaitGroup
for cn := range h.cc.PodAbnormal.KubeMultiple {
+ cn := cn
wg.Add(1)
- go func(clusterName string) {
+ go func() {
defer wg.Done()
- }(cn)
+ h.PodsOnCluster(cn)
+ }()
}
wg.Wait()
}
@@ -63,7 +65,7 @@ func (h *Horuser) PodsOnCluster(clusterName string) {
if pod.Status.Phase == corev1.PodRunning || pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
continue
}
- msg := fmt.Sprintf("【集群:%v】【%d/%d】【Namespace:%v】【PodName:%v】【Phase:%v】【节点名:%v】", clusterName, index+1, count, pod.Namespace, pod.Name, pod.Status.Phase, pod.Spec.NodeName)
+ msg := fmt.Sprintf("\n【集群:%v】\n【%d/%d】\n【Namespace:%v】\n【PodName:%v】\n【Phase:%v】\n【节点名:%v】\n", clusterName, index+1, count, pod.Namespace, pod.Name, pod.Status.Phase, pod.Spec.NodeName)
klog.Infof(msg)
wp.Submit(func() {
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index 596954ab..3f5735e6 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -91,11 +91,11 @@ nodeDownTime:
- 15000000
podAbnormal:
- enabled: false
+ enabled: true
intervalSecond: 5
doubleSecond: 10
labelSelector: "app.kubernetes.io/name=horus"
- fieldSelector: "status.phase != Running"
+ fieldSelector: "status.phase!=Running"
kubeMultiple:
cluster: config.1
dingTalk: