This is an automated email from the ASF dual-hosted git repository.
zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git
The following commit(s) were added to refs/heads/master by this push:
new def734bd [horus] Downtime function test results completed (#442)
def734bd is described below
commit def734bd7f9fd90cfc5f4b00c7a6767473949d8c
Author: mfordjody <[email protected]>
AuthorDate: Wed Oct 9 10:45:32 2024 +0800
[horus] Downtime function test results completed (#442)
---
app/horus/base/db/db.go | 2 +-
app/horus/cmd/main.go | 12 +++++++-----
app/horus/core/horuser/node_downtime.go | 1 +
app/horus/core/horuser/node_drain.go | 23 ++---------------------
app/horus/core/horuser/node_restart.go | 2 +-
manifests/horus/horus.yaml | 4 ++--
6 files changed, 14 insertions(+), 30 deletions(-)
diff --git a/app/horus/base/db/db.go b/app/horus/base/db/db.go
index 112fc656..d552ad0d 100644
--- a/app/horus/base/db/db.go
+++ b/app/horus/base/db/db.go
@@ -149,7 +149,7 @@ func GetRecoveryNodeDataInfoDate(day int) ([]NodeDataInfo, error) {
func GetRestartNodeDataInfoDate() ([]NodeDataInfo, error) {
var ndi []NodeDataInfo
- session := db.Where("restart = 0 and repair = 0 and module_name = ?", "node_down")
+ session := db.Where("restart = 0 and repair = 0 and module_name = ?", "nodeDown")
err := session.Find(&ndi)
return ndi, err
}
diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go
index cb15d52e..f67786a4 100644
--- a/app/horus/cmd/main.go
+++ b/app/horus/cmd/main.go
@@ -93,11 +93,13 @@ func main() {
return nil
})
group.Add(func() error {
- klog.Info("horus node recovery manager start success.")
- err := horus.RecoveryManager(ctx)
- if err != nil {
- klog.Errorf("horus node recovery manager start failed err:%v", err)
- return err
+ if c.CustomModular.Enabled {
+ klog.Info("horus node recovery manager start success.")
+ err := horus.RecoveryManager(ctx)
+ if err != nil {
+ klog.Errorf("horus node recovery manager start failed err:%v", err)
+ return err
+ }
}
return nil
})
diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go
index 6cb57404..68c96beb 100644
--- a/app/horus/core/horuser/node_downtime.go
+++ b/app/horus/core/horuser/node_downtime.go
@@ -119,6 +119,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
}
msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机临界点:%v】",
h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
+
newfound := 0
for nodeName, _ := range WithDownNodeIPs {
diff --git a/app/horus/core/horuser/node_drain.go b/app/horus/core/horuser/node_drain.go
index fd01562e..92434535 100644
--- a/app/horus/core/horuser/node_drain.go
+++ b/app/horus/core/horuser/node_drain.go
@@ -16,9 +16,7 @@
package horuser
import (
- "context"
"fmt"
- corev1 "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
)
@@ -54,33 +52,16 @@ func (h *Horuser) Drain(nodeName, clusterName string) (err error) {
break
}
}
- klog.Errorf("node Drain evict pod result items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
if ds {
continue
}
+ klog.Errorf("node Drain evict pod result items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
+
err = h.Evict(pods.Name, pods.Namespace, clusterName)
if err != nil {
klog.Errorf("node Drain evict pod err:%v items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", err, items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
return err
}
- err = h.Finalizer(clusterName, pods.Name, pods.Namespace)
- if err != nil {
- klog.Errorf("node Drain finalizer pod err:%v items:%d count:%v nodeName:%v\n clusterName:%v\n podName:%v\n podNamespace:%v\n", err, items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
- return err
- }
-
- var oldPod *corev1.Pod
- var _ = h.Terminating(clusterName, oldPod)
- newPod, _ := kubeClient.CoreV1().Pods(oldPod.Namespace).Get(context.Background(), oldPod.Name, v1.GetOptions{})
- if newPod == nil {
- return err
- }
- if newPod.UID != oldPod.UID {
- return err
- }
- if newPod.DeletionTimestamp.IsZero() {
- return err
- }
}
return nil
}
diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index d8aedbec..b5b08faa 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -69,7 +69,7 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
klog.Infof("RestartMarker result success:%v", success)
if success {
- msg := fmt.Sprintf("\n【等待宕机节点腾空后重启】\n【节点:%v】\n【日期:%v】\n【集群:%v】\n", node.NodeName, node.FirstDate, node.ClusterName)
+ msg := fmt.Sprintf("\n【宕机节点等待腾空后重启】\n【节点:%v】\n【日期:%v】\n【集群:%v】\n", node.NodeName, node.FirstDate, node.ClusterName)
alerter.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
cmd := exec.Command("/bin/bash", "core/horuser/restart.sh",
node.NodeIP, h.cc.NodeDownTime.AllSystemUser,
h.cc.NodeDownTime.AllSystemPassword)
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index 6a52d9a4..fdac8695 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -25,7 +25,7 @@ kubeMultiple:
cluster: config.1
promMultiple:
- cluster: http://192.168.15.134:31974
+ cluster: http://192.168.15.133:31974
nodeRecovery:
dayNumber: 1
@@ -64,7 +64,7 @@ customModular:
title: "自定义通知"
nodeDownTime:
- enabled: true
+ enabled: false
intervalSecond: 15
promQueryTimeSecond: 60
abnormalityQL: