This is an automated email from the ASF dual-hosted git repository.
zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git
The following commit(s) were added to refs/heads/master by this push:
new 94101443 [horus] Reconfiguring the restart logic (#382)
94101443 is described below
commit 94101443cd2de86018218f0e4ca1ec5fc57db452
Author: mfordjody <[email protected]>
AuthorDate: Thu Sep 26 19:00:57 2024 +0800
[horus] Reconfiguring the restart logic (#382)
---
app/horus/cmd/main.go | 2 +-
app/horus/core/horuser/node_drain.go | 22 +++++++++++++++++++++-
app/horus/core/horuser/node_restart.go | 16 +++++++++++-----
app/horus/core/horuser/pod_evict.go | 1 +
manifests/horus/horus.yaml | 4 ++--
5 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go
index 9d641063..0c20203e 100644
--- a/app/horus/cmd/main.go
+++ b/app/horus/cmd/main.go
@@ -37,7 +37,7 @@ var (
)
func main() {
- flag.StringVar(&configFile, "configFile", "../../deploy/horus/horus.yaml", "horus config file")
+ flag.StringVar(&configFile, "configFile", "../../manifests/horus/horus.yaml", "horus config file")
flag.StringVar(&address, "address", "0.0.0.0:38089", "horus address")
klog.InitFlags(flag.CommandLine)
flag.Parse()
diff --git a/app/horus/core/horuser/node_drain.go b/app/horus/core/horuser/node_drain.go
index f7fe8726..b60d07e4 100644
--- a/app/horus/core/horuser/node_drain.go
+++ b/app/horus/core/horuser/node_drain.go
@@ -16,7 +16,9 @@
package horuser
import (
+ "context"
"fmt"
+ corev1 "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
)
@@ -55,11 +57,29 @@ func (h *Horuser) Drain(nodeName, clusterName string) (err error) {
if ds {
continue
}
- err := h.Evict(pods.Name, pods.Namespace, clusterName)
+ err = h.Evict(pods.Name, pods.Namespace, clusterName)
if err != nil {
klog.Errorf("node Drain evict pod err:%v items:%d count:%v nodeName:%v clusterName:%v podName:%v podNamespace:%v", err, items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
return err
}
+ err = h.Finalizer(clusterName, pods.Name, pods.Namespace)
+ if err != nil {
+ klog.Errorf("node Drain finalizer pod err:%v items:%d count:%v nodeName:%v clusterName:%v podName:%v podNamespace:%v", err, items+1, count, nodeName, clusterName, pods.Name, pods.Namespace)
+ return err
+ }
+
+ var oldPod *corev1.Pod
+ var _ = h.Terminating(clusterName, oldPod)
+ newPod, _ := kubeClient.CoreV1().Pods(oldPod.Namespace).Get(context.Background(), oldPod.Name, v1.GetOptions{})
+ if newPod == nil {
+ return err
+ }
+ if newPod.UID != oldPod.UID {
+ return err
+ }
+ if newPod.DeletionTimestamp.IsZero() {
+ return err
+ }
}
return nil
}
diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index 3feef7b1..6340e7bc 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -54,15 +54,21 @@ func (h *Horuser) RestartOrRepair(ctx context.Context) {
func (h *Horuser) TryRestart(node db.NodeDataInfo) {
msg := fmt.Sprintf("\n【节点尝试重启】\n 节点:%v\n 日期:%v\n 集群:%v\n", node.NodeName, node.FirstDate, node.ClusterName)
- err := h.UnCordon(node.NodeName, node.ClusterName)
+ err := h.Drain(node.NodeName, node.ClusterName)
if err != nil {
- msg += fmt.Sprintf("\n【取消不可调度状态失败:%v】\n", err)
+ msg += fmt.Sprintf("\n【驱逐节点】\n")
alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
- return
- } else {
- klog.Infof("Node %v is already uncordoned.", node.NodeName)
}
+ //err := h.UnCordon(node.NodeName, node.ClusterName)
+ //if err != nil {
+ // msg += fmt.Sprintf("\n【取消不可调度状态失败:%v】\n", err)
+ // alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
+ // return
+ //} else {
+ // klog.Infof("Node %v is already uncordoned.", node.NodeName)
+ //}
+
err = syscall.Reboot(syscall.LINUX_REBOOT_CMD_RESTART)
if err != nil {
msg += fmt.Sprintf("\n【节点重启失败:%v】\n", err)
diff --git a/app/horus/core/horuser/pod_evict.go b/app/horus/core/horuser/pod_evict.go
index 8c0abb23..6b147bc2 100644
--- a/app/horus/core/horuser/pod_evict.go
+++ b/app/horus/core/horuser/pod_evict.go
@@ -49,4 +49,5 @@ func (h *Horuser) Evict(podName, podNamespace, clusterName string) (err error) {
}
klog.Infof("pod Evict delete success clusterName:%v podName:%v podNamespace:%v", clusterName, podName, podNamespace)
return nil
+
}
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index 58f36fa9..7d41766f 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -36,7 +36,7 @@ kubeMultiple:
cluster: config.1
promMultiple:
- cluster: http://192.168.15.128:31505
+ cluster: http://192.168.15.128:30177
nodeRecovery:
enabled: false
@@ -74,7 +74,7 @@ customModular:
webhookUrl:
"https://hooks.slack.com/services/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"
nodeDownTime:
- enabled: true
+ enabled: false
checkIntervalSecond: 5
promQueryTimeSecond: 60
abnormalityQL: