This is an automated email from the ASF dual-hosted git repository.

zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git


The following commit(s) were added to refs/heads/master by this push:
     new 94101443 [horus] Reconfiguring the restart logic (#382)
94101443 is described below

commit 94101443cd2de86018218f0e4ca1ec5fc57db452
Author: mfordjody <[email protected]>
AuthorDate: Thu Sep 26 19:00:57 2024 +0800

    [horus] Reconfiguring the restart logic (#382)
---
 app/horus/cmd/main.go                  |  2 +-
 app/horus/core/horuser/node_drain.go   | 22 +++++++++++++++++++++-
 app/horus/core/horuser/node_restart.go | 16 +++++++++++-----
 app/horus/core/horuser/pod_evict.go    |  1 +
 manifests/horus/horus.yaml             |  4 ++--
 5 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go
index 9d641063..0c20203e 100644
--- a/app/horus/cmd/main.go
+++ b/app/horus/cmd/main.go
@@ -37,7 +37,7 @@ var (
 )
 
 func main() {
-       flag.StringVar(&configFile, "configFile", 
"../../deploy/horus/horus.yaml", "horus config file")
+       flag.StringVar(&configFile, "configFile", 
"../../manifests/horus/horus.yaml", "horus config file")
        flag.StringVar(&address, "address", "0.0.0.0:38089", "horus address")
        klog.InitFlags(flag.CommandLine)
        flag.Parse()
diff --git a/app/horus/core/horuser/node_drain.go b/app/horus/core/horuser/node_drain.go
index f7fe8726..b60d07e4 100644
--- a/app/horus/core/horuser/node_drain.go
+++ b/app/horus/core/horuser/node_drain.go
@@ -16,7 +16,9 @@
 package horuser
 
 import (
+       "context"
        "fmt"
+       corev1 "k8s.io/api/core/v1"
        v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/klog/v2"
 )
@@ -55,11 +57,29 @@ func (h *Horuser) Drain(nodeName, clusterName string) (err error) {
                if ds {
                        continue
                }
-               err := h.Evict(pods.Name, pods.Namespace, clusterName)
+               err = h.Evict(pods.Name, pods.Namespace, clusterName)
                if err != nil {
                        klog.Errorf("node Drain evict pod err:%v items:%d 
count:%v nodeName:%v clusterName:%v podName:%v podNamespace:%v", err, items+1, 
count, nodeName, clusterName, pods.Name, pods.Namespace)
                        return err
                }
+               err = h.Finalizer(clusterName, pods.Name, pods.Namespace)
+               if err != nil {
+                       klog.Errorf("node Drain finalizer pod err:%v items:%d 
count:%v nodeName:%v clusterName:%v podName:%v podNamespace:%v", err, items+1, 
count, nodeName, clusterName, pods.Name, pods.Namespace)
+                       return err
+               }
+
+               var oldPod *corev1.Pod
+               var _ = h.Terminating(clusterName, oldPod)
+               newPod, _ := 
kubeClient.CoreV1().Pods(oldPod.Namespace).Get(context.Background(), 
oldPod.Name, v1.GetOptions{})
+               if newPod == nil {
+                       return err
+               }
+               if newPod.UID != oldPod.UID {
+                       return err
+               }
+               if newPod.DeletionTimestamp.IsZero() {
+                       return err
+               }
        }
        return nil
 }
diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index 3feef7b1..6340e7bc 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -54,15 +54,21 @@ func (h *Horuser) RestartOrRepair(ctx context.Context) {
 func (h *Horuser) TryRestart(node db.NodeDataInfo) {
        msg := fmt.Sprintf("\n【节点尝试重启】\n 节点:%v\n 日期:%v\n 集群:%v\n", 
node.NodeName, node.FirstDate, node.ClusterName)
 
-       err := h.UnCordon(node.NodeName, node.ClusterName)
+       err := h.Drain(node.NodeName, node.ClusterName)
        if err != nil {
-               msg += fmt.Sprintf("\n【取消不可调度状态失败:%v】\n", err)
+               msg += fmt.Sprintf("\n【驱逐节点】\n")
                alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
-               return
-       } else {
-               klog.Infof("Node %v is already uncordoned.", node.NodeName)
        }
 
+       //err := h.UnCordon(node.NodeName, node.ClusterName)
+       //if err != nil {
+       //      msg += fmt.Sprintf("\n【取消不可调度状态失败:%v】\n", err)
+       //      alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
+       //      return
+       //} else {
+       //      klog.Infof("Node %v is already uncordoned.", node.NodeName)
+       //}
+
        err = syscall.Reboot(syscall.LINUX_REBOOT_CMD_RESTART)
        if err != nil {
                msg += fmt.Sprintf("\n【节点重启失败:%v】\n", err)
diff --git a/app/horus/core/horuser/pod_evict.go b/app/horus/core/horuser/pod_evict.go
index 8c0abb23..6b147bc2 100644
--- a/app/horus/core/horuser/pod_evict.go
+++ b/app/horus/core/horuser/pod_evict.go
@@ -49,4 +49,5 @@ func (h *Horuser) Evict(podName, podNamespace, clusterName string) (err error) {
        }
        klog.Infof("pod Evict delete success clusterName:%v podName:%v 
podNamespace:%v", clusterName, podName, podNamespace)
        return nil
+
 }
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index 58f36fa9..7d41766f 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -36,7 +36,7 @@ kubeMultiple:
   cluster: config.1
 
 promMultiple:
-  cluster: http://192.168.15.128:31505
+  cluster: http://192.168.15.128:30177
 
 nodeRecovery:
   enabled: false
@@ -74,7 +74,7 @@ customModular:
     webhookUrl: "https://hooks.slack.com/services/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"
 
 nodeDownTime:
-  enabled: true
+  enabled: false
   checkIntervalSecond: 5
   promQueryTimeSecond: 60
   abnormalityQL:

Reply via email to