This is an automated email from the ASF dual-hosted git repository.

zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git


The following commit(s) were added to refs/heads/master by this push:
     new 61186491 [horus] Fixes and optimizations (#441)
61186491 is described below

commit 6118649162fbe2eb23f323d4baefa96bd59f3e4f
Author: mfordjody <[email protected]>
AuthorDate: Wed Oct 9 09:49:35 2024 +0800

    [horus] Fixes and optimizations (#441)
---
 app/horus/base/config/config.go         |  1 -
 app/horus/cmd/main.go                   | 27 +++++++++++++++------------
 app/horus/core/horuser/node_downtime.go |  4 ++--
 app/horus/core/horuser/node_restart.go  |  4 ++--
 manifests/horus/horus.yaml              | 11 +++++------
 5 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/app/horus/base/config/config.go b/app/horus/base/config/config.go
index 34d813a9..fdb38d1c 100644
--- a/app/horus/base/config/config.go
+++ b/app/horus/base/config/config.go
@@ -47,7 +47,6 @@ type SlackConfiguration struct {
 }
 
 type RecoveryConfiguration struct {
-       Enabled             bool                   `yaml:"enabled"`
        DayNumber           int                    `yaml:"dayNumber"`
        IntervalSecond      int                    `yaml:"intervalSecond"`
        PromQueryTimeSecond int64                  `yaml:"promQueryTimeSecond"`
diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go
index 8b3e4a76..cb15d52e 100644
--- a/app/horus/cmd/main.go
+++ b/app/horus/cmd/main.go
@@ -79,6 +79,7 @@ func main() {
                err := srv.ListenAndServe()
                if err != nil {
                        klog.Errorf("horus metrics err:%v", err)
+                       return err
                }
                return nil
        })
@@ -87,16 +88,16 @@ func main() {
                err := ticker.Manager(ctx)
                if err != nil {
                        klog.Errorf("horus ticker manager start failed err:%v", err)
+                       return err
                }
                return nil
        })
        group.Add(func() error {
-               if c.NodeRecovery.Enabled {
-                       klog.Info("horus node recovery manager start success.")
-                       err := horus.RecoveryManager(ctx)
-                       if err != nil {
-                               klog.Errorf("horus node recovery manager start failed err:%v", err)
-                       }
+               klog.Info("horus node recovery manager start success.")
+               err := horus.RecoveryManager(ctx)
+               if err != nil {
+                       klog.Errorf("horus node recovery manager start failed err:%v", err)
+                       return err
                }
                return nil
        })
@@ -106,6 +107,7 @@ func main() {
                        err := horus.CustomizeModularManager(ctx)
                        if err != nil {
                                klog.Errorf("horus node customize modular manager start failed err:%v", err)
+                               return err
                        }
                }
                return nil
@@ -116,17 +118,17 @@ func main() {
                        err := horus.DownTimeManager(ctx)
                        if err != nil {
                                klog.Errorf("horus node downtime manager start failed err:%v", err)
+                               return err
                        }
                }
                return nil
        })
        group.Add(func() error {
-               if c.NodeDownTime.Enabled {
-                       klog.Info("horus node downtime restart manager start success.")
-                       err := horus.DowntimeRestartManager(ctx)
-                       if err != nil {
-                               klog.Errorf("horus node downtime restart manager start failed err:%v", err)
-                       }
+               klog.Info("horus node downtime restart manager start success.")
+               err := horus.DowntimeRestartManager(ctx)
+               if err != nil {
+                       klog.Errorf("horus node downtime restart manager start failed err:%v", err)
+                       return err
                }
                return nil
        })
@@ -136,6 +138,7 @@ func main() {
                        err := horus.PodStagnationCleanManager(ctx)
                        if err != nil {
                                klog.Errorf("horus pod stagnation clean manager start failed err:%v", err)
+                               return err
                        }
                }
                return nil
diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go
index 51b66a4c..6cb57404 100644
--- a/app/horus/core/horuser/node_downtime.go
+++ b/app/horus/core/horuser/node_downtime.go
@@ -28,7 +28,7 @@ import (
 )
 
 const (
-       NODE_DOWN        = "node_down"
+       NODE_DOWN        = "nodeDown"
        NODE_DOWN_REASON = "downtime"
 )
 
@@ -118,7 +118,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
                WithDownNodeIPs[node] = str
        }
 
-       msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机标准:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
+       msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机临界点:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
        newfound := 0
 
        for nodeName, _ := range WithDownNodeIPs {
diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index c331fe0e..d8aedbec 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -83,8 +83,8 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
                klog.Infof("RestartMarker did not success for node %v", node.NodeName)
        }
 
-       if node.Restart < 2 {
-               klog.Info("It's been rebooted once.")
+       if node.Restart > 2 {
+               klog.Error("It's been rebooted once.")
                return
        }
 }
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index ba1218bf..6a52d9a4 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -28,7 +28,6 @@ promMultiple:
   cluster: http://192.168.15.134:31974
 
 nodeRecovery:
-  enabled: false
   dayNumber: 1
   intervalSecond: 15
   promQueryTimeSecond: 60
@@ -65,12 +64,12 @@ customModular:
     title: "自定义通知"
 
 nodeDownTime:
-  enabled: false
-  intervalSecond: 5
+  enabled: true
+  intervalSecond: 15
   promQueryTimeSecond: 60
   abnormalityQL:
-    - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 13
-#    - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 < 20
+    - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 20
+    - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 > 25
 #    - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
   abnormalInfoSystemQL:
     node_os_info{node="%s"}
@@ -88,7 +87,7 @@ nodeDownTime:
     title: "自定义通知"
 
 podStagnationCleaner:
-  enabled: true
+  enabled: false
   intervalSecond: 15
   doubleSecond: 60
   fieldSelector: "status.phase!=Running"

Reply via email to