This is an automated email from the ASF dual-hosted git repository.
zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git
The following commit(s) were added to refs/heads/master by this push:
new 61186491 [horus] Fixes and optimizations (#441)
61186491 is described below
commit 6118649162fbe2eb23f323d4baefa96bd59f3e4f
Author: mfordjody <[email protected]>
AuthorDate: Wed Oct 9 09:49:35 2024 +0800
[horus] Fixes and optimizations (#441)
---
app/horus/base/config/config.go | 1 -
app/horus/cmd/main.go | 27 +++++++++++++++------------
app/horus/core/horuser/node_downtime.go | 4 ++--
app/horus/core/horuser/node_restart.go | 4 ++--
manifests/horus/horus.yaml | 11 +++++------
5 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/app/horus/base/config/config.go b/app/horus/base/config/config.go
index 34d813a9..fdb38d1c 100644
--- a/app/horus/base/config/config.go
+++ b/app/horus/base/config/config.go
@@ -47,7 +47,6 @@ type SlackConfiguration struct {
}
type RecoveryConfiguration struct {
- Enabled bool `yaml:"enabled"`
DayNumber int `yaml:"dayNumber"`
IntervalSecond int `yaml:"intervalSecond"`
PromQueryTimeSecond int64 `yaml:"promQueryTimeSecond"`
diff --git a/app/horus/cmd/main.go b/app/horus/cmd/main.go
index 8b3e4a76..cb15d52e 100644
--- a/app/horus/cmd/main.go
+++ b/app/horus/cmd/main.go
@@ -79,6 +79,7 @@ func main() {
err := srv.ListenAndServe()
if err != nil {
klog.Errorf("horus metrics err:%v", err)
+ return err
}
return nil
})
@@ -87,16 +88,16 @@ func main() {
err := ticker.Manager(ctx)
if err != nil {
klog.Errorf("horus ticker manager start failed err:%v", err)
+ return err
}
return nil
})
group.Add(func() error {
- if c.NodeRecovery.Enabled {
- klog.Info("horus node recovery manager start success.")
- err := horus.RecoveryManager(ctx)
- if err != nil {
- klog.Errorf("horus node recovery manager start failed err:%v", err)
- }
+ klog.Info("horus node recovery manager start success.")
+ err := horus.RecoveryManager(ctx)
+ if err != nil {
+ klog.Errorf("horus node recovery manager start failed err:%v", err)
+ return err
}
return nil
})
@@ -106,6 +107,7 @@ func main() {
err := horus.CustomizeModularManager(ctx)
if err != nil {
klog.Errorf("horus node customize modular manager start failed err:%v", err)
+ return err
}
}
return nil
@@ -116,17 +118,17 @@ func main() {
err := horus.DownTimeManager(ctx)
if err != nil {
klog.Errorf("horus node downtime manager start failed err:%v", err)
+ return err
}
}
return nil
})
group.Add(func() error {
- if c.NodeDownTime.Enabled {
- klog.Info("horus node downtime restart manager start success.")
- err := horus.DowntimeRestartManager(ctx)
- if err != nil {
- klog.Errorf("horus node downtime restart manager start failed err:%v", err)
- }
+ klog.Info("horus node downtime restart manager start success.")
+ err := horus.DowntimeRestartManager(ctx)
+ if err != nil {
+ klog.Errorf("horus node downtime restart manager start failed err:%v", err)
+ return err
}
return nil
})
@@ -136,6 +138,7 @@ func main() {
err := horus.PodStagnationCleanManager(ctx)
if err != nil {
klog.Errorf("horus pod stagnation clean manager start failed err:%v", err)
+ return err
}
}
return nil
diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go
index 51b66a4c..6cb57404 100644
--- a/app/horus/core/horuser/node_downtime.go
+++ b/app/horus/core/horuser/node_downtime.go
@@ -28,7 +28,7 @@ import (
)
const (
- NODE_DOWN = "node_down"
+ NODE_DOWN = "nodeDown"
NODE_DOWN_REASON = "downtime"
)
@@ -118,7 +118,7 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
WithDownNodeIPs[node] = str
}
- msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机标准:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
+ msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机临界点:%v】", h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
newfound := 0
for nodeName, _ := range WithDownNodeIPs {
diff --git a/app/horus/core/horuser/node_restart.go b/app/horus/core/horuser/node_restart.go
index c331fe0e..d8aedbec 100644
--- a/app/horus/core/horuser/node_restart.go
+++ b/app/horus/core/horuser/node_restart.go
@@ -83,8 +83,8 @@ func (h *Horuser) TryRestart(node db.NodeDataInfo) {
klog.Infof("RestartMarker did not success for node %v", node.NodeName)
}
- if node.Restart < 2 {
- klog.Info("It's been rebooted once.")
+ if node.Restart > 2 {
+ klog.Error("It's been rebooted once.")
return
}
}
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index ba1218bf..6a52d9a4 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -28,7 +28,6 @@ promMultiple:
cluster: http://192.168.15.134:31974
nodeRecovery:
- enabled: false
dayNumber: 1
intervalSecond: 15
promQueryTimeSecond: 60
@@ -65,12 +64,12 @@ customModular:
title: "自定义通知"
nodeDownTime:
- enabled: false
- intervalSecond: 5
+ enabled: true
+ intervalSecond: 15
promQueryTimeSecond: 60
abnormalityQL:
- - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 13
-# - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 < 20
+ - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 20
+ - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes )) * 100 > 25
# - node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
abnormalInfoSystemQL:
node_os_info{node="%s"}
@@ -88,7 +87,7 @@ nodeDownTime:
title: "自定义通知"
podStagnationCleaner:
- enabled: true
+ enabled: false
intervalSecond: 15
doubleSecond: 60
fieldSelector: "status.phase!=Running"