This is an automated email from the ASF dual-hosted git repository.
zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git
The following commit(s) were added to refs/heads/master by this push:
new c73df09e [horus] Fix node address domain conversion (#411)
c73df09e is described below
commit c73df09e90e67ae0420588292bf3e9258c38da9a
Author: mfordjody <[email protected]>
AuthorDate: Tue Oct 1 14:37:23 2024 +0800
[horus] Fix node address domain conversion (#411)
---
app/horus/core/horuser/node_downtime.go | 36 +++++++++++++++++++++-----
app/horus/core/horuser/{script => }/restart.sh | 2 +-
manifests/horus/horus.yaml | 14 +++++-----
3 files changed, 38 insertions(+), 14 deletions(-)
diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go
index 2f436a01..3c028786 100644
--- a/app/horus/core/horuser/node_downtime.go
+++ b/app/horus/core/horuser/node_downtime.go
@@ -20,6 +20,7 @@ import (
"fmt"
"github.com/apache/dubbo-kubernetes/app/horus/basic/db"
"github.com/apache/dubbo-kubernetes/app/horus/core/alert"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog"
"sync"
@@ -28,7 +29,7 @@ import (
const (
NODE_DOWN = "node_down"
- NODE_DOWN_REASON = "down_time"
+ NODE_DOWN_REASON = "downtime"
)
func (h *Horuser) DownTimeManager(ctx context.Context) error {
@@ -58,6 +59,14 @@ func (h *Horuser) DownTimeCheck(ctx context.Context) {
}
func (h *Horuser) DownTimeNodes(clusterName, addr string) {
+ kubeClient := h.kubeClientMap[clusterName]
+ if kubeClient == nil {
+ klog.Errorf("DownTimeNodes kubeClient by clusterName empty.")
+ return
+ }
+ ctxFirst, cancelFirst := h.GetK8sContext()
+ defer cancelFirst()
+
klog.Infof("DownTimeNodes Query Start clusterName:%v", clusterName)
nodeDownTimeRes := make(map[string]int)
cq := len(h.cc.NodeDownTime.AbnormalityQL)
@@ -107,9 +116,10 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
WithDownNodeIPs[node] = str
}
- WithDownNodeIPsMsg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机标准:%v】",
h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
+ msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机标准:%v】",
h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
newfound := 0
- for nodeName, nodeIP := range WithDownNodeIPs {
+
+ for nodeName, _ := range WithDownNodeIPs {
today := time.Now().Format("2006-01-02")
err := h.Cordon(nodeName, clusterName, NODE_DOWN)
if err != nil {
@@ -117,6 +127,20 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
klog.Infof("clusterName:%v nodeName:%v", clusterName,
nodeName)
return
}
+ node, err := kubeClient.CoreV1().Nodes().Get(ctxFirst,
nodeName, metav1.GetOptions{})
+ if err != nil {
+ klog.Errorf("node Cordon get err nodeName:%v
clusterName:%v", nodeName, clusterName)
+ }
+ nodeIP, err := func() (string, error) {
+ for _, address := range node.Status.Addresses {
+ if address.Type == "InternalIP" {
+
+ return address.Address, nil
+ }
+ }
+ return "", nil
+ }()
+
write := db.NodeDataInfo{
NodeName: nodeName,
NodeIP: nodeIP,
@@ -129,10 +153,10 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
}
newfound++
if newfound > 0 {
- klog.Infof("DownTimeNodes get WithDownNodeIPs
\n【集群:%v】\n 【节点:%v】\n【节点 IP 数:%v】\n", clusterName, nodeName, len(nodeIP))
- alert.DingTalkSend(h.cc.NodeDownTime.DingTalk,
WithDownNodeIPsMsg)
+ klog.Infof("DownTimeNodes get WithDownNodeIPs
\n【集群:%v】\n 【节点:%v】\n【节点数:%v】\n", clusterName, nodeName, len(nodeIP))
+ alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
}
- WithDownNodeIPsMsg += fmt.Sprintf("node:%v ip:%v", nodeName,
nodeIP)
+ msg += fmt.Sprintf("node:%v ip:%v", nodeName, nodeIP)
write.Reason = NODE_DOWN_REASON
write.FirstDate = today
_, err = write.Add()
diff --git a/app/horus/core/horuser/script/restart.sh b/app/horus/core/horuser/restart.sh
similarity index 89%
rename from app/horus/core/horuser/script/restart.sh
rename to app/horus/core/horuser/restart.sh
index b4585969..cf4766e1 100644
--- a/app/horus/core/horuser/script/restart.sh
+++ b/app/horus/core/horuser/restart.sh
@@ -25,5 +25,5 @@ if [ $# -lt 3 ]; then
fi
for i in $host_computer; do
- sshpass -p$host_pass ssh -o "StrictHostKeyChecking=no" $host_name@$i "echo
$host_pass | sudo -S reboot"
+ sshpass -p$host_pass ssh -o "StrictHostKeyChecking=no" "$host_name"@$i
"echo $host_pass | sudo -S reboot"
done
\ No newline at end of file
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index a6f6d0e2..033fd82d 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -35,10 +35,10 @@ kubeMultiple:
cluster: config.1
promMultiple:
- cluster: http://192.168.15.128:30937
+ cluster: http://192.168.15.128:31019
nodeRecovery:
- enabled: true
+ enabled: false
dayNumber: 1
intervalSecond: 15
promQueryTimeSecond: 60
@@ -51,7 +51,7 @@ nodeRecovery:
webhookUrl:
"https://hooks.slack.com/services/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"
customModular:
- enabled: true
+ enabled: false
cordonDailyLimit:
node_cpu: 1
abnormalityQL:
@@ -73,13 +73,13 @@ customModular:
webhookUrl:
"https://hooks.slack.com/services/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"
nodeDownTime:
- enabled: false
+ enabled: true
intervalSecond: 15
promQueryTimeSecond: 60
abnormalityQL:
- - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) *
100) > 80
- - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes
)) * 100 < 10
- - node_filesystem_avail_bytes{mountpoint="/"} /
node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
+ - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) *
100) > 50
+# - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes
)) * 100 < 20
+# - node_filesystem_avail_bytes{mountpoint="/"} /
node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
nodeNameToIPs:
node_os_info{node="%s"}
kubeMultiple: