This is an automated email from the ASF dual-hosted git repository.

zhongxjian pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/dubbo-kubernetes.git


The following commit(s) were added to refs/heads/master by this push:
     new c73df09e [horus] Fix node address domain conversion (#411)
c73df09e is described below

commit c73df09e90e67ae0420588292bf3e9258c38da9a
Author: mfordjody <[email protected]>
AuthorDate: Tue Oct 1 14:37:23 2024 +0800

    [horus] Fix node address domain conversion (#411)
---
 app/horus/core/horuser/node_downtime.go        | 36 +++++++++++++++++++++-----
 app/horus/core/horuser/{script => }/restart.sh |  2 +-
 manifests/horus/horus.yaml                     | 14 +++++-----
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/app/horus/core/horuser/node_downtime.go b/app/horus/core/horuser/node_downtime.go
index 2f436a01..3c028786 100644
--- a/app/horus/core/horuser/node_downtime.go
+++ b/app/horus/core/horuser/node_downtime.go
@@ -20,6 +20,7 @@ import (
        "fmt"
        "github.com/apache/dubbo-kubernetes/app/horus/basic/db"
        "github.com/apache/dubbo-kubernetes/app/horus/core/alert"
+       metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/apimachinery/pkg/util/wait"
        "k8s.io/klog"
        "sync"
@@ -28,7 +29,7 @@ import (
 
 const (
        NODE_DOWN        = "node_down"
-       NODE_DOWN_REASON = "down_time"
+       NODE_DOWN_REASON = "downtime"
 )
 
 func (h *Horuser) DownTimeManager(ctx context.Context) error {
@@ -58,6 +59,14 @@ func (h *Horuser) DownTimeCheck(ctx context.Context) {
 }
 
 func (h *Horuser) DownTimeNodes(clusterName, addr string) {
+       kubeClient := h.kubeClientMap[clusterName]
+       if kubeClient == nil {
+               klog.Errorf("DownTimeNodes kubeClient by clusterName empty.")
+               return
+       }
+       ctxFirst, cancelFirst := h.GetK8sContext()
+       defer cancelFirst()
+
        klog.Infof("DownTimeNodes Query Start clusterName:%v", clusterName)
        nodeDownTimeRes := make(map[string]int)
        cq := len(h.cc.NodeDownTime.AbnormalityQL)
@@ -107,9 +116,10 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
                WithDownNodeIPs[node] = str
        }
 
-       WithDownNodeIPsMsg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机标准:%v】", 
h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
+       msg := fmt.Sprintf("\n【%s】\n【集群:%v】\n【已达到宕机标准:%v】", 
h.cc.NodeDownTime.DingTalk.Title, clusterName, len(WithDownNodeIPs))
        newfound := 0
-       for nodeName, nodeIP := range WithDownNodeIPs {
+
+       for nodeName, _ := range WithDownNodeIPs {
                today := time.Now().Format("2006-01-02")
                err := h.Cordon(nodeName, clusterName, NODE_DOWN)
                if err != nil {
@@ -117,6 +127,20 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
                        klog.Infof("clusterName:%v nodeName:%v", clusterName, 
nodeName)
                        return
                }
+               node, err := kubeClient.CoreV1().Nodes().Get(ctxFirst, 
nodeName, metav1.GetOptions{})
+               if err != nil {
+                       klog.Errorf("node Cordon get err nodeName:%v 
clusterName:%v", nodeName, clusterName)
+               }
+               nodeIP, err := func() (string, error) {
+                       for _, address := range node.Status.Addresses {
+                               if address.Type == "InternalIP" {
+
+                                       return address.Address, nil
+                               }
+                       }
+                       return "", nil
+               }()
+
                write := db.NodeDataInfo{
                        NodeName:    nodeName,
                        NodeIP:      nodeIP,
@@ -129,10 +153,10 @@ func (h *Horuser) DownTimeNodes(clusterName, addr string) {
                }
                newfound++
                if newfound > 0 {
-                       klog.Infof("DownTimeNodes get WithDownNodeIPs 
\n【集群:%v】\n 【节点:%v】\n【节点 IP 数:%v】\n", clusterName, nodeName, len(nodeIP))
-                       alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, 
WithDownNodeIPsMsg)
+                       klog.Infof("DownTimeNodes get WithDownNodeIPs 
\n【集群:%v】\n 【节点:%v】\n【节点数:%v】\n", clusterName, nodeName, len(nodeIP))
+                       alert.DingTalkSend(h.cc.NodeDownTime.DingTalk, msg)
                }
-               WithDownNodeIPsMsg += fmt.Sprintf("node:%v ip:%v", nodeName, 
nodeIP)
+               msg += fmt.Sprintf("node:%v ip:%v", nodeName, nodeIP)
                write.Reason = NODE_DOWN_REASON
                write.FirstDate = today
                _, err = write.Add()
diff --git a/app/horus/core/horuser/script/restart.sh b/app/horus/core/horuser/restart.sh
similarity index 89%
rename from app/horus/core/horuser/script/restart.sh
rename to app/horus/core/horuser/restart.sh
index b4585969..cf4766e1 100644
--- a/app/horus/core/horuser/script/restart.sh
+++ b/app/horus/core/horuser/restart.sh
@@ -25,5 +25,5 @@ if [ $# -lt 3 ]; then
 fi
 
 for i in $host_computer; do
-    sshpass -p$host_pass ssh -o "StrictHostKeyChecking=no" $host_name@$i "echo 
$host_pass | sudo -S reboot"
+    sshpass -p$host_pass ssh -o "StrictHostKeyChecking=no" "$host_name"@$i 
"echo $host_pass | sudo -S reboot"
 done
\ No newline at end of file
diff --git a/manifests/horus/horus.yaml b/manifests/horus/horus.yaml
index a6f6d0e2..033fd82d 100644
--- a/manifests/horus/horus.yaml
+++ b/manifests/horus/horus.yaml
@@ -35,10 +35,10 @@ kubeMultiple:
   cluster: config.1
 
 promMultiple:
-  cluster: http://192.168.15.128:30937
+  cluster: http://192.168.15.128:31019
 
 nodeRecovery:
-  enabled: true
+  enabled: false
   dayNumber: 1
   intervalSecond: 15
   promQueryTimeSecond: 60
@@ -51,7 +51,7 @@ nodeRecovery:
     webhookUrl: 
"https://hooks.slack.com/services/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";
 
 customModular:
-  enabled: true
+  enabled: false
   cordonDailyLimit:
     node_cpu: 1
   abnormalityQL:
@@ -73,13 +73,13 @@ customModular:
     webhookUrl: 
"https://hooks.slack.com/services/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";
 
 nodeDownTime:
-  enabled: false
+  enabled: true
   intervalSecond: 15
   promQueryTimeSecond: 60
   abnormalityQL:
-    - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 
100) > 80
-    - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes 
)) * 100 < 10
-    - node_filesystem_avail_bytes{mountpoint="/"} / 
node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
+    - 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 
100) > 50
+#    - (avg by (node) (node_memory_MemFree_bytes / node_memory_MemTotal_bytes 
)) * 100 < 20
+#    - node_filesystem_avail_bytes{mountpoint="/"} / 
node_filesystem_size_bytes{mountpoint="/"} * 100 < 15
   nodeNameToIPs:
     node_os_info{node="%s"}
   kubeMultiple:

Reply via email to