This is an automated email from the ASF dual-hosted git repository.

ocket8888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git


The following commit(s) were added to refs/heads/master by this push:
     new 5009758  Updates the trafficcontrol-health-client with the following: (#6511)
5009758 is described below

commit 5009758898f03cc05d1391dcc08faec2b77fe337
Author: John J. Rushford <[email protected]>
AuthorDate: Thu Jan 13 14:03:36 2022 -0700

    Updates the trafficcontrol-health-client with the following: (#6511)
    
    - Add the "unavailable-poll-threshold" configuration parameter.
      A parent must be polled as unhealthy this many consecutive
      times; once the unhealthy poll count reaches the threshold,
      the parent is marked down.
    - Fixes a bug where a change to "tm-poll-interval-seconds" is
      not applied to a running health-client when it re-reads its
      configuration file.
---
 tc-health-client/README.md          |   7 ++
 tc-health-client/config/config.go   |  52 +++++++------
 tc-health-client/tmagent/tmagent.go | 148 ++++++++++++++++++++++++++----------
 3 files changed, 143 insertions(+), 64 deletions(-)

diff --git a/tc-health-client/README.md b/tc-health-client/README.md
index 90702e9..4e6e272 100644
--- a/tc-health-client/README.md
+++ b/tc-health-client/README.md
@@ -105,6 +105,7 @@ Sample configuarion file:
     "tm-poll-interval-seconds": "60s",
     "tm-proxy-url", "http://sample-http-proxy.cdn.net:80",
     "tm-update-cycles": 5,
+    "unavailable-poll-threshold": 2,
     "trafficserver-config-dir": "/opt/trafficserver/etc/trafficserver",
     "trafficserver-bin-dir": "/opt/trafficserver/bin",
   }
@@ -162,6 +163,12 @@ Sample configuarion file:
   reaches **tm-update-cycles**, TrafficOps is polled for a new list of 
available
   TrafficMonitors for the CDN and the poll count is reset to 0.
 
+### unavailable-poll-threshold
+
+  This controls when an unhealthy parent is marked down.  A parent is marked
+  down once it has been reported as unhealthy for this many consecutive
+  polls.  The default threshold is 2.
+
 ### trafficserver-config-dir
 
   The location on the host where **Traffic Server** configuration files are 
diff --git a/tc-health-client/config/config.go 
b/tc-health-client/config/config.go
index e35f3d7..be74cbf 100644
--- a/tc-health-client/config/config.go
+++ b/tc-health-client/config/config.go
@@ -43,32 +43,34 @@ var tmPollingInterval time.Duration
 var toRequestTimeout time.Duration
 
 const (
-       DefaultConfigFile             = 
"/etc/trafficcontrol/tc-health-client.json"
-       DefaultLogDirectory           = "/var/log/trafficcontrol"
-       DefaultLogFile                = "tc-health-client.log"
-       DefaultTrafficServerConfigDir = "/opt/trafficserver/etc/trafficserver"
-       DefaultTrafficServerBinDir    = "/opt/trafficserver/bin"
-       DefaultTmUpdateCycles         = 10
+       DefaultConfigFile               = 
"/etc/trafficcontrol/tc-health-client.json"
+       DefaultLogDirectory             = "/var/log/trafficcontrol"
+       DefaultLogFile                  = "tc-health-client.log"
+       DefaultTrafficServerConfigDir   = "/opt/trafficserver/etc/trafficserver"
+       DefaultTrafficServerBinDir      = "/opt/trafficserver/bin"
+       DefaultTmUpdateCycles           = 10
+       DefaultUnavailablePollThreshold = 2
 )
 
 type Cfg struct {
-       CDNName                 string          `json:"cdn-name"`
-       EnableActiveMarkdowns   bool            `json:"enable-active-markdowns"`
-       ReasonCode              string          `json:"reason-code"`
-       TOCredentialFile        string          `json:"to-credential-file"`
-       TORequestTimeOutSeconds string          
`json:"to-request-timeout-seconds"`
-       TOPass                  string          `json:"to-pass"`
-       TOUrl                   string          `json:"to-url"`
-       TOUser                  string          `json:"to-user"`
-       TmProxyURL              string          `json:"tm-proxy-url"`
-       TmPollIntervalSeconds   string          
`json:"tm-poll-interval-seconds"`
-       TmUpdateCycles          int             `json:"tm-update-cycles"`
-       TrafficServerConfigDir  string          
`json:"trafficserver-config-dir"`
-       TrafficServerBinDir     string          `json:"trafficserver-bin-dir"`
-       TrafficMonitors         map[string]bool 
`json:"trafficmonitors,omitempty"`
-       HealthClientConfigFile  util.ConfigFile
-       CredentialFile          util.ConfigFile
-       ParsedProxyURL          *url.URL
+       CDNName                  string          `json:"cdn-name"`
+       EnableActiveMarkdowns    bool            
`json:"enable-active-markdowns"`
+       ReasonCode               string          `json:"reason-code"`
+       TOCredentialFile         string          `json:"to-credential-file"`
+       TORequestTimeOutSeconds  string          
`json:"to-request-timeout-seconds"`
+       TOPass                   string          `json:"to-pass"`
+       TOUrl                    string          `json:"to-url"`
+       TOUser                   string          `json:"to-user"`
+       TmProxyURL               string          `json:"tm-proxy-url"`
+       TmPollIntervalSeconds    string          
`json:"tm-poll-interval-seconds"`
+       TmUpdateCycles           int             `json:"tm-update-cycles"`
+       UnavailablePollThreshold int             
`json:"unavailable-poll-threshold"`
+       TrafficServerConfigDir   string          
`json:"trafficserver-config-dir"`
+       TrafficServerBinDir      string          `json:"trafficserver-bin-dir"`
+       TrafficMonitors          map[string]bool 
`json:"trafficmonitors,omitempty"`
+       HealthClientConfigFile   util.ConfigFile
+       CredentialFile           util.ConfigFile
+       ParsedProxyURL           *url.URL
 }
 
 type LogCfg struct {
@@ -324,6 +326,9 @@ func LoadConfig(cfg *Cfg) (bool, error) {
                if cfg.TmUpdateCycles == 0 {
                        cfg.TmUpdateCycles = DefaultTmUpdateCycles
                }
+               if cfg.UnavailablePollThreshold == 0 {
+                       cfg.UnavailablePollThreshold = 
DefaultUnavailablePollThreshold
+               }
 
                cfg.HealthClientConfigFile.LastModifyTime = modTime
 
@@ -362,6 +367,7 @@ func UpdateConfig(cfg *Cfg, newCfg *Cfg) {
        cfg.TOUser = newCfg.TOUser
        cfg.TmPollIntervalSeconds = newCfg.TmPollIntervalSeconds
        cfg.TmUpdateCycles = newCfg.TmUpdateCycles
+       cfg.UnavailablePollThreshold = newCfg.UnavailablePollThreshold
        cfg.TrafficServerConfigDir = newCfg.TrafficServerConfigDir
        cfg.TrafficServerBinDir = newCfg.TrafficServerBinDir
        cfg.TrafficMonitors = newCfg.TrafficMonitors
diff --git a/tc-health-client/tmagent/tmagent.go 
b/tc-health-client/tmagent/tmagent.go
index 584c0df..51f43ef 100644
--- a/tc-health-client/tmagent/tmagent.go
+++ b/tc-health-client/tmagent/tmagent.go
@@ -78,10 +78,12 @@ type FailOver struct {
 // the trafficserver 'HostStatus' fields that are necessary to interface
 // with the trafficserver 'traffic_ctl' command.
 type ParentStatus struct {
-       Fqdn         string
-       ActiveReason bool
-       LocalReason  bool
-       ManualReason bool
+       Fqdn                 string
+       ActiveReason         bool
+       LocalReason          bool
+       ManualReason         bool
+       LastTmPoll           int64
+       UnavailablePollCount int
 }
 
 // used to get the overall parent availablity from the
@@ -262,10 +264,10 @@ func (c *ParentInfo) GetCacheStatuses() (tc.CRStates, 
error) {
 // the status that trafficmonitor health protocol has determined for a parent.
 func (c *ParentInfo) PollAndUpdateCacheStatus() {
        cycleCount := 0
-       pollingInterval := config.GetTMPollingInterval()
        log.Infoln("polling started")
 
        for {
+               pollingInterval := config.GetTMPollingInterval()
                // check for config file updates
                newCfg := config.Cfg{
                        HealthClientConfigFile: c.Cfg.HealthClientConfigFile,
@@ -311,6 +313,10 @@ func (c *ParentInfo) PollAndUpdateCacheStatus() {
 
                // read traffic manager cache statuses.
                _c, err := c.GetCacheStatuses()
+
+               // get the current poll time
+               now := time.Now().Unix()
+
                caches := _c.Caches
                if err != nil {
                        log.Errorf("error in TrafficMonitor polling: %s\n", 
err.Error())
@@ -327,6 +333,9 @@ func (c *ParentInfo) PollAndUpdateCacheStatus() {
                        hostName := string(k)
                        cs, ok := c.Parents[hostName]
                        if ok {
+                               // update the polling time
+                               cs.LastTmPoll = now
+                               c.Parents[hostName] = cs
                                tmAvailable := v.IsAvailable
                                if cs.available(c.Cfg.ReasonCode) != 
tmAvailable {
                                        // do not mark down if the 
configuration disables mark downs.
@@ -447,15 +456,10 @@ func parseFqdn(fqdn string) string {
        return hostName
 }
 
-// used to mark a parent as up or down in the trafficserver HostStatus
-// subsystem.
-//
-// TODO see issue #6448, add cacheStatus back when available in CrStates
-//func (c *ParentInfo) markParent(fqdn string, cacheStatus string, available 
bool) error {
-func (c *ParentInfo) markParent(fqdn string, available bool) error {
-       hostName := parseFqdn(fqdn)
-       tc := filepath.Join(c.TrafficServerBinDir, TrafficCtl)
+func (c *ParentInfo) execTrafficCtl(fqdn string, available bool) error {
        reason := c.Cfg.ReasonCode
+       tc := filepath.Join(c.TrafficServerBinDir, TrafficCtl)
+
        var status string
        if available {
                status = "up"
@@ -472,27 +476,81 @@ func (c *ParentInfo) markParent(fqdn string, available 
bool) error {
        if err != nil {
                return errors.New("marking " + fqdn + " " + status + ": " + 
TrafficCtl + " error: " + err.Error())
        }
+
+       return nil
+}
+
+// used to mark a parent as up or down in the trafficserver HostStatus
+// subsystem.
+//
+// TODO see issue #6448, add cacheStatus back when available in CrStates
+//func (c *ParentInfo) markParent(fqdn string, cacheStatus string, available 
bool) error {
+func (c *ParentInfo) markParent(fqdn string, available bool) error {
+       var hostAvailable bool
+       var err error
+       hostName := parseFqdn(fqdn)
+
+       log.Debugf("fqdn: %s, available: %v", fqdn, available)
+
        pv, ok := c.Parents[hostName]
        if ok {
-               switch reason {
-               case "active":
-                       pv.ActiveReason = available
-               case "local":
-                       pv.LocalReason = available
+               activeReason := pv.ActiveReason
+               localReason := pv.LocalReason
+               unavailablePollCount := pv.UnavailablePollCount
+
+               log.Debugf("hostName: %s, UnavailablePollCount: %d, available: 
%v", hostName, unavailablePollCount, available)
+
+               if !available { // unavailable
+                       unavailablePollCount += 1
+                       if unavailablePollCount < 
c.Cfg.UnavailablePollThreshold {
+                               log.Infof("TM indicates %s is unavailable but 
the UnavailablePollThreshold has not been reached", hostName)
+                               hostAvailable = true
+                       } else {
+                               // marking the host down
+                               err = c.execTrafficCtl(fqdn, available)
+                               if err != nil {
+                                       log.Errorln(err.Error())
+                               }
+                               if err == nil {
+                                       // TODO see issue 6448, add cacheStatus 
back when available in CrStates
+                                       // log.Infof("marked parent %s DOWN, 
cache status was: %s\n", hostName, cacheStatus)
+                                       hostAvailable = false
+                                       log.Infof("marked parent %s DOWN", 
hostName)
+                               }
+                       }
+               } else { // available
+                       // marking the host up
+                       err = c.execTrafficCtl(fqdn, available)
+                       if err == nil {
+                               hostAvailable = true
+                               // reset the unavailable poll count
+                               unavailablePollCount = 0
+                               // TODO see issue #6448, add cacheStatus back 
when available in CrStates
+                               //log.Infof("marked parent %s UP, cache status 
was: %s\n", hostName, cacheStatus)
+                               log.Infof("marked parent %s UP", hostName)
+                       } else {
+                               hostAvailable = false
+                       }
                }
-       }
-       c.Parents[hostName] = pv
 
-       if !available {
-               // TODO see issue 6448, add cacheStatus back when available in 
CrStates
-               // log.Infof("marked parent %s DOWN, cache status was: %s\n", 
hostName, cacheStatus)
-               log.Infof("marked parent %s DOWN", hostName)
-       } else {
-               // TODO see issue #6448, add cacheStatus back when available in 
CrStates
-               //log.Infof("marked parent %s UP, cache status was: %s\n", 
hostName, cacheStatus)
-               log.Infof("marked parent %s UP", hostName)
+               // update parent info
+               if err == nil {
+                       reason := c.Cfg.ReasonCode
+                       switch reason {
+                       case "active":
+                               activeReason = hostAvailable
+                       case "local":
+                               localReason = hostAvailable
+                       }
+                       // save updates
+                       pv.ActiveReason = activeReason
+                       pv.LocalReason = localReason
+                       pv.UnavailablePollCount = unavailablePollCount
+                       c.Parents[hostName] = pv
+                       log.Debugf("Updated parent status: %v", pv)
+               }
        }
-       return nil
+       return err
 }
 
 // reads the current parent statuses from the trafficserver HostStatus
@@ -545,10 +603,12 @@ func (c *ParentInfo) readHostStatus(parentStatus 
map[string]ParentStatus) error
                                                }
                                        }
                                        pstat := ParentStatus{
-                                               Fqdn:         fqdn,
-                                               ActiveReason: activeReason,
-                                               LocalReason:  localReason,
-                                               ManualReason: manualReason,
+                                               Fqdn:                 fqdn,
+                                               ActiveReason:         
activeReason,
+                                               LocalReason:          
localReason,
+                                               ManualReason:         
manualReason,
+                                               LastTmPoll:           0,
+                                               UnavailablePollCount: 0,
                                        }
                                        hostName = parseFqdn(fqdn)
                                        pv, ok := parentStatus[hostName]
@@ -562,6 +622,8 @@ func (c *ParentInfo) readHostStatus(parentStatus 
map[string]ParentStatus) error
                                                available := 
pstat.available(c.Cfg.ReasonCode)
                                                if 
pv.available(c.Cfg.ReasonCode) != available {
                                                        log.Infof("host status 
for '%s' has changed to %s\n", hostName, pstat.Status())
+                                                       pstat.LastTmPoll = 
pv.LastTmPoll
+                                                       
pstat.UnavailablePollCount = pv.UnavailablePollCount
                                                        parentStatus[hostName] 
= pstat
                                                }
                                        }
@@ -627,10 +689,12 @@ func (c *ParentInfo) readParentConfig(parentStatus 
map[string]ParentStatus) erro
                                                // already exist.
                                                if !ok {
                                                        pstat := ParentStatus{
-                                                               Fqdn:         
strings.TrimSpace(fqdn),
-                                                               ActiveReason: 
true,
-                                                               LocalReason:  
true,
-                                                               ManualReason: 
true,
+                                                               Fqdn:           
      strings.TrimSpace(fqdn),
+                                                               ActiveReason:   
      true,
+                                                               LocalReason:    
      true,
+                                                               ManualReason:   
      true,
+                                                               LastTmPoll:     
      0,
+                                                               
UnavailablePollCount: 0,
                                                        }
                                                        parentStatus[hostName] 
= pstat
                                                        log.Debugf("added Host 
'%s' from %s to the parents map\n", hostName, fn)
@@ -715,10 +779,12 @@ func (c *ParentInfo) readStrategies(parentStatus 
map[string]ParentStatus) error
                _, ok := parentStatus[hostName]
                if !ok {
                        pstat := ParentStatus{
-                               Fqdn:         strings.TrimSpace(fqdn),
-                               ActiveReason: true,
-                               LocalReason:  true,
-                               ManualReason: true,
+                               Fqdn:                 strings.TrimSpace(fqdn),
+                               ActiveReason:         true,
+                               LocalReason:          true,
+                               ManualReason:         true,
+                               LastTmPoll:           0,
+                               UnavailablePollCount: 0,
                        }
                        parentStatus[hostName] = pstat
                        log.Debugf("added Host '%s' from %s to the parents 
map\n", hostName, fn)

Reply via email to