This is an automated email from the ASF dual-hosted git repository.
ocket8888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git
The following commit(s) were added to refs/heads/master by this push:
new 5009758 Updates the trafficcontrol-health-client with the following:
(#6511)
5009758 is described below
commit 5009758898f03cc05d1391dcc08faec2b77fe337
Author: John J. Rushford <[email protected]>
AuthorDate: Thu Jan 13 14:03:36 2022 -0700
Updates the trafficcontrol-health-client with the following: (#6511)
- Add the "unavailable-poll-threshold" configuration parameter.
A parent must be polled as unhealthy a number of consecutive
times. When the consecutive unhealthy poll count reaches this
threshold, the parent is marked down.
- Fixes a bug where a change to "tm-poll-interval-seconds" is
not applied to a running health-client when it re-reads its
configuration file.
---
tc-health-client/README.md | 7 ++
tc-health-client/config/config.go | 52 +++++++------
tc-health-client/tmagent/tmagent.go | 148 ++++++++++++++++++++++++++----------
3 files changed, 143 insertions(+), 64 deletions(-)
diff --git a/tc-health-client/README.md b/tc-health-client/README.md
index 90702e9..4e6e272 100644
--- a/tc-health-client/README.md
+++ b/tc-health-client/README.md
@@ -105,6 +105,7 @@ Sample configuarion file:
"tm-poll-interval-seconds": "60s",
"tm-proxy-url", "http://sample-http-proxy.cdn.net:80",
"tm-update-cycles": 5,
+ "unavailable-poll-threshold": 2,
"trafficserver-config-dir": "/opt/trafficserver/etc/trafficserver",
"trafficserver-bin-dir": "/opt/trafficserver/bin",
}
@@ -162,6 +163,12 @@ Sample configuarion file:
reaches **tm-update-cycles**, TrafficOps is polled for a new list of
available
TrafficMonitors for the CDN and the poll count is reset to 0.
+### unavailable-poll-threshold
+
+ This controls when an unhealthy parent is marked down. A parent is marked
+ down once it has been reported as unhealthy for this many consecutive polls.
+ The default threshold is 2.
+
### trafficserver-config-dir
The location on the host where **Traffic Server** configuration files are
diff --git a/tc-health-client/config/config.go
b/tc-health-client/config/config.go
index e35f3d7..be74cbf 100644
--- a/tc-health-client/config/config.go
+++ b/tc-health-client/config/config.go
@@ -43,32 +43,34 @@ var tmPollingInterval time.Duration
var toRequestTimeout time.Duration
const (
- DefaultConfigFile =
"/etc/trafficcontrol/tc-health-client.json"
- DefaultLogDirectory = "/var/log/trafficcontrol"
- DefaultLogFile = "tc-health-client.log"
- DefaultTrafficServerConfigDir = "/opt/trafficserver/etc/trafficserver"
- DefaultTrafficServerBinDir = "/opt/trafficserver/bin"
- DefaultTmUpdateCycles = 10
+ DefaultConfigFile =
"/etc/trafficcontrol/tc-health-client.json"
+ DefaultLogDirectory = "/var/log/trafficcontrol"
+ DefaultLogFile = "tc-health-client.log"
+ DefaultTrafficServerConfigDir = "/opt/trafficserver/etc/trafficserver"
+ DefaultTrafficServerBinDir = "/opt/trafficserver/bin"
+ DefaultTmUpdateCycles = 10
+ DefaultUnavailablePollThreshold = 2
)
type Cfg struct {
- CDNName string `json:"cdn-name"`
- EnableActiveMarkdowns bool `json:"enable-active-markdowns"`
- ReasonCode string `json:"reason-code"`
- TOCredentialFile string `json:"to-credential-file"`
- TORequestTimeOutSeconds string
`json:"to-request-timeout-seconds"`
- TOPass string `json:"to-pass"`
- TOUrl string `json:"to-url"`
- TOUser string `json:"to-user"`
- TmProxyURL string `json:"tm-proxy-url"`
- TmPollIntervalSeconds string
`json:"tm-poll-interval-seconds"`
- TmUpdateCycles int `json:"tm-update-cycles"`
- TrafficServerConfigDir string
`json:"trafficserver-config-dir"`
- TrafficServerBinDir string `json:"trafficserver-bin-dir"`
- TrafficMonitors map[string]bool
`json:"trafficmonitors,omitempty"`
- HealthClientConfigFile util.ConfigFile
- CredentialFile util.ConfigFile
- ParsedProxyURL *url.URL
+ CDNName string `json:"cdn-name"`
+ EnableActiveMarkdowns bool
`json:"enable-active-markdowns"`
+ ReasonCode string `json:"reason-code"`
+ TOCredentialFile string `json:"to-credential-file"`
+ TORequestTimeOutSeconds string
`json:"to-request-timeout-seconds"`
+ TOPass string `json:"to-pass"`
+ TOUrl string `json:"to-url"`
+ TOUser string `json:"to-user"`
+ TmProxyURL string `json:"tm-proxy-url"`
+ TmPollIntervalSeconds string
`json:"tm-poll-interval-seconds"`
+ TmUpdateCycles int `json:"tm-update-cycles"`
+ UnavailablePollThreshold int
`json:"unavailable-poll-threshold"`
+ TrafficServerConfigDir string
`json:"trafficserver-config-dir"`
+ TrafficServerBinDir string `json:"trafficserver-bin-dir"`
+ TrafficMonitors map[string]bool
`json:"trafficmonitors,omitempty"`
+ HealthClientConfigFile util.ConfigFile
+ CredentialFile util.ConfigFile
+ ParsedProxyURL *url.URL
}
type LogCfg struct {
@@ -324,6 +326,9 @@ func LoadConfig(cfg *Cfg) (bool, error) {
if cfg.TmUpdateCycles == 0 {
cfg.TmUpdateCycles = DefaultTmUpdateCycles
}
+ if cfg.UnavailablePollThreshold == 0 {
+ cfg.UnavailablePollThreshold =
DefaultUnavailablePollThreshold
+ }
cfg.HealthClientConfigFile.LastModifyTime = modTime
@@ -362,6 +367,7 @@ func UpdateConfig(cfg *Cfg, newCfg *Cfg) {
cfg.TOUser = newCfg.TOUser
cfg.TmPollIntervalSeconds = newCfg.TmPollIntervalSeconds
cfg.TmUpdateCycles = newCfg.TmUpdateCycles
+ cfg.UnavailablePollThreshold = newCfg.UnavailablePollThreshold
cfg.TrafficServerConfigDir = newCfg.TrafficServerConfigDir
cfg.TrafficServerBinDir = newCfg.TrafficServerBinDir
cfg.TrafficMonitors = newCfg.TrafficMonitors
diff --git a/tc-health-client/tmagent/tmagent.go
b/tc-health-client/tmagent/tmagent.go
index 584c0df..51f43ef 100644
--- a/tc-health-client/tmagent/tmagent.go
+++ b/tc-health-client/tmagent/tmagent.go
@@ -78,10 +78,12 @@ type FailOver struct {
// the trafficserver 'HostStatus' fields that are necessary to interface
// with the trafficserver 'traffic_ctl' command.
type ParentStatus struct {
- Fqdn string
- ActiveReason bool
- LocalReason bool
- ManualReason bool
+ Fqdn string
+ ActiveReason bool
+ LocalReason bool
+ ManualReason bool
+ LastTmPoll int64
+ UnavailablePollCount int
}
// used to get the overall parent availablity from the
@@ -262,10 +264,10 @@ func (c *ParentInfo) GetCacheStatuses() (tc.CRStates,
error) {
// the status that trafficmonitor health protocol has determined for a parent.
func (c *ParentInfo) PollAndUpdateCacheStatus() {
cycleCount := 0
- pollingInterval := config.GetTMPollingInterval()
log.Infoln("polling started")
for {
+ pollingInterval := config.GetTMPollingInterval()
// check for config file updates
newCfg := config.Cfg{
HealthClientConfigFile: c.Cfg.HealthClientConfigFile,
@@ -311,6 +313,10 @@ func (c *ParentInfo) PollAndUpdateCacheStatus() {
// read traffic manager cache statuses.
_c, err := c.GetCacheStatuses()
+
+ // get the current poll time
+ now := time.Now().Unix()
+
caches := _c.Caches
if err != nil {
log.Errorf("error in TrafficMonitor polling: %s\n",
err.Error())
@@ -327,6 +333,9 @@ func (c *ParentInfo) PollAndUpdateCacheStatus() {
hostName := string(k)
cs, ok := c.Parents[hostName]
if ok {
+ // update the polling time
+ cs.LastTmPoll = now
+ c.Parents[hostName] = cs
tmAvailable := v.IsAvailable
if cs.available(c.Cfg.ReasonCode) !=
tmAvailable {
// do not mark down if the
configuration disables mark downs.
@@ -447,15 +456,10 @@ func parseFqdn(fqdn string) string {
return hostName
}
-// used to mark a parent as up or down in the trafficserver HostStatus
-// subsystem.
-//
-// TODO see issue #6448, add cacheStatus back when available in CrStates
-//func (c *ParentInfo) markParent(fqdn string, cacheStatus string, available
bool) error {
-func (c *ParentInfo) markParent(fqdn string, available bool) error {
- hostName := parseFqdn(fqdn)
- tc := filepath.Join(c.TrafficServerBinDir, TrafficCtl)
+func (c *ParentInfo) execTrafficCtl(fqdn string, available bool) error {
reason := c.Cfg.ReasonCode
+ tc := filepath.Join(c.TrafficServerBinDir, TrafficCtl)
+
var status string
if available {
status = "up"
@@ -472,27 +476,81 @@ func (c *ParentInfo) markParent(fqdn string, available
bool) error {
if err != nil {
return errors.New("marking " + fqdn + " " + status + ": " +
TrafficCtl + " error: " + err.Error())
}
+
+ return nil
+}
+
+// used to mark a parent as up or down in the trafficserver HostStatus
+// subsystem.
+//
+// TODO see issue #6448, add cacheStatus back when available in CrStates
+//func (c *ParentInfo) markParent(fqdn string, cacheStatus string, available
bool) error {
+func (c *ParentInfo) markParent(fqdn string, available bool) error {
+ var hostAvailable bool
+ var err error
+ hostName := parseFqdn(fqdn)
+
+ log.Debugf("fqdn: %s, available: %v", fqdn, available)
+
pv, ok := c.Parents[hostName]
if ok {
- switch reason {
- case "active":
- pv.ActiveReason = available
- case "local":
- pv.LocalReason = available
+ activeReason := pv.ActiveReason
+ localReason := pv.LocalReason
+ unavailablePollCount := pv.UnavailablePollCount
+
+ log.Debugf("hostName: %s, UnavailablePollCount: %d, available:
%v", hostName, unavailablePollCount, available)
+
+ if !available { // unavailable
+ unavailablePollCount += 1
+ if unavailablePollCount <
c.Cfg.UnavailablePollThreshold {
+ log.Infof("TM indicates %s is unavailable but
the UnavailablePollThreshold has not been reached", hostName)
+ hostAvailable = true
+ } else {
+ // marking the host down
+ err = c.execTrafficCtl(fqdn, available)
+ if err != nil {
+ log.Errorln(err.Error())
+ }
+ if err == nil {
+ // TODO see issue 6448, add cacheStatus
back when available in CrStates
+ // log.Infof("marked parent %s DOWN,
cache status was: %s\n", hostName, cacheStatus)
+ hostAvailable = false
+ log.Infof("marked parent %s DOWN",
hostName)
+ }
+ }
+ } else { // available
+ // marking the host up
+ err = c.execTrafficCtl(fqdn, available)
+ if err == nil {
+ hostAvailable = true
+ // reset the unavilable poll count
+ unavailablePollCount = 0
+ // TODO see issue #6448, add cacheStatus back
when available in CrStates
+ //log.Infof("marked parent %s UP, cache status
was: %s\n", hostName, cacheStatus)
+ log.Infof("marked parent %s UP", hostName)
+ } else {
+ hostAvailable = false
+ }
}
- }
- c.Parents[hostName] = pv
- if !available {
- // TODO see issue 6448, add cacheStatus back when available in
CrStates
- // log.Infof("marked parent %s DOWN, cache status was: %s\n",
hostName, cacheStatus)
- log.Infof("marked parent %s DOWN", hostName)
- } else {
- // TODO see issue #6448, add cacheStatus back when available in
CrStates
- //log.Infof("marked parent %s UP, cache status was: %s\n",
hostName, cacheStatus)
- log.Infof("marked parent %s UP", hostName)
+ // update parent info
+ if err == nil {
+ reason := c.Cfg.ReasonCode
+ switch reason {
+ case "active":
+ activeReason = hostAvailable
+ case "local":
+ localReason = hostAvailable
+ }
+ // save updates
+ pv.ActiveReason = activeReason
+ pv.LocalReason = localReason
+ pv.UnavailablePollCount = unavailablePollCount
+ c.Parents[hostName] = pv
+ log.Debugf("Updated parent status: %v", pv)
+ }
}
- return nil
+ return err
}
// reads the current parent statuses from the trafficserver HostStatus
@@ -545,10 +603,12 @@ func (c *ParentInfo) readHostStatus(parentStatus
map[string]ParentStatus) error
}
}
pstat := ParentStatus{
- Fqdn: fqdn,
- ActiveReason: activeReason,
- LocalReason: localReason,
- ManualReason: manualReason,
+ Fqdn: fqdn,
+ ActiveReason:
activeReason,
+ LocalReason:
localReason,
+ ManualReason:
manualReason,
+ LastTmPoll: 0,
+ UnavailablePollCount: 0,
}
hostName = parseFqdn(fqdn)
pv, ok := parentStatus[hostName]
@@ -562,6 +622,8 @@ func (c *ParentInfo) readHostStatus(parentStatus
map[string]ParentStatus) error
available :=
pstat.available(c.Cfg.ReasonCode)
if
pv.available(c.Cfg.ReasonCode) != available {
log.Infof("host status
for '%s' has changed to %s\n", hostName, pstat.Status())
+ pstat.LastTmPoll =
pv.LastTmPoll
+
pstat.UnavailablePollCount = pv.UnavailablePollCount
parentStatus[hostName]
= pstat
}
}
@@ -627,10 +689,12 @@ func (c *ParentInfo) readParentConfig(parentStatus
map[string]ParentStatus) erro
// already exist.
if !ok {
pstat := ParentStatus{
- Fqdn:
strings.TrimSpace(fqdn),
- ActiveReason:
true,
- LocalReason:
true,
- ManualReason:
true,
+ Fqdn:
strings.TrimSpace(fqdn),
+ ActiveReason:
true,
+ LocalReason:
true,
+ ManualReason:
true,
+ LastTmPoll:
0,
+
UnavailablePollCount: 0,
}
parentStatus[hostName]
= pstat
log.Debugf("added Host
'%s' from %s to the parents map\n", hostName, fn)
@@ -715,10 +779,12 @@ func (c *ParentInfo) readStrategies(parentStatus
map[string]ParentStatus) error
_, ok := parentStatus[hostName]
if !ok {
pstat := ParentStatus{
- Fqdn: strings.TrimSpace(fqdn),
- ActiveReason: true,
- LocalReason: true,
- ManualReason: true,
+ Fqdn: strings.TrimSpace(fqdn),
+ ActiveReason: true,
+ LocalReason: true,
+ ManualReason: true,
+ LastTmPoll: 0,
+ UnavailablePollCount: 0,
}
parentStatus[hostName] = pstat
log.Debugf("added Host '%s' from %s to the parents
map\n", hostName, fn)