This is an automated email from the ASF dual-hosted git repository. zrhoffman pushed a commit to branch 6.0.x in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git
commit 1c4ec191dd43ccf63e15322630cd38ca2f5e4f60 Author: Rawlin Peters <[email protected]> AuthorDate: Mon Aug 30 09:32:11 2021 -0600 Make TM recover from TO being unavailable at startup (#6146) * Make TM recover from TO being unavailable at startup Always set non-nil TO sessions that can eventually authenticate themselves when TO recovers. Additionally, instead of exclusively using one major client version after starting up and logging in, always attempt the latest major version and fall back to the legacy version in case of error. This will allow TM to seamlessly transition between using either major version no matter which order TM and TO are upgraded in. Closes: #6129 * Organize imports, add UsingDummyTO back (cherry picked from commit 16dda648671ad438fcb8b6b08eaf6fbeb2fa9589) --- CHANGELOG.md | 1 + traffic_monitor/handler/handler.go | 2 +- traffic_monitor/manager/opsconfig.go | 4 +- traffic_monitor/towrap/towrap.go | 144 ++++++++++++++------- .../{handler/handler.go => towrap/towrap_test.go} | 33 ++--- 5 files changed, 111 insertions(+), 73 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 010ddd9..34bf7fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -67,6 +67,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed - [#5690](https://github.com/apache/trafficcontrol/issues/5690) - Fixed github action for added/modified db migration file. - [#2471](https://github.com/apache/trafficcontrol/issues/2471) - A PR check to ensure added db migration file is the latest. +- [#6129](https://github.com/apache/trafficcontrol/issues/6129) - Traffic Monitor start doesn't recover when Traffic Ops is unavailable - [#5609](https://github.com/apache/trafficcontrol/issues/5609) - Fixed GET /servercheck filter for an extra query param. - [#5954](https://github.com/apache/trafficcontrol/issues/5954) - Traffic Ops HTTP response write errors are ignored - [#6048](https://github.com/apache/trafficcontrol/issues/6048) - TM sometimes sets cachegroups to unavailable even though all caches are online diff --git a/traffic_monitor/handler/handler.go b/traffic_monitor/handler/handler.go index 88832e8..31b7d6d 100644 --- a/traffic_monitor/handler/handler.go +++ b/traffic_monitor/handler/handler.go @@ -40,7 +40,7 @@ type OpsConfig struct { HttpsListener string `json:"httpsListener"` CertFile string `json:"certFile"` KeyFile string `json:"keyFile"` - UsingDummyTO bool `json:"usingDummyTO"` + UsingDummyTO bool `json:"usingDummyTO"` // only used in the TM UI to indicate if TM started up with on-disk backup snapshots } type Handler interface { diff --git a/traffic_monitor/manager/opsconfig.go b/traffic_monitor/manager/opsconfig.go index 29fe954..0361825 100644 --- a/traffic_monitor/manager/opsconfig.go +++ b/traffic_monitor/manager/opsconfig.go @@ -39,7 +39,7 @@ import ( "github.com/apache/trafficcontrol/traffic_monitor/todata" "github.com/apache/trafficcontrol/traffic_monitor/towrap" - "github.com/json-iterator/go" + jsoniter "github.com/json-iterator/go" ) // StartOpsConfigManager starts the ops config manager goroutine, returning the (threadsafe) variables which it sets. @@ -172,8 +172,8 @@ func StartOpsConfigManager( time.Sleep(duration) if toSession.BackupFileExists() && (toLoginCount >= cfg.TrafficOpsDiskRetryMax) { - log.Errorf("error instantiating Session with traffic_ops, backup disk files exist, creating empty traffic_ops session to read") newOpsConfig.UsingDummyTO = true + log.Errorf("error instantiating authenticated session with Traffic Ops, backup disk files exist, continuing with unauthenticated session") break } diff --git a/traffic_monitor/towrap/towrap.go b/traffic_monitor/towrap/towrap.go index e0857e8..fb330a3 100644 --- a/traffic_monitor/towrap/towrap.go +++ b/traffic_monitor/towrap/towrap.go @@ -22,9 +22,12 @@ package towrap */ import ( + "crypto/tls" "errors" "fmt" "io/ioutil" + "net/http" + "net/http/cookiejar" "net/url" "os" "strconv" @@ -38,6 +41,7 @@ import ( client "github.com/apache/trafficcontrol/traffic_ops/v3-client" jsoniter "github.com/json-iterator/go" + "golang.org/x/net/publicsuffix" ) const localHostIP = "127.0.0.1" @@ -193,7 +197,6 @@ type TrafficOpsSessionThreadsafe struct { m *sync.Mutex lastCRConfig ByteMapCache crConfigHist CRConfigHistoryThreadsafe - useLegacy bool CRConfigBackupFile string TMConfigBackupFile string } @@ -209,17 +212,13 @@ func NewTrafficOpsSessionThreadsafe(s *client.Session, ls *legacyClient.Session, session: &s, legacySession: &ls, TMConfigBackupFile: cfg.TMConfigBackupFile, - useLegacy: false, } } // Initialized tells whether or not the TrafficOpsSessionThreadsafe has been -// properly initialized (by calling 'Update'). +// properly initialized with non-nil sessions. func (s TrafficOpsSessionThreadsafe) Initialized() bool { - if s.useLegacy { - return s.legacySession != nil && *s.legacySession != nil - } - return s.session != nil && *s.session != nil + return s.session != nil && *s.session != nil && s.legacySession != nil && *s.legacySession != nil } // Update updates the TrafficOpsSessionThreadsafe's connection information with @@ -240,24 +239,70 @@ func (s *TrafficOpsSessionThreadsafe) Update( s.m.Lock() defer s.m.Unlock() + // always set unauthenticated sessions first which can eventually authenticate themselves when attempting requests + if err := s.setSession(url, username, password, insecure, userAgent, useCache, timeout); err != nil { + return err + } + if err := s.setLegacySession(url, username, password, insecure, userAgent, useCache, timeout); err != nil { + return err + } + session, _, err := client.LoginWithAgent(url, username, password, insecure, userAgent, useCache, timeout) if err != nil { - log.Errorf("Error logging in using up-to-date client: %v", err) + log.Errorf("logging in using up-to-date client: %v", err) legacySession, _, err := legacyClient.LoginWithAgent(url, username, password, insecure, userAgent, useCache, timeout) if err != nil || legacySession == nil { err = fmt.Errorf("logging in using legacy client: %v", err) return err } *s.legacySession = legacySession - s.useLegacy = true } else { *s.session = session - s.useLegacy = false } return nil } +// setSession sets the session for the up-to-date client without logging in. +func (s *TrafficOpsSessionThreadsafe) setSession(url, username, password string, insecure bool, userAgent string, useCache bool, timeout time.Duration) error { + options := cookiejar.Options{ + PublicSuffixList: publicsuffix.List, + } + jar, err := cookiejar.New(&options) + if err != nil { + return err + } + to := client.NewSession(username, password, url, userAgent, &http.Client{ + Timeout: timeout, + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: insecure}, + }, + Jar: jar, + }, useCache) + *s.session = to + return nil +} + +// setSession sets the session for the legacy client without logging in. +func (s *TrafficOpsSessionThreadsafe) setLegacySession(url, username, password string, insecure bool, userAgent string, useCache bool, timeout time.Duration) error { + options := cookiejar.Options{ + PublicSuffixList: publicsuffix.List, + } + jar, err := cookiejar.New(&options) + if err != nil { + return err + } + to := legacyClient.NewSession(username, password, url, userAgent, &http.Client{ + Timeout: timeout, + Transport: &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: insecure}, + }, + Jar: jar, + }, useCache) + *s.legacySession = to + return nil +} + // getThreadsafeSession is used internally to get a copy of the session pointer, // or nil if it doesn't exist. This should not be used outside // TrafficOpsSessionThreadsafe, and never stored, because part of the purpose of @@ -334,44 +379,47 @@ func (s TrafficOpsSessionThreadsafe) CRConfigRaw(cdn string) ([]byte, error) { var remoteAddr string var err error var data []byte + var legacyReqInf legacyClient.ReqInf - if s.useLegacy { - ss := s.getLegacy() - if ss == nil { + ss := s.get() + if ss == nil { + return nil, ErrNilSession + } + data, reqInf, err := ss.GetCRConfig(cdn) + if reqInf.RemoteAddr != nil { + remoteAddr = reqInf.RemoteAddr.String() + } + if err != nil { + log.Warnln("getting CRConfig from Traffic Ops using up-to-date client: " + err.Error() + ". Retrying with legacy client") + ls := s.getLegacy() + if ls == nil { return nil, ErrNilSession } - b, reqInf, e := ss.GetCRConfig(cdn) - err = e - data = b - if reqInf.RemoteAddr != nil { - remoteAddr = reqInf.RemoteAddr.String() + data, legacyReqInf, err = ls.GetCRConfig(cdn) + if legacyReqInf.RemoteAddr != nil { + remoteAddr = legacyReqInf.RemoteAddr.String() } - } else { - ss := s.get() - if ss == nil { - return nil, ErrNilSession - } - b, reqInf, e := ss.GetCRConfig(cdn) - err = e - data = b - if reqInf.RemoteAddr != nil { - remoteAddr = reqInf.RemoteAddr.String() + if err != nil { + log.Errorln("getting CRConfig from Traffic Ops using legacy client: " + err.Error() + ". Checking for backup") } } if err == nil { - ioutil.WriteFile(s.CRConfigBackupFile, data, 0644) + log.Infoln("successfully got CRConfig from Traffic Ops. Writing to backup file") + if wErr := ioutil.WriteFile(s.CRConfigBackupFile, data, 0644); wErr != nil { + log.Errorf("failed to write CRConfig backup file: %v", wErr) + } } else { if s.BackupFileExists() { log.Errorln("using backup file for CRConfig snapshot due to error fetching CRConfig snapshot from Traffic Ops: " + err.Error()) data, err = ioutil.ReadFile(s.CRConfigBackupFile) if err != nil { - return nil, fmt.Errorf("file Read Error: %v", err) + return nil, fmt.Errorf("reading CRConfig backup file: %v", err) } remoteAddr = localHostIP err = nil } else { - return nil, fmt.Errorf("Failed to get CRConfig from Traffic Ops (%v), and there is no backup file", err) + return nil, fmt.Errorf("failed to get CRConfig from Traffic Ops (%v), and there is no backup file", err) } } @@ -383,11 +431,6 @@ func (s TrafficOpsSessionThreadsafe) CRConfigRaw(cdn string) ([]byte, error) { } defer s.crConfigHist.Add(hist) - // TODO: per the above logic, I don't think this is possible - if err != nil { - return data, err - } - crc := &tc.CRConfig{} json := jsoniter.ConfigFastest if err = json.Unmarshal(data, crc); err != nil { @@ -438,6 +481,9 @@ func (s TrafficOpsSessionThreadsafe) fetchLegacyTMConfig(cdn string) (*tc.Traffi } m, _, e := ss.GetTrafficMonitorConfig(cdn) + if m == nil { + return nil, e + } return m.Upgrade(), e } @@ -451,20 +497,20 @@ func (s TrafficOpsSessionThreadsafe) trafficMonitorConfigMapRaw(cdn string) (*tc var configMap *tc.TrafficMonitorConfigMap var err error - if s.useLegacy { + config, err = s.fetchTMConfig(cdn) + if err != nil { + log.Warnln("getting Traffic Monitor config from Traffic Ops using up-to-date client: " + err.Error() + ". Retrying with legacy client") config, err = s.fetchLegacyTMConfig(cdn) - } else { - config, err = s.fetchTMConfig(cdn) - } - - if config == nil { if err != nil { - return nil, fmt.Errorf("getting Traffic Monitor configuration map: %v", err) + log.Errorln("getting Traffic Monitor config from Traffic Ops using legacy client: " + err.Error()) } - return nil, errors.New("nil configMap after fetching") } if err == nil { + log.Infoln("successfully got Traffic Monitor config from Traffic Ops") + if config == nil { + return nil, fmt.Errorf("nil Traffic Monitor config after successful fetch") + } configMap, err = tc.TrafficMonitorTransformToMap(config) } @@ -491,7 +537,9 @@ func (s TrafficOpsSessionThreadsafe) trafficMonitorConfigMapRaw(cdn string) (*tc json := jsoniter.ConfigFastest data, err := json.Marshal(*config) if err == nil { - ioutil.WriteFile(s.TMConfigBackupFile, data, 0644) + if wErr := ioutil.WriteFile(s.TMConfigBackupFile, data, 0644); wErr != nil { + log.Errorf("failed to write TM config backup file: %v", wErr) + } } return configMap, err @@ -613,10 +661,10 @@ func (s TrafficOpsSessionThreadsafe) MonitorCDN(hostName string) (string, error) var server tc.ServerV30 var err error - if s.useLegacy { + server, err = s.fetchServerByHostname(hostName) + if err != nil { + log.Warnln("getting server by hostname '" + hostName + "' using up-to-date client: " + err.Error() + ". Retrying with legacy client") server, err = s.fetchLegacyServerByHostname(hostName) - } else { - server, err = s.fetchServerByHostname(hostName) } if err != nil { diff --git a/traffic_monitor/handler/handler.go b/traffic_monitor/towrap/towrap_test.go similarity index 56% copy from traffic_monitor/handler/handler.go copy to traffic_monitor/towrap/towrap_test.go index 88832e8..e8482f3 100644 --- a/traffic_monitor/handler/handler.go +++ b/traffic_monitor/towrap/towrap_test.go @@ -1,4 +1,4 @@ -package handler +package towrap /* * Licensed to the Apache Software Foundation (ASF) under one @@ -20,29 +20,18 @@ package handler */ import ( - "io" + "testing" "time" -) -const ( - NOTIFY_NEVER = iota - NOTIFY_CHANGE - NOTIFY_ALWAYS + "github.com/apache/trafficcontrol/traffic_monitor/config" ) -type OpsConfig struct { - Username string `json:"username"` - Password string `json:"password"` - Url string `json:"url"` - Insecure bool `json:"insecure"` - CdnName string `json:"cdnName"` - HttpListener string `json:"httpListener"` - HttpsListener string `json:"httpsListener"` - CertFile string `json:"certFile"` - KeyFile string `json:"keyFile"` - UsingDummyTO bool `json:"usingDummyTO"` -} - -type Handler interface { - Handle(string, io.Reader, string, time.Duration, time.Time, error, uint64, bool, interface{}, chan<- uint64) +func TestTrafficOpsSessionThreadsafeUpdateSetsNonNilSessions(t *testing.T) { + s := NewTrafficOpsSessionThreadsafe(nil, nil, 5, config.Config{}) + err := s.Update("", "", "", true, "", false, 10*time.Second) + if err == nil { + t.Error("expected an error, got nil") + } else if s.session == nil || *s.session == nil || s.legacySession == nil || *s.legacySession == nil { + t.Errorf("expected non-nil sessions after getting error from Update()") + } }
