Fix TM2 event log to include error details

Fixes Traffic Monitor 2.0 event log description to include HTTP fetch
errors.

Fixes broken print formatting: an fmt.Errorf call in cache_health.go
passed extra arguments without any format verbs.

Changes Result to only have one error. Once an error occurs, that
error should be passed up, and we shouldn't try to do any more
processing on the result.

Fixes #1975


Project: http://git-wip-us.apache.org/repos/asf/incubator-trafficcontrol/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafficcontrol/commit/890465fb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafficcontrol/tree/890465fb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafficcontrol/diff/890465fb

Branch: refs/heads/psql
Commit: 890465fb85232609589527d9910c38208f9ff5c0
Parents: 247a1c4
Author: Robert Butts <robert.o.bu...@gmail.com>
Authored: Fri Oct 7 14:22:52 2016 -0600
Committer: Robert Butts <robert.o.bu...@gmail.com>
Committed: Fri Oct 7 14:22:52 2016 -0600

----------------------------------------------------------------------
 .../experimental/traffic_monitor/cache/cache.go     | 12 +++++-------
 .../traffic_monitor/health/cache_health.go          | 16 ++++++++++------
 .../traffic_monitor/manager/healthresult.go         | 10 ++++++----
 3 files changed, 21 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-trafficcontrol/blob/890465fb/traffic_monitor/experimental/traffic_monitor/cache/cache.go
----------------------------------------------------------------------
diff --git a/traffic_monitor/experimental/traffic_monitor/cache/cache.go b/traffic_monitor/experimental/traffic_monitor/cache/cache.go
index bc1a297..a2fe3c7 100644
--- a/traffic_monitor/experimental/traffic_monitor/cache/cache.go
+++ b/traffic_monitor/experimental/traffic_monitor/cache/cache.go
@@ -50,7 +50,7 @@ type PrecomputedData struct {
 type Result struct {
        Id        enum.CacheName
        Available bool
-       Errors    []error
+       Error     error
        Astats    Astats
        Time      time.Time
        Vitals    Vitals
@@ -138,8 +138,6 @@ func (handler Handler) Handle(id string, r io.Reader, err 
error, pollId uint64,
        log.Debugf("poll %v %v handle start\n", pollId, time.Now())
        result := Result{
                Id:           enum.CacheName(id),
-               Available:    false,
-               Errors:       []error{},
                Time:         time.Now(), // TODO change this to be computed 
the instant we get the result back, to minimise inaccuracy
                PollID:       pollId,
                PollFinished: pollFinished,
@@ -147,14 +145,14 @@ func (handler Handler) Handle(id string, r io.Reader, err 
error, pollId uint64,
 
        if err != nil {
                log.Errorf("%v handler given error '%v'\n", id, err) // error 
here, in case the thing that called Handle didn't error
-               result.Errors = append(result.Errors, err)
+               result.Error = err
                handler.ResultChannel <- result
                return
        }
 
        if r == nil {
                log.Errorf("%v handle reader nil\n", id)
-               result.Errors = append(result.Errors, fmt.Errorf("handler got 
nil reader"))
+               result.Error = fmt.Errorf("handler got nil reader")
                handler.ResultChannel <- result
                return
        }
@@ -163,7 +161,7 @@ func (handler Handler) Handle(id string, r io.Reader, err 
error, pollId uint64,
 
        if err := json.NewDecoder(r).Decode(&result.Astats); err != nil {
                log.Errorf("%s procnetdev decode error '%v'\n", id, err)
-               result.Errors = append(result.Errors, err)
+               result.Error = err
                handler.ResultChannel <- result
                return
        }
@@ -179,7 +177,7 @@ func (handler Handler) Handle(id string, r io.Reader, err 
error, pollId uint64,
        log.Debugf("poll %v %v handle decode end\n", pollId, time.Now())
 
        if err != nil {
-               result.Errors = append(result.Errors, err)
+               result.Error = err
                log.Errorf("addkbps handle %s error '%v'\n", id, err)
        } else {
                result.Available = true

http://git-wip-us.apache.org/repos/asf/incubator-trafficcontrol/blob/890465fb/traffic_monitor/experimental/traffic_monitor/health/cache_health.go
----------------------------------------------------------------------
diff --git a/traffic_monitor/experimental/traffic_monitor/health/cache_health.go b/traffic_monitor/experimental/traffic_monitor/health/cache_health.go
index aa10eb2..b8b180f 100644
--- a/traffic_monitor/experimental/traffic_monitor/health/cache_health.go
+++ b/traffic_monitor/experimental/traffic_monitor/health/cache_health.go
@@ -1,6 +1,7 @@
 package health
 
 import (
+       
"github.com/Comcast/traffic_control/traffic_monitor/experimental/common/log"
        
"github.com/Comcast/traffic_control/traffic_monitor/experimental/traffic_monitor/cache"
        traffic_ops "github.com/Comcast/traffic_control/traffic_ops/client"
 
@@ -32,25 +33,28 @@ func getNumber(key string, intface map[string]interface{}) 
(float64, error) {
 }
 
 func setError(newResult *cache.Result, err error) {
-       newResult.Errors = append(newResult.Errors, err)
+       newResult.Error = err
        newResult.Available = false
 }
 
 // Get the vitals to decide health on in the right format
 func GetVitals(newResult *cache.Result, prevResult *cache.Result, mc 
*traffic_ops.TrafficMonitorConfigMap) {
-
+       if newResult.Error != nil {
+               log.Errorf("cache_health.GetVitals() called with an errored 
Result!")
+               return
+       }
        // proc.loadavg -- we're using the 1 minute average (!?)
        // value looks like: "0.20 0.07 0.07 1/967 29536" (without the quotes)
        loadAverages := strings.Fields(newResult.Astats.System.ProcLoadavg)
        if len(loadAverages) > 0 {
                oneMinAvg, err := strconv.ParseFloat(loadAverages[0], 64)
                if err != nil {
-                       setError(newResult, fmt.Errorf("Error converting load 
average string: %v", err))
+                       setError(newResult, fmt.Errorf("Error converting load 
average string '%s': %v", newResult.Astats.System.ProcLoadavg, err))
                        return
                }
                newResult.Vitals.LoadAvg = oneMinAvg
        } else {
-               setError(newResult, fmt.Errorf("Can't make sense of'", 
newResult.Astats.System.ProcLoadavg, "'as a load average for", newResult.Id))
+               setError(newResult, fmt.Errorf("Can't make sense of '%s' as a 
load average for %s", newResult.Astats.System.ProcLoadavg, newResult.Id))
                return
        }
 
@@ -64,13 +68,11 @@ func GetVitals(newResult *cache.Result, prevResult 
*cache.Result, mc *traffic_op
                var err error
                newResult.Vitals.BytesOut, err = strconv.ParseInt(numbers[8], 
10, 64)
                if err != nil {
-                       setError(newResult, err)
                        setError(newResult, fmt.Errorf("Error converting 
BytesOut from procnetdev: %v", err))
                        return
                }
                newResult.Vitals.BytesIn, err = strconv.ParseInt(numbers[0], 
10, 64)
                if err != nil {
-                       setError(newResult, err)
                        setError(newResult, fmt.Errorf("Error converting 
BytesIn from procnetdev: %v", err))
                        return
                }
@@ -103,6 +105,8 @@ func EvalCache(result cache.Result, mc 
*traffic_ops.TrafficMonitorConfigMap) (bo
                return false, "set to OFFLINE"
        case status == "ONLINE":
                return true, "set to ONLINE"
+       case result.Error != nil:
+               return false, fmt.Sprintf("error: %v", result.Error)
        case result.Vitals.LoadAvg > 
mc.Profile[mc.TrafficServer[string(result.Id)].Profile].Parameters.HealthThresholdLoadAvg:
                return false, fmt.Sprintf("load average %f exceeds threshold 
%f", result.Vitals.LoadAvg, 
mc.Profile[mc.TrafficServer[string(result.Id)].Profile].Parameters.HealthThresholdLoadAvg)
        case result.Vitals.MaxKbpsOut < result.Vitals.KbpsOut:

http://git-wip-us.apache.org/repos/asf/incubator-trafficcontrol/blob/890465fb/traffic_monitor/experimental/traffic_monitor/manager/healthresult.go
----------------------------------------------------------------------
diff --git a/traffic_monitor/experimental/traffic_monitor/manager/healthresult.go b/traffic_monitor/experimental/traffic_monitor/manager/healthresult.go
index 01a7132..c2d0e12 100644
--- a/traffic_monitor/experimental/traffic_monitor/manager/healthresult.go
+++ b/traffic_monitor/experimental/traffic_monitor/manager/healthresult.go
@@ -103,17 +103,19 @@ func processHealthResult(cacheHealthChan <-chan 
cache.Result, toData todata.TODa
                fetchCount.Inc()
                var prevResult cache.Result
                healthResultHistory := 
healthHistory[enum.CacheName(healthResult.Id)]
-               // healthResultHistory := 
healthHistory.Get(enum.CacheName(healthResult.Id))
                if len(healthResultHistory) != 0 {
                        prevResult = 
healthResultHistory[len(healthResultHistory)-1]
                }
 
-               health.GetVitals(&healthResult, &prevResult, &monitorConfigCopy)
-               // healthHistory.Set(enum.CacheName(healthResult.Id), 
pruneHistory(append(healthHistory.Get(enum.CacheName(healthResult.Id)), 
healthResult), defaultMaxHistory))
+               if healthResult.Error == nil {
+                       health.GetVitals(&healthResult, &prevResult, 
&monitorConfigCopy)
+               }
+
                healthHistory[enum.CacheName(healthResult.Id)] = 
pruneHistory(append(healthHistory[enum.CacheName(healthResult.Id)], 
healthResult), cfg.MaxHealthHistory)
+
                isAvailable, whyAvailable := health.EvalCache(healthResult, 
&monitorConfigCopy)
                if localStates.Get().Caches[healthResult.Id].IsAvailable != 
isAvailable {
-                       log.Infof("Changing state for %s was: %t now: %t 
because %s errors: %v", healthResult.Id, prevResult.Available, isAvailable, 
whyAvailable, healthResult.Errors)
+                       log.Infof("Changing state for %s was: %t now: %t 
because %s error: %v", healthResult.Id, prevResult.Available, isAvailable, 
whyAvailable, healthResult.Error)
                        events.Add(Event{Time: time.Now().Unix(), Description: 
whyAvailable, Name: healthResult.Id, Hostname: healthResult.Id, Type: 
toDataCopy.ServerTypes[healthResult.Id].String(), Available: isAvailable})
                }
 

Reply via email to