TaylorCFrey commented on a change in pull request #6527:
URL: https://github.com/apache/trafficcontrol/pull/6527#discussion_r803359752



##########
File path: docs/source/development/traffic_monitor/traffic_monitor_api.rst
##########
@@ -402,6 +402,35 @@ Response Structure
 TODO
 
 
+``/publish/DistributedPeerStates``
+==================================
+The health state information from all distributed peer Traffic Monitors.
+
+``GET``
+-------
+:Response Type: ?
+
+Request Structure
+"""""""""""""""""
+.. table:: Request Query Parameters
+
+       
+--------------+---------+------------------------------------------------+
+       |  Parameter   | Type    |                  Description                 
  |
+       
+==============+=========+================================================+
+       | ``hc``       | integer | The history count, number of items to 
display. |
+       
+--------------+---------+------------------------------------------------+
+       | ``stats``    | string  | A comma separated list of stats to display.  
  |
+       
+--------------+---------+------------------------------------------------+
+       | ``wildcard`` | boolean | Controls whether specified stats should be   
  |
+       |              |         | treated as partial strings.                  
  |
+       
+--------------+---------+------------------------------------------------+
+
+Response Structure
+""""""""""""""""""
+
+TODO

Review comment:
       I believe there are issues filed to add more information to the TODOs 
for the TM APIs, is that correct?

##########
File path: lib/go-tc/traffic_monitor.go
##########
@@ -438,7 +438,7 @@ type TrafficMonitor struct {
        Profile string `json:"profile"`
        // Location is the Name of the Cache Group to which the Traffic Monitor
        // belongs - called "Location" for legacy reasons.
-       Location string `json:"location"`
+       Location string `json:"cachegroup"`

Review comment:
       Since we are cleaning up the JSON object, can we remove the explanation 
comment above as well?

##########
File path: traffic_monitor/config/config.go
##########
@@ -237,6 +235,9 @@ func (c *Config) UnmarshalJSON(data []byte) error {
        if aux.TrafficOpsMaxRetryIntervalMs != nil {
                c.TrafficOpsMaxRetryInterval = 
time.Duration(*aux.TrafficOpsMaxRetryIntervalMs) * time.Millisecond
        }
+       if c.StatPolling && c.DistributedPolling {
+               return errors.New("invalid configuration: stat_polling cannot 
be enabled if distributed_polling is also enabled")

Review comment:
       If they are both `false` is that state something we need to account for?

##########
File path: traffic_monitor/datareq/crstate.go
##########
@@ -51,15 +58,36 @@ func srvTRState(params url.Values, localStates 
peer.CRStatesThreadsafe, combined
                }
        }
 
-       data, err := srvTRStateDerived(combinedStates, peerStates)
+       data, err := srvTRStateDerived(combinedStates, local && 
distributedPollingEnabled)
 
        return data, http.StatusOK, err
 }
 
-func srvTRStateDerived(combinedStates peer.CRStatesThreadsafe, peerStates 
peer.CRStatesPeersThreadsafe) ([]byte, error) {
-       return tc.CRStatesMarshall(combinedStates.Get())
+func srvTRStateDerived(combinedStates peer.CRStatesThreadsafe, 
directlyPolledOnly bool) ([]byte, error) {
+       if !directlyPolledOnly {

Review comment:
       For this `directlyPolledOnly` check, is the desire here to not _have_ to 
filter if we already know it's not distributed? It seems like we could filter 
each time, but perhaps we are concerned about performance?
   
   Could we filter the `combinedStates` once when we initially receive the 
results to avoid passing a boolean flag into the function?

##########
File path: traffic_monitor/peer/crstates.go
##########
@@ -36,7 +36,7 @@ type CRStatesThreadsafe struct {
 
 // NewCRStatesThreadsafe creates a new CRStatesThreadsafe object safe for 
multiple goroutine readers and a single writer.
 func NewCRStatesThreadsafe() CRStatesThreadsafe {
-       crs := tc.NewCRStates()
+       crs := tc.NewCRStates(8, 8)

Review comment:
       Could we label these values so as to avoid "magic numbers"?

##########
File path: traffic_monitor/cache/data.go
##########
@@ -25,11 +25,6 @@ import (
        "github.com/apache/trafficcontrol/lib/go-tc"
 )
 
-// AvailableStatusReported is the status string returned by caches set to

Review comment:
       Is this simply removed and not actually "put somewhere more generic"?

##########
File path: traffic_monitor/manager/monitorconfig.go
##########
@@ -337,6 +379,103 @@ func monitorConfigListen(
        }
 }
 
+// getCacheGroupsToPoll returns the name of this Traffic Monitor's cache group
+// and the set of cache groups it needs to poll.
+func getCacheGroupsToPoll(distributedPolling bool, hostname string, monitors 
map[string]tc.TrafficMonitor,
+       caches map[string]tc.TrafficServer, allCacheGroups 
map[string]tc.TMCacheGroup) (string, map[string]tc.TMCacheGroup, error) {
+       tmGroupSet := make(map[string]tc.TMCacheGroup)
+       cacheGroupSet := make(map[string]tc.TMCacheGroup)
+       tmGroupToPolledCacheGroups := 
make(map[string]map[string]tc.TMCacheGroup)
+       thisTMGroup := ""
+
+       for _, tm := range monitors {
+               if tm.HostName == hostname {
+                       thisTMGroup = tm.Location
+               }
+               if tc.CacheStatusFromString(tm.ServerStatus) == 
tc.CacheStatusOnline {
+                       tmGroupSet[tm.Location] = allCacheGroups[tm.Location]
+                       if _, ok := tmGroupToPolledCacheGroups[tm.Location]; 
!ok {
+                               tmGroupToPolledCacheGroups[tm.Location] = 
make(map[string]tc.TMCacheGroup)
+                       }
+               }
+       }
+       if thisTMGroup == "" {
+               return "", nil, fmt.Errorf("unable to find cache group for this 
Traffic Monitor (%s) in monitoring config snapshot", hostname)
+       }
+
+       for _, c := range caches {
+               status := tc.CacheStatusFromString(c.ServerStatus)
+               if status == tc.CacheStatusOnline || status == 
tc.CacheStatusReported || status == tc.CacheStatusAdminDown {
+                       cacheGroupSet[c.CacheGroup] = 
allCacheGroups[c.CacheGroup]
+               }
+       }
+
+       if !distributedPolling {
+               return thisTMGroup, cacheGroupSet, nil
+       }
+
+       tmGroups := make([]string, 0, len(tmGroupSet))
+       for tg := range tmGroupSet {
+               tmGroups = append(tmGroups, tg)
+       }
+       sort.Strings(tmGroups)
+       cgs := make([]string, 0, len(cacheGroupSet))
+       for cg := range cacheGroupSet {
+               cgs = append(cgs, cg)
+       }
+       sort.Strings(cgs)

Review comment:
       Is it necessary to sort the tmGroups and cacheGroupSets?

##########
File path: traffic_monitor/manager/monitorconfig.go
##########
@@ -337,6 +379,103 @@ func monitorConfigListen(
        }
 }
 
+// getCacheGroupsToPoll returns the name of this Traffic Monitor's cache group
+// and the set of cache groups it needs to poll.
+func getCacheGroupsToPoll(distributedPolling bool, hostname string, monitors 
map[string]tc.TrafficMonitor,
+       caches map[string]tc.TrafficServer, allCacheGroups 
map[string]tc.TMCacheGroup) (string, map[string]tc.TMCacheGroup, error) {
+       tmGroupSet := make(map[string]tc.TMCacheGroup)
+       cacheGroupSet := make(map[string]tc.TMCacheGroup)
+       tmGroupToPolledCacheGroups := 
make(map[string]map[string]tc.TMCacheGroup)
+       thisTMGroup := ""
+
+       for _, tm := range monitors {
+               if tm.HostName == hostname {
+                       thisTMGroup = tm.Location
+               }
+               if tc.CacheStatusFromString(tm.ServerStatus) == 
tc.CacheStatusOnline {
+                       tmGroupSet[tm.Location] = allCacheGroups[tm.Location]
+                       if _, ok := tmGroupToPolledCacheGroups[tm.Location]; 
!ok {
+                               tmGroupToPolledCacheGroups[tm.Location] = 
make(map[string]tc.TMCacheGroup)
+                       }
+               }
+       }
+       if thisTMGroup == "" {
+               return "", nil, fmt.Errorf("unable to find cache group for this 
Traffic Monitor (%s) in monitoring config snapshot", hostname)
+       }
+
+       for _, c := range caches {
+               status := tc.CacheStatusFromString(c.ServerStatus)
+               if status == tc.CacheStatusOnline || status == 
tc.CacheStatusReported || status == tc.CacheStatusAdminDown {
+                       cacheGroupSet[c.CacheGroup] = 
allCacheGroups[c.CacheGroup]
+               }
+       }
+
+       if !distributedPolling {
+               return thisTMGroup, cacheGroupSet, nil
+       }
+
+       tmGroups := make([]string, 0, len(tmGroupSet))
+       for tg := range tmGroupSet {
+               tmGroups = append(tmGroups, tg)
+       }
+       sort.Strings(tmGroups)
+       cgs := make([]string, 0, len(cacheGroupSet))
+       for cg := range cacheGroupSet {
+               cgs = append(cgs, cg)
+       }
+       sort.Strings(cgs)
+       tmGroupCount := len(tmGroups)
+       var closest string
+       for tmi := 0; len(cgs) > 0; tmi = (tmi + 1) % tmGroupCount {
+               tmGroup := tmGroups[tmi]
+               closest, cgs = findAndRemoveClosestCachegroup(cgs, 
allCacheGroups[tmGroup], allCacheGroups)
+               tmGroupToPolledCacheGroups[tmGroup][closest] = 
allCacheGroups[closest]
+       }
+       return thisTMGroup, tmGroupToPolledCacheGroups[thisTMGroup], nil
+}
+
+func findAndRemoveClosestCachegroup(remainingCacheGroups []string, target 
tc.TMCacheGroup, allCacheGroups map[string]tc.TMCacheGroup) (string, []string) {
+       shortestDistance := math.MaxFloat64
+       shortestIndex := -1
+       for i := 0; i < len(remainingCacheGroups); i++ {
+               distance := getDistance(target, 
allCacheGroups[remainingCacheGroups[i]])
+               if distance < shortestDistance {
+                       shortestDistance = distance
+                       shortestIndex = i
+               }
+       }
+       closest := remainingCacheGroups[shortestIndex]
+       remainingCacheGroups = append(remainingCacheGroups[:shortestIndex], 
remainingCacheGroups[shortestIndex+1:]...)
+       return closest, remainingCacheGroups
+}
+
+const meanEarthRadius = 6371.0
+const x = math.Pi / 180
+
+// toRadians converts degrees to radians.
+func toRadians(d float64) float64 {
+       return d * x
+}
+
+// getDistance gets the great circle distance in kilometers between x and y.
+func getDistance(x, y tc.TMCacheGroup) float64 {

Review comment:
       Whoa, nice. I'll asume this is correct ;P Is this Haversine?

##########
File path: traffic_monitor/conf/traffic_monitor.cfg
##########
@@ -4,6 +4,8 @@
        "peer_optimistic": true,
        "peer_optimistic_quorum_min": 0,
        "max_events": 200,
+       "stat_polling": true,
+       "distributed_polling": false,

Review comment:
       Is this flag then necessary for all TMs? Or only for those that wish to 
participate in distributed polling? Can you have some with it `false` and some 
with it `true` or will all TMs then have to abide by one or the other methods?

##########
File path: traffic_monitor/manager/monitorconfig.go
##########
@@ -286,25 +307,46 @@ func monitorConfigListen(
                }
 
                peerSet := map[tc.TrafficMonitorName]struct{}{}
+               tmsByGroup := make(map[string][]tc.TrafficMonitor)
                for _, srv := range monitorConfig.TrafficMonitor {
-                       if srv.HostName == staticAppData.Hostname {
+                       if tc.CacheStatusFromString(srv.ServerStatus) != 
tc.CacheStatusOnline {
+                               continue
+                       }
+                       tmsByGroup[srv.Location] = 
append(tmsByGroup[srv.Location], srv)
+               }
+
+               for _, srv := range monitorConfig.TrafficMonitor {
+                       if srv.HostName == staticAppData.Hostname || 
(cfg.DistributedPolling && srv.Location != thisTMGroup) {
                                continue
                        }
                        if tc.CacheStatusFromString(srv.ServerStatus) != 
tc.CacheStatusOnline {
                                continue
                        }
                        // TODO: the URL should be config driven. -jse
-                       url4 := 
fmt.Sprintf("http://%s:%d/publish/CrStates?raw";, srv.IP, srv.Port)
-                       url6 := 
fmt.Sprintf("http://[%s]:%d/publish/CrStates?raw";, ipv6CIDRStrToAddr(srv.IP6), 
srv.Port)
-                       peerURLs[srv.HostName] = poller.PollConfig{URL: url4, 
URLv6: url6, Host: srv.FQDN} // TODO determine timeout.
+                       peerURL := 
fmt.Sprintf("http://%s:%d/publish/CrStates?raw";, srv.FQDN, srv.Port)

Review comment:
       Do we no longer need IPv6?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to