TaylorCFrey commented on a change in pull request #6527:
URL: https://github.com/apache/trafficcontrol/pull/6527#discussion_r803359752
##########
File path: docs/source/development/traffic_monitor/traffic_monitor_api.rst
##########
@@ -402,6 +402,35 @@ Response Structure
TODO
+``/publish/DistributedPeerStates``
+==================================
+The health state information from all distributed peer Traffic Monitors.
+
+``GET``
+-------
+:Response Type: ?
+
+Request Structure
+"""""""""""""""""
+.. table:: Request Query Parameters
+
+	+--------------+---------+------------------------------------------------+
+	| Parameter    | Type    | Description                                    |
+	+==============+=========+================================================+
+	| ``hc``       | integer | The history count, number of items to display. |
+	+--------------+---------+------------------------------------------------+
+	| ``stats``    | string  | A comma separated list of stats to display.    |
+	+--------------+---------+------------------------------------------------+
+	| ``wildcard`` | boolean | Controls whether specified stats should be     |
+	|              |         | treated as partial strings.                    |
+	+--------------+---------+------------------------------------------------+
+
+Response Structure
+""""""""""""""""""
+
+TODO
Review comment:
I believe there are issues filed to add more information to the TODOs
for the TM APIs, is that correct?
##########
File path: lib/go-tc/traffic_monitor.go
##########
@@ -438,7 +438,7 @@ type TrafficMonitor struct {
Profile string `json:"profile"`
// Location is the Name of the Cache Group to which the Traffic Monitor
// belongs - called "Location" for legacy reasons.
- Location string `json:"location"`
+ Location string `json:"cachegroup"`
Review comment:
Since we are cleaning up the JSON object, can we remove the explanation
comment above as well?
##########
File path: traffic_monitor/config/config.go
##########
@@ -237,6 +235,9 @@ func (c *Config) UnmarshalJSON(data []byte) error {
if aux.TrafficOpsMaxRetryIntervalMs != nil {
c.TrafficOpsMaxRetryInterval =
time.Duration(*aux.TrafficOpsMaxRetryIntervalMs) * time.Millisecond
}
+ if c.StatPolling && c.DistributedPolling {
+ return errors.New("invalid configuration: stat_polling cannot
be enabled if distributed_polling is also enabled")
Review comment:
If they are both `false`, is that a state we need to account for?
##########
File path: traffic_monitor/datareq/crstate.go
##########
@@ -51,15 +58,36 @@ func srvTRState(params url.Values, localStates
peer.CRStatesThreadsafe, combined
}
}
- data, err := srvTRStateDerived(combinedStates, peerStates)
+ data, err := srvTRStateDerived(combinedStates, local &&
distributedPollingEnabled)
return data, http.StatusOK, err
}
-func srvTRStateDerived(combinedStates peer.CRStatesThreadsafe, peerStates
peer.CRStatesPeersThreadsafe) ([]byte, error) {
- return tc.CRStatesMarshall(combinedStates.Get())
+func srvTRStateDerived(combinedStates peer.CRStatesThreadsafe,
directlyPolledOnly bool) ([]byte, error) {
+ if !directlyPolledOnly {
Review comment:
For this `directlyPolledOnly` check, is the desire here to not _have_ to
filter if we already know it's not distributed? It seems like we could filter
each time, but perhaps we are concerned about performance?
Could we filter the `combinedStates` once when we initially receive the
results to avoid passing a boolean flag into the function?
##########
File path: traffic_monitor/peer/crstates.go
##########
@@ -36,7 +36,7 @@ type CRStatesThreadsafe struct {
// NewCRStatesThreadsafe creates a new CRStatesThreadsafe object safe for
multiple goroutine readers and a single writer.
func NewCRStatesThreadsafe() CRStatesThreadsafe {
- crs := tc.NewCRStates()
+ crs := tc.NewCRStates(8, 8)
Review comment:
Could we label these values so as to avoid "magic numbers"?
##########
File path: traffic_monitor/cache/data.go
##########
@@ -25,11 +25,6 @@ import (
"github.com/apache/trafficcontrol/lib/go-tc"
)
-// AvailableStatusReported is the status string returned by caches set to
Review comment:
Is this simply removed and not actually "put somewhere more generic"?
##########
File path: traffic_monitor/manager/monitorconfig.go
##########
@@ -337,6 +379,103 @@ func monitorConfigListen(
}
}
+// getCacheGroupsToPoll returns the name of this Traffic Monitor's cache group
+// and the set of cache groups it needs to poll.
+func getCacheGroupsToPoll(distributedPolling bool, hostname string, monitors
map[string]tc.TrafficMonitor,
+ caches map[string]tc.TrafficServer, allCacheGroups
map[string]tc.TMCacheGroup) (string, map[string]tc.TMCacheGroup, error) {
+ tmGroupSet := make(map[string]tc.TMCacheGroup)
+ cacheGroupSet := make(map[string]tc.TMCacheGroup)
+ tmGroupToPolledCacheGroups :=
make(map[string]map[string]tc.TMCacheGroup)
+ thisTMGroup := ""
+
+ for _, tm := range monitors {
+ if tm.HostName == hostname {
+ thisTMGroup = tm.Location
+ }
+ if tc.CacheStatusFromString(tm.ServerStatus) ==
tc.CacheStatusOnline {
+ tmGroupSet[tm.Location] = allCacheGroups[tm.Location]
+ if _, ok := tmGroupToPolledCacheGroups[tm.Location];
!ok {
+ tmGroupToPolledCacheGroups[tm.Location] =
make(map[string]tc.TMCacheGroup)
+ }
+ }
+ }
+ if thisTMGroup == "" {
+ return "", nil, fmt.Errorf("unable to find cache group for this
Traffic Monitor (%s) in monitoring config snapshot", hostname)
+ }
+
+ for _, c := range caches {
+ status := tc.CacheStatusFromString(c.ServerStatus)
+ if status == tc.CacheStatusOnline || status ==
tc.CacheStatusReported || status == tc.CacheStatusAdminDown {
+ cacheGroupSet[c.CacheGroup] =
allCacheGroups[c.CacheGroup]
+ }
+ }
+
+ if !distributedPolling {
+ return thisTMGroup, cacheGroupSet, nil
+ }
+
+ tmGroups := make([]string, 0, len(tmGroupSet))
+ for tg := range tmGroupSet {
+ tmGroups = append(tmGroups, tg)
+ }
+ sort.Strings(tmGroups)
+ cgs := make([]string, 0, len(cacheGroupSet))
+ for cg := range cacheGroupSet {
+ cgs = append(cgs, cg)
+ }
+ sort.Strings(cgs)
Review comment:
Is it necessary to sort the tmGroups and cacheGroupSets?
##########
File path: traffic_monitor/manager/monitorconfig.go
##########
@@ -337,6 +379,103 @@ func monitorConfigListen(
}
}
+// getCacheGroupsToPoll returns the name of this Traffic Monitor's cache group
+// and the set of cache groups it needs to poll.
+func getCacheGroupsToPoll(distributedPolling bool, hostname string, monitors
map[string]tc.TrafficMonitor,
+ caches map[string]tc.TrafficServer, allCacheGroups
map[string]tc.TMCacheGroup) (string, map[string]tc.TMCacheGroup, error) {
+ tmGroupSet := make(map[string]tc.TMCacheGroup)
+ cacheGroupSet := make(map[string]tc.TMCacheGroup)
+ tmGroupToPolledCacheGroups :=
make(map[string]map[string]tc.TMCacheGroup)
+ thisTMGroup := ""
+
+ for _, tm := range monitors {
+ if tm.HostName == hostname {
+ thisTMGroup = tm.Location
+ }
+ if tc.CacheStatusFromString(tm.ServerStatus) ==
tc.CacheStatusOnline {
+ tmGroupSet[tm.Location] = allCacheGroups[tm.Location]
+ if _, ok := tmGroupToPolledCacheGroups[tm.Location];
!ok {
+ tmGroupToPolledCacheGroups[tm.Location] =
make(map[string]tc.TMCacheGroup)
+ }
+ }
+ }
+ if thisTMGroup == "" {
+ return "", nil, fmt.Errorf("unable to find cache group for this
Traffic Monitor (%s) in monitoring config snapshot", hostname)
+ }
+
+ for _, c := range caches {
+ status := tc.CacheStatusFromString(c.ServerStatus)
+ if status == tc.CacheStatusOnline || status ==
tc.CacheStatusReported || status == tc.CacheStatusAdminDown {
+ cacheGroupSet[c.CacheGroup] =
allCacheGroups[c.CacheGroup]
+ }
+ }
+
+ if !distributedPolling {
+ return thisTMGroup, cacheGroupSet, nil
+ }
+
+ tmGroups := make([]string, 0, len(tmGroupSet))
+ for tg := range tmGroupSet {
+ tmGroups = append(tmGroups, tg)
+ }
+ sort.Strings(tmGroups)
+ cgs := make([]string, 0, len(cacheGroupSet))
+ for cg := range cacheGroupSet {
+ cgs = append(cgs, cg)
+ }
+ sort.Strings(cgs)
+ tmGroupCount := len(tmGroups)
+ var closest string
+ for tmi := 0; len(cgs) > 0; tmi = (tmi + 1) % tmGroupCount {
+ tmGroup := tmGroups[tmi]
+ closest, cgs = findAndRemoveClosestCachegroup(cgs,
allCacheGroups[tmGroup], allCacheGroups)
+ tmGroupToPolledCacheGroups[tmGroup][closest] =
allCacheGroups[closest]
+ }
+ return thisTMGroup, tmGroupToPolledCacheGroups[thisTMGroup], nil
+}
+
+func findAndRemoveClosestCachegroup(remainingCacheGroups []string, target
tc.TMCacheGroup, allCacheGroups map[string]tc.TMCacheGroup) (string, []string) {
+ shortestDistance := math.MaxFloat64
+ shortestIndex := -1
+ for i := 0; i < len(remainingCacheGroups); i++ {
+ distance := getDistance(target,
allCacheGroups[remainingCacheGroups[i]])
+ if distance < shortestDistance {
+ shortestDistance = distance
+ shortestIndex = i
+ }
+ }
+ closest := remainingCacheGroups[shortestIndex]
+ remainingCacheGroups = append(remainingCacheGroups[:shortestIndex],
remainingCacheGroups[shortestIndex+1:]...)
+ return closest, remainingCacheGroups
+}
+
+const meanEarthRadius = 6371.0
+const x = math.Pi / 180
+
+// toRadians converts degrees to radians.
+func toRadians(d float64) float64 {
+ return d * x
+}
+
+// getDistance gets the great circle distance in kilometers between x and y.
+func getDistance(x, y tc.TMCacheGroup) float64 {
Review comment:
Whoa, nice. I'll assume this is correct ;P Is this Haversine?
##########
File path: traffic_monitor/conf/traffic_monitor.cfg
##########
@@ -4,6 +4,8 @@
"peer_optimistic": true,
"peer_optimistic_quorum_min": 0,
"max_events": 200,
+ "stat_polling": true,
+ "distributed_polling": false,
Review comment:
Is this flag then necessary for all TMs? Or only for those that wish to
participate in distributed polling? Can you have some with it `false` and some
with it `true` or will all TMs then have to abide by one or the other methods?
##########
File path: traffic_monitor/manager/monitorconfig.go
##########
@@ -286,25 +307,46 @@ func monitorConfigListen(
}
peerSet := map[tc.TrafficMonitorName]struct{}{}
+ tmsByGroup := make(map[string][]tc.TrafficMonitor)
for _, srv := range monitorConfig.TrafficMonitor {
- if srv.HostName == staticAppData.Hostname {
+ if tc.CacheStatusFromString(srv.ServerStatus) !=
tc.CacheStatusOnline {
+ continue
+ }
+ tmsByGroup[srv.Location] =
append(tmsByGroup[srv.Location], srv)
+ }
+
+ for _, srv := range monitorConfig.TrafficMonitor {
+ if srv.HostName == staticAppData.Hostname ||
(cfg.DistributedPolling && srv.Location != thisTMGroup) {
continue
}
if tc.CacheStatusFromString(srv.ServerStatus) !=
tc.CacheStatusOnline {
continue
}
// TODO: the URL should be config driven. -jse
- url4 :=
fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.IP, srv.Port)
- url6 :=
fmt.Sprintf("http://[%s]:%d/publish/CrStates?raw", ipv6CIDRStrToAddr(srv.IP6),
srv.Port)
- peerURLs[srv.HostName] = poller.PollConfig{URL: url4,
URLv6: url6, Host: srv.FQDN} // TODO determine timeout.
+ peerURL :=
fmt.Sprintf("http://%s:%d/publish/CrStates?raw", srv.FQDN, srv.Port)
Review comment:
Do we no longer need IPv6?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]