Erik Zachte has uploaded a new change for review. https://gerrit.wikimedia.org/r/76518
Change subject: final tweaks on new udp msg loss report, before publishing ...................................................................... final tweaks on new udp msg loss report, before publishing Change-Id: If9f0cb6eae424c4076ffc7076b99f1f46aac022f --- M squids/bash/SquidLoadScan.sh M squids/perl/SquidLoadScan.pl 2 files changed, 286 insertions(+), 87 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/wikistats refs/changes/18/76518/1 diff --git a/squids/bash/SquidLoadScan.sh b/squids/bash/SquidLoadScan.sh index 05681a2..597c6c9 100755 --- a/squids/bash/SquidLoadScan.sh +++ b/squids/bash/SquidLoadScan.sh @@ -10,8 +10,11 @@ squids=$wikistats/squids perl=$squids/perl csv=$squids/csv - +reports=$squids/reports cd $perl -nice perl SquidLoadScan.pl -i $csv -o $csv/load +# -i input +# -o csv output +# -h html output +nice perl SquidLoadScan.pl -i $csv -o $csv/load -h $reports/load diff --git a/squids/perl/SquidLoadScan.pl b/squids/perl/SquidLoadScan.pl index c8b63a6..e370368 100755 --- a/squids/perl/SquidLoadScan.pl +++ b/squids/perl/SquidLoadScan.pl @@ -12,12 +12,25 @@ # # delta is difference in sequence number beteen consecutive udp messages from same squid # should ideally be 1000 in a 1:1000 sampled log, but small fluctuations are normal -# avg gap between sequence numbers > 1010: there is probably packet loss +# avg gap between sequence numbers > 1010: there is probably packet loss +# +# 2013 July: added html report +# +# comment by Andrew: In conjunction, there are similar ganglia metrics of these numbers on each of the udp2log boxes. Example: http://bit.ly/13pkK3e +# You can see a similar breakdown of packet_loss_average per role. These roles are defined by the pybal config: http://noc.wikimedia.org/pybal/ +# The packet_loss_average metric is sampled at a 1/10 level instead of 1/1000, so it will be slightly more accurate. However, these metrics don't weight anything, so if there is any loss from a role that has very few requests, the average will be skewed. +# Having both of these available for troubleshooting is very useful. +# reply by Erik Zachte: +# > However, these metrics don't weight anything, so if there is any loss from a role that has very few requests, the average will be skewed. +# Yes, that is precisely what I thought was missing. So what my report adds is the bottom line: "how much of x% drop in MoM page views can be attributed to msg loss?" +# And as we have server clusters on hot standby, being fed a trickle of data (as I understood long ago to keep caches up to date), their contribution to overall loss would be minimal. +# But seeing them in red could give early warning that we would have an issue when they would become primary server. + use Time::Local ; use Getopt::Std ; use File::Path ; - getopt ("io", \%options) ; + getopt ("ioh", \%options) ; $| = 1; # flush screen output @@ -29,12 +42,21 @@ die "Input path '$path_csv_in' not found (squids csv top folder)" if ! -d $path_csv_in ; $path_csv_out = $options {'o'} ; - die "Specify output path as -o [path]" if $path_csv_out eq '' ; + die "Specify csv output path as -o [path]" if $path_csv_out eq '' ; + + $path_html_out = $options {'h'} ; + die "Specify html output path as -h [path]" if $path_html_out eq '' ; if (! -d $path_csv_out) { mkpath $path_csv_out ; die "Path '$path_csv_out' could not be created" if ! -d $path_csv_out ; + } + + if (! -d $path_html_out) + { + mkpath $path_html_out ; + die "Path '$path_html_out' could not be created" if ! -d $path_html_out ; } # read all files on squid log aggregator with hourly counts for @@ -44,6 +66,7 @@ &ReadData ; &ProcessData ; + &CalcHourlyEvents ; &BuildListDaysHours ; &WriteHourlyAveragedDeltaSequenceNumbers ; @@ -129,52 +152,59 @@ elsif ($squid =~ wikimedia) { $location = 'pmpta' ; } - # roles taken from CommonSettings.php - $role = 'role?' ; + # roles mostly taken from http://noc.wikimedia.org/conf/highlight.php?file=squid.php (roles for older servers retained from elsewhere) + $role = '' ; if ($squid =~ /^ssl/) - { $role = 'https' ; } - elsif (($name eq 'sq37') or - ($name ge 'sq59' and $name le 'sq66') or - ($name ge 'sq71' and $name le 'sq78') or - ($name ge 'cp1006' and $name le 'cp1020') or - ($name ge 'knsq23' and $name le 'knsq29') or - ($name ge 'amssq31' and $name le 'amssq46')) - { $role = 'text' ; } - elsif ( $name eq 'sq67' or $name eq 'sq68' or - $name eq 'arsenic' or $name eq 'niobium' or - $name eq 'cp3001' or $name eq 'cp3002' or - ($name ge 'cp3019' and $name le 'cp3022')) - { $role = 'bits' ; } - elsif (($name eq 'sq33' or $name eq 'sq34' or $name eq 'sq36') or - ($name ge 'cp1001' and $name le 'cp1005')) - { $role = 'API' ; } - elsif (($name ge 'sq41' and $name le 'sq45') or - ($name ge 'sq48' and $name le 'sq58') or - ($name ge 'sq79' and $name le 'sq86') or - ($name ge 'cp1021' and $name le 'cp1036') or - ($name ge 'amssq47' and $name le 'amssq62') or - ($name ge 'knsq16' and $name le 'knsq22')) - { $role = 'upload' ; } - elsif ($name ge 'cp1041' and $name le 'cp1044') - { $role = 'mobile' ; } - elsif (($name ge 'cp1021' and $name le 'cp1036') or - ($name ge 'cp3003' and $name le 'cp3010')) - { $role = 'cache' ; } - elsif ($name ge 'sq31' and $name le 'sq47') - { $role = '~RIP1' ; $location = '-' ; } - elsif ($name ge 'knsq1' and $name le 'knsq9') - { $role = '~RIP2' ; $location = '-' ; } - elsif ($name =~ /^mobile/) - { $role = '~RIP3' ; $location = '-' ; } - elsif ($name eq 'cp1039' or $name eq 'cp1040') - { $role = '~RIP4' ; $location = '-' ; } - elsif ($name eq 'gurvin' or $name eq 'maerlant' or $name eq yvon) - { $role = '~RIP5' ; $location = '-' ; } - elsif ($name eq 'marmontel') - { $role = 'unknown1' ; } + { $role = 'ssl' ; } + # not maximaly efficient but compact + # instead of specifying range which implies nested condition here's a simple and condense iteration: + $role = 'text' if $location eq 'pmpta' and $name =~ /sq37|sq59|sq60|sq61|sq62|sq63|sq64|sq65|sq66|sq71|sq72|sq73|sq74|sq75|sq76|sq77|sq78/ ; + $role = 'text' if $location eq 'eqiad' and $name ge 'cp1006' and $name le 'cp1020' ; + $role = 'text' if $location eq 'esams' and $name ge 'knsq23' and $name le 'knsq29' ; + $role = 'text' if $location eq 'knams' and $name ge 'knsq23' and $name le 'knsq29' ; + $role = 'text' if $location eq 'esams' and $name ge 'amssq31' and $name le 'amssq46' ; - next if $role eq 'text' and $location eq 'pmpta' ; + $role = 'bits' if $location eq 'pmpta' and $name =~ /sq67|sq68|sq69|sq70/ ; + $role = 'bits' if $location eq 'eqiad' and $name =~ /arsenic|mobium|cp1056|cp1057|cp1069|cp1070/ ; + $role = 'bits' if $location eq 'esams' and $name =~ /cp3019|cp3020|cp3021|cp3022/ ; + + $role = 'api' if $location eq 'pmpta' and $name =~ /sq33|sq34|sq36/ ; + $role = 'api' if $location eq 'eqiad' and $name =~ /cp1001|cp1002|cp1003|cp1004|cp1005/ ; + + $role = 'upload' if $location eq 'pmpta' and ( ($name ge 'sq41' and $name le 'sq45') or ($name ge 'sq48' and $name le 'sq58') or ($name ge 'sq79' and $name le 'sq86') ) ; + $role = 'upload' if $location eq 'eqiad' and ( ($name ge 'cp1021' and $name le 'cp1036') or ($name ge 'cp1048' and $name le 'cp1051') or ($name ge 'cp1061' and $name le 'cp1064') or $name eq 'dysprosium' ) ; + $role = 'upload' if $location eq 'esams' and ( ($name ge 'cp3003' and $name le 'cp3010') or ($name ge 'knsq16' and $name le 'knsq22') ) ; + $role = 'upload' if $location eq 'knams' and $name ge 'knsq16' and $name le 'knsq22' ; + $role = 'upload' if $location eq 'esams' and $name ge 'amssq47' and $name le 'amssq62' ; + + $role = 'mobile' if $location eq 'pmpta' and $name =~ /cp1041|cp1042|cp1043|cp1044/ ; # http://noc.wikimedia.org/conf/highlight.php?file=squid.php ?? not live config? + $role = 'mobile' if $location eq 'eqiad' and $name =~ /cp1041|cp1042|cp1043|cp1044|cp1046|cp1047|cp1059|cp1060/ ; + $role = 'mobile' if $location eq 'esams' and $name =~ /cp3011|cp3012|cp3013|cp3014/ ; + + $role = 'varnish' if $location eq 'eqiad' and ( ($name ge 'cp1037' and $name le 'cp1040') or ($name ge 'cp1052' and $name le 'cp1055') or ($name ge 'cp1065' and $name le 'cp1068') ) ; + if ($role eq '') # old code predates squid.php + { + if ($name ge 'sq31' and $name le 'sq47') + { $role = '~RIP1' ; } # $location = '-' ; } + + elsif ($name ge 'knsq1' and $name le 'knsq9') + { $role = '~RIP2' ; } # $location = '-' ; } + + elsif ($name =~ /^mobile/) + { $role = '~RIP3' ; } # $location = '-' ; } + + elsif ($name eq 'cp1039' or $name eq 'cp1040') + { $roe = '~RIP4' ; } # $location = '-' ; } + + elsif ($name eq 'gurvin' or $name eq 'maerlant' or $name eq yvon) + { $role = '~RIP5' ; } # $location = '-' ; } + + else + { $role = 'zrole?' ; } # z -> sort last, but before RIP + } + + next if $location eq 'pmpta' and $name =~ /cp1039|cp1040/ ; # too old, one data point $roles_locations {"$role/$location"} ++ ; ($squid3 = $squid) =~ s/\.wikimedia\.org// ; @@ -225,6 +255,22 @@ $all_regular_squids_active {"$role,$date_yyyy_mm_dd,$hour"} ++ ; } close CSV ; +} + +sub CalcHourlyEvents +{ + foreach $squid_set (sort keys %squid_sets) + { + foreach $month (sort keys %months) + { + $key = "$squid_set,$month" ; + if ($squid_set_hours_month {$key} > 0) + { $squid_set_hourly_events {$key} = sprintf ("%.0f", $squid_set_events_month {$key} / $squid_set_hours_month {$key}) ; } + + if ( $squid_set_hourly_events {$key} > $squid_set_hourly_events_hi {$squid_set} ) + { $squid_set_hourly_events_hi {$squid_set} = $squid_set_hourly_events {$key} ; } + } + } } sub BuildListDaysHours @@ -314,28 +360,73 @@ # monthly data per squid set, first average hourly delta between sequence numbers, then hourly number of events sub WriteMonthlyMetricsPerSquidSet { - open CSV , '>', "$path_csv_out/SquidDataMonthlyPerSquidSet.csv" ; - - print CSV "Role taken from CommonSettings.php\n\n" ; + open HTML, '>', "$path_html_out/SquidDataMonthlyPerSquidSet.htm" ; + print HTML "<html>\n<head>\n<title>Monthly avg msg loss per squid set</title>\n" . + "<style type='text/css'>\n" . + "<!--\n" . + "td {white-space:nowrap; text-align:right; background-color:#DDD; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:10px}\n" . + "th {white-space:nowrap; text-align:right; background-color:#BBB; padding-left:2px; padding-right:2px; padding-top:1px;padding-bottom:0px ; font-size:10px}\n" . + "td.improbable {background-color:#AAA; color:#B0B}\n" . + "td.good {background-color:#4F4; color:#000}\n" . + "td.mediocre {background-color:#FF4; color:#000}\n" . + "td.bad {background-color:#F00; color:#FFF}\n" . + "td.insignificant {background-color:#AAA; color:#666}\n" . + "td.msg_loss_good {background-color:#FFF; color:#080}\n" . + "td.msg_loss_mediocre {background-color:#FFF; color:#FA0}\n" . + "td.msg_loss_bad {background-color:#FFF; color:#F00}\n" . + "-->\n" . + "</style>\n" . + "</head><body>\n" ; + + print HTML "<h3>udp2log message loss, and traffic volume, based on 1:1000 sampled squid logs</h3>\n" ; + print HTML "<p>Server roles mostly taken from <a href='http://noc.wikimedia.org/conf/highlight.php?file=squid.php'>squid.php</a><p>\n" ; + + open CSV, '>', "$path_csv_out/SquidDataMonthlyPerSquidSet.csv" ; + print CSV "Server roles mostly taken from http://noc.wikimedia.org/conf/highlight.php?file=squid.php<small>\n\n" ; foreach $role_location (sort keys %roles_locations) { - $line = "$role_location:" ; + $line_csv = "$role_location:" ; + ($role_location2 = $role_location) =~ s/zrole/role/ ; # z was added to sort last before RIP + $line_html = "<b>$role_location2:</b>" ; %servers = %{$servers_found {$role_location}} ; foreach $server (sort keys %servers) - { $line .= " $server" ; } - $line =~ s/~// ; # ~ just for sorting - print CSV "$line\n" ; + { + $line_csv .= " $server" ; + $line_html .= "$server, " ; + } + $line_csv =~ s/~// ; # ~ just for sorting + print CSV "$line_csv\n" ; + print HTML "<small>$line_html</small><br>\n" ; } + print HTML "</small>\n" ; print CSV "\n" ; - print CSV "\nAverage delta in sequence numbers per squid per active hour \n\n" ; + print HTML "<p><b>Legend</b><table border=1>\n" ; + print HTML "<tr><td class='good'>Packet loss less than 0.2% (avg gap < 1020)</td></tr>\n" ; + print HTML "<tr><td class='mediocre'>Packet loss between 0.2% and 0.5% (1020 <= avg gap < 1052)</td></tr>\n" ; + print HTML "<tr><td class='bad'>Packet loss larger than 0.5% (avg gap >= 1052)</td></tr>\n" ; + print HTML "<tr><td class='insignificant'>Insignificant message volume: less than 10 (sampled) msgs per hour</td></tr>\n" ; + print HTML "<tr><td class='improbable'>Sequence numbers totally broken</td></tr>\n" ; + print HTML "</table>\n" ; + print HTML "<p><small>Msg loss is weighed average over all servers, excluding those where sequence numbers are broken (weighed = taking into account message volume)</small>\n" ; + print HTML "<br><small>Other causes of data loss than UDP msg loss are not covered by this report (e.g. total outage for several hours)</small><br> \n" ; + print HTML "<table border=1>\n" ; - $line1 = '' ; - $line2 = '' ; - $line3 = "month" ; + print CSV "\nAverage gap between in udp msg sequence numbers per squid per active hour (ideally should be 1000 in 1:1000 sampled log)\n\n" ; + + $line_csv_1 = '' ; + $line_csv_2 = '' ; + $line_csv_3 = "month" ; + + $line_html_1 = "<th> </th><th> </th>" ; + $line_html_2 = "<th> </th><th> </th>" ; + $line_html_3 = "<th>month</th><th>msg loss</th>" ; + foreach $squid_set (sort keys %squid_sets) { + next if $squid_set_hourly_events_hi {$squid_set} < 25 ; # skip columns for servers with hardly any throughput ever + if ($squid_sets_lo {$squid_set} eq $squid_sets_hi {$squid_set}) { $squid_range = $squid_sets_lo {$squid_set} ; } else @@ -347,49 +438,154 @@ ($location = $squid_range) =~ s/\..*$// ; $location =~ s/^[^\/]*\/// ; ($range = $squid_range) =~ s/^[^\.]*\.// ; - $line1 .= ",$role" ; - $line2 .= ",$location" ; - $line3 .= ",$range" ; - } - print CSV "$line1\n" ; - print CSV "$line2\n" ; - print CSV "$line3\n" ; + $range =~ s/deployment-cache/dpl-ca/ ; - foreach $month (sort keys %months) - { - $line = $month ; - foreach $squid_set (sort keys %squid_sets) - { - $key = "$squid_set,$month" ; - if ($squid_set_hours_month {$key} == 0) - { $line .= "," ; } - else - { $line .= "," . sprintf ("%.0f", $squid_set_delta_month {$key} / $squid_set_hours_month {$key}) ; } - } - print CSV "$line\n" ; - } + $role =~ s/zrole/role/ ; # z was to sort last - print CSV "\n\nAverage events per squid per active hour in 1:1000 sampled log\n\n" ; + $line_csv_1 .= ",$role" ; + $line_csv_2 .= ",$location" ; + $line_csv_3 .= ",$range" ; - print CSV "$line1\n" ; - print CSV "$line2\n" ; - print CSV "$line3\n" ; + $line_html_1 .= "<th>$role</th>" ; + $line_html_2 .= "<th>$location</th>" ; + $line_html_3 .= "<th>$range</th>" ; + } + print CSV "$line_csv_1\n" ; + print CSV "$line_csv_2\n" ; + print CSV "$line_csv_3\n" ; + + print HTML "<tr><td colspan=99 style='text-align:left'> <br><b>UDP message loss: average gap between sequence numbers per squid (should be close to 1000 in 1:1000 sampled log)</b><p><small>For calculation of message loss percentage from average gap between sequence numbers see <a href='#calc'>bottom of page</a></small><br> </b></td></tr>\n" ; + print HTML "<tr>$line_html_1</tr>\n" ; + print HTML "<tr>$line_html_2</tr>\n" ; + print HTML "<tr>$line_html_3</tr>\n" ; foreach $month (sort keys %months) { - $line = $month ; + $line_csv = $month ; + $line_html = "<th>$month</th>XXX" ; + $total_avg_gap = 0 ; + $count_avg_gap = 0 ; foreach $squid_set (sort keys %squid_sets) { + next if $squid_set_hourly_events_hi {$squid_set} < 25 ; # skip columns for servers with hardly any throughput ever $key = "$squid_set,$month" ; if ($squid_set_hours_month {$key} == 0) - { $line .= "," ; } + { + $line_csv .= "," ; + $line_html .= "<td> </td>" ; + } else - { $line .= "," . sprintf ("%.0f", $squid_set_events_month {$key} / $squid_set_hours_month {$key}) ; } + { + $line_csv .= "," . sprintf ("%.0f", $squid_set_delta_month {$key} / $squid_set_hours_month {$key}) ; + $class = "" ; + + $avg_gap = $squid_set_delta_month {$key} / $squid_set_hours_month {$key} ; + $msg_per_hr = sprintf ("%.0f", $squid_set_events_month {$key} / $squid_set_hours_month {$key}) ; + + if ($msg_per_hr < 10) + { $class = "insignificant" ; } + elsif ((($avg_gap < 950) or ($avg_gap > 2000)) || ($squid_set =~ /^ssl/)) + { $class = "improbable" ; } + else + { + $count_avg_gap += $squid_sets {$squid_set} * $msg_per_hr ; + $total_avg_gap += $avg_gap * $squid_sets {$squid_set} * $msg_per_hr ; + if ($avg_gap < 1020) + { $class = "good" ; } + elsif ($avg_gap < 1040) + { $class = "mediocre" ; } + else + { $class = "bad" ; } + } + $avg_gap = sprintf ("%.0f", $avg_gap) ; + $line_html .= "<td class='$class'>$avg_gap</td>" ; + } } - print CSV "$line\n" ; + print CSV "$line_csv\n" ; + + $weighed_avg_gap = '-' ; + if ($count_avg_gap > 0) + { $weighed_avg_gap = sprintf ("%.0f", $total_avg_gap / $count_avg_gap) ; } + $msg_loss_perc = '-' ; + if ( $weighed_avg_gap != 0 ) + { $msg_loss_perc = sprintf ("%.1f\%", 100 - 100 * (1000 / $weighed_avg_gap)) ; } # see below at $calc for explanation + + if ($msg_loss_perc > 3) + { $class = "msg_loss_bad" ; } + elsif ($msg_loss_perc > 1) + { $class = "msg_loss_mediocre" ; } + else + { $class = "msg_loss_good" ; } + + $line_html =~ s/XXX/<td class='$class'><b>$msg_loss_perc<\/b><\/td>/ ; + print HTML "<tr>$line_html</tr>\n" ; + } + + $line_html_3 =~ s/msg loss/ / ; + + print CSV "\n\nMessage volume: average events per squid per active hour in 1:1000 sampled log\n\n" ; + + print CSV "$line_csv_1\n" ; + print CSV "$line_csv_2\n" ; + print CSV "$line_csv_3\n" ; + + print HTML "<tr><td colspan=99 style='text-align:left'> <br><b>Traffic volume: average events per squid per active hour in 1:1000 sampled log <br> </b></td></tr>\n" ; + print HTML "<tr>$line_html_1</tr>\n" ; + print HTML "<tr>$line_html_2</tr>\n" ; + print HTML "<tr>$line_html_3</tr>\n" ; + + foreach $month (sort keys %months) + { + $line_csv = $month ; + $line_html = "<th>$month</th><td> </td>" ; + foreach $squid_set (sort keys %squid_sets) + { + next if $squid_set_hourly_events_hi {$squid_set} < 25 ; # skip columns for servers with hardly any throughput ever + + $key = "$squid_set,$month" ; + if ($squid_set_hours_month {$key} == 0) + { + $line_csv .= "," ; + $line_html .= "<td> </td>" ; + } + else + { + $hourly_events = $squid_set_hourly_events {$key} ; + $line_csv .= ",$hourly_events" ; + $line_html .= "<td>$hourly_events</td>" ; + } + } + print CSV "$line_csv\n" ; + print HTML "<tr>$line_html</tr>\n" ; } close CSV ; + + print HTML "</table>\n" ; + + $calc = <<__calc__ ; +<a name=calc id=calc></a><small> +<p><b>Calculation of msg loss percentage:</b> +<p>Assume total volume of messages per hour per server is <b>v</b> +<br>We expect average gap between messages per server is 1000 +<br>We see avarage gap is <b>g</b> +<p>Msg received percentage is 100 x actual volume / expected volume +<br>Msg loss percentage = 100 - msg rcvd percentage +<p>Actual volume = v / g +<br>Expected volume = v / 1000 +<p>Msg rcvd ratio = actual volume / expected volume = (v/g)/(v/1000) = (v/g) x (1000/v)=1000/g +<br>Msg loss perc = 100 - 100 x msg rcvd ratio = 100 - 100 (1000/g) +<p>Examples: +<br> g = 1000 => msg loss perc = 100 - 100 (1000/1000) = 0 % +<br> g = 2000 => msg loss perc = 100 - 100 (1000/2000) = 50 % +<p> g = 1050 => msg loss perc = 100 - 100 (1000/1050) = 4.7% +<br> g = 1100 => msg loss perc = 100 - 100 (1000/1100) = 9.1% +</small> +__calc__ + + print HTML $calc ; + print HTML "</body></html>\n" ; + close HTML ; } sub days_in_month -- To view, visit https://gerrit.wikimedia.org/r/76518 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: If9f0cb6eae424c4076ffc7076b99f1f46aac022f Gerrit-PatchSet: 1 Gerrit-Project: analytics/wikistats Gerrit-Branch: master Gerrit-Owner: Erik Zachte <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
