Jcrespo has uploaded a new change for review.
https://gerrit.wikimedia.org/r/253665
Change subject: [WIP] Use heartbeat when possible to check slave lag
......................................................................
[WIP] Use heartbeat when possible to check slave lag
Bug: T112473 T114752
Change-Id: I2d229173a0fdf5d91e6c95b4a6cb1335fa56f317
---
M files/icinga/check_mariadb.pl
1 file changed, 52 insertions(+), 22 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/65/253665/1
diff --git a/files/icinga/check_mariadb.pl b/files/icinga/check_mariadb.pl
index af0f0d5..a1d3479 100755
--- a/files/icinga/check_mariadb.pl
+++ b/files/icinga/check_mariadb.pl
@@ -20,12 +20,15 @@
my $user = "";
my $pass = "";
my $sock = "";
+my $master_server_id = "";
my $sql_lag_warn = 30;
my $sql_lag_crit = 60;
# Warn when IO or SQL stopped cleanly (no errno)
my $warn_stopped = 0;
+
+my $heartbeat_table = 'heartbeat.heartbeat';
my @vars = ();
@@ -70,6 +73,10 @@
elsif ($arg =~ /^--no-warn-stopped$/)
{
$warn_stopped = 0;
+ }
+ elsif ($arg =~ /^--master-server-id=(.+)$/)
+ {
+ $master_server_id = $1;
}
elsif ($arg =~ /^--set=(.+)$/)
{
@@ -162,8 +169,18 @@
if ($check eq "slave_sql_lag")
{
- # TODO: Make this check heartbeat
-
+# The slave lag is checked using the $heartbeat_table table,
+# usually created and updated by running pt-heartbeat on the
+# master.
+# For that, --master-server-id is strongly suggested to be
+# set. In case it is not, the lag from its direct master is
+# reported. If the heartbeat table does not exist, the record
+# for the master is not found, or any other error happens,
+# it falls back to using Seconds_Behind_Master.
+# If the server is not a slave, it returns OK. If lag cannot
+# be determined either by using heartbeat or seconds behind
+# master, it returns unknown, unless the replication is
+# stopped manually, optionally reporting a warning.
my $status = $db->selectrow_hashref("show slave status");
unless ($status) {
@@ -171,36 +188,49 @@
exit($EOK);
}
- # Either IO or SQL threads stopped? WARN
- if ($status->{Slave_IO_Running} ne "Yes" ||
$status->{Slave_SQL_Running} ne "Yes") {
- if ($warn_stopped == 1) {
- printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running:
%s\n",
- $WARN, $check, $status->{Slave_IO_Running},
$status->{Slave_SQL_Running});
- exit($EWARN);
- }
- printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: %s, (no
error; intentional)\n",
- $OK, $check, $status->{Slave_IO_Running},
$status->{Slave_SQL_Running});
- exit($EOK);
- }
+ if ($master_server_id eq "") {
+ $master_server_id = $status->{Master_Server_Id};
+ }
+ my $heartbeat = $db->selectrow_hashref("SELECT
TIMESTAMPDIFF(MICROSECOND,ts,UTC_TIMESTAMP(6)) AS lag FROM heartbeat.heartbeat
WHERE server_id = $master_server_id");
+ my $lag =
$heartbeat->{lag}?$heartbeat->{lag}/1000000:$status->{Seconds_Behind_Master};
+
+ if ($lag eq "NULL") {
+ # Either IO or SQL threads stopped? WARN
+ if ($status->{Slave_IO_Running} ne "Yes" ||
$status->{Slave_SQL_Running} ne "Yes") {
+ if ($warn_stopped == 1) {
+ printf("%s %s Slave_IO_Running: %s,
Slave_SQL_Running: %s\n",
+ $WARN, $check, $status->{Slave_IO_Running},
$status->{Slave_SQL_Running});
+ exit($EWARN);
+ }
+ printf("%s %s Slave_IO_Running: %s, Slave_SQL_Running: %s,
(no error; intentional)\n",
+ $OK, $check, $status->{Slave_IO_Running},
$status->{Slave_SQL_Running});
+ exit($EOK);
+ }
+ # lag could not be determined
+ printf("%s %s lag could not be determined\n", $UNKN, $check);
+ exit($EUNKN);
+
+ }
# Small lag? OK
- if ($status->{Seconds_Behind_Master} < $sql_lag_warn) {
- printf("%s %s Seconds_Behind_Master: %s\n",
- $OK, $check, $status->{Seconds_Behind_Master});
+ if ($lag < $sql_lag_warn) {
+ printf("%s %s Replication lag: %s seconds\n",
+ $OK, $check, $lag);
exit($EOK);
}
# Medium lag? WARN
- if ($status->{Seconds_Behind_Master} < $sql_lag_crit) {
- printf("%s %s Seconds_Behind_Master: %s\n",
- $WARN, $check, $status->{Seconds_Behind_Master});
+ if ($lag < $sql_lag_crit) {
+ printf("%s %s Replication lag: %s seconds\n",
+ $WARN, $check, $lag);
exit($EWARN);
}
- printf("%s %s Seconds_Behind_Master: %s\n",
- $CRIT, $check, $status->{Seconds_Behind_Master});
+ printf("%s %s Replication lag: %s seconds\n",
+ $CRIT, $check, $lag);
exit($ECRIT);
}
printf("%s %s invalid check: %s\n", $UNKN, $check, $check);
-exit($EUNKN);
\ No newline at end of file
+exit($EUNKN);
+
--
To view, visit https://gerrit.wikimedia.org/r/253665
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I2d229173a0fdf5d91e6c95b4a6cb1335fa56f317
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Jcrespo <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits