BBlack has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/368779 )

Change subject: OCSP: Warn less, retry more
......................................................................

OCSP: Warn less, retry more

This doubles the OCSP fetcher executions to twice per day, and
reduces the warning thresholds so they don't trigger until at
least 2 straight days of failure.

Bug: T172116
Change-Id: I076b956f72e9dfd54e306eb316a21047eb4f1527
---
M modules/nagios_common/files/check_commands/check_ssl
M modules/sslcert/manifests/ocsp/init.pp
M modules/tlsproxy/manifests/ocsp.pp
3 files changed, 11 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/79/368779/1

diff --git a/modules/nagios_common/files/check_commands/check_ssl b/modules/nagios_common/files/check_commands/check_ssl
index 7c05284..62a45d5 100755
--- a/modules/nagios_common/files/check_commands/check_ssl
+++ b/modules/nagios_common/files/check_commands/check_ssl
@@ -109,7 +109,7 @@
     $ng->arg(
         spec    => 'ocspwarn=i',
         help    => 'Warning threshold for OCSP staple validity in seconds (default: %s)',
-        default => 86400*3,
+        default => 86400*2,
     );
     $ng->arg(
         spec    => 'ocspcrit=i',
diff --git a/modules/sslcert/manifests/ocsp/init.pp b/modules/sslcert/manifests/ocsp/init.pp
index a9514a9..0edb475 100644
--- a/modules/sslcert/manifests/ocsp/init.pp
+++ b/modules/sslcert/manifests/ocsp/init.pp
@@ -49,10 +49,12 @@
         mode   => '0755',
     }
 
+    # Twice a day, 12h apart
+    $cron_h12 = fqdn_rand(12, 'e663dd38dd6d3384')
     cron { 'update-ocsp-all':
         command => '/usr/local/sbin/update-ocsp-all 2>&1 | logger -t update-ocsp-all',
         minute  => fqdn_rand(60, '1adf3dd699e51805'),
-        hour    => fqdn_rand(24, 'e663dd38dd6d3384'),
+        hour    => [ $cron_h12, $cron_h12 + 12 ],
         require => [
             File['/usr/local/sbin/update-ocsp-all'],
             File['/etc/update-ocsp.d'],
diff --git a/modules/tlsproxy/manifests/ocsp.pp b/modules/tlsproxy/manifests/ocsp.pp
index 3d039cc..25df3d6 100644
--- a/modules/tlsproxy/manifests/ocsp.pp
+++ b/modules/tlsproxy/manifests/ocsp.pp
@@ -19,13 +19,14 @@
     # fetch of data has a 4-7 day lifetime depending on the vendor (GlobalSign
     # or Digicert)
     #
-    # The crit/warn values of 259500 and 86700 correspond to "1d5m" and
-    # "3d5m", so those are basically warning if 1 updates in a row failed
-    # for a given cert, and critical if 3 updates in a row failed (at which
-    # point we have ~24h left to fix the situation before the validity window
-    # expires).
+    # The warn and crit values of 173100 and 259500 correspond to "2d5m" and
+    # "3d5m", and are checking the mtime of the files (not the internal expiry
+    # times).  This should give us ~24h to fix, assuming we're getting minimum
+    # 4-day staples.  The live ssl checker also checks for internal timestamps
+    # nearing expiry (warn at 2 days left, crit at 1 day left), so
+    # we're covered on two fronts here.
 
-    $check_args = '-c 259500 -w 86700 -d /var/cache/ocsp -g "*.ocsp"'
+    $check_args = '-c 259500 -w 173100 -d /var/cache/ocsp -g "*.ocsp"'
     nrpe::monitor_service { 'ocsp-freshness':
         description  => 'Freshness of OCSP Stapling files',
         nrpe_command => "/usr/lib/nagios/plugins/check-fresh-files-in-dir.py ${check_args}",

-- 
To view, visit https://gerrit.wikimedia.org/r/368779
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I076b956f72e9dfd54e306eb316a21047eb4f1527
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: BBlack <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to