Thcipriani has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403574 )

Change subject: Scap canary: cache last good deploy time
......................................................................

Scap canary: cache last good deploy time

Currently, if during the course of a series of deployments the canary
checker fails a deploy due to a high error rate then the baseline of
comparison for subsequent deployments is invalid because we have no way
of rolling back MediaWiki canaries automatically.

In practical terms that means that the canary will stop one bad
deployment from hitting all of production, but if the patch deployed
next doesn't fix the error-rate spike then the canary check will allow
it to go to production anyway because of this shifting baseline.

To fix this, after a bad deployment to the canary, we cache the
timestamp of that deployment. If that timestamp exists on disk then we
use that timestamp during subsequent deployments to calculate the
baseline mean error rate rather than using the mean error rate for the
past hour (less 20 seconds) as we do now.

Cached timestamps will expire after 1 hour.

Bug: T183999
Change-Id: Ifc7db81d2b08381743e5807d6b982a09d87275ac
---
M modules/service/files/logstash_checker.py
1 file changed, 103 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/74/403574/1

diff --git a/modules/service/files/logstash_checker.py 
b/modules/service/files/logstash_checker.py
index f24bd07..b362307 100755
--- a/modules/service/files/logstash_checker.py
+++ b/modules/service/files/logstash_checker.py
@@ -12,6 +12,7 @@
 
 
 import argparse
+import datetime
 import getpass
 import json
 import logging
@@ -91,8 +92,7 @@
     """Shell class for checking services."""
 
     def __init__(self, host, service_name, logstash_host, user='', password='',
-                 verbose=False, delay=120, fail_threshold=2.0,
-                 absolute_threshold=1.0):
+                 delay=120, fail_threshold=2.0, absolute_threshold=1.0):
         """Initialize the checker."""
         self.host = host
         self.service_name = service_name
@@ -101,10 +101,51 @@
         self.fail_threshold = fail_threshold
         self.absolute_threshold = absolute_threshold
         self.auth = None
+        self._cache_dir = None
+        self._cached_timestamp = None
         self.logger = logging.getLogger(__name__)
 
         if user:
             self.auth = '{}:{}'.format(user, password)
+
+    @property
+    def cache_dir(self):
+        """
+        Where we can stash persistent information for logstash_checker
+        """
+        if self._cache_dir:
+            return self._cache_dir
+
+        base_path = '/tmp'
+        cache_name = 'logstash-checker-cache'
+
+        if self.service_name == 'mwdeploy':
+            with open('/etc/profile.d/mediawiki.sh') as f:
+                profile = f.readlines()
+            mw_lines = [l for l in profile if l.startswith('MEDIAWIKI')]
+            mw_profile = {}
+            for mw_line in mw_lines:
+                k, v = mw_line.split('=')
+                mw_profile[k] = v.strip('"\n ')
+
+            base_path = os.path.join(
+                mw_profile['MEDIAWIKI_STAGING_DIR'], '.git')
+
+        self._cache_dir = os.path.join(base_path, cache_name)
+
+        self.logger.debug('Using cache directory "{}"'.format(self._cache_dir))
+        return self._cache_dir
+
+    @property
+    def cached_timestamp_path(self):
+        if self._cached_timestamp:
+            return self._cached_timestamp
+
+        self._cached_timestamp = os.path.join(
+            self.cache_dir,
+            '{}.timestamp'.format(self.host))
+
+        return self._cached_timestamp
 
     def _logstash_query(self):
         if self.service_name == 'mwdeploy':
@@ -221,6 +262,59 @@
             }
         }
 
+    def _save_cached_ts(self, ts):
+        """
+        Saves a cached timestamp to disk
+        """
+        if not os.path.isdir(self.cache_dir):
+            os.makedirs(self.cache_dir, 02770)
+
+        data = {'timestamp': ts}
+        data_json = json.dumps(data)
+
+        self.logger.debug('Saving cached timestamp {}'.format(data_json))
+        with open(self.cached_timestamp_path, 'w') as f:
+            f.write(data_json)
+
+    def _rm_cached_ts(self):
+        """
+        Remove cached ts file
+        """
+        if os.path.exists(self.cached_timestamp_path):
+            os.unlink(self.cached_timestamp_path)
+
+    def _get_cached_ts(self, ts):
+        """
+        Returns a cached timestamp if one is available
+        """
+        if not os.path.exists(self.cached_timestamp_path):
+            return ts
+
+        with open(self.cached_timestamp_path) as f:
+            cached_data = json.load(f)
+
+        cached_ts = cached_data.get('timestamp')
+
+        if not cached_ts:
+            self._rm_cached_ts()
+            return ts
+
+        try:
+            ts_datetime = datetime.datetime.fromtimestamp(int(cached_ts)/1000)
+        except ValueError:
+            self._rm_cached_ts()
+            return ts
+
+        an_hour_old = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
+        ts_still_valid = ts_datetime > an_hour_old
+
+        if not ts_still_valid:
+            self._rm_cached_ts()
+            return ts
+
+        self.logger.debug('Using cached timestamp {}'.format(cached_ts))
+        return cached_ts
+
     def run(self):
         """
         Query logstash and check error rate.
@@ -253,14 +347,15 @@
         except ValueError:
             raise ValueError("Logstash request returned error")
 
-        self.logger.debug('logstash response %s', r)
+        self.logger.debug('logstash response {}'.format(r))
 
         # Calculate mean event rates before / after the deploy.
         entries = r['aggregations']['2']['buckets']
         cutoff_ts = (time.time() - self.delay) * 1000
+        before_cutoff_ts = self._get_cached_ts(cutoff_ts)
 
         counts_before = [entry['doc_count'] for entry in entries
-                         if entry['key'] < cutoff_ts]
+                         if entry['key'] < before_cutoff_ts]
 
         mean_before = float(sum(counts_before)) / max(1, len(counts_before))
 
@@ -283,11 +378,15 @@
                               percent_over, mean_before, mean_after,
                               target_error_rate)
 
+            self._save_cached_ts(cutoff_ts)
+
         else:
             self.logger.info('OK (Avg. Error rate: '
                              'Before: %.2f, After: %.2f, Threshold: %.2f)',
                              mean_before, mean_after, target_error_rate)
 
+            self._rm_cached_ts()
+
         return over_threshold
 
     def _spawn_downloader(self):

-- 
To view, visit https://gerrit.wikimedia.org/r/403574
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ifc7db81d2b08381743e5807d6b982a09d87275ac
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Thcipriani <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to