GWicke has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/292505

Change subject: WIP: logstash_checker script for canary deploys
......................................................................

WIP: logstash_checker script for canary deploys

This is a work-in progress checker script for checking logstash error rates
before/after a deploy.

Bug: T110068
Change-Id: I1a900ee1d7eadc4689e14306a2fc72ad2c138a28
---
A modules/service/files/logstash_checker.py
1 file changed, 217 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/05/292505/1

diff --git a/modules/service/files/logstash_checker.py 
b/modules/service/files/logstash_checker.py
new file mode 100755
index 0000000..05b2f73
--- /dev/null
+++ b/modules/service/files/logstash_checker.py
@@ -0,0 +1,217 @@
+#!/usr/bin/python
+
+"""
+Basic logstash error rate checker
+
+Theory of operation:
+    - fetch histogram of error / fatals from logstash for the last ~10 minutes
+    - calculate the mean rates before/after a time <delay> seconds in the past
+    - if the `after` rate is more than <threshold> times the before rate,
+      return an error; else, exit with 0.
+"""
+
+
+import sys
+reload(sys)
+sys.setdefaultencoding('utf-8')
+
+try:
+    import urlparse
+except ImportError:
+    import urllib.parse as urlparse
+import json
+import urllib3
+import sys
+import argparse
+import re
+import urllib
+import time
+from collections import namedtuple
+
+
+class CheckServiceError(Exception):
+
+    """
+    Generic Exception used as a catchall
+    """
+    pass
+
+
+def fetch_url(client, url, **kw):
+    """
+    Standalone function to fetch an url.
+
+    Args:
+        client (urllib3.Poolmanager):
+                                 The HTTP client we want to use
+        url (str): The URL to fetch
+
+        kw: any keyword arguments we want to pass to
+            urllib3.request.RequestMethods.request
+    """
+    if 'method' in kw:
+        method = kw['method'].upper()
+        del kw['method']
+    else:
+        method = 'GET'
+    try:
+        if method == 'GET':
+            return client.request(
+                method,
+                url,
+                **kw
+            )
+        elif method == 'POST':
+            try:
+                headers = kw.get('headers', {})
+                content_type = headers.get('content-type', '')
+            except:
+                content_type = ''
+
+            # Handle json-encoded requests
+            if content_type.lower() == 'application/json':
+                kw['body'] = json.dumps(kw['body'])
+                return client.urlopen(
+                    method,
+                    url,
+                    **kw
+                )
+
+            return client.request_encode_body(
+                method,
+                url,
+                encode_multipart=False,
+                **kw
+            )
+    except urllib3.exceptions.SSLError:
+        raise CheckServiceError("Invalid certificate")
+    except (urllib3.exceptions.ConnectTimeoutError,
+            urllib3.exceptions.TimeoutError,
+            # urllib3.exceptions.ConnectionError, # commented out until we can
+            # remove trusty (aka urllib3 1.7.1) support
+            urllib3.exceptions.ReadTimeoutError):
+        raise CheckServiceError("Timeout on connection while "
+                                "downloading {}".format(url))
+    except Exception as e:
+        raise CheckServiceError("Generic connection error: {}".format(e))
+
+
+class CheckService(object):
+
+    """
+    Shell class for checking services
+    """
+    nagios_codes = ['OK', 'WARNING', 'CRITICAL']
+    spec_url = '/?spec'
+    default_response = {'status': 200}
+    _supported_methods = ['get', 'post']
+
+    def __init__(self, args):
+        """
+        Initialize the checker
+
+        Args:
+            host_ip (str): The host ipv4 address (also works with a hostname)
+
+            base_url (str): The base url the service expects to respond from
+
+            timeout (int): Number of seconds to wait for each request
+        """
+        self.host = args.host
+        self.service_name = args.service_name
+        self.delay = args.delay
+        self.threshold = 2
+        self.logstash_query_url = 
'https://logstash.wikimedia.org/logstash-2016.06.02/_search'
+        #args.logstash_query_url
+
+    def _logstashQuery(self):
+        return {"facets":{"0":{
+            "date_histogram":{"field":"@timestamp","interval":"10s"},
+            "global":True,
+            "facet_filter":{"fquery":{"query":{
+                "filtered":{"query":{"query_string":{"query":"*"}},
+                    "filter":{"bool":{
+                        "must":[
+                            
{"range":{"@timestamp":{"from":"now-10m","to":"now"}}},
+                            {"terms":{"_type":["mediawiki"]}},
+                            
{"fquery":{"query":{"query_string":{"query":"host:(\"" + self.host +
+                                "\")"}},"_cache":True}}
+                            ],
+                        "must_not":[
+                            {"fquery":{
+                                
"query":{"query_string":{"query":"level:(\"INFO\")"}},"_cache":True}},{"fquery":{"query":{"query_string":{"query":"level:(\"WARNING\")"}},"_cache":True}}
+                        ]}}}}}}
+                    }},"size":20,
+                    
"query":{"filtered":{"query":{"query_string":{"query":"type:scap AND 
(channel.raw:scap.announce OR message:\"Started 
sync_wikiversions\")"}},"filter":{"bool":{"must":[{"range":{"@timestamp":{"from":"now","to":"now"}}}]}}}},
+                    
"sort":[{"@timestamp":{"order":"desc","ignore_unmapped":True}},{"@timestamp":{"order":"desc","ignore_unmapped":True}}]}
+
+    def run(self):
+        """
+        Gets the full spec from base_url + '/?spec' and parses it.
+        Returns a generator iterating over the available endpoints
+        """
+        http = self._spawn_downloader()
+        headers=urllib3.util.make_headers(basic_auth='GWicke:suandsiref')
+        headers['content-type'] = "application/json"
+        response = fetch_url(
+            http,
+            self.logstash_query_url,
+            timeout=5,
+            headers=headers,
+            method='POST',
+            body=self._logstashQuery()
+        )
+        resp = response.data.decode('utf-8')
+
+        try:
+            r = json.loads(resp)
+        except ValueError:
+            raise ValueError("Logstash request returned error")
+
+        print r
+
+
+        entries = r['facets']['0']['entries']
+        cutoff_ts = (time.time() - self.delay) * 1000
+        print cutoff_ts
+        counts_before = [entry['count'] for entry in entries
+                            if entry['time'] < cutoff_ts]
+        mean_before = float(sum(counts_before)) / max(1, len(counts_before))
+        counts_after = [entry['count'] for entry in entries
+                            if entry['time'] >= cutoff_ts]
+        print 'counts_after', counts_after
+        mean_after = float(sum(counts_after)) / max(1, len(counts_after))
+        over_threshold = mean_after > (mean_before * self.threshold)
+        print 'over_threshold', mean_before, mean_after, over_threshold
+        sys.exit(over_threshold)
+
+    def _spawn_downloader(self):
+        """
+        Spawns an urllib3.Poolmanager with the correct configuration.
+        """
+        kw = {
+            # 'retries': 1, uncomment this once we've got rid of trusty
+            'timeout': 5
+        }
+        kw['ca_certs'] = "/etc/ssl/certs/ca-certificates.crt"
+        kw['cert_reqs'] = 'CERT_REQUIRED'
+        return urllib3.PoolManager(**kw)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Checks the error rate change of a single "
+            "WMF service / host")
+    parser.add_argument('host', help="The host to check")
+    parser.add_argument('service_name',
+                        help="The service name to match")
+    parser.add_argument('-d', dest="delay", default=5, type=int,
+                        help="Length of the delay between a deploy & the call"
+                            "to this check script")
+    args = parser.parse_args()
+    checker = CheckService(args)
+    checker.run()
+
+
+if __name__ == '__main__':
+    main()

-- 
To view, visit https://gerrit.wikimedia.org/r/292505
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I1a900ee1d7eadc4689e14306a2fc72ad2c138a28
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: GWicke <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to