QChris has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/172196

Change subject: Add basic monitoring script for projectcount aggregates
......................................................................

Add basic monitoring script for projectcount aggregates

Bug: 72740
Change-Id: I3709f8f259393a9aa083555618c3b212c8d5cb9b
---
M aggregator/projectcounts.py
A bin/check_validity_aggregated_projectcounts
M tests/test_projectcounts.py
3 files changed, 329 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/aggregator 
refs/changes/96/172196/1

diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py
index 03f9c5c..4642bb1 100644
--- a/aggregator/projectcounts.py
+++ b/aggregator/projectcounts.py
@@ -189,3 +189,101 @@
 
         with open(csv_file_abs, 'w') as csv_file:
             csv_file.writelines(sorted(csv_data.itervalues()))
+
+
+def get_validity_issues_for_aggregated_projectcounts(data_dir_abs):
+    """Gets a list of obvious validity issues of aggregated projectcount CSVs
+
+    :param data_dir_abs: Absolute directory of the per project CSVs.
+    """
+    issues = []
+    dbnames = []
+
+    big_wikis = [
+        'enwiki',
+        'jawiki',
+        'dewiki',
+        'eswiki',
+        'frwiki',
+        'ruwiki',
+        'itwiki',
+        ]
+
+    yesterday = datetime.date.today() - datetime.timedelta(days=1)
+    for csv_file_abs in glob.glob(os.path.join(data_dir_abs, '*.csv')):
+        logging.info("Checking csv '%s'" % (csv_file_abs))
+
+        dbname = os.path.basename(csv_file_abs)
+        dbname = dbname.rsplit('.csv', 1)[0]
+        dbnames.append(dbname)
+
+        with open(csv_file_abs, 'r') as file:
+            lines = file.readlines()
+
+            if len(lines):
+                # Analyze last line
+                last_line = (lines[-1]).split('\n', 1)[0]
+                last_line_split = last_line.split(',')
+                if len(last_line_split) == 4:
+                    # Check if last line is not older than yesterday
+                    try:
+                        last_line_date = util.parse_string_to_date(
+                            last_line_split[0])
+                        if last_line_date < yesterday:
+                            issues.append("Last line of %s is too old "
+                                          "'%s'" % (csv_file_abs, last_line))
+                    except ValueError:
+                        issues.append("Last line of %s is too old "
+                                      "'%s'" % (csv_file_abs, last_line))
+
+                    if dbname in big_wikis:
+                        # Check desktop count
+                        try:
+                            if int(last_line_split[1]) < 1000000:
+                                issues.append("Desktop count of last line of "
+                                              "%s is too low '%s'" % (
+                                                  csv_file_abs, last_line))
+                        except ValueError:
+                            issues.append("Desktop count of last line of %s is"
+                                          "not an integer '%s'" % (
+                                              csv_file_abs, last_line))
+
+                        # Check mobile count
+                        try:
+                            if int(last_line_split[2]) < 10000:
+                                issues.append("Desktop count of last line of "
+                                              "%s is too low '%s'" % (
+                                                  csv_file_abs, last_line))
+                        except ValueError:
+                            issues.append("Mobile count of last line of %s is"
+                                          "not an integer '%s'" % (
+                                              csv_file_abs, last_line))
+
+                        # Check zero count
+                        try:
+                            if int(last_line_split[3]) < 100:
+                                issues.append("Zero count of last line of "
+                                              "%s is too low '%s'" % (
+                                                  csv_file_abs, last_line))
+                        except ValueError:
+                            issues.append("Desktop count of last line of %s is"
+                                          "not an integer '%s'" % (
+                                              csv_file_abs, last_line))
+
+                else:
+                    issues.append("Last line of %s does not have 4 columns: "
+                                  "'%s'" % (csv_file_abs, last_line))
+            else:
+                issues.append("No lines for %s" % csv_file_abs)
+
+    if not len(dbnames):
+        issues.append("Could not find any CSVs")
+
+    if set(big_wikis) - set(dbnames):
+        issues.append("Not all big wikis covered (Missing: %s)" % (
+            [x for x in (set(big_wikis) - set(dbnames))]))
+
+    if not (set(dbnames) - set(big_wikis)):
+        issues.append("No wikis beyond the big wikis")
+
+    return sorted(issues)
diff --git a/bin/check_validity_aggregated_projectcounts 
b/bin/check_validity_aggregated_projectcounts
new file mode 100755
index 0000000..2e6be43
--- /dev/null
+++ b/bin/check_validity_aggregated_projectcounts
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Checks that each aggregated projectcount file is current
+
+Usage: check_validity_aggregated_projectcounts --data DATA_DIR
+           [-v ...] [--help]
+
+Options:
+    -h, --help          Show this help message and exit
+    --data DATA_DIR     Directory holding the csvs to check
+    -v, --verbose       Increase verbosity
+"""
+
+# Add parent directory to python path to allow allow loading of modules without
+# messing PYTHONPATH on the command line
+import os
+import sys
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+from docopt import docopt
+
+import logging
+
+import aggregator
+
+if __name__ == '__main__':
+    arguments = docopt(__doc__)
+
+    # Setting up logging
+    log_level = logging.ERROR
+    if arguments['--verbose'] >= 1:
+        if arguments['--verbose'] >= 2:
+            if arguments['--verbose'] >= 3:
+                log_level = logging.DEBUG
+            else:
+                log_level = logging.INFO
+        else:
+            log_level = logging.WARNING
+    logging.basicConfig(level=log_level,
+                        format='%(asctime)s %(levelname)-6s %(message)s',
+                        datefmt='%Y-%m-%dT%H:%M:%S')
+
+    logging.debug("Parsed arguments: %s" % (arguments))
+
+    # Setting up directories
+    data_dir_abs = arguments['--data']
+    try:
+        data_dir_abs = aggregator.existing_dir_abs(data_dir_abs)
+    except ValueError:
+        logging.error("Data directory '%s' does not point to an existing "
+                      "directory" % (data_dir_abs))
+        sys.exit(1)
+
+    issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+        data_dir_abs)
+
+    if issues:
+        for issue in issues:
+            logging.error(issue)
+        sys.exit(1)
diff --git a/tests/test_projectcounts.py b/tests/test_projectcounts.py
index 987d6bf..3f347b0 100644
--- a/tests/test_projectcounts.py
+++ b/tests/test_projectcounts.py
@@ -56,6 +56,25 @@
             for line in lines:
                 file.write(line + '\n')
 
+    def create_valid_aggregated_projects(self, tmp_dir_abs):
+        today = datetime.date.today()
+        for dbname in [
+            'enwiki',
+            'jawiki',
+            'dewiki',
+            'eswiki',
+            'frwiki',
+            'ruwiki',
+            'itwiki',
+            'foo',
+        ]:
+            csv_file_abs = os.path.join(tmp_dir_abs, dbname + '.csv')
+            with open(csv_file_abs, 'w') as file:
+                for day_offset in range(-10, 0):
+                    date = (today + datetime.timedelta(days=day_offset))
+                    date_str = date.isoformat()
+                    file.write('%s,123456789,12345678,1234567\n' % (date_str))
+
     def assert_file_content_equals(self, actual_file_abs, expected_lines):
         with open(actual_file_abs, 'r') as file:
             for expected_line in expected_lines:
@@ -371,3 +390,141 @@
             '2014-11-02,48276,0,0',
             '2014-11-03,72276,0,0',
             ])
+
+    def test_validity_no_csvs(self):
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        # At least one issue, as no csvs could get found
+        nose.tools.assert_greater_equal(len(issues), 1)
+
+    def test_validity_no_enwiki(self):
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        self.create_valid_aggregated_projects(tmp_dir_abs)
+
+        enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+        os.unlink(enwiki_file_abs)
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        # At least one issue, as enwiki.csv is missing
+        nose.tools.assert_greater_equal(len(issues), 1)
+
+    def test_validity_only_big_wikis(self):
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        self.create_valid_aggregated_projects(tmp_dir_abs)
+
+        foo_file_abs = os.path.join(tmp_dir_abs, 'foo.csv')
+        os.unlink(foo_file_abs)
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        # At least one issue, as no csvs for other wikis than the big wikis are
+        # present.
+        nose.tools.assert_greater_equal(len(issues), 1)
+
+    def test_validity_enwiki_empty(self):
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        self.create_valid_aggregated_projects(tmp_dir_abs)
+
+        enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+        self.create_empty_file(enwiki_file_abs)
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        # At least one issue, as enwiki has no reading
+        nose.tools.assert_greater_equal(len(issues), 1)
+
+    def test_validity_enwiki_no_today(self):
+        """Checks flagging of a csv whose last reading is too old"""
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        self.create_valid_aggregated_projects(tmp_dir_abs)
+
+        enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+        yesterday = aggregator.parse_string_to_date('yesterday')
+        with open(enwiki_file_abs, 'w') as file:
+            # Readings end the day before yesterday, so the csv is stale
+            for day_offset in range(-10, 0):
+                date = (yesterday + datetime.timedelta(days=day_offset))
+                date_str = date.isoformat()
+                file.write('%s,123456789,12345678,1234567\n' % (date_str))
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        # At least one issue, as enwiki's last reading is older than yesterday
+        nose.tools.assert_greater_equal(len(issues), 1)
+
+    def test_validity_enwiki_too_low_desktop(self):
+        """Checks flagging of a too low desktop count for a big wiki"""
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        self.create_valid_aggregated_projects(tmp_dir_abs)
+
+        enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+        today = datetime.date.today()
+        with open(enwiki_file_abs, 'w') as file:
+            for day_offset in range(-10, 0):
+                date = (today + datetime.timedelta(days=day_offset))
+                date_str = date.isoformat()
+                # Desktop column (second column) is 0
+                file.write('%s,0,12345678,1234567\n' % (date_str))
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        # At least one issue, as enwiki's desktop count is too low
+        nose.tools.assert_greater_equal(len(issues), 1)
+
+    def test_validity_enwiki_too_low_mobile(self):
+        """Checks flagging of a too low mobile count for a big wiki"""
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        self.create_valid_aggregated_projects(tmp_dir_abs)
+
+        enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+        today = datetime.date.today()
+        with open(enwiki_file_abs, 'w') as file:
+            for day_offset in range(-10, 0):
+                date = (today + datetime.timedelta(days=day_offset))
+                date_str = date.isoformat()
+                # Mobile column (third column) is 0
+                file.write('%s,123456789,0,1234567\n' % (date_str))
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        # At least one issue, as enwiki's mobile count is too low
+        nose.tools.assert_greater_equal(len(issues), 1)
+
+    def test_validity_enwiki_too_low_zero(self):
+        """Checks flagging of a too low zero count for a big wiki"""
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        self.create_valid_aggregated_projects(tmp_dir_abs)
+
+        enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+        today = datetime.date.today()
+        with open(enwiki_file_abs, 'w') as file:
+            for day_offset in range(-10, 0):
+                date = (today + datetime.timedelta(days=day_offset))
+                date_str = date.isoformat()
+                # Zero column (fourth column) is 0
+                file.write('%s,123456789,12345678,0\n' % (date_str))
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        # At least one issue, as enwiki's zero count is too low
+        nose.tools.assert_greater_equal(len(issues), 1)
+
+    def test_validity_valid(self):
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        self.create_valid_aggregated_projects(tmp_dir_abs)
+
+        issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+            tmp_dir_abs)
+
+        self.assertEquals(issues, [])

-- 
To view, visit https://gerrit.wikimedia.org/r/172196
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3709f8f259393a9aa083555618c3b212c8d5cb9b
Gerrit-PatchSet: 1
Gerrit-Project: analytics/aggregator
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to