QChris has uploaded a new change for review.
https://gerrit.wikimedia.org/r/172237
Change subject: Add total column for project aggregation CSVs
......................................................................
Add total column for project aggregation CSVs
While the total column does not add new information, it eases direct
plotting.
Change-Id: Ib62841200da246763efb6b9550df2b5af52be505
---
M aggregator/projectcounts.py
M tests/test_projectcounts.py
2 files changed, 80 insertions(+), 32 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/aggregator
refs/changes/37/172237/1
diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py
index 18d16ef..2f95370 100644
--- a/aggregator/projectcounts.py
+++ b/aggregator/projectcounts.py
@@ -188,16 +188,17 @@
source_dir_abs, abbreviation, date)
# injecting obtained data
- csv_data[date_str] = '%s,%s,%s,%s%s' % (
+ csv_data[date_str] = '%s,%s,%s,%s,%s%s' % (
date_str,
+ count_desktop + count_mobile + count_zero,
count_desktop,
count_mobile,
count_zero,
CSV_LINE_ENDING)
with open(csv_file_abs, 'w') as csv_file:
- csv_file.write('Date,Desktop site,Mobile site,Zero site%s' % (
- CSV_LINE_ENDING))
+ csv_file.write('Date,Total,Desktop site,Mobile site,Zero site%s'
+ % (CSV_LINE_ENDING))
csv_file.writelines(sorted(csv_data.itervalues()))
@@ -236,7 +237,7 @@
# opened in text mode by default, line ends are normalized to
# LF, even though CRLF gets written.
last_line_split = last_line.split(',')
- if len(last_line_split) == 4:
+ if len(last_line_split) == 5:
# Check if last line is not older than yesterday
try:
last_line_date = util.parse_string_to_date(
@@ -249,9 +250,20 @@
"'%s'" % (csv_file_abs, last_line))
if dbname in big_wikis:
- # Check desktop count
+ # Check total count
try:
if int(last_line_split[1]) < 1000000:
+ issues.append("Total count of last line of "
+ "%s is too low '%s'" % (
+ csv_file_abs, last_line))
+ except ValueError:
+                    issues.append("Total count of last line of %s is "
+                                  "not an integer '%s'" % (
+ csv_file_abs, last_line))
+
+ # Check desktop count
+ try:
+ if int(last_line_split[2]) < 1000000:
issues.append("Desktop count of last line of "
"%s is too low '%s'" % (
csv_file_abs, last_line))
@@ -262,7 +274,7 @@
# Check mobile count
try:
- if int(last_line_split[2]) < 10000:
+ if int(last_line_split[3]) < 10000:
issues.append("Desktop count of last line of "
"%s is too low '%s'" % (
csv_file_abs, last_line))
@@ -273,7 +285,7 @@
# Check zero count
try:
- if int(last_line_split[3]) < 100:
+ if int(last_line_split[4]) < 100:
issues.append("Zero count of last line of "
"%s is too low '%s'" % (
csv_file_abs, last_line))
@@ -282,8 +294,23 @@
"not an integer '%s'" % (
csv_file_abs, last_line))
+                # Check that the total column is the sum of the others
+ try:
+ if int(last_line_split[1]) != \
+ int(last_line_split[2]) + \
+ int(last_line_split[3]) + \
+ int(last_line_split[4]):
+ issues.append(
+ "Total column is not the sum of "
+ "individual columns in '%s' for %s" % (
+ last_line, csv_file_abs))
+ except ValueError:
+ # Some column is not a number. This has already
+ # been reported above, so we just pass.
+ pass
+
else:
- issues.append("Last line of %s does not have 4 columns: "
+ issues.append("Last line of %s does not have 5 columns: "
"'%s'" % (csv_file_abs, last_line))
else:
issues.append("No lines for %s" % csv_file_abs)
diff --git a/tests/test_projectcounts.py b/tests/test_projectcounts.py
index b8c783a..f9336fe 100644
--- a/tests/test_projectcounts.py
+++ b/tests/test_projectcounts.py
@@ -73,11 +73,12 @@
for day_offset in range(-10, 0):
date = (today + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- file.write('%s,123456789,12345678,1234567%s' % (
+ file.write('%s,137037034,123456789,12345678,1234567%s' % (
date_str, aggregator.CSV_LINE_ENDING))
def assert_file_content_equals(self, actual_file_abs, expected_lines):
- expected_lines.insert(0, 'Date,Desktop site,Mobile site,Zero site')
+ header = 'Date,Total,Desktop site,Mobile site,Zero site'
+ expected_lines.insert(0, header)
with open(actual_file_abs, 'r') as file:
for expected_line in expected_lines:
try:
@@ -252,7 +253,7 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-01,1,2,3'
+ '2014-11-01,1,2,3,4'
])
aggregator.update_daily_per_project_csvs(
@@ -262,7 +263,7 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,1,2,3',
+ '2014-11-01,1,2,3,4',
])
def test_update_daily_per_project_single_csvs_3days_2014_11_01(self):
@@ -282,7 +283,7 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,24276,0,0',
+ '2014-11-01,24276,24276,0,0',
])
def test_update_daily_per_project_single_csvs_3days_2014_11_02(self):
@@ -302,7 +303,7 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-02,48276,0,0',
+ '2014-11-02,48276,48276,0,0',
])
def test_update_daily_per_project_single_csvs_3days_2014_11_03(self):
@@ -322,7 +323,7 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-03,72276,0,0',
+ '2014-11-03,72276,72276,0,0',
])
def test_update_daily_per_project_single_csvs_3days_prefilled(self):
@@ -334,8 +335,8 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-03,1,2,3',
- '2014-11-01,4,5,6',
+ '2014-11-03,1,2,3,4',
+ '2014-11-01,5,6,7,8',
])
aggregator.update_daily_per_project_csvs(
@@ -345,9 +346,9 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,4,5,6',
- '2014-11-02,48276,0,0',
- '2014-11-03,1,2,3',
+ '2014-11-01,5,6,7,8',
+ '2014-11-02,48276,48276,0,0',
+ '2014-11-03,1,2,3,4',
])
def test_update_daily_per_project_single_csvs_3days_doubled(self):
@@ -359,8 +360,8 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-01,1,2,3',
- '2014-11-01,2,3,4',
+ '2014-11-01,1,2,3,4',
+ '2014-11-01,2,3,4,5',
])
nose.tools.assert_raises(
@@ -389,9 +390,9 @@
last_date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,24276,0,0',
- '2014-11-02,48276,0,0',
- '2014-11-03,72276,0,0',
+ '2014-11-01,24276,24276,0,0',
+ '2014-11-02,48276,48276,0,0',
+ '2014-11-03,72276,72276,0,0',
])
def test_update_daily_forced_recomputation(self):
@@ -403,7 +404,7 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-01,1,2,3'
+ '2014-11-01,1,2,3,4'
])
aggregator.update_daily_per_project_csvs(
@@ -414,7 +415,7 @@
True)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,24276,0,0',
+ '2014-11-01,24276,24276,0,0',
])
def test_update_daily_forced_recomputation_missing_hours(self):
@@ -426,7 +427,7 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-01,1,2,3'
+ '2014-11-01,1,2,3,4'
])
nose.tools.assert_raises(
@@ -501,7 +502,7 @@
for day_offset in range(-10, 0):
date = (yesterday + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- lines.append('%s,123456789,12345678,1234567' % (date_str))
+ lines.append('%s,135925923,123456789,12345678,123456' % (date_str))
self.create_file(enwiki_file_abs, lines)
issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
@@ -521,7 +522,7 @@
for day_offset in range(-10, 0):
date = (today + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- lines.append('%s,0,12345678,1234567' % (date_str))
+ lines.append('%s,13580245,0,12345678,1234567' % (date_str))
self.create_file(enwiki_file_abs, lines)
issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
@@ -541,7 +542,7 @@
for day_offset in range(-10, 0):
date = (today + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- lines.append('%s,123456789,0,1234567' % (date_str))
+ lines.append('%s,124691356,123456789,0,1234567' % (date_str))
self.create_file(enwiki_file_abs, lines)
issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
@@ -561,7 +562,7 @@
for day_offset in range(-10, 0):
date = (today + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- lines.append('%s,123456789,12345678,0' % (date_str))
+ lines.append('%s,135802467,123456789,12345678,0' % (date_str))
self.create_file(enwiki_file_abs, lines)
issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
@@ -570,6 +571,26 @@
# At least one issue, as enwiki has no reading for today
nose.tools.assert_greater_equal(len(issues), 1)
+ def test_validity_enwiki_total_does_not_add_up(self):
+ tmp_dir_abs = self.create_tmp_dir_abs()
+
+ self.create_valid_aggregated_projects(tmp_dir_abs)
+
+ enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+ today = datetime.date.today()
+ lines = []
+ for day_offset in range(-10, 0):
+ date = (today + datetime.timedelta(days=day_offset))
+ date_str = date.isoformat()
+ lines.append('%s,200000000,123456789,12345678,123456' % (date_str))
+ self.create_file(enwiki_file_abs, lines)
+
+ issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+ tmp_dir_abs)
+
+        # At least one issue, as the total is not the sum of the other columns.
+ nose.tools.assert_greater_equal(len(issues), 1)
+
def test_validity_valid(self):
tmp_dir_abs = self.create_tmp_dir_abs()
--
To view, visit https://gerrit.wikimedia.org/r/172237
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib62841200da246763efb6b9550df2b5af52be505
Gerrit-PatchSet: 1
Gerrit-Project: analytics/aggregator
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits