QChris has uploaded a new change for review.
https://gerrit.wikimedia.org/r/172237
Change subject: Add total column for project aggregation CSVs
......................................................................
Add total column for project aggregation CSVs
While the total column does not add new information, it eases direct
plotting.
Change-Id: Ib62841200da246763efb6b9550df2b5af52be505
---
M aggregator/projectcounts.py
M tests/test_projectcounts.py
2 files changed, 80 insertions(+), 32 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/aggregator
refs/changes/37/172237/1
diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py
index 18d16ef..2f95370 100644
--- a/aggregator/projectcounts.py
+++ b/aggregator/projectcounts.py
@@ -188,16 +188,17 @@
source_dir_abs, abbreviation, date)
# injecting obtained data
- csv_data[date_str] = '%s,%s,%s,%s%s' % (
+ csv_data[date_str] = '%s,%s,%s,%s,%s%s' % (
date_str,
+ count_desktop + count_mobile + count_zero,
count_desktop,
count_mobile,
count_zero,
CSV_LINE_ENDING)
with open(csv_file_abs, 'w') as csv_file:
- csv_file.write('Date,Desktop site,Mobile site,Zero site%s' % (
- CSV_LINE_ENDING))
+ csv_file.write('Date,Total,Desktop site,Mobile site,Zero site%s'
+ % (CSV_LINE_ENDING))
csv_file.writelines(sorted(csv_data.itervalues()))
@@ -236,7 +237,7 @@
# opened in text mode by default, line ends are normalized to
# LF, even though CRLF gets written.
last_line_split = last_line.split(',')
- if len(last_line_split) == 4:
+ if len(last_line_split) == 5:
# Check if last line is not older than yesterday
try:
last_line_date = util.parse_string_to_date(
@@ -249,9 +250,20 @@
"'%s'" % (csv_file_abs, last_line))
if dbname in big_wikis:
- # Check desktop count
+ # Check total count
try:
if int(last_line_split[1]) < 1000000:
+ issues.append("Total count of last line of "
+ "%s is too low '%s'" % (
+ csv_file_abs, last_line))
+ except ValueError:
+                    issues.append("Total count of last line of %s is "
+                                  "not an integer '%s'" % (
+ csv_file_abs, last_line))
+
+ # Check desktop count
+ try:
+ if int(last_line_split[2]) < 1000000:
issues.append("Desktop count of last line of "
"%s is too low '%s'" % (
csv_file_abs, last_line))
@@ -262,7 +274,7 @@
# Check mobile count
try:
- if int(last_line_split[2]) < 10000:
+ if int(last_line_split[3]) < 10000:
issues.append("Desktop count of last line of "
"%s is too low '%s'" % (
csv_file_abs, last_line))
@@ -273,7 +285,7 @@
# Check zero count
try:
- if int(last_line_split[3]) < 100:
+ if int(last_line_split[4]) < 100:
issues.append("Zero count of last line of "
"%s is too low '%s'" % (
csv_file_abs, last_line))
@@ -282,8 +294,23 @@
"not an integer '%s'" % (
csv_file_abs, last_line))
+                # Check that the total column is the sum of the others
+ try:
+ if int(last_line_split[1]) != \
+ int(last_line_split[2]) + \
+ int(last_line_split[3]) + \
+ int(last_line_split[4]):
+ issues.append(
+ "Total column is not the sum of "
+ "individual columns in '%s' for %s" % (
+ last_line, csv_file_abs))
+ except ValueError:
+ # Some column is not a number. This has already
+ # been reported above, so we just pass.
+ pass
+
else:
- issues.append("Last line of %s does not have 4 columns: "
+ issues.append("Last line of %s does not have 5 columns: "
"'%s'" % (csv_file_abs, last_line))
else:
issues.append("No lines for %s" % csv_file_abs)
diff --git a/tests/test_projectcounts.py b/tests/test_projectcounts.py
index b8c783a..f9336fe 100644
--- a/tests/test_projectcounts.py
+++ b/tests/test_projectcounts.py
@@ -73,11 +73,12 @@
for day_offset in range(-10, 0):
date = (today + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- file.write('%s,123456789,12345678,1234567%s' % (
+ file.write('%s,137037034,123456789,12345678,1234567%s' % (
date_str, aggregator.CSV_LINE_ENDING))
def assert_file_content_equals(self, actual_file_abs, expected_lines):
- expected_lines.insert(0, 'Date,Desktop site,Mobile site,Zero site')
+ header = 'Date,Total,Desktop site,Mobile site,Zero site'
+ expected_lines.insert(0, header)
with open(actual_file_abs, 'r') as file:
for expected_line in expected_lines:
try:
@@ -252,7 +253,7 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-01,1,2,3'
+ '2014-11-01,1,2,3,4'
])
aggregator.update_daily_per_project_csvs(
@@ -262,7 +263,7 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,1,2,3',
+ '2014-11-01,1,2,3,4',
])
def test_update_daily_per_project_single_csvs_3days_2014_11_01(self):
@@ -282,7 +283,7 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,24276,0,0',
+ '2014-11-01,24276,24276,0,0',
])
def test_update_daily_per_project_single_csvs_3days_2014_11_02(self):
@@ -302,7 +303,7 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-02,48276,0,0',
+ '2014-11-02,48276,48276,0,0',
])
def test_update_daily_per_project_single_csvs_3days_2014_11_03(self):
@@ -322,7 +323,7 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-03,72276,0,0',
+ '2014-11-03,72276,72276,0,0',
])
def test_update_daily_per_project_single_csvs_3days_prefilled(self):
@@ -334,8 +335,8 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-03,1,2,3',
- '2014-11-01,4,5,6',
+ '2014-11-03,1,2,3,4',
+ '2014-11-01,5,6,7,8',
])
aggregator.update_daily_per_project_csvs(
@@ -345,9 +346,9 @@
date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,4,5,6',
- '2014-11-02,48276,0,0',
- '2014-11-03,1,2,3',
+ '2014-11-01,5,6,7,8',
+ '2014-11-02,48276,48276,0,0',
+ '2014-11-03,1,2,3,4',
])
def test_update_daily_per_project_single_csvs_3days_doubled(self):
@@ -359,8 +360,8 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-01,1,2,3',
- '2014-11-01,2,3,4',
+ '2014-11-01,1,2,3,4',
+ '2014-11-01,2,3,4,5',
])
nose.tools.assert_raises(
@@ -389,9 +390,9 @@
last_date)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,24276,0,0',
- '2014-11-02,48276,0,0',
- '2014-11-03,72276,0,0',
+ '2014-11-01,24276,24276,0,0',
+ '2014-11-02,48276,48276,0,0',
+ '2014-11-03,72276,72276,0,0',
])
def test_update_daily_forced_recomputation(self):
@@ -403,7 +404,7 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-01,1,2,3'
+ '2014-11-01,1,2,3,4'
])
aggregator.update_daily_per_project_csvs(
@@ -414,7 +415,7 @@
True)
self.assert_file_content_equals(enwiki_file_abs, [
- '2014-11-01,24276,0,0',
+ '2014-11-01,24276,24276,0,0',
])
def test_update_daily_forced_recomputation_missing_hours(self):
@@ -426,7 +427,7 @@
enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
self.create_file(enwiki_file_abs, [
- '2014-11-01,1,2,3'
+ '2014-11-01,1,2,3,4'
])
nose.tools.assert_raises(
@@ -501,7 +502,7 @@
for day_offset in range(-10, 0):
date = (yesterday + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- lines.append('%s,123456789,12345678,1234567' % (date_str))
+ lines.append('%s,135925923,123456789,12345678,123456' % (date_str))
self.create_file(enwiki_file_abs, lines)
issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
@@ -521,7 +522,7 @@
for day_offset in range(-10, 0):
date = (today + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- lines.append('%s,0,12345678,1234567' % (date_str))
+ lines.append('%s,13580245,0,12345678,1234567' % (date_str))
self.create_file(enwiki_file_abs, lines)
issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
@@ -541,7 +542,7 @@
for day_offset in range(-10, 0):
date = (today + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- lines.append('%s,123456789,0,1234567' % (date_str))
+ lines.append('%s,124691356,123456789,0,1234567' % (date_str))
self.create_file(enwiki_file_abs, lines)
issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
@@ -561,7 +562,7 @@
for day_offset in range(-10, 0):
date = (today + datetime.timedelta(days=day_offset))
date_str = date.isoformat()
- lines.append('%s,123456789,12345678,0' % (date_str))
+ lines.append('%s,135802467,123456789,12345678,0' % (date_str))
self.create_file(enwiki_file_abs, lines)
issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
@@ -570,6 +571,26 @@
# At least one issue, as enwiki has no reading for today
nose.tools.assert_greater_equal(len(issues), 1)
+ def test_validity_enwiki_total_does_not_add_up(self):
+ tmp_dir_abs = self.create_tmp_dir_abs()
+
+ self.create_valid_aggregated_projects(tmp_dir_abs)
+
+ enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+ today = datetime.date.today()
+ lines = []
+ for day_offset in range(-10, 0):
+ date = (today + datetime.timedelta(days=day_offset))
+ date_str = date.isoformat()
+ lines.append('%s,200000000,123456789,12345678,123456' % (date_str))
+ self.create_file(enwiki_file_abs, lines)
+
+ issues = aggregator.get_validity_issues_for_aggregated_projectcounts(
+ tmp_dir_abs)
+
+        # At least one issue, as the total is not the sum of the other columns.
+ nose.tools.assert_greater_equal(len(issues), 1)
+
def test_validity_valid(self):
tmp_dir_abs = self.create_tmp_dir_abs()
--
To view, visit https://gerrit.wikimedia.org/r/172237
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib62841200da246763efb6b9550df2b5af52be505
Gerrit-PatchSet: 1
Gerrit-Project: analytics/aggregator
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits