QChris has uploaded a new change for review.
https://gerrit.wikimedia.org/r/172236
Change subject: Allow forcing recomputation of existing data
......................................................................
Allow forcing recomputation of existing data
Change-Id: Ida9d515ca5033e51a105624ed9f41325d658485a
---
M aggregator/projectcounts.py
M bin/aggregate_projectcounts
M tests/test_projectcounts.py
3 files changed, 55 insertions(+), 4 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/aggregator refs/changes/36/172236/1
diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py
index ef5969c..18d16ef 100644
--- a/aggregator/projectcounts.py
+++ b/aggregator/projectcounts.py
@@ -124,7 +124,7 @@
def update_daily_per_project_csvs(source_dir_abs, target_dir_abs, first_date,
- last_date):
+ last_date, force_recomputation=False):
"""Updates daily per project CSVs from hourly projectcounts files.
The existing per project CSV files in target_dir_abs are updated with daily
@@ -142,6 +142,8 @@
:param target_dir_abs: Absolute directory of the per project CSVs.
:param first_date: The first date to compute non-existing data for.
:param last_date: The last date to compute non-existing data for.
+ :param force_recomputation: If True, recompute data for the given days,
+ even if it is already in the CSV. (Default: False)
"""
for csv_file_abs in glob.glob(os.path.join(target_dir_abs, '*.csv')):
logging.info("Updating csv '%s'" % (csv_file_abs))
@@ -166,7 +168,7 @@
date_str = date.isoformat()
logging.debug("Updating csv '%s' for date '%s'" % (
dbname, str(date)))
- if date_str not in csv_data:
+ if date_str not in csv_data or force_recomputation:
# desktop site
abbreviation = util.dbname_to_webstatscollector_abbreviation(
dbname, 'desktop')
diff --git a/bin/aggregate_projectcounts b/bin/aggregate_projectcounts
index 045b006..1ad7fd1 100755
--- a/bin/aggregate_projectcounts
+++ b/bin/aggregate_projectcounts
@@ -18,7 +18,7 @@
Usage: aggregate_projectcounts [--source SOURCE_DIR] [--target TARGET_DIR]
[--first-date FIRST_DATE] [--last-date LAST_DATE] [--date DATE]
- [--log LOG_FILE] [--push-target] [-v ...] [--help]
+ [--log LOG_FILE] [--force] [--push-target] [-v ...] [--help]
Options:
-h, --help Show this help message and exit.
@@ -34,6 +34,8 @@
[default: yesterday]
--date DATE Day to aggregate for (overrides --first-date, and
--last-date)
+ --force Force recomputation of given days, even if the CSV
+ would already contain that data.
--push-target Assumes the target directory is a git repository,
and automatically hard reset it before the
aggregation, and commit and push after the
@@ -154,6 +156,8 @@
logging.error("first_date '%s' is not before last_date '%s'" %
(first_date, last_date))
+ force_recomputation = arguments['--force']
+
if not all_parameters_ok:
logging.error("Parameters could not get parsed")
sys.exit(1)
@@ -169,7 +173,8 @@
source_dir_abs,
target_dir_abs,
first_date,
- last_date
+ last_date,
+ force_recomputation
)
if arguments["--push-target"]:
diff --git a/tests/test_projectcounts.py b/tests/test_projectcounts.py
index 98ce3c7..b8c783a 100644
--- a/tests/test_projectcounts.py
+++ b/tests/test_projectcounts.py
@@ -394,6 +394,50 @@
'2014-11-03,72276,0,0',
])
+ def test_update_daily_forced_recomputation(self):
+ fixture = os.path.join(FIXTURES_DIR_ABS,
+ '2014-11-3days-enwiki-day-times-100-plus-hour')
+ date = datetime.date(2014, 11, 1)
+
+ tmp_dir_abs = self.create_tmp_dir_abs()
+
+ enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+ self.create_file(enwiki_file_abs, [
+ '2014-11-01,1,2,3'
+ ])
+
+ aggregator.update_daily_per_project_csvs(
+ fixture,
+ tmp_dir_abs,
+ date,
+ date,
+ True)
+
+ self.assert_file_content_equals(enwiki_file_abs, [
+ '2014-11-01,24276,0,0',
+ ])
+
+ def test_update_daily_forced_recomputation_missing_hours(self):
+ fixture = os.path.join(FIXTURES_DIR_ABS, '2014-11-missing-hours')
+
+ date = datetime.date(2014, 11, 1)
+
+ tmp_dir_abs = self.create_tmp_dir_abs()
+
+ enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+ self.create_file(enwiki_file_abs, [
+ '2014-11-01,1,2,3'
+ ])
+
+ nose.tools.assert_raises(
+ RuntimeError,
+ aggregator.update_daily_per_project_csvs,
+ fixture,
+ tmp_dir_abs,
+ date,
+ date,
+ True)
+
def test_validity_no_csvs(self):
tmp_dir_abs = self.create_tmp_dir_abs()
--
To view, visit https://gerrit.wikimedia.org/r/172236
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ida9d515ca5033e51a105624ed9f41325d658485a
Gerrit-PatchSet: 1
Gerrit-Project: analytics/aggregator
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits