QChris has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/172236

Change subject: Allow forcing recomputation of existing data
......................................................................

Allow forcing recomputation of existing data

Change-Id: Ida9d515ca5033e51a105624ed9f41325d658485a
---
M aggregator/projectcounts.py
M bin/aggregate_projectcounts
M tests/test_projectcounts.py
3 files changed, 55 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/aggregator refs/changes/36/172236/1

diff --git a/aggregator/projectcounts.py b/aggregator/projectcounts.py
index ef5969c..18d16ef 100644
--- a/aggregator/projectcounts.py
+++ b/aggregator/projectcounts.py
@@ -124,7 +124,7 @@
 
 
 def update_daily_per_project_csvs(source_dir_abs, target_dir_abs, first_date,
-                                  last_date):
+                                  last_date, force_recomputation=False):
     """Updates daily per project CSVs from hourly projectcounts files.
 
     The existing per project CSV files in target_dir_abs are updated with daily
@@ -142,6 +142,8 @@
     :param target_dir_abs: Absolute directory of the per project CSVs.
     :param first_date: The first date to compute non-existing data for.
     :param last_date: The last date to compute non-existing data for.
+    :param force_recomputation: If True, recompute data for the given days,
+        even if it is already in the CSV. (Default: False)
     """
     for csv_file_abs in glob.glob(os.path.join(target_dir_abs, '*.csv')):
         logging.info("Updating csv '%s'" % (csv_file_abs))
@@ -166,7 +168,7 @@
             date_str = date.isoformat()
             logging.debug("Updating csv '%s' for date '%s'" % (
                 dbname, str(date)))
-            if date_str not in csv_data:
+            if date_str not in csv_data or force_recomputation:
                 # desktop site
                 abbreviation = util.dbname_to_webstatscollector_abbreviation(
                     dbname, 'desktop')
diff --git a/bin/aggregate_projectcounts b/bin/aggregate_projectcounts
index 045b006..1ad7fd1 100755
--- a/bin/aggregate_projectcounts
+++ b/bin/aggregate_projectcounts
@@ -18,7 +18,7 @@
 
 Usage: aggregate_projectcounts [--source SOURCE_DIR] [--target TARGET_DIR]
            [--first-date FIRST_DATE] [--last-date LAST_DATE] [--date DATE]
-           [--log LOG_FILE] [--push-target] [-v ...] [--help]
+           [--log LOG_FILE] [--force] [--push-target] [-v ...] [--help]
 
 Options:
     -h, --help               Show this help message and exit.
@@ -34,6 +34,8 @@
                              [default: yesterday]
     --date DATE              Day to aggregate for (overrides --first-date, and
                              --last-date)
+    --force                  Force recomputation of given days, even if the CSV
+                             would already contain that data.
     --push-target            Assumes the target directory is a git repository,
                              and automatically hard reset it before the
                              aggregation, and commit and push after the
@@ -154,6 +156,8 @@
         logging.error("first_date '%s' is not before last_date '%s'" %
                       (first_date, last_date))
 
+    force_recomputation = arguments['--force']
+
     if not all_parameters_ok:
         logging.error("Parameters could not get parsed")
         sys.exit(1)
@@ -169,7 +173,8 @@
         source_dir_abs,
         target_dir_abs,
         first_date,
-        last_date
+        last_date,
+        force_recomputation
     )
 
     if arguments["--push-target"]:
diff --git a/tests/test_projectcounts.py b/tests/test_projectcounts.py
index 98ce3c7..b8c783a 100644
--- a/tests/test_projectcounts.py
+++ b/tests/test_projectcounts.py
@@ -394,6 +394,50 @@
             '2014-11-03,72276,0,0',
             ])
 
+    def test_update_daily_forced_recomputation(self):
+        fixture = os.path.join(FIXTURES_DIR_ABS,
+                               '2014-11-3days-enwiki-day-times-100-plus-hour')
+        date = datetime.date(2014, 11, 1)
+
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+        self.create_file(enwiki_file_abs, [
+            '2014-11-01,1,2,3'
+            ])
+
+        aggregator.update_daily_per_project_csvs(
+            fixture,
+            tmp_dir_abs,
+            date,
+            date,
+            True)
+
+        self.assert_file_content_equals(enwiki_file_abs, [
+            '2014-11-01,24276,0,0',
+            ])
+
+    def test_update_daily_forced_recomputation_missing_hours(self):
+        fixture = os.path.join(FIXTURES_DIR_ABS, '2014-11-missing-hours')
+
+        date = datetime.date(2014, 11, 1)
+
+        tmp_dir_abs = self.create_tmp_dir_abs()
+
+        enwiki_file_abs = os.path.join(tmp_dir_abs, 'enwiki.csv')
+        self.create_file(enwiki_file_abs, [
+            '2014-11-01,1,2,3'
+            ])
+
+        nose.tools.assert_raises(
+            RuntimeError,
+            aggregator.update_daily_per_project_csvs,
+            fixture,
+            tmp_dir_abs,
+            date,
+            date,
+            True)
+
     def test_validity_no_csvs(self):
         tmp_dir_abs = self.create_tmp_dir_abs()
 

-- 
To view, visit https://gerrit.wikimedia.org/r/172236
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ida9d515ca5033e51a105624ed9f41325d658485a
Gerrit-PatchSet: 1
Gerrit-Project: analytics/aggregator
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to