Milimetric has submitted this change and it was merged.

Change subject: pages created timeseries implemented
......................................................................
pages created timeseries implemented Change-Id: I16121f5324d2e6101ee23cfc41bcba72f692ec2e --- M tests/test_models/test_aggregate_report.py M wikimetrics/metrics/pages_created.py M wikimetrics/metrics/survivors.py M wikimetrics/models/report_nodes/aggregate_report.py 4 files changed, 200 insertions(+), 44 deletions(-) Approvals: Milimetric: Verified; Looks good to me, approved diff --git a/tests/test_models/test_aggregate_report.py b/tests/test_models/test_aggregate_report.py index 7a932d9..cb789ae 100644 --- a/tests/test_models/test_aggregate_report.py +++ b/tests/test_models/test_aggregate_report.py @@ -1,6 +1,8 @@ from decimal import Decimal from nose.tools import assert_equals, assert_true -from wikimetrics.metrics import metric_classes +from wikimetrics.metrics import ( + metric_classes, NamespaceEdits, TimeseriesChoices, +) from wikimetrics.models import ( Aggregation, AggregateReport, PersistentReport, Cohort, ) @@ -96,7 +98,6 @@ ) assert_equals( finished[ar.result_key][Aggregation.AVG]['edits'], - # TODO: Again, figure out this crazy "None" user id Decimal(1.25) ) assert_equals( @@ -125,6 +126,127 @@ assert_true(str(ar).find('AggregateReport') >= 0) + +class AggregateReportTimeseriesTest(QueueDatabaseTest): + + def setUp(self): + DatabaseTest.setUp(self) + self.create_test_cohort( + editor_count=4, + revisions_per_editor=4, + revision_timestamps=[ + [20121231230000, 20130101003000, 20130101010000, 20140101010000], + [20130101120000, 20130102000000, 20130102120000, 20130103120000], + [20130101000000, 20130108000000, 20130116000000, 20130216000000], + [20130101000000, 20130201000000, 20140101000000, 20140102000000], + ], + revision_lengths=10 + ) + + def test_timeseries_day(self): + metric = NamespaceEdits( + namespaces=[0], + start_date='2012-12-31 00:00:00', + end_date='2013-01-03 00:00:00', + timeseries=TimeseriesChoices.DAY, + ) + ar = AggregateReport( + self.cohort, + metric, + individual=True, + aggregate=True, + aggregate_sum=True, + 
aggregate_average=True, + aggregate_std_deviation=True, + user_id=self.test_user_id, + ) + results = ar.task.delay(ar).get() + + self.session.commit() + aggregate_key = self.session.query(PersistentReport)\ + .filter(PersistentReport.id == ar.persistent_id)\ + .one()\ + .result_key + + assert_equals( + results[aggregate_key][Aggregation.IND][0][self.editors[0].user_id]['edits'], + { + '2012-12-31 00:00:00' : 1, + '2013-01-01 00:00:00' : 2, + '2013-01-02 00:00:00' : 0, + } + ) + assert_equals( + results[aggregate_key][Aggregation.SUM]['edits'], + { + '2012-12-31 00:00:00' : 1, + '2013-01-01 00:00:00' : 5, + '2013-01-02 00:00:00' : 2, + } + ) + assert_equals( + results[aggregate_key][Aggregation.AVG]['edits'], + { + '2012-12-31 00:00:00' : Decimal(0.25), + '2013-01-01 00:00:00' : Decimal(1.25), + '2013-01-02 00:00:00' : Decimal(0.5), + } + ) + + def test_finish_timeseries(self): + metric = NamespaceEdits( + namespaces=[0], + start_date='2012-12-31 00:00:00', + end_date='2013-01-03 00:00:00', + timeseries=TimeseriesChoices.DAY, + ) + ar = AggregateReport( + self.cohort, + metric, + individual=True, + aggregate=True, + aggregate_sum=True, + aggregate_average=True, + aggregate_std_deviation=True, + user_id=self.test_user_id, + ) + + finished = ar.finish([ + { + 'namespace edits - fake cohort' : { + 1: {'edits': {'date1': 1, 'date2': 2}}, + 2: {'edits': {'date1': 0, 'date2': 1}}, + 3: {'edits': {'date1': 0, 'date2': 0}}, + None: {'edits': {'date1': None, 'date2': None}} + } + }, + { + 'some other metric - fake cohort' : { + 1: {'other_sub_metric': {'date3': Decimal(2.3), 'date4': 0}}, + 2: {'other_sub_metric': {'date3': 0, 'date4': Decimal(3.4)}}, + 3: {'other_sub_metric': {'date3': None, 'date4': None}}, + None: {'other_sub_metric': {'date3': None, 'date4': None}} + } + }, + ]) + + assert_equals( + finished[ar.result_key][Aggregation.SUM]['edits'], + {'date1': 1, 'date2': 3} + ) + assert_equals( + finished[ar.result_key][Aggregation.SUM]['other_sub_metric'], + {'date3': 
Decimal(2.3), 'date4': Decimal(3.4)} + ) + assert_equals( + finished[ar.result_key][Aggregation.AVG]['edits'], + {'date1': 0.25, 'date2': 0.75} + ) + assert_equals( + finished[ar.result_key][Aggregation.AVG]['other_sub_metric'], + {'date3': 0.575, 'date4': 0.85} + ) + # NOTE: a sample output of AggregateReport: #{ #'f5ca5afe-6b2d-4052-bd51-6cbeaeba5eb9': { diff --git a/wikimetrics/metrics/pages_created.py b/wikimetrics/metrics/pages_created.py index ac68888..8e8a30b 100644 --- a/wikimetrics/metrics/pages_created.py +++ b/wikimetrics/metrics/pages_created.py @@ -1,6 +1,6 @@ from ..utils import thirty_days_ago, today from sqlalchemy import func -from metric import Metric +from timeseries_metric import TimeseriesMetric from form_fields import CommaSeparatedIntegerListField, BetterDateTimeField from wtforms.validators import Required from wikimetrics.models import Page, Revision @@ -9,7 +9,7 @@ __all__ = ['PagesCreated'] -class PagesCreated(Metric): +class PagesCreated(TimeseriesMetric): """ This class counts the pages created by editors over a period of time. 
@@ -34,9 +34,6 @@ editor in a time interval' ) - start_date = BetterDateTimeField(default=thirty_days_ago) - end_date = BetterDateTimeField(default=today) - namespaces = CommaSeparatedIntegerListField( None, [Required()], @@ -57,20 +54,21 @@ start_date = self.start_date.data end_date = self.end_date.data - pages_by_user = dict( - session - .query(Revision.rev_user, func.count(Page.page_id)) - .join(Page) - .filter(Page.page_namespace.in_(self.namespaces.data)) - .filter(Revision.rev_parent_id == 0) - .filter(Revision.rev_user.in_(user_ids)) - .filter(Revision.rev_timestamp > start_date) - .filter(Revision.rev_timestamp <= end_date) + pages_by_user = session\ + .query(Revision.rev_user, func.count(Page.page_id))\ + .join(Page)\ + .filter(Page.page_namespace.in_(self.namespaces.data))\ + .filter(Revision.rev_parent_id == 0)\ + .filter(Revision.rev_user.in_(user_ids))\ + .filter(Revision.rev_timestamp > start_date)\ + .filter(Revision.rev_timestamp <= end_date)\ .group_by(Revision.rev_user) - .all() - ) - return { - user_id: {'pages_created': pages_by_user.get(user_id, 0)} - for user_id in user_ids - } + query = self.apply_timeseries(pages_by_user) + return self.results_by_user( + user_ids, + query, + [('pages_created', 1, 0)], + submetric_default=0, + date_index=2, + ) diff --git a/wikimetrics/metrics/survivors.py b/wikimetrics/metrics/survivors.py index d290e81..2ef8449 100644 --- a/wikimetrics/metrics/survivors.py +++ b/wikimetrics/metrics/survivors.py @@ -23,7 +23,7 @@ """ - show_in_ui = True + show_in_ui = False id = 'survivors' label = 'Survivors' description = ( diff --git a/wikimetrics/models/report_nodes/aggregate_report.py b/wikimetrics/models/report_nodes/aggregate_report.py index b7514cf..0f228e0 100644 --- a/wikimetrics/models/report_nodes/aggregate_report.py +++ b/wikimetrics/models/report_nodes/aggregate_report.py @@ -97,32 +97,68 @@ for results_by_user in list_of_results: for user_id in results_by_user.keys(): for key in results_by_user[user_id]: - if 
not key in aggregation: - aggregation[key] = 0 - helper[key] = dict() - helper[key]['sum'] = Decimal(0.0) - helper[key]['count'] = 0 - value = results_by_user[user_id][key] if not value: + # NOTE: value should never be None in a timeseries result value = Decimal(0) - helper[key]['sum'] += Decimal(value) - helper[key]['count'] += 1 + # handle timeseries aggregation + if isinstance(value, dict): + if not key in aggregation: + aggregation[key] = dict() + helper[key] = dict() + for subkey in value: + aggregation[key][subkey] = 0 + helper[key][subkey] = dict() + helper[key][subkey]['sum'] = Decimal(0.0) + helper[key][subkey]['count'] = 0 + + for subkey in value: + value_subkey = value[subkey] or Decimal(0) + + helper[key][subkey]['sum'] += Decimal(value_subkey) + helper[key][subkey]['count'] += 1 + + if type_of_aggregate == Aggregation.SUM: + aggregation[key][subkey] = round( + helper[key][subkey]['sum'], + 4 + ) + elif type_of_aggregate == Aggregation.AVG: + cummulative_sum = helper[key][subkey]['sum'] + count = helper[key][subkey]['count'] + aggregation[key][subkey] = round( + cummulative_sum / count, + 4 + ) + elif type_of_aggregate == Aggregation.STD: + aggregation[key][subkey] = 'Not Implemented' + pass - if type_of_aggregate == Aggregation.SUM: - aggregation[key] = round( - helper[key]['sum'], - 4 - ) - elif type_of_aggregate == Aggregation.AVG: - aggregation[key] = round( - helper[key]['sum'] / helper[key]['count'], - 4 - ) - elif type_of_aggregate == Aggregation.STD: - aggregation[key] = 'Not Implemented' - pass + # handle normal aggregation + else: + if not key in aggregation: + aggregation[key] = 0 + helper[key] = dict() + helper[key]['sum'] = Decimal(0.0) + helper[key]['count'] = 0 + + helper[key]['sum'] += Decimal(value) + helper[key]['count'] += 1 + + if type_of_aggregate == Aggregation.SUM: + aggregation[key] = round( + helper[key]['sum'], + 4 + ) + elif type_of_aggregate == Aggregation.AVG: + aggregation[key] = round( + helper[key]['sum'] / 
helper[key]['count'], + 4 + ) + elif type_of_aggregate == Aggregation.STD: + aggregation[key] = 'Not Implemented' + pass return aggregation -- To view, visit https://gerrit.wikimedia.org/r/84764 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I16121f5324d2e6101ee23cfc41bcba72f692ec2e Gerrit-PatchSet: 1 Gerrit-Project: analytics/wikimetrics Gerrit-Branch: master Gerrit-Owner: Milimetric <dandree...@wikimedia.org> Gerrit-Reviewer: Milimetric <dandree...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits