jenkins-bot has submitted this change and it was merged.
Change subject: Fix Aggregation of empty metric results
......................................................................
Fix Aggregation of empty metric results
When a metric that runs daily (like new register users)
does not return any results the agreggation step fails.
Relatedly, when a wiki level cohort runss through the system, the metric
and cohort service were not treating the group_by_project logic
correctly. This has also been fixed with a test to prove it.
Bug: 66439
Change-Id: I9d4f3250760b9b3a3dc60849dd88e21148d024a7
---
M tests/fixtures.py
M tests/test_api/test_cohort_service.py
A tests/test_metrics/test_edge_cases.py
M wikimetrics/metrics/bytes_added.py
M wikimetrics/metrics/metric.py
M wikimetrics/metrics/namespace_edits.py
M wikimetrics/metrics/newly_registered.py
M wikimetrics/metrics/pages_created.py
M wikimetrics/metrics/revert_rate.py
M wikimetrics/metrics/survival.py
M wikimetrics/metrics/threshold.py
M wikimetrics/models/report_nodes/aggregate_report.py
M wikimetrics/models/report_nodes/metric_report.py
M wikimetrics/models/storage/cohort.py
M wikimetrics/utils.py
15 files changed, 130 insertions(+), 29 deletions(-)
Approvals:
Nuria: Looks good to me, approved
jenkins-bot: Verified
diff --git a/tests/fixtures.py b/tests/fixtures.py
index 0d60ae1..c3dec56 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -538,7 +538,12 @@
self.session.add(basic_wiki_cohort)
self.session.commit()
self.basic_wiki_cohort = basic_wiki_cohort
-
+
+ owner = UserStore(username='test cohort owner', email='[email protected]')
+ self.session.add(owner)
+ self.session.commit()
+ self.owner_user_id = owner.id
+
cohort_user = CohortUserStore(
user_id=self.owner_user_id,
cohort_id=basic_wiki_cohort.id,
diff --git a/tests/test_api/test_cohort_service.py
b/tests/test_api/test_cohort_service.py
index 69a14f1..99a9609 100644
--- a/tests/test_api/test_cohort_service.py
+++ b/tests/test_api/test_cohort_service.py
@@ -1,7 +1,7 @@
from nose.tools import assert_equals, assert_true, raises
from sqlalchemy.orm.exc import NoResultFound
-from tests.fixtures import DatabaseTest
+from tests.fixtures import DatabaseTest, mediawiki_project
from wikimetrics.api import CohortService
from wikimetrics.exceptions import Unauthorized, InvalidCohort
from wikimetrics.models import CohortStore, CohortUserStore, CohortUserRole
@@ -116,3 +116,13 @@
def test_get_wikiusers_empty(self):
users = self.cohort_service.get_wikiusers(self.empty_cohort)
assert_equals(len(users), 0)
+
+ def test_wiki_cohort_group_by_project_works(self):
+ self.create_wiki_cohort()
+ c = self.cohort_service.get(
+ self.session, self.owner_user_id, by_id=self.basic_wiki_cohort.id
+ )
+ users_by_project = list(self.cohort_service.get_users_by_project(c))
+ assert_equals(len(users_by_project), 1)
+ assert_equals(users_by_project[0][0], mediawiki_project)
+ assert_equals(users_by_project[0][1], None)
diff --git a/tests/test_metrics/test_edge_cases.py
b/tests/test_metrics/test_edge_cases.py
new file mode 100644
index 0000000..2c6edb2
--- /dev/null
+++ b/tests/test_metrics/test_edge_cases.py
@@ -0,0 +1,53 @@
+from datetime import datetime, timedelta
+from nose.tools import assert_true, assert_equals, assert_false
+
+from tests.fixtures import QueueDatabaseTest
+from wikimetrics.metrics import NewlyRegistered
+from wikimetrics.metrics import (
+ metric_classes, NamespaceEdits, TimeseriesChoices,
+)
+from wikimetrics.utils import r
+from wikimetrics.models import Aggregation, AggregateReport
+
+
+class EdgeCasesTest(QueueDatabaseTest):
+ '''
+ Tests different cases for metric system when it comes to report results
+ '''
+ def setUp(self):
+ QueueDatabaseTest.setUp(self)
+
+ def test_aggregate_empty_results(self):
+ '''
+ Tests what happens when no users are
+ returned for the initial metric run
+ so there are no users to agreggate
+ '''
+ self.create_wiki_cohort()
+
+ metric = metric_classes['NamespaceEdits'](
+ name='NamespaceEdits',
+ namespaces=[0, 1, 2],
+ start_date='2010-01-01 00:00:00',
+ end_date='2010-01-02 00:00:00',
+ )
+ options = {
+ 'individualResults': True,
+ 'aggregateResults': True,
+ 'aggregateSum': True,
+ 'aggregateAverage': True,
+ 'aggregateStandardDeviation': True,
+ }
+
+ ar = AggregateReport(
+ metric,
+ self.basic_wiki_cohort,
+ options,
+ user_id=self.basic_wiki_cohort_owner,
+ )
+ result = ar.task.delay(ar).get()
+
+ assert_equals(result[Aggregation.IND].keys(), [])
+ assert_equals(result[Aggregation.SUM]['edits'], r(0))
+ assert_equals(result[Aggregation.AVG]['edits'], r(0))
+ assert_equals(result[Aggregation.STD]['edits'], r(0))
diff --git a/wikimetrics/metrics/bytes_added.py
b/wikimetrics/metrics/bytes_added.py
index 5310758..e486c49 100644
--- a/wikimetrics/metrics/bytes_added.py
+++ b/wikimetrics/metrics/bytes_added.py
@@ -60,11 +60,13 @@
) AS anon_1
GROUP BY anon_1.rev_user
"""
- show_in_ui = True
- id = 'bytes-added'
- label = 'Bytes Added'
- description = 'Compute different aggregations of the bytes\
- contributed or removed from a mediawiki project'
+ show_in_ui = True
+ id = 'bytes-added'
+ label = 'Bytes Added'
+ description = 'Compute different aggregations of the bytes\
+ contributed or removed from a mediawiki project'
+ # filled in below as the default depends on options
+ default_result = {}
namespaces = CommaSeparatedIntegerListField(
None,
@@ -151,6 +153,8 @@
)).label('negative_only_sum'),
)
index += 1
-
+
+ self.default_result = {s[0]: s[2] for s in submetrics}
+
query = self.apply_timeseries(bytes_added_by_user, rev=BC.c)
return self.results_by_user(user_ids, query, submetrics,
date_index=index)
diff --git a/wikimetrics/metrics/metric.py b/wikimetrics/metrics/metric.py
index 3135c9e..4d39059 100644
--- a/wikimetrics/metrics/metric.py
+++ b/wikimetrics/metrics/metric.py
@@ -13,13 +13,14 @@
class level properties that can be introspected by an interface.
"""
- show_in_ui = False
- id = None # unique identifier for client-side use
+ show_in_ui = False
+ id = None # unique identifier for client-side use
# this will be displayed as the title of the metric-specific
# tab in the request form
- label = None
- description = None # basic description of what the metric does
+ label = None
+ description = None # basic description of what the metric does
+ default_result = {} # if results are empty, default to this
def __call__(self, user_ids, session):
"""
diff --git a/wikimetrics/metrics/namespace_edits.py
b/wikimetrics/metrics/namespace_edits.py
index 11a5f23..b99bd1d 100644
--- a/wikimetrics/metrics/namespace_edits.py
+++ b/wikimetrics/metrics/namespace_edits.py
@@ -36,7 +36,10 @@
'Compute the number of edits in a specific'
'namespace of a mediawiki project'
)
-
+ default_result = {
+ 'edits': 0,
+ }
+
namespaces = CommaSeparatedIntegerListField(
None,
[Required()],
diff --git a/wikimetrics/metrics/newly_registered.py
b/wikimetrics/metrics/newly_registered.py
index 5d41f5d..74f9410 100644
--- a/wikimetrics/metrics/newly_registered.py
+++ b/wikimetrics/metrics/newly_registered.py
@@ -28,6 +28,9 @@
'A newly registered user is a previously unregistered user creating a
username \
for the first time on a Wikimedia project.'
)
+ default_result = {
+ 'newly_registered': 0,
+ }
start_date = BetterDateTimeField(default=thirty_days_ago)
end_date = BetterDateTimeField(default=today)
@@ -66,6 +69,6 @@
return metric_results
else:
return {
- uid: metric_results.get(uid, {NewlyRegistered.id : 0})
+ uid: metric_results.get(uid, self.default_result)
for uid in user_ids
}
diff --git a/wikimetrics/metrics/pages_created.py
b/wikimetrics/metrics/pages_created.py
index 64650cc..fedfd8c 100644
--- a/wikimetrics/metrics/pages_created.py
+++ b/wikimetrics/metrics/pages_created.py
@@ -33,7 +33,10 @@
'Compute the number of pages created by each \
editor in a time interval'
)
-
+ default_result = {
+ 'pages_created': 0,
+ }
+
namespaces = CommaSeparatedIntegerListField(
None,
[Required()],
diff --git a/wikimetrics/metrics/revert_rate.py
b/wikimetrics/metrics/revert_rate.py
index e593370..a699662 100644
--- a/wikimetrics/metrics/revert_rate.py
+++ b/wikimetrics/metrics/revert_rate.py
@@ -34,10 +34,11 @@
group by rev_user
"""
- show_in_ui = False
- id = 'revert-rate'
- label = 'Revert Rate'
- description = 'Compute the number of reverted edits in a mediawiki project'
+ show_in_ui = False
+ id = 'revert-rate'
+ label = 'Revert Rate'
+ description = 'Compute the number of reverted edits in a mediawiki
project'
+ default_result = {}
start_date = BetterDateTimeField(default=thirty_days_ago)
end_date = BetterDateTimeField(default=today)
diff --git a/wikimetrics/metrics/survival.py b/wikimetrics/metrics/survival.py
index c4050df..dc47989 100644
--- a/wikimetrics/metrics/survival.py
+++ b/wikimetrics/metrics/survival.py
@@ -62,6 +62,10 @@
If sunset_in_hours is 0, look for edits from \
<registration> + <survival_hours> to <today>.'
)
+ default_result = {
+ 'survived': None,
+ CENSORED: None,
+ }
number_of_edits = IntegerField(default=1)
survival_hours = IntegerField(default=0)
@@ -167,10 +171,7 @@
}
r = {
- uid: metric_results.get(uid, {
- Survival.id : None,
- CENSORED : None,
- })
+ uid: metric_results.get(uid, self.default_result)
for uid in user_ids or metric_results.keys()
}
diff --git a/wikimetrics/metrics/threshold.py b/wikimetrics/metrics/threshold.py
index 9b90a71..5940601 100644
--- a/wikimetrics/metrics/threshold.py
+++ b/wikimetrics/metrics/threshold.py
@@ -63,6 +63,11 @@
<registration> to <registration> + <threshold_hours>. \
Also compute the time it took them to reach that threshold, in hours.'
)
+ default_result = {
+ 'threshold': None,
+ 'time_to_threshold': None,
+ CENSORED: None,
+ }
number_of_edits = IntegerField(default=1)
threshold_hours = IntegerField(default=24)
diff --git a/wikimetrics/models/report_nodes/aggregate_report.py
b/wikimetrics/models/report_nodes/aggregate_report.py
index 677b3ad..75f0881 100644
--- a/wikimetrics/models/report_nodes/aggregate_report.py
+++ b/wikimetrics/models/report_nodes/aggregate_report.py
@@ -3,7 +3,7 @@
from collections import OrderedDict
from celery.utils.log import get_task_logger
-from wikimetrics.utils import stringify, CENSORED, r
+from wikimetrics.utils import stringify, CENSORED, NO_RESULTS, r
from report import ReportNode
from multi_project_metric_report import MultiProjectMetricReport
@@ -87,6 +87,8 @@
)
if self.individual:
+ if NO_RESULTS in results_by_user:
+ results_by_user = {}
aggregated_results[Aggregation.IND] = results_by_user
return aggregated_results
diff --git a/wikimetrics/models/report_nodes/metric_report.py
b/wikimetrics/models/report_nodes/metric_report.py
index 23e6841..d1e7b2a 100644
--- a/wikimetrics/models/report_nodes/metric_report.py
+++ b/wikimetrics/models/report_nodes/metric_report.py
@@ -1,9 +1,7 @@
from wikimetrics.configurables import db
from report import ReportLeaf
from wikimetrics.models.storage.wikiuser import WikiUserKey
-
-
-__all__ = ['MetricReport']
+from wikimetrics.utils import NO_RESULTS
class MetricReport(ReportLeaf):
@@ -23,7 +21,10 @@
"""
super(MetricReport, self).__init__(*args, **kwargs)
self.metric = metric
- self.user_ids = list(user_ids)
+ if user_ids is not None:
+ self.user_ids = list(user_ids)
+ else:
+ self.user_ids = None
self.cohort_id = cohort_id
self.project = project
@@ -31,10 +32,13 @@
session = db.get_mw_session(self.project)
try:
results_by_user = self.metric(self.user_ids, session)
- return {
+ results = {
str(WikiUserKey(key, self.project, self.cohort_id)) : value
for key, value in results_by_user.items()
}
+ if not len(results):
+ results = {NO_RESULTS : self.metric.default_result}
+ return results
finally:
session.close()
diff --git a/wikimetrics/models/storage/cohort.py
b/wikimetrics/models/storage/cohort.py
index a96e8bf..2785c22 100644
--- a/wikimetrics/models/storage/cohort.py
+++ b/wikimetrics/models/storage/cohort.py
@@ -98,6 +98,10 @@
).order_by(WikiUserStore.project).all()
finally:
db_session.close()
+
+ if not len(user_id_projects):
+ return [(self.default_project, None)]
+
groups = itertools.groupby(user_id_projects, key=itemgetter(1))
return (
diff --git a/wikimetrics/utils.py b/wikimetrics/utils.py
index 79a85a0..72ed8e0 100644
--- a/wikimetrics/utils.py
+++ b/wikimetrics/utils.py
@@ -14,6 +14,8 @@
PRETTY_TIMESTAMP = '%Y-%m-%d %H:%M:%S'
# This is used to mean that a result was censored in some way
CENSORED = 'censored'
+# Used to mean a metric is not returning any results
+NO_RESULTS = 'no-results'
# Unicode NULL
UNICODE_NULL = u'\x00'
--
To view, visit https://gerrit.wikimedia.org/r/138031
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9d4f3250760b9b3a3dc60849dd88e21148d024a7
Gerrit-PatchSet: 2
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Nuria <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits