Milimetric has submitted this change and it was merged.
Change subject: Implementing Threshold metric
......................................................................
Implementing Threshold metric
Updating Survival metric
Adding fixes for flake8 problems
Analytics card 699 finish
Change-Id: I90e9efba482d404a89a4c090113f0882dfdd2067
---
M tests/test_metrics/test_survivors.py
M wikimetrics/metrics/__init__.py
M wikimetrics/metrics/survivors.py
A wikimetrics/metrics/threshold.py
4 files changed, 234 insertions(+), 171 deletions(-)
Approvals:
Milimetric: Verified; Looks good to me, approved
diff --git a/tests/test_metrics/test_survivors.py b/tests/test_metrics/test_survivors.py
index 5685ca9..93b52b2 100644
--- a/tests/test_metrics/test_survivors.py
+++ b/tests/test_metrics/test_survivors.py
@@ -10,6 +10,9 @@
)
+metric_name = Survivors.id
+
+
class SurvivorsTest(DatabaseWithSurvivorCohortTest):
def test_case1_24h_count1(self):
@@ -19,9 +22,9 @@
)
results = m(list(self.cohort), self.mwSession)
- assert_equal(results[self.mw_dan_id]["survivor"], True)
- assert_equal(results[self.mw_evan_id]["survivor"], True)
- assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+ assert_equal(results[self.mw_dan_id][metric_name], True)
+ assert_equal(results[self.mw_evan_id][metric_name], True)
+ assert_equal(results[self.mw_andrew_id][metric_name] , True)
def test_case1_72h_count1(self):
m = Survivors(
@@ -30,9 +33,9 @@
)
results = m(list(self.cohort), self.mwSession)
- assert_equal(results[self.mw_dan_id]["survivor"], False)
- assert_equal(results[self.mw_evan_id]["survivor"], False)
- assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+ assert_equal(results[self.mw_dan_id][metric_name], False)
+ assert_equal(results[self.mw_evan_id][metric_name], False)
+ assert_equal(results[self.mw_andrew_id][metric_name] , True)
def test_case1_24h_count3(self):
m = Survivors(
@@ -42,9 +45,9 @@
)
results = m(list(self.cohort), self.mwSession)
- assert_equal(results[self.mw_dan_id]["survivor"], False)
- assert_equal(results[self.mw_evan_id]["survivor"], False)
- assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+ assert_equal(results[self.mw_dan_id][metric_name], False)
+ assert_equal(results[self.mw_evan_id][metric_name], False)
+ assert_equal(results[self.mw_andrew_id][metric_name] , True)
def test_case2_24h_count3_sunset72h(self):
m = Survivors(
@@ -55,9 +58,9 @@
)
results = m(list(self.cohort), self.mwSession)
- assert_equal(results[self.mw_dan_id]["survivor"], False)
- assert_equal(results[self.mw_evan_id]["survivor"], False)
- assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+ assert_equal(results[self.mw_dan_id][metric_name], False)
+ assert_equal(results[self.mw_evan_id][metric_name], False)
+ assert_equal(results[self.mw_andrew_id][metric_name] , True)
def test_default(self):
m = Survivors(
@@ -66,9 +69,9 @@
results = m(list(self.cohort), self.mwSession)
#self.debug_query = m.debug_query
- assert_equal(results[self.mw_dan_id]["survivor"], True)
- assert_equal(results[self.mw_evan_id]["survivor"], True)
- assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+ assert_equal(results[self.mw_dan_id][metric_name], True)
+ assert_equal(results[self.mw_evan_id][metric_name], True)
+ assert_equal(results[self.mw_andrew_id][metric_name] , True)
def test_censored1(self):
@@ -82,10 +85,10 @@
)
results = m(list(self.cohort), self.mwSession)
assert_equal(results, {
- self.mw_dan_id: {'censored': 1, 'survivor': 0},
- self.mw_evan_id: {'censored': 1, 'survivor': 0},
- self.mw_andrew_id: {'censored': 1, 'survivor': 0},
- self.mw_diederik_id: {'censored': 0, 'survivor': 0},
+ self.mw_dan_id: {'censored': 1, metric_name: 0},
+ self.mw_evan_id: {'censored': 1, metric_name: 0},
+ self.mw_andrew_id: {'censored': 1, metric_name: 0},
+ self.mw_diederik_id: {'censored': 0, metric_name: 0},
})
def test_censored2(self):
@@ -115,8 +118,8 @@
results = m(list(self.cohort), self.mwSession)
assert_equal(results, {
- self.mw_dan_id: {'censored': 0, 'survivor': 0},
- self.mw_evan_id: {'censored': 0, 'survivor': 0},
- self.mw_andrew_id: {'censored': 0, 'survivor': 1},
- self.mw_diederik_id: {'censored': 0, 'survivor': 0},
+ self.mw_dan_id: {'censored': 0, metric_name: 0},
+ self.mw_evan_id: {'censored': 0, metric_name: 0},
+ self.mw_andrew_id: {'censored': 0, metric_name: 1},
+ self.mw_diederik_id: {'censored': 0, metric_name: 0},
})
diff --git a/wikimetrics/metrics/__init__.py b/wikimetrics/metrics/__init__.py
index 4fb24cc..56b1780 100644
--- a/wikimetrics/metrics/__init__.py
+++ b/wikimetrics/metrics/__init__.py
@@ -5,6 +5,7 @@
from revert_rate import *
from bytes_added import *
from pages_created import *
+from threshold import *
from survivors import *
# ignore flake8 because of F403 violation
diff --git a/wikimetrics/metrics/survivors.py b/wikimetrics/metrics/survivors.py
index 504e06b..94e4215 100644
--- a/wikimetrics/metrics/survivors.py
+++ b/wikimetrics/metrics/survivors.py
@@ -1,22 +1,17 @@
-import datetime
-import calendar
-from sqlalchemy import func, case, Integer
-from sqlalchemy.sql.expression import label, between, and_, or_
-
-from wikimetrics.models import Page, Revision, MediawikiUser
-from wikimetrics.utils import thirty_days_ago, today, CENSORED
-from metric import Metric
-from form_fields import CommaSeparatedIntegerListField, BetterDateTimeField
-from wtforms.validators import Required
from wtforms import BooleanField, IntegerField
+from wikimetrics.metrics import Threshold
__all__ = ['Survivors']
-class Survivors(Metric):
+class Survivors(Threshold):
"""
- This metric counts the survivors .
+ Survivor is a metric that determines whether an editor has performed a certain
+ activity at least n times in a specified time window. It is used to measure early
+ user activation (when t is measured from account creation) or
+ during a certain window of interest
+ (for example in an A/B test or a usability test for an editing gadget/feature)
The SQL query that inspired this metric was:
@@ -42,150 +37,18 @@
AND unix_timestamp(revision.rev_timestamp) -
unix_timestamp(user.user_registration)
BETWEEN
- <survival> AND <survival + sunset>
+ <survival> AND <now>
GROUP BY user.user_id
) AS rev_counts ON user.user_id = rev_count.user_id
WHERE user.user_id IN (<cohort>)
) AS revs
"""
-
- show_in_ui = True
- id = 'survival'
- label = 'Survival'
+ id = 'survival'
+ label = 'Survival'
description = (
'Compute whether editors "survived" by making <<number_of_edits>> edits from \
<<registration + survival hours>> to \
<<registration + survival hours + sunset hours>>. If <<sunset hours>> is 0, \
- look for edits until the current time.'
+ look for edits from registration up to today.'
)
-
- number_of_edits = IntegerField(default=1)
- survival_hours = IntegerField(default=0)
sunset_in_hours = IntegerField(default=0)
-
- namespaces = CommaSeparatedIntegerListField(
- None,
- [Required()],
- default='0',
- description='0, 2, 4, etc.',
- )
-
- def debug_print(self, r, session, user_ids):
- s = ''
- for uid in user_ids:
- if uid:
- user_name = session \
- .query(MediawikiUser.user_name) \
- .filter(MediawikiUser.user_id == uid) \
- .first()[0]
- s += '{0} ({1}) ===> [{2}] [{3}] \n'.format(
- user_name, str(uid), str(r[uid]['survivor']), str(r[uid][CENSORED])
- )
- print(s)
-
- def __call__(self, user_ids, session):
- """
- Parameters:
- user_ids : list of mediawiki user ids to find edit for
- session : sqlalchemy session open on a mediawiki database
-
- Returns:
- dictionary from user ids to the number of edit found.
- """
-
- survival_hours = int(self.survival_hours.data)
- sunset_in_hours = int(self.sunset_in_hours.data)
- number_of_edits = int(self.number_of_edits.data)
-
- revisions = session \
- .query(
- MediawikiUser.user_id,
- label('rev_count', func.count())
- ) \
- .join(Revision) \
- .join(Page) \
- .group_by(MediawikiUser.user_id) \
- .filter(MediawikiUser.user_id.in_(user_ids)) \
- .filter(Page.page_namespace.in_(self.namespaces.data))
-
- # sunset_in_hours is zero, so we use the first case [T+t,today]
- if sunset_in_hours == 0:
- revisions = revisions.filter(
- between(
- func.unix_timestamp(Revision.rev_timestamp) -
- func.unix_timestamp(MediawikiUser.user_registration)
- ,
- (survival_hours * 3600)
- ,
- func.unix_timestamp(func.now()) + 86400
- )
- )
- # otherwise use the sunset_in_hours [T+t,T+t+s]
- else:
- revisions = revisions.filter(
- between(
- func.unix_timestamp(Revision.rev_timestamp) -
- func.unix_timestamp(MediawikiUser.user_registration)
- ,
- (survival_hours * 3600)
- ,
- ((survival_hours + sunset_in_hours) * 3600)
- )
- )
-
- revisions = revisions.subquery()
- revs = session.query(
- MediawikiUser.user_id,
- MediawikiUser.user_registration,
- label(
- 'rev_count',
- func.coalesce(revisions.c.rev_count, 0)
- )
- ) \
- .outerjoin(revisions, MediawikiUser.user_id == revisions.c.user_id) \
- .filter(MediawikiUser.user_id.in_(user_ids)) \
- .subquery()
-
- metric = session.query(
- revs.c.user_id,
- func.unix_timestamp(func.now()),
- func.IF(
- func.unix_timestamp(func.now()) <
- func.unix_timestamp(revs.c.user_registration) +
- (survival_hours + sunset_in_hours) * 3600,
- 1, 0
- ),
- revs.c.rev_count,
- label('survived', func.IF(revs.c.rev_count >= number_of_edits, 1, 0)),
- label(CENSORED, func.IF(
- revs.c.rev_count >= number_of_edits,
- 0,
- func.IF(
- func.unix_timestamp(func.now()) <
- func.unix_timestamp(revs.c.user_registration) +
- (survival_hours + sunset_in_hours) * 3600,
- 1, 0
- )
- ))
- )
-
- data = metric.all()
-
- metric_results = {
- u.user_id: {
- 'survivor': u.survived,
- CENSORED: u.censored,
- }
- for u in data
- }
-
- r = {
- uid: metric_results.get(uid, {
- 'survivor': None,
- CENSORED: None,
- })
- for uid in user_ids
- }
-
- #self.debug_print(r, session, user_ids)
- return r
diff --git a/wikimetrics/metrics/threshold.py b/wikimetrics/metrics/threshold.py
new file mode 100644
index 0000000..052a1bb
--- /dev/null
+++ b/wikimetrics/metrics/threshold.py
@@ -0,0 +1,196 @@
+from wikimetrics.metrics import Metric
+import datetime
+import calendar
+from sqlalchemy import func, case, Integer
+from sqlalchemy.sql.expression import label, between, and_, or_
+
+from wikimetrics.models import Page, Revision, MediawikiUser
+from wikimetrics.utils import thirty_days_ago, today, CENSORED
+from form_fields import CommaSeparatedIntegerListField, BetterDateTimeField
+from wtforms.validators import Required
+from wtforms import BooleanField, IntegerField
+
+__all__ = ['Threshold']
+
+
+class Threshold(Metric):
+ """
+ Threshold is a metric that determines whether an editor has performed a certain
+ activity at least n times in a specified time window. It is used to measure early
+ user activation (when t is measured from account creation) or
+ during a certain window of interest
+ (for example in an A/B test or a usability test for an editing gadget/feature)
+
+ The SQL query that inspired this metric was:
+
+ SELECT revs.user_id AS revs_user_id,
+ IF(revs.rev_count >= 1, 1, 0) AS survived,
+ IF(revs.rev_count >= 1, 0, IF(unix_timestamp(now())
+ < unix_timestamp(revs.user_registration) + 2595600, 1, 0)) AS censored
+
+ FROM (SELECT user.user_id AS user_id,
+ user.user_registration AS user_registration,
+ coalesce(rev_counts.rev_count, 0) AS rev_count
+ FROM user
+ LEFT OUTER JOIN
+ (SELECT user.user_id AS user_id,
+ count(*) as rev_count
+ FROM user
+ INNER JOIN
+ revision ON user.user_id = revision.rev_user
+ INNER JOIN
+ page ON page.page_id = revision.rev_page
+ WHERE user.user_id IN (<cohort>)
+ AND page.page_namespace IN (0)
+ AND unix_timestamp(revision.rev_timestamp) -
+ unix_timestamp(user.user_registration)
+ BETWEEN
+ <survival> AND <now>
+ GROUP BY user.user_id
+ ) AS rev_counts ON user.user_id = rev_count.user_id
+ WHERE user.user_id IN (<cohort>)
+ ) AS revs
+ """
+
+ show_in_ui = True
+ id = 'threshold'
+ label = 'Threshold'
+ description = (
+ 'Compute whether editors "survived" if they have at least \
+ number_of_edits up to today.'
+ )
+
+ number_of_edits = IntegerField(default=1)
+ survival_hours = IntegerField(default=0)
+
+ namespaces = CommaSeparatedIntegerListField(
+ None,
+ [Required()],
+ default='0',
+ description='0, 2, 4, etc.',
+ )
+
+ def debug_print(self, r, session, user_ids):
+ s = ''
+ for uid in user_ids:
+ if uid:
+ user_name = session \
+ .query(MediawikiUser.user_name) \
+ .filter(MediawikiUser.user_id == uid) \
+ .first()[0]
+ s += '{0} ({1}) ===> [{2}] [{3}] \n'.format(
+ user_name, str(uid), str(r[uid]['survivor']), str(r[uid][CENSORED])
+ )
+ print(s)
+
+ def __call__(self, user_ids, session):
+ """
+ Parameters:
+ user_ids : list of mediawiki user ids to find edit for
+ session : sqlalchemy session open on a mediawiki database
+
+ Returns:
+ dictionary from user ids to the number of edit found.
+ """
+
+ survival_hours = int(self.survival_hours.data)
+
+ if self.sunset_in_hours:
+ sunset_in_hours = int(self.sunset_in_hours.data)
+ else:
+ sunset_in_hours = 0
+
+ number_of_edits = int(self.number_of_edits.data)
+
+ revisions = session \
+ .query(
+ MediawikiUser.user_id,
+ label('rev_count', func.count())
+ ) \
+ .join(Revision) \
+ .join(Page) \
+ .group_by(MediawikiUser.user_id) \
+ .filter(MediawikiUser.user_id.in_(user_ids)) \
+ .filter(Page.page_namespace.in_(self.namespaces.data))
+
+ # sunset_in_hours is zero, so we use the first case [T+t,today]
+ if sunset_in_hours == 0:
+ revisions = revisions.filter(
+ between(
+ func.unix_timestamp(Revision.rev_timestamp) -
+ func.unix_timestamp(MediawikiUser.user_registration)
+ ,
+ (survival_hours * 3600)
+ ,
+ func.unix_timestamp(func.now()) + 86400
+ )
+ )
+ # otherwise use the sunset_in_hours [T+t,T+t+s]
+ else:
+ revisions = revisions.filter(
+ between(
+ func.unix_timestamp(Revision.rev_timestamp) -
+ func.unix_timestamp(MediawikiUser.user_registration)
+ ,
+ (survival_hours * 3600)
+ ,
+ ((survival_hours + sunset_in_hours) * 3600)
+ )
+ )
+
+ revisions = revisions.subquery()
+ revs = session.query(
+ MediawikiUser.user_id,
+ MediawikiUser.user_registration,
+ label(
+ 'rev_count',
+ func.coalesce(revisions.c.rev_count, 0)
+ )
+ ) \
+ .outerjoin(revisions, MediawikiUser.user_id == revisions.c.user_id) \
+ .filter(MediawikiUser.user_id.in_(user_ids)) \
+ .subquery()
+
+ metric = session.query(
+ revs.c.user_id,
+ func.unix_timestamp(func.now()),
+ func.IF(
+ func.unix_timestamp(func.now()) <
+ func.unix_timestamp(revs.c.user_registration) +
+ (survival_hours + sunset_in_hours) * 3600,
+ 1, 0
+ ),
+ revs.c.rev_count,
+ label('survived', func.IF(revs.c.rev_count >= number_of_edits, 1, 0)),
+ label(CENSORED, func.IF(
+ revs.c.rev_count >= number_of_edits,
+ 0,
+ func.IF(
+ func.unix_timestamp(func.now()) <
+ func.unix_timestamp(revs.c.user_registration) +
+ (survival_hours + sunset_in_hours) * 3600,
+ 1, 0
+ )
+ ))
+ )
+
+ data = metric.all()
+
+ metric_results = {
+ u.user_id: {
+ self.id: u.survived,
+ CENSORED: u.censored,
+ }
+ for u in data
+ }
+
+ r = {
+ uid: metric_results.get(uid, {
+ self.id: None,
+ CENSORED: None,
+ })
+ for uid in user_ids
+ }
+
+ #self.debug_print(r, session, user_ids)
+ return r
--
To view, visit https://gerrit.wikimedia.org/r/87619
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I90e9efba482d404a89a4c090113f0882dfdd2067
Gerrit-PatchSet: 5
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Stefan.petrea <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits