Milimetric has uploaded a new change for review.
https://gerrit.wikimedia.org/r/91309
Change subject: not yet ready to merge, just checkpoint
......................................................................
not yet ready to merge, just checkpoint
Change-Id: Ia52efad15c18e954d93136ffffdfa879804179af
---
M tests/test_metrics/test_threshold.py
M wikimetrics/metrics/survival.py
M wikimetrics/metrics/threshold.py
3 files changed, 92 insertions(+), 54 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/wikimetrics
refs/changes/09/91309/1
diff --git a/tests/test_metrics/test_threshold.py
b/tests/test_metrics/test_threshold.py
index 2146114..73edb4d 100644
--- a/tests/test_metrics/test_threshold.py
+++ b/tests/test_metrics/test_threshold.py
@@ -20,7 +20,7 @@
DatabaseTest.setUp(self)
self.create_test_cohort(
editor_count=2,
- revisions_per_editor=3,
+ revisions_per_editor=2,
user_registrations=i(reg),
revision_timestamps=[
[i(reg + one_hour) , i(reg + one_hour * 25)],
@@ -31,7 +31,7 @@
def test_case1_24h_count1(self):
m = Threshold(
- namespaces=[self.survivors_namespace],
+ namespaces=[0],
)
results = m(list(self.cohort), self.mwSession)
@@ -42,7 +42,7 @@
def test_case1_72h_count1(self):
m = Threshold(
- namespaces=[self.survivors_namespace],
+ namespaces=[0],
threshold_hours=3 * 24,
)
results = m(list(self.cohort), self.mwSession)
@@ -54,11 +54,12 @@
def test_case1_72h_count3(self):
m = Threshold(
- namespaces=[self.survivors_namespace],
+ namespaces=[0],
threshold_hours=3 * 24,
number_of_edits=3,
)
results = m(list(self.cohort), self.mwSession)
+ print results
assert_equal(results[self.editors[0].user_id][Threshold.id], False)
assert_equal(results[self.editors[1].user_id][Threshold.id], False)
@@ -67,11 +68,12 @@
def test_case1_24h_count3(self):
m = Threshold(
- namespaces=[self.survivors_namespace],
+ namespaces=[0],
threshold_hours=1 * 24,
number_of_edits=3,
)
results = m(list(self.cohort), self.mwSession)
+ print results
assert_equal(results[self.editors[0].user_id][Threshold.id], False)
assert_equal(results[self.editors[1].user_id][Threshold.id], False)
diff --git a/wikimetrics/metrics/survival.py b/wikimetrics/metrics/survival.py
index d6602e9..3904c12 100644
--- a/wikimetrics/metrics/survival.py
+++ b/wikimetrics/metrics/survival.py
@@ -88,14 +88,7 @@
"""
survival_hours = int(self.survival_hours.data)
-
- # NOTE: this kind of breaks OOP because it handles a child class need
- # in the parent class. TODO: fix
- if hasattr(self, 'sunset_in_hours'):
- sunset_in_hours = int(self.sunset_in_hours.data)
- else:
- sunset_in_hours = 0
-
+ sunset_in_hours = int(self.sunset_in_hours.data)
number_of_edits = int(self.number_of_edits.data)
revisions = session \
@@ -157,7 +150,7 @@
1, 0
),
revs.c.rev_count,
- label('metric_result', func.IF(revs.c.rev_count >=
number_of_edits, 1, 0)),
+ label(Survival.id, func.IF(revs.c.rev_count >= number_of_edits, 1,
0)),
label(CENSORED, func.IF(
revs.c.rev_count >= number_of_edits,
0,
@@ -174,16 +167,16 @@
metric_results = {
u.user_id: {
- self.id: u.metric_result,
- CENSORED: u.censored,
+ Survival.id : u.survived,
+ CENSORED : u.censored,
}
for u in data
}
r = {
uid: metric_results.get(uid, {
- self.id: None,
- CENSORED: None,
+ Survival.id : None,
+ CENSORED : None,
})
for uid in user_ids
}
diff --git a/wikimetrics/metrics/threshold.py b/wikimetrics/metrics/threshold.py
index 63cc5d2..03eafcf 100644
--- a/wikimetrics/metrics/threshold.py
+++ b/wikimetrics/metrics/threshold.py
@@ -2,6 +2,7 @@
import datetime
import calendar
from sqlalchemy import func, case, Integer
+from sqlalchemy.orm import aliased
from sqlalchemy.sql.expression import label, between, and_, or_
from wikimetrics.models import Page, Revision, MediawikiUser
@@ -19,43 +20,14 @@
Threshold is a metric that determines whether an editor has performed >= n
edits
in a specified time window. It is used to measure early user activation.
It also
computes the time it took a user to reach the threshold, if they did.
-
- The SQL query that inspired this metric was:
-
- SELECT revs.user_id AS revs_user_id,
- IF(revs.rev_count >= <number_of_edits>, 1, 0)
- AS threshold,
- IF(revs.rev_count >= <number_of_edits>, 0, IF(unix_timestamp(now())
- < unix_timestamp(revs.user_registration) + <threshold_hours>, 1,
0))
- AS censored
-
- FROM (SELECT user.user_id AS user_id,
- user.user_registration AS user_registration,
- coalesce(rev_counts.rev_count, 0) AS rev_count
- FROM user
- LEFT OUTER JOIN
- (SELECT user.user_id AS user_id,
- count(*) as rev_count
- FROM user
- INNER JOIN
- revision ON user.user_id = revision.rev_user
- INNER JOIN
- page ON page.page_id = revision.rev_page
- WHERE user.user_id IN (<cohort>)
- AND page.page_namespace IN (<namespaces>)
- AND unix_timestamp(revision.rev_timestamp) -
- unix_timestamp(user.user_registration)
- <= <threshold_hours>
- GROUP BY user.user_id
- ) AS rev_counts ON user.user_id = rev_count.user_id
- WHERE user.user_id IN (<cohort>)
- ) AS revs
-
- And the Time to Threshold sub-metric is also computed by this metric.
This is the number
+ Time to Threshold is also computed by this metric. This is the number
of hours that it took an editor to reach exactly n edits. If the editor
did not
- reach the threshold, None is returned. The inspiration SQL for this is:
+ reach the threshold, None is returned.
+
+ The SQL that inspired this metric was:
SELECT user_id,
+ IF(rev_timestamp is not null, 1, 0) as threshold,
IF(rev_timestamp is not null,
(unix_timestamp(rev_timestamp) -
unix_timestamp(user_registration)) / 3600,
null
@@ -64,7 +36,7 @@
r1.rev_timestamp,
COUNT(*) AS number
FROM revision r1
- JOIN
+ INNER JOIN
revision r2 ON r1.rev_user = r2.rev_user
AND r1.rev_timestamp >= r2.rev_timestamp
WHERE user.user_id IN (<cohort>)
@@ -114,4 +86,75 @@
'censored': 1 for True, 0 for False
}
"""
- return 'Not Implemented'
+
+ threshold_hours = int(self.threshold_hours.data)
+ threshold_secs = threshold_hours * 3600
+ number_of_edits = int(self.number_of_edits.data)
+
+ Revision2 = aliased(Revision, name='r2')
+ ordered_revisions = session \
+ .query(
+ Revision.rev_user,
+ Revision.rev_timestamp,
+ label('number', func.count()),
+ ) \
+ .join(MediawikiUser) \
+ .join(Page) \
+ .join(
+ Revision2,
+ Revision.rev_user == Revision2.rev_user and
+ Revision.rev_timestamp >= Revision2.rev_timestamp
+ ) \
+ .group_by(Revision.rev_user) \
+ .group_by(Revision.rev_timestamp) \
+ .filter(Revision.rev_user.in_(user_ids)) \
+ .filter(Page.page_namespace.in_(self.namespaces.data)) \
+ .filter(
+ func.unix_timestamp(Revision.rev_timestamp) -
+ func.unix_timestamp(MediawikiUser.user_registration) <=
threshold_secs
+ )
+
+ o_r = ordered_revisions.subquery()
+
+ metric = session.query(
+ MediawikiUser.user_id,
+ label(
+ Threshold.id,
+ func.IF(o_r.c.rev_timestamp != None, 1, 0)
+ ),
+ label(
+ 'time_to_threshold',
+ func.IF(
+ o_r.c.rev_timestamp != None,
+ (func.unix_timestamp(o_r.c.rev_timestamp) -
+ func.unix_timestamp(MediawikiUser.user_registration))
/ 3600,
+ 0
+ )
+ ),
+ label(CENSORED, func.IF(
+ o_r.c.rev_timestamp != None,
+ 0,
+ func.IF(
+ func.unix_timestamp(MediawikiUser.user_registration) +
threshold_secs
+ >
+ func.unix_timestamp(func.now()),
+ 1,
+ 0
+ )
+ ))
+ ) \
+ .outerjoin(
+ o_r,
+ MediawikiUser.user_id == o_r.c.rev_user and
+ o_r.c.number == number_of_edits
+ ) \
+ .filter(MediawikiUser.user_id.in_(user_ids))
+
+ return {
+ u.user_id: {
+ Threshold.id : u.threshold,
+ 'time_to_threshold' : u.time_to_threshold,
+ CENSORED : u.censored,
+ }
+ for u in metric.all()
+ }
--
To view, visit https://gerrit.wikimedia.org/r/91309
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia52efad15c18e954d93136ffffdfa879804179af
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits