Milimetric has submitted this change and it was merged.

Change subject: Implementing Threshold metric
......................................................................


Implementing Threshold metric

Updating Survival metric
Adding fixes for flake8 problems
Analytics card 699 finish

Change-Id: I90e9efba482d404a89a4c090113f0882dfdd2067
---
M tests/test_metrics/test_survivors.py
M wikimetrics/metrics/__init__.py
M wikimetrics/metrics/survivors.py
A wikimetrics/metrics/threshold.py
4 files changed, 234 insertions(+), 171 deletions(-)

Approvals:
  Milimetric: Verified; Looks good to me, approved



diff --git a/tests/test_metrics/test_survivors.py b/tests/test_metrics/test_survivors.py
index 5685ca9..93b52b2 100644
--- a/tests/test_metrics/test_survivors.py
+++ b/tests/test_metrics/test_survivors.py
@@ -10,6 +10,9 @@
 )
 
 
+metric_name = Survivors.id
+
+
 class SurvivorsTest(DatabaseWithSurvivorCohortTest):
     
     def test_case1_24h_count1(self):
@@ -19,9 +22,9 @@
         )
         results = m(list(self.cohort), self.mwSession)
 
-        assert_equal(results[self.mw_dan_id]["survivor"], True)
-        assert_equal(results[self.mw_evan_id]["survivor"], True)
-        assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+        assert_equal(results[self.mw_dan_id][metric_name], True)
+        assert_equal(results[self.mw_evan_id][metric_name], True)
+        assert_equal(results[self.mw_andrew_id][metric_name] , True)
     
     def test_case1_72h_count1(self):
         m = Survivors(
@@ -30,9 +33,9 @@
         )
         results = m(list(self.cohort), self.mwSession)
 
-        assert_equal(results[self.mw_dan_id]["survivor"], False)
-        assert_equal(results[self.mw_evan_id]["survivor"], False)
-        assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+        assert_equal(results[self.mw_dan_id][metric_name], False)
+        assert_equal(results[self.mw_evan_id][metric_name], False)
+        assert_equal(results[self.mw_andrew_id][metric_name] , True)
     
     def test_case1_24h_count3(self):
         m = Survivors(
@@ -42,9 +45,9 @@
         )
         results = m(list(self.cohort), self.mwSession)
 
-        assert_equal(results[self.mw_dan_id]["survivor"], False)
-        assert_equal(results[self.mw_evan_id]["survivor"], False)
-        assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+        assert_equal(results[self.mw_dan_id][metric_name], False)
+        assert_equal(results[self.mw_evan_id][metric_name], False)
+        assert_equal(results[self.mw_andrew_id][metric_name] , True)
     
     def test_case2_24h_count3_sunset72h(self):
         m = Survivors(
@@ -55,9 +58,9 @@
         )
         results = m(list(self.cohort), self.mwSession)
 
-        assert_equal(results[self.mw_dan_id]["survivor"], False)
-        assert_equal(results[self.mw_evan_id]["survivor"], False)
-        assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+        assert_equal(results[self.mw_dan_id][metric_name], False)
+        assert_equal(results[self.mw_evan_id][metric_name], False)
+        assert_equal(results[self.mw_andrew_id][metric_name] , True)
     
     def test_default(self):
         m = Survivors(
@@ -66,9 +69,9 @@
         results = m(list(self.cohort), self.mwSession)
         #self.debug_query = m.debug_query
 
-        assert_equal(results[self.mw_dan_id]["survivor"], True)
-        assert_equal(results[self.mw_evan_id]["survivor"], True)
-        assert_equal(results[self.mw_andrew_id]["survivor"] , True)
+        assert_equal(results[self.mw_dan_id][metric_name], True)
+        assert_equal(results[self.mw_evan_id][metric_name], True)
+        assert_equal(results[self.mw_andrew_id][metric_name] , True)
     
     def test_censored1(self):
         
@@ -82,10 +85,10 @@
         )
         results = m(list(self.cohort), self.mwSession)
         assert_equal(results, {
-            self.mw_dan_id: {'censored': 1, 'survivor': 0},
-            self.mw_evan_id: {'censored': 1, 'survivor': 0},
-            self.mw_andrew_id: {'censored': 1, 'survivor': 0},
-            self.mw_diederik_id: {'censored': 0, 'survivor': 0},
+            self.mw_dan_id: {'censored': 1, metric_name: 0},
+            self.mw_evan_id: {'censored': 1, metric_name: 0},
+            self.mw_andrew_id: {'censored': 1, metric_name: 0},
+            self.mw_diederik_id: {'censored': 0, metric_name: 0},
         })
 
     def test_censored2(self):
@@ -115,8 +118,8 @@
         results = m(list(self.cohort), self.mwSession)
 
         assert_equal(results, {
-            self.mw_dan_id: {'censored': 0, 'survivor': 0},
-            self.mw_evan_id: {'censored': 0, 'survivor': 0},
-            self.mw_andrew_id: {'censored': 0, 'survivor': 1},
-            self.mw_diederik_id: {'censored': 0, 'survivor': 0},
+            self.mw_dan_id: {'censored': 0, metric_name: 0},
+            self.mw_evan_id: {'censored': 0, metric_name: 0},
+            self.mw_andrew_id: {'censored': 0, metric_name: 1},
+            self.mw_diederik_id: {'censored': 0, metric_name: 0},
         })
diff --git a/wikimetrics/metrics/__init__.py b/wikimetrics/metrics/__init__.py
index 4fb24cc..56b1780 100644
--- a/wikimetrics/metrics/__init__.py
+++ b/wikimetrics/metrics/__init__.py
@@ -5,6 +5,7 @@
 from revert_rate import *
 from bytes_added import *
 from pages_created import *
+from threshold import *
 from survivors import *
 
 # ignore flake8 because of F403 violation
diff --git a/wikimetrics/metrics/survivors.py b/wikimetrics/metrics/survivors.py
index 504e06b..94e4215 100644
--- a/wikimetrics/metrics/survivors.py
+++ b/wikimetrics/metrics/survivors.py
@@ -1,22 +1,17 @@
-import datetime
-import calendar
-from sqlalchemy import func, case, Integer
-from sqlalchemy.sql.expression import label, between, and_, or_
-
-from wikimetrics.models import Page, Revision, MediawikiUser
-from wikimetrics.utils import thirty_days_ago, today, CENSORED
-from metric import Metric
-from form_fields import CommaSeparatedIntegerListField, BetterDateTimeField
-from wtforms.validators import Required
 from wtforms import BooleanField, IntegerField
+from wikimetrics.metrics import Threshold
 
 
 __all__ = ['Survivors']
 
 
-class Survivors(Metric):
+class Survivors(Threshold):
     """
-    This metric counts the survivors .
+    Survivor is a metric that determines whether an editor has performed a certain
+    activity at least n times in a specified time window. It is used to measure early
+    user activation (when t is measured from account creation) or
+    during a certain window of interest
+    (for example in an A/B test or a usability test for an editing gadget/feature)
     
     The SQL query that inspired this metric was:
     
@@ -42,150 +37,18 @@
                     AND unix_timestamp(revision.rev_timestamp) -
                         unix_timestamp(user.user_registration)
                             BETWEEN
-                        <survival> AND <survival + sunset>
+                        <survival> AND <now>
                   GROUP BY user.user_id
                 ) AS rev_counts     ON user.user_id = rev_count.user_id
           WHERE user.user_id IN (<cohort>)
         ) AS revs
     """
-    
-    show_in_ui  = True
-    id          = 'survival'
-    label       = 'Survival'
+    id = 'survival'
+    label = 'Survival'
     description = (
         'Compute whether editors "survived" by making <<number_of_edits>> 
edits from \
         <<registration + survival hours>> to \
         <<registration + survival hours + sunset hours>>.  If <<sunset hours>> 
is 0, \
-        look for edits until the current time.'
+        look for edits from registration up to today.'
     )
-    
-    number_of_edits       = IntegerField(default=1)
-    survival_hours        = IntegerField(default=0)
     sunset_in_hours       = IntegerField(default=0)
-    
-    namespaces = CommaSeparatedIntegerListField(
-        None,
-        [Required()],
-        default='0',
-        description='0, 2, 4, etc.',
-    )
-    
-    def debug_print(self, r, session, user_ids):
-        s = ''
-        for uid in user_ids:
-            if uid:
-                user_name = session \
-                    .query(MediawikiUser.user_name) \
-                    .filter(MediawikiUser.user_id == uid) \
-                    .first()[0]
-                s += '{0} ({1}) ===> [{2}] [{3}] \n'.format(
-                    user_name, str(uid), str(r[uid]['survivor']), 
str(r[uid][CENSORED])
-                )
-        print(s)
-    
-    def __call__(self, user_ids, session):
-        """
-        Parameters:
-            user_ids    : list of mediawiki user ids to find edit for
-            session     : sqlalchemy session open on a mediawiki database
-        
-        Returns:
-            dictionary from user ids to the number of edit found.
-        """
-
-        survival_hours = int(self.survival_hours.data)
-        sunset_in_hours = int(self.sunset_in_hours.data)
-        number_of_edits = int(self.number_of_edits.data)
-        
-        revisions = session \
-            .query(
-                MediawikiUser.user_id,
-                label('rev_count', func.count())
-            ) \
-            .join(Revision) \
-            .join(Page) \
-            .group_by(MediawikiUser.user_id) \
-            .filter(MediawikiUser.user_id.in_(user_ids)) \
-            .filter(Page.page_namespace.in_(self.namespaces.data))
-        
-        # sunset_in_hours is zero, so we use the first case [T+t,today]
-        if sunset_in_hours == 0:
-            revisions = revisions.filter(
-                between(
-                    func.unix_timestamp(Revision.rev_timestamp) -
-                    func.unix_timestamp(MediawikiUser.user_registration)
-                    ,
-                    (survival_hours * 3600)
-                    ,
-                    func.unix_timestamp(func.now()) + 86400
-                )
-            )
-        # otherwise use the sunset_in_hours [T+t,T+t+s]
-        else:
-            revisions = revisions.filter(
-                between(
-                    func.unix_timestamp(Revision.rev_timestamp) -
-                    func.unix_timestamp(MediawikiUser.user_registration)
-                    ,
-                    (survival_hours * 3600)
-                    ,
-                    ((survival_hours + sunset_in_hours) * 3600)
-                )
-            )
-        
-        revisions = revisions.subquery()
-        revs = session.query(
-            MediawikiUser.user_id,
-            MediawikiUser.user_registration,
-            label(
-                'rev_count',
-                func.coalesce(revisions.c.rev_count, 0)
-            )
-        ) \
-            .outerjoin(revisions, MediawikiUser.user_id == 
revisions.c.user_id) \
-            .filter(MediawikiUser.user_id.in_(user_ids)) \
-            .subquery()
-        
-        metric = session.query(
-            revs.c.user_id,
-            func.unix_timestamp(func.now()),
-            func.IF(
-                func.unix_timestamp(func.now()) <
-                func.unix_timestamp(revs.c.user_registration) +
-                (survival_hours + sunset_in_hours) * 3600,
-                1, 0
-            ),
-            revs.c.rev_count,
-            label('survived', func.IF(revs.c.rev_count >= number_of_edits, 1, 
0)),
-            label(CENSORED, func.IF(
-                revs.c.rev_count >= number_of_edits,
-                0,
-                func.IF(
-                    func.unix_timestamp(func.now()) <
-                    func.unix_timestamp(revs.c.user_registration) +
-                    (survival_hours + sunset_in_hours) * 3600,
-                    1, 0
-                )
-            ))
-        )
-        
-        data = metric.all()
-        
-        metric_results = {
-            u.user_id: {
-                'survivor': u.survived,
-                CENSORED: u.censored,
-            }
-            for u in data
-        }
-
-        r = {
-            uid: metric_results.get(uid, {
-                'survivor': None,
-                CENSORED: None,
-            })
-            for uid in user_ids
-        }
-
-        #self.debug_print(r, session, user_ids)
-        return r
diff --git a/wikimetrics/metrics/threshold.py b/wikimetrics/metrics/threshold.py
new file mode 100644
index 0000000..052a1bb
--- /dev/null
+++ b/wikimetrics/metrics/threshold.py
@@ -0,0 +1,196 @@
+from wikimetrics.metrics import Metric
+import datetime
+import calendar
+from sqlalchemy import func, case, Integer
+from sqlalchemy.sql.expression import label, between, and_, or_
+
+from wikimetrics.models import Page, Revision, MediawikiUser
+from wikimetrics.utils import thirty_days_ago, today, CENSORED
+from form_fields import CommaSeparatedIntegerListField, BetterDateTimeField
+from wtforms.validators import Required
+from wtforms import BooleanField, IntegerField
+
+__all__ = ['Threshold']
+
+
+class Threshold(Metric):
+    """
+    Threshold is a metric that determines whether an editor has performed a certain
+    activity at least n times in a specified time window. It is used to measure early
+    user activation (when t is measured from account creation) or
+    during a certain window of interest
+    (for example in an A/B test or a usability test for an editing gadget/feature)
+    
+    The SQL query that inspired this metric was:
+    
+ SELECT revs.user_id AS revs_user_id,
+        IF(revs.rev_count >= 1, 1, 0) AS survived,
+        IF(revs.rev_count >= 1, 0, IF(unix_timestamp(now())
+            < unix_timestamp(revs.user_registration) + 2595600, 1, 0)) AS censored
+
+   FROM (SELECT user.user_id AS user_id,
+                user.user_registration AS user_registration,
+                coalesce(rev_counts.rev_count, 0) AS rev_count
+           FROM user
+                        LEFT OUTER JOIN
+                (SELECT user.user_id AS user_id,
+                        count(*) as rev_count
+                   FROM user
+                                INNER JOIN
+                        revision    ON user.user_id = revision.rev_user
+                                INNER JOIN
+                        page        ON page.page_id = revision.rev_page
+                  WHERE user.user_id IN (<cohort>)
+                    AND page.page_namespace IN (0)
+                    AND unix_timestamp(revision.rev_timestamp) -
+                        unix_timestamp(user.user_registration)
+                            BETWEEN
+                        <survival> AND <now>
+                  GROUP BY user.user_id
+                ) AS rev_counts     ON user.user_id = rev_count.user_id
+          WHERE user.user_id IN (<cohort>)
+        ) AS revs
+    """
+    
+    show_in_ui  = True
+    id          = 'threshold'
+    label       = 'Threshold'
+    description = (
+        'Compute whether editors "survived" if they have at least \
+         number_of_edits up to today.'
+    )
+    
+    number_of_edits       = IntegerField(default=1)
+    survival_hours        = IntegerField(default=0)
+    
+    namespaces = CommaSeparatedIntegerListField(
+        None,
+        [Required()],
+        default='0',
+        description='0, 2, 4, etc.',
+    )
+    
+    def debug_print(self, r, session, user_ids):
+        s = ''
+        for uid in user_ids:
+            if uid:
+                user_name = session \
+                    .query(MediawikiUser.user_name) \
+                    .filter(MediawikiUser.user_id == uid) \
+                    .first()[0]
+                s += '{0} ({1}) ===> [{2}] [{3}] \n'.format(
+                    user_name, str(uid), str(r[uid]['survivor']), 
str(r[uid][CENSORED])
+                )
+        print(s)
+    
+    def __call__(self, user_ids, session):
+        """
+        Parameters:
+            user_ids    : list of mediawiki user ids to find edit for
+            session     : sqlalchemy session open on a mediawiki database
+        
+        Returns:
+            dictionary from user ids to the number of edit found.
+        """
+
+        survival_hours = int(self.survival_hours.data)
+
+        if self.sunset_in_hours:
+            sunset_in_hours = int(self.sunset_in_hours.data)
+        else:
+            sunset_in_hours = 0
+
+        number_of_edits = int(self.number_of_edits.data)
+        
+        revisions = session \
+            .query(
+                MediawikiUser.user_id,
+                label('rev_count', func.count())
+            ) \
+            .join(Revision) \
+            .join(Page) \
+            .group_by(MediawikiUser.user_id) \
+            .filter(MediawikiUser.user_id.in_(user_ids)) \
+            .filter(Page.page_namespace.in_(self.namespaces.data))
+        
+        # sunset_in_hours is zero, so we use the first case [T+t,today]
+        if sunset_in_hours == 0:
+            revisions = revisions.filter(
+                between(
+                    func.unix_timestamp(Revision.rev_timestamp) -
+                    func.unix_timestamp(MediawikiUser.user_registration)
+                    ,
+                    (survival_hours * 3600)
+                    ,
+                    func.unix_timestamp(func.now()) + 86400
+                )
+            )
+        # otherwise use the sunset_in_hours [T+t,T+t+s]
+        else:
+            revisions = revisions.filter(
+                between(
+                    func.unix_timestamp(Revision.rev_timestamp) -
+                    func.unix_timestamp(MediawikiUser.user_registration)
+                    ,
+                    (survival_hours * 3600)
+                    ,
+                    ((survival_hours + sunset_in_hours) * 3600)
+                )
+            )
+        
+        revisions = revisions.subquery()
+        revs = session.query(
+            MediawikiUser.user_id,
+            MediawikiUser.user_registration,
+            label(
+                'rev_count',
+                func.coalesce(revisions.c.rev_count, 0)
+            )
+        ) \
+            .outerjoin(revisions, MediawikiUser.user_id == 
revisions.c.user_id) \
+            .filter(MediawikiUser.user_id.in_(user_ids)) \
+            .subquery()
+        
+        metric = session.query(
+            revs.c.user_id,
+            func.unix_timestamp(func.now()),
+            func.IF(
+                func.unix_timestamp(func.now()) <
+                func.unix_timestamp(revs.c.user_registration) +
+                (survival_hours + sunset_in_hours) * 3600,
+                1, 0
+            ),
+            revs.c.rev_count,
+            label('survived', func.IF(revs.c.rev_count >= number_of_edits, 1, 
0)),
+            label(CENSORED, func.IF(
+                revs.c.rev_count >= number_of_edits,
+                0,
+                func.IF(
+                    func.unix_timestamp(func.now()) <
+                    func.unix_timestamp(revs.c.user_registration) +
+                    (survival_hours + sunset_in_hours) * 3600,
+                    1, 0
+                )
+            ))
+        )
+        
+        data = metric.all()
+        
+        metric_results = {
+            u.user_id: {
+                self.id: u.survived,
+                CENSORED: u.censored,
+            }
+            for u in data
+        }
+
+        r = {
+            uid: metric_results.get(uid, {
+                self.id: None,
+                CENSORED: None,
+            })
+            for uid in user_ids
+        }
+
+        #self.debug_print(r, session, user_ids)
+        return r

-- 
To view, visit https://gerrit.wikimedia.org/r/87619
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I90e9efba482d404a89a4c090113f0882dfdd2067
Gerrit-PatchSet: 5
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Stefan.petrea <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to