Milimetric has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/91309


Change subject: not yet ready to merge, just checkpoint
......................................................................

not yet ready to merge, just checkpoint

Change-Id: Ia52efad15c18e954d93136ffffdfa879804179af
---
M tests/test_metrics/test_threshold.py
M wikimetrics/metrics/survival.py
M wikimetrics/metrics/threshold.py
3 files changed, 92 insertions(+), 54 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/wikimetrics 
refs/changes/09/91309/1

diff --git a/tests/test_metrics/test_threshold.py 
b/tests/test_metrics/test_threshold.py
index 2146114..73edb4d 100644
--- a/tests/test_metrics/test_threshold.py
+++ b/tests/test_metrics/test_threshold.py
@@ -20,7 +20,7 @@
         DatabaseTest.setUp(self)
         self.create_test_cohort(
             editor_count=2,
-            revisions_per_editor=3,
+            revisions_per_editor=2,
             user_registrations=i(reg),
             revision_timestamps=[
                 [i(reg + one_hour)     , i(reg + one_hour * 25)],
@@ -31,7 +31,7 @@
     
     def test_case1_24h_count1(self):
         m = Threshold(
-            namespaces=[self.survivors_namespace],
+            namespaces=[0],
         )
         results = m(list(self.cohort), self.mwSession)
         
@@ -42,7 +42,7 @@
     
     def test_case1_72h_count1(self):
         m = Threshold(
-            namespaces=[self.survivors_namespace],
+            namespaces=[0],
             threshold_hours=3 * 24,
         )
         results = m(list(self.cohort), self.mwSession)
@@ -54,11 +54,12 @@
     
     def test_case1_72h_count3(self):
         m = Threshold(
-            namespaces=[self.survivors_namespace],
+            namespaces=[0],
             threshold_hours=3 * 24,
             number_of_edits=3,
         )
         results = m(list(self.cohort), self.mwSession)
+        print results
         
         assert_equal(results[self.editors[0].user_id][Threshold.id], False)
         assert_equal(results[self.editors[1].user_id][Threshold.id], False)
@@ -67,11 +68,12 @@
     
     def test_case1_24h_count3(self):
         m = Threshold(
-            namespaces=[self.survivors_namespace],
+            namespaces=[0],
             threshold_hours=1 * 24,
             number_of_edits=3,
         )
         results = m(list(self.cohort), self.mwSession)
+        print results
         
         assert_equal(results[self.editors[0].user_id][Threshold.id], False)
         assert_equal(results[self.editors[1].user_id][Threshold.id], False)
diff --git a/wikimetrics/metrics/survival.py b/wikimetrics/metrics/survival.py
index d6602e9..3904c12 100644
--- a/wikimetrics/metrics/survival.py
+++ b/wikimetrics/metrics/survival.py
@@ -88,14 +88,7 @@
         """
         
         survival_hours = int(self.survival_hours.data)
-        
-        # NOTE: this kind of breaks OOP because it handles a child class need
-        # in the parent class.  TODO: fix
-        if hasattr(self, 'sunset_in_hours'):
-            sunset_in_hours = int(self.sunset_in_hours.data)
-        else:
-            sunset_in_hours = 0
-        
+        sunset_in_hours = int(self.sunset_in_hours.data)
         number_of_edits = int(self.number_of_edits.data)
         
         revisions = session \
@@ -157,7 +150,7 @@
                 1, 0
             ),
             revs.c.rev_count,
-            label('metric_result', func.IF(revs.c.rev_count >= 
number_of_edits, 1, 0)),
+            label(Survival.id, func.IF(revs.c.rev_count >= number_of_edits, 1, 
0)),
             label(CENSORED, func.IF(
                 revs.c.rev_count >= number_of_edits,
                 0,
@@ -174,16 +167,16 @@
         
         metric_results = {
             u.user_id: {
-                self.id: u.metric_result,
-                CENSORED: u.censored,
+                Survival.id : u.survived,
+                CENSORED    : u.censored,
             }
             for u in data
         }
         
         r = {
             uid: metric_results.get(uid, {
-                self.id: None,
-                CENSORED: None,
+                Survival.id : None,
+                CENSORED    : None,
             })
             for uid in user_ids
         }
diff --git a/wikimetrics/metrics/threshold.py b/wikimetrics/metrics/threshold.py
index 63cc5d2..03eafcf 100644
--- a/wikimetrics/metrics/threshold.py
+++ b/wikimetrics/metrics/threshold.py
@@ -2,6 +2,7 @@
 import datetime
 import calendar
 from sqlalchemy import func, case, Integer
+from sqlalchemy.orm import aliased
 from sqlalchemy.sql.expression import label, between, and_, or_
 
 from wikimetrics.models import Page, Revision, MediawikiUser
@@ -19,43 +20,14 @@
     Threshold is a metric that determines whether an editor has performed >= n 
edits
     in a specified time window. It is used to measure early user activation.  
It also
     computes the time it took a user to reach the threshold, if they did.
-    
-    The SQL query that inspired this metric was:
-    
- SELECT revs.user_id AS revs_user_id,
-        IF(revs.rev_count >= <number_of_edits>, 1, 0)
-            AS threshold,
-        IF(revs.rev_count >= <number_of_edits>, 0, IF(unix_timestamp(now())
-            < unix_timestamp(revs.user_registration) + <threshold_hours>, 1, 
0))
-            AS censored
-
-   FROM (SELECT user.user_id AS user_id,
-                user.user_registration AS user_registration,
-                coalesce(rev_counts.rev_count, 0) AS rev_count
-           FROM user
-                        LEFT OUTER JOIN
-                (SELECT user.user_id AS user_id,
-                        count(*) as rev_count
-                   FROM user
-                                INNER JOIN
-                        revision    ON user.user_id = revision.rev_user
-                                INNER JOIN
-                        page        ON page.page_id = revision.rev_page
-                  WHERE user.user_id IN (<cohort>)
-                    AND page.page_namespace IN (<namespaces>)
-                    AND unix_timestamp(revision.rev_timestamp) -
-                        unix_timestamp(user.user_registration)
-                            <= <threshold_hours>
-                  GROUP BY user.user_id
-                ) AS rev_counts     ON user.user_id = rev_count.user_id
-          WHERE user.user_id IN (<cohort>)
-        ) AS revs
-
-    And the Time to Threshold sub-metric is also computed by this metric.  
This is the number
+    Time to Threshold is also computed by this metric.  This is the number
     of hours that it took an editor to reach exactly n edits.  If the editor 
did not
-    reach the threshold, None is returned.  The inspiration SQL for this is:
+    reach the threshold, None is returned.
+    
+    The SQL that inspired this metric was:
     
  SELECT user_id,
+        IF(rev_timestamp is not null, 1, 0) as threshold,
         IF(rev_timestamp is not null,
             (unix_timestamp(rev_timestamp) - 
unix_timestamp(user_registration)) / 3600,
             null
@@ -64,7 +36,7 @@
                 r1.rev_timestamp,
                 COUNT(*) AS number
            FROM revision r1
-                  JOIN
+                    INNER JOIN
                 revision r2  ON r1.rev_user = r2.rev_user
                              AND r1.rev_timestamp >= r2.rev_timestamp
           WHERE user.user_id IN (<cohort>)
@@ -114,4 +86,75 @@
                 'censored': 1 for True, 0 for False
             }
         """
-        return 'Not Implemented'
+        
+        threshold_hours = int(self.threshold_hours.data)
+        threshold_secs  = threshold_hours * 3600
+        number_of_edits = int(self.number_of_edits.data)
+        
+        Revision2 = aliased(Revision, name='r2')
+        ordered_revisions = session \
+            .query(
+                Revision.rev_user,
+                Revision.rev_timestamp,
+                label('number', func.count()),
+            ) \
+            .join(MediawikiUser) \
+            .join(Page) \
+            .join(
+                Revision2,
+                Revision.rev_user == Revision2.rev_user and
+                Revision.rev_timestamp >= Revision2.rev_timestamp
+            ) \
+            .group_by(Revision.rev_user) \
+            .group_by(Revision.rev_timestamp) \
+            .filter(Revision.rev_user.in_(user_ids)) \
+            .filter(Page.page_namespace.in_(self.namespaces.data)) \
+            .filter(
+                func.unix_timestamp(Revision.rev_timestamp) -
+                func.unix_timestamp(MediawikiUser.user_registration) <= 
threshold_secs
+            )
+        
+        o_r = ordered_revisions.subquery()
+        
+        metric = session.query(
+            MediawikiUser.user_id,
+            label(
+                Threshold.id,
+                func.IF(o_r.c.rev_timestamp != None, 1, 0)
+            ),
+            label(
+                'time_to_threshold',
+                func.IF(
+                    o_r.c.rev_timestamp != None,
+                    (func.unix_timestamp(o_r.c.rev_timestamp) -
+                        func.unix_timestamp(MediawikiUser.user_registration)) 
/ 3600,
+                    0
+                )
+            ),
+            label(CENSORED, func.IF(
+                o_r.c.rev_timestamp != None,
+                0,
+                func.IF(
+                    func.unix_timestamp(MediawikiUser.user_registration) + 
threshold_secs
+                    >
+                    func.unix_timestamp(func.now()),
+                    1,
+                    0
+                )
+            ))
+        ) \
+            .outerjoin(
+                o_r,
+                MediawikiUser.user_id == o_r.c.rev_user and
+                o_r.c.number == number_of_edits
+            ) \
+            .filter(MediawikiUser.user_id.in_(user_ids))
+        
+        return {
+            u.user_id: {
+                Threshold.id        : u.threshold,
+                'time_to_threshold' : u.time_to_threshold,
+                CENSORED            : u.censored,
+            }
+            for u in metric.all()
+        }

-- 
To view, visit https://gerrit.wikimedia.org/r/91309
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia52efad15c18e954d93136ffffdfa879804179af
Gerrit-PatchSet: 1
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to