Mforns has submitted this change and it was merged.

Change subject: Add pages edited metric
......................................................................


Add pages edited metric

Pages edited metric counts how many pages were edited
by each cohort user. Note that if a user edits the same
page more than once, this would still count as just one
page edited.

When summing up the pages edited for the whole cohort,
the individual values for each user are summed, without
considering that two different users may have edited
the same page. So the total pages edited value for the
cohort does not conform to the metric definition.

Bug: 73072
Change-Id: I9e7322ffa6918af25724aaa8e5128f9e89b3579e
---
M tests/fixtures.py
A tests/test_metrics/test_pages_edited.py
M wikimetrics/metrics/__init__.py
A wikimetrics/metrics/pages_edited.py
M wikimetrics/models/mediawiki/archive.py
5 files changed, 234 insertions(+), 1 deletion(-)

Approvals:
  Mforns: Verified; Looks good to me, approved
  jenkins-bot: Verified



diff --git a/tests/fixtures.py b/tests/fixtures.py
index 027e495..2a39c71 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -599,7 +599,7 @@
     def archive_revisions(self):
         """
         Archive all the revisions in the revision table
-        NOTE: only populates ar_namespace, ar_timestamp, and ar_user
+        NOTE: only populates ar_page_id, ar_namespace, ar_timestamp, and ar_user
         NOTE: leaves ar_rev_id NULL because that's valid and a good edge case
         NOTE: creates duplicates with NULL ar_rev_id
         """
@@ -608,6 +608,7 @@
             Revision.rev_user,
             Revision.rev_parent_id,
             Page.page_namespace,
+            Page.page_id,
         ).join(Page)
         revisions = query.all()
 
@@ -619,6 +620,7 @@
                     'ar_user': r.rev_user,
                     'ar_parent_id': r.rev_parent_id,
                     'ar_namespace': r.page_namespace,
+                    'ar_page_id': r.page_id,
                 }
                 for r in revisions
             ]
diff --git a/tests/test_metrics/test_pages_edited.py b/tests/test_metrics/test_pages_edited.py
new file mode 100644
index 0000000..573130e
--- /dev/null
+++ b/tests/test_metrics/test_pages_edited.py
@@ -0,0 +1,71 @@
+from nose.tools import assert_true, assert_equal
+from tests.fixtures import DatabaseTest
+
+from wikimetrics.metrics import PagesEdited
+
+
+class PagesEditedDatabaseTest(DatabaseTest):
+    def setUp(self):
+        DatabaseTest.setUp(self)
+        self.common_cohort_4()
+
+    def test_filters_out_other_editors(self):
+        self.common_cohort_1(cohort=False)
+        metric = PagesEdited(
+            start_date='2012-12-31 22:59:59',
+            end_date='2014-01-01 00:00:00',
+        )
+        results = metric(self.editor_ids, self.mwSession)
+
+        assert_equal(len(results), 2)
+        assert_equal(results[self.editors[0].user_id][PagesEdited.id], 4)
+        assert_equal(results[self.editors[1].user_id][PagesEdited.id], 2)
+
+    def test_filters_out_other_editors_deduplicate(self):
+        self.common_cohort_1(cohort=False)
+        metric = PagesEdited(
+            deduplicate_across_users=True,
+            start_date='2012-12-31 22:59:59',
+            end_date='2014-01-01 00:00:00',
+        )
+        results = metric(self.editor_ids, self.mwSession)
+
+        assert_equal(len(results), 1)
+        assert_equal(results[-1][PagesEdited.id], 5)
+
+    def test_runs_for_an_entire_wiki(self):
+        self.common_cohort_1(cohort=False)
+        metric = PagesEdited(
+            start_date='2012-12-31 22:59:59',
+            end_date='2014-01-01 00:00:00',
+        )
+        results = metric(None, self.mwSession)
+
+        assert_equal(len(results), 6)
+        assert_equal(results[self.editors[0].user_id][PagesEdited.id], 4)
+        assert_equal(results[self.editors[1].user_id][PagesEdited.id], 2)
+        # NOTE: this is a bit precarious as it assumes the order of test data inserts
+        assert_equal(results[self.editors[0].user_id + 4][PagesEdited.id], 1)
+
+    def test_reports_zero_edits(self):
+        metric = PagesEdited(
+            namespaces=[0],
+            start_date='2011-01-01 00:00:00',
+            end_date='2011-02-01 00:00:00',
+        )
+        results = metric(self.editor_ids, self.mwSession)
+
+        assert_true(results is not None)
+        assert_equal(results[self.editors[0].user_id][PagesEdited.id], 0)
+
+    def test_filters_out_other_editors_with_archive(self):
+        self.archive_revisions()
+        self.test_filters_out_other_editors()
+
+    def test_runs_for_an_entire_wiki_with_archive(self):
+        self.archive_revisions()
+        self.test_runs_for_an_entire_wiki()
+
+    def test_reports_zero_edits_with_archive(self):
+        self.archive_revisions()
+        self.test_reports_zero_edits()
diff --git a/wikimetrics/metrics/__init__.py b/wikimetrics/metrics/__init__.py
index 7138dfc..024de0e 100644
--- a/wikimetrics/metrics/__init__.py
+++ b/wikimetrics/metrics/__init__.py
@@ -5,6 +5,7 @@
 from revert_rate import *
 from bytes_added import *
 from pages_created import *
+from pages_edited import *
 from threshold import *
 from survival import *
 from newly_registered import *
diff --git a/wikimetrics/metrics/pages_edited.py b/wikimetrics/metrics/pages_edited.py
new file mode 100644
index 0000000..b288bef
--- /dev/null
+++ b/wikimetrics/metrics/pages_edited.py
@@ -0,0 +1,158 @@
+from sqlalchemy import func
+from sqlalchemy.sql.expression import label, literal_column
+
+from wikimetrics.utils import thirty_days_ago, today
+from wikimetrics.models import Page, Revision, Archive
+from wikimetrics.forms.fields import (
+    CommaSeparatedIntegerListField,
+    BetterBooleanField, BetterDateTimeField
+)
+from metric import Metric
+
+ROLLUP_USER_ID = -1
+
+
+class PagesEdited(Metric):
+    """
+    This class counts the pages edited by editors over a period of time.
+    This sql query was used as a starting point for the sqlalchemy query:
+
+    SELECT user_id, COUNT(*)
+    FROM (
+        SELECT DISTINCT user_id, page_id
+        FROM (
+            SELECT r.rev_user AS user_id, r.rev_page AS page_id
+            FROM revision r
+                INNER JOIN page p ON p.page_id = r.rev_page
+            WHERE r.rev_timestamp BETWEEN [start] AND [end]
+            AND r.rev_user in ([parameterized])
+            AND p.page_namespace in ([parameterized])
+
+            UNION ALL
+
+            SELECT a.ar_user AS user_id, a.ar_page_id AS page_id
+            FROM archive a
+            WHERE a.ar_timestamp BETWEEN [start] AND [end]
+            AND a.ar_user in ([parameterized])
+            AND a.ar_namespace in ([parameterized])
+        ) AS subq1
+    ) AS subq2 GROUP BY user_id;
+    """
+
+    # NOTE: this is not enabled in the UI yet, but could be easily
+    # The deduplicate parameter's a bit confusing, maybe consider
+    # re-wording that when making this metric available
+    show_in_ui  = False
+    id          = 'pages_edited'
+    label       = 'Pages Edited'
+    category    = 'Content'
+    description = (
+        'Compute the number of unique pages edited by the'
+        'cohort\'s users within a time interval'
+    )
+    default_result = {'pages_edited': 0}
+
+    start_date  = BetterDateTimeField(default=thirty_days_ago)
+    end_date    = BetterDateTimeField(default=today)
+
+    include_deleted = BetterBooleanField(
+        default=True,
+        description='Count pages that have been deleted',
+    )
+    namespaces = CommaSeparatedIntegerListField(
+        None,
+        description='0, 2, 4, etc. (leave blank for *all*)',
+    )
+    deduplicate_across_users = BetterBooleanField(
+        default=False,
+        description='Count unique pages edited by the entire cohort,'
+                    ' rolled up to one number.',
+    )
+
+    def __call__(self, user_ids, session):
+        """
+        Parameters:
+            user_ids    : list of mediawiki user ids to find pages for
+            session     : sqlalchemy session open on a mediawiki database
+
+        Returns:
+            dictionary from user ids to the number of pages edited found
+        """
+        start_date = self.start_date.data
+        end_date = self.end_date.data
+        deduplicate = self.deduplicate_across_users.data
+
+        revisions = (
+            session.query(
+                label('user_id', Revision.rev_user),
+                label('page_id', Revision.rev_page),
+                label('timestamp', Revision.rev_timestamp)
+            )
+            .filter(Revision.rev_timestamp > start_date)
+            .filter(Revision.rev_timestamp <= end_date))
+
+        archives = (
+            session.query(
+                label('user_id', Archive.ar_user),
+                label('page_id', Archive.ar_page_id),
+                label('timestamp', Archive.ar_timestamp)
+            )
+            .filter(Archive.ar_timestamp > start_date)
+            .filter(Archive.ar_timestamp <= end_date))
+
+        if self.namespaces.data and len(self.namespaces.data) > 0:
+            revisions = (
+                revisions
+                .join(Page)
+                .filter(Page.page_namespace.in_(self.namespaces.data))
+            )
+            archives = (
+                archives
+                .filter(Archive.ar_namespace.in_(self.namespaces.data))
+            )
+
+        revisions = self.filter(revisions, user_ids, column=Revision.rev_user)
+        archives = self.filter(archives, user_ids, column=Archive.ar_user)
+
+        both = revisions
+        if self.include_deleted.data:
+            both = both.union_all(archives)
+        both = both.subquery()
+
+        if deduplicate:
+            # Use a constant user id here to deduplicate only by page
+            # A single result will be returned and assigned to user_id = ROLLUP_USER_ID
+            both_grouped = (
+                session.query(
+                    label('user_id', literal_column(str(ROLLUP_USER_ID))), both.c.page_id
+                )
+                .distinct().subquery()
+            )
+        else:
+            # Select distinct user_id-page_id pairs
+            # to count edits by the same user on the same page as one
+            both_grouped = (
+                session.query(both.c.user_id, both.c.page_id)
+                .distinct().subquery()
+            )
+
+        query = (
+            session.query(both_grouped.c.user_id, func.count())
+            .group_by(both_grouped.c.user_id)
+        )
+
+        # Format the output
+        metric_results = {r[0]: {PagesEdited.id : r[1]} for r in query.all()}
+        if user_ids is None:
+            return metric_results
+        elif deduplicate:
+            ret = {}
+            ret[ROLLUP_USER_ID] = metric_results.get(
+                ROLLUP_USER_ID, self.default_result
+            )
+            return ret
+        else:
+            return {
+                uid: metric_results.get(uid, self.default_result)
+                for uid in user_ids
+            }
diff --git a/wikimetrics/models/mediawiki/archive.py b/wikimetrics/models/mediawiki/archive.py
index a8c1fd7..8c122c8 100644
--- a/wikimetrics/models/mediawiki/archive.py
+++ b/wikimetrics/models/mediawiki/archive.py
@@ -43,4 +43,5 @@
     ar_flags      = Column(TINYBLOB, nullable=False, default='')
     ar_rev_id     = Column(Integer, nullable=True)
     ar_deleted    = Column(Boolean, nullable=False, default=False)
+    ar_page_id    = Column(Integer, nullable=True)
     ar_parent_id  = Column(Integer, nullable=True)

-- 
To view, visit https://gerrit.wikimedia.org/r/174773
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I9e7322ffa6918af25724aaa8e5128f9e89b3579e
Gerrit-PatchSet: 6
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Mforns <[email protected]>
Gerrit-Reviewer: Madhuvishy <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to