https://www.mediawiki.org/wiki/Special:Code/MediaWiki/114804

Revision: 114804
Author:   giovanni
Date:     2012-04-09 18:58:33 +0000 (Mon, 09 Apr 2012)
Log Message:
-----------
first refactoring step to lifecycle.rates

Modified Paths:
--------------
    trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
    trunk/tools/wsor/editor_lifecycle/scripts/fetchrates

Modified: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
===================================================================
--- trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py        2012-04-09 11:52:46 UTC (rev 114803)
+++ trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py        2012-04-09 18:58:33 UTC (rev 114804)
@@ -1,200 +1,117 @@
 #!/usr/bin/python
 #:vim:ts=python:
 
-''' functions for computing average activity rate of a single cohort '''
+''' activity rate estimation '''
 
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
 
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-import os
-import sys
-
 import numpy as np
-import datetime as dt
-
-from argparse import ArgumentParser, FileType
 from scipy.sparse import coo_matrix
-from scipy.stats import gmean
 from collections import deque
 
-__prog__ = os.path.basename(os.path.abspath(__file__))
+MAX_NS = 109 # taken from WP:Namespaces
+_user_dtype = np.dtype([('day', int), ('ns', int), ('edits', int)])
 
-namespaces = {
-        'main': 0, 
-        'talk': 1, 
-        'user': 2, 
-        'user_talk' : 3,
-        'wikipedia' : 4, 
-        'wikipedia_talk' : 5, 
-        'file' : 6, 
-        'file_talk' : 7, 
-        'mediawiki' : 8, 
-        'mediawiki_talk' : 9, 
-        'template' : 10, 
-        'template_talk' : 11, 
-        'help' : 12, 
-        'help_talk' : 13,
-        'category' : 14,
-        'category_talk' : 15, 
-        'portal' : 100, 
-        'portal_talk' : 101,
-        'book' : 108, 
-        'book_talk' : 109
-}
+def userrate(rows):
+    '''
+    User-level activity rate
 
-MAX_NS = np.max(namespaces.values()) # 109 as of August 2011
+    Parameters
+    ----------
+    rows - a sequence of (timestamp, namespace) pairs
 
-def estimaterate(edits, step):
-    edits = np.asfarray(edits)
-    N = len(edits)
-    tmp = []
-    rem = N % step
-    for i in xrange(0, N, step): # step-size chunks
-        if i + step <= N:
-            tmp.append(edits[i:i+step].sum() / step)
-        else:
-            tmp.append(edits[i:].sum() / (N-i))
-    return np.asarray(tmp)
+    Returns
+    -------
+    An array with the following fields: day, ns, edits. Suitable for
+    `scipy.sparse.coo_matrix`
+    '''
+    rev_time, rev_ns = np.asfarray(rows).T
+    m, M = np.floor(rev_time.min()), np.ceil(rev_time.max())
+    uns = sorted(np.unique(rev_ns))
+    bins_time = np.arange(m,M + 1)
+    bins_ns = uns + [uns[-1] + 1]
+    rates, days, _ = np.histogram2d(rev_time, rev_ns, bins=(bins_time, bins_ns))
+    I,J = np.nonzero(rates)
+    data = [ (days[i],uns[j],rates[i,j]) for i, j in zip(I,J) ] 
+    return np.asarray(data, dtype=_user_dtype)
 
-def itercycles(npzarchive, every, users=None, onlyns=None):
+def cohortrate(npzarchive, onlyns=None, minsize=None, minsnr=None):
     '''
-    Iterates over the archive or over given list of users and returns estimated
-    activity life cycle (see estimaterate())
+    Cohort-level activity rate
 
-    For parameters, see computerates
+    Parameters
+    ----------
+    npzarchive - a mapping user id -> ndarray,
+        usually an NpzFile returned by `numpy.load`
+    onlyns - sequence of namespace codes
+        compute activity rates only with edits to these namespaces
+    minsize - positive int
+        filter out activity rate estimates based on samples of size less than
+        minsize
+    minsnr - positive real
+        filter out activity rate estimates with signal-to-noise ratio less than
+        minsnr
+
+    Returns
+    -------
+    an array of daily activity rate observations, together with their estimated uncertainties
     '''
-    for uid in (users or npzarchive.files):
+    day_counts = {}
+
+    for uid in npzarchive.files:
         data = npzarchive[uid].view(np.recarray)
-        idx = data.ns >= 0 # let's filter out junk (i.e. edits to virtual ns)
+
+        # negative namespaces are virtual. Let's filter out the edits to them
+        idx = data.ns >= 0 
         days = data.day[idx]
         edits = data.edits[idx]
         ns = data.ns[idx]
-        days = days - days.min()
+        days -= days.min()
         shape = (days.max() + 1, MAX_NS + 1)
         M = coo_matrix((edits, (days, ns)), shape=shape)
         if onlyns is not None:
-            M = M.tocsc()[:, onlyns]
+            M = M.tocsc()[:, onlyns].tocoo()
         M = M.tocsr()
-        rates = estimaterate(np.asarray(M.sum(axis=1)).ravel(), every)
-        yield np.c_[np.arange(len(rates)) * every, rates]
+        counts = np.asarray(M.sum(axis=1)).ravel()
 
-def average(ratesbyday, geometric=False):
-    '''
-    Computes average cycle with standard errors. Takes in input a dictionary
-    returned by groupbydayssince(). If geometric, compute the geometric mean
-    and standard deviation instead.
-    '''
-    all_days = sorted(ratesbyday.keys())
-    result = deque()
-    for d in all_days:
-        s = ratesbyday[d]
-        N = len(s)
-        if geometric:
-            s = np.ma.masked_equal(s, 0.0)
-            m = gmean(s)
-            sem = np.exp(np.std(np.log(s), ddof=1)) / np.sqrt(N)
-        else:
-            m = np.mean(s)
-            sem = np.std(s, ddof=1)
-        result.append((d, m, sem, N))
-    return np.asarray(result)
-
-def groupbyday(npzarchive, every, users=None, onlyns=None):
-    '''
-    This function estimates editors' activity rates and groups rate estimates by
-    number of days elapsed since editor registration (which corresponds to time = 0)
-
-    For parameters, see computerates
-    '''
-    tmp = {}
-    for cyclearr in itercycles(npzarchive, every, users, onlyns):
-        for d, r in cyclearr:
+        # group by day
+        for i in xrange(len(counts)):
             try:
-                tmp[d].append(r)
+                day_counts[i].append(counts[i])
             except KeyError:
-                tmp[d] = deque([r])
-    return tmp
+                day_counts[i] = deque([counts[i]])
 
-# NOTE: not used right now
-def lifetimes(npzarchive, users=None):
-    '''
-    Returns the distribution of account lifetimes over an archive. Can take an
-    optional list users ids to restrict the sample to a specific group of
-    editors
-    '''
-    lt = deque()
-    for uid in (users or npzarchive.files):
-        days, edits = npzarchive[uid].T
-        lt.append(days.ptp())
-    return np.asarray(lt)
+    # average over each day, filter out unwanted observations
+    max_day = len(day_counts)
+    rate = deque()
 
-# NOTE: not used right now
-def find_inactives(npzarchive, inactivity, minimum_activity, maximum_activity):
-    now = dt.datetime.now().toordinal()
-    epoch = dt.datetime(1970,1,1).toordinal()
-    unix_now = now - epoch
-    inactives = deque()
-    for uid in npzarchive.files:
-        days, edits = npzarchive[uid].T
-        if days.ptp() <= inactivity:
-            continue
-        unix_last = days[-1]
-        if (unix_now - unix_last) > inactivity:
-            tot_edits = float(edits.sum())
-            tot_days = float(days.ptp() - inactivity)
-            activity = tot_edits / tot_days * 365.0
-            if minimum_activity < activity and maximum_activity > activity:
-                inactives.append(uid)
-    return inactives
+    for i in xrange(max_day):
+        try:
+            sample = np.asarray(day_counts[i])
+            n = len(sample)
+            if n < minsize:
+                continue
+            m = np.mean(sample)
+            #  the uncertainty of the estimate is just the standard error
+            #  of the mean
+            s = np.std(sample, ddof=1) / np.sqrt(n) 
+            if (m / s) < minsnr:
+                continue
+            rate.append((i, m, s))
+        except KeyError:
+            pass
+    return np.asarray(rate)
 
-def computerates(fn, every, users=None, onlyns=None, geometric=False):
-    '''
-    Returns an array of average activity vs day since registration with standard
-    errors of the average
-
-    Parameters
-    ----------
-    onlyns    - compute edit counts only over specified list of namespaces
-    users     - compute rates only for these users
-    every     - compute daily rates in strides of length `every'
-    geometric - computes geometric mean of average rate by day since
-                registration
-    '''
-    npzarchive = np.load(fn)
-    rates = average(groupbyday(npzarchive, every, users, onlyns), geometric)
-    return rates
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('data_file', metavar='data')
-parser.add_argument('output_file', metavar='output', type=FileType('w'))
-parser.add_argument('-every', type=int, help='default: average over %(default)d days',
-        default=30, metavar='NUM')
-parser.add_argument('-ns', type=int, action='append', help='select only these NS',
-        dest='only')
-
-def main(ns):
-    rates = computerates(ns.data_file, ns.every, onlyns=ns.only)
-    if ns.output_file.isatty():
-        print >> sys.stderr, 'writing to standard output'
-    np.savetxt(ns.output_file, rates, fmt='%f')
-    if not ns.output_file.isatty():
-        print '%s: output saved to %s' % (__prog__, ns.output_file.name)
-
-if __name__ == '__main__':
-    ns = parser.parse_args()
-    main(ns)
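
For context, a minimal usage sketch of the refactored API, assuming the package is importable as lifecycle.rates (per the modified path above) and that revision times are expressed in fractional day numbers, as the daily binning in userrate() suggests. The revision rows, archive name, namespace codes and thresholds below are illustrative, not taken from this commit:

    # A sketch only; all literal values are hypothetical.
    import numpy as np
    from lifecycle.rates import userrate, cohortrate

    # Per-user step: raw (timestamp, namespace) rows become a sparse
    # (day, ns, edits) record array in coordinate format.
    rows = [(734236.1, 0), (734236.7, 0), (734237.3, 1)]
    per_user = userrate(rows)
    print(per_user)   # -> [(734236, 0, 2) (734237, 1, 1)]

    # Cohort step: an .npz archive mapping user id -> such record arrays is
    # aggregated into (day offset, mean daily edits, standard error) rows,
    # dropping days with fewer than `minsize` observations or a mean/SEM
    # ratio below `minsnr`.
    archive = np.load('cohort_2007.npz')
    rate = cohortrate(archive, onlyns=[0, 1], minsize=10, minsnr=2.0)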

Modified: trunk/tools/wsor/editor_lifecycle/scripts/fetchrates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fetchrates        2012-04-09 11:52:46 UTC (rev 114803)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fetchrates        2012-04-09 18:58:33 UTC (rev 114804)
@@ -2,24 +2,22 @@
 
 ''' Fetches and computes daily edit rates for cohorts of users '''
 
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
 
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
 __author__ = "Giovanni Luca Ciampaglia"
 __email__ = "[email protected]"
 
@@ -57,27 +55,9 @@
 
 __prog__ = os.path.basename(os.path.abspath(__file__))
 
-# def iterateoversets(cursor):
-#     yield list(cursor)
-#     while cursor.nextset():
-#         yield list(cursor)
 
 def process(rows, user_id, output_dir=os.path.curdir):
-    if len(rows) == 0:
-        print >> sys.stderr, '%s: error: empty revision history for user %d' % (__prog__,
-                user_id)
-        return
-
-    rev_time, rev_ns = np.asfarray(rows).T
-    m, M = np.floor(rev_time.min()), np.ceil(rev_time.max())
-    uns = sorted(np.unique(rev_ns))
-    bins_time = np.arange(m,M + 1)
-    bins_ns = uns + [uns[-1] + 1]
-    rates, days, _ = np.histogram2d(rev_time, rev_ns, bins=(bins_time, bins_ns))
-    I,J = np.nonzero(rates)
-    data = [ (days[i],uns[j],rates[i,j]) for i, j in zip(I,J) ] # in coordinate format
-    dtype=np.dtype([('day', int), ('ns', int), ('edits', int)])
-    data = np.asarray(data, dtype=dtype)
+    data = userrate(rows)
     out_path = os.path.join(output_dir, '%d.npy' % user_id)
     np.save(out_path, data)
     return out_path
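
The fetchrates script now delegates the binning to userrate() and keeps writing one <user_id>.npy file per editor. How those files end up in the .npz archive that cohortrate() reads is not shown in this commit; a hedged sketch of one way to bundle them, with hypothetical directory and file names:

    # A sketch only: collect the per-user .npy files written by process()
    # into a single .npz archive keyed by user id, matching the
    # "mapping user id -> ndarray" that cohortrate() expects.
    import glob
    import os

    import numpy as np

    output_dir = 'rates_output'
    per_user = {}
    for path in glob.glob(os.path.join(output_dir, '*.npy')):
        uid = os.path.splitext(os.path.basename(path))[0]   # e.g. '12345'
        per_user[uid] = np.load(path)                        # (day, ns, edits) records

    # np.savez stores one member per keyword argument, so the archive keys
    # are the user ids and np.load(...).files enumerates them.
    np.savez('cohort.npz', **per_user)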


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs
