http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95119

Revision: 95119
Author:   giovanni
Date:     2011-08-21 04:03:20 +0000 (Sun, 21 Aug 2011)
Log Message:
-----------
new comprates script, added -B/--no-errbars to timechart, fixed bug in the way 
daily activity rates are computed in lifecycle.rates.estimaterate, added 
procedure to find best spline smoothing factor by cross-validation

Modified Paths:
--------------
    trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
    trunk/tools/wsor/editor_lifecycle/timechart

Added Paths:
-----------
    trunk/tools/wsor/editor_lifecycle/comprates
    trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py

Added: trunk/tools/wsor/editor_lifecycle/comprates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/comprates                         (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/comprates 2011-08-21 04:03:20 UTC (rev 
95119)
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+
+'''
+computes average activity rates from downloaded user counts archive files
+'''
+
+__author__ = "Giovanni Luca Ciampaglia"
+__email__ = "[email protected]"
+
+import sys
+import os
+import numpy as np
+
+from argparse import ArgumentParser
+
+from lifecycle.rates import computerates
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('input_path_list', metavar='data', nargs='+')
+parser.add_argument('-outdir', dest='output_dir', default=os.curdir)
+parser.add_argument('-every', type=int, help='default: average over 
%(default)d days',
+        default=30, metavar='NUM')
+parser.add_argument('-n', '--namespace', type=int, action='append', 
help='select only these NS',
+        dest='only')
+parser.add_argument('-G', '--geometric', action='store_true', help='compute '
+        'geometric mean and standard deviation of data')
+
+__prog__ = os.path.basename(__file__)
+
+def main(ns):
+
+    for path in ns.input_path_list:
+
+        # define output path
+        output_path = os.path.basename(path)
+        output_path = os.path.splitext(output_path)[0] + '.tsv'
+        output_path = os.path.join(ns.output_dir, output_path)
+
+        # compute rates for this cohort and save them to file
+        rates = computerates(path, ns.every, onlyns=ns.only,
+                geometric=ns.geometric)
+        np.savetxt(output_path, rates, fmt='%d\t%12.8g\t%12.8g\t%d')
+        print '%s: output saved to %s' % (__prog__, output_path)
+
+if __name__ == '__main__':
+    # get arguments from command line
+    ns = parser.parse_args()
+    main(ns)
+


Property changes on: trunk/tools/wsor/editor_lifecycle/comprates
___________________________________________________________________
Added: svn:executable
   + *

Added: trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py
===================================================================
--- trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py                     
        (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py     2011-08-21 
04:03:20 UTC (rev 95119)
@@ -0,0 +1,45 @@
+import numpy as np
+from scipy.interpolate import splrep, splev
+
def spsmooth(x, y, ye, **kwargs):
    '''
    Finds the best spline smoothing factor by leave-one-out cross validation

    Parameters
    ----------
    x, y - data arrays (assumed 1-D numpy arrays of equal length -- TODO
           confirm with callers)
    ye   - standard error of each y value; spline weights are 1/ye, so
           entries are assumed nonzero

    Additional keyword arguments are passed to splrep (e.g. k for the degree)

    Returns the smoothing factor s averaged over all leave-one-out folds.
    '''
    x = np.asarray(x)
    y = np.asarray(y)
    ye = np.asarray(ye)
    N = len(x)

    # candidate smoothing factors: 0.1, 0.2, ..., 9.9
    smax = 10
    ss = smax / 100.0
    slist = list(np.arange(ss, smax, ss))

    best = []

    for i in range(N):

        # hold out the i-th point; train on the rest
        train_idx = np.arange(N) != i
        # was `True - train_idx`: boolean subtraction is rejected by modern
        # numpy -- logical NOT is what was meant
        test_idx = ~train_idx

        train_x = x[train_idx]
        train_y = y[train_idx]
        train_w = ye[train_idx] ** -1

        test_x = x[test_idx]
        test_y = y[test_idx]

        s_best = None
        err_best = np.inf

        for s in slist:
            tck = splrep(train_x, train_y, w=train_w, s=s, **kwargs)
            # absolute prediction error at the single held-out point,
            # reduced to a scalar (the original compared a 1-element
            # array against a float)
            err = float(np.abs(splev(test_x, tck) - test_y).max())

            if err < err_best:
                s_best = s
                err_best = err

        best.append(s_best)

    return np.mean(best)
+        
+           

Modified: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
===================================================================
--- trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py        2011-08-21 
04:03:16 UTC (rev 95118)
+++ trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py        2011-08-21 
04:03:20 UTC (rev 95119)
@@ -7,11 +7,14 @@
 
 import os
 import sys
+
+import numpy as np
+import datetime as dt
+
 from argparse import ArgumentParser, FileType
-import numpy as np
 from scipy.sparse import coo_matrix
+from scipy.stats import gmean
 from collections import deque
-import datetime as dt
 
 __prog__ = os.path.basename(os.path.abspath(__file__))
 
@@ -41,18 +44,16 @@
 MAX_NS = np.max(namespaces.values()) # 109 as of August 2011
 
 def estimaterate(edits, step):
-    '''
-    This function takes the daily edit history of an individual editor, and a
-    step parameter; it estimates the daily activity of the editor. It returns
-    the daily rates every `step' days.
-    '''
+    edits = np.asfarray(edits)
     N = len(edits)
-    if N % step:
-        NN = np.ceil(N / float(step)) * step
-        tmp = np.zeros((NN,), dtype=edits.dtype)
-        tmp[:N] = edits
-        edits = tmp
-    return edits.reshape((-1, step)).sum(axis=-1) / float(step)
+    tmp = []
+    rem = N % step
+    for i in xrange(0, N, step): # step-size chunks
+        if i + step <= N:
+            tmp.append(edits[i:i+step].sum() / step)
+        else:
+            tmp.append(edits[i:].sum() / (N-i))
+    return np.asarray(tmp)
 
 def itercycles(npzarchive, every, users=None, onlyns=None):
     '''
@@ -76,17 +77,25 @@
         rates = estimaterate(np.asarray(M.sum(axis=1)).ravel(), every)
         yield np.c_[np.arange(len(rates)) * every, rates]
 
def average(ratesbyday, geometric=False):
    '''
    Computes average cycle with standard errors. Takes in input a dictionary
    returned by groupbydayssince(). If geometric, compute the geometric mean
    and standard deviation instead.

    Returns an (n, 4) array with rows (day, mean, standard error, sample size).
    '''
    all_days = sorted(ratesbyday.keys())
    result = deque()
    for d in all_days:
        s = ratesbyday[d]
        N = len(s)
        if geometric:
            # zeros would make log(s) diverge; mask them out of the mean
            s = np.ma.masked_equal(s, 0.0)
            m = gmean(s)
            sem = np.exp(np.std(np.log(s), ddof=1)) / np.sqrt(N)
        else:
            m = np.mean(s)
            # standard error of the mean: r95119 dropped the sqrt(N)
            # divisor, contradicting both the docstring and the geometric
            # branch above -- restored here
            sem = np.std(s, ddof=1) / np.sqrt(N)
        # NOTE(review): ddof=1 yields nan when N == 1 -- confirm callers
        # always supply at least two samples per day
        result.append((d, m, sem, N))
    return np.asarray(result)
 
 def groupbyday(npzarchive, every, users=None, onlyns=None):
@@ -137,19 +146,21 @@
                 inactives.append(uid)
     return inactives
 
def computerates(fn, every, users=None, onlyns=None, geometric=False):
    '''
    Returns an array of average activity vs day since registration with standard
    errors of the average

    Parameters
    ----------
    onlyns    - compute edit counts only over specified list of namespaces
    users     - compute rates only for these users
    every     - compute daily rates in strides of length `every'
    geometric - computes geometric mean of average rate by day since
                registration
    '''
    # load the NPZ archive, group rates by day since registration, then
    # average each day's sample
    grouped = groupbyday(np.load(fn), every, users, onlyns)
    return average(grouped, geometric)
 
 parser = ArgumentParser(description=__doc__)

Modified: trunk/tools/wsor/editor_lifecycle/timechart
===================================================================
--- trunk/tools/wsor/editor_lifecycle/timechart 2011-08-21 04:03:16 UTC (rev 
95118)
+++ trunk/tools/wsor/editor_lifecycle/timechart 2011-08-21 04:03:20 UTC (rev 
95119)
@@ -21,6 +21,8 @@
 parser.add_argument('-m', '--minsize', type=int, default=0)
 parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
 parser.add_argument('-T', '--title')
+parser.add_argument('-B', '--no-errbars', action='store_true', help='no error'
+        ' bars')
 
 markers = 'ov^<>sp*+xD'
 colors = 'bgrcmykw'
@@ -40,7 +42,8 @@
 
         # load cohort data and filter out estimates based on samples with size
         # smaller than minimum requested
-        days, rate, rate_err, size = np.loadtxt(path, delimiter='\t', unpack=1)
+        days, rate, rate_err, size = map(np.ravel, np.loadtxt(path,
+            delimiter='\t', unpack=1))
         idx = size >= ns.minsize
         days = days[idx]
         rate = rate[idx]
@@ -50,12 +53,20 @@
                     (__prog__, path)
             continue
  
-        # plot errorbars
-        l, (wu, wd), mc = ax.errorbar(days, rate, rate_err, marker=markers[i % 
M], color=colors[i
-            % C], label=os.path.splitext(path)[0].replace('_',' '),
-            ecolor='none', ls=':', lw=2)
+        # plot errorbars or lines
+        label = os.path.splitext(path)[0].replace('_',' ')
+        color = colors[i % C]
+        marker = markers[i % M]
+        if ns.no_errbars:
+            l, = ax.plot(days, rate, marker=marker, color=color, label=label,
+                    ls=':', lw=2) 
+        else:
+            l, (wu, wd), mc = ax.errorbar(days, rate, rate_err, 
+                    marker=marker, color=color, label=label, ecolor='none', 
+                    ls=':', lw=2)
+            pp.setp(wd, ls='none')
+
         lines.append(l)
-        pp.setp(wd, ls='none')
 
     # decorate figure
     pp.xlabel('days since registration')
@@ -65,6 +76,7 @@
     pp.minorticks_on()
     pp.grid("on")
     pp.axis('tight')
+#    pp.ylim(0,np.ceil(ymax * 1.01)) # fix this!
 
     pp.draw()
     if ns.title is not None:


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to