http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95119
Revision: 95119
Author: giovanni
Date: 2011-08-21 04:03:20 +0000 (Sun, 21 Aug 2011)
Log Message:
-----------
new comprates script, added -B/--no-errbars to timechart, fixed bug in the way
daily activity rates are computed in lifecycle.rates.estimaterate, added
procedure to find best spline smoothing factor by cross-validation
Modified Paths:
--------------
trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
trunk/tools/wsor/editor_lifecycle/timechart
Added Paths:
-----------
trunk/tools/wsor/editor_lifecycle/comprates
trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py
Added: trunk/tools/wsor/editor_lifecycle/comprates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/comprates (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/comprates 2011-08-21 04:03:20 UTC (rev 95119)
@@ -0,0 +1,49 @@
#!/usr/bin/python

'''
computes average activity rates from downloaded user counts archive files
'''

__author__ = "Giovanni Luca Ciampaglia"
__email__ = "[email protected]"

import sys
import os
import numpy as np

from argparse import ArgumentParser

from lifecycle.rates import computerates

# command-line interface: one or more input archives, an output directory,
# the averaging stride in days, an optional namespace filter, and a flag for
# geometric (instead of arithmetic) averaging
parser = ArgumentParser(description=__doc__)
parser.add_argument('input_path_list', metavar='data', nargs='+')
parser.add_argument('-outdir', dest='output_dir', default=os.curdir)
parser.add_argument('-every', type=int, metavar='NUM', default=30,
        help='default: average over %(default)d days')
parser.add_argument('-n', '--namespace', type=int, action='append',
        dest='only', help='select only these NS')
parser.add_argument('-G', '--geometric', action='store_true',
        help='compute geometric mean and standard deviation of data')

__prog__ = os.path.basename(__file__)

def main(ns):
    '''
    Computes average activity rates for each input archive and saves each
    result as a tab-separated file in the output directory.

    Parameters
    ----------
    ns - argparse namespace with input_path_list, output_dir, every, only
         and geometric attributes
    '''
    for path in ns.input_path_list:

        # output file: same basename as the input archive, .tsv extension,
        # placed in the requested output directory
        output_path = os.path.basename(path)
        output_path = os.path.splitext(output_path)[0] + '.tsv'
        output_path = os.path.join(ns.output_dir, output_path)

        # compute rates for this cohort and save them to file
        # (columns: day, rate, rate error, sample size)
        rates = computerates(path, ns.every, onlyns=ns.only,
                geometric=ns.geometric)
        np.savetxt(output_path, rates, fmt='%d\t%12.8g\t%12.8g\t%d')
        # single parenthesized argument: works under both Python 2 and 3
        print('%s: output saved to %s' % (__prog__, output_path))

if __name__ == '__main__':
    # get arguments from command line
    ns = parser.parse_args()
    main(ns)
+
Property changes on: trunk/tools/wsor/editor_lifecycle/comprates
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py
===================================================================
--- trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/lifecycle/cvsmooth.py 2011-08-21 04:03:20 UTC (rev 95119)
@@ -0,0 +1,45 @@
+import numpy as np
+from scipy.interpolate import splrep, splev
+
def spsmooth(x, y, ye, **kwargs):
    '''
    Finds the best spline smoothing factor by leave-one-out cross validation

    Parameters
    ----------
    x, y - data point coordinates
    ye   - errors on y; their reciprocals are used as weights for splrep

    Additional keyword arguments are passed to splrep (e.g. k for the degree)

    Returns the smoothing factor averaged over all leave-one-out folds.
    '''

    best = []
    N = len(x)

    # candidate smoothing factors: 99 equally spaced values in (0, 10)
    smax = 10
    ss = smax / 100.0
    slist = list(np.arange(ss, smax, ss))

    for i in range(N):

        # hold out the i-th point, fit on the rest.
        # NOTE: was `True - train_idx`; boolean subtraction is an error in
        # modern NumPy, so invert the mask with `~` instead.
        train_idx = np.arange(N) != i
        test_idx = ~train_idx

        train_x = x[train_idx]
        train_y = y[train_idx]
        train_w = ye[train_idx] ** -1

        test_x = x[test_idx]
        test_y = y[test_idx]

        s_best = None
        err_best = np.inf

        for s in slist:
            tck = splrep(train_x, train_y, w=train_w, s=s, **kwargs)
            # absolute prediction error on the single held-out point
            # (sqrt of squared residual), taken as a scalar
            err = float(np.abs(splev(test_x, tck)[0] - test_y[0]))

            if err < err_best:
                s_best = s
                err_best = err

        best.append(s_best)

    return np.mean(best)
+
+
Modified: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
===================================================================
--- trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py 2011-08-21 04:03:16 UTC (rev 95118)
+++ trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py 2011-08-21 04:03:20 UTC (rev 95119)
@@ -7,11 +7,14 @@
import os
import sys
+
+import numpy as np
+import datetime as dt
+
from argparse import ArgumentParser, FileType
-import numpy as np
from scipy.sparse import coo_matrix
+from scipy.stats import gmean
from collections import deque
-import datetime as dt
__prog__ = os.path.basename(os.path.abspath(__file__))
@@ -41,18 +44,16 @@
MAX_NS = np.max(namespaces.values()) # 109 as of August 2011
def estimaterate(edits, step):
- '''
- This function takes the daily edit history of an individual editor, and a
- step parameter; it estimates the daily activity of the editor. It returns
- the daily rates every `step' days.
- '''
+ edits = np.asfarray(edits)
N = len(edits)
- if N % step:
- NN = np.ceil(N / float(step)) * step
- tmp = np.zeros((NN,), dtype=edits.dtype)
- tmp[:N] = edits
- edits = tmp
- return edits.reshape((-1, step)).sum(axis=-1) / float(step)
+ tmp = []
+ rem = N % step
+ for i in xrange(0, N, step): # step-size chunks
+ if i + step <= N:
+ tmp.append(edits[i:i+step].sum() / step)
+ else:
+ tmp.append(edits[i:].sum() / (N-i))
+ return np.asarray(tmp)
def itercycles(npzarchive, every, users=None, onlyns=None):
'''
@@ -76,17 +77,25 @@
rates = estimaterate(np.asarray(M.sum(axis=1)).ravel(), every)
yield np.c_[np.arange(len(rates)) * every, rates]
-def average(ratesbyday):
+def average(ratesbyday, geometric=False):
'''
Computes average cycle with standard errors. Takes in input a dictionary
- returned by groupbydayssince()
+ returned by groupbydayssince(). If geometric, compute the geometric mean
+ and standard deviation instead.
'''
all_days = sorted(ratesbyday.keys())
result = deque()
for d in all_days:
s = ratesbyday[d]
- sqN = np.sqrt(len(s))
- result.append((d, np.mean(s), np.std(s)/np.sqrt(len(s)), len(s)))
+ N = len(s)
+ if geometric:
+ s = np.ma.masked_equal(s, 0.0)
+ m = gmean(s)
+ sem = np.exp(np.std(np.log(s), ddof=1)) / np.sqrt(N)
+ else:
+ m = np.mean(s)
+ sem = np.std(s, ddof=1)
+ result.append((d, m, sem, N))
return np.asarray(result)
def groupbyday(npzarchive, every, users=None, onlyns=None):
@@ -137,19 +146,21 @@
inactives.append(uid)
return inactives
-def computerates(fn, every, users=None, onlyns=None):
+def computerates(fn, every, users=None, onlyns=None, geometric=False):
'''
Returns an array of average activity vs day since registration with
standard
errors of the average
Parameters
----------
- onlyns - compute edit counts only over specified list of namespaces
- users - compute rates only for these users
- every - compute daily rates in strides of length `every'
+ onlyns - compute edit counts only over specified list of namespaces
+ users - compute rates only for these users
+ every - compute daily rates in strides of length `every'
+ geometric - computes geometric mean of average rate by day since
+ registration
'''
npzarchive = np.load(fn)
- rates = average(groupbyday(npzarchive, every, users, onlyns))
+ rates = average(groupbyday(npzarchive, every, users, onlyns), geometric)
return rates
parser = ArgumentParser(description=__doc__)
Modified: trunk/tools/wsor/editor_lifecycle/timechart
===================================================================
--- trunk/tools/wsor/editor_lifecycle/timechart 2011-08-21 04:03:16 UTC (rev 95118)
+++ trunk/tools/wsor/editor_lifecycle/timechart 2011-08-21 04:03:20 UTC (rev 95119)
@@ -21,6 +21,8 @@
parser.add_argument('-m', '--minsize', type=int, default=0)
parser.add_argument('-o', '--output', dest='output_path', metavar='FILE')
parser.add_argument('-T', '--title')
+parser.add_argument('-B', '--no-errbars', action='store_true', help='no error'
+ ' bars')
markers = 'ov^<>sp*+xD'
colors = 'bgrcmykw'
@@ -40,7 +42,8 @@
# load cohort data and filter out estimates based on samples with size
# smaller than minimum requested
- days, rate, rate_err, size = np.loadtxt(path, delimiter='\t', unpack=1)
+ days, rate, rate_err, size = map(np.ravel, np.loadtxt(path,
+ delimiter='\t', unpack=1))
idx = size >= ns.minsize
days = days[idx]
rate = rate[idx]
@@ -50,12 +53,20 @@
(__prog__, path)
continue
- # plot errorbars
- l, (wu, wd), mc = ax.errorbar(days, rate, rate_err, marker=markers[i %
M], color=colors[i
- % C], label=os.path.splitext(path)[0].replace('_',' '),
- ecolor='none', ls=':', lw=2)
+ # plot errorbars or lines
+ label = os.path.splitext(path)[0].replace('_',' ')
+ color = colors[i % C]
+ marker = markers[i % M]
+ if ns.no_errbars:
+ l, = ax.plot(days, rate, marker=marker, color=color, label=label,
+ ls=':', lw=2)
+ else:
+ l, (wu, wd), mc = ax.errorbar(days, rate, rate_err,
+ marker=marker, color=color, label=label, ecolor='none',
+ ls=':', lw=2)
+ pp.setp(wd, ls='none')
+
lines.append(l)
- pp.setp(wd, ls='none')
# decorate figure
pp.xlabel('days since registration')
@@ -65,6 +76,7 @@
pp.minorticks_on()
pp.grid("on")
pp.axis('tight')
+# pp.ylim(0,np.ceil(ymax * 1.01)) # fix this!
pp.draw()
if ns.title is not None:
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs