https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113492
Revision: 113492
Author: giovanni
Date: 2012-03-09 18:29:43 +0000 (Fri, 09 Mar 2012)
Log Message:
-----------
added scripts for analyzing daily cohort edit count data
Added Paths:
-----------
trunk/tools/wsor/editor_lifecycle/scripts/fitcounts
trunk/tools/wsor/editor_lifecycle/scripts/mksamples
Added: trunk/tools/wsor/editor_lifecycle/scripts/fitcounts
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fitcounts
(rev 0)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fitcounts 2012-03-09 18:29:43 UTC
(rev 113492)
@@ -0,0 +1,55 @@
+#!/usr/bin/python
+#:vim:ft=python
+# encoding:utf-8
+
+''' fits daily count samples '''
+
+import os
+from contextlib import closing
+from argparse import ArgumentParser
+import numpy as np
+from scipy.stats import nbinom, geom, poisson, chisquare
+from scipy.optimize import fmin
+import matplotlib.pyplot as pp
+
+parser = ArgumentParser(description=__doc__)  # help text is the module docstring
+parser.add_argument('input_path', metavar='file')  # text file; main() fits each of its lines
+
+models = [ nbinom, ] # poisson, geom ]  <- only nbinom is active; these two are disabled
+initial_args = { 'nbinom' : (5,.5), 'poisson' : (10,), 'geom' : (.5,) }  # fmin starting point, looked up by rv.name
+
+def main(args):
+    ''' Fit each distribution in `models` to every line (day) of the file at
+    args.input_path, stopping at the first line with fewer than 5 counts.
+    Returns two dicts keyed by model name: the fitted parameters (one row
+    per day) and the chi-square p-values (one per day), as numpy arrays. '''
+    # lists created up front: an empty or short file must not raise KeyError
+    model_params = dict((rv.name, []) for rv in models)
+    model_pvalue = dict((rv.name, []) for rv in models)
+    with open(args.input_path) as infile:
+        for i, line in enumerate(infile):
+            sample = np.asarray([int(tok) for tok in line.split()])
+            if len(sample) < 5:
+                print('day %d: skipping rest of file' % i)
+                break
+            f_obs, bins = np.histogram(sample, bins=np.ptp(sample) or 1)
+            for rv in models:
+                nll = lambda k: -rv(*k).logpmf(sample).sum()
+                beta = fmin(nll, initial_args[rv.name], disp=False)
+                # expected counts scale with the number of observations
+                f_exp = rv(*beta).pmf(bins[:-1]) * len(sample)
+                chisq, pval = chisquare(f_obs, f_exp, rv.numargs)
+                model_params[rv.name].append(beta)
+                model_pvalue[rv.name].append(pval)
+            print('day %d: done' % i)
+    return (dict((k, np.asarray(v)) for k, v in model_params.items()),
+            dict((k, np.asarray(v)) for k, v in model_pvalue.items()))
+
+
+# pp.scatter(params.T[0], params.T[1], c='k', marker='.')
+# pp.show()
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ params, pvalues = main(args)  # dicts keyed by model name; not used further in this script
Added: trunk/tools/wsor/editor_lifecycle/scripts/mksamples
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/mksamples
(rev 0)
+++ trunk/tools/wsor/editor_lifecycle/scripts/mksamples 2012-03-09 18:29:43 UTC
(rev 113492)
@@ -0,0 +1,47 @@
+#!/usr/bin/python
+#:vim:ft=python
+# encoding:utf-8
+
+''' groups user counts by day since registration '''
+
+import os
+from argparse import ArgumentParser
+import numpy as np
+from scipy.sparse import coo_matrix
+from collections import deque
+from contextlib import closing
+
+parser = ArgumentParser(description=__doc__)  # help text is the module docstring
+parser.add_argument('input_paths', metavar='file', nargs='+')  # one or more files, each read with np.load
+parser.add_argument('-p', '--prefix', dest='output_prefix', default='',
+ metavar='PREFIX')  # prepended to the basename of every output file
+
+def main(args):
+    ''' For each .npz archive in args.input_paths (one record array per user,
+    with fields day, ns and edits) write a file named PREFIX + basename with
+    the extension replaced by '.tsv'. Line d lists, per user, the edits summed
+    over ns >= 0 on day d after the user's first day (shorter users omitted). '''
+    for path in args.input_paths:
+        output_path = args.output_prefix + os.path.basename(path)
+        output_path = os.path.splitext(output_path)[0] + '.tsv'
+        day_counts = {}
+        with closing(np.load(path)) as archive:  # frees the file handle
+            print('%d users in %s' % (len(archive.files), path))
+            for uid in archive.files:
+                data = archive[uid].view(np.recarray)
+                data = data[data.ns >= 0]
+                if not len(data):
+                    continue  # no rows left: data.day.min() would raise
+                counts = coo_matrix((data.edits, (data.day - data.day.min(),
+                    data.ns))).tocsr().sum(axis=1)
+                for day, n in enumerate(np.asarray(counts).ravel()):
+                    day_counts.setdefault(day, deque()).append(int(n))
+        with open(output_path, 'w') as out_file:
+            # + 1 so the last day is written too; [-1] keeps max() safe when empty
+            for day in range(max(day_counts or [-1]) + 1):
+                out_file.write(' '.join(map(str, day_counts.get(day, []))) + '\n')
+        print('%s saved.' % output_path)
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ main(args)  # writes one .tsv file per input path
_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs