http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95019
Revision: 95019
Author: giovanni
Date: 2011-08-19 17:53:49 +0000 (Fri, 19 Aug 2011)
Log Message:
-----------
added registration_lags scripts
Added Paths:
-----------
trunk/tools/wsor/registration_lags/
trunk/tools/wsor/registration_lags/evolplot
trunk/tools/wsor/registration_lags/fitmixture
trunk/tools/wsor/registration_lags/graphics.py
trunk/tools/wsor/registration_lags/mixhist
Added: trunk/tools/wsor/registration_lags/evolplot
===================================================================
--- trunk/tools/wsor/registration_lags/evolplot (rev 0)
+++ trunk/tools/wsor/registration_lags/evolplot 2011-08-19 17:53:49 UTC (rev 95019)
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+# vim:ft=python
+
+import os
+from pylab import *
+from datetime import datetime
+from argparse import ArgumentParser
+
+parser = ArgumentParser()
+parser.add_argument('data_file', metavar='data')
+
+ns = parser.parse_args()
+
+ts = loadtxt(ns.data_file, dtype=dtype('S7,f,f,f,f,f,f'))
+
+cf = lambda k : datetime.strptime(k, '%Y-%m')
+x = map(cf, ts['f0'])
+
+plot(x, ts['f1'], 'o w')
+plot(x, ts['f2'], 'd k')
+axis('auto')
+xlabel('time')
+ylabel('log-days')
+title('GMM means, all NS (revision + archive)')
+draw()
+
+pdf_file = os.path.splitext(ns.data_file)[0] + '.pdf'
+savefig(pdf_file)
+print 'output saved to %s.' % pdf_file
+
+show()
Property changes on: trunk/tools/wsor/registration_lags/evolplot
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/wsor/registration_lags/fitmixture
===================================================================
--- trunk/tools/wsor/registration_lags/fitmixture (rev 0)
+++ trunk/tools/wsor/registration_lags/fitmixture 2011-08-19 17:53:49 UTC (rev 95019)
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+# vim:ft=python
+
+''' fits lags data to a gaussian mixture model '''
+
+import os
+from argparse import ArgumentParser
+import numpy as np
+from scikits.learn.mixture import GMM
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_file', metavar='data', help='NumPy array file')
+parser.add_argument('components', type=int)
+
+ns = parser.parse_args()
+
+data = np.load(ns.data_file)
+w = - np.diff(data, axis=1)
+w = np.log(w[w > 0] / 86400.)
+
+key = os.path.splitext(os.path.basename(ns.data_file))[0]
+out = [key]
+
+if len(w) > ns.components:
+ gmm = GMM(ns.components)
+ gmm.fit(w[:, None])
+
+ means = np.ravel(gmm.means)
+ covars = np.ravel(gmm.covars)
+ weights = np.ravel(gmm.weights)
+
+ idx = means.argsort()
+ out.extend(means[idx])
+ out.extend(covars[idx])
+ out.extend(weights[idx])
+else:
+ out.extend([np.nan] * 3 * ns.components)
+
+print '\t'.join(map(str, out))
Property changes on: trunk/tools/wsor/registration_lags/fitmixture
___________________________________________________________________
Added: svn:executable
+ *
Added: trunk/tools/wsor/registration_lags/graphics.py
===================================================================
--- trunk/tools/wsor/registration_lags/graphics.py (rev 0)
+++ trunk/tools/wsor/registration_lags/graphics.py 2011-08-19 17:53:49 UTC (rev 95019)
@@ -0,0 +1,106 @@
+import numpy as np
+import matplotlib.pyplot as pp
+from matplotlib import cm
+from scipy.stats import gaussian_kde
+from matplotlib.collections import LineCollection
+
+def stackedarea(x, components, weights, cmap=cm.YlGnBu, **kwargs):
+ '''
+ Produces a stacked area plot from given components and weights.
+
+ Parameters
+ ----------
+ x - ordinates
+ components - a sequence of objects with a `pdf' method
+ weights - a sequence of components' weights
+
+ Default color map is Yellow-Green-Blue. Additional keyword arguments are
+ passed matplotlib.pyplot.fill_between. Returns a list of PolyCollections
+ (one for each component).
+ '''
+ assert np.allclose(np.sum(weights), 1) and np.all(weights), 'illegal
weights'
+ p = [ w * comp.pdf(x) for comp, w in zip(components, weights) ]
+ p = [ np.zeros(len(x)) ] + p
+ p = np.cumsum(p, axis=0)
+ N = len(p)
+ colors = cmap(np.linspace(0, 1, N) * (1 - 1.0 / N))
+ ret = []
+ for i in xrange(1, N):
+ kwargs['color'] = colors[i-1]
+ r = pp.fill_between(x, p[i-1], p[i], **kwargs)
+ ret.append(r)
+ pp.draw()
+ return ret
+
+def mixturehist(data, components, weights, bins=10, num=1000, cmap=cm.YlGnBu,
**kwargs):
+ '''
+ Plots a histogram of given data with a stacked densities of given
+ components
+
+ Parameters
+ ----------
+ data - data array
+ components - a sequence of random variable objects (see scipy.stats)
+ weights - a sequence of components' weights
+ bins - number of histogram bins
+ num - number of points at which stacked densities are evaluated
+ cmap - stacked area plot color map
+
+ Additional keyword arguments are passed to both matplotlib.pyplot.hist and
+ stackedarea.
+ '''
+ histkw = dict(kwargs)
+ # settings for producing transparent histograms
+ histkw.update(normed=True, fc=(0,0,0,0), ec='k')
+ pp.hist(data, bins=bins, **histkw)
+ xmin, xmax = pp.xlim()
+ xi = np.linspace(xmin, xmax, num)
+ stackedarea(xi, components, weights, cmap, **kwargs)
+
+def kdeplot(data, xmin=None, xmax=None, num=50, vmin=None, vmax=None, lc='k',
**kwargs):
+ '''
+ Plots density of data, estimated via Gaussian Kernels, together with
+ vertical lines for each data point.
+
+ Parameters
+ ----------
+ data - data sample
+ xmin, xmax - range of density line plot
+ num - number of points at which kde will be evaluated
+ vmin, vmax - vertical lines will span from vmin to vmax
+ lc - vertical lines color
+
+ Returns
+ -------
+ l - density line object
+ linecoll - collection of vertical lines
+ kde - scipy.stats.kde.gaussian_kde
+
+ Additional keyword arguments are passed to plot the density line
+ '''
+ data = np.ravel(data)
+ x0, x1 = data.min(), data.max()
+ xmin = xmin or x0
+ xmax = xmax or x1
+ x = np.linspace(xmin, xmax, num)
+ kde = gaussian_kde(data)
+ d = kde.evaluate(x)
+ if 'axes' in kwargs:
+ ax = kwargs['axes']
+ elif 'figure' in kwargs:
+ ax = kwargs['figure'].axes[-1]
+ elif kwargs.pop('hold', False):
+ ax = pp.gca()
+ else:
+ fig = pp.figure()
+ ax = fig.add_subplot(111)
+ l = ax.plot(x,d, **kwargs)
+ y0, y1 = ax.get_ylim()
+ vmax = vmax or -(y1 - y0) / 30.
+ vmin = vmin or -(y1 - y0) / 10.
+ linesiter = ( [(d, vmax), (d, vmin)] for d in data )
+ linecoll = LineCollection(linesiter, color=lc, alpha=.1)
+ ax.add_collection(linecoll)
+ ax.set_ylim(y0 - (y1 - vmin)/9., y1)
+ pp.draw()
+ return l, linecoll, kde
Added: trunk/tools/wsor/registration_lags/mixhist
===================================================================
--- trunk/tools/wsor/registration_lags/mixhist (rev 0)
+++ trunk/tools/wsor/registration_lags/mixhist 2011-08-19 17:53:49 UTC (rev 95019)
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+# vim:ft=python
+
+''' plot histogram of waiting time data in log-space '''
+
+from argparse import ArgumentParser
+from pylab import *
+import os
+from scikits.learn.mixture import GMM
+from scipy.stats import norm
+
+from graphics import mixturehist
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_file', metavar='data', help='NumPy array file')
+parser.add_argument('components', type=int)
+parser.add_argument('-subtitle', metavar='TEXT')
+
+times = array([1/86400., 60/86400., 3600/86400., 1, 7, 30, 365])
+logtimes = log(times)
+timestexts = ['1 sec', '1 min', '1 hr', '1 d', '1 week', '1 month', '1 year']
+
+if __name__ == '__main__':
+ ns = parser.parse_args()
+
+ data = np.load(ns.data_file)
+ w = - diff(data, axis=1)
+ w = np.log(w[w > 0] / 86400.)
+
+ print 'fitting GMM with {} components'.format(ns.components)
+ gmm = GMM(ns.components)
+ gmm.fit(w[:, None])
+ means = np.ravel(gmm.means)
+ covars = np.ravel(gmm.covars)
+ weights = np.ravel(gmm.weights)
+
+ print
+ for i, (m, v, p) in enumerate(zip(means, covars, weights)):
+ print 'component {}'.format(i + 1)
+ print '-------------'
+ mu = np.exp(m + v / 2)
+ med = np.exp(m)
+ var = (np.exp(v) - 1) * np.exp(2 * m + v)
+ print 'mean: %8.4g' % mu
+ print 'median: %8.4g' % med
+ print 'std. dev.: %8.4g' % np.sqrt(var)
+ print 'weight: %8.4g' % p
+ print
+
+ comps = [ norm(m, np.sqrt(v)) for m, v in zip(means, covars) ]
+ mixturehist(w, comps, weights, 50, 1000)
+
+ ym, yM = ylim()
+
+ for t, l in zip(logtimes, timestexts):
+ axvline(t, color='k', ls=':')
+ text(t+.1, yM - 0.05 * (yM-ym), l, fontsize='small', color='k',
rotation=-30)
+
+ xlabel('log(days)')
+ ylabel('density')
+
+ _title = 'Time between registration and first edit (N = {})'.format(len(w))
+ if ns.subtitle:
+ _title += '\n' + ns.subtitle
+ title(_title)
+
+ draw()
+
+ fn = os.path.splitext(ns.data_file)[0] + '.pdf'
+ savefig(fn, format='pdf')
+ print 'output saved into {}'.format(fn)
+ show()
Property changes on: trunk/tools/wsor/registration_lags/mixhist
___________________________________________________________________
Added: svn:executable
+ *
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs