wsor

giovanni Fri, 19 Aug 2011 10:53:57 -0700

http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95019


Revision: 95019
Author:   giovanni
Date:     2011-08-19 17:53:49 +0000 (Fri, 19 Aug 2011)
Log Message:
-----------
added registration_lags scripts

Added Paths:
-----------
    trunk/tools/wsor/registration_lags/
    trunk/tools/wsor/registration_lags/evolplot
    trunk/tools/wsor/registration_lags/fitmixture
    trunk/tools/wsor/registration_lags/graphics.py
    trunk/tools/wsor/registration_lags/mixhist

Added: trunk/tools/wsor/registration_lags/evolplot
===================================================================
--- trunk/tools/wsor/registration_lags/evolplot                         (rev 0)
+++ trunk/tools/wsor/registration_lags/evolplot 2011-08-19 17:53:49 UTC (rev 95019)
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+# :vim:ft=python
+
+import os
+from pylab import *
+from datetime import datetime
+from argparse import ArgumentParser
+
+parser = ArgumentParser()
+parser.add_argument('data_file', metavar='data')
+
+ns = parser.parse_args()
+
+ts = loadtxt(ns.data_file, dtype=dtype('S7,f,f,f,f,f,f'))
+
+cf = lambda k : datetime.strptime(k, '%Y-%m')
+x = map(cf, ts['f0'])
+
+plot(x, ts['f1'], 'o w')
+plot(x, ts['f2'], 'd k')
+axis('auto')
+xlabel('time')
+ylabel('log-days')
+title('GMM means, all NS (revision + archive)')
+draw()
+
+pdf_file = os.path.splitext(ns.data_file)[0] + '.pdf'
+savefig(pdf_file)
+print 'output saved to %s.' % pdf_file
+
+show()


Property changes on: trunk/tools/wsor/registration_lags/evolplot
___________________________________________________________________
Added: svn:executable
   + *

Added: trunk/tools/wsor/registration_lags/fitmixture
===================================================================
--- trunk/tools/wsor/registration_lags/fitmixture                       (rev 0)
+++ trunk/tools/wsor/registration_lags/fitmixture       2011-08-19 17:53:49 UTC (rev 95019)
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+# vim:ft=python
+
+''' fits lags data to a gaussian mixture model '''
+
+import os
+from argparse import ArgumentParser
+import numpy as np
+from scikits.learn.mixture import GMM
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_file', metavar='data', help='NumPy array file')
+parser.add_argument('components', type=int)
+
+ns = parser.parse_args()
+
+data = np.load(ns.data_file)
+w = - np.diff(data, axis=1)
+w = np.log(w[w > 0] / 86400.)
+
+key = os.path.splitext(os.path.basename(ns.data_file))[0]
+out = [key]
+
+if len(w) > ns.components:
+    gmm = GMM(ns.components)
+    gmm.fit(w[:, None])
+
+    means = np.ravel(gmm.means)
+    covars = np.ravel(gmm.covars)
+    weights = np.ravel(gmm.weights)
+
+    idx = means.argsort()
+    out.extend(means[idx]) 
+    out.extend(covars[idx]) 
+    out.extend(weights[idx])
+else:
+    out.extend([np.nan] * 3 * ns.components)
+
+print '\t'.join(map(str, out))


Property changes on: trunk/tools/wsor/registration_lags/fitmixture
___________________________________________________________________
Added: svn:executable
   + *

Added: trunk/tools/wsor/registration_lags/graphics.py
===================================================================
--- trunk/tools/wsor/registration_lags/graphics.py                      (rev 0)
+++ trunk/tools/wsor/registration_lags/graphics.py      2011-08-19 17:53:49 UTC (rev 95019)
@@ -0,0 +1,106 @@
+import numpy as np
+import matplotlib.pyplot as pp
+from matplotlib import cm
+from scipy.stats import gaussian_kde
+from matplotlib.collections import LineCollection
+
+def stackedarea(x, components, weights, cmap=cm.YlGnBu, **kwargs):
+    '''
+    Produces a stacked area plot from given components and weights.
+    
+    Parameters
+    ----------
+    x           - ordinates
+    components  - a sequence of objects with a `pdf' method 
+    weights     - a sequence of components' weights 
+
+    Default color map is Yellow-Green-Blue. Additional keyword arguments are
+    passed matplotlib.pyplot.fill_between. Returns a list of PolyCollections
+    (one for each component).
+    '''
+    assert np.allclose(np.sum(weights), 1) and np.all(weights), 'illegal weights'
+    p = [ w * comp.pdf(x) for comp, w in zip(components, weights) ]
+    p = [ np.zeros(len(x)) ] + p
+    p = np.cumsum(p, axis=0)
+    N = len(p)
+    colors = cmap(np.linspace(0, 1, N) * (1 - 1.0 / N)) 
+    ret = []
+    for i in xrange(1, N):
+        kwargs['color'] = colors[i-1]
+        r = pp.fill_between(x, p[i-1], p[i], **kwargs)
+        ret.append(r)
+    pp.draw()
+    return ret
+
+def mixturehist(data, components, weights, bins=10, num=1000, cmap=cm.YlGnBu, **kwargs):
+    '''
+    Plots a histogram of given data with a stacked densities of given
+    components
+
+    Parameters
+    ----------
+    data        - data array
+    components  - a sequence of random variable objects (see scipy.stats)
+    weights     - a sequence of components' weights
+    bins        - number of histogram bins
+    num         - number of points at which stacked densities are evaluated
+    cmap        - stacked area plot color map
+
+    Additional keyword arguments are passed to both matplotlib.pyplot.hist and
+    stackedarea.
+    '''
+    histkw = dict(kwargs)
+    # settings for producing transparent histograms
+    histkw.update(normed=True, fc=(0,0,0,0), ec='k') 
+    pp.hist(data, bins=bins, **histkw)
+    xmin, xmax = pp.xlim()
+    xi = np.linspace(xmin, xmax, num)
+    stackedarea(xi, components, weights, cmap, **kwargs)
+
+def kdeplot(data, xmin=None, xmax=None, num=50, vmin=None, vmax=None, lc='k', **kwargs):
+    '''
+    Plots density of data, estimated via Gaussian Kernels, together with
+    vertical lines for each data point.
+
+    Parameters
+    ----------
+    data       - data sample
+    xmin, xmax - range of density line plot
+    num        - number of points at which kde will be evaluated
+    vmin, vmax - vertical lines will span from vmin to vmax
+    lc         - vertical lines color
+
+    Returns
+    -------
+    l        - density line object
+    linecoll - collection of vertical lines
+    kde      - scipy.stats.kde.gaussian_kde
+
+    Additional keyword arguments are passed to plot the density line
+    '''
+    data = np.ravel(data)
+    x0, x1 = data.min(), data.max()
+    xmin = xmin or x0
+    xmax = xmax or x1
+    x = np.linspace(xmin, xmax, num)
+    kde = gaussian_kde(data)
+    d = kde.evaluate(x)
+    if 'axes' in kwargs:
+        ax = kwargs['axes']
+    elif 'figure' in kwargs:
+        ax = kwargs['figure'].axes[-1]
+    elif kwargs.pop('hold', False):
+        ax = pp.gca()
+    else:
+        fig = pp.figure()
+        ax = fig.add_subplot(111)
+    l = ax.plot(x,d, **kwargs)
+    y0, y1 = ax.get_ylim()
+    vmax = vmax or -(y1 - y0) / 30.
+    vmin = vmin or -(y1 - y0) / 10.
+    linesiter = ( [(d, vmax), (d, vmin)] for d in data )
+    linecoll = LineCollection(linesiter, color=lc, alpha=.1)
+    ax.add_collection(linecoll)
+    ax.set_ylim(y0 - (y1 - vmin)/9., y1)
+    pp.draw()
+    return l, linecoll, kde

Added: trunk/tools/wsor/registration_lags/mixhist
===================================================================
--- trunk/tools/wsor/registration_lags/mixhist                          (rev 0)
+++ trunk/tools/wsor/registration_lags/mixhist  2011-08-19 17:53:49 UTC (rev 95019)
@@ -0,0 +1,72 @@
+#!/usr/bin/python
+# vim:ft=python
+
+''' plot histogram of waiting time data in log-space '''
+
+from argparse import ArgumentParser
+from pylab import *
+import os
+from scikits.learn.mixture import GMM
+from scipy.stats import norm
+
+from graphics import mixturehist
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('data_file', metavar='data', help='NumPy array file')
+parser.add_argument('components', type=int)
+parser.add_argument('-subtitle', metavar='TEXT')
+
+times = array([1/86400., 60/86400., 3600/86400., 1, 7, 30, 365])
+logtimes = log(times)
+timestexts = ['1 sec', '1 min', '1 hr', '1 d', '1 week', '1 month', '1 year']
+
+if __name__ == '__main__':
+    ns = parser.parse_args()
+
+    data = np.load(ns.data_file)
+    w = - diff(data, axis=1)
+    w = np.log(w[w > 0] / 86400.)
+ 
+    print 'fitting GMM with {} components'.format(ns.components)
+    gmm = GMM(ns.components)
+    gmm.fit(w[:, None])
+    means = np.ravel(gmm.means)
+    covars = np.ravel(gmm.covars)
+    weights = np.ravel(gmm.weights)
+
+    print
+    for i, (m, v, p) in enumerate(zip(means, covars, weights)):
+        print 'component {}'.format(i + 1)
+        print '-------------'
+        mu = np.exp(m + v / 2)
+        med = np.exp(m)
+        var = (np.exp(v) - 1) * np.exp(2 * m + v)
+        print 'mean: %8.4g' % mu
+        print 'median: %8.4g' % med
+        print 'std. dev.: %8.4g' % np.sqrt(var)
+        print 'weight: %8.4g' % p
+        print
+
+    comps = [ norm(m, np.sqrt(v)) for m, v in zip(means, covars) ]
+    mixturehist(w, comps, weights, 50, 1000)
+
+    ym, yM = ylim()
+
+    for t, l in zip(logtimes, timestexts):
+        axvline(t, color='k', ls=':')
+        text(t+.1, yM - 0.05 * (yM-ym), l, fontsize='small', color='k', rotation=-30)
+
+    xlabel('log(days)')
+    ylabel('density')
+
+    _title = 'Time between registration and first edit (N = {})'.format(len(w))
+    if ns.subtitle:
+        _title += '\n' + ns.subtitle
+    title(_title)
+
+    draw()
+
+    fn = os.path.splitext(ns.data_file)[0] + '.pdf'
+    savefig(fn, format='pdf')
+    print 'output saved into {}'.format(fn)
+    show()


Property changes on: trunk/tools/wsor/registration_lags/mixhist
___________________________________________________________________
Added: svn:executable
   + *


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

[MediaWiki-CVS] SVN: [95019] trunk/tools/wsor

Reply via email to