https://www.mediawiki.org/wiki/Special:Code/MediaWiki/114807

Revision: 114807
Author:   giovanni
Date:     2012-04-09 18:58:40 +0000 (Mon, 09 Apr 2012)
Log Message:
-----------
refactored fitting

Modified Paths:
--------------
    trunk/tools/wsor/editor_lifecycle/scripts/fitting

Modified: trunk/tools/wsor/editor_lifecycle/scripts/fitting
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fitting   2012-04-09 18:58:38 UTC 
(rev 114806)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fitting   2012-04-09 18:58:40 UTC 
(rev 114807)
@@ -2,26 +2,24 @@
 # coding: utf-8
 # :vim:ft=python
 
-''' editor lifecycle data fitting tool '''
+''' fits data to a parametric model '''
 
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
 
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
 __author__ = "Giovanni Luca Ciampaglia"
 __email__ = "[email protected]"
 
@@ -29,7 +27,6 @@
 import os
 import numpy as np
 import matplotlib.pyplot as pp
-from matplotlib.font_manager import FontProperties
 from argparse import ArgumentParser
 from scipy.optimize import curve_fit
 
@@ -38,146 +35,84 @@
 
 __prog__ = os.path.basename(os.path.abspath(__file__))
 
-_maxfev = 100000
+_maxfev = 10000
 
-parent = ArgumentParser(add_help=False)
-parent.add_argument('data_files', metavar='DATA', nargs='+')
-parent.add_argument('-output', dest='output_file', metavar='FILE')
-parent.add_argument('-title')
-group = parent.add_mutually_exclusive_group()
-group.add_argument('-loglog', action='store_true')
-group.add_argument('-logpow', action='store_true')
-parent.add_argument('-constrain', choices=['head', 'tail', 'both'])
-parent.add_argument('-batch', action='store_true', help='do not show graphics')
-parent.add_argument('-force', action='store_true', help='force overwrite')
-parent.add_argument('-minsize', type=int)
-parent.add_argument('-c', '--chop-tail', type=int, default=0, metavar='NUM',
-        help='remove %(metavar)s tail observations')
-parent.add_argument('-xmax', type=int, help='show only (0,xmax) interval')
-parent.add_argument('-snr', type=float, help='minimum signal-to-noise ratio')
-
 parser = ArgumentParser(description=__doc__)
-subparsers = parser.add_subparsers(help='Parametric models supported')
+parser.add_argument(
+        'path', 
+        metavar='FILE',
+        help='input text data file')
+parser.add_argument(
+        '-C',
+        '--directory',
+        help='output directory',
+        dest='output_dir',
+        default=os.curdir,
+        metavar='DIR')
+parser.add_argument(
+        '-f',
+        '--force', 
+        action='store_true', 
+        dest='force_overwrite',
+        help='force overwrite')
+parser.add_argument(
+        '-b',
+        '--batch', 
+        action='store_true', 
+        help='do not show graphics')
 
-parser_expon = subparsers.add_parser('expon', parents=[parent])
-parser_expon.set_defaults(modelclass=Expon)
+model_group = parser.add_mutually_exclusive_group(required=1)
+model_group.add_argument(
+        '-me',
+        '--expon', 
+        action='store_const',
+        const=Expon, 
+        help='exponential model',
+        dest='modelclass')
+model_group.add_argument(
+        '-ms',
+        '--stretched', 
+        action='store_const',
+        const=StretchedExpon, 
+        help='stretched exponential model',
+        dest='modelclass')
+model_group.add_argument(
+        '-mp',
+        '--power-law', 
+        action='store_const',
+        const=PowerLaw, 
+        help='power-law model',
+        dest='modelclass')
 
-parser_stretch = subparsers.add_parser('stretchedexp', parents=[parent])
-parser_stretch.set_defaults(modelclass=StretchedExpon)
+group = parser.add_mutually_exclusive_group()
+group.add_argument(
+        '-ll', 
+        '--loglog', 
+        action='store_true', 
+        help='display log-power axis')
+group.add_argument(
+        '-lp',
+        '--logpow',
+        help='display log-log axis',
+        action='store_true')
 
-parser_power = subparsers.add_parser('powerlaw', parents=[parent])
-parser_power.set_defaults(modelclass=PowerLaw)
-
-class DataError(Exception):
-    pass
-
-class TooFewDataError(DataError):
-    pass
-
-class EmptyDataError(DataError):
-    pass
-
-def plotfit(model, x, y, ye, label='data', **kwargs):
-    xx = np.linspace(x.min(), x.max(), endpoint=True, num=1000)
-    yy = model(xx)
-    kwargs['ecolor'] = 'none'
-    kwargs['ls'] = 'none'
-    kwargs['mfc'] = kwargs.get('color', 'k')
-    kwargs['ms'] = 4
-    pp.errorbar(x, y, ye, label=label, **kwargs)
-#    pp.plot(x, y, label=label, **kwargs)
-    model_label = model.name.split()
-    if len(model_label) > 1:
-        model_label[1] = model_label[1][:3] + '.'
-    model_label = ' '.join(model_label[:2]).capitalize()
-    c = kwargs.get('color', 'r')
-    pp.plot(xx, yy, '--', color=c, lw=2)
-    if ns.loglog:
-        pp.xscale('log')
-        pp.yscale('log')
-    elif ns.logpow:
-        pp.xscale('power', exponent=model.beta)
-        pp.yscale('log')
-    pp.legend(loc='best', prop=FontProperties(size='x-small'))
-    if ns.title is not None:
-        pp.title(ns.title)
-    pp.xlabel('Days since registration')
-    pp.ylabel('Edits/day')
-
-def plotresid(model, x, y, label='data', **kwargs):
-    r = model(x) - y
-#    rm = r[True - np.isinf(r)].max()
-#    r /= np.abs(rm)
-    pp.axhline(y=0, c='k')
-    kwargs['ls'] = ':'
-    kwargs['lw'] = 2
-    kwargs['ms'] = 4
-    pp.plot(x, r, label=label, **kwargs)
-    pp.title('Fit residuals')
-    pp.xlabel('Days since registration')
-#    pp.ylabel(r'Relative residual $\xi / \max{|\xi|}$')
-#    pp.ylim(-1,1)
-    pp.draw()
-
-def _testoverwrite(*files):
-    exit_flag = False
-    for fn in files:
-        if os.path.exists(fn):
-            exit_flag = True
-            print '%s: error: cannot overwrite %s' % (__prog__, fn)
-    if exit_flag:
-        sys.exit(1)
-
-# TODO add usecols and option for taking only observations based on minimum
-# sample size
-
-def fit(data_file, modelclass, constrain=None, minsize=None):
+def fit(path, modelclass):
     '''
-    Fits a model to data. Data are read from `data_file` and the model is
-    specified as a `modelclass` (see `lifecycle.models`). The data file should
-    contain a (N,4) array where columns are x, y, ye (errors) and n (sample
-    sizes -- number of users on which the average rate y is computed).
+    fits a `lifecycle.models.ParametricModel`
 
     Parameters
     ----------
-    If `modelclass` is any of `Expon` or `StretchedExpon`, one can constrain 
the
-    value of some parameters before performing the fit. To this end one can use
-    the parameter `constrain`, which can be any of the following strings: 
-    'both', 'head', or 'tail'. Their meaning is the following:
+    path - a path to a text file with data (x, y, ye)
+    modelclass - a subclass of `lifecycle.models.ParametricModel`. 
+        See docstring of `lifecycle.models` for a list of available models
 
-    * head : A = y[0]
-    * tail : C = y[-1]
-    * both : A = y[0] and C = y[-1]
+    Returns
+    -------
+    x, y, ye - data loaded from `path`
+    model - a fitted model of class `modelclass`
     '''
-    x, y, ye, n = np.loadtxt(data_file, unpack=True)
-    # XXX using the S/N ratio to filter out noisy estimates of the ratio
-    s = ye / np.sqrt(n)
-    snr = y / s
-    idx = snr >= ns.snr
-#     x = x[:-ns.chop_tail]
-#     y = y[:-ns.chop_tail]
-#     ye = ye[:-ns.chop_tail]
-#     n = n[:-ns.chop_tail]
-    idx = idx * (ye > 0)
-#     if minsize:
-#         idx = idx * (n > minsize)
-    if not np.any(idx):
-        raise EmptyDataError()
-    if np.sum(idx) < len(modelclass.__params__):
-        raise TooFewDataError()
-    x = x[idx]
-    y = y[idx]
-    ye = ye[idx]
-    n = n[idx]
-    # XXX <Sat Oct 29 17:30:38 CEST 2011> getting the standard error of the
-    # mean. This should be really computed in in lifecycle.rates and new data
-    # files be produced using mkrates.
-    ye = ye / np.sqrt(n)
+    x, y, ye = np.loadtxt(path, unpack=True)
     model = modelclass()
-    if constrain in ['head', 'both']:
-        model.A = y[np.argmin(np.abs(x))]
-    if constrain in ['tail', 'both']:
-        model.C = y.min()
     pest, pcov = model.fit(x, y, ye=ye, maxfev=_maxfev)
     if not np.isscalar(pcov):
         perr = np.sqrt(np.diag(pcov)) / 2.
@@ -188,82 +123,107 @@
     model.goftest = gof
     model.residtest = resid
     model.Rsquared = Rsquared
-    print model.summary(dataset=data_file, observations=len(x))
+    print model.summary(dataset=path, observations=len(x))
     return x, y, ye, model
 
+# plotfit, data
+_kwargs1 = { 
+        'color' : 'black',
+        'marker' : 'o',
+        'ecolor' : 'none',
+        'ls' : 'none',
+        'ms' : 2,
+}
+
+# plotfit, fitted curve
+_kwargs2 = { 
+        'color' : 'darkgray',
+        'lw' : 2,
+        'ls' : '-',
+}
+
+# plotresid
+_kwargs3 = {
+        'marker' : 'o',
+        'color' : 'black',
+        'ls' : 'none',
+        'lw' : 2,
+        'ms' : 4,
+}
+
 def main(ns):
-    # import matploblib 
-#     from matplotlib.lines import lineMarkers as markers
-#     markers = dict(filter(
-#             lambda k : isinstance(k[0],str) and k[1] is not '_draw_nothing',
-#             markers.items())).keys()
-#     markers.remove('.')
-#     markers.remove(',')
-    markers = 'oD^vs*hHpdv<>'
-    colors = 'brgkcmyw'
+    # test output dir exists
+    if not os.path.isdir(ns.output_dir):
+        print >> sys.stderr, '%s: error: not an existing directory: %s' %\
+                (__prog__, ns.output_dir)
+        sys.exit(1)
 
-    data = []
-    models = []
-    labels = []
+    # test input file exists
+    if not os.path.exists(ns.path):
+        print >> sys.stderr, '%s: error: not such file: %s' % (__prog__,
+                ns.path)
+        sys.exit(1)
+    title, _ = os.path.splitext(os.path.basename(ns.path))
 
-    # fit all datasets
-    for f in ns.data_files:
-        try:
-            x, y, ye, model = fit(f, ns.modelclass, ns.constrain, ns.minsize)
-        except TooFewDataError:
-            print >> sys.stderr, '%s: warning: too few data: %s' % (__prog__, 
f)
-            continue
-        except EmptyDataError:
-            print >> sys.stderr, '%s: warning: no usable data: %s'% (__prog__, 
f)
-            continue
-        data.append((x,y,ye))
-        models.append(model)
-        lab = os.path.splitext(os.path.basename(f))[0]
-        labels.append(lab.replace('_','\\_'))
-    
-    # plot fits 
+    # test path to fit plot file does not exists (unless --force was passed)
+    plot_path = os.path.join(ns.output_dir, title + '.pdf')
+    if os.path.exists(plot_path) and not ns.force_overwrite:
+        print >> sys.stderr, '%s: error: file exists (pass --force): %s' %\
+                (__prog__, plot_path)
+        sys.exit(1)
+
+    # test path to residuals plot file does not exists (unless --force was
+    # passed)
+    resid_plot_path = os.path.join(ns.output_dir, 'resid_' + title + '.pdf')
+    if os.path.exists(resid_plot_path) and not ns.force_overwrite:
+        print >> sys.stderr, '%s: error: file exists (pass --force): %s' %\
+                (__prog__, resid_plot_path)
+        sys.exit(1)
+
+    # fit the model
+    x, y, ye, model = fit(ns.path, ns.modelclass)
+
+    # plot fitted curve and data
     pp.figure()
-    pp.hold(1)
-    for i, ((x, y, ye), model, label) in enumerate(zip(data, models, labels)):
-        m = markers[i % len(markers)]
-        c = colors[i % len(colors)]
-        plotfit(model, x, y, ye, label, marker=m, color=c)
-        if ns.xmax is not None:
-            pp.xlim(0, ns.xmax)
+    xx = np.linspace(x.min(), x.max(), endpoint=True, num=len(x) * 3 - 1)
+    yy = model(xx)
+    pp.plot(xx, yy, **_kwargs2)
+    pp.errorbar(x, y, ye, **_kwargs1)
 
+    # set axes scale
+    if ns.loglog:
+        pp.xscale('log')
+        pp.yscale('log')
+    elif ns.logpow:
+        pp.xscale('power', exponent=model.beta)
+        pp.yscale('log')
+
+    # title, axis labels
+    pp.title(title.replace('_','\\_'))
+    pp.xlabel('Days since registration')
+    pp.ylabel('Edits/day')
+    pp.draw()
+    pp.savefig(plot_path)
+    print '%s: fit plot saved to %s' % (__prog__, plot_path)
+
     # plot residuals
     pp.figure()
-    pp.hold(1)
-    for i, ((x, y, ye), model, label) in enumerate(zip(data, models, labels)):
-        m = markers[i % len(markers)]
-        c = colors[i % len(colors)]
-        plotresid(model, x, y, label, marker=m, color=c)
-        if ns.xmax is not None:
-            pp.xlim(0, ns.xmax)
+    r = model(x) - y
+    pp.axhline(y=0, c='k')
+    pp.plot(x, r, hold=1, **_kwargs3)
 
-    # save figures
-    if ns.output_file is not None:
-        fn, ext = os.path.splitext(ns.output_file)
-        fmt = ext[1:]
-        if ns.batch and fmt.lower() != 'pdf':
-            print '%s: error: batch mode supports only PDF format' % __prog__
-            sys.exit(1)
-        resid_output_file = fn + '_residuals' + ext
-        if not ns.force:
-            _testoverwrite(ns.output_file, resid_output_file)
-        pp.figure(1)
-        pp.savefig(ns.output_file, format=fmt)
-        print '%s: output saved to %s' % (__prog__, ns.output_file)
-        pp.figure(2)
-        pp.savefig(resid_output_file, format=fmt)
-        print '%s: output saved to %s' % (__prog__, resid_output_file)
+    # title, axis labels
+    pp.title(title.replace('_','\\_'))
+    pp.ylabel('Residuals')
+    pp.xlabel('Days since registration')
+    pp.draw()
+    pp.savefig(resid_plot_path)
+    print '%s: residuals plot saved to %s' % (__prog__, resid_plot_path)
 
-    pp.show()
+    if not ns.batch:
+        pp.show()
 
 if __name__ == '__main__':
+    # parse arguments from command line
     ns = parser.parse_args()
-#    if ns.batch:
-#        import matplotlib
-#        matplotlib.use('PDF')
-#    import matplotlib.pyplot as pp
     main(ns)


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to