https://www.mediawiki.org/wiki/Special:Code/MediaWiki/114806
Revision: 114806
Author: giovanni
Date: 2012-04-09 18:58:38 +0000 (Mon, 09 Apr 2012)
Log Message:
-----------
refactored comprates + minor fixes
* comprates now gives interface to cohortrate
* fixed comments in lifecycle.rates
* renamed fetchrates into fetchcounts (more accurate)
* deleted scripts/groupbyday
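
In rough outline, the refactored pipeline now loads a cohort .npz archive produced by fetchcounts and hands it to lifecycle.rates.cohortrate, e.g. (a minimal sketch based on the diff below; the archive name and filter values are illustrative, and the exact cohortrate signature may differ):

    import numpy as np
    from lifecycle.rates import cohortrate

    # 'cohort.npz' is an illustrative name; fetchcounts writes one such archive
    # per cohort, with one per-user count array per user id
    npzarc = np.load('cohort.npz')
    rates = cohortrate(npzarc, onlyns=[0], minsnr=2.0, minsize=50)
    np.savetxt('cohort.tsv', rates, fmt='%d\t%12.8g\t%12.8g')
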
Modified Paths:
--------------
trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
trunk/tools/wsor/editor_lifecycle/scripts/comprates
Added Paths:
-----------
trunk/tools/wsor/editor_lifecycle/scripts/fetchcounts
Removed Paths:
-------------
trunk/tools/wsor/editor_lifecycle/scripts/fetchrates
trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
Modified: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
===================================================================
--- trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py 2012-04-09 18:58:36 UTC (rev 114805)
+++ trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py 2012-04-09 18:58:38 UTC (rev 114806)
@@ -61,13 +61,14 @@
compute activity rates only with edits to these namespaces
minsize - positive int
filter out activity rate estimates based on sample of size less than
- minsize minsnr - positive real
+ minsnr - positive real
filter out activity rate estimates with signal-to-noise rate less than
parameter
Returns
-------
-    an array of daily activity rate observations, together estimated uncertainties
+    an array of daily activity rate observations, together with estimated
+    uncertainties
'''
day_counts = {}
Modified: trunk/tools/wsor/editor_lifecycle/scripts/comprates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/comprates 2012-04-09 18:58:36 UTC (rev 114805)
+++ trunk/tools/wsor/editor_lifecycle/scripts/comprates 2012-04-09 18:58:38 UTC (rev 114806)
@@ -1,27 +1,23 @@
#!/usr/bin/python
-'''
-computes average activity rates from downloaded user counts archive files
-'''
+''' computes average activity rates from cohort archives with user counts '''
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
__author__ = "Giovanni Luca Ciampaglia"
__email__ = "[email protected]"
@@ -31,37 +27,96 @@
from argparse import ArgumentParser
-from lifecycle.rates import computerates
+from lifecycle.rates import cohortrate
parser = ArgumentParser(description=__doc__)
-parser.add_argument('input_path_list', metavar='data', nargs='+')
-parser.add_argument('-outdir', dest='output_dir', default=os.curdir)
-parser.add_argument('-every', type=int, help='default: average over %(default)d days',
-                    default=30, metavar='NUM')
-parser.add_argument('-n', '--namespace', type=int, action='append', help='select only these NS',
+parser.add_argument(
+ 'input_path_list',
+ metavar='FILE',
+ help='cohort archive(s)',
+ nargs='+')
+parser.add_argument(
+ '-C',
+ '--directory',
+ dest='output_dir',
+ default=os.curdir,
+ help='output to directory %(metavar)s',
+ metavar='DIR')
+parser.add_argument(
+ '-n',
+ '--namespace',
+ type=int,
+ action='append',
+ metavar='NS',
+ help='compute rates only with edits to %(metavar)s (may specify this '\
+ 'argument multiple times)',
dest='only')
-parser.add_argument('-G', '--geometric', action='store_true', help='compute '
- 'geometric mean and standard deviation of data')
+parser.add_argument(
+ '-s',
+ '--snr',
+ dest='minsnr',
+ type=float,
+ metavar='RATIO',
+ help='minimum signal-to-noise ratio of observations')
+parser.add_argument(
+ '-m',
+ '--size',
+ type=int,
+ dest='minsize',
+ metavar='SIZE',
+ help='compute averages over samples of minimum size only')
-__prog__ = os.path.basename(__file__)
+__prog__ = os.path.basename(os.path.abspath(__file__))
def main(ns):
+
+ # test output directory exists
+ if not os.path.isdir(ns.output_dir):
+ print >> sys.stderr, '%s: error: not an existing directory: %s' % \
+ (__prog__, ns.output_dir)
+ sys.exit(1)
+ # check SNR argument and if OK feedback to user
+ if ns.minsnr is not None:
+ if ns.minsnr <= 0:
+            print >> sys.stderr, '%s: error: SNR is not a ratio: %g' % (__prog__,
+                ns.minsnr)
+ sys.exit(1)
+ else:
+ print '%s: minimum signal-to-noise ratio: %g' % (__prog__,
+ ns.minsnr)
+
+ # check size argument and if OK feedback to user
+ if ns.minsize is not None:
+ if ns.minsize <= 0:
+ print >> sys.stderr, '%s: error: not a valid sample size: %d' %\
+ (__prog__, ns.minsize)
+ else:
+ print '%s: minimum sample size: %d' % (__prog__, ns.minsize)
+
+ # loop over inputs
for path in ns.input_path_list:
+ # check input path exists
+ if not os.path.exists(path):
+ print >> sys.stderr, '%s: error: skipping non-existing file: %s' %\
+ (__prog__, path)
+ continue
+
# define output path
output_path = os.path.basename(path)
output_path = os.path.splitext(output_path)[0] + '.tsv'
output_path = os.path.join(ns.output_dir, output_path)
# compute rates for this cohort and save them to file
- rates = computerates(path, ns.every, onlyns=ns.only,
- geometric=ns.geometric)
- np.savetxt(output_path, rates, fmt='%d\t%12.8g\t%12.8g\t%d')
+ npzarc = np.load(path)
+ rates = cohortrate(npzarc, onlyns=ns.only, minsnr=ns.minsnr,
+ minsize=ns.minsize)
+ np.savetxt(output_path, rates, fmt='%d\t%12.8g\t%12.8g')
print '%s: output saved to %s' % (__prog__, output_path)
if __name__ == '__main__':
- # get arguments from command line
+ # parse arguments from command line
ns = parser.parse_args()
main(ns)
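
For reference, the refactored script now writes one three-column TSV per cohort archive (per the savetxt format above, presumably day index, rate estimate, and its uncertainty), so an invocation along the lines of "comprates -C results -n 0 -s 2.0 -m 50 cohort.npz" could be read back with something like this minimal sketch (the file name and column interpretation are assumptions):

    import numpy as np

    # columns written by comprates: day, rate, uncertainty (assumed ordering)
    day, rate, err = np.loadtxt('results/cohort.tsv', unpack=True)
    print '%d days covered, mean daily rate %g' % (len(day), rate.mean())
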
Copied: trunk/tools/wsor/editor_lifecycle/scripts/fetchcounts (from rev 114805, trunk/tools/wsor/editor_lifecycle/scripts/fetchrates)
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fetchcounts (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fetchcounts 2012-04-09 18:58:38 UTC (rev 114806)
@@ -0,0 +1,125 @@
+#!/usr/bin/python
+
+''' Fetches edit count data from database '''
+
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+__author__ = "Giovanni Luca Ciampaglia"
+__email__ = "[email protected]"
+
+import sys
+import os
+import numpy as np
+
+from zipfile import ZipFile
+from contextlib import closing
+from tempfile import mkdtemp
+from oursql import connect, InterfaceError
+from argparse import ArgumentParser
+from datetime import datetime
+
+from lifecycle.rates import userrate
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument(
+ 'input_path',
+ metavar='FILE',
+ help='cohort file (with user ids on a single line)')
+parser.add_argument(
+ '-config',
+ dest='config_file',
+ metavar='FILE',
+ default='~/.my.cnf',
+ help='mysql config file (default: %(default)s)')
+parser.add_argument(
+ '-outdir',
+ dest='output_dir',
+ help='output directory',
+ metavar='DIR',
+ default=os.curdir)
+
+_query = """
+SELECT UNIX_TIMESTAMP(rev_timestamp)/86400.0, page_namespace
+FROM revision r
+JOIN page p
+ON r.rev_page = p.page_id
+WHERE rev_user = ?
+ORDER BY rev_timestamp
+"""
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+def main(ns):
+ # test configuration file for mysql client exists
+ cnf = os.path.expanduser(os.path.expandvars(ns.config_file))
+ if not os.path.exists(cnf):
+        print >> sys.stderr, '%s: error: no config file found: %s' % (__prog__, cnf)
+ sys.exit(1)
+
+ # test output path exists and is a directory
+ if not os.path.isdir(ns.output_dir):
+        print >> sys.stderr, '%s: error: not an existing directory: %s' % (__prog__,
+            ns.output_dir)
+ sys.exit(1)
+
+ # read user ids from cohort file, create zip archive and temp dir
+ with closing(open(ns.input_path)) as f:
+ line = f.readline().strip()
+ if line:
+ user_ids = map(int, line.split('\t'))
+ else:
+            print >> sys.stderr, '%s: error: empty cohort file: %s' % (__prog__, ns.input_path)
+ sys.exit(1)
+
+ # connect to DB
+ try:
+        conn = connect(read_default_file=cnf)
+ except InterfaceError, e:
+ print >> sys.stderr, '%s: error: %s' % (__prog__, e.args[1])
+ sys.exit(1)
+
+ # create output archive and temp working dir
+ zip_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.npz'
+ zip_path = os.path.join(ns.output_dir, zip_path)
+ temp_dir = mkdtemp()
+
+ with closing(ZipFile(zip_path, 'w')) as zf:
+
+ # compute user rates and write them into the zip file
+ with conn.cursor() as cursor:
+ for uid in user_ids:
+ cursor.execute(_query, (uid,))
+ rows = list(cursor)
+ if len(rows) == 0:
+ continue
+ data = userrate(rows)
+ path = os.path.join(temp_dir, '%d.npy' % uid)
+                np.save(path, data)
+ zf.write(path, os.path.basename(path))
+ os.remove(path)
+
+ # remove temp dir
+ os.removedirs(temp_dir)
+
+ # tell user
+ print '%s: output saved to %s' % (datetime.now(), zip_path)
+
+if __name__ == '__main__':
+ # parse arguments from command line
+ ns = parser.parse_args()
+ main(ns)
+
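
The .npz archive written by this script can be inspected directly with numpy; a minimal sketch (the file name is illustrative, and each entry holds whatever lifecycle.rates.userrate returned for one user):

    import numpy as np

    arc = np.load('cohort.npz')   # zip archive written by the script above
    for key in arc.files:         # one member per user id, e.g. '12345'
        data = arc[key]           # per-user array produced by userrate()
        print key, data.shape
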
Deleted: trunk/tools/wsor/editor_lifecycle/scripts/fetchrates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fetchrates 2012-04-09 18:58:36 UTC (rev 114805)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fetchrates 2012-04-09 18:58:38 UTC (rev 114806)
@@ -1,125 +0,0 @@
-#!/usr/bin/python
-
-''' Fetches edit count data from database '''
-
-# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-# http://www.gnu.org/copyleft/gpl.html
-
-__author__ = "Giovanni Luca Ciampaglia"
-__email__ = "[email protected]"
-
-import sys
-import os
-import numpy as np
-
-from zipfile import ZipFile
-from contextlib import closing
-from tempfile import mkdtemp
-from oursql import connect, InterfaceError
-from argparse import ArgumentParser
-from datetime import datetime
-
-from lifecycle.rates import userrate
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument(
- 'input_path',
- metavar='FILE',
- help='cohort file (with user ids on a single line)')
-parser.add_argument(
- '-config',
- dest='config_file',
- metavar='FILE',
- default='~/.my.cnf',
- help='mysql config file (default: %(default)s)')
-parser.add_argument(
- '-outdir',
- dest='output_dir',
- help='output directory',
- metavar='DIR',
- default=os.curdir)
-
-_query = """
-SELECT UNIX_TIMESTAMP(rev_timestamp)/86400.0, page_namespace
-FROM revision r
-JOIN page p
-ON r.rev_page = p.page_id
-WHERE rev_user = ?
-ORDER BY rev_timestamp
-"""
-
-__prog__ = os.path.basename(os.path.abspath(__file__))
-
-def main(ns):
- # test configuration file for mysql client exists
- cnf = os.path.expanduser(os.path.expandvars(ns.config_file))
- if not os.path.exists(cnf):
-        print >> sys.stderr, '%s: error: no config file found: %s' % (__prog__, cnf)
- sys.exit(1)
-
- # test output path exists and is a directory
- if not os.path.isdir(ns.output_dir):
-        print >> sys.stderr, '%s: error: not an existing directory: %s' % (__prog__,
-            ns.output_dir)
- sys.exit(1)
-
- # read user ids from cohort file, create zip archive and temp dir
- with closing(open(ns.input_path)) as f:
- line = f.readline().strip()
- if line:
- user_ids = map(int, line.split('\t'))
- else:
-            print >> sys.stderr, '%s: error: empty cohort file: %s' % ns.input_path
- sys.exit(1)
-
- # connect to DB
- try:
- conn = connect(read_default_file=ns.config_file)
- except InterfaceError, e:
- print >> sys.stderr, '%s: error: %s' % (__prog__, e.args[1])
- sys.exit(1)
-
- # create output archive and temp working dir
- zip_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.npz'
- zip_path = os.path.join(ns.output_dir, zip_path)
- temp_dir = mkdtemp()
-
- with closing(ZipFile(zip_path, 'w')) as zf:
-
- # compute user rates and write them into the zip file
- with conn.cursor() as cursor:
- for uid in user_ids:
- cursor.execute(_query, (uid,))
- rows = list(cursor)
- if len(rows) == 0:
- continue
- data = userrate(rows)
- path = os.path.join(temp_dir, '%d.npy' % uid)
- np.save(temp_dir, data)
- zf.write(path, os.path.basename(path))
- os.remove(path)
-
- # remove temp dir
- os.removedirs(temp_dir)
-
- # tell user
- print '%s: output saved to %s' % (datetime.now(), zip_path)
-
-if __name__ == '__main__':
- # parse arguments from command line
- ns = parser.parse_args()
- main(ns)
-
Deleted: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/groupbyday 2012-04-09 18:58:36 UTC (rev 114805)
+++ trunk/tools/wsor/editor_lifecycle/scripts/groupbyday 2012-04-09 18:58:38 UTC (rev 114806)
@@ -1,64 +0,0 @@
-#!/usr/bin/python
-#:vim:ft=python
-# encoding:utf-8
-
-''' groups user counts by day since registration '''
-
-import os
-from argparse import ArgumentParser
-import numpy as np
-from scipy.sparse import coo_matrix
-from collections import deque
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('input_paths', metavar='file', nargs='+')
-parser.add_argument('-p', '--prefix', dest='out_prefix', default='daily_',
- metavar='PREFIX', help="(default: %(metavar)s)")
-
-def group_by_day(counts):
- '''
- counts is a mapping between user IDs and edits-by-namespace count data
- '''
- # hold cohort daily counts in a mapping in memory
- day_counts = {}
-
- for uid in counts:
- data = counts[uid].view(np.recarray)
-
- # NS < 0 are virtual. Filter out those edits because they are junk
- idx = data.ns >= 0
- data = data[idx]
-
- # Sparse matrix (num_days x namespaces) where num_days is the activity
- # span in days. Summing along rows returns a dense matrix
- counts_matrix = coo_matrix((data.edits, (data.day - data.day.min(), \
- data.ns))).tocsc().sum(axis=1)
-
- # Add counts to cohort daily counts
- for day in xrange(counts_matrix.shape[0]):
- n = int(counts_matrix[day])
- try:
- day_counts[str(day)].append(n)
- except KeyError:
- day_counts[str(day)] = deque([n])
-
- return day_counts
-
-def main(args):
- for path in args.input_paths:
- # if path is /a/b/c/whatever.npz, by default output will be in
- # $WD/byday_whatever.npz where $WD is the working dir
- out_path = args.out_prefix + os.path.basename(path)
- out_path = os.path.splitext(out_path)[0] + '.npz'
-
- # load input, group, save to file, tell user
- user_counts = np.load(path)
- N = len(user_counts.files)
- print '%d users in %s' % (N, path)
- day_counts = group_by_day(user_counts)
- np.savez(out_path, **day_counts)
- print '%s saved (%d days).' % (out_path, len(day_counts))
-
-if __name__ == '__main__':
- args = parser.parse_args()
- main(args)