https://www.mediawiki.org/wiki/Special:Code/MediaWiki/114806
Revision: 114806
Author: giovanni
Date: 2012-04-09 18:58:38 +0000 (Mon, 09 Apr 2012)
Log Message:
-----------
refactored comprates + minor fixes
* comprates now gives interface to cohortrate
* fixed comments in lifecycle.rates
* renamed fetchrates into fetchcounts (more accurate)
* deleted scripts/groupbyday
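
In rough outline, the refactored pipeline now loads a cohort .npz archive produced by fetchcounts and hands it to lifecycle.rates.cohortrate, e.g. (a minimal sketch based on the diff below; the archive name and filter values are illustrative, and the exact cohortrate signature may differ):

    import numpy as np
    from lifecycle.rates import cohortrate

    # 'cohort.npz' is an illustrative name; fetchcounts writes one such archive
    # per cohort, with one per-user count array per user id
    npzarc = np.load('cohort.npz')
    rates = cohortrate(npzarc, onlyns=[0], minsnr=2.0, minsize=50)
    np.savetxt('cohort.tsv', rates, fmt='%d\t%12.8g\t%12.8g')
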
Modified Paths:
--------------
trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
trunk/tools/wsor/editor_lifecycle/scripts/comprates
Added Paths:
-----------
trunk/tools/wsor/editor_lifecycle/scripts/fetchcounts
Removed Paths:
-------------
trunk/tools/wsor/editor_lifecycle/scripts/fetchrates
trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
Modified: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
===================================================================
--- trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py 2012-04-09 18:58:36 UTC (rev 114805)
+++ trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py 2012-04-09 18:58:38 UTC (rev 114806)
@@ -61,13 +61,14 @@
compute activity rates only with edits to these namespaces
minsize - positive int
filter out activity rate estimates based on sample of size less than
- minsize minsnr - positive real
+ minsnr - positive real
filter out activity rate estimates with signal-to-noise rate less than
parameter
Returns
-------
-    an array of daily activity rate observations, together estimated uncertainties
+    an array of daily activity rate observations, together with estimated
+    uncertainties
'''
day_counts = {}
Modified: trunk/tools/wsor/editor_lifecycle/scripts/comprates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/comprates 2012-04-09 18:58:36 UTC (rev 114805)
+++ trunk/tools/wsor/editor_lifecycle/scripts/comprates 2012-04-09 18:58:38 UTC (rev 114806)
@@ -1,27 +1,23 @@
#!/usr/bin/python
-'''
-computes average activity rates from downloaded user counts archive files
-'''
+''' computes average activity rates from cohort archives with user counts '''
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
__author__ = "Giovanni Luca Ciampaglia"
__email__ = "[email protected]"
@@ -31,37 +27,96 @@
from argparse import ArgumentParser
-from lifecycle.rates import computerates
+from lifecycle.rates import cohortrate
parser = ArgumentParser(description=__doc__)
-parser.add_argument('input_path_list', metavar='data', nargs='+')
-parser.add_argument('-outdir', dest='output_dir', default=os.curdir)
-parser.add_argument('-every', type=int, help='default: average over %(default)d days',
-                    default=30, metavar='NUM')
-parser.add_argument('-n', '--namespace', type=int, action='append', help='select only these NS',
+parser.add_argument(
+ 'input_path_list',
+ metavar='FILE',
+ help='cohort archive(s)',
+ nargs='+')
+parser.add_argument(
+ '-C',
+ '--directory',
+ dest='output_dir',
+ default=os.curdir,
+ help='output to directory %(metavar)s',
+ metavar='DIR')
+parser.add_argument(
+ '-n',
+ '--namespace',
+ type=int,
+ action='append',
+ metavar='NS',
+ help='compute rates only with edits to %(metavar)s (may specify this '\
+ 'argument multiple times)',
dest='only')
-parser.add_argument('-G', '--geometric', action='store_true', help='compute '
- 'geometric mean and standard deviation of data')
+parser.add_argument(
+ '-s',
+ '--snr',
+ dest='minsnr',
+ type=float,
+ metavar='RATIO',
+ help='minimum signal-to-noise ratio of observations')
+parser.add_argument(
+ '-m',
+ '--size',
+ type=int,
+ dest='minsize',
+ metavar='SIZE',
+ help='compute averages over samples of minimum size only')
-__prog__ = os.path.basename(__file__)
+__prog__ = os.path.basename(os.path.abspath(__file__))
def main(ns):
+
+ # test output directory exists
+ if not os.path.isdir(ns.output_dir):
+ print >> sys.stderr, '%s: error: not an existing directory: %s' % \
+ (__prog__, ns.output_dir)
+ sys.exit(1)
+ # check SNR argument and if OK feedback to user
+ if ns.minsnr is not None:
+ if ns.minsnr <= 0:
+            print >> sys.stderr, '%s: error: SNR is not a ratio: %g' % (__prog__,
+                ns.minsnr)
+ sys.exit(1)
+ else:
+ print '%s: minimum signal-to-noise ratio: %g' % (__prog__,
+ ns.minsnr)
+
+ # check size argument and if OK feedback to user
+ if ns.minsize is not None:
+ if ns.minsize <= 0:
+ print >> sys.stderr, '%s: error: not a valid sample size: %d' %\
+ (__prog__, ns.minsize)
+ else:
+ print '%s: minimum sample size: %d' % (__prog__, ns.minsize)
+
+ # loop over inputs
for path in ns.input_path_list:
+ # check input path exists
+ if not os.path.exists(path):
+ print >> sys.stderr, '%s: error: skipping non-existing file: %s' %\
+ (__prog__, path)
+ continue
+
# define output path
output_path = os.path.basename(path)
output_path = os.path.splitext(output_path)[0] + '.tsv'
output_path = os.path.join(ns.output_dir, output_path)
# compute rates for this cohort and save them to file
- rates = computerates(path, ns.every, onlyns=ns.only,
- geometric=ns.geometric)
- np.savetxt(output_path, rates, fmt='%d\t%12.8g\t%12.8g\t%d')
+ npzarc = np.load(path)
+ rates = cohortrate(npzarc, onlyns=ns.only, minsnr=ns.minsnr,
+ minsize=ns.minsize)
+ np.savetxt(output_path, rates, fmt='%d\t%12.8g\t%12.8g')
print '%s: output saved to %s' % (__prog__, output_path)
if __name__ == '__main__':
- # get arguments from command line
+ # parse arguments from command line
ns = parser.parse_args()
main(ns)
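
For reference, the refactored script now writes one three-column TSV per cohort archive (per the savetxt format above, presumably day index, rate estimate, and its uncertainty), so an invocation along the lines of "comprates -C results -n 0 -s 2.0 -m 50 cohort.npz" could be read back with something like this minimal sketch (the file name and column interpretation are assumptions):

    import numpy as np

    # columns written by comprates: day, rate, uncertainty (assumed ordering)
    day, rate, err = np.loadtxt('results/cohort.tsv', unpack=True)
    print '%d days covered, mean daily rate %g' % (len(day), rate.mean())
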
Copied: trunk/tools/wsor/editor_lifecycle/scripts/fetchcounts (from rev 114805, trunk/tools/wsor/editor_lifecycle/scripts/fetchrates)
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fetchcounts (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fetchcounts 2012-04-09 18:58:38 UTC (rev 114806)
@@ -0,0 +1,125 @@
+#!/usr/bin/python
+
+''' Fetches edit count data from database '''
+
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
+
+__author__ = "Giovanni Luca Ciampaglia"
+__email__ = "[email protected]"
+
+import sys
+import os
+import numpy as np
+
+from zipfile import ZipFile
+from contextlib import closing
+from tempfile import mkdtemp
+from oursql import connect, InterfaceError
+from argparse import ArgumentParser
+from datetime import datetime
+
+from lifecycle.rates import userrate
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument(
+ 'input_path',
+ metavar='FILE',
+ help='cohort file (with user ids on a single line)')
+parser.add_argument(
+ '-config',
+ dest='config_file',
+ metavar='FILE',
+ default='~/.my.cnf',
+ help='mysql config file (default: %(default)s)')
+parser.add_argument(
+ '-outdir',
+ dest='output_dir',
+ help='output directory',
+ metavar='DIR',
+ default=os.curdir)
+
+_query = """
+SELECT UNIX_TIMESTAMP(rev_timestamp)/86400.0, page_namespace
+FROM revision r
+JOIN page p
+ON r.rev_page = p.page_id
+WHERE rev_user = ?
+ORDER BY rev_timestamp
+"""
+
+__prog__ = os.path.basename(os.path.abspath(__file__))
+
+def main(ns):
+ # test configuration file for mysql client exists
+ cnf = os.path.expanduser(os.path.expandvars(ns.config_file))
+ if not os.path.exists(cnf):
+        print >> sys.stderr, '%s: error: no config file found: %s' % (__prog__, cnf)
+ sys.exit(1)
+
+ # test output path exists and is a directory
+ if not os.path.isdir(ns.output_dir):
+        print >> sys.stderr, '%s: error: not an existing directory: %s' % (__prog__,
+            ns.output_dir)
+ sys.exit(1)
+
+ # read user ids from cohort file, create zip archive and temp dir
+ with closing(open(ns.input_path)) as f:
+ line = f.readline().strip()
+ if line:
+ user_ids = map(int, line.split('\t'))
+ else:
+            print >> sys.stderr, '%s: error: empty cohort file: %s' % (__prog__, ns.input_path)
+ sys.exit(1)
+
+ # connect to DB
+ try:
+        conn = connect(read_default_file=cnf)
+ except InterfaceError, e:
+ print >> sys.stderr, '%s: error: %s' % (__prog__, e.args[1])
+ sys.exit(1)
+
+ # create output archive and temp working dir
+ zip_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.npz'
+ zip_path = os.path.join(ns.output_dir, zip_path)
+ temp_dir = mkdtemp()
+
+ with closing(ZipFile(zip_path, 'w')) as zf:
+
+ # compute user rates and write them into the zip file
+ with conn.cursor() as cursor:
+ for uid in user_ids:
+ cursor.execute(_query, (uid,))
+ rows = list(cursor)
+ if len(rows) == 0:
+ continue
+ data = userrate(rows)
+ path = os.path.join(temp_dir, '%d.npy' % uid)
+                np.save(path, data)
+ zf.write(path, os.path.basename(path))
+ os.remove(path)
+
+ # remove temp dir
+ os.removedirs(temp_dir)
+
+ # tell user
+ print '%s: output saved to %s' % (datetime.now(), zip_path)
+
+if __name__ == '__main__':
+ # parse arguments from command line
+ ns = parser.parse_args()
+ main(ns)
+
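
The .npz archive written by this script can be inspected directly with numpy; a minimal sketch (the file name is illustrative, and each entry holds whatever lifecycle.rates.userrate returned for one user):

    import numpy as np

    arc = np.load('cohort.npz')   # zip archive written by the script above
    for key in arc.files:         # one member per user id, e.g. '12345'
        data = arc[key]           # per-user array produced by userrate()
        print key, data.shape
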
Deleted: trunk/tools/wsor/editor_lifecycle/scripts/fetchrates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fetchrates 2012-04-09 18:58:36 UTC (rev 114805)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fetchrates 2012-04-09 18:58:38 UTC (rev 114806)
@@ -1,125 +0,0 @@
-#!/usr/bin/python
-
-''' Fetches edit count data from database '''
-
-# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-# http://www.gnu.org/copyleft/gpl.html
-
-__author__ = "Giovanni Luca Ciampaglia"
-__email__ = "[email protected]"
-
-import sys
-import os
-import numpy as np
-
-from zipfile import ZipFile
-from contextlib import closing
-from tempfile import mkdtemp
-from oursql import connect, InterfaceError
-from argparse import ArgumentParser
-from datetime import datetime
-
-from lifecycle.rates import userrate
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument(
- 'input_path',
- metavar='FILE',
- help='cohort file (with user ids on a single line)')
-parser.add_argument(
- '-config',
- dest='config_file',
- metavar='FILE',
- default='~/.my.cnf',
- help='mysql config file (default: %(default)s)')
-parser.add_argument(
- '-outdir',
- dest='output_dir',
- help='output directory',
- metavar='DIR',
- default=os.curdir)
-
-_query = """
-SELECT UNIX_TIMESTAMP(rev_timestamp)/86400.0, page_namespace
-FROM revision r
-JOIN page p
-ON r.rev_page = p.page_id
-WHERE rev_user = ?
-ORDER BY rev_timestamp
-"""
-
-__prog__ = os.path.basename(os.path.abspath(__file__))
-
-def main(ns):
- # test configuration file for mysql client exists
- cnf = os.path.expanduser(os.path.expandvars(ns.config_file))
- if not os.path.exists(cnf):
-        print >> sys.stderr, '%s: error: no config file found: %s' % (__prog__, cnf)
- sys.exit(1)
-
- # test output path exists and is a directory
- if not os.path.isdir(ns.output_dir):
-        print >> sys.stderr, '%s: error: not an existing directory: %s' % (__prog__,
-            ns.output_dir)
- sys.exit(1)
-
- # read user ids from cohort file, create zip archive and temp dir
- with closing(open(ns.input_path)) as f:
- line = f.readline().strip()
- if line:
- user_ids = map(int, line.split('\t'))
- else:
-            print >> sys.stderr, '%s: error: empty cohort file: %s' % ns.input_path
- sys.exit(1)
-
- # connect to DB
- try:
- conn = connect(read_default_file=ns.config_file)
- except InterfaceError, e:
- print >> sys.stderr, '%s: error: %s' % (__prog__, e.args[1])
- sys.exit(1)
-
- # create output archive and temp working dir
- zip_path = os.path.splitext(os.path.basename(ns.input_path))[0] + '.npz'
- zip_path = os.path.join(ns.output_dir, zip_path)
- temp_dir = mkdtemp()
-
- with closing(ZipFile(zip_path, 'w')) as zf:
-
- # compute user rates and write them into the zip file
- with conn.cursor() as cursor:
- for uid in user_ids:
- cursor.execute(_query, (uid,))
- rows = list(cursor)
- if len(rows) == 0:
- continue
- data = userrate(rows)
- path = os.path.join(temp_dir, '%d.npy' % uid)
- np.save(temp_dir, data)
- zf.write(path, os.path.basename(path))
- os.remove(path)
-
- # remove temp dir
- os.removedirs(temp_dir)
-
- # tell user
- print '%s: output saved to %s' % (datetime.now(), zip_path)
-
-if __name__ == '__main__':
- # parse arguments from command line
- ns = parser.parse_args()
- main(ns)
-
Deleted: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/groupbyday 2012-04-09 18:58:36 UTC (rev 114805)
+++ trunk/tools/wsor/editor_lifecycle/scripts/groupbyday 2012-04-09 18:58:38 UTC (rev 114806)
@@ -1,64 +0,0 @@
-#!/usr/bin/python
-#:vim:ft=python
-# encoding:utf-8
-
-''' groups user counts by day since registration '''
-
-import os
-from argparse import ArgumentParser
-import numpy as np
-from scipy.sparse import coo_matrix
-from collections import deque
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('input_paths', metavar='file', nargs='+')
-parser.add_argument('-p', '--prefix', dest='out_prefix', default='daily_',
- metavar='PREFIX', help="(default: %(metavar)s)")
-
-def group_by_day(counts):
- '''
- counts is a mapping between user IDs and edits-by-namespace count data
- '''
- # hold cohort daily counts in a mapping in memory
- day_counts = {}
-
- for uid in counts:
- data = counts[uid].view(np.recarray)
-
- # NS < 0 are virtual. Filter out those edits because they are junk
- idx = data.ns >= 0
- data = data[idx]
-
- # Sparse matrix (num_days x namespaces) where num_days is the activity
- # span in days. Summing along rows returns a dense matrix
- counts_matrix = coo_matrix((data.edits, (data.day - data.day.min(), \
- data.ns))).tocsc().sum(axis=1)
-
- # Add counts to cohort daily counts
- for day in xrange(counts_matrix.shape[0]):
- n = int(counts_matrix[day])
- try:
- day_counts[str(day)].append(n)
- except KeyError:
- day_counts[str(day)] = deque([n])
-
- return day_counts
-
-def main(args):
- for path in args.input_paths:
- # if path is /a/b/c/whatever.npz, by default output will be in
- # $WD/byday_whatever.npz where $WD is the working dir
- out_path = args.out_prefix + os.path.basename(path)
- out_path = os.path.splitext(out_path)[0] + '.npz'
-
- # load input, group, save to file, tell user
- user_counts = np.load(path)
- N = len(user_counts.files)
- print '%d users in %s' % (N, path)
- day_counts = group_by_day(user_counts)
- np.savez(out_path, **day_counts)
- print '%s saved (%d days).' % (out_path, len(day_counts))
-
-if __name__ == '__main__':
- args = parser.parse_args()
- main(args)