http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95709
Revision: 95709
Author: giovanni
Date: 2011-08-29 22:07:27 +0000 (Mon, 29 Aug 2011)
Log Message:
-----------
removed obsolete scripts
Removed Paths:
-------------
trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
trunk/tools/wsor/editor_lifecycle/obsolete/rates
trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql
Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort 2011-08-29
22:02:56 UTC (rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort 2011-08-29
22:07:27 UTC (rev 95709)
@@ -1,79 +0,0 @@
-#!/usr/bin/python
-# vim:ft=python:
-# coding : utf-8
-
-# TODO: obsolete
-
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-''' fetches a cohort based on year of registration and editing activity '''
-
-from argparse import ArgumentParser
-from oursql import connect
-import os
-import sys
-import datetime as dt
-import csv
-
-prog = os.path.basename(os.path.abspath(__file__))
-
-parser = ArgumentParser(description=__doc__, fromfile_prefix_chars='@')
-parser.add_argument('registration_year', metavar='year', type=int)
-parser.add_argument('min_activity', metavar='minedits', type=int)
-parser.add_argument('max_activity', metavar='maxedits', type=int)
-parser.add_argument('-c', '--config', dest='config_file')
-parser.add_argument('-l', '--limit', type=int)
-
-query = '''
-select
- user_id,
- user_name,
- user_registration,
- user_editcount
-from user u left join user_groups ug
-on u.user_id = ug.ug_user
-where
- (ug_group <> 'bot' or ug_user is null)
- and year(user_registration) = ?
- and user_editcount > ?
- and user_editcount < ?
-'''
-
-if __name__ == '__main__':
- ns = parser.parse_args()
- if ns.min_activity >= ns.max_activity:
- print >> sys.stderr, '%s: error: min_activity >= max_activity' % prog
- sys.exit(1)
- if ns.registration_year < 2001 or ns.registration_year >
dt.datetime.now().year:
- print >> sys.stderr, '%s: error: illegal year: %d' % (prog,
- ns.registration_year)
- sys.exit(1)
-
- if ns.limit is not None:
- query += 'limit %d' % ns.limit
-
- if ns.config_file is None:
- ns.config_file = os.path.expanduser('~/.my.cnf')
-
- conn = connect(read_default_file=ns.config_file)
- writer = csv.writer(sys.stdout, dialect='excel-tab')
- cursor = conn.cursor()
- cursor.execute(query, (ns.registration_year, ns.min_activity,
ns.max_activity))
- for row in cursor:
- writer.writerow(row)
Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/graphlife 2011-08-29
22:02:56 UTC (rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/graphlife 2011-08-29
22:07:27 UTC (rev 95709)
@@ -1,108 +0,0 @@
-#!/usr/bin/python
-
-''' plot editor life cycle '''
-
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-import sys
-import numpy as np
-from argparse import ArgumentParser
-import os
-
-__prog__ = os.path.basename(os.path.abspath(__file__))
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('data_files', metavar='data', nargs='+')
-parser.add_argument('-l', '--label', metavar='TEXT', action='append',
- dest='labels_list')
-parser.add_argument('-inset', dest='inset_data_file', metavar='FILE')
-parser.add_argument('-batch', action='store_true', help='uses PDF backend')
-parser.add_argument('-title')
-parser.add_argument('-fmt', default='pdf', help='default: %(default)s')
-
-if __name__ == '__main__':
- ns = parser.parse_args()
-
- # checks
- if ns.labels_list and len(ns.data_files) != len(ns.labels_list):
- print >> sys.stderr, '%s: error: please provide as many labels '\
- 'as data files' % __prog__
- sys.exit(1)
-
- # import pyplot, make lists of colors and markers
- if ns.batch:
- import matplotlib
- matplotlib.use('PDF')
- import matplotlib.pyplot as pp
- from matplotlib.lines import lineMarkers as markers
- markers = dict(filter(
- lambda k : isinstance(k[0],str) and k[1] is not '_draw_nothing',
- markers.items())).keys()
- colors = 'krbgm'
-
- # create figure and axes
- fig = pp.figure()
- ax = pp.axes([.1, .1, .85, .8])
-
- # add lines
- N = len(ns.data_files)
- for i in xrange(N):
- data_file = ns.data_files[i]
- if ns.labels_list is not None:
- label = ns.labels_list[i]
- else:
- label = 'line-%d' % (i + 1)
- color = colors[i % len(colors)]
- marker= markers[i % len(markers)]
- x, y, ye = np.loadtxt(data_file, unpack=1)
- ax.errorbar(x, y, ye, color=color, marker=marker, mfc='none',
- mec=color, ls=':', label=label)
-
- ax.legend(loc=2)
- ax.set_xlabel('days since registration')
- ax.set_ylabel('edits/day')
- if ns.title is not None:
- ax.set_title(ns.title)
- ax.axis('tight')
-
- # plot hist of lifetimes in inset axes
- if ns.inset_data_file is not None:
- lt = np.loadtxt(ns.inset_data_file)
- inax = pp.axes([.55, .6, .35, .25], axisbg='none')
- inax.hist(lt, bins=20, fc='none', cumulative=-1, normed=0)
- for l in inax.xaxis.get_ticklabels():
- l.set_rotation(30)
- l.set_fontsize('x-small')
- for l in inax.yaxis.get_ticklabels():
- l.set_fontsize('x-small')
- inax.set_xlabel('lifespan $x$ (days)', fontsize='small')
- inax.set_ylabel('no. of users older\n more than $x$ days',
- fontsize='small')
- inax.set_title('account lifetime')
- inax.axis('tight')
-
- pp.draw()
- if ns.title is not None:
- fn = ns.title.replace(' ', '_').lower() + '.' + ns.fmt
- else:
- fn = 'output.' + ns.fmt
- print 'output saved to %s' % fn
-
- pp.savefig(fn, fmt=ns.fmt)
- pp.show()
Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort 2011-08-29 22:02:56 UTC
(rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort 2011-08-29 22:07:27 UTC
(rev 95709)
@@ -1,214 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-# :vim:ft=python
-
-# TODO: obsolete
-
-''' creates cohort files, filtering out bots '''
-
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-'''
-This script reads two files: a ZIP archive file, and an index file, which is a
-tab-separated text file like the following:
-
- 34 WojPob 20010129110725 2524
- 94 AstroNomer 20010207222248 1532
- 43 Lee Daniel Crocker 20010314020407 4388
- 86 Stephen Gilbert 20010326191355 3599
- 3 Tobias Hoevekamp 20010326202105 1903
- 1273 Wathiik 20010510171751 1772
- 3371 Arno 20010721180708 2700
- 122 Ap 20010722201619 2137
- 182 Rjstott 20010726102546 2602
- 64 Uriyan 20010727141651 1634
-
-Where fields are: id, name, date, count. Dates are parsed using dateutil, so
-other formats are allowed too (e.g. 2010-01-29 11:07:25).
-
-The script will aggregate users based on the date field and will look for
-files of the form <id>.npy in the archive file. Each of these files contains
the
-daily edits count for a single user, stored using the NumPy binary array
-format. A relative path within the ZIP archive can be specified from the
command
-line with -P/--datapath. Once the data for a cohort (e.g. an aggregated group
-of users) have been collected, the script will compute the average activity
rate
-since the first day of activity for all users in that cohort.
-
-The script produces two files for each cohort: a tab-separated values file with
-cohort average activity rate, and a compressed NumPy binary archive with the
-user data array files.
-
-For each discovered cohort, the script will print on the console the date of
the
-cohort, how many users it contains, and how many suspected BOT users it
filtered
-out from the index. Use --bots to disable this check and always include them. The
-check is as follows: if the name contains the pattern 'bot' at the beginning or
-at the end of any word, it will be filtered out (e.g. "Botuser IV" will match,
-but "Francis Abbott" won't). If arguments -mincount or -maxcount (or both) are
-passed, the script will process only users whose edit count is above the
minimum
-count, or below the maximum count, or both.
-
-Please note that the index file must be already sorted by date, in order for
the
-group by date aggregation to work. You can use `sort' from the command line,
-e.g.:
-
- $~ sort -t$'\t' -k3 -h unsorted.tsv
-
-should sort file unsorted.tsv.
-'''
-
-import re
-import os
-import sys
-import csv
-import numpy as np
-from argparse import ArgumentParser, FileType
-from contextlib import closing
-from itertools import groupby
-from dateutil.parser import parser as DateParser
-from datetime import datetime
-from zipfile import ZipFile
-
-from rates import computerates
-
-__prog__ = os.path.basename(os.path.abspath(__file__))
-_botpat = r'\bbot|bot\b'
-_fields = ['id', 'name', 'date', 'count']
-
-def yearkey(date):
- return date.year,
-
-def monthkey(date):
- return date.year, date.month
-
-def daykey(date):
- return date.year, date.month, date.day
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('index', type=FileType('r'), help='*must* be already
sorted')
-parser.add_argument('archive_path', metavar='archive', help='data archive in
ZIP '
- 'format')
-group = parser.add_mutually_exclusive_group(required=1)
-group.add_argument('--year', help='group by year', action='store_const',
- const=yearkey, dest='keyfunc')
-group.add_argument('--month', help='group by month', action='store_const',
- const=monthkey, dest='keyfunc')
-group.add_argument('--day', help='group by day', action='store_const',
- const=daykey, dest='keyfunc')
-parser.add_argument('--bots', action='store_true', help='do NOT filter out
bots')
-parser.add_argument('-P', '--datapath', help='relative path of files within '
- 'archive', default='')
-parser.add_argument('-mincount', type=int)
-parser.add_argument('-maxcount', type=int)
-parser.add_argument('-minperyear', type=int)
-parser.add_argument('-maxperyear', type=int)
-parser.add_argument('-n', '--dry-run', action='store_true', help='write to '
- 'console all actions, but do not produce any file')
-parser.add_argument('-every', type=int, help='default: average over
%(default)d days',
- default=30, metavar='NUM')
-parser.add_argument('-ns', type=int, action='append', help='select only these
NS',
- dest='only')
-
-dateparser = DateParser()
-
-# dummy ZipFile class in case we do not want to do anything!
-class DummyZipFile:
- def __init__(self, fn, mode):
- pass
- def close(self):
- pass
- def write(self, fn, *args):
- pass
-
-if __name__ == '__main__':
- ns = parser.parse_args()
- reader = csv.DictReader(ns.index, _fields, quoting=csv.QUOTE_NONE,
- delimiter='\t')
- archive = ZipFile(ns.archive_path)
-
- def _keyfunc(row):
- try:
- date = dateparser.parse(row['date'])
- except:
- print row
- raise
-
- return ns.keyfunc(date)
-
- # group the index by date of registration
- for key, subiter in groupby(reader, _keyfunc):
-
- # reset indices and define output file names from cohort period
- tot_users = 0
- tot_bots = 0
- datestr = '-'.join(map(lambda k : '%02d' % k, key)) # (2010,1) ->
'2010-01'
- zipfn = '{}.npz'.format(datestr)
- tsvfn = '{}.tsv'.format(datestr)
-
- # if user wants to do a dry-run, replace the Zip files class with the
- # dummy one
- if ns.dry_run:
- ZipFile = DummyZipFile
-
- # for each user, determine if may go in cohort
- with closing(ZipFile(zipfn, 'w')) as zf:
- for row in subiter:
-
- # compute user details (edit count, yearly activity rate, etc.)
- # and other useful variables
- user_id = row['id']
- count = int(row['count'])
- user_date = dateparser.parse(row['date'])
- now_date = datetime.now()
- activity_span = float((now_date - user_date).days) # in days
- yearly_rate = count / activity_span * 365.0
- bot_flag = re.search(_botpat, row['name'], re.I) is not None
- tot_bots += bot_flag # update counts of bot matches
-
- # define paths
- basepath = '{}.npy'.format(user_id)
- archivepath = os.path.join(ns.datapath, basepath)
-
- # check cohort membership (keep if conjunction of all given
- # criteria is true, that is, discard if any given criterion is
- # false)
- if ns.mincount is not None and count <= ns.mincount:
- continue
- if ns.maxcount is not None and count >= ns.maxcount:
- continue
- if ns.minperyear is not None and yearly_rate <= ns.minperyear:
- continue
- if ns.maxperyear is not None and yearly_rate >= ns.maxperyear:
- continue
- # user can turn this test off by passing --bots
- if not ns.bots and bot_flag:
- continue
- try:
- zf.writestr(basepath, archive.read(archivepath))
- except KeyError:
- print >> sys.stderr, '%s: warning: %s not in archive' %\
- (__prog__, archivepath)
- tot_users += 1
-
- if tot_users > 0:
- rates = computerates(zipfn, ns.every, onlyns=ns.only)
- np.savetxt(tsvfn, rates, fmt='%f')
-
- print '%s: %s, %s created (users: %5d, skipped bots %5d)' % (
- __prog__, tsvfn, zipfn, tot_users, tot_bots)
- sys.stdout.flush()
Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/rates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/rates 2011-08-29 22:02:56 UTC
(rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/rates 2011-08-29 22:07:27 UTC
(rev 95709)
@@ -1,96 +0,0 @@
-#!/usr/bin/python
-#:vim:ts=python:
-
-''' compute editor lifecycle '''
-
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-import re
-import os
-from argparse import ArgumentParser
-import numpy as np
-from collections import deque
-import datetime as dt
-
-from lifecycle.rates import *
-
-__prog__ = os.path.basename(os.path.abspath(__file__))
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('data_file', metavar='data')
-parser.add_argument(metavar='minact', type=int, dest='minimum_activity')
-parser.add_argument(metavar='maxact', type=int, dest='maximum_activity')
-parser.add_argument('-key')
-parser.add_argument('-every', type=int, help='default: %(default)d days',
- default=30, metavar='NUM')
-parser.add_argument('-inactivity', type=int, default=180, help='default: '
- '%(default)d days', metavar='NUM')
-parser.add_argument('-all', dest='dump_all', action='store_true')
-
-
-def main(ns):
- if ns.key is None:
- m = re.match('(.*?)\.npz', ns.data_file, re.I)
- if m is not None:
- ns.key = m.groups()[0]
- else:
- print >> sys.stderr, '%s: cannot determine key from file name: %s'\
- % (__prog__, ns.data_file)
- sys.exit(1)
- if ns.minimum_activity >= ns.maximum_activity:
- print >> sys.stderr, '%s: error: minact >= maxact' % __prog__
- sys.exit(1)
-
- # load data
- npzarchive = np.load(ns.data_file)
-
- if ns.dump_all:
- fn = mkfn('cycles', ns, 'npz')
- values_iter = itercycles(npzarchive, ns.every)
- keys = npzarchive.files
- tmp = dict(zip(keys, list(values_iter)))
- np.savez(fn, **tmp)
- print '%s: output saved to %s' % (__prog__, fn)
- else:
- # compute lifetime distribution
- lt = lifetimes(npzarchive)
-
- # compute inactive subgroups
- inactive_users = find_inactives(npzarchive, ns.inactivity,
ns.minimum_activity,
- ns.maximum_activity)
-
- ratesbyday = groupbyday(npzarchive, ns.every)
- ratesbyday_inact = groupbyday(npzarchive, ns.every, inactive_users)
-
- avg_all = averagecycle(ratesbyday)
- avg_inact = averagecycle(ratesbyday_inact)
-
- lens = [ len(npzarchive.files), len(inactive_users) ]
-
- names = ['lt', 'len', 'all', 'inact' ]
- arrs = [ lt, lens, avg_all, avg_inact ]
-
- for n, a in zip(names, arrs):
- fn = '%s_%s.%s' % (ns.key, n, 'tsv')
- np.savetxt(fn, a)
- print '%s: output saved to %s' % (__prog__, fn)
-
-if __name__ == '__main__':
- ns = parser.parse_args()
- main(ns)
Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh 2011-08-29
22:02:56 UTC (rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh 2011-08-29
22:07:27 UTC (rev 95709)
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-# http://www.gnu.org/copyleft/gpl.html
-
-# This script writes to output a list of registered, not-flagged-as-bot users,
-# sorted by time of first edit. Each item in the list comprises:
-#
-# 1. user_id
-# 2. user_name
-# 3. first_timestamp
-# 4. editcount
-#
-# For the SQL query, check file userlist.sql.
-
-srcdir=`dirname $(type -p $0)`
-mysql -BN < $srcdir/userlist.sql | sort -h -k3 -t $'\t'
Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql 2011-08-29
22:02:56 UTC (rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql 2011-08-29
22:07:27 UTC (rev 95709)
@@ -1,30 +0,0 @@
--- user ID, user name, timestamp of first edit and edit count of all registered
--- users that are not flagged bots. N.B. there might still be unflagged bots.
-
--- Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
--- This program is free software; you can redistribute it and/or modify
--- it under the terms of the GNU General Public License as published by
--- the Free Software Foundation; either version 2 of the License, or
--- (at your option) any later version.
---
--- This program is distributed in the hope that it will be useful,
--- but WITHOUT ANY WARRANTY; without even the implied warranty of
--- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
--- GNU General Public License for more details.
---
--- You should have received a copy of the GNU General Public License along
--- with this program; if not, write to the Free Software Foundation, Inc.,
--- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
--- http://www.gnu.org/copyleft/gpl.html
-
-select
- rev_user as user_id,
- rev_user_text as user_name,
- min(rev_timestamp) as first_timestamp,
- count(rev_timestamp) as editcount
-from
- revision r use index (usertext_timestamp) left join user_groups g
-on r.rev_user = g.ug_user
-where (ug_group <> 'bot' or g.ug_user is null) and rev_user > 0
-group by rev_user_text
--- limit 100
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs