http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95709

Revision: 95709
Author:   giovanni
Date:     2011-08-29 22:07:27 +0000 (Mon, 29 Aug 2011)
Log Message:
-----------
removed obsolete scripts

Removed Paths:
-------------
    trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
    trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
    trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
    trunk/tools/wsor/editor_lifecycle/obsolete/rates
    trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
    trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql

Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort      2011-08-29 
22:02:56 UTC (rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/fetchcohort      2011-08-29 
22:07:27 UTC (rev 95709)
@@ -1,79 +0,0 @@
-#!/usr/bin/python
-# vim:ft=python:
-# coding : utf-8
-
-# TODO: obsolete
-
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-''' fetches a cohort based on year of registration and editing activity '''
-
-from argparse import ArgumentParser
-from oursql import connect
-import os
-import sys
-import datetime as dt
-import csv
-
-prog = os.path.basename(os.path.abspath(__file__))
-
-parser = ArgumentParser(description=__doc__, fromfile_prefix_chars='@')
-parser.add_argument('registration_year', metavar='year', type=int)
-parser.add_argument('min_activity', metavar='minedits', type=int)
-parser.add_argument('max_activity', metavar='maxedits', type=int)
-parser.add_argument('-c', '--config', dest='config_file')
-parser.add_argument('-l', '--limit', type=int)
-
-query = '''
-select 
-    user_id, 
-    user_name, 
-    user_registration, 
-    user_editcount
-from user u left join user_groups ug
-on u.user_id = ug.ug_user 
-where 
-    (ug_group <> 'bot' or ug_user is null)
-    and year(user_registration) = ?
-    and user_editcount > ? 
-    and user_editcount < ?
-'''
-
-if __name__ == '__main__':
-    ns = parser.parse_args()
-    if ns.min_activity >= ns.max_activity:
-        print >> sys.stderr, '%s: error: min_activity >= max_activity' % prog
-        sys.exit(1)
-    if ns.registration_year < 2001 or ns.registration_year > 
dt.datetime.now().year:
-        print >> sys.stderr, '%s: error: illegal year: %d' % (prog,
-                ns.registration_year)
-        sys.exit(1)
-
-    if ns.limit is not None:
-        query += 'limit %d' % ns.limit
-
-    if ns.config_file is None:
-        ns.config_file = os.path.expanduser('~/.my.cnf')
-
-    conn = connect(read_default_file=ns.config_file)
-    writer = csv.writer(sys.stdout, dialect='excel-tab')
-    cursor = conn.cursor()
-    cursor.execute(query, (ns.registration_year, ns.min_activity, 
ns.max_activity))
-    for row in cursor:
-        writer.writerow(row)

Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/graphlife
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/graphlife        2011-08-29 
22:02:56 UTC (rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/graphlife        2011-08-29 
22:07:27 UTC (rev 95709)
@@ -1,108 +0,0 @@
-#!/usr/bin/python
-
-''' plot editor life cycle '''
-
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-import sys
-import numpy as np
-from argparse import ArgumentParser
-import os
-
-__prog__ = os.path.basename(os.path.abspath(__file__))
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('data_files', metavar='data', nargs='+')
-parser.add_argument('-l', '--label', metavar='TEXT', action='append',
-        dest='labels_list')
-parser.add_argument('-inset', dest='inset_data_file', metavar='FILE')
-parser.add_argument('-batch', action='store_true', help='uses PDF backend')
-parser.add_argument('-title')
-parser.add_argument('-fmt', default='pdf', help='default: %(default)s')
-
-if __name__ == '__main__':
-    ns = parser.parse_args()
-    
-    # checks
-    if ns.labels_list and len(ns.data_files) != len(ns.labels_list):
-        print >> sys.stderr, '%s: error: please provide as many labels '\
-                'as data files' % __prog__
-        sys.exit(1)
-
-    # import pyplot, make lists of colors and markers
-    if ns.batch:
-        import matplotlib
-        matplotlib.use('PDF')
-    import matplotlib.pyplot as pp
-    from matplotlib.lines import lineMarkers as markers
-    markers = dict(filter(
-            lambda k : isinstance(k[0],str) and k[1] is not '_draw_nothing',
-            markers.items())).keys()
-    colors = 'krbgm'
-
-    # create figure and axes
-    fig = pp.figure()
-    ax = pp.axes([.1, .1, .85, .8])
-    
-    # add lines
-    N = len(ns.data_files)
-    for i in xrange(N):
-        data_file = ns.data_files[i]
-        if ns.labels_list is not None:
-            label = ns.labels_list[i]
-        else:
-            label = 'line-%d' % (i + 1)
-        color = colors[i % len(colors)]
-        marker= markers[i % len(markers)]
-        x, y, ye = np.loadtxt(data_file, unpack=1)
-        ax.errorbar(x, y, ye, color=color, marker=marker, mfc='none', 
-            mec=color, ls=':', label=label)
-
-    ax.legend(loc=2)
-    ax.set_xlabel('days since registration')
-    ax.set_ylabel('edits/day')
-    if ns.title is not None:
-        ax.set_title(ns.title)
-    ax.axis('tight')
-
-    # plot hist of lifetimes in inset axes 
-    if ns.inset_data_file is not None:
-        lt = np.loadtxt(ns.inset_data_file)
-        inax = pp.axes([.55, .6, .35, .25], axisbg='none')
-        inax.hist(lt, bins=20, fc='none', cumulative=-1, normed=0)
-        for l in inax.xaxis.get_ticklabels():
-            l.set_rotation(30)
-            l.set_fontsize('x-small')
-        for l in inax.yaxis.get_ticklabels():
-            l.set_fontsize('x-small')
-        inax.set_xlabel('lifespan $x$ (days)', fontsize='small')
-        inax.set_ylabel('no. of users older\n more than $x$ days', 
-                fontsize='small')
-        inax.set_title('account lifetime')
-        inax.axis('tight')
-
-    pp.draw()
-    if ns.title is not None:
-        fn = ns.title.replace(' ', '_').lower() + '.' + ns.fmt
-    else:
-        fn = 'output.' + ns.fmt
-    print 'output saved to %s' % fn
-
-    pp.savefig(fn, fmt=ns.fmt)
-    pp.show()

Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort 2011-08-29 22:02:56 UTC 
(rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/mkcohort 2011-08-29 22:07:27 UTC 
(rev 95709)
@@ -1,214 +0,0 @@
-#!/usr/bin/python
-# coding: utf-8
-# :vim:ft=python
-
-# TODO: obsolete
-
-''' creates cohort files, filtering out bots '''
-
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-'''
-This script reads two files: an ZIP archive file, and an index file, which is a
-tab-separated text file like the following:
-
-    34     WojPob              20010129110725  2524
-    94     AstroNomer          20010207222248  1532
-    43     Lee Daniel Crocker  20010314020407  4388
-    86     Stephen Gilbert     20010326191355  3599
-    3      Tobias Hoevekamp    20010326202105  1903
-    1273    Wathiik            20010510171751  1772
-    3371    Arno               20010721180708  2700
-    122            Ap                  20010722201619  2137
-    182            Rjstott             20010726102546  2602
-    64     Uriyan              20010727141651  1634
-
-Where fields are: id, name, date, count. Dates are parsed using dateutil, so
-other formats are allowed too (e.g.  2010-01-29 11:07:25). 
-
-The script will aggregate users based on the date field and will lookup for
-files of the form <id>.npy in the archive file. Each of these files contains 
the
-daily edits count for a single user, stored using the NumPy binary array
-format. A relative path within the ZIP archive can be specified from the 
command
-line with -P/--datapath. Once the data for a cohort (e.g.  an aggregated group
-of users) have been collected, the script will compute the average activity 
rate
-since the first day of activity for all users in that cohort.
-
-The script produces two files per each cohort: a tab-separated values file with
-cohort average activity rate, and a compressed NumPy binary archive with the
-user data array files.
-
-For each discovered cohort, the script will print on the console the date of 
the
-cohort, how many users it contains, and how many suspected BOT users it 
filtered
-out from the index. Use --bots to disable this check and always include them. The
-check is as follows: if the name contains the pattern 'bot' at the beginning or
-at the end of any word, it will be filtered out (e.g. "Botuser IV" will match,
-but "Francis Abbott" won't). If arguments -mincount or -maxcount (or both) are
-passed, the script will process only users whose edit count is below the 
minimum
-count, or above the maximum count, or both.
-
-Please note that the index file must be already sorted by date, in order for 
the
-group by date aggregation to work. You can use `sort' from the command line,
-e.g.:
-
-    $~ sort -t$'\t' -k3 -h unsorted.tsv
-
-should sort file unsorted.tsv. 
-'''
-
-import re
-import os
-import sys
-import csv
-import numpy as np
-from argparse import ArgumentParser, FileType
-from contextlib import closing
-from itertools import groupby
-from dateutil.parser import parser as DateParser
-from datetime import datetime
-from zipfile import ZipFile
-
-from rates import computerates
-
-__prog__ = os.path.basename(os.path.abspath(__file__))
-_botpat = r'\bbot|bot\b' 
-_fields = ['id', 'name', 'date', 'count']
-
-def yearkey(date):
-    return date.year, 
-
-def monthkey(date):
-    return date.year, date.month
-
-def daykey(date):
-    return date.year, date.month, date.day
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('index', type=FileType('r'), help='*must* be already 
sorted')
-parser.add_argument('archive_path', metavar='archive', help='data archive in 
ZIP '
-        'format')
-group = parser.add_mutually_exclusive_group(required=1)
-group.add_argument('--year', help='group by year', action='store_const',
-        const=yearkey, dest='keyfunc')
-group.add_argument('--month', help='group by month', action='store_const',
-        const=monthkey, dest='keyfunc')
-group.add_argument('--day', help='group by day', action='store_const', 
-        const=daykey, dest='keyfunc')
-parser.add_argument('--bots', action='store_true', help='do NOT filter out 
bots')
-parser.add_argument('-P', '--datapath', help='relative path of files within '
-        'archive', default='')
-parser.add_argument('-mincount', type=int)
-parser.add_argument('-maxcount', type=int)
-parser.add_argument('-minperyear', type=int)
-parser.add_argument('-maxperyear', type=int)
-parser.add_argument('-n', '--dry-run', action='store_true', help='write to '
-        'console all actions, but do not produce any file')
-parser.add_argument('-every', type=int, help='default: average over 
%(default)d days',
-        default=30, metavar='NUM')
-parser.add_argument('-ns', type=int, action='append', help='select only these 
NS',
-        dest='only')
-
-dateparser = DateParser()
-
-# dummy ZipFile class in case we do not want do anything!
-class DummyZipFile:
-    def __init__(self, fn, mode):
-        pass
-    def close(self):
-        pass
-    def write(self, fn, *args):
-        pass
-
-if __name__ == '__main__':
-    ns = parser.parse_args()
-    reader = csv.DictReader(ns.index, _fields, quoting=csv.QUOTE_NONE,
-            delimiter='\t')
-    archive = ZipFile(ns.archive_path)
-
-    def _keyfunc(row):
-        try:
-            date = dateparser.parse(row['date'])
-        except:
-            print row
-            raise
-
-        return ns.keyfunc(date)
-
-    # group by index by date of registration
-    for key, subiter in groupby(reader, _keyfunc):
-
-        # reset indices and define output file names from cohort period
-        tot_users = 0
-        tot_bots = 0
-        datestr = '-'.join(map(lambda k : '%02d' % k, key)) # (2010,1) -> 
'2010-01'
-        zipfn = '{}.npz'.format(datestr)
-        tsvfn = '{}.tsv'.format(datestr)
-
-        # if user wants to do a dry-run, replace the Zip files class with the
-        # dummy one
-        if ns.dry_run:
-            ZipFile = DummyZipFile
-
-        # for each user, determine if may go in cohort
-        with closing(ZipFile(zipfn, 'w')) as zf:
-            for row in subiter:
-
-                # compute user details (edit count, yearly activity rate, etc.)
-                # and other useful variables
-                user_id = row['id']
-                count = int(row['count'])
-                user_date = dateparser.parse(row['date'])
-                now_date = datetime.now()
-                activity_span = float((now_date - user_date).days) # in days
-                yearly_rate = count / activity_span * 365.0
-                bot_flag = re.search(_botpat, row['name'], re.I) is not None
-                tot_bots += bot_flag # update counts of bot matches
-
-                # define paths 
-                basepath = '{}.npy'.format(user_id) 
-                archivepath = os.path.join(ns.datapath, basepath)
-
-                # check cohort membership (keep if conjunction of all given
-                # criteria is true, that is, discard if any given criterion is
-                # false)
-                if ns.mincount is not None and count <= ns.mincount:
-                    continue
-                if ns.maxcount is not None and count >= ns.maxcount:
-                    continue
-                if ns.minperyear is not None and yearly_rate <= ns.minperyear:
-                    continue
-                if ns.maxperyear is not None and yearly_rate >= ns.maxperyear:
-                    continue
-                # user can turn this test off by passing --bots
-                if not ns.bots and bot_flag:
-                    continue
-                try:
-                    zf.writestr(basepath, archive.read(archivepath))
-                except KeyError:
-                    print >> sys.stderr, '%s: warning: %s not in archive' %\
-                            (__prog__, archivepath)
-                tot_users += 1
-
-        if tot_users > 0:
-            rates = computerates(zipfn, ns.every, onlyns=ns.only)
-            np.savetxt(tsvfn, rates, fmt='%f')
-
-            print '%s: %s, %s created (users: %5d, skipped bots %5d)' % (
-                    __prog__, tsvfn, zipfn, tot_users, tot_bots)
-            sys.stdout.flush()

Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/rates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/rates    2011-08-29 22:02:56 UTC 
(rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/rates    2011-08-29 22:07:27 UTC 
(rev 95709)
@@ -1,96 +0,0 @@
-#!/usr/bin/python
-#:vim:ts=python:
-
-''' compute editor lifecycle '''
-
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-import re
-import os
-from argparse import ArgumentParser
-import numpy as np
-from collections import deque
-import datetime as dt
-
-from lifecycle.rates import *
-
-__prog__ = os.path.basename(os.path.abspath(__file__))
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('data_file', metavar='data')
-parser.add_argument(metavar='minact', type=int, dest='minimum_activity')
-parser.add_argument(metavar='maxact', type=int, dest='maximum_activity')
-parser.add_argument('-key')
-parser.add_argument('-every', type=int, help='default: %(default)d days',
-        default=30, metavar='NUM')
-parser.add_argument('-inactivity', type=int, default=180, help='default: '
-        '%(default)d days', metavar='NUM')
-parser.add_argument('-all', dest='dump_all', action='store_true')
-
-
-def main(ns):
-    if ns.key is None:
-        m = re.match('(.*?)\.npz', ns.data_file, re.I)
-        if m is not None:
-            ns.key = m.groups()[0]
-        else:
-            print >> sys.stderr, '%s: cannot determine key from file name: %s'\
-                    % (__prog__, ns.data_file)
-            sys.exit(1)
-    if ns.minimum_activity >= ns.maximum_activity:
-        print >> sys.stderr, '%s: error: minact >= maxact' % __prog__
-        sys.exit(1)
-
-    # load data 
-    npzarchive = np.load(ns.data_file)
-
-    if ns.dump_all:
-        fn = mkfn('cycles', ns, 'npz')
-        values_iter = itercycles(npzarchive, ns.every)
-        keys = npzarchive.files
-        tmp = dict(zip(keys, list(values_iter)))
-        np.savez(fn, **tmp)
-        print '%s: output saved to %s' % (__prog__, fn)
-    else:
-        # compute lifetime distribution
-        lt = lifetimes(npzarchive)
-
-        # compute inactive subgroups
-        inactive_users = find_inactives(npzarchive, ns.inactivity, 
ns.minimum_activity,
-                ns.maximum_activity)
-
-        ratesbyday = groupbyday(npzarchive, ns.every)
-        ratesbyday_inact = groupbyday(npzarchive, ns.every, inactive_users)
-
-        avg_all = averagecycle(ratesbyday)
-        avg_inact = averagecycle(ratesbyday_inact)
-        
-        lens = [ len(npzarchive.files), len(inactive_users) ]
-
-        names = ['lt', 'len', 'all', 'inact' ]
-        arrs = [ lt, lens, avg_all, avg_inact ]
-        
-        for n, a in zip(names, arrs):
-            fn = '%s_%s.%s' % (ns.key, n, 'tsv')
-            np.savetxt(fn, a)
-            print '%s: output saved to %s' % (__prog__, fn)
-
-if __name__ == '__main__':
-    ns = parser.parse_args()
-    main(ns)

Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh      2011-08-29 
22:02:56 UTC (rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sh      2011-08-29 
22:07:27 UTC (rev 95709)
@@ -1,30 +0,0 @@
-#!/bin/bash
-
-# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-# 
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-# 
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-# http://www.gnu.org/copyleft/gpl.html
-
-# This script writes to output a list of registered, not-flagged-as-bot users,
-# sorted by time of first edit. Each item in the list comprises:
-#
-# 1. user_id
-# 2. user_name
-# 3. first_timestamp
-# 4. editcount
-#
-# For the SQL query, check file userlist.sql.
-
-srcdir=`dirname $(type -p $0)`
-mysql -BN < $srcdir/userlist.sql | sort -h -k3 -t $'\t'

Deleted: trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql
===================================================================
--- trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql     2011-08-29 
22:02:56 UTC (rev 95708)
+++ trunk/tools/wsor/editor_lifecycle/obsolete/userlist.sql     2011-08-29 
22:07:27 UTC (rev 95709)
@@ -1,30 +0,0 @@
--- user ID, user name, timestamp of first edit and edit count of all registered
--- users that are not flagged bots. N.B. there might still be unflagged bots.
-
--- Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
--- This program is free software; you can redistribute it and/or modify
--- it under the terms of the GNU General Public License as published by
--- the Free Software Foundation; either version 2 of the License, or
--- (at your option) any later version.
--- 
--- This program is distributed in the hope that it will be useful,
--- but WITHOUT ANY WARRANTY; without even the implied warranty of
--- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
--- GNU General Public License for more details.
--- 
--- You should have received a copy of the GNU General Public License along
--- with this program; if not, write to the Free Software Foundation, Inc.,
--- 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
--- http://www.gnu.org/copyleft/gpl.html
-
-select 
-    rev_user as user_id,
-    rev_user_text as user_name,
-    min(rev_timestamp) as first_timestamp,
-    count(rev_timestamp) as editcount
-from 
-    revision r use index (usertext_timestamp) left join user_groups g 
-on r.rev_user = g.ug_user 
-where (ug_group <> 'bot' or g.ug_user is null) and rev_user > 0   
-group by rev_user_text
--- limit 100


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to