https://www.mediawiki.org/wiki/Special:Code/MediaWiki/114804
Revision: 114804
Author: giovanni
Date: 2012-04-09 18:58:33 +0000 (Mon, 09 Apr 2012)
Log Message:
-----------
first refactoring step to lifecycle.rates
Modified Paths:
--------------
trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
trunk/tools/wsor/editor_lifecycle/scripts/fetchrates
Modified: trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py
===================================================================
--- trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py 2012-04-09
11:52:46 UTC (rev 114803)
+++ trunk/tools/wsor/editor_lifecycle/lifecycle/rates.py 2012-04-09
18:58:33 UTC (rev 114804)
@@ -1,200 +1,117 @@
#!/usr/bin/python
#:vim:ts=python:
-''' functions for computing average activity rate of a single cohort '''
+''' activity rate estimation '''
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
-import os
-import sys
-
import numpy as np
-import datetime as dt
-
-from argparse import ArgumentParser, FileType
from scipy.sparse import coo_matrix
-from scipy.stats import gmean
from collections import deque
-__prog__ = os.path.basename(os.path.abspath(__file__))
+MAX_NS = 109 # taken from WP:Namespaces
+_user_dtype = np.dtype([('day', int), ('ns', int), ('edits', int)])
-namespaces = {
- 'main': 0,
- 'talk': 1,
- 'user': 2,
- 'user_talk' : 3,
- 'wikipedia' : 4,
- 'wikipedia_talk' : 5,
- 'file' : 6,
- 'file_talk' : 7,
- 'mediawiki' : 8,
- 'mediawiki_talk' : 9,
- 'template' : 10,
- 'template_talk' : 11,
- 'help' : 12,
- 'help_talk' : 13,
- 'category' : 14,
- 'category_talk' : 15,
- 'portal' : 100,
- 'portal_talk' : 101,
- 'book' : 108,
- 'book_talk' : 109
-}
+def userrate(rows):
+ '''
+ User-level activity rate
-MAX_NS = np.max(namespaces.values()) # 109 as of August 2011
+ Parameters
+ ----------
+    rows - a sequence of (timestamp, namespace) pairs.
-def estimaterate(edits, step):
- edits = np.asfarray(edits)
- N = len(edits)
- tmp = []
- rem = N % step
- for i in xrange(0, N, step): # step-size chunks
- if i + step <= N:
- tmp.append(edits[i:i+step].sum() / step)
- else:
- tmp.append(edits[i:].sum() / (N-i))
- return np.asarray(tmp)
+ Returns
+ -------
+ An array with the following fields: day, ns, edits. Suitable for
+ `scipy.sparse.coo_matrix`
+ '''
+ rev_time, rev_ns = np.asfarray(rows).T
+ m, M = np.floor(rev_time.min()), np.ceil(rev_time.max())
+ uns = sorted(np.unique(rev_ns))
+ bins_time = np.arange(m,M + 1)
+ bins_ns = uns + [uns[-1] + 1]
+ rates, days, _ = np.histogram2d(rev_time, rev_ns, bins=(bins_time,
bins_ns))
+ I,J = np.nonzero(rates)
+ data = [ (days[i],uns[j],rates[i,j]) for i, j in zip(I,J) ]
+ return np.asarray(data, dtype=_user_dtype)
-def itercycles(npzarchive, every, users=None, onlyns=None):
+def cohortrate(npzarchive, onlyns=None, minsize=None, minsnr=None):
'''
- Iterates over the archive or over given list of users and returns estimated
- activity life cycle (see estimaterate())
+ Cohort-level activity rate
- For parameters, see computerates
+ Parameters
+ ----------
+    npzarchive - a mapping user id -> ndarray,
+    usually an NpzFile returned from `numpy.load`
+ onlyns - sequence of namespace codes
+ compute activity rates only with edits to these namespaces
+    minsize - positive int
+    filter out activity rate estimates based on sample of size less than
+    minsize
+    minsnr - positive real
+    filter out activity rate estimates with signal-to-noise ratio less than
+    this parameter
+
+ Returns
+ -------
+    an array of daily activity rate observations, together with estimated
uncertainties
'''
- for uid in (users or npzarchive.files):
+ day_counts = {}
+
+ for uid in npzarchive.files:
data = npzarchive[uid].view(np.recarray)
- idx = data.ns >= 0 # let's filter out junk (i.e. edits to virtual ns)
+
+ # negative namespaces are virtual. Let's filter out the edits to them
+ idx = data.ns >= 0
days = data.day[idx]
edits = data.edits[idx]
ns = data.ns[idx]
- days = days - days.min()
+ days -= days.min()
shape = (days.max() + 1, MAX_NS + 1)
M = coo_matrix((edits, (days, ns)), shape=shape)
if onlyns is not None:
- M = M.tocsc()[:, onlyns]
+ M = M.tocsc()[:, onlyns].tocoo()
M = M.tocsr()
- rates = estimaterate(np.asarray(M.sum(axis=1)).ravel(), every)
- yield np.c_[np.arange(len(rates)) * every, rates]
+ counts = np.asarray(M.sum(axis=1)).ravel()
-def average(ratesbyday, geometric=False):
- '''
- Computes average cycle with standard errors. Takes in input a dictionary
- returned by groupbydayssince(). If geometric, compute the geometric mean
- and standard deviation instead.
- '''
- all_days = sorted(ratesbyday.keys())
- result = deque()
- for d in all_days:
- s = ratesbyday[d]
- N = len(s)
- if geometric:
- s = np.ma.masked_equal(s, 0.0)
- m = gmean(s)
- sem = np.exp(np.std(np.log(s), ddof=1)) / np.sqrt(N)
- else:
- m = np.mean(s)
- sem = np.std(s, ddof=1)
- result.append((d, m, sem, N))
- return np.asarray(result)
-
-def groupbyday(npzarchive, every, users=None, onlyns=None):
- '''
- This function estimates editors' activity rates and groups rate estimates
by
- number of days elapsed since editor registration (which corresponds to
time = 0)
-
- For parameters, see computerates
- '''
- tmp = {}
- for cyclearr in itercycles(npzarchive, every, users, onlyns):
- for d, r in cyclearr:
+ # group by day
+ for i in xrange(len(counts)):
try:
- tmp[d].append(r)
+ day_counts[i].append(counts[i])
except KeyError:
- tmp[d] = deque([r])
- return tmp
+ day_counts[i] = deque([counts[i]])
-# NOTE: not used right now
-def lifetimes(npzarchive, users=None):
- '''
- Returns the distribution of account lifetimes over an archive. Can take an
- optional list users ids to restrict the sample to a specific group of
- editors
- '''
- lt = deque()
- for uid in (users or npzarchive.files):
- days, edits = npzarchive[uid].T
- lt.append(days.ptp())
- return np.asarray(lt)
+ # average over each day, filter out unwanted observations
+ max_day = len(day_counts)
+ rate = deque()
-# NOTE: not used right now
-def find_inactives(npzarchive, inactivity, minimum_activity, maximum_activity):
- now = dt.datetime.now().toordinal()
- epoch = dt.datetime(1970,1,1).toordinal()
- unix_now = now - epoch
- inactives = deque()
- for uid in npzarchive.files:
- days, edits = npzarchive[uid].T
- if days.ptp() <= inactivity:
- continue
- unix_last = days[-1]
- if (unix_now - unix_last) > inactivity:
- tot_edits = float(edits.sum())
- tot_days = float(days.ptp() - inactivity)
- activity = tot_edits / tot_days * 365.0
- if minimum_activity < activity and maximum_activity > activity:
- inactives.append(uid)
- return inactives
+ for i in xrange(max_day):
+ try:
+ sample = np.asarray(day_counts[i])
+ n = len(sample)
+ if n < minsize:
+ continue
+ m = np.mean(sample)
+            # the uncertainty of the estimate is just the standard error
+ # of the mean
+ s = np.std(sample, ddof=1) / np.sqrt(n)
+ if (m / s) < minsnr:
+ continue
+ rate.append((i, m, s))
+ except KeyError:
+ pass
+ return np.asarray(rate)
-def computerates(fn, every, users=None, onlyns=None, geometric=False):
- '''
- Returns an array of average activity vs day since registration with
standard
- errors of the average
-
- Parameters
- ----------
- onlyns - compute edit counts only over specified list of namespaces
- users - compute rates only for these users
- every - compute daily rates in strides of length `every'
- geometric - computes geometric mean of average rate by day since
- registration
- '''
- npzarchive = np.load(fn)
- rates = average(groupbyday(npzarchive, every, users, onlyns), geometric)
- return rates
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('data_file', metavar='data')
-parser.add_argument('output_file', metavar='output', type=FileType('w'))
-parser.add_argument('-every', type=int, help='default: average over
%(default)d days',
- default=30, metavar='NUM')
-parser.add_argument('-ns', type=int, action='append', help='select only these
NS',
- dest='only')
-
-def main(ns):
- rates = computerates(ns.data_file, ns.every, onlyns=ns.only)
- if ns.output_file.isatty():
- print >> sys.stderr, 'writing to standard output'
- np.savetxt(ns.output_file, rates, fmt='%f')
- if not ns.output_file.isatty():
- print '%s: output saved to %s' % (__prog__, ns.output_file.name)
-
-if __name__ == '__main__':
- ns = parser.parse_args()
- main(ns)
Modified: trunk/tools/wsor/editor_lifecycle/scripts/fetchrates
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/fetchrates 2012-04-09
11:52:46 UTC (rev 114803)
+++ trunk/tools/wsor/editor_lifecycle/scripts/fetchrates 2012-04-09
18:58:33 UTC (rev 114804)
@@ -2,24 +2,22 @@
''' Fetches and computes daily edit rates for cohorts of users '''
-'''
-Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
+# Copyright (C) 2011 GIOVANNI LUCA CIAMPAGLIA, [email protected]
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+# http://www.gnu.org/copyleft/gpl.html
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along
-with this program; if not, write to the Free Software Foundation, Inc.,
-51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-http://www.gnu.org/copyleft/gpl.html
-'''
-
__author__ = "Giovanni Luca Ciampaglia"
__email__ = "[email protected]"
@@ -57,27 +55,9 @@
__prog__ = os.path.basename(os.path.abspath(__file__))
-# def iterateoversets(cursor):
-# yield list(cursor)
-# while cursor.nextset():
-# yield list(cursor)
def process(rows, user_id, output_dir=os.path.curdir):
- if len(rows) == 0:
- print >> sys.stderr, '%s: error: empty revision history for user %d' %
(__prog__,
- user_id)
- return
-
- rev_time, rev_ns = np.asfarray(rows).T
- m, M = np.floor(rev_time.min()), np.ceil(rev_time.max())
- uns = sorted(np.unique(rev_ns))
- bins_time = np.arange(m,M + 1)
- bins_ns = uns + [uns[-1] + 1]
- rates, days, _ = np.histogram2d(rev_time, rev_ns, bins=(bins_time,
bins_ns))
- I,J = np.nonzero(rates)
- data = [ (days[i],uns[j],rates[i,j]) for i, j in zip(I,J) ] # in
coordinate format
- dtype=np.dtype([('day', int), ('ns', int), ('edits', int)])
- data = np.asarray(data, dtype=dtype)
+ data = userrate(rows)
out_path = os.path.join(output_dir, '%d.npy' % user_id)
np.save(out_path, data)
return out_path
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs