http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95806
Revision: 95806
Author: whym
Date: 2011-08-30 19:25:57 +0000 (Tue, 30 Aug 2011)
Log Message:
-----------
Updates to trending article scripts, adding docs and some optimizations
Modified Paths:
--------------
trunk/tools/wsor/trending_articles/README.rst
trunk/tools/wsor/trending_articles/chart.py
trunk/tools/wsor/trending_articles/detectbursts.py
trunk/tools/wsor/trending_articles/find_revision_status.py
Modified: trunk/tools/wsor/trending_articles/README.rst
===================================================================
--- trunk/tools/wsor/trending_articles/README.rst 2011-08-30 19:08:57 UTC
(rev 95805)
+++ trunk/tools/wsor/trending_articles/README.rst 2011-08-30 19:25:57 UTC
(rev 95806)
@@ -1,4 +1,37 @@
-See http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors
+These scripts are used to produce the results published on the `sprint
+page on meta`_ on the editor behavior in trending articles.
-Counts files are available at:
-http://dammit.lt/wikistats/archive/2011/01/
+Usage
+---------
+We use the following directory names.
+
+``pageview.all``
+ raw (hourly) page view count files
+``pageview.200907.en``
+ hourly page view count files for EN wiki in July 2009 only
+``pageview.200907.daily.en``
+ daily page view count files for EN wiki in July 2009 only
+
+
+1. Obtain the page view count files from `Domas's WikiStats`_ or from the
``stats`` directory of `Toolserver's user-store`_.
+2. (optional) Select only the page names you are interested in to reduce the
processing time. For example, use this command ::
+
+ for f in pageview.all/2009/07/pagecounts-200907*.gz ; do ggrep '^en [^ ]*
' $f | gzip > pageview.200907.en/`basename $f`; done
+
+3. (optional) Convert hourly page views into daily page views with ::
+
+ ./accumulatedaily.py pageview.200907.en/pagecounts-200907*.gz -p 3 -f
'pageview.200907.daily.en/pagecounts-%Y%m%d-%H%M%S.gz'
+
+4. Detect bursts in page views with ::
+
+ python -O detectbursts.py pageview.200907.daily.en/pagecounts-200907* -w 3
--rate=3 --min=1000 --max=10 --cutoff=20 -o bursts_200907_daily.tsv
+
+
+Notes
+--------
+Edit counts generated by these scripts may contain errors due to MediaWiki's
`bug 19311`_.
+
+.. _Domas's WikiStats: http://dammit.lt/wikistats/archive
+.. _Toolserver's user-store: https://wiki.toolserver.org/view/User-store
+.. _sprint page on meta:
http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors
+.. _bug 19311: https://bugzilla.wikimedia.org/show_bug.cgi?id=19311
Modified: trunk/tools/wsor/trending_articles/chart.py
===================================================================
--- trunk/tools/wsor/trending_articles/chart.py 2011-08-30 19:08:57 UTC (rev
95805)
+++ trunk/tools/wsor/trending_articles/chart.py 2011-08-30 19:25:57 UTC (rev
95806)
@@ -12,6 +12,7 @@
import datetime
import math
import re
+import math
from collections import namedtuple
counter_tuple = namedtuple('counter', 'name filter color explode')
@@ -36,6 +37,9 @@
parser.add_argument('-v', '--verbose',
dest='verbose', action='store_true', default=False,
help='turn on verbose message output')
+ parser.add_argument('-X', '--exclude-semiprotect',
+ dest='nosemiprotect', action='store_true', default=False,
+ help='')
parser.add_argument('files', nargs='+')
options = parser.parse_args()
@@ -52,14 +56,19 @@
counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
]
- # counters = [counter_tuple('new registered users', lambda x: x[10] ==
'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1),
- # counter_tuple('old registered users', lambda x: x[10] ==
'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0),
- # counter_tuple('new IP users', lambda x: x[10] ==
'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1),
- # counter_tuple('old IP users', lambda x: x[10] ==
'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0),
- # counter_tuple('bots', lambda x: x[10] ==
'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0),
- # #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
+ # counters = [counter_tuple('w/ <30d edit history or IP', lambda x: x[10]
== 'ANON' or x[13] == 'NEW', '#FF4444', 0.1),
+ # counter_tuple('w/ >30d edit history and registered', lambda
x: x, '#CCCCCC', 0.0),
# ]
+ if options.nosemiprotect:
+ counters = [counter_tuple('new registered users', lambda x: x[10] ==
'REG' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1),
+ counter_tuple('old registered users', lambda x: x[10] ==
'REG' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0),
+ counter_tuple('new IP users', lambda x: x[10] ==
'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1),
+ counter_tuple('old IP users', lambda x: x[10] ==
'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0),
+ counter_tuple('bots', lambda x: x[10] ==
'REG_BOT' and x[14] != 'SEMIPROTECT', '#666666', 0.0),
+ #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
+ ]
+
counters_map = {}
for x in counters:
counters_map[x.name] = x
@@ -67,14 +76,17 @@
ratios = []
patt = re.compile('(\d+) / (\d+) / (\d+)')
for (i,fname) in enumerate(options.files):
+ ratios.append(1.0)
for line in open(fname).readlines():
m = patt.search(line)
if m:
- ratios.append((float(m.group(1)) / float(m.group(2)) /
float(m.group(3))) ** 0.5)
+ ratios[i] = (float(m.group(1)) / float(m.group(2)) /
float(m.group(3))) ** 0.5
break
+
sum_ratio = sum(ratios)
counter_names = [x.name for x in counters]
+ # chart for breakdown of users
plots = []
matplotlib.rc('font', size=options.fsize)
for (n,fname) in enumerate(options.files):
@@ -91,7 +103,8 @@
counts[c[0]].add(cols[options.field-1])
break
- print counts#!
+ for (name,value) in counts.items():
+ print name, len(value)
#plt.subplot(1, len(options.files), n+1)
plt.axes([0, 0, ratios[n]/sum_ratio, ratios[n]/sum_ratio])
plt.title(fname)
@@ -105,4 +118,39 @@
loc=(.8, .8))
base,ext = os.path.splitext(fname)
+ print >>sys.stderr, 'output: ' + base
plt.savefig('.'.join([base, 'svg']))
+
+ # chart for new editor retention
+ for (n,fname) in enumerate(options.files):
+ plt.figure(figsize=(10,10))
+ table = list(csv.reader(filter(lambda x: x[0] != '#', open(fname)),
delimiter='\t'))
+ table = table[1:]
+ filt = lambda x: x[10] == 'REG' and x[13] == 'NEW'
+ bin = lambda x: min(int(10 * math.log10(int(x[15]) + 1)), int(10 *
math.log10(3000)))
+ username = lambda x: x[11]
+ users = {}
+ bins = {}
+ for cols in table:
+ if filt(cols) and not users.has_key(username(cols)):
+ users[username(cols)] = True
+ b = bin(cols)
+ bins.setdefault(b, 0)
+ bins[b] += 1
+
+ bins = sorted(bins.items(), key=lambda x: -x[0])
+ max_bin = max(x[0] for x in bins)
+
+ if max_bin == 0:
+ print >>sys.stderr, '%s: %s (no values)' % (fname, bins)
+ continue
+ print >>sys.stderr, '%s: %s' % (fname, bins)
+
+ p = plt.pie([x[1] for x in bins],
+ pctdistance=1.2,
+ autopct='%1.1f%%',
+ colors=['#' + 3 * ('%02X' % int(255 - 255 * float(x[0]) /
max_bin)) for x in bins])
+
+ base,ext = os.path.splitext(fname)
+ print >>sys.stderr, 'output: ' + base
+ plt.savefig('.'.join([base, 'retention', 'svg']))
Modified: trunk/tools/wsor/trending_articles/detectbursts.py
===================================================================
--- trunk/tools/wsor/trending_articles/detectbursts.py 2011-08-30 19:08:57 UTC
(rev 95805)
+++ trunk/tools/wsor/trending_articles/detectbursts.py 2011-08-30 19:25:57 UTC
(rev 95806)
@@ -7,19 +7,22 @@
from datetime import datetime, timedelta
import argparse
import random
-import gzip
+import myzip
import re
import os
import urllib2
from collections import deque, namedtuple
import numpy as np
-import gc
pageview_tuple = namedtuple('Pageview', 'date count')
count_tuple = namedtuple('Count', 'pred real')
def time_parse(x):
- return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz')
+ if x.endswith('.gz'):
+ return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz')
+    elif x.endswith('.xz'):
+ return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.xz')
+
def time_format(x):
return datetime.strftime(x, '%Y/%m/%d %H:%M:%S')
def datetime2days(x):
@@ -28,7 +31,7 @@
def load_wikistats_file(f):
print >>sys.stderr, 'loading %s...' % f
ret = {}
- for line in gzip.open(f):
+ for line in myzip.open(f):
line.strip()
(lang,title,count,bytes) = line.split(' ')
ret[(lang,title)] = count_tuple(float(count), int(count))
@@ -143,9 +146,6 @@
if options.inclusive:
ls.insert(len(ls), bursting.has_key(page))
writer.writerow([unicode(x) for x in ls])
- except UnicodeEncodeError, e:
- print >>sys.stderr, '%s: %s' % (e, page)
- continue
except UnicodeDecodeError, e:
print >>sys.stderr, '%s: %s' % (e, page)
continue
Modified: trunk/tools/wsor/trending_articles/find_revision_status.py
===================================================================
--- trunk/tools/wsor/trending_articles/find_revision_status.py 2011-08-30
19:08:57 UTC (rev 95805)
+++ trunk/tools/wsor/trending_articles/find_revision_status.py 2011-08-30
19:25:57 UTC (rev 95806)
@@ -10,7 +10,27 @@
import urllib2
import re
from datetime import datetime, timedelta
+from collections import namedtuple
+revision_t = namedtuple('revision', 'oldid pageid textid comment userid
usertext timestamp minor deleted length parentid')
+user_t = namedtuple('user', 'id name first editcount periodedits futureedits
type')
+article_t = namedtuple('article', 'title protectlog older')
+edits_t = namedtuple('edits', 'before between')
+wikidate_t = namedtuple('wikidate', 'text datetime')
+log_t = namedtuple('log', 'title action params timestamp')
+
+botpat = re.compile('bot( |$)', re.IGNORECASE)
+protectpat = re.compile('\[edit=(.*?)\] \((.*?) \(UTC\)\)')
+
+def make_revision_t(*args):
+ x = revision_t(*args)
+ return x._replace(timestamp=wikidate_t(text=x.timestamp,
+
datetime=parse_wikidate(x.timestamp)))
+def make_log_t(*args):
+ x = log_t(*args)
+ return x._replace(timestamp=wikidate_t(text=x.timestamp,
+
datetime=parse_wikidate(x.timestamp)))
+
def parse_wikidate(x):
return datetime.strptime(str(x), '%Y%m%d%H%M%S')
@@ -47,57 +67,162 @@
(title, rd_pid) = redirected(cursor, rd_pid, namespace)
return (title, rd_pid)
-def firstedits(cursor, uid, uname, delta, n):
+def allprotect(cursor, start, end, ns=0):
+ cursor.execute('''
+ SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp
+ FROM logging l
+ WHERE
+ l.log_type = "protect"
+ AND l.log_timestamp BETWEEN ? AND ?
+ AND l.log_namespace = ?
+ ORDER BY l.log_timestamp DESC
+ ;
+ ''', (start, end, ns))
+ return [make_log_t(*x) for x in list(cursor)]
+
+def closestprotect(cursor, limit, start, title, ns=0):
+ cursor.execute('''
+ SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp
+ FROM logging l
+ WHERE
+ l.log_type = "protect"
+ AND l.log_title = ?
+ AND l.log_timestamp BETWEEN ? AND ?
+ AND l.log_namespace = ?
+ ORDER BY l.log_timestamp DESC
+ LIMIT 1
+ ;
+ ''', (title, limit, start, ns))
+ ls = list(cursor)
+ if len(ls) == 0:
+ return None
+ return make_log_t(*(ls[0]))
+
+def firstedits(cursor, uid, uname, limit=1):
where = 'r.rev_user_text = ?'
uspec = uname
if uid != 0:
where = 'r.rev_user = ?'
uspec = uid
cursor.execute('''
- SELECT r.rev_timestamp
+ SELECT *
FROM revision r
WHERE
r.rev_timestamp != ""
AND %s
- ORDER BY r.rev_timestamp ASC
- LIMIT 1
+ ORDER BY r.rev_timestamp ASC
+ LIMIT ?
;
- ''' % (where,), (uspec,))
- first = list(cursor)[0][0]
- first = parse_wikidate(first)
+ ''' % (where,), (uspec,limit))
+ return [make_revision_t(*x) for x in cursor]
+
+def olderthan(cursor, title, timestamp):
cursor.execute('''
- SELECT r.rev_id
+ SELECT r.rev_timestamp
FROM revision r
+ INNER JOIN page p on p.page_id = r.rev_page
WHERE
- %s
- AND r.rev_timestamp BETWEEN ? AND ?
- LIMIT ?
+ r.rev_timestamp != ""
+ AND p.page_title = ?
+ AND r.rev_timestamp < ?
+ LIMIT 1
;
- ''' % (where,), (uspec, format_wikidate(first), format_wikidate(first +
delta), n))
- return [int(x[0]) for x in list(cursor)]
+ ''', (title,timestamp))
+ return len(list(cursor)) != 0
-def editcount(cursor, uid, uname, timestamp):
- where = 'r.rev_user_text = ?'
- uspec = uname
+def editcount_before(cursor, uid, uname, timestamp):
if uid != 0:
- where = 'r.rev_user = ?'
- uspec = uid
-
+ cursor.execute('''
+ SELECT /* SLOW_OK */ count(*)
+ FROM revision r
+ WHERE
+ r.rev_user = ?
+ AND r.rev_timestamp > ?
+ ;
+ ''', (uid,timestamp))
+ newedits = list(cursor)[0][0]
+ cursor.execute('''
+ SELECT u.user_editcount
+ FROM user u
+ WHERE
+ u.user_id = ?
+ ;
+ ''', (uid,))
+ alledits = list(cursor)[0][0]
+ return int(alledits) - int(newedits)
+ else:
+ # anonymous user's edit count only can be found from revision
+ cursor.execute('''
+ SELECT /* SLOW_OK */ count(*)
+ FROM revision r
+ WHERE
+ r.rev_user_text = ?
+ AND r.rev_timestamp < ?
+ ;
+ ''', (uname,timestamp))
+ return int(list(cursor)[0][0])
+
+def editcount_duration(cursor, uid, uname, timestamp1, timestamp2):
+ uspec = 'r.rev_user = ?'
+ uarg = uid
+ if uid == 0:
+ uspec = 'r.rev_user_text = ?'
+ uarg = uname
cursor.execute('''
- SELECT count(*)
+ SELECT /* SLOW_OK */ count(*)
FROM revision r
WHERE
%s
- AND r.rev_timestamp < ?
+ AND r.rev_timestamp BETWEEN ? AND ?
;
- ''' % (where,), (uspec,timestamp))
+ ''' % uspec, (uarg, timestamp1, timestamp2))
return int(list(cursor)[0][0])
+def edits_duration(cursor, uid, uname, timestamp1, timestamp2):
+ uspec = 'r.rev_user = ?'
+ uarg = uid
+ if uid == 0:
+ uspec = 'r.rev_user_text = ?'
+ uarg = uname
+ cursor.execute('''
+ SELECT /* SLOW_OK */ *
+ FROM revision r
+ WHERE
+ %s
+ AND r.rev_timestamp BETWEEN ? AND ?
+ ;
+ ''' % uspec, (uarg, timestamp1, timestamp2))
+ return [make_revision_t(*x) for x in list(cursor)]
+
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--field', metavar='N',
dest='field', type=int, default=1,
help='')
+ parser.add_argument('-H', '--host', metavar='HOST',
+ dest='host', type=str, default='',
+ help='mysql host name')
+ parser.add_argument('-R', '--hours', metavar='N',
+ dest='hours', type=int, default=1,
+ help='')
+ parser.add_argument('-a', '--activity-delta', metavar='DAYS',
+ dest='activedelta', type=lambda x: timedelta(days=x),
default=timedelta(days=120),
+ help='')
+ parser.add_argument('-D', '--activity-duration', metavar='DAYS',
+ dest='activedur', type=lambda x: timedelta(days=x),
default=timedelta(days=90),
+ help='')
+ parser.add_argument('-O', '--threshold', metavar='DATE',
+ dest='olderthan', type=lambda x: parse_wikidate(x),
default=None,
+ help='')
+ parser.add_argument('-L', '--limit', metavar='N',
+ dest='limit', type=int, default=30,
+ help='')
+ parser.add_argument('-o', '--output', metavar='FILE',
+ dest='output', type=lambda x: open(x, 'w'),
default=sys.stdout,
+ help='')
+ parser.add_argument('-b', '--include-bots',
+ dest='include_bots', action='store_true',
default=False,
+ help='')
parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
dest='db', type=str, default='hywiki-p',
help='target wiki name')
@@ -105,21 +230,20 @@
options = parser.parse_args()
options.db = options.db.replace('_','-')
- host = options.db + '.rrdb.toolserver.org'
- conn = oursql.connect(host = host,
+ if options.host == '':
+ options.host = options.db + '.rrdb.toolserver.org'
+ conn = oursql.connect(host = options.host,
read_default_file=os.path.expanduser('~/.my.cnf'),
db = options.db.replace('-','_'),
charset=None,
use_unicode=False)
cursor = conn.cursor()
-
csv.field_size_limit(1000000000)
table = list(csv.reader(open(options.input), delimiter='\t'))
table = table[1:]
output = []
- hours = {}
for cols in table:
cursor.execute('''
SELECT p.page_id, p.page_title, page_is_redirect
@@ -134,8 +258,7 @@
print >>sys.stderr, 'error 1 %s' % cols
continue
redirect = int(res[0][2]) == 1
- cols.insert(options.field, 'REDIRECT' if redirect else 'ARTICLE')
- cols.insert(options.field, str(res[0][0]))
+ cols[options.field:options.field] = ['REDIRECT' if redirect else
'ARTICLE', str(res[0][0])]
output.append(cols)
if redirect:
(title,pageid) = redirected(cursor, res[0][0])
@@ -143,11 +266,8 @@
print >>sys.stderr, 'error 2 %s' % cols
continue
a = [x for x in cols]
- a[0] = title
- a[1] = str(pageid)
- a[2] = 'REDIRECT_RESOLVED'
+ a[options.field-1:options.field+2] =
(title,str(pageid),'REDIRECT_RESOLVED')
output.append(a)
- hours[cols[3]] = True
# cursor.executemany('''
# SELECT p.page_title, p.page_id
@@ -158,16 +278,25 @@
# ''', [(urllib2.quote(x[options.field-1]),) for x in table])
# print list(cursor)
- print '\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp',
'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness',
'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?'])
-
- botpat = re.compile('bot( |$)', re.IGNORECASE)
- edits = 0
+ edits = {}
articles = {}
+ users = {}
+ timestamps = {}
for cols in output:
- start = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
- end = start + timedelta(hours=1)
+ ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
+ timestamps[ts] = True
+ duration = sorted(timestamps.keys())
+ duration = (wikidate_t(format_wikidate(duration[0]), duration[0]),
wikidate_t(format_wikidate(duration[-1]), duration[-1]))
+ if options.olderthan == None:
+ options.olderthan = duration[0].datetime - timedelta(days=365)
+
+ for cols in output:
+ ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
+ start = ts + timedelta(hours=-options.hours)
+ end = start + timedelta(hours=options.hours)
cursor.execute('''
- SELECT r.rev_id, r.rev_timestamp, r.rev_user, r.rev_user_text
+ SELECT *
FROM revision r
WHERE
r.rev_page = ?
@@ -175,21 +304,96 @@
;
''', (cols[1],
- datetime.strftime(start, '%Y%m%d$H%M%S'),
- datetime.strftime(end, '%Y%m%d$H%M%S'),
+ datetime.strftime(start, '%Y%m%d%H%M%S'),
+ datetime.strftime(end, '%Y%m%d%H%M%S'),
))
- ls = list(cursor)
+ ls = [make_revision_t(*x) for x in cursor]
if len(ls) == 0:
- print >>sys.stderr, 'no revision: %s %s %s' % (cols[0], start, end)
- for (rev,ts,uid,username) in ls:
- usertype = 'ANON' if uid == 0 else 'REG'
- if uid != 0 and botpat.search(username):
- usertype += '_BOT'
- output = cols + [str(x) for x in [rev, ts, usertype, username,
-
editcount(cursor,uid,username,re.sub('[ /\:]', '', cols[3])),
- 'NEW' if
firstedits(cursor,uid,username,timedelta(days=30),30).count(rev) > 0 else
'OLD']]
- print '\t'.join(output)
- edits +=1
- articles[cols[1]] = True
+ print >>sys.stderr, 'no revision: %s %s %s' % (start, end, cols[0])
+ for rev in ls:
+ usertype = 'ANON' if rev.userid == 0 else 'REG'
+ if rev.userid != 0 and botpat.search(rev.usertext):
+ if options.include_bots:
+ usertype += '_BOT'
+ else:
+ print >>sys.stderr, 'rev %s is by bot (%s)' % (rev.oldid,
rev.usertext)
+ continue
+ if not users.has_key((rev.userid,rev.usertext)):
+ users[(rev.userid,rev.usertext)] = user_t(id=rev.userid,
name=rev.usertext,
+
first=firstedits(cursor, rev.userid, rev.usertext),
+
editcount=editcount_before(cursor, rev.userid, rev.usertext, duration[0].text),
+
periodedits=edits_duration(cursor, rev.userid, rev.usertext, duration[0].text,
duration[1].text),
+
futureedits=edits_duration(cursor, rev.userid, rev.usertext,
duration[0].datetime + options.activedelta, duration[1].datetime +
options.activedelta + options.activedur),
+ type=usertype)
+ edits[rev.oldid] = (cols,rev)
+ print >>sys.stderr, rev.oldid
+ if not articles.has_key(cols[0]):
+ articles[cols[0]] = article_t(title=cols[0], protectlog=[],
older=olderthan(cursor, cols[0], options.olderthan))
- print '# %s / %s / %s edits/article/hour' % (edits, len(articles.keys()),
len(hours.keys()))
+ # collect protect logs
+ print >>sys.stderr, 'collecting protection log entries for %s - %s...' %
(duration[0].text, duration[1].text)
+ protectlog = allprotect(cursor, duration[0].text, duration[1].text)
+
+ # collect protect information
+ print >>sys.stderr, 'collecting protection log entries of %d articles for
%s - %s...' % (len(articles.items()), duration[0].text, duration[1].text)
+ for (title,article) in articles.items():
+ article.protectlog.extend(filter(lambda x: x.title == title,
protectlog))
+ closest = None
+ if article.older:
+ closest = closestprotect(cursor,
format_wikidate(options.olderthan), duration[0].text, article.title)
+ else:
+ closest = closestprotect(cursor, '0', duration[0].text,
article.title)
+ if closest:
+ article.protectlog.append(closest)
+ print >>sys.stderr, '%s %d' % (title, len(article.protectlog))
+
+ options.output.write('\t'.join(['title', 'page_id', 'redirect?', 'pageview
timestamp', 'predicted pageview', 'actual pageview', 'trending hours',
'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount',
'new?', 'protect', 'editcount_%dd+%dd' % (options.activedelta.days,
options.activedur.days)]) + '\n')
+
+ # collect protect information
+ print >>sys.stderr, 'writing %d edits...' % (len(edits.items()))
+ for (revid,(cols,rev)) in sorted(edits.items(), key=lambda x: x[0]):
+ new = 'OLD'
+ user = users[(rev.userid, rev.usertext)]
+ if len(user.first) == 0 or user.first[0].timestamp.datetime >
rev.timestamp.datetime + timedelta(days=-30):
+ new = 'NEW'
+
+ revdate = rev.timestamp.datetime
+
+ article = articles[cols[0]]
+ protect = None
+ if len(article.protectlog) > 0:
+ f = filter(lambda x: x.timestamp.datetime <
rev.timestamp.datetime, article.protectlog)
+ if len(f) > 0:
+ protect = f[0]
+
+ if protect == None or len(protect) == 0:
+ protect = 'NO_PROTECT'
+ else:
+ m = protectpat.search(protect.params)
+ if m:
+ lv = m.group(1)
+ try:
+ expire = datetime.strptime(m.group(2), 'expires %M:%S, %d
%B %Y')
+ if lv == 'autoconfirmed' and expire > revdate:
+ protect = 'SEMIPROTECT'
+ elif lv == 'admin' and expire > revdate:
+ protect = 'PROTECT'
+ else:
+ protect = 'OTHER_PROTECT'
+ except ValueError, e:
+ if m.group(2).find('indefinite'):
+ protect = 'INDEFINITE'
+ else:
+ protect = 'OTHER_PROTECT'
+ else:
+ protect = 'UNKNOWN'
+
+ output = cols + [str(x) for x in [revid, rev.timestamp.text,
user.type, user.name,
+ user.editcount + len(filter(lambda
x: x.timestamp.datetime < rev.timestamp.datetime, user.periodedits)),
+ new,
+ protect,
+ len(filter(lambda x:
rev.timestamp.datetime + options.activedelta < x.timestamp.datetime and
x.timestamp.datetime < rev.timestamp.datetime + options.activedelta +
options.activedur, user.futureedits))
+ ]]
+ line = '\t'.join(output)
+ options.output.write(line + '\n')
+ options.output.write('# %s / %s edits/article\n' % (len(edits.keys()),
len(articles.keys())))
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs