http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95806

Revision: 95806
Author:   whym
Date:     2011-08-30 19:25:57 +0000 (Tue, 30 Aug 2011)
Log Message:
-----------
Updates to trending article scripts, adding docs and some optimizations

Modified Paths:
--------------
    trunk/tools/wsor/trending_articles/README.rst
    trunk/tools/wsor/trending_articles/chart.py
    trunk/tools/wsor/trending_articles/detectbursts.py
    trunk/tools/wsor/trending_articles/find_revision_status.py

Modified: trunk/tools/wsor/trending_articles/README.rst
===================================================================
--- trunk/tools/wsor/trending_articles/README.rst       2011-08-30 19:08:57 UTC 
(rev 95805)
+++ trunk/tools/wsor/trending_articles/README.rst       2011-08-30 19:25:57 UTC 
(rev 95806)
@@ -1,4 +1,37 @@
-See http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors
+These scripts are used to produce the results published on the `sprint
+page on meta`_ on the editor behavior in trending articles.
 
-Counts files are available at:
-http://dammit.lt/wikistats/archive/2011/01/
+Usage
+---------
+We use the following directory names.
+
+``pageview.all``
+        raw (hourly) page view count files
+``pageview.200907.en``
+        hourly page view count files for EN wiki in July 2009 only
+``pageview.200907.daily.en``
+        daily page view count files for EN wiki in July 2009 only
+
+
+1. Obtain the page view count files from `Domas's WikiStats`_ or from the 
``stats`` directory of `Toolserver's user-store`_.
+2. (optional) Select only the page names you are interested in to reduce the 
processing time. For example, use this command ::
+   
+     for f in pageview.all/2009/07/pagecounts-200907*.gz ; do ggrep '^en [^ ]* 
' $f | gzip > pageview.200907.en/`basename $f`; done
+   
+3. (optional) Convert hourly page views into daily page views with ::
+   
+     ./accumulatedaily.py pageview.200907.en/pagecounts-200907*.gz -p 3 -f 
'pageview.200907.daily.en/pagecounts-%Y%m%d-%H%M%S.gz'
+   
+4. Detect bursts in page views with ::
+   
+    python -O detectbursts.py pageview.200907.daily.en/pagecounts-200907* -w 3 
--rate=3 --min=1000 --max=10 --cutoff=20 -o bursts_200907_daily.tsv
+
+
+Notes
+--------
+Edit counts generated by these scripts may contain errors due to MediaWiki's 
`bug 19311`_.
+
+.. _Domas's WikiStats: http://dammit.lt/wikistats/archive
+.. _Toolserver's user-store: https://wiki.toolserver.org/view/User-store
+.. _sprint page on meta: 
http://meta.wikimedia.org/wiki/Research:Trending_articles_and_new_editors
+.. _bug 19311: https://bugzilla.wikimedia.org/show_bug.cgi?id=19311

Modified: trunk/tools/wsor/trending_articles/chart.py
===================================================================
--- trunk/tools/wsor/trending_articles/chart.py 2011-08-30 19:08:57 UTC (rev 
95805)
+++ trunk/tools/wsor/trending_articles/chart.py 2011-08-30 19:25:57 UTC (rev 
95806)
@@ -12,6 +12,7 @@
 import datetime
 import math
 import re
+import math
 from collections import namedtuple
 
 counter_tuple = namedtuple('counter', 'name filter color explode')
@@ -36,6 +37,9 @@
     parser.add_argument('-v', '--verbose',
                       dest='verbose', action='store_true', default=False,
                       help='turn on verbose message output')
+    parser.add_argument('-X', '--exclude-semiprotect',
+                      dest='nosemiprotect', action='store_true', default=False,
+                      help='')
     parser.add_argument('files', nargs='+')
     options = parser.parse_args()
 
@@ -52,14 +56,19 @@
                 counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
                ]
 
-    # counters = [counter_tuple('new registered users', lambda x: x[10] == 
'REG'  and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1),
-    #             counter_tuple('old registered users', lambda x: x[10] == 
'REG'  and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0),
-    #             counter_tuple('new IP users',         lambda x: x[10] == 
'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1),
-    #             counter_tuple('old IP users',         lambda x: x[10] == 
'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0),
-    #             counter_tuple('bots',                 lambda x: x[10] == 
'REG_BOT' and x[14] != 'SEMIPROTECT',                 '#666666', 0.0),
-    #             #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
+    # counters = [counter_tuple('w/ <30d edit history or IP', lambda x: x[10] 
== 'ANON'  or x[13] == 'NEW', '#FF4444', 0.1),
+    #             counter_tuple('w/ >30d edit history and registered', lambda 
x: x, '#CCCCCC', 0.0),
     #            ]
 
+    if options.nosemiprotect:
+        counters = [counter_tuple('new registered users', lambda x: x[10] == 
'REG'  and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#4444FF', 0.1),
+                    counter_tuple('old registered users', lambda x: x[10] == 
'REG'  and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#8888EE', 0.0),
+                    counter_tuple('new IP users',         lambda x: x[10] == 
'ANON' and x[13] == 'NEW' and x[14] != 'SEMIPROTECT', '#FF4444', 0.1),
+                    counter_tuple('old IP users',         lambda x: x[10] == 
'ANON' and x[13] == 'OLD' and x[14] != 'SEMIPROTECT', '#EE8888', 0.0),
+                    counter_tuple('bots',                 lambda x: x[10] == 
'REG_BOT' and x[14] != 'SEMIPROTECT',                 '#666666', 0.0),
+                    #counter_tuple('others', lambda x: x, '#CCCCCC', 0.0),
+                    ]
+
     counters_map = {}
     for x in counters:
         counters_map[x.name] = x
@@ -67,14 +76,17 @@
     ratios = []
     patt = re.compile('(\d+) / (\d+) / (\d+)')
     for (i,fname) in enumerate(options.files):
+        ratios.append(1.0)
         for line in open(fname).readlines():
             m = patt.search(line)
             if m:
-                ratios.append((float(m.group(1)) / float(m.group(2)) / 
float(m.group(3))) ** 0.5)
+                ratios[i] = (float(m.group(1)) / float(m.group(2)) / 
float(m.group(3))) ** 0.5
                 break
+
     sum_ratio = sum(ratios)
     counter_names = [x.name for x in counters]
 
+    # chart for breakdown of users
     plots = []
     matplotlib.rc('font', size=options.fsize)
     for (n,fname) in enumerate(options.files):
@@ -91,7 +103,8 @@
                     counts[c[0]].add(cols[options.field-1])
                     break
 
-        print counts#!
+        for (name,value) in counts.items():
+            print name, len(value)
         #plt.subplot(1, len(options.files), n+1)
         plt.axes([0, 0, ratios[n]/sum_ratio, ratios[n]/sum_ratio])
         plt.title(fname)
@@ -105,4 +118,39 @@
                    loc=(.8, .8))
 
         base,ext = os.path.splitext(fname)
+        print >>sys.stderr, 'output: ' + base
         plt.savefig('.'.join([base, 'svg']))
+
+    # chart for new editor retention
+    for (n,fname) in enumerate(options.files):
+        plt.figure(figsize=(10,10))
+        table = list(csv.reader(filter(lambda x: x[0] != '#', open(fname)), 
delimiter='\t'))
+        table = table[1:]
+        filt = lambda x: x[10] == 'REG'  and x[13] == 'NEW'
+        bin = lambda x: min(int(10 * math.log10(int(x[15]) + 1)), int(10 * 
math.log10(3000)))
+        username = lambda x: x[11]
+        users = {}
+        bins = {}
+        for cols in table:
+            if filt(cols) and not users.has_key(username(cols)):
+                users[username(cols)] = True
+                b = bin(cols)
+                bins.setdefault(b, 0)
+                bins[b] += 1
+
+        bins = sorted(bins.items(), key=lambda x: -x[0])
+        max_bin = max(x[0] for x in bins)
+
+        if max_bin == 0:
+            print >>sys.stderr, '%s: %s (no values)' % (fname, bins)
+            continue
+        print >>sys.stderr, '%s: %s' % (fname, bins)
+
+        p = plt.pie([x[1] for x in bins],
+                    pctdistance=1.2,
+                    autopct='%1.1f%%',
+                    colors=['#' + 3 * ('%02X' % int(255 - 255 * float(x[0]) / 
max_bin)) for x in bins])
+
+        base,ext = os.path.splitext(fname)
+        print >>sys.stderr, 'output: ' + base
+        plt.savefig('.'.join([base, 'retention', 'svg']))

Modified: trunk/tools/wsor/trending_articles/detectbursts.py
===================================================================
--- trunk/tools/wsor/trending_articles/detectbursts.py  2011-08-30 19:08:57 UTC 
(rev 95805)
+++ trunk/tools/wsor/trending_articles/detectbursts.py  2011-08-30 19:25:57 UTC 
(rev 95806)
@@ -7,19 +7,22 @@
 from datetime import datetime, timedelta
 import argparse
 import random
-import gzip
+import myzip
 import re
 import os
 import urllib2
 from collections import deque, namedtuple
 import numpy as np
-import gc
 
 pageview_tuple = namedtuple('Pageview', 'date count')
 count_tuple = namedtuple('Count', 'pred real')
 
 def time_parse(x):
-    return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz')
+    if x.endswith('.gz'):
+        return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.gz')
+    elif x.endswith('.gz'):
+        return datetime.strptime(x, 'pagecounts-%Y%m%d-%H%M%S.xz')
+
 def time_format(x):
     return datetime.strftime(x, '%Y/%m/%d %H:%M:%S')
 def datetime2days(x):
@@ -28,7 +31,7 @@
 def load_wikistats_file(f):
     print >>sys.stderr, 'loading %s...' % f
     ret = {}
-    for line in gzip.open(f):
+    for line in myzip.open(f):
         line.strip()
         (lang,title,count,bytes) = line.split(' ')
         ret[(lang,title)] = count_tuple(float(count), int(count))
@@ -143,9 +146,6 @@
                     if options.inclusive:
                         ls.insert(len(ls), bursting.has_key(page))
                     writer.writerow([unicode(x) for x in ls])
-                except UnicodeEncodeError, e:
-                    print >>sys.stderr, '%s: %s' % (e, page)
-                    continue
                 except UnicodeDecodeError, e:
                     print >>sys.stderr, '%s: %s' % (e, page)
                     continue

Modified: trunk/tools/wsor/trending_articles/find_revision_status.py
===================================================================
--- trunk/tools/wsor/trending_articles/find_revision_status.py  2011-08-30 
19:08:57 UTC (rev 95805)
+++ trunk/tools/wsor/trending_articles/find_revision_status.py  2011-08-30 
19:25:57 UTC (rev 95806)
@@ -10,7 +10,27 @@
 import urllib2
 import re
 from datetime import datetime, timedelta
+from collections import namedtuple
 
+revision_t = namedtuple('revision', 'oldid pageid textid comment userid 
usertext timestamp minor deleted length parentid')
+user_t = namedtuple('user', 'id name first editcount periodedits futureedits 
type')
+article_t = namedtuple('article', 'title protectlog older')
+edits_t = namedtuple('edits', 'before between')
+wikidate_t = namedtuple('wikidate', 'text datetime')
+log_t = namedtuple('log', 'title action params timestamp')
+
+botpat = re.compile('bot( |$)', re.IGNORECASE)
+protectpat = re.compile('\[edit=(.*?)\] \((.*?) \(UTC\)\)')
+
+def make_revision_t(*args):
+    x = revision_t(*args)
+    return x._replace(timestamp=wikidate_t(text=x.timestamp,
+                                           
datetime=parse_wikidate(x.timestamp)))
+def make_log_t(*args):
+    x = log_t(*args)
+    return x._replace(timestamp=wikidate_t(text=x.timestamp,
+                                           
datetime=parse_wikidate(x.timestamp)))
+
 def parse_wikidate(x):
     return datetime.strptime(str(x), '%Y%m%d%H%M%S')
 
@@ -47,57 +67,162 @@
         (title, rd_pid) = redirected(cursor, rd_pid, namespace)
     return (title, rd_pid)
 
-def firstedits(cursor, uid, uname, delta, n):
+def allprotect(cursor, start, end, ns=0):
+    cursor.execute('''
+          SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp
+            FROM logging l
+            WHERE
+              l.log_type = "protect"
+              AND l.log_timestamp BETWEEN ? AND ?
+              AND l.log_namespace = ?
+              ORDER BY l.log_timestamp DESC
+        ;
+    ''', (start, end, ns))
+    return [make_log_t(*x) for x in list(cursor)]
+
+def closestprotect(cursor, limit, start, title, ns=0):
+    cursor.execute('''
+          SELECT l.log_title, l.log_action, l.log_params, l.log_timestamp
+            FROM logging l
+            WHERE
+              l.log_type = "protect"
+              AND l.log_title = ?
+              AND l.log_timestamp BETWEEN ? AND ?
+              AND l.log_namespace = ?
+            ORDER BY l.log_timestamp DESC
+            LIMIT 1
+        ;
+    ''', (title, limit, start, ns))
+    ls = list(cursor)
+    if len(ls) == 0:
+        return None
+    return make_log_t(*(ls[0]))
+
+def firstedits(cursor, uid, uname, limit=1):
     where = 'r.rev_user_text = ?'
     uspec = uname
     if uid != 0:
         where = 'r.rev_user = ?'
         uspec = uid
     cursor.execute('''
-          SELECT r.rev_timestamp
+          SELECT *
             FROM revision r
             WHERE
               r.rev_timestamp != ""
               AND %s
-              ORDER BY r.rev_timestamp ASC
-            LIMIT 1
+            ORDER BY r.rev_timestamp ASC
+          LIMIT ?
         ;
-    ''' % (where,), (uspec,))
-    first = list(cursor)[0][0]
-    first = parse_wikidate(first)
+    ''' % (where,), (uspec,limit))
+    return [make_revision_t(*x) for x in cursor]
+
+def olderthan(cursor, title, timestamp):
     cursor.execute('''
-          SELECT r.rev_id
+          SELECT r.rev_timestamp
             FROM revision r
+              INNER JOIN page p on p.page_id = r.rev_page
             WHERE
-              %s
-              AND r.rev_timestamp BETWEEN ? AND ?
-            LIMIT ?
+              r.rev_timestamp != ""
+              AND p.page_title = ?
+              AND r.rev_timestamp < ?
+          LIMIT 1
         ;
-    ''' % (where,), (uspec, format_wikidate(first), format_wikidate(first + 
delta), n))
-    return [int(x[0]) for x in list(cursor)]
+    ''', (title,timestamp))
+    return len(list(cursor)) != 0
 
-def editcount(cursor, uid, uname, timestamp):
-    where = 'r.rev_user_text = ?'
-    uspec = uname
+def editcount_before(cursor, uid, uname, timestamp):
     if uid != 0:
-        where = 'r.rev_user = ?'
-        uspec = uid
-    
+        cursor.execute('''
+          SELECT /* SLOW_OK */ count(*)
+            FROM revision r
+            WHERE
+              r.rev_user = ?
+              AND r.rev_timestamp > ?
+        ;
+        ''', (uid,timestamp))
+        newedits = list(cursor)[0][0]
+        cursor.execute('''
+          SELECT u.user_editcount
+            FROM user u
+            WHERE
+              u.user_id = ?
+        ;
+        ''', (uid,))
+        alledits = list(cursor)[0][0]
+        return int(alledits) - int(newedits)
+    else:
+        # anonymous user's edit count only can be found from revision
+        cursor.execute('''
+          SELECT /* SLOW_OK */ count(*)
+            FROM revision r
+            WHERE
+              r.rev_user_text = ?
+              AND r.rev_timestamp < ?
+        ;
+        ''', (uname,timestamp))
+        return int(list(cursor)[0][0])
+
+def editcount_duration(cursor, uid, uname, timestamp1, timestamp2):
+    uspec = 'r.rev_user = ?'
+    uarg = uid
+    if uid == 0:
+        uspec = 'r.rev_user_text = ?'
+        uarg = uname
     cursor.execute('''
-          SELECT count(*)
+          SELECT /* SLOW_OK */ count(*)
             FROM revision r
             WHERE
               %s
-              AND r.rev_timestamp < ?
+              AND r.rev_timestamp BETWEEN ? AND ?
         ;
-        ''' % (where,), (uspec,timestamp))
+    ''' % uspec, (uarg, timestamp1, timestamp2))
     return int(list(cursor)[0][0])
 
+def edits_duration(cursor, uid, uname, timestamp1, timestamp2):
+    uspec = 'r.rev_user = ?'
+    uarg = uid
+    if uid == 0:
+        uspec = 'r.rev_user_text = ?'
+        uarg = uname
+    cursor.execute('''
+          SELECT /* SLOW_OK */ *
+            FROM revision r
+            WHERE
+              %s
+              AND r.rev_timestamp BETWEEN ? AND ?
+        ;
+    ''' % uspec, (uarg, timestamp1, timestamp2))
+    return [make_revision_t(*x) for x in list(cursor)]
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-f', '--field', metavar='N',
                         dest='field', type=int, default=1,
                         help='')
+    parser.add_argument('-H', '--host', metavar='HOST',
+                        dest='host', type=str, default='',
+                        help='mysql host name')
+    parser.add_argument('-R', '--hours', metavar='N',
+                        dest='hours', type=int, default=1,
+                        help='')
+    parser.add_argument('-a', '--activity-delta', metavar='DAYS',
+                        dest='activedelta', type=lambda x: timedelta(days=x), 
default=timedelta(days=120),
+                        help='')
+    parser.add_argument('-D', '--activity-duration', metavar='DAYS',
+                        dest='activedur', type=lambda x: timedelta(days=x), 
default=timedelta(days=90),
+                        help='')
+    parser.add_argument('-O', '--threshold', metavar='DATE',
+                        dest='olderthan', type=lambda x: parse_wikidate(x), 
default=None,
+                        help='')
+    parser.add_argument('-L', '--limit', metavar='N',
+                        dest='limit', type=int, default=30,
+                        help='')
+    parser.add_argument('-o', '--output', metavar='FILE',
+                        dest='output', type=lambda x: open(x, 'w'), 
default=sys.stdout,
+                        help='')
+    parser.add_argument('-b', '--include-bots',
+                        dest='include_bots', action='store_true', 
default=False,
+                        help='')
     parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
                         dest='db', type=str, default='hywiki-p',
                         help='target wiki name')
@@ -105,21 +230,20 @@
     options = parser.parse_args()
     options.db = options.db.replace('_','-')
 
-    host = options.db + '.rrdb.toolserver.org'
-    conn = oursql.connect(host = host,
+    if options.host == '':
+       options.host = options.db + '.rrdb.toolserver.org'
+    conn = oursql.connect(host = options.host,
                           read_default_file=os.path.expanduser('~/.my.cnf'),
                           db = options.db.replace('-','_'),
                           charset=None,
                           use_unicode=False)
 
     cursor = conn.cursor()
-
     csv.field_size_limit(1000000000)
     table = list(csv.reader(open(options.input), delimiter='\t'))
     table = table[1:]
 
     output = []
-    hours = {}
     for cols in table:
         cursor.execute('''
           SELECT p.page_id, p.page_title, page_is_redirect
@@ -134,8 +258,7 @@
             print >>sys.stderr, 'error 1 %s' % cols
             continue
         redirect = int(res[0][2]) == 1
-        cols.insert(options.field, 'REDIRECT' if redirect else 'ARTICLE')
-        cols.insert(options.field, str(res[0][0]))
+        cols[options.field:options.field] = ['REDIRECT' if redirect else 
'ARTICLE', str(res[0][0])]
         output.append(cols)
         if redirect:
             (title,pageid) = redirected(cursor, res[0][0])
@@ -143,11 +266,8 @@
                 print >>sys.stderr, 'error 2 %s' % cols
                 continue
             a = [x for x in cols]
-            a[0] = title
-            a[1] = str(pageid)
-            a[2] = 'REDIRECT_RESOLVED'
+            a[options.field-1:options.field+2] = 
(title,str(pageid),'REDIRECT_RESOLVED')
             output.append(a)
-        hours[cols[3]] = True
 
     # cursor.executemany('''
     #       SELECT p.page_title, p.page_id
@@ -158,16 +278,25 @@
     # ''', [(urllib2.quote(x[options.field-1]),) for x in table])
     # print list(cursor)
 
-    print '\t'.join(['title', 'page_id', 'redirect?', 'pageview timestamp', 
'predicted pageview', 'actual pageview', 'trending hours', 'surprisedness', 
'revision', 'timestamp', 'user type', 'username', 'editcount', 'new?'])
-
-    botpat = re.compile('bot( |$)', re.IGNORECASE)
-    edits = 0
+    edits = {}
     articles = {}
+    users = {}
+    timestamps = {}
     for cols in output:
-        start = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
-        end   = start + timedelta(hours=1)
+        ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
+        timestamps[ts] = True
+    duration = sorted(timestamps.keys())
+    duration = (wikidate_t(format_wikidate(duration[0]), duration[0]),
+                wikidate_t(format_wikidate(duration[-1]), duration[-2]))
+    if options.olderthan == None:
+        options.olderthan = duration[0].datetime - timedelta(days=365)
+        
+    for cols in output:
+        ts = datetime.strptime(cols[3], '%Y/%m/%d %H:%M:%S')
+        start = ts + timedelta(hours=-options.hours)
+        end   = start + timedelta(hours=options.hours)
         cursor.execute('''
-       SELECT r.rev_id, r.rev_timestamp, r.rev_user, r.rev_user_text
+       SELECT *
            FROM revision r
            WHERE
                r.rev_page = ?
@@ -175,21 +304,96 @@
        ;
 
        ''', (cols[1],
-             datetime.strftime(start, '%Y%m%d$H%M%S'),
-             datetime.strftime(end,   '%Y%m%d$H%M%S'),
+             datetime.strftime(start, '%Y%m%d%H%M%S'),
+             datetime.strftime(end,   '%Y%m%d%H%M%S'),
              ))
-        ls = list(cursor)
+        ls = [make_revision_t(*x) for x in cursor]
         if len(ls) == 0:
-            print >>sys.stderr, 'no revision: %s %s %s' % (cols[0], start, end)
-        for (rev,ts,uid,username) in ls:
-            usertype = 'ANON' if uid == 0 else 'REG'
-            if uid != 0 and botpat.search(username):
-               usertype += '_BOT' 
-            output = cols + [str(x) for x in [rev, ts, usertype, username,
-                                         
editcount(cursor,uid,username,re.sub('[ /\:]', '', cols[3])),
-                                         'NEW' if 
firstedits(cursor,uid,username,timedelta(days=30),30).count(rev) > 0 else 
'OLD']]
-            print '\t'.join(output)
-            edits +=1
-            articles[cols[1]] = True
+            print >>sys.stderr, 'no revision: %s %s %s' % (start, end, cols[0])
+        for rev in ls:
+            usertype = 'ANON' if rev.userid == 0 else 'REG'
+            if rev.userid != 0 and botpat.search(rev.usertext):
+                if options.include_bots:
+                    usertype += '_BOT'
+                else:
+                    print >>sys.stderr, 'rev %s is by bot (%s)' % (rev.oldid, 
rev.usertext)
+                    continue
+            if not users.has_key((rev.userid,rev.usertext)):
+                users[(rev.userid,rev.usertext)] = user_t(id=rev.userid, 
name=rev.usertext,
+                                                          
first=firstedits(cursor, rev.userid, rev.usertext),
+                                                          
editcount=editcount_before(cursor, rev.userid, rev.usertext, duration[0].text),
+                                                          
periodedits=edits_duration(cursor, rev.userid, rev.usertext, duration[0].text, 
duration[1].text),
+                                                          
futureedits=edits_duration(cursor, rev.userid, rev.usertext, 
duration[0].datetime + options.activedelta, duration[1].datetime + 
options.activedelta + options.activedur),
+                                                          type=usertype)
+            edits[rev.oldid] = (cols,rev)
+            print >>sys.stderr, rev.oldid
+            if not articles.has_key(cols[0]):
+                articles[cols[0]] = article_t(title=cols[0], protectlog=[], 
older=olderthan(cursor, cols[0], options.olderthan))
 
-    print '# %s / %s / %s edits/article/hour' % (edits, len(articles.keys()), 
len(hours.keys()))
+    # collect protect logs
+    print >>sys.stderr, 'collecting protection log entries for %s - %s...' % 
(duration[0].text, duration[1].text)
+    protectlog = allprotect(cursor, duration[0].text, duration[1].text)
+
+    # collect protect information
+    print >>sys.stderr, 'collecting protection log entries of %d articles for 
%s - %s...' % (len(articles.items()), duration[0].text, duration[1].text)
+    for (title,article) in articles.items():
+        article.protectlog.extend(filter(lambda x: x.title == title, 
protectlog))
+        closest = None
+        if article.older:
+            closest = closestprotect(cursor, 
format_wikidate(options.olderthan), duration[0].text, article.title)
+        else:
+            closest = closestprotect(cursor, '0', duration[0].text, 
article.title)
+        if closest:
+            article.protectlog.append(closest)
+        print >>sys.stderr, '%s %d' % (title, len(article.protectlog))
+
+    options.output.write('\t'.join(['title', 'page_id', 'redirect?', 'pageview 
timestamp', 'predicted pageview', 'actual pageview', 'trending hours', 
'surprisedness', 'revision', 'timestamp', 'user type', 'username', 'editcount', 
'new?', 'protect', 'editcount_%dd+%dd' % (options.activedelta.days, 
options.activedur.days)]) + '\n')
+
+    # collect protect information
+    print >>sys.stderr, 'writing %d edits...' % (len(edits.items()))
+    for (revid,(cols,rev)) in sorted(edits.items(), key=lambda x: x[0]):
+        new = 'OLD'
+        user = users[(rev.userid, rev.usertext)]
+        if len(user.first) == 0 or user.first[0].timestamp.datetime > 
rev.timestamp.datetime + timedelta(days=-30):
+            new = 'NEW'
+            
+        revdate = rev.timestamp.datetime
+
+        article = articles[cols[0]]
+        protect = None
+        if len(article.protectlog) > 0:
+            f = filter(lambda x: x.timestamp.datetime < 
rev.timestamp.datetime, article.protectlog)
+            if len(f) > 0:
+                protect = f[0]
+
+        if protect == None or len(protect) == 0:
+            protect = 'NO_PROTECT'
+        else:
+            m = protectpat.search(protect.params)
+            if m:
+                lv = m.group(1)
+                try:
+                    expire = datetime.strptime(m.group(2), 'expires %M:%S, %d 
%B %Y')
+                    if lv == 'autoconfirmed' and expire > revdate:
+                        protect = 'SEMIPROTECT'
+                    elif lv == 'admin' and expire > revdate:
+                        protect = 'PROTECT'
+                    else:
+                        protect = 'OTHER_PROTECT'
+                except ValueError, e:
+                    if m.group(2).find('indefinite'):
+                        protect = 'INDEFINITE'
+                    else:
+                        protect = 'OTHER_PROTECT'
+            else:
+                protect = 'UNKNOWN'
+
+        output = cols + [str(x) for x in [revid, rev.timestamp.text, 
user.type, user.name,
+                                          user.editcount + len(filter(lambda 
x: x.timestamp.datetime < rev.timestamp.datetime, user.periodedits)),
+                                          new,
+                                          protect,
+                                          len(filter(lambda x: 
rev.timestamp.datetime + options.activedelta < x.timestamp.datetime and 
x.timestamp.datetime < rev.timestamp.datetime + options.activedelta + 
options.activedur, user.futureedits))
+                                          ]]
+        line = '\t'.join(output)
+        options.output.write(line + '\n')
+    options.output.write('# %s / %s edits/article\n' % (len(edits.keys()), 
len(articles.keys())))


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to