http://www.mediawiki.org/wiki/Special:Code/MediaWiki/90935

Revision: 90935
Author:   diederik
Date:     2011-06-28 04:19:32 +0000 (Tue, 28 Jun 2011)
Log Message:
-----------
Backlog of small fixes.

Modified Paths:
--------------
    trunk/tools/editor_trends/etl/sort.py
    trunk/tools/editor_trends/etl/store.py

Added Paths:
-----------
    trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py
    trunk/tools/editor_trends/analyses/network/
    trunk/tools/editor_trends/analyses/network/community_graph.py
    trunk/tools/editor_trends/analyses/network/graph_db.py
    trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py
    trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py
    trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py
    trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py
    trunk/tools/editor_trends/kaggle/training_db.py
    trunk/tools/editor_trends/kaggle/training_file.py

Removed Paths:
-------------
    trunk/tools/editor_trends/analyses/adhoc/community_graph.py
    trunk/tools/editor_trends/kaggle/training.py

Added: trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py               
                (rev 0)
+++ trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py       
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,24 @@
+from Queue import Queue
+#import cProfile
+from guppy import hpy
+h = hpy()
+
+q1, q2, q3 = Queue(), Queue(), Queue()
+h.heap()
+print 'ughh'
+for x in xrange(1000):
+    q1.put(x)
+    q2.put({})
+    q3.put([])
+    #h = hpy()
+hpy().doc
+h.heap()
+#    for x in xrange(100):
+#        a = q1.get()
+#        b = q2.get()
+#        c = q3.get()
+#    h.heap()
+
+#if __name__ == '__main__':
+#    main()
+    #cProfile.run('main()')

Deleted: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/community_graph.py 2011-06-28 
04:00:40 UTC (rev 90934)
+++ trunk/tools/editor_trends/analyses/adhoc/community_graph.py 2011-06-28 
04:19:32 UTC (rev 90935)
@@ -1,62 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-Copyright (C) 2010 by Diederik van Liere ([email protected])
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
-__email__ = 'dvanliere at gmail dot com'
-__date__ = '2011-01-10'
-__version__ = '0.1'
-
-import sys
-if '..' not in sys.path:
-    sys.path.append('..')
-
-from classes import settings
-settings = settings.Settings()
-from classes import storage
-from utils import file_utils
-
-try:
-    import psyco
-    psyco.full()
-except ImportError:
-    pass
-
-def create_articles_set(edits):
-    s = set()
-    years = edits.keys()
-    for year in years:
-        for edit in edits[year]:
-            s.add(edit['article'])
-    return s
-
-
-def create_edgelist(project, collection):
-    db = storage.init_database(rts.storage, project, collection)
-    ids = db.retrieve_distinct_keys('editor')
-    ids.sort()
-    fh = file_utils.create_txt_filehandle(settings.dataset_location, 
'%s_edgelist.csv' % project, 'w', 'utf-8')
-    for i in ids:
-        author_i = conn[collection].find_one({'editor': i})
-        article_i = create_articles_set(author_i['edits'])
-        for j in ids:
-            if i > j:
-                author_j = conn[collection].find_one({'editor': j})
-                article_j = create_articles_set(author_j['edits'])
-                common = article_i.intersection(article_j)
-                if len(common) > 0:
-                    file_utils.write_list_to_csv([i, j, len(common)], fh, 
recursive=False, newline=True)
-    fh.close()
-
-if __name__ == '__main__':
-    create_edgelist('enwiki', 'editors')

Copied: trunk/tools/editor_trends/analyses/network/community_graph.py (from rev 
88957, trunk/tools/editor_trends/analyses/adhoc/community_graph.py)
===================================================================
--- trunk/tools/editor_trends/analyses/network/community_graph.py               
                (rev 0)
+++ trunk/tools/editor_trends/analyses/network/community_graph.py       
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-10'
+__version__ = '0.1'
+
+import sys
+if '../../' not in sys.path:
+    sys.path.append('../../')
+
+from classes import settings
+settings = settings.Settings()
+from classes import storage
+from utils import file_utils
+
+try:
+    import psyco
+    psyco.full()
+except ImportError:
+    pass
+
+def create_articles_set(edits):
+    s = set()
+    years = edits.keys()
+    for year in years:
+        for edit in edits[year]:
+            s.add(edit['article'])
+    return s
+
+
+def create_edgelist(project, collection):
+    db = storage.init_database('mongo', project, collection)
+    ids = db.retrieve_distinct_keys('editor')
+    ids.sort()
+    fh = file_utils.create_txt_filehandle(settings.dataset_location, 
'%s_edgelist.csv' % project, 'w', 'utf-8')
+    for i in ids:
+        author_i = db.find_one({'editor': i})
+        if author_i != None:
+            article_i = create_articles_set(author_i['edits'])
+            for j in ids:
+                if i > j:
+                    author_j = db.find_one({'editor': j})
+                    article_j = create_articles_set(author_j['edits'])
+                    common = article_i.intersection(article_j)
+                    if len(common) > 0:
+                        file_utils.write_list_to_csv([i, j, len(common)], fh, 
recursive=False, newline=True)
+    fh.close()
+
+if __name__ == '__main__':
+    create_edgelist('wikilytics', 'enwiki_editors_raw')

Added: trunk/tools/editor_trends/analyses/network/graph_db.py
===================================================================
--- trunk/tools/editor_trends/analyses/network/graph_db.py                      
        (rev 0)
+++ trunk/tools/editor_trends/analyses/network/graph_db.py      2011-06-28 
04:19:32 UTC (rev 90935)
@@ -0,0 +1,82 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2010-11-25'
+__version__ = '0.1'
+
+import codecs
+from neo4jrestclient import GraphDatabase, NotFoundError, request
+
+request.CACHE = True
+
+class IDGenerator:
+    def __init__(self):
+        self.n = 0
+        self.ids = {}
+        self.inverted_ids = {}
+
+    def invert_dict(self):
+        return dict((v, k) for k, v in self.ids.iteritems())
+
+    def get_id(self, n):
+        if n not in self.ids:
+            self.ids[n] = self.n
+            self.n += 1
+        return self.ids[n]
+
+    def reverse_lookup(self, n):
+        if self.inverted_ids == {}:
+            self.inverted_ids = self.invert_dict()
+        return self.inverted_ids[n]
+
+
+def read_edgelist():
+    fh = 
codecs.open('C:\\Users\\diederik.vanliere\\Dropbox\\wsor\\diederik\\wikilytics_edgelist.csv',
 'r', 'utf-8')
+    for line in fh:
+        line = line.strip()
+        line = line.split('\t')
+        actor_a = line[0]
+        actor_b = line[1]
+        weight = int(line[2])
+        yield (actor_a, actor_b, weight)
+    fh.close()
+
+def init_db():
+    gdb = GraphDatabase("http://localhost:7474/db/data/")
+    return gdb
+
+def get_node(gdb, idg, node):
+    node = idg.get_id(node)
+    try:
+        #n = gdb.nodes.get('id', node)
+        n = gdb.nodes[node]
+    except NotFoundError:
+        n = gdb.nodes.create(id=node)
+        n['id'] = node
+
+    return n
+
+def load_data():
+    idg = IDGenerator()
+    gdb = init_db()
+    for (actor_a, actor_b, weight) in read_edgelist():
+        n1 = get_node(gdb, idg, actor_a)
+        n2 = get_node(gdb, idg, actor_b)
+        n1.relationships.create("cognitive_distance", n2, weight=weight)
+
+if __name__ == '__main__':
+    load_data()
+


Property changes on: trunk/tools/editor_trends/analyses/network/graph_db.py
___________________________________________________________________
Added: svn:eol-style
   + native

Added: trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py            
                (rev 0)
+++ trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py    
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-04-19'
+__version__ = '0.1'
+
+from datetime import datetime
+from dateutil.relativedelta import *
+
+
+def kaggle_correlation(var, editor, **kwargs):
+    end_date = datetime(2011, 2, 1)
+    cutoff_date = datetime(2010, 9, 1)
+    start_date = datetime(2009, 9, 1)
+    edits = editor['edit_count']
+    username = editor['username']
+
+    pre, after = 0, 0
+
+    while start_date < cutoff_date:
+        year = str(start_date.year)
+        month = str(start_date.month)
+        pre += edits.get(year, {}).get(month, {}).get('0', 0)
+        start_date = start_date + relativedelta(months= +1)
+
+    start_date = datetime(2010, 9, 1)
+    while start_date < end_date:
+        year = str(start_date.year)
+        month = str(start_date.month)
+        after += edits.get(year, {}).get(month, {}).get('0', 0)
+        start_date = start_date + relativedelta(months= +1)
+
+    if pre > 0:
+        var.add(end_date, pre, {'after': after, 'username': username})
+
+    return var

Added: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py           
                (rev 0)
+++ trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py   
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-28'
+__version__ = '0.1'
+
+
+from datetime import datetime
+from dateutil.relativedelta import *
+
+
+def kaggle_sanity_check(var, editor, **kwargs):
+    end_date = datetime(2011, 2, 1)
+    cutoff = datetime(2010, 9, 1)
+    start_date = datetime(2009, 9, 1)
+    edits = editor['edit_count']
+    active = 0
+    count = 0
+    while start_date < cutoff:
+        year = str(start_date.year)
+        month = str(start_date.month)
+        #namespaces = edits.get(year, {}).get(month, {}).keys()
+        #for ns in namespaces:
+        count += edits.get(year, {}).get(month, {}).get('0', 0)
+        start_date = start_date + relativedelta(months= +1)
+
+    if count > 0:
+        while start_date < end_date:
+            year = str(start_date.year)
+            month = str(start_date.month)
+            active += edits.get(year, {}).get(month, {}).get('0', 0)
+            start_date = start_date + relativedelta(months= +1)
+        if active > 0 :
+            var.add(cutoff, 1)
+
+    return var

Added: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py     
                        (rev 0)
+++ trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py     
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-28'
+__version__ = '0.1'
+
+
+from datetime import datetime
+from dateutil.relativedelta import *
+
+
+def kaggle_sanity_check_edits(var, editor, **kwargs):
+    end_date = datetime(2011, 2, 1)
+    start_date = datetime(2010, 9, 1)
+    edits = editor['edit_count']
+    username = editor['username']
+
+    count = 0
+    while start_date < end_date:
+        year = str(start_date.year)
+        month = str(start_date.month)
+        count += edits.get(year, {}).get(month, {}).get('0', 0)
+        start_date = start_date + relativedelta(months= +1)
+
+    if count > 0:
+        var.add(end_date, count, {'editor': username})
+
+    return var

Added: trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py          
                (rev 0)
+++ trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py  
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-25'
+__version__ = '0.1'
+
+from classes import storage
+
+def sor_newbie_treatment(var, editor, **kwargs):
+    rts = kwargs.pop('rts')
+    tenth_edit = editor['new_wikipedian']
+    title = ':%s' % editor['username']
+    collection = '%s%s_diffs_dataset' % (rts.language.code, rts.project.name)
+    db = storage.init_database(rts.storage, rts.dbname, collection)
+
+    if tenth_edit != False:
+        qualifier = {'ns': 3, 'timestamp': {'$lt':tenth_edit}}
+        observations = db.find_one(qualifier)
+    else:
+        observations = db.find_one('editor', editor)
+
+    if observations != None:
+        for obs in observations:
+            if obs['ns'] == 3:
+                values = obs.values()
+                print values
+
+
+

Modified: trunk/tools/editor_trends/etl/sort.py
===================================================================
--- trunk/tools/editor_trends/etl/sort.py       2011-06-28 04:00:40 UTC (rev 
90934)
+++ trunk/tools/editor_trends/etl/sort.py       2011-06-28 04:19:32 UTC (rev 
90935)
@@ -55,11 +55,6 @@
                 fh.close()
                 for x, d in enumerate(data):
                     d = d.strip().split('\t')
-                    #TEMP FIX:
-                    #editor = d[2]
-                    #d[2] = d[0]
-                    #d[0] = editor
-                    #END TEMP FIX
                     data[x] = d
                 #data = [d.strip() for d in data]
                 #data = [d.split('\t') for d in data]
@@ -153,7 +148,7 @@
     pbar = progressbar.ProgressBar(maxval=len(files)).start()
     tasks = multiprocessing.JoinableQueue()
     result = multiprocessing.JoinableQueue()
-    number_of_processes = 3
+    number_of_processes = 2
     sorters = [Sorter(rts, tasks, result) for x in xrange(number_of_processes)]
 
     for filename in files:
@@ -166,16 +161,14 @@
         sorter.start()
 
     ppills = number_of_processes
-    while True:
-        while ppills > 0:
-            try:
-                res = result.get(block=True)
-                if res == True:
-                    pbar.update(pbar.currval + 1)
-                else:
-                    ppills -= 1
-            except Empty:
-                pass
-        break
+    while ppills > 0:
+        try:
+            res = result.get()
+            if res == True:
+                pbar.update(pbar.currval + 1)
+            else:
+                ppills -= 1
+        except Empty:
+            pass
 
     tasks.join()

Modified: trunk/tools/editor_trends/etl/store.py
===================================================================
--- trunk/tools/editor_trends/etl/store.py      2011-06-28 04:00:40 UTC (rev 
90934)
+++ trunk/tools/editor_trends/etl/store.py      2011-06-28 04:19:32 UTC (rev 
90935)
@@ -79,7 +79,7 @@
     date = text_utils.convert_timestamp_to_datetime_utc(line[6])
     md5 = line[7]
     revert = int(line[8])
-    reverted_user = int(line[9])
+    reverted_user = line[9]
     reverted_rev_id = int(line[10])
     bot = int(line[11])
     cur_size = int(line[12])
@@ -96,12 +96,10 @@
             'cur_size':cur_size,
             'delta':delta,
             'bot':bot,
+            'reverted_user': reverted_user,
+            'reverted_rev_id': reverted_rev_id
     }
 
-    if reverted_user > -1:
-        data['reverted_user'] = reverted_user,
-        data['reverted_rev_id'] = reverted_rev_id
-
     return data
 
 

Deleted: trunk/tools/editor_trends/kaggle/training.py
===================================================================
--- trunk/tools/editor_trends/kaggle/training.py        2011-06-28 04:00:40 UTC 
(rev 90934)
+++ trunk/tools/editor_trends/kaggle/training.py        2011-06-28 04:19:32 UTC 
(rev 90935)
@@ -1,141 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-Copyright (C) 2010 by Diederik van Liere ([email protected])
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
-__email__ = 'dvanliere at gmail dot com'
-__date__ = '2011-04-12'
-__version__ = '0.1'
-
-import os
-import sys
-import cPickle
-import codecs
-from datetime import datetime
-sys.path.append('../')
-
-from classes import storage
-
-location = '/home/diederik/wikimedia/en/wiki/kaggle'
-files = os.listdir(location)
-files.reverse()
-
-max_size = 2147483648
-max_size_reached = False
-
-t0 = datetime.now()
-titles = {}
-ids = set()
-dates = {}
-edits = {}
-ignore_ids = set()
-size = 0
-cnt_obs = 0
-cutoff_date = datetime(2010, 8, 31)
-
-print 'Constructing training dataset...'
-db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
-dataset = codecs.open('training.tsv', 'w', 'utf-8')
-for filename in files:
-    if not filename.startswith('comments') and not 
filename.startswith('articles'):
-        fh = codecs.open(os.path.join(location, filename))
-        if max_size_reached == True:
-            break
-        for line in fh:
-            line = line.strip()
-            line = line.split('\t')
-            if len(line) != 12:
-                continue
-            if line[10] == '1':
-                continue
-            timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
-            if timestamp > cutoff_date:
-                continue
-            username = line[3].lower()
-            if username.endswith('bot') or username.find('script') > -1:
-                #line[10] = '1'
-                continue
-            id = line[2]
-            if id not in ids and id not in ignore_ids:
-                res = db.find_one({'editor': id})
-                if res == None:
-                    ignore_ids.add(id)
-                    continue
-            cnt_obs += 1
-            title_id = line[1]
-            ids.add(id)
-            simple_date = '%s-%s' % (timestamp.year, timestamp.month)
-            dates.setdefault(simple_date, 0)
-            dates[simple_date] += 1
-            title = line.pop(5)
-            titles[title_id] = title
-            line.append('\n')
-            line = '\t'.join(line)
-            size += len(line)
-            if size > max_size:
-                max_size_reached = True
-            dataset.write(line.decode('utf-8'))
-
-dataset.close()
-
-print 'Constructing title dataset...'
-fh = codecs.open('titles.tsv', 'w', 'utf-8')
-for id, title in titles.iteritems():
-    fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
-fh.close()
-
-
-print 'Constructing solution dataset...'
-x = 0
-fh = codecs.open('solutions.tsv', 'w', 'utf-8')
-for id in ids:
-    if id not in ignore_ids:
-        obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
-        if obs != None:
-            x += 1
-            n = obs['cum_edit_count_main_ns']
-            fh.write('%s,%s\n' % (id.decode('utf-8'), n))
-            edits.setdefault(n, 0)
-            edits[n] += 1
-        else:
-            print id
-fh.close()
-
-print 'Storing date histogram'
-fh = open('histogram_dates.bin', 'wb')
-cPickle.dump(dates, fh)
-fh.close()
-
-
-fh = open('histogram_dates.tsv', 'w')
-for date, n in dates.iteritems():
-    fh.write('%s\t%s\n' % (date, n))
-fh.close()
-
-
-print 'Storing edit histogram'
-fh = open('histogram_edits.bin', 'wb')
-cPickle.dump(edits, fh)
-fh.close()
-
-fh = open('histogram_edits.tsv', 'w')
-for edit, n in edits.iteritems():
-    fh.write('%s\t%s\n' % (edit, n))
-fh.close()
-
-
-t1 = datetime.now()
-print 'Descriptives:'
-print 'Number of editors: %s' % x
-print 'Number of edits: %s' % cnt_obs
-print 'It took %s to construct the Kaggle training set' % (t1 - t0)

Copied: trunk/tools/editor_trends/kaggle/training_db.py (from rev 89242, 
trunk/tools/editor_trends/kaggle/training.py)
===================================================================
--- trunk/tools/editor_trends/kaggle/training_db.py                             
(rev 0)
+++ trunk/tools/editor_trends/kaggle/training_db.py     2011-06-28 04:19:32 UTC 
(rev 90935)
@@ -0,0 +1,452 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])'])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-04-12'
+__version__ = '0.1'
+
+import os
+import sys
+import cPickle
+import codecs
+import random
+from itertools import izip
+from datetime import datetime
+from dateutil.relativedelta import *
+sys.path.append('../')
+
+random.seed(1024)
+from classes import storage
+
+headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
+        'md5', 'reverted', 'reverted_user_id', 'reverted_revision_id', 
'delta', 'cur_size']
+keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
+        'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 
'cur_size']
+
+max_size = 2147483648
+#max_size = 2000000
+cnt_obs = 0         #count of number of edits
+revs = {}
+titles = {}
+predictions = {}
+
+t0 = datetime.now()
+location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
+files = os.listdir(location)
+#files.sort()
+#files.reverse()
+editors_seen = {}
+cutoff_date = datetime(2010, 9, 1) #operator is >
+end_date = datetime(2011, 2, 1) #operator is <
+cutoff_date_training = datetime(2010, 1, 31) #operator is >
+end_date_training = datetime(2010, 9, 1) # operator is <
+
+class IDGenerator:
+    def __init__(self):
+        self.n = 0
+        self.ids = {}
+
+    def get_id(self, n):
+        if n not in self.ids:
+            self.ids[n] = self.n
+            self.n += 1
+        return str(self.ids[n])
+
+class RandomIDGenerator:
+    def __init__(self):
+        self.n = 0
+        self.ids = {}
+        self.rnd_ids = {}
+        self.inverted_ids = None
+
+    def invert_dict(self, dictionary):
+        return dict((v, k) for k, v in dictionary.iteritems())
+
+    def get_id(self, n):
+        if n not in self.ids:
+            self.n += 1
+            while len(self.rnd_ids) < self.n :
+                rnd_id = self.get_random_id()
+                if self.rnd_ids.get(rnd_id, False) == False:
+                    self.rnd_ids[rnd_id] = True
+                    self.ids[n] = rnd_id
+        return self.ids[n]
+
+    def get_random_id(self):
+        return random.randrange(0, 1000000)
+
+    def reverse_lookup(self, n):
+        self.inverted_ids = self.invert_dict(self.ids)
+        return self.inverted_ids[n]
+
+
+def construct_article_meta(fh_articles, files):
+    print 'Constructing title dataset...'
+    headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 
'title', 'related_page']
+    write_headers(fh_articles, headers)
+    #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id', 
'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'))
+    article_meta = {}
+    for filename in files:
+        if filename.startswith('articles_meta'):
+            fh = codecs.open(os.path.join(location, filename))
+            for line in fh:
+                line = line.strip()
+                line = line.split('\t')
+                category = line[1]
+                if category != 'List':
+                    title = line[2]
+                    title = title.split('/')
+                    article_meta.setdefault(title[-1], {})
+                    article_meta[title[-1]]['category'] = category
+                    article_meta[title[-1]]['id'] = line[0]
+            fh.close()
+    return article_meta
+
+
+def determine_active(edits, start_date, end_date):
+    active = 0
+    if start_date == datetime(2009, 9, 1):
+        if '2009' not in edits and '2010' not in edits:
+            return active
+#    elif start_date == datetime(2010, 9, 1):
+#        if '2010' not in edits and '2011' not in edits:
+#            return active
+
+
+    namespaces = ['0', '1', '2', '3', '4', '5']
+    while start_date < end_date:
+            year = str(start_date.year)
+            month = str(start_date.month)
+            for ns in namespaces:
+                active += edits.get(year, {}).get(month, {}).get(ns, 0)
+                if active > 0: #we don't need to know how many edits,just if 
active
+                    return active
+            start_date = start_date + relativedelta(months= +1)
+    return active
+
+
+def load_binary_file(filename):
+    fh = open(filename, 'rb')
+    obj = cPickle.load(fh)
+    fh.close()
+    return obj
+
+
+def convert_tz_to_mysql_tz(tz):
+    return tz.__str__()
+
+
+def check_reverter(idg, reverter):
+    try:
+        if reverter != -1:
+            reverter = idg.get_id(reverter)
+            return reverter
+    except ValueError:
+        pass
+    return -1
+
+
+def check_user_id(user_id):
+    try:
+        int(user_id)
+    except ValueError:
+        return False
+    return True
+
+
+def check_username(username):
+    username = username.lower()
+    if username.endswith('bot') or username.find('script') > -1:
+        return False #exclude more bots and scripts
+    return True
+
+
+def determine_editors(db):
+    start_date_pre = datetime(2009, 9, 1)
+    end_date_pre = datetime(2010, 9, 1)
+    end_date = datetime(2011, 2, 1)
+    pre_editors = set()
+    post_editors = set()
+    cursor = db.find({}, 'first_edit,edit_count,user_id,username')
+    x, y, z = 0, 0, 0
+    for editor in cursor:
+        x += 1
+        if 'first_edit' not in editor:
+            continue
+        if editor['first_edit'] > end_date_pre:
+            continue
+        if check_username(editor['username']) == False:
+            continue
+        if check_user_id(editor['user_id']) == False:
+            continue
+
+        active_pre = determine_active(editor['edit_count'], start_date_pre, 
end_date_pre)
+        if x % 100000 == 0:
+            print 'Retrieved %s pre_editors / %s post_editors / %s total 
editors...' % (y, z, x)
+
+        if active_pre == 0:
+            continue #exclude editors who are not active in the year before 
the cutoff date
+        else:
+            active_post = determine_active(editor['edit_count'], end_date_pre, 
end_date)
+            if active_post == 0:
+                pre_editors.add(editor['user_id'])
+                y += 1
+            else:
+                post_editors.add(editor['user_id'])
+                z += 1
+    print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % 
(y, z, x)
+    return pre_editors, post_editors
+
+
def write_headers(fh, headers):
    """Write `headers` as one tab-separated, newline-terminated row."""
    last = len(headers) - 1
    for i, column in enumerate(headers):
        fh.write('%s' % column)
        fh.write('\t' if i != last else '\n')
+
def write_revision(dataset, revision):
    """Write one revision as a tab-separated row.

    Column order is taken from the module-level `keys` list. Integer fields
    are stringified in place; every value is a byte string decoded as UTF-8
    before writing.
    """
    last = len(keys) - 1
    for i, column in enumerate(keys):
        value = revision[column]
        if type(value) == type(0):
            value = str(value)
            revision[column] = value
        dataset.write('%s' % value.decode('utf-8'))
        dataset.write('\t' if i != last else '\n')
+
+
# Build (or reload from the on-disk pickle cache) the pre/post editor sets.
print 'Constructing training dataset...'
db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
print 'Loading editors...'
if not os.path.exists('set_a.bin'):
    pre_editors, post_editors = determine_editors(db_dataset)
    fh = open('set_a.bin', 'wb')
    cPickle.dump(pre_editors, fh)
    fh.close()

    fh = open('set_b.bin', 'wb')
    cPickle.dump(post_editors, fh)
    fh.close()
else:
    pre_editors = load_binary_file('set_a.bin')
    post_editors = load_binary_file('set_b.bin')


dataset = codecs.open('training.tsv', 'w', 'utf-8')
write_headers(dataset, headers)
idg = RandomIDGenerator()

namespaces = IDGenerator()
print 'Parsing revisions...'
db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
# NOTE(review): `seen_editors` is never read below; the loop uses
# `editors_seen`, presumably initialised earlier in this file -- verify.
seen_editors = {}
# Interleave the two sets: post editors get odd keys, an equally sized
# slice of pre editors gets even keys, so sorting the keys alternates them.
editors = {}
x = 1
for editor in post_editors:
    #print editor
    editors[x] = editor
    x += 2
x = 0
z = len(post_editors)
for y, editor in enumerate(pre_editors):
    #print editor
    editors[x] = editor
    x += 2
    if z == y:
        break

editor_keys = editors.keys()
editor_keys.sort()
for key in editor_keys:
    #print editors
    #for editor in editors:
    editor = editors[key]
    #print editor
    # Process each editor at most once.
    go = editors_seen.get(editor, True)
    if go:
        editors_seen[editor] = False
        user_id = idg.get_id(editor)
        print 'Parsing editor %s (%s) ...' % (editor, user_id)
        revisions = db_raw.find({'user_id': str(editor)})

        predictions.setdefault(user_id, {})
        predictions[user_id].setdefault('solution', 0)
        predictions[user_id].setdefault('training', 0)

        for revision in revisions:
            revision['user_id'] = user_id #recode id to make it harder to look up answers
            if revision['ns'] < 0 or revision['ns'] > 5:
                continue
            #revision['ns'] = namespaces.get_id(revision['ns'])
            timestamp = revision['date']
            revision['date'] = convert_tz_to_mysql_tz(timestamp)



            # Tally post-cutoff edits (solution) and late-training-window
            # edits (training) for the prediction files written later.
            if timestamp > cutoff_date:
                #print editor, user_id, timestamp, revision['date']
                if timestamp < end_date:
                    predictions[user_id]['solution'] += 1
            elif timestamp > cutoff_date_training:
                if timestamp < end_date_training:
                    predictions[user_id]['training'] += 1

            if timestamp > cutoff_date: #exclude edits after cut off date
                continue
            if revision['revert'] == 1:
                revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1))
            revision.pop('_id')
            revision.pop('username')
            # Remember which articles / revisions made it into the dataset.
            titles[revision['article_id']] = True
            revs[revision['rev_id']] = True
            write_revision(dataset, revision)
            cnt_obs += 1
            if cnt_obs % 10000 == 0:
                print 'Parsed %s revisions...' % cnt_obs
    # Stop once the output file grows past the configured maximum.
    if dataset.tell() > max_size:
        break
if dataset.tell() > max_size:
    print 'Reached maximum filesize...'
else:
    print 'Parsed all available editors in post set...'
dataset.close()
+
+
+
# Solution file: one row per recoded editor with the post-cutoff edit count.
print 'Constructing solution dataset...'
fh = codecs.open('solutions.csv', 'w', 'utf-8')
editor_keys = predictions.keys()
editor_keys.sort()
fh.write('%s,%s\n' % ('user_id', 'solution'))
for key in editor_keys:
    fh.write('%s,%s\n' % (key, predictions[key]['solution']))
    print key, predictions[key]['solution']
fh.close()


# Test file: the edit count in the late training window, per recoded editor.
print 'Constructing test dataset...'
fh = codecs.open('test.csv', 'w', 'utf-8')
fh.write('%s,%s\n' % ('user_id', 'test'))
for key, value in predictions.iteritems():
    fh.write('%s,%s\n' % (key, value['training']))
fh.close()

# Article file: one metadata row per article referenced from training.tsv.
print 'Constructing article file...'
fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
article_meta = construct_article_meta(fh_articles, files)
categories = IDGenerator()
for filename in files:
    if filename.startswith('articles') and not filename.startswith('articles_meta'):
        fh = codecs.open(os.path.join(location, filename))
        for line in fh:
            line = line.strip()
            line = line.split('\t')
            if len(line) == 6:
                article_id = int(line[0])
                # `titles` holds True flags; pop so each article is written once.
                title = titles.pop(article_id, None)
                if title:
                    title = line[-1]
                    meta = article_meta.get(title, None)
                    parent_id = '-1'
                    category = -1
                    redirect = line[4]
                    if redirect == 'False':
                        redirect = '0'
                    else:
                        redirect = '1'
                    line[4] = redirect
                    if meta:
                        parent_id = meta['id']
                        category = meta['category']


                    # NOTE(review): get_id appears to return an int, but
                    # '\t'.join below needs strings -- looks like this
                    # needs str(); verify against IDGenerator.
                    line[1] = categories.get_id(category)
                    tz = datetime.strptime(line[2], '%Y-%m-%dT%H:%M:%SZ')
                    line[2] = convert_tz_to_mysql_tz(tz)
                    line[-1] = line[-1].decode('utf-8')
                    line.append(parent_id)
                    line.append('\n')
                    fh_articles.write('\t'.join(line))
        fh.close()
fh_articles.close()
+
+
# Comment file: the comment text for every revision written to training.tsv.
print 'Constructing comment dataset...'
fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
fh_comments.write('%s\t%s\n' % ('revision_id', 'comment'))
cnt = len(revs.keys())  # stop scanning once every written revision is matched
for filename in files:
    if filename.startswith('comments'):
        fh = codecs.open(os.path.join(location, filename))
        for line in fh:
            if cnt == 0:
                break
            line = line.strip()
            line = line.split('\t')
            if len(line) == 2:  #some lines are missing rev id, not sure why.
                try:
                    rev_id = int(line[0])
                    exists = revs.get(rev_id, None)
                    if exists:
                        fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8')))
                        cnt -= 1
                except (ValueError, KeyError), error:
                    print error
        fh.close()
fh_comments.close()

# Persist the id generator so the obfuscated ids can be reversed later.
print 'Storing random ids...'
fh = open('random_ids.bin', 'wb')
cPickle.dump(idg, fh)
fh.close()

# Static namespace lookup table.
fh = codecs.open('namespaces.tsv', 'w', 'utf-8')
write_headers(fh, ['key', 'namespace'])
namespaces = {'0':'Main',
              '1':'Talk',
              '2':'User',
              '3':'User Talk',
              '4':'Wikipedia',
              '5':'Wikipedia Talk'
              }
for key, value in namespaces.iteritems():
    fh.write('%s\t%s\n' % (key, value))
fh.close()

# Category id -> name lookup produced while writing titles.tsv.
fh = codecs.open('categories.tsv', 'w', 'utf-8')
write_headers(fh, ['id', 'name'])
for key, value in categories.ids.iteritems():
    fh.write('%s\t%s\n' % (value, key))
fh.close()

# Summary statistics, written both to disk and to stdout.
fh = open('descriptives.tsv', 'w')
fh.write('Number of unique editors: %s\n' % idg.n)
fh.write('Number of revisions: %s\n' % cnt_obs)
fh.write('Number of pre-editors: %s\n' % len(pre_editors))
fh.write('Number of post-editors: %s\n' % len(post_editors))
fh.write('Number of editors with zero edits after August 30th. 2010: %s' % (len(pre_editors) - len(post_editors)))
fh.close()


t1 = datetime.now()
print 'Descriptives:'
print 'Number of unique editors: %s' % idg.n
print 'Number of revisions: %s' % cnt_obs
print 'Number of pre-editors: %s' % len(pre_editors)
print 'Number of post-editors: %s' % len(post_editors)
print 'Number of editors with zero edits after August 30th. 2010: %s' % (len(pre_editors) - len(post_editors))
print 'It took %s to construct the Kaggle training set' % (t1 - t0)

Added: trunk/tools/editor_trends/kaggle/training_file.py
===================================================================
--- trunk/tools/editor_trends/kaggle/training_file.py                           
(rev 0)
+++ trunk/tools/editor_trends/kaggle/training_file.py   2011-06-28 04:19:32 UTC 
(rev 90935)
@@ -0,0 +1,430 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])'])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-04-12'
+__version__ = '0.1'
+
+import os
+import sys
+import cPickle
+import codecs
+import random
+from itertools import izip_longest
+from datetime import datetime
+from dateutil.relativedelta import *
+sys.path.append('../')
+import resource
+
+random.seed(1024)
+from classes import storage
+
# Column names as written to training.tsv, and the corresponding dict keys
# on a revision record (same order, so column i maps to keys[i]).
headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
        'md5', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
        'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']

size = 0 #current size of file
#max_size = 2147483648
max_size = 5000000  # stop writing training.tsv once `size` exceeds this
editors_seen = {}   # editor id -> False once processed (duplicate guard)
cnt_obs = 0         #count of number of edits
revs = {}           # rev_id -> True for every revision written out
titles = {}         # article_id -> True for every article touched
predictions = {}    # recoded user_id -> {'solution': n, 'training': n}

t0 = datetime.now()
location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
txt_files = '/home/diederik/wikimedia/xml/en/wiki/sorted/'
files = os.listdir(location)
# The sorted csv shards are bucketed by editor id modulo this many handles.
max_file_handles = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
#files.sort()
#files.reverse()

cutoff_date = datetime(2010, 8, 31) #operator is >
end_date = datetime(2011, 2, 1) #operator is <
cutoff_date_training = datetime(2010, 1, 31) #operator is >
end_date_training = datetime(2010, 9, 1) # operator is <
+
+
class IDGenerator:
    """Maps arbitrary keys to stable pseudo-random integer ids in [0, 1000000)."""

    def __init__(self):
        self.n = 0                # number of distinct keys seen so far
        self.ids = {}             # key -> random id
        self.rnd_ids = {}         # random id -> True (already handed out)
        self.inverted_ids = None  # cache used by reverse_lookup

    def invert_dict(self, dictionary):
        """Return a new dict with keys and values swapped."""
        inverted = {}
        for key, value in dictionary.items():
            inverted[value] = key
        return inverted

    def get_id(self, n):
        """Return the random id for key n, allocating a fresh one if unseen."""
        if n not in self.ids:
            self.n += 1
            # Draw until we find a random id that is not yet taken.
            while len(self.rnd_ids) < self.n:
                candidate = self.get_random_id()
                if not self.rnd_ids.get(candidate, False):
                    self.rnd_ids[candidate] = True
                    self.ids[n] = candidate
        return self.ids[n]

    def get_random_id(self):
        """Draw one random integer in [0, 1000000)."""
        return random.randrange(0, 1000000)

    def reverse_lookup(self, n):
        """Return the original key that was assigned random id n."""
        self.inverted_ids = self.invert_dict(self.ids)
        return self.inverted_ids[n]
+
+
+def construct_article_meta(fh_articles, files):
+    print 'Constructing title dataset...'
+    headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 
'title', 'related_page']
+    write_headers(fh_articles, headers)
+    #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id', 
'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'))
+    article_meta = {}
+    for filename in files:
+        if filename.startswith('articles_meta'):
+            fh = codecs.open(os.path.join(location, filename))
+            for line in fh:
+                line = line.strip()
+                line = line.split('\t')
+                category = line[1]
+                if category != 'List':
+                    title = line[2]
+                    title = title.split('/')
+                    article_meta.setdefault(title[-1], {})
+                    article_meta[title[-1]]['category'] = category
+                    article_meta[title[-1]]['id'] = line[0]
+            fh.close()
+    return article_meta
+
+
def determine_active(edits, start_date, end_date):
    """Return a positive edit count as soon as the editor shows any activity
    in namespaces 0-5 between start_date and end_date, else 0.

    `edits` is nested as edits[year][month][namespace] -> count, with string
    keys throughout.
    """
    active = 0
    namespaces = ['0', '1', '2', '3', '4', '5']
    # Cheap pre-check for the two known observation windows: if neither
    # relevant year appears at all, the editor cannot be active.
    if start_date == datetime(2009, 9, 1):
        if '2009' not in edits and '2010' not in edits:
            return active
    elif start_date == datetime(2010, 9, 1):
        if '2010' not in edits and '2011' not in edits:
            return active

    current = start_date
    while current < end_date:
        year = str(current.year)
        month = str(current.month)
        per_month = edits.get(year, {}).get(month, {})
        for ns in namespaces:
            active += per_month.get(ns, 0)
            if active > 0:
                # We only need to know *whether* the editor was active.
                return active
        current = current + relativedelta(months= +1)
    return active
+
def load_binary_file(filename):
    """Deserialize and return the pickled object stored in `filename`.

    Bug fix: the path was hard-coded to 'set_b.bin', ignoring `filename`,
    so loading 'set_a.bin' silently returned set_b.bin's contents and the
    pre/post editor sets came back identical.
    """
    fh = open(filename, 'rb')
    obj = cPickle.load(fh)
    fh.close()
    return obj
+
+
def convert_tz_to_mysql_tz(tz):
    """Format a compact 'YYYYMMDD...' timestamp string as 'YYYY-MM-DD...'.

    The value is first coerced to str, then dashes are inserted after the
    year and month. NOTE(review): assumes the input carries no separators
    of its own -- verify against the callers.
    """
    iso = str(tz)
    return '%s-%s-%s' % (iso[:4], iso[4:6], iso[6:])
+
+
def check_reverter(idg, reverter):
    """Recode a reverted-user value through the id generator.

    Returns -1 when the value is not an integer or is the -1 sentinel,
    otherwise the recoded id.
    """
    try:
        value = int(reverter)
        if value == -1:
            return -1
        return idg.get_id(value)
    except ValueError:
        return -1
+
+
def check_user_id(user_id):
    """Report whether `user_id` looks like a numeric id."""
    try:
        int(user_id)
    except ValueError:
        return False
    else:
        return True
+
+
def check_username(username):
    """Filter out likely bot/script accounts by name heuristics: a name
    ending with 'bot' or containing 'script' (case-insensitive) fails."""
    name = username.lower()
    return not (name.endswith('bot') or 'script' in name)
+
+
+def determine_editors(db):
+    start_date_pre = datetime(2009, 9, 1)
+    end_date_pre = datetime(2010, 9, 1)
+    end_date = datetime(2011, 2, 1)
+    pre_editors = set()
+    post_editors = set()
+    #cursor = db.find({'date': {'$gte': start_date_pre, '$lt': end_date_pre}}, 
'first_edit,edit_count,user_id,username')
+    cursor = db.find({}, 'first_edit,edit_count,user_id,username')
+    x, y, z = 0, 0, 0
+    for editor in cursor:
+        x += 1
+        if 'first_edit' not in editor:
+            continue
+        if editor['first_edit'] >= end_date_pre:
+            continue
+        if check_username(editor['username']) == False:
+            continue
+        if check_user_id(editor['editor']) == False:
+            continue
+
+        #print editor['edit_count']
+        active = determine_active(editor['edit_count'], start_date_pre, 
end_date_pre)
+        if active > 0:
+            pre_editors.add(editor['editor'])
+            y += 1
+        active = determine_active(editor['edit_count'], end_date_pre, end_date)
+        if active > 0:
+            post_editors.add(editor['editor'])
+            z += 1
+        if x % 100000 == 0:
+            print 'Retrieved %s pre_editors / %s post_editors / %s total 
editors...' % (y, z, x)
+
+    #set_a = pre_editors.difference(post_editors)
+    post_editors = pre_editors.intersection(post_editors)
+
+    return pre_editors, post_editors
+
+
def write_headers(fh, headers):
    """Write the column names in `headers` as one tab-separated,
    newline-terminated row.

    Bug fix: the terminator test compared against len(keys) -- the
    unrelated module-level revision-column list -- so any header row whose
    length differed from len(keys) (e.g. the 7-column titles.tsv header)
    was never newline-terminated. Compare against len(headers) instead.
    """
    for i, key in enumerate(headers):
        fh.write('%s' % key)
        if (i + 1) != len(headers):
            fh.write('\t')
        else:
            fh.write('\n')
+
def write_revision(dataset, revision):
    """Write one revision as a tab-separated row and return the number of
    characters written for the field values (separators excluded).

    Column order is taken from the module-level `keys` list; integer fields
    are stringified in place and values are decoded as UTF-8 before writing.
    """
    size = 0
    last = len(keys) - 1
    for i, column in enumerate(keys):
        value = revision[column]
        if type(value) == type(0):
            value = str(value)
            revision[column] = value
        dataset.write('%s' % value.decode('utf-8'))
        size += len(value)
        dataset.write('\t' if i != last else '\n')
    return size
+
+
# Build (or reload from the on-disk pickle cache) the pre/post editor sets.
print 'Constructing training dataset...'
db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
print 'Loading editors...'
if not os.path.exists('set_a.bin'):
    pre_editors, post_editors = determine_editors(db_dataset)
    fh = open('set_a.bin', 'wb')
    cPickle.dump(pre_editors, fh)
    fh.close()

    fh = open('set_b.bin', 'wb')
    cPickle.dump(post_editors, fh)
    fh.close()
else:
    pre_editors = load_binary_file('set_a.bin')
    post_editors = load_binary_file('set_b.bin')


dataset = codecs.open('training.tsv', 'w', 'utf-8')
write_headers(dataset, headers)
idg = IDGenerator()



print 'Parsing revisions...'
db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
# NOTE(review): `seen_editors` is written once and never read; the loop
# below uses the module-level `editors_seen` dict instead.
seen_editors = {}
# Alternate one editor from each set per iteration; izip_longest pads the
# shorter set with None.
# NOTE(review): a None fill value reaches int(editor) below and would raise
# TypeError once the sets differ in size -- the commented-out `if editor:`
# guard looks like it was meant to handle this; verify.
for editors in izip_longest(post_editors, pre_editors, fillvalue=None):
    for editor in editors:
        # Process each editor at most once.
        go = editors_seen.get(editor, True)
        if go:
        #if editor:
            editors_seen[editor] = False
            print 'Parsing editor %s...' % editor
            #revisions = db_raw.find({'user_id': editor})
            # Revisions live in csv shards bucketed by editor id modulo the
            # number of available file handles.
            file_id = int(editor) % max_file_handles
            fh = codecs.open(os.path.join(txt_files, '%s.csv' % file_id), 'r', 'utf-8')
            for line in fh:
                line = line.strip()
                line = line.split('\t')
                if line[0] != editor:
                    continue
                # Column layout of the sorted csv shards; indices 3, 5 and
                # 11 are unused here.
                revision = {}
                revision['user_id'] = int(line[0])
                revision['article_id'] = int(line[1])
                revision['rev_id'] = int(line[2])
                revision['ns'] = line[4]
                revision['date'] = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
                revision['hash'] = line[7]
                revision['revert'] = line[8]
                revision['reverted_user'] = line[9]
                revision['reverted_rev_id'] = line[10]
                revision['cur_size'] = line[12]
                revision['delta'] = line[13]
                #print line
                #print revision

                #'user_id', 'article_id', 'rev_id', 'ns', 'date',
                #'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size'
                #print 'Editor %s made % edits' % (editor, len(revisions))
                #for revision in revisions:
                user_id = idg.get_id(revision['user_id'])
                revision['user_id'] = user_id #recode id to make it harder to look up answers
                # NOTE(review): revision['ns'] is a string here, so this
                # Python 2 str-vs-int comparison is always False and the
                # filter never fires -- likely intended int(line[4]).
                if revision['ns'] < 0:
                    continue
                timestamp = revision['date']
                #revision['date'] = convert_tz_to_mysql_tz(timestamp)

                predictions.setdefault(user_id, {})
                predictions[user_id].setdefault('solution', 0)
                predictions[user_id].setdefault('training', 0)

                # Tally post-cutoff edits (solution) and late-training-window
                # edits (training) for the prediction files written below.
                if timestamp > cutoff_date and timestamp < end_date:
                    predictions[user_id]['solution'] += 1
                elif timestamp > cutoff_date_training and timestamp < end_date_training:
                    predictions[user_id]['training'] += 1
                if timestamp > cutoff_date: #exclude edits after cut off date
                    continue

                revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1))
                #revision.pop('_id')
                #revision.pop('username')
                revision['date'] = revision['date'].__str__()
                # Remember which articles / revisions made it into the dataset.
                titles[revision['article_id']] = True
                revs[revision['rev_id']] = True
                size += write_revision(dataset, revision)
                cnt_obs += 1
                if cnt_obs % 10000 == 0:
                    print 'Parsed %s revisions...' % cnt_obs
            fh.close()
    # Stop once the output grows past the configured maximum.
    if size > max_size:
        break
if size > max_size:
    print 'Reached maximum filesize...'
else:
    print 'Parsed all available editors in post set...'
dataset.close()
+
+
+
# Solution file: post-cutoff edit count per recoded editor.
print 'Constructing solution dataset...'
fh = codecs.open('solutions.csv', 'w', 'utf-8')
# NOTE(review): this rebinds the module-level `keys` (the revision column
# list), which write_headers reads via len(keys) when titles.tsv is
# written below -- verify, and consider a different local name.
keys = predictions.keys()
keys.sort()
fh.write('%s,%s\n' % ('editor_id', 'solution'))
for key in keys:
    fh.write('%s,%s\n' % (key, predictions[key]['solution']))
fh.close()


# Test file: late-training-window edit count per recoded editor.
print 'Constructing test dataset...'
fh = codecs.open('test.csv', 'w', 'utf-8')
fh.write('%s,%s\n' % ('editor_id', 'test'))
for key, value in predictions.iteritems():
    fh.write('%s,%s\n' % (key, value['training']))
fh.close()
+
+
# Article file: one metadata row per article referenced from training.tsv.
print 'Constructing article file...'
fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
article_meta = construct_article_meta(fh_articles, files)
for filename in files:
    if filename.startswith('articles') and not filename.startswith('articles_meta'):
        fh = codecs.open(os.path.join(location, filename))
        for line in fh:
            line = line.strip()
            line = line.split('\t')
            if len(line) == 6:
                article_id = int(line[0])
                # `titles` maps article_id -> True for referenced articles.
                title = titles.get(article_id, None)
                if title:
                    title = line[-1]
                    meta = article_meta.get(title, None)
                    parent_id = -1
                    category = 'Null'
                    if meta:
                        parent_id = meta['id']
                        category = meta['category']

                    line[1] = category
                    # line[2] is still a timestamp string at this point.
                    line[2] = convert_tz_to_mysql_tz(line[2])
                    line[-1] = line[-1].decode('utf-8')
                    line.append(str(parent_id))
                    line.append('\n')
                    fh_articles.write('\t'.join(line))
        fh.close()
fh_articles.close()
+
+
# Comment file: the comment text for every revision written to training.tsv.
print 'Constructing comment dataset...'
fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
fh_comments.write('%s\t%s\n' % ('rev_id', 'text'))
cnt = len(revs.keys())  # stop scanning once every written revision is matched
for filename in files:
    if filename.startswith('comments'):
        fh = codecs.open(os.path.join(location, filename))
        for line in fh:
            if cnt == 0:
                break
            line = line.strip()
            line = line.split('\t')
            if len(line) == 2:  #some lines are missing rev id, not sure why.
                try:
                    rev_id = int(line[0])
                    exists = revs.get(rev_id, None)
                    if exists:
                        fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8')))
                        cnt -= 1
                except (ValueError, KeyError), error:
                    print error
        fh.close()
fh_comments.close()

# Persist the id generator so the obfuscated ids can be reversed later.
print 'Storing random ids...'
fh = open('random_ids.bin', 'wb')
cPickle.dump(idg, fh)
fh.close()


# Summary statistics, written both to disk and to stdout.
fh = open('descriptives.tsv', 'w')
fh.write('Number of unique editors: %s\n' % idg.n)
fh.write('Number of revisions: %s\n' % cnt_obs)
fh.write('Number of pre-editors: %s\n' % len(pre_editors))
fh.write('Number of post-editors: %s\n' % len(post_editors))
fh.write('Number of editors with zero edits after August 30th. 2010: %s' % (len(pre_editors) - len(post_editors)))
fh.close()


t1 = datetime.now()
print 'Descriptives:'
print 'Number of unique editors: %s' % idg.n
print 'Number of revisions: %s' % cnt_obs
print 'Number of pre-editors: %s' % len(pre_editors)
print 'Number of post-editors: %s' % len(post_editors)
print 'Number of editors with zero edits after August 30th. 2010: %s' % (len(pre_editors) - len(post_editors))
print 'It took %s to construct the Kaggle training set' % (t1 - t0)


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to