http://www.mediawiki.org/wiki/Special:Code/MediaWiki/90935
Revision: 90935
Author: diederik
Date: 2011-06-28 04:19:32 +0000 (Tue, 28 Jun 2011)
Log Message:
-----------
Backlog of small fixes.
Modified Paths:
--------------
trunk/tools/editor_trends/etl/sort.py
trunk/tools/editor_trends/etl/store.py
Added Paths:
-----------
trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py
trunk/tools/editor_trends/analyses/network/
trunk/tools/editor_trends/analyses/network/community_graph.py
trunk/tools/editor_trends/analyses/network/graph_db.py
trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py
trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py
trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py
trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py
trunk/tools/editor_trends/kaggle/training_db.py
trunk/tools/editor_trends/kaggle/training_file.py
Removed Paths:
-------------
trunk/tools/editor_trends/analyses/adhoc/community_graph.py
trunk/tools/editor_trends/kaggle/training.py
Added: trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py
(rev 0)
+++ trunk/tools/editor_trends/analyses/adhoc/benchmarker_queue.py
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,24 @@
from Queue import Queue
#import cProfile
from guppy import hpy

# Ad-hoc memory benchmark: how much heap do 1000 ints/dicts/lists cost
# when parked in Queue objects?  Snapshots taken before and after filling.
h = hpy()

int_queue, dict_queue, list_queue = Queue(), Queue(), Queue()
h.heap()
print('ughh')
for item in xrange(1000):
    int_queue.put(item)
    dict_queue.put({})
    list_queue.put([])
    #h = hpy()
hpy().doc
h.heap()
# for x in xrange(100):
#     a = int_queue.get()
#     b = dict_queue.get()
#     c = list_queue.get()
#     h.heap()

#if __name__ == '__main__':
#    main()
    #cProfile.run('main()')
Deleted: trunk/tools/editor_trends/analyses/adhoc/community_graph.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/community_graph.py 2011-06-28
04:00:40 UTC (rev 90934)
+++ trunk/tools/editor_trends/analyses/adhoc/community_graph.py 2011-06-28
04:19:32 UTC (rev 90935)
@@ -1,62 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-Copyright (C) 2010 by Diederik van Liere ([email protected])
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
-__email__ = 'dvanliere at gmail dot com'
-__date__ = '2011-01-10'
-__version__ = '0.1'
-
-import sys
-if '..' not in sys.path:
- sys.path.append('..')
-
-from classes import settings
-settings = settings.Settings()
-from classes import storage
-from utils import file_utils
-
-try:
- import psyco
- psyco.full()
-except ImportError:
- pass
-
-def create_articles_set(edits):
- s = set()
- years = edits.keys()
- for year in years:
- for edit in edits[year]:
- s.add(edit['article'])
- return s
-
-
-def create_edgelist(project, collection):
- db = storage.init_database(rts.storage, project, collection)
- ids = db.retrieve_distinct_keys('editor')
- ids.sort()
- fh = file_utils.create_txt_filehandle(settings.dataset_location,
'%s_edgelist.csv' % project, 'w', 'utf-8')
- for i in ids:
- author_i = conn[collection].find_one({'editor': i})
- article_i = create_articles_set(author_i['edits'])
- for j in ids:
- if i > j:
- author_j = conn[collection].find_one({'editor': j})
- article_j = create_articles_set(author_j['edits'])
- common = article_i.intersection(article_j)
- if len(common) > 0:
- file_utils.write_list_to_csv([i, j, len(common)], fh,
recursive=False, newline=True)
- fh.close()
-
-if __name__ == '__main__':
- create_edgelist('enwiki', 'editors')
Copied: trunk/tools/editor_trends/analyses/network/community_graph.py (from rev
88957, trunk/tools/editor_trends/analyses/adhoc/community_graph.py)
===================================================================
--- trunk/tools/editor_trends/analyses/network/community_graph.py
(rev 0)
+++ trunk/tools/editor_trends/analyses/network/community_graph.py
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-10'
+__version__ = '0.1'
+
+import sys
+if '../../' not in sys.path:
+ sys.path.append('../../')
+
+from classes import settings
+settings = settings.Settings()
+from classes import storage
+from utils import file_utils
+
+try:
+ import psyco
+ psyco.full()
+except ImportError:
+ pass
+
def create_articles_set(edits):
    """Return the set of distinct article ids an editor has touched.

    *edits* maps a year (str) to a list of edit dicts, each carrying an
    'article' key.  A set comprehension replaces the original manual
    nested-loop accumulation; the intermediate ``edits.keys()`` list was
    unnecessary (dicts iterate their keys directly).
    """
    return {edit['article'] for year in edits for edit in edits[year]}
+
+
def create_edgelist(project, collection):
    """Write a weighted edgelist (editor_i, editor_j, #shared articles) to CSV.

    For every editor pair with i > j, the weight is the number of articles
    both have edited; pairs with no overlap are skipped.
    NOTE(review): O(n^2) in the number of editors with one find_one per
    pair - fine for small editor sets, expensive otherwise.
    """
    db = storage.init_database('mongo', project, collection)
    ids = db.retrieve_distinct_keys('editor')
    ids.sort()
    fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                          '%s_edgelist.csv' % project,
                                          'w', 'utf-8')
    for i in ids:
        author_i = db.find_one({'editor': i})
        if author_i is None:
            continue
        article_i = create_articles_set(author_i['edits'])
        for j in ids:
            if i > j:
                author_j = db.find_one({'editor': j})
                if author_j is None:
                    # BUG FIX: mirror the author_i guard; the original
                    # raised TypeError on author_j['edits'] when missing.
                    continue
                article_j = create_articles_set(author_j['edits'])
                common = article_i.intersection(article_j)
                if len(common) > 0:
                    file_utils.write_list_to_csv([i, j, len(common)], fh,
                                                 recursive=False, newline=True)
    fh.close()
+
+if __name__ == '__main__':
+ create_edgelist('wikilytics', 'enwiki_editors_raw')
Added: trunk/tools/editor_trends/analyses/network/graph_db.py
===================================================================
--- trunk/tools/editor_trends/analyses/network/graph_db.py
(rev 0)
+++ trunk/tools/editor_trends/analyses/network/graph_db.py 2011-06-28
04:19:32 UTC (rev 90935)
@@ -0,0 +1,82 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2010-11-25'
+__version__ = '0.1'
+
+import codecs
+from neo4jrestclient import GraphDatabase, NotFoundError
+
+neo4jrestclient.request.CACHE = True
+
class IDGenerator:
    """Map arbitrary keys to dense sequential integer ids (0, 1, 2, ...)."""

    def __init__(self):
        self.n = 0               # next id to hand out
        self.ids = {}            # key -> id
        self.inverted_ids = {}   # id -> key; lazy cache, see reverse_lookup

    def invert_dict(self):
        """Return the id -> key inversion of the current mapping."""
        # items() instead of iteritems() keeps this working on Python 2 and 3.
        return dict((v, k) for k, v in self.ids.items())

    def get_id(self, n):
        """Return the stable integer id for key *n*, allocating on first use."""
        if n not in self.ids:
            self.ids[n] = self.n
            self.n += 1
        return self.ids[n]

    def reverse_lookup(self, n):
        """Return the key that owns id *n*.

        BUG FIX: the original built the inverted map once and never
        refreshed it, so ids allocated after the first lookup raised
        KeyError.  Rebuild whenever the cache is empty or stale.
        """
        if len(self.inverted_ids) != len(self.ids):
            self.inverted_ids = self.invert_dict()
        return self.inverted_ids[n]
+
+
def read_edgelist(filename='C:\\Users\\diederik.vanliere\\Dropbox\\wsor\\diederik\\wikilytics_edgelist.csv'):
    """Yield (actor_a, actor_b, weight) tuples from a tab-separated edgelist.

    *filename* defaults to the original hard-coded location, so existing
    callers are unaffected.  The with-block guarantees the handle is closed
    even if the consumer abandons the generator early; the original only
    closed the file after full exhaustion.
    """
    with codecs.open(filename, 'r', 'utf-8') as fh:
        for line in fh:
            fields = line.strip().split('\t')
            yield (fields[0], fields[1], int(fields[2]))
+
def init_db():
    """Open a connection to the local neo4j REST endpoint."""
    return GraphDatabase("http://localhost:7474/db/data/")
+
def get_node(gdb, idg, node):
    """Fetch-or-create the graph node for actor key *node*.

    The key is first translated to a dense integer id via *idg*; EAFP:
    try the lookup and create the node on NotFoundError.
    """
    # FIX: the original reassigned the parameter `node` to its id, shadowing
    # the actor key - a separate local keeps both meanings distinct.
    node_id = idg.get_id(node)
    try:
        #n = gdb.nodes.get('id', node_id)
        n = gdb.nodes[node_id]
    except NotFoundError:
        n = gdb.nodes.create(id=node_id)
        # NOTE(review): create(id=...) should already set the property;
        # the explicit assignment is kept to preserve original behaviour.
        n['id'] = node_id

    return n
+
def load_data():
    """Stream the edgelist into neo4j as weighted cognitive_distance edges."""
    id_generator = IDGenerator()
    gdb = init_db()
    for source, target, weight in read_edgelist():
        node_a = get_node(gdb, id_generator, source)
        node_b = get_node(gdb, id_generator, target)
        node_a.relationships.create("cognitive_distance", node_b, weight=weight)
+
+if __name__ == '__main__':
+ load_data()
+
Property changes on: trunk/tools/editor_trends/analyses/network/graph_db.py
___________________________________________________________________
Added: svn:eol-style
+ native
Added: trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py
(rev 0)
+++ trunk/tools/editor_trends/analyses/plugins/kaggle_correlation.py
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-04-19'
+__version__ = '0.1'
+
+from datetime import datetime
+from dateutil.relativedelta import *
+
+
def kaggle_correlation(var, editor, **kwargs):
    """Tally an editor's main-namespace edits before/after the Kaggle cutoff.

    Counts namespace-0 edits in the year before the cutoff (2009-09 ..
    2010-08, 'pre') and in the prediction window (2010-09 .. 2011-01,
    'after').  Editors with no pre-cutoff activity are skipped.  Returns
    *var* with one observation added per qualifying editor.
    """
    end_date = datetime(2011, 2, 1)
    cutoff_date = datetime(2010, 9, 1)
    edits = editor['edit_count']
    username = editor['username']

    def _ns0_edits(start, stop):
        # Sum namespace-0 edits for every month in [start, stop).  All dates
        # here are day-1, so a plain stdlib month step replaces the
        # dateutil.relativedelta dependency.
        total = 0
        current = start
        while current < stop:
            total += edits.get(str(current.year), {}).get(str(current.month), {}).get('0', 0)
            if current.month == 12:
                current = datetime(current.year + 1, 1, 1)
            else:
                current = datetime(current.year, current.month + 1, 1)
        return total

    pre = _ns0_edits(datetime(2009, 9, 1), cutoff_date)
    after = _ns0_edits(cutoff_date, end_date)

    if pre > 0:
        var.add(end_date, pre, {'after': after, 'username': username})

    return var
Added: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py
(rev 0)
+++ trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check.py
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-28'
+__version__ = '0.1'
+
+
+from datetime import datetime
+from dateutil.relativedelta import *
+
+
def kaggle_sanity_check(var, editor, **kwargs):
    """Count editors active both before and after the Kaggle cutoff date.

    If the editor has namespace-0 edits in 2009-09 .. 2010-08 AND in
    2010-09 .. 2011-01, one observation is added at the cutoff date.
    Returns *var*.
    """
    end_date = datetime(2011, 2, 1)
    cutoff = datetime(2010, 9, 1)
    edits = editor['edit_count']

    def _ns0_edits(start, stop):
        # All dates are day-1, so a plain stdlib month step replaces the
        # dateutil.relativedelta dependency.
        total = 0
        current = start
        while current < stop:
            total += edits.get(str(current.year), {}).get(str(current.month), {}).get('0', 0)
            if current.month == 12:
                current = datetime(current.year + 1, 1, 1)
            else:
                current = datetime(current.year, current.month + 1, 1)
        return total

    if _ns0_edits(datetime(2009, 9, 1), cutoff) > 0:
        if _ns0_edits(cutoff, end_date) > 0:
            var.add(cutoff, 1)

    return var
Added: trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py
(rev 0)
+++ trunk/tools/editor_trends/analyses/plugins/kaggle_sanity_check_edits.py
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-28'
+__version__ = '0.1'
+
+
+from datetime import datetime
+from dateutil.relativedelta import *
+
+
def kaggle_sanity_check_edits(var, editor, **kwargs):
    """Record an editor's namespace-0 edit count in the prediction window.

    Sums namespace-0 edits from 2010-09 through 2011-01 and, when positive,
    stores the count keyed by username.  Returns *var*.
    """
    end_date = datetime(2011, 2, 1)
    edits = editor['edit_count']
    username = editor['username']

    count = 0
    current = datetime(2010, 9, 1)
    while current < end_date:
        count += edits.get(str(current.year), {}).get(str(current.month), {}).get('0', 0)
        # plain stdlib month step instead of dateutil.relativedelta
        if current.month == 12:
            current = datetime(current.year + 1, 1, 1)
        else:
            current = datetime(current.year, current.month + 1, 1)

    if count > 0:
        var.add(end_date, count, {'editor': username})

    return var
Added: trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py
(rev 0)
+++ trunk/tools/editor_trends/analyses/plugins/sor_newbie_treatment.py
2011-06-28 04:19:32 UTC (rev 90935)
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-01-25'
+__version__ = '0.1'
+
+from classes import storage
+
def sor_newbie_treatment(editor, var, **kwargs):
    """Inspect user-talk (ns 3) messages a newbie received before edit ten.

    NOTE(review): this plugin looks unfinished - it only prints observation
    values and never updates *var*; confirm intent before relying on it.
    """
    rts = kwargs.pop('rts')
    tenth_edit = editor['new_wikipedian']
    # removed: unused local `title` (':%s' % editor['username'])
    collection = '%s%s_diffs_dataset' % (rts.language.code, rts.project.name)
    db = storage.init_database(rts.storage, rts.dbname, collection)

    # new_wikipedian is either a timestamp or the literal False; identity
    # comparison is the idiomatic form of the original `!= False`.
    if tenth_edit is not False:
        qualifier = {'ns': 3, 'timestamp': {'$lt': tenth_edit}}
        observations = db.find_one(qualifier)
    else:
        observations = db.find_one('editor', editor)

    if observations is not None:
        # assumes find_one returns an iterable of observation dicts here -
        # TODO confirm against classes.storage
        for obs in observations:
            if obs['ns'] == 3:
                values = obs.values()
                print(values)
+
+
Modified: trunk/tools/editor_trends/etl/sort.py
===================================================================
--- trunk/tools/editor_trends/etl/sort.py 2011-06-28 04:00:40 UTC (rev
90934)
+++ trunk/tools/editor_trends/etl/sort.py 2011-06-28 04:19:32 UTC (rev
90935)
@@ -55,11 +55,6 @@
fh.close()
for x, d in enumerate(data):
d = d.strip().split('\t')
- #TEMP FIX:
- #editor = d[2]
- #d[2] = d[0]
- #d[0] = editor
- #END TEMP FIX
data[x] = d
#data = [d.strip() for d in data]
#data = [d.split('\t') for d in data]
@@ -153,7 +148,7 @@
pbar = progressbar.ProgressBar(maxval=len(files)).start()
tasks = multiprocessing.JoinableQueue()
result = multiprocessing.JoinableQueue()
- number_of_processes = 3
+ number_of_processes = 2
sorters = [Sorter(rts, tasks, result) for x in xrange(number_of_processes)]
for filename in files:
@@ -166,16 +161,14 @@
sorter.start()
ppills = number_of_processes
- while True:
- while ppills > 0:
- try:
- res = result.get(block=True)
- if res == True:
- pbar.update(pbar.currval + 1)
- else:
- ppills -= 1
- except Empty:
- pass
- break
+ while ppills > 0:
+ try:
+ res = result.get()
+ if res == True:
+ pbar.update(pbar.currval + 1)
+ else:
+ ppills -= 1
+ except Empty:
+ pass
tasks.join()
Modified: trunk/tools/editor_trends/etl/store.py
===================================================================
--- trunk/tools/editor_trends/etl/store.py 2011-06-28 04:00:40 UTC (rev
90934)
+++ trunk/tools/editor_trends/etl/store.py 2011-06-28 04:19:32 UTC (rev
90935)
@@ -79,7 +79,7 @@
date = text_utils.convert_timestamp_to_datetime_utc(line[6])
md5 = line[7]
revert = int(line[8])
- reverted_user = int(line[9])
+ reverted_user = line[9]
reverted_rev_id = int(line[10])
bot = int(line[11])
cur_size = int(line[12])
@@ -96,12 +96,10 @@
'cur_size':cur_size,
'delta':delta,
'bot':bot,
+ 'reverted_user': reverted_user,
+ 'reverted_rev_id': reverted_rev_id
}
- if reverted_user > -1:
- data['reverted_user'] = reverted_user,
- data['reverted_rev_id'] = reverted_rev_id
-
return data
Deleted: trunk/tools/editor_trends/kaggle/training.py
===================================================================
--- trunk/tools/editor_trends/kaggle/training.py 2011-06-28 04:00:40 UTC
(rev 90934)
+++ trunk/tools/editor_trends/kaggle/training.py 2011-06-28 04:19:32 UTC
(rev 90935)
@@ -1,141 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-Copyright (C) 2010 by Diederik van Liere ([email protected])
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
-__email__ = 'dvanliere at gmail dot com'
-__date__ = '2011-04-12'
-__version__ = '0.1'
-
-import os
-import sys
-import cPickle
-import codecs
-from datetime import datetime
-sys.path.append('../')
-
-from classes import storage
-
-location = '/home/diederik/wikimedia/en/wiki/kaggle'
-files = os.listdir(location)
-files.reverse()
-
-max_size = 2147483648
-max_size_reached = False
-
-t0 = datetime.now()
-titles = {}
-ids = set()
-dates = {}
-edits = {}
-ignore_ids = set()
-size = 0
-cnt_obs = 0
-cutoff_date = datetime(2010, 8, 31)
-
-print 'Constructing training dataset...'
-db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
-dataset = codecs.open('training.tsv', 'w', 'utf-8')
-for filename in files:
- if not filename.startswith('comments') and not
filename.startswith('articles'):
- fh = codecs.open(os.path.join(location, filename))
- if max_size_reached == True:
- break
- for line in fh:
- line = line.strip()
- line = line.split('\t')
- if len(line) != 12:
- continue
- if line[10] == '1':
- continue
- timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
- if timestamp > cutoff_date:
- continue
- username = line[3].lower()
- if username.endswith('bot') or username.find('script') > -1:
- #line[10] = '1'
- continue
- id = line[2]
- if id not in ids and id not in ignore_ids:
- res = db.find_one({'editor': id})
- if res == None:
- ignore_ids.add(id)
- continue
- cnt_obs += 1
- title_id = line[1]
- ids.add(id)
- simple_date = '%s-%s' % (timestamp.year, timestamp.month)
- dates.setdefault(simple_date, 0)
- dates[simple_date] += 1
- title = line.pop(5)
- titles[title_id] = title
- line.append('\n')
- line = '\t'.join(line)
- size += len(line)
- if size > max_size:
- max_size_reached = True
- dataset.write(line.decode('utf-8'))
-
-dataset.close()
-
-print 'Constructing title dataset...'
-fh = codecs.open('titles.tsv', 'w', 'utf-8')
-for id, title in titles.iteritems():
- fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
-fh.close()
-
-
-print 'Constructing solution dataset...'
-x = 0
-fh = codecs.open('solutions.tsv', 'w', 'utf-8')
-for id in ids:
- if id not in ignore_ids:
- obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
- if obs != None:
- x += 1
- n = obs['cum_edit_count_main_ns']
- fh.write('%s,%s\n' % (id.decode('utf-8'), n))
- edits.setdefault(n, 0)
- edits[n] += 1
- else:
- print id
-fh.close()
-
-print 'Storing date histogram'
-fh = open('histogram_dates.bin', 'wb')
-cPickle.dump(dates, fh)
-fh.close()
-
-
-fh = open('histogram_dates.tsv', 'w')
-for date, n in dates.iteritems():
- fh.write('%s\t%s\n' % (date, n))
-fh.close()
-
-
-print 'Storing edit histogram'
-fh = open('histogram_edits.bin', 'wb')
-cPickle.dump(edits, fh)
-fh.close()
-
-fh = open('histogram_edits.tsv', 'w')
-for edit, n in edits.iteritems():
- fh.write('%s\t%s\n' % (edit, n))
-fh.close()
-
-
-t1 = datetime.now()
-print 'Descriptives:'
-print 'Number of editors: %s' % x
-print 'Number of edits: %s' % cnt_obs
-print 'It took %s to construct the Kaggle training set' % (t1 - t0)
Copied: trunk/tools/editor_trends/kaggle/training_db.py (from rev 89242,
trunk/tools/editor_trends/kaggle/training.py)
===================================================================
--- trunk/tools/editor_trends/kaggle/training_db.py
(rev 0)
+++ trunk/tools/editor_trends/kaggle/training_db.py 2011-06-28 04:19:32 UTC
(rev 90935)
@@ -0,0 +1,452 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])'])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-04-12'
+__version__ = '0.1'
+
+import os
+import sys
+import cPickle
+import codecs
+import random
+from itertools import izip
+from datetime import datetime
+from dateutil.relativedelta import *
+sys.path.append('../')
+
+random.seed(1024)
+from classes import storage
+
+headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
+ 'md5', 'reverted', 'reverted_user_id', 'reverted_revision_id',
'delta', 'cur_size']
+keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
+ 'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta',
'cur_size']
+
+max_size = 2147483648
+#max_size = 2000000
+cnt_obs = 0 #count of number of edits
+revs = {}
+titles = {}
+predictions = {}
+
+t0 = datetime.now()
+location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
+files = os.listdir(location)
+#files.sort()
+#files.reverse()
+editors_seen = {}
+cutoff_date = datetime(2010, 9, 1) #operator is >
+end_date = datetime(2011, 2, 1) #operator is <
+cutoff_date_training = datetime(2010, 1, 31) #operator is >
+end_date_training = datetime(2010, 9, 1) # operator is <
+
class IDGenerator:
    """Hand out dense sequential ids - returned as strings - for arbitrary keys."""

    def __init__(self):
        self.n = 0     # next unused id
        self.ids = {}  # key -> int id

    def get_id(self, n):
        """Return the stable string id for key *n*, allocating on first use."""
        if n not in self.ids:
            self.ids[n], self.n = self.n, self.n + 1
        return str(self.ids[n])
+
class RandomIDGenerator:
    """Map keys to unique pseudo-random integer ids drawn from [0, 1000000)."""

    def __init__(self):
        self.n = 0            # number of ids handed out so far
        self.ids = {}         # key -> random id
        self.rnd_ids = {}     # random id -> True, for collision detection
        self.inverted_ids = None

    def invert_dict(self, dictionary):
        """Return the {value: key} inversion of *dictionary*."""
        return dict((v, k) for k, v in dictionary.iteritems())

    def get_id(self, n):
        """Return the stable random id for key *n*, drawing a fresh unused
        random number the first time the key is seen."""
        if n not in self.ids:
            self.n += 1
            # keep drawing until we land on an id nobody holds yet
            while len(self.rnd_ids) < self.n:
                candidate = self.get_random_id()
                if self.rnd_ids.get(candidate, False) == False:
                    self.rnd_ids[candidate] = True
                    self.ids[n] = candidate
        return self.ids[n]

    def get_random_id(self):
        """Draw one candidate id from the configured range."""
        return random.randrange(0, 1000000)

    def reverse_lookup(self, n):
        """Return the key that owns random id *n* (rebuilds the inverse map)."""
        self.inverted_ids = self.invert_dict(self.ids)
        return self.inverted_ids[n]
+
+
def construct_article_meta(fh_articles, files):
    """Build {title: {'category': ..., 'id': ...}} from the articles_meta
    dumps, writing the header row for the articles output file first.

    'List' articles are skipped; only the last path component of each title
    is used as the key.
    """
    print('Constructing title dataset...')
    headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page']
    write_headers(fh_articles, headers)
    article_meta = {}
    for filename in files:
        if not filename.startswith('articles_meta'):
            continue
        fh = codecs.open(os.path.join(location, filename))
        for line in fh:
            fields = line.strip().split('\t')
            category = fields[1]
            if category == 'List':
                continue
            key = fields[2].split('/')[-1]
            entry = article_meta.setdefault(key, {})
            entry['category'] = category
            entry['id'] = fields[0]
        fh.close()
    return article_meta
+
+
def determine_active(edits, start_date, end_date):
    """Return a positive count as soon as the editor has any edit in
    namespaces 0-5 between start_date (inclusive) and end_date (exclusive);
    0 otherwise.

    The early return fires on the first edit found - callers use this as
    an activity flag, not a full count.
    """
    # Fast path for the fixed pre-cutoff window: it can only contain
    # 2009/2010 edits, so missing both years means inactive.
    if start_date == datetime(2009, 9, 1):
        if '2009' not in edits and '2010' not in edits:
            return 0

    namespaces = ['0', '1', '2', '3', '4', '5']
    active = 0
    current = start_date
    while current < end_date:
        year = str(current.year)
        month = str(current.month)
        for ns in namespaces:
            active += edits.get(year, {}).get(month, {}).get(ns, 0)
            if active > 0:  # we only need to know *if* active, not how much
                return active
        # stdlib month increment (drops the dateutil.relativedelta
        # dependency; all dates used here are day-1)
        if current.month == 12:
            current = datetime(current.year + 1, 1, 1)
        else:
            current = datetime(current.year, current.month + 1, 1)
    return active
+
+
def load_binary_file(filename):
    """Unpickle and return the object stored in *filename*.

    The with-block guarantees the handle is closed even if cPickle.load
    raises; the original leaked the handle on error.
    """
    with open(filename, 'rb') as fh:
        return cPickle.load(fh)
+
+
def convert_tz_to_mysql_tz(tz):
    """Render a datetime in MySQL format ('YYYY-MM-DD HH:MM:SS').

    str(tz) is the idiomatic spelling of the original tz.__str__().
    """
    return str(tz)
+
+
def check_reverter(idg, reverter):
    """Translate a reverting user's key to its recoded id.

    Returns -1 when *reverter* is the -1 sentinel or when the id generator
    rejects the key with a ValueError.
    """
    if reverter == -1:
        return -1
    try:
        return idg.get_id(reverter)
    except ValueError:
        return -1
+
+
def check_user_id(user_id):
    """Return True when *user_id* parses as an integer, False otherwise."""
    try:
        int(user_id)
        return True
    except ValueError:
        return False
+
+
+def check_username(username):
+ username = username.lower()
+ if username.endswith('bot') or username.find('script') > -1:
+ return False #exclude more bots and scripts
+ return True
+
+
def determine_editors(db):
    """Split editors into pre/post sets around the 2010-09-01 cutoff.

    pre_editors:  active in the year before the cutoff but NOT afterwards.
    post_editors: active both before and after the cutoff.
    Editors lacking a first_edit, with a post-cutoff first_edit, with a
    bot/script-like username, or with a non-numeric user_id are skipped.
    Returns (pre_editors, post_editors) as sets of user ids.
    """
    start_date_pre = datetime(2009, 9, 1)
    end_date_pre = datetime(2010, 9, 1)
    end_date = datetime(2011, 2, 1)
    pre_editors = set()
    post_editors = set()
    cursor = db.find({}, 'first_edit,edit_count,user_id,username')
    x, y, z = 0, 0, 0
    for editor in cursor:
        x += 1
        if 'first_edit' not in editor:
            continue
        if editor['first_edit'] > end_date_pre:
            continue
        if check_username(editor['username']) == False:
            continue
        if check_user_id(editor['user_id']) == False:
            continue

        active_pre = determine_active(editor['edit_count'], start_date_pre, end_date_pre)
        if x % 100000 == 0:
            print('Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x))

        if active_pre == 0:
            continue  # exclude editors not active in the year before the cutoff date
        active_post = determine_active(editor['edit_count'], end_date_pre, end_date)
        # BUG FIX: the original called pre_editors.add(user_id) /
        # post_editors.add(user_id) with an undefined name `user_id`,
        # raising NameError at runtime; the editor's own id belongs here.
        if active_post == 0:
            pre_editors.add(editor['user_id'])
            y += 1
        else:
            post_editors.add(editor['user_id'])
            z += 1
    print('Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x))
    return pre_editors, post_editors
+
+
def write_headers(fh, headers):
    """Write *headers* to *fh* as a single tab-separated line ending in a
    newline.  Writes nothing at all for an empty header list, matching the
    original loop-based implementation."""
    if headers:
        fh.write('\t'.join('%s' % header for header in headers) + '\n')
+
def write_revision(dataset, revision):
    """Write one revision as a tab-separated line, following the module-level
    `keys` column order.

    Integer fields are stringified in place - the revision dict is mutated,
    exactly as the original did.
    """
    last = len(keys) - 1
    for position, key in enumerate(keys):
        if type(revision[key]) == type(0):
            revision[key] = str(revision[key])
        dataset.write('%s' % revision[key].decode('utf-8'))
        dataset.write('\t' if position != last else '\n')
+
+
+print 'Constructing training dataset...'
+db_dataset = storage.init_database('mongo', 'wikilytics',
'enwiki_editors_dataset')
+print 'Loading editors...'
+if not os.path.exists('set_a.bin'):
+ pre_editors, post_editors = determine_editors(db_dataset)
+ fh = open('set_a.bin', 'wb')
+ cPickle.dump(pre_editors, fh)
+ fh.close()
+
+ fh = open('set_b.bin', 'wb')
+ cPickle.dump(post_editors, fh)
+ fh.close()
+else:
+ pre_editors = load_binary_file('set_a.bin')
+ post_editors = load_binary_file('set_b.bin')
+
+
+dataset = codecs.open('training.tsv', 'w', 'utf-8')
+write_headers(dataset, headers)
+idg = RandomIDGenerator()
+
+namespaces = IDGenerator()
+print 'Parsing revisions...'
+db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
+seen_editors = {}
+editors = {}
+x = 1
+for editor in post_editors:
+ #print editor
+ editors[x] = editor
+ x += 2
+x = 0
+z = len(post_editors)
+for y, editor in enumerate(pre_editors):
+ #print editor
+ editors[x] = editor
+ x += 2
+ if z == y:
+ break
+
+editor_keys = editors.keys()
+editor_keys.sort()
+for key in editor_keys:
+ #print editors
+ #for editor in editors:
+ editor = editors[key]
+ #print editor
+ go = editors_seen.get(editor, True)
+ if go:
+ editors_seen[editor] = False
+ user_id = idg.get_id(editor)
+ print 'Parsing editor %s (%s) ...' % (editor, user_id)
+ revisions = db_raw.find({'user_id': str(editor)})
+
+ predictions.setdefault(user_id, {})
+ predictions[user_id].setdefault('solution', 0)
+ predictions[user_id].setdefault('training', 0)
+
+ for revision in revisions:
+ revision['user_id'] = user_id #recode id to make it harder to look
up answers
+ if revision['ns'] < 0 or revision['ns'] > 5:
+ continue
+ #revision['ns'] = namespaces.get_id(revision['ns'])
+ timestamp = revision['date']
+ revision['date'] = convert_tz_to_mysql_tz(timestamp)
+
+
+
+ if timestamp > cutoff_date:
+ #print editor, user_id, timestamp, revision['date']
+ if timestamp < end_date:
+ predictions[user_id]['solution'] += 1
+ elif timestamp > cutoff_date_training:
+ if timestamp < end_date_training:
+ predictions[user_id]['training'] += 1
+
+ if timestamp > cutoff_date: #exclude edits after cut off date
+ continue
+ if revision['revert'] == 1:
+ revision['reverted_user'] = check_reverter(idg,
revision.get('reverted_user', -1))
+ revision.pop('_id')
+ revision.pop('username')
+ titles[revision['article_id']] = True
+ revs[revision['rev_id']] = True
+ write_revision(dataset, revision)
+ cnt_obs += 1
+ if cnt_obs % 10000 == 0:
+ print 'Parsed %s revisions...' % cnt_obs
+ if dataset.tell() > max_size:
+ break
+if dataset.tell() > max_size:
+ print 'Reached maximum filesize...'
+else:
+ print 'Parsed all available editors in post set...'
+dataset.close()
+
+
+
+print 'Constructing solution dataset...'
+fh = codecs.open('solutions.csv', 'w', 'utf-8')
+editor_keys = predictions.keys()
+editor_keys.sort()
+fh.write('%s,%s\n' % ('user_id', 'solution'))
+for key in editor_keys:
+ fh.write('%s,%s\n' % (key, predictions[key]['solution']))
+ print key, predictions[key]['solution']
+fh.close()
+
+
+print 'Constructing test dataset...'
+fh = codecs.open('test.csv', 'w', 'utf-8')
+fh.write('%s,%s\n' % ('user_id', 'test'))
+for key, value in predictions.iteritems():
+ fh.write('%s,%s\n' % (key, value['training']))
+fh.close()
+
+print 'Constructing article file...'
+fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
+article_meta = construct_article_meta(fh_articles, files)
+categories = IDGenerator()
+for filename in files:
+ if filename.startswith('articles') and not
filename.startswith('articles_meta'):
+ fh = codecs.open(os.path.join(location, filename))
+ for line in fh:
+ line = line.strip()
+ line = line.split('\t')
+ if len(line) == 6:
+ article_id = int(line[0])
+ title = titles.pop(article_id, None)
+ if title:
+ title = line[-1]
+ meta = article_meta.get(title, None)
+ parent_id = '-1'
+ category = -1
+ redirect = line[4]
+ if redirect == 'False':
+ redirect = '0'
+ else:
+ redirect = '1'
+ line[4] = redirect
+ if meta:
+ parent_id = meta['id']
+ category = meta['category']
+
+
+ line[1] = categories.get_id(category)
+ tz = datetime.strptime(line[2], '%Y-%m-%dT%H:%M:%SZ')
+ line[2] = convert_tz_to_mysql_tz(tz)
+ line[-1] = line[-1].decode('utf-8')
+ line.append(parent_id)
+ line.append('\n')
+ fh_articles.write('\t'.join(line))
+ fh.close()
+fh_articles.close()
+
+
+print 'Constructing comment dataset...'
+fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
+fh_comments.write('%s\t%s\n' % ('revision_id', 'comment'))
+cnt = len(revs.keys())
+for filename in files:
+ if filename.startswith('comments'):
+ fh = codecs.open(os.path.join(location, filename))
+ for line in fh:
+ if cnt == 0:
+ break
+ line = line.strip()
+ line = line.split('\t')
+ if len(line) == 2: #some lines are missing rev id, not sure why.
+ try:
+ rev_id = int(line[0])
+ exists = revs.get(rev_id, None)
+ if exists:
+ fh_comments.write('%s\t%s\n' % (rev_id,
line[1].decode('utf-8')))
+ cnt -= 1
+ except (ValueError, KeyError), error:
+ print error
+ fh.close()
+fh_comments.close()
+
+print 'Storing random ids...'
+fh = open('random_ids.bin', 'wb')
+cPickle.dump(idg, fh)
+fh.close()
+
+fh = codecs.open('namespaces.tsv', 'w', 'utf-8')
+write_headers(fh, ['key', 'namespace'])
+namespaces = {'0':'Main',
+ '1':'Talk',
+ '2':'User',
+ '3':'User Talk',
+ '4':'Wikipedia',
+ '5':'Wikipedia Talk'
+ }
+for key, value in namespaces.iteritems():
+ fh.write('%s\t%s\n' % (key, value))
+fh.close()
+
+fh = codecs.open('categories.tsv', 'w', 'utf-8')
+write_headers(fh, ['id', 'name'])
+for key, value in categories.ids.iteritems():
+ fh.write('%s\t%s\n' % (value, key))
+fh.close()
+
+fh = open('descriptives.tsv', 'w')
+fh.write('Number of unique editors: %s\n' % idg.n)
+fh.write('Number of revisions: %s\n' % cnt_obs)
+fh.write('Number of pre-editors: %s\n' % len(pre_editors))
+fh.write('Number of post-editors: %s\n' % len(post_editors))
+fh.write('Number of editors with zero edits after August 30th. 2010: %s' %
(len(pre_editors) - len(post_editors)))
+fh.close()
+
+
+t1 = datetime.now()
+print 'Descriptives:'
+print 'Number of unique editors: %s' % idg.n
+print 'Number of revisions: %s' % cnt_obs
+print 'Number of pre-editors: %s' % len(pre_editors)
+print 'Number of post-editors: %s' % len(post_editors)
+print 'Number of editors with zero edits after August 30th. 2010: %s' %
(len(pre_editors) - len(post_editors))
+print 'It took %s to construct the Kaggle training set' % (t1 - t0)
Added: trunk/tools/editor_trends/kaggle/training_file.py
===================================================================
--- trunk/tools/editor_trends/kaggle/training_file.py
(rev 0)
+++ trunk/tools/editor_trends/kaggle/training_file.py 2011-06-28 04:19:32 UTC
(rev 90935)
@@ -0,0 +1,430 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+'''
+Copyright (C) 2010 by Diederik van Liere ([email protected])
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License version 2
+as published by the Free Software Foundation.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+See the GNU General Public License for more details, at
+http://www.fsf.org/licenses/gpl.html
+'''
+
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])'])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-04-12'
+__version__ = '0.1'
+
+import os
+import sys
+import cPickle
+import codecs
+import random
+from itertools import izip_longest
+from datetime import datetime
+from dateutil.relativedelta import *
+sys.path.append('../')
+import resource
+
+random.seed(1024)
+from classes import storage
+
# Column titles written to the training TSV header line.
headers = ['user_id', 'article_id', 'revision_id', 'namespace', 'timestamp',
           'md5', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']
# Matching keys of the per-revision dict built in the parse loop; write_headers
# and write_revision both iterate this list, so order must mirror `headers`.
keys = ['user_id', 'article_id', 'rev_id', 'ns', 'date',
        'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size']

size = 0 #current size of file
#max_size = 2147483648
max_size = 5000000  # stop emitting revisions once this many characters are written
editors_seen = {}   # editor id -> False once processed (duplicate guard across both sets)
cnt_obs = 0 #count of number of edits
revs = {}    # rev_id -> True for every written revision (filters the comments file)
titles = {}  # article_id -> True for every touched article (filters the titles file)
predictions = {}  # recoded user id -> {'solution': n, 'training': n} edit counts

t0 = datetime.now()
# NOTE(review): hard-coded local paths; the script is tied to this machine layout.
location = '/home/diederik/wikimedia/xml/en/wiki/txt/'
txt_files = '/home/diederik/wikimedia/xml/en/wiki/sorted/'
files = os.listdir(location)
# Revisions were bucketed into (user_id % max_file_handles) csv files by an
# earlier sort step -- presumably with this same formula; verify it matches.
max_file_handles = resource.getrlimit(resource.RLIMIT_NOFILE)[0] - 100
#files.sort()
#files.reverse()

cutoff_date = datetime(2010, 8, 31) #operator is >
end_date = datetime(2011, 2, 1) #operator is <
cutoff_date_training = datetime(2010, 1, 31) #operator is >
end_date_training = datetime(2010, 9, 1) # operator is <
+
+
+class IDGenerator:
+ def __init__(self):
+ self.n = 0
+ self.ids = {}
+ self.rnd_ids = {}
+ self.inverted_ids = None
+
+ def invert_dict(self, dictionary):
+ return dict((v, k) for k, v in dictionary.iteritems())
+
+ def get_id(self, n):
+ if n not in self.ids:
+ self.n += 1
+ while len(self.rnd_ids) < self.n :
+ rnd_id = self.get_random_id()
+ if self.rnd_ids.get(rnd_id, False) == False:
+ self.rnd_ids[rnd_id] = True
+ self.ids[n] = rnd_id
+ return self.ids[n]
+
+ def get_random_id(self):
+ return random.randrange(0, 1000000)
+
+ def reverse_lookup(self, n):
+ self.inverted_ids = self.invert_dict(self.ids)
+ return self.inverted_ids[n]
+
+
+def construct_article_meta(fh_articles, files):
+ print 'Constructing title dataset...'
+ headers = ['article_id', 'category', 'timestamp', 'namespace', 'redirect',
'title', 'related_page']
+ write_headers(fh_articles, headers)
+ #fh_articles.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % ('article_id',
'category', 'timestamp', 'namespace', 'redirect', 'title', 'related_page'))
+ article_meta = {}
+ for filename in files:
+ if filename.startswith('articles_meta'):
+ fh = codecs.open(os.path.join(location, filename))
+ for line in fh:
+ line = line.strip()
+ line = line.split('\t')
+ category = line[1]
+ if category != 'List':
+ title = line[2]
+ title = title.split('/')
+ article_meta.setdefault(title[-1], {})
+ article_meta[title[-1]]['category'] = category
+ article_meta[title[-1]]['id'] = line[0]
+ fh.close()
+ return article_meta
+
+
def determine_active(edits, start_date, end_date):
    """Return a non-zero edit count as soon as `edits` -- a nested
    {year: {month: {namespace: count}}} dict keyed by *strings* -- shows any
    activity in [start_date, end_date); return 0 otherwise.

    Only namespaces 0-5 are considered, and the scan short-circuits on the
    first month with edits: callers only care whether the editor was active.
    """
    namespaces = ['0', '1', '2', '3', '4', '5']
    # Fast path for the two windows used by determine_editors: if neither
    # year of the window appears at all, the editor cannot be active.
    if start_date == datetime(2009, 9, 1):
        if '2009' not in edits and '2010' not in edits:
            return 0
    elif start_date == datetime(2010, 9, 1):
        if '2010' not in edits and '2011' not in edits:
            return 0

    total = 0
    current = start_date
    while current < end_date:
        per_month = edits.get(str(current.year), {}).get(str(current.month), {})
        for ns in namespaces:
            total += per_month.get(ns, 0)
            if total > 0:
                # Any activity is enough; the exact count is irrelevant.
                return total
        current = current + relativedelta(months= +1)
    return total
+
+def load_binary_file(filename):
+ fh = open('set_b.bin', 'rb')
+ obj = cPickle.load(fh)
+ fh.close()
+ return obj
+
+
def convert_tz_to_mysql_tz(tz):
    """Reformat a compact timestamp ('YYYYMMDD...') into MySQL style
    ('YYYY-MM-DD...'); everything past the day is passed through unchanged."""
    text = str(tz)
    return '%s-%s-%s' % (text[:4], text[4:6], text[6:])
+
+
def check_reverter(idg, reverter):
    """Recode a reverted-user field through the id generator.

    Returns -1 for the sentinel -1 ("no reverter") and for values that do
    not parse as an integer; otherwise returns idg.get_id(reverter).
    """
    try:
        user_id = int(reverter)
    except ValueError:
        # Empty or malformed field: treat as "no reverter".
        return -1
    if user_id == -1:
        return -1
    return idg.get_id(user_id)
+
+
def check_user_id(user_id):
    """True when `user_id` parses as an integer (registered editor),
    False for anything malformed."""
    try:
        int(user_id)
        return True
    except ValueError:
        return False
+
+
def check_username(username):
    """Heuristic bot filter: reject names ending in 'bot' or containing
    'script' (case-insensitive); accept everything else."""
    lowered = username.lower()
    looks_automated = lowered.endswith('bot') or 'script' in lowered
    return not looks_automated
+
+
def determine_editors(db):
    # Build two editor-id sets from the Mongo dataset collection:
    #   pre_editors  -- active at least once between Sep 2009 and Aug 2010
    #   post_editors -- pre-cutoff editors who were ALSO active afterwards
    # NOTE(review): the cursor projects 'first_edit,edit_count,user_id,username'
    # but the loop reads editor['editor'] -- confirm the returned documents
    # actually carry an 'editor' field, otherwise this raises KeyError.
    start_date_pre = datetime(2009, 9, 1)
    end_date_pre = datetime(2010, 9, 1)
    end_date = datetime(2011, 2, 1)
    pre_editors = set()
    post_editors = set()
    #cursor = db.find({'date': {'$gte': start_date_pre, '$lt': end_date_pre}}, 'first_edit,edit_count,user_id,username')
    cursor = db.find({}, 'first_edit,edit_count,user_id,username')
    x, y, z = 0, 0, 0  # total scanned / pre-active / post-active counters
    for editor in cursor:
        x += 1
        if 'first_edit' not in editor:
            continue
        # Only editors whose first edit predates the training cutoff qualify.
        if editor['first_edit'] >= end_date_pre:
            continue
        if check_username(editor['username']) == False:
            continue
        if check_user_id(editor['editor']) == False:
            continue

        #print editor['edit_count']
        active = determine_active(editor['edit_count'], start_date_pre, end_date_pre)
        if active > 0:
            pre_editors.add(editor['editor'])
            y += 1
        active = determine_active(editor['edit_count'], end_date_pre, end_date)
        if active > 0:
            post_editors.add(editor['editor'])
            z += 1
        if x % 100000 == 0:
            print 'Retrieved %s pre_editors / %s post_editors / %s total editors...' % (y, z, x)

    #set_a = pre_editors.difference(post_editors)
    # Keep only post-editors that were also active before the cutoff.
    post_editors = pre_editors.intersection(post_editors)

    return pre_editors, post_editors
+
+
+def write_headers(fh, headers):
+ for i, key in enumerate(headers):
+ fh.write('%s' % key)
+ if (i + 1) != len(keys):
+ fh.write('\t')
+ else:
+ fh.write('\n')
+
def write_revision(dataset, revision):
    """Write one revision dict as a tab-separated row, column order taken
    from the module-level `keys` list, and return the number of characters
    of field data written (separators excluded)."""
    chars_written = 0
    last = len(keys) - 1
    for i, key in enumerate(keys):
        value = revision[key]
        if type(value) is int:
            value = str(value)
            revision[key] = value  # keep the original's in-place coercion
        dataset.write('%s' % value.decode('utf-8'))
        chars_written += len(value)
        # Tab between fields, newline after the last one.
        dataset.write('\t' if i != last else '\n')
    return chars_written
+
+
# --- Script entry: determine editor sets (cached as pickles between runs) ----
print 'Constructing training dataset...'
db_dataset = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
print 'Loading editors...'
if not os.path.exists('set_a.bin'):
    # First run: derive both sets from Mongo and cache them on disk.
    pre_editors, post_editors = determine_editors(db_dataset)
    fh = open('set_a.bin', 'wb')
    cPickle.dump(pre_editors, fh)
    fh.close()

    fh = open('set_b.bin', 'wb')
    cPickle.dump(post_editors, fh)
    fh.close()
else:
    # NOTE(review): verify load_binary_file honours its filename argument;
    # if it hard-codes a path, pre_editors and post_editors load the same data.
    pre_editors = load_binary_file('set_a.bin')
    post_editors = load_binary_file('set_b.bin')


dataset = codecs.open('training.tsv', 'w', 'utf-8')
write_headers(dataset, headers)
idg = IDGenerator()  # pseudonymizes user ids throughout the run
+
+
+
# --- Main loop: stream each editor's revisions from the bucketed csv files,
# recode ids, count training/solution-window edits, and write training rows
# until max_size characters have been emitted. Editors are interleaved from
# the post and pre sets so both populations appear early in the file.
print 'Parsing revisions...'
db_raw = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_raw')
seen_editors = {}  # NOTE(review): unused -- deduplication uses editors_seen below
for editors in izip_longest(post_editors, pre_editors, fillvalue=None):
    for editor in editors:
        go = editors_seen.get(editor, True)  # True until this editor is processed
        if go:
            #if editor:
            # NOTE(review): when the two sets differ in length, izip_longest
            # yields the fillvalue None here and int(editor) below would raise
            # TypeError -- confirm both sets are always the same size.
            editors_seen[editor] = False
            print 'Parsing editor %s...' % editor
            #revisions = db_raw.find({'user_id': editor})
            # Revisions were bucketed by user_id modulo the handle count.
            file_id = int(editor) % max_file_handles
            fh = codecs.open(os.path.join(txt_files, '%s.csv' % file_id), 'r', 'utf-8')
            for line in fh:
                line = line.strip()
                line = line.split('\t')
                if line[0] != editor:
                    continue  # bucket files hold many editors; keep only this one
                revision = {}
                revision['user_id'] = int(line[0])
                revision['article_id'] = int(line[1])
                revision['rev_id'] = int(line[2])
                revision['ns'] = line[4]
                revision['date'] = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
                revision['hash'] = line[7]
                revision['revert'] = line[8]
                revision['reverted_user'] = line[9]
                revision['reverted_rev_id'] = line[10]
                revision['cur_size'] = line[12]
                revision['delta'] = line[13]
                #print line
                #print revision

                #'user_id', 'article_id', 'rev_id', 'ns', 'date',
                #'hash', 'revert', 'reverted_user', 'reverted_rev_id', 'delta', 'cur_size'
                #print 'Editor %s made % edits' % (editor, len(revisions))
                #for revision in revisions:
                user_id = idg.get_id(revision['user_id'])
                revision['user_id'] = user_id #recode id to make it harder to look up answers
                # NOTE(review): revision['ns'] is a *string*; in Python 2
                # 'str < int' is always False, so negative namespaces are
                # never actually skipped here -- confirm intent.
                if revision['ns'] < 0:
                    continue
                timestamp = revision['date']
                #revision['date'] = convert_tz_to_mysql_tz(timestamp)

                # Tally edits per window for the solution/test files.
                predictions.setdefault(user_id, {})
                predictions[user_id].setdefault('solution', 0)
                predictions[user_id].setdefault('training', 0)

                if timestamp > cutoff_date and timestamp < end_date:
                    predictions[user_id]['solution'] += 1
                elif timestamp > cutoff_date_training and timestamp < end_date_training:
                    predictions[user_id]['training'] += 1
                if timestamp > cutoff_date: #exclude edits after cut off date
                    continue

                revision['reverted_user'] = check_reverter(idg, revision.get('reverted_user', -1))
                #revision.pop('_id')
                #revision.pop('username')
                revision['date'] = revision['date'].__str__()
                titles[revision['article_id']] = True  # remember for titles.tsv
                revs[revision['rev_id']] = True        # remember for comments.tsv
                size += write_revision(dataset, revision)
                cnt_obs += 1
                if cnt_obs % 10000 == 0:
                    print 'Parsed %s revisions...' % cnt_obs
            fh.close()
    if size > max_size:
        break
if size > max_size:
    print 'Reached maximum filesize...'
else:
    print 'Parsed all available editors in post set...'
dataset.close()
+
+
+
# --- Solution file: per-editor edit counts in the hidden prediction window ---
print 'Constructing solution dataset...'
fh = codecs.open('solutions.csv', 'w', 'utf-8')
# NOTE(review): this rebinds the module-level `keys` (the revision-column
# list) to the prediction ids; write_revision/write_headers must not be
# called after this point.
keys = predictions.keys()
keys.sort()
fh.write('%s,%s\n' % ('editor_id', 'solution'))
for key in keys:
    fh.write('%s,%s\n' % (key, predictions[key]['solution']))
fh.close()


# --- Test file: per-editor edit counts in the visible training window --------
print 'Constructing test dataset...'
fh = codecs.open('test.csv', 'w', 'utf-8')
fh.write('%s,%s\n' % ('editor_id', 'test'))
for key, value in predictions.iteritems():
    fh.write('%s,%s\n' % (key, value['training']))
fh.close()
+
+
# --- Titles file: one row per article that appeared in the training data -----
print 'Constructing article file...'
fh_articles = codecs.open('titles.tsv', 'w', 'utf-8')
article_meta = construct_article_meta(fh_articles, files)
for filename in files:
    if filename.startswith('articles') and not filename.startswith('articles_meta'):
        fh = codecs.open(os.path.join(location, filename))
        for line in fh:
            line = line.strip()
            line = line.split('\t')
            if len(line) == 6:  # skip malformed rows
                article_id = int(line[0])
                title = titles.get(article_id, None)  # True iff article was written out
                if title:
                    title = line[-1]
                    meta = article_meta.get(title, None)
                    # Default when the title has no meta entry.
                    parent_id = -1
                    category = 'Null'
                    if meta:
                        parent_id = meta['id']
                        category = meta['category']

                    line[1] = category
                    line[2] = convert_tz_to_mysql_tz(line[2])
                    line[-1] = line[-1].decode('utf-8')
                    line.append(str(parent_id))
                    # NOTE(review): appending '\n' as a list element means the
                    # join emits a trailing '\t' before the newline -- confirm
                    # the downstream reader tolerates the extra empty column.
                    line.append('\n')
                    fh_articles.write('\t'.join(line))
        fh.close()
fh_articles.close()
+
+
# --- Comments file: edit summaries for exactly the revisions written above ---
print 'Constructing comment dataset...'
fh_comments = codecs.open('comments.tsv', 'w', 'utf-8')
fh_comments.write('%s\t%s\n' % ('rev_id', 'text'))
cnt = len(revs.keys())  # number of comment rows still wanted
for filename in files:
    if filename.startswith('comments'):
        fh = codecs.open(os.path.join(location, filename))
        for line in fh:
            if cnt == 0:
                break  # every selected revision already has its comment
            line = line.strip()
            line = line.split('\t')
            if len(line) == 2: #some lines are missing rev id, not sure why.
                try:
                    rev_id = int(line[0])
                    exists = revs.get(rev_id, None)
                    if exists:
                        fh_comments.write('%s\t%s\n' % (rev_id, line[1].decode('utf-8')))
                        cnt -= 1
                except (ValueError, KeyError), error:
                    # Malformed row: report and keep scanning.
                    print error
        fh.close()
fh_comments.close()
+
# --- Final outputs: persist the id mapping and the run summary ---------------
print 'Storing random ids...'
fh = open('random_ids.bin', 'wb')
cPickle.dump(idg, fh)  # persist the generator so recoded ids can be reversed later
fh.close()


# Summary of the run, written to file and echoed to stdout.
fh = open('descriptives.tsv', 'w')
fh.write('Number of unique editors: %s\n' % idg.n)
fh.write('Number of revisions: %s\n' % cnt_obs)
fh.write('Number of pre-editors: %s\n' % len(pre_editors))
fh.write('Number of post-editors: %s\n' % len(post_editors))
fh.write('Number of editors with zero edits after August 30th. 2010: %s' % (len(pre_editors) - len(post_editors)))
fh.close()


t1 = datetime.now()
print 'Descriptives:'
print 'Number of unique editors: %s' % idg.n
print 'Number of revisions: %s' % cnt_obs
print 'Number of pre-editors: %s' % len(pre_editors)
print 'Number of post-editors: %s' % len(post_editors)
print 'Number of editors with zero edits after August 30th. 2010: %s' % (len(pre_editors) - len(post_editors))
print 'It took %s to construct the Kaggle training set' % (t1 - t0)
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs