http://www.mediawiki.org/wiki/Special:Code/MediaWiki/89189
Revision: 89189
Author: diederik
Date: 2011-05-30 21:50:45 +0000 (Mon, 30 May 2011)
Log Message:
-----------
Preparing for Summer of Research
Modified Paths:
--------------
trunk/tools/editor_trends/analyses/adhoc/bot_detector.py
trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
trunk/tools/editor_trends/analyses/analyzer.py
trunk/tools/editor_trends/classes/languages.py
trunk/tools/editor_trends/classes/projects.py
trunk/tools/editor_trends/classes/runtime_settings.py
trunk/tools/editor_trends/classes/settings.py
trunk/tools/editor_trends/classes/storage.py
trunk/tools/editor_trends/etl/differ.py
trunk/tools/editor_trends/etl/extracter.py
trunk/tools/editor_trends/etl/transformer.py
trunk/tools/editor_trends/etl/variables.py
trunk/tools/editor_trends/kaggle/training.py
trunk/tools/editor_trends/manage.py
trunk/tools/editor_trends/statistics/stata/ppi.do
trunk/tools/editor_trends/utils/file_utils.py
trunk/tools/editor_trends/utils/log.py
trunk/tools/editor_trends/utils/text_utils.py
Removed Paths:
-------------
trunk/tools/editor_trends/etl/kaggle.py
Modified: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/bot_detector.py 2011-05-30
21:07:43 UTC (rev 89188)
+++ trunk/tools/editor_trends/analyses/adhoc/bot_detector.py 2011-05-30
21:50:45 UTC (rev 89189)
@@ -244,8 +244,8 @@
'''
This is the launcher that uses multiprocesses.
'''
- consumers = [consumers.XMLFileConsumer(tasks, None) for i in
xrange(settings.number_of_processes)]
- for x in xrange(settings.number_of_processes):
+ consumers = [consumers.XMLFileConsumer(tasks, None) for i in
xrange(multiprocessing.cpu_count())]
+ for x in xrange(multiprocessing.cpu_count()):
tasks.put(None)
for w in consumers:
Modified: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py 2011-05-30
21:07:43 UTC (rev 89188)
+++ trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py 2011-05-30
21:50:45 UTC (rev 89189)
@@ -159,7 +159,7 @@
min_d = min(data.keys())
max_d = max(data.keys())
match = data[max_d]
- matches.append((ppi_editor, match))
+ matches.append((ppi_editor, match, max_d))
#remove match to make sure that every matched pair is unique
for editor in distances:
try:
@@ -177,11 +177,12 @@
fh.write('_a\t'.join(vars))
fh.write('\t%s\t' % ('editor_b'))
fh.write('_b\t'.join(vars))
- fh.write('\tdelta registration days\tid\n')
+ fh.write('\tdelta registration days\tid\teuclid_dist\n')
for i, match in enumerate(matches):
line = []
editor_a = match[0]
editor_b = match[1]
+ dist = match[2]
line.append(editor_a)
values_a = [str(obs_a[editor_a][v]) for v in vars]
values_b = [str(obs_b[editor_b][v]) for v in vars]
@@ -191,6 +192,7 @@
dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date']
line.append(str(dt.days))
line.append(str(i))
+ line.append(dist)
line.append('\n')
print line
#line = '\t'.join([str(l).decode('utf-8') for l in line])
Modified: trunk/tools/editor_trends/analyses/analyzer.py
===================================================================
--- trunk/tools/editor_trends/analyses/analyzer.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/analyses/analyzer.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -17,7 +17,7 @@
_date__ = '2010-12-10'
__version__ = '0.1'
-from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process
+from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process,
cpu_count
from multiprocessing.managers import BaseManager
from Queue import Empty
@@ -141,10 +141,10 @@
del editors
analyzers = [analytics.Analyzer(rts, tasks, result, var, data, plugin,
func) for
- x in xrange(rts.number_of_processes)]
+ x in xrange(cpu_count())]
- for x in xrange(rts.number_of_processes):
+ for x in xrange(cpu_count()):
tasks.put(None)
pbar = progressbar.ProgressBar(maxval=n).start()
@@ -152,7 +152,7 @@
analyzer.start()
- ppills = rts.number_of_processes
+ ppills = cpu_count()
while True:
while ppills > 0:
try:
@@ -216,7 +216,7 @@
def launcher():
- project, language, parser = manage.init_args_parser()
+ project, language, parser = commandline.init_args_parser()
args = parser.parse_args(['django'])
rts = runtime_settings.init_environment('wiki', 'en', args)
generate_chart_data(rts, 'taxonomy_burnout', time_unit='month')
Modified: trunk/tools/editor_trends/classes/languages.py
===================================================================
--- trunk/tools/editor_trends/classes/languages.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/classes/languages.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -679,9 +679,12 @@
print abbr
print len(abbr)
-def init():
+def init(language_code=None):
lnc = LanguageContainer()
- return lnc.languages[lnc.default]
+ if language_code:
+ return lnc.languages[language_code]
+ else:
+ return lnc.languages[lnc.default]
if __name__ == '__main__':
init()
Modified: trunk/tools/editor_trends/classes/projects.py
===================================================================
--- trunk/tools/editor_trends/classes/projects.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/classes/projects.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -107,9 +107,12 @@
pc = ProjectContainer()
pc.supported_projects()
-def init():
+def init(project=None):
pc = ProjectContainer()
- return pc.get_project('wiki')
+ if project:
+ return pc.get_project(project)
+ else:
+ return pc.get_project('wiki')
if __name__ == '__main__':
debug()
Modified: trunk/tools/editor_trends/classes/runtime_settings.py
===================================================================
--- trunk/tools/editor_trends/classes/runtime_settings.py 2011-05-30
21:07:43 UTC (rev 89188)
+++ trunk/tools/editor_trends/classes/runtime_settings.py 2011-05-30
21:50:45 UTC (rev 89189)
@@ -27,6 +27,9 @@
import datetime
import time
+if '..' not in sys.path:
+ sys.path.append('../')
+
from settings import Settings
from analyses import inventory
from classes import exceptions
@@ -48,49 +51,49 @@
self.language = language
self.dbname = 'wikilytics'
- if args:
- self.args = args
- self.hash = self.secs_since_epoch()
- #print self.settings.input_location
- #print self.get_value('location')
- self.project = self.update_project_settings()
- self.language = self.update_language_settings()
+ #if args:
+ self.args = args
+ self.id = '%s%s_%s' % (self.language.code, self.project.name,
'current_month')
+ #print self.settings.input_location
+ #print self.get_value('location')
+ self.project = self.update_project_settings()
+ self.language = self.update_language_settings()
- self.input_location = self.set_input_location()
- self.output_location = self.set_output_location()
+ self.input_location = self.set_input_location()
+ self.output_location = self.set_output_location()
- self.plugins = self.set_plugin()
- self.keywords = self.split_keywords()
- self.namespaces = self.get_namespaces()
+ self.plugins = self.set_plugin()
+ self.keywords = self.split_keywords()
+ self.namespaces = self.get_namespaces()
- self.kaggle = self.get_value('kaggle')
- self.function = self.get_value('func')
- self.ignore = self.get_value('except')
- self.force = self.get_value('force')
- self.analyzer_collection = self.get_value('collection')
+ #self.kaggle = self.get_value('kaggle')
+ self.function = self.get_value('func')
+ self.ignore = self.get_value('except')
+ self.force = self.get_value('force')
+ self.analyzer_collection = self.get_value('collection')
- self.dataset = os.path.join(self.dataset_location,
self.project.name)
- self.txt = os.path.join(self.output_location, 'txt')
- self.sorted = os.path.join(self.output_location, 'sorted')
- self.diffs = os.path.join(self.output_location, 'diffs')
+ self.dataset = os.path.join(self.dataset_location, self.project.name)
+ self.txt = os.path.join(self.output_location, 'txt')
+ self.sorted = os.path.join(self.output_location, 'sorted')
+ self.diffs = os.path.join(self.output_location, 'diffs')
- self.directories = [self.output_location,
- self.txt,
- self.sorted,
- self.dataset,
- self.diffs]
- self.verify_environment(self.directories)
+ self.directories = [self.output_location,
+ self.txt,
+ self.sorted,
+ self.dataset,
+ self.diffs]
+ self.verify_environment(self.directories)
- #Wikidump file related variables
- self.dump_filename = self.generate_wikidump_filename()
- self.dump_relative_path = self.set_dump_path()
- self.dump_absolute_path = self.set_dump_path(absolute=True)
+ #Wikidump file related variables
+ self.dump_filename = self.generate_wikidump_filename()
+ self.dump_relative_path = self.set_dump_path()
+ self.dump_absolute_path = self.set_dump_path(absolute=True)
- #Collection names
- self.editors_raw = '%s%s_editors_raw' % (self.language.code,
self.project.name)
- self.editors_dataset = '%s%s_editors_dataset' %
(self.language.code, self.project.name)
- self.articles_raw = '%s%s_articles_raw' % (self.language.code,
self.project.name)
- self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code,
self.project.name)
+ #Collection names
+ self.editors_raw = '%s%s_editors_raw' % (self.language.code,
self.project.name)
+ self.editors_dataset = '%s%s_editors_dataset' % (self.language.code,
self.project.name)
+ self.articles_raw = '%s%s_articles_raw' % (self.language.code,
self.project.name)
+ self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code,
self.project.name)
@@ -239,7 +242,7 @@
'''
default = self.project
proj = self.get_value('project')
- if proj != 'wiki':
+ if proj != default:
pc = projects.ProjectContainer()
proj = pc.get_project(proj)
return proj
@@ -281,7 +284,7 @@
return ['0'] #Assume that the mainspace is of interest
-def init_environment(project, language_code, args):
+def init_environment(project, language_code):
'''
Initialize an instance of RuntimeSettings.
'''
@@ -289,8 +292,9 @@
project = pjc.get_project(project)
lnc = languages.LanguageContainer()
language = lnc.get_language(language_code)
-
- args.language = language.name
- args.project = project.name
+ parser = init_args_parser(language_code, project)
+ args = parser.parse_args(['django'])
+ #args.language = language.name
+ #args.project = project.name
rts = RunTimeSettings(project, language, args)
return rts
Modified: trunk/tools/editor_trends/classes/settings.py
===================================================================
--- trunk/tools/editor_trends/classes/settings.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/classes/settings.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -17,11 +17,6 @@
__date__ = '2010-10-21'
__version__ = '0.1'
-'''
-This file contains settings that are used for constructing and analyzing
-the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
-'''
-
from multiprocessing import cpu_count
import ConfigParser
import os
@@ -73,7 +68,7 @@
#Change this to match your computer's configuration (RAM / CPU)
# I want to get rid of these two variables.
self.number_of_processes = cpu_count()
- self.windows_register = {'7z.exe': 'Software\\7-Zip'}
+ #self.windows_register = {'7z.exe': 'Software\\7-Zip'}
self.wp_dump_location = 'http://dumps.wikimedia.org'
@@ -107,6 +102,8 @@
self.default_project = config.get('wiki', 'project')
self.default_language = config.get('wiki', 'language')
self.storage = config.get('storage', 'db')
+ self.master = config.get('cluster', 'master')
+ self.slaves = config.get('cluster', 'slaves')
return True
except Exception, error:
#raise exceptions.GenericMessage('corrupted_config')
Modified: trunk/tools/editor_trends/classes/storage.py
===================================================================
--- trunk/tools/editor_trends/classes/storage.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/classes/storage.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -102,9 +102,14 @@
This class provides the functionality to talk to a MongoDB backend
including
inserting, finding, and updating data.
'''
- def __init__(self, dbname, collection):
+ def __init__(self, dbname, collection, master=None, slaves=[]):
+ if master == None:
+ self.master = 'localhost'
+ else:
+ self.master = master
+ self.slaves = slaves
+ self.port = 27017
super(Mongo, self).__init__(dbname, collection)
- self.port = 27017
@classmethod
def is_registrar_for(cls, storage):
@@ -114,8 +119,16 @@
return storage == 'mongo'
def connect(self):
- db = pymongo.Connection()
- return db[self.dbname]
+ master = pymongo.Connection(host=self.master, port=self.port)
+ if self.master == 'localhost':
+ return master[self.dbname]
+ else:
+ slave_connections = []
+ for slave in self.slaves:
+ slave = pymongo.Connection(host=slave, port=self.port)
+ slave_connections.append(slave)
+ master_slave_connection = pymongo.MasterSlaveConnection(master,
slave_connections)
+ return master_slave_connection[self.dbname]
def save(self, data):
assert isinstance(data, dict), 'You need to feed me dictionaries.'
Modified: trunk/tools/editor_trends/etl/differ.py
===================================================================
--- trunk/tools/editor_trends/etl/differ.py 2011-05-30 21:07:43 UTC (rev
89188)
+++ trunk/tools/editor_trends/etl/differ.py 2011-05-30 21:50:45 UTC (rev
89189)
@@ -17,23 +17,35 @@
__date__ = '2011-04-10'
__version__ = '0.1'
+
+'''
+This script generates diffs of edits for the Talk, User Talk and Wikipedia Talk
+pages of a Wikipedia project. These diffs are stored in json files and then
+imported in Mongo.
+'''
+import pprint
import json
import cStringIO
import codecs
import sys
import os
import difflib
+import bson
from xml.etree.cElementTree import iterparse, dump
from multiprocessing import JoinableQueue, Process, cpu_count
from datetime import datetime
+from copy import deepcopy
if '..' not in sys.path:
sys.path.append('../')
from utils import file_utils
+from utils import text_utils
from etl import variables
from classes import exceptions
+from classes import storage
+from classes import runtime_settings
def parse_xml(fh, format, process_id, location):
@@ -50,13 +62,13 @@
context = iterparse(fh, events=(start, end))
context = iter(context)
- article = {}
+
+ revisions = []
count_articles = 0
id = False
ns = False
parse = False
- rev1 = None
- rev2 = None
+ prev_rev_text = None
file_id, fh_output = None, None
try:
@@ -80,11 +92,11 @@
parsing this article, else it will skip this article.
'''
title = variables.parse_title(elem)
- article['title'] = title
current_namespace = variables.determine_namespace(title,
namespaces, include_ns)
if current_namespace == 1 or current_namespace == 3 or
current_namespace == 5:
parse = True
- article['namespace'] = current_namespace
+ #article['namespace'] = current_namespace
+ title = title.replace(namespaces[current_namespace], '')
count_articles += 1
if count_articles % 10000 == 0:
print 'Worker %s parsed %s articles' % (process_id,
count_articles)
@@ -105,23 +117,32 @@
timestamp = elem.find('%s%s' % (xml_namespace,
'timestamp')).text
contributor = elem.find('%s%s' % (xml_namespace,
'contributor'))
editor = variables.parse_contributor(contributor,
None, xml_namespace)
+ text = variables.extract_revision_text(elem,
xml_namespace)
+ comment = variables.extract_comment_text(elem,
xml_namespace)
if editor:
rev_id = variables.extract_revision_id(rev_id)
+ if prev_rev_text == None:
+ diff = text
+ prev_rev_text = deepcopy(text)
+ if prev_rev_text != None:
+ #print text[0:20], prev_rev_text[0:20]
+ diff = diff_revision(prev_rev_text, text)
- if rev1 == None and rev2 == None:
- diff = variables.extract_revision_text(elem,
xml_namespace)
- rev1 = elem
- if rev1 != None and rev2 != None:
- diff = diff_revision(rev1, rev2, xml_namespace)
+ if diff != None:
+ timestamp =
text_utils.convert_timestamp_to_datetime_utc(timestamp)
+ timestamp = timestamp.isoformat()
+ revision = dict(rev_id=rev_id, title=title,
+ timestamp=timestamp,
+ diff=diff, comment=comment,
+ id=editor['id'],
+
username=editor['username'],
+ article_id=article_id,
+ ns=current_namespace)
+ revisions.append(revision)
- article[rev_id] = {}
- article[rev_id].update(editor)
- article[rev_id]['timestamp'] = timestamp
- article[rev_id]['diff'] = diff
-
clear = True
if clear:
- rev2 = rev1
+ prev_rev_text = deepcopy(text)
elem.clear()
else:
elem.clear()
@@ -130,7 +151,7 @@
'''
Determine id of article
'''
- article['article_id'] = elem.text
+ article_id = int(elem.text)
id = True
elem.clear()
@@ -140,17 +161,16 @@
memory.
'''
elem.clear()
- #write diff of text to file
+
if parse:
- #print article
- fh_output, file_id = assign_filehandle(fh_output, file_id,
location, process_id, format)
- write_diff(fh_output, article, format)
+ #write diff of text to file
+ if len(revisions) > 0:
+ fh_output, file_id = assign_filehandle(fh_output,
file_id, location, process_id, format)
+ write_diff(fh_output, revisions, format)
+
#Reset all variables for next article
- article = {}
- if rev1 != None:
- rev1.clear()
- if rev2 != None:
- rev2.clear()
+ revisions = []
+ prev_rev_text = None
id = False
parse = False
@@ -181,14 +201,47 @@
return fh, file_id
+
def write_xml_diff(fh, article):
pass
-def write_json_diff(fh, article):
- json.dump(article, fh)
+def write_json_diff(fh, revisions):
+ fh.write('\nStart new JSON object\n')
+ json.dump(revisions, fh, indent=4, sort_keys=True)
+def store_json_diffs(rts):
+ files = os.listdir(rts.diffs)
+ print files, rts.diffs
+ db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
+ buffer = cStringIO.StringIO()
+
+ for filename in files:
+ fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
'utf-8')
+ for line in fh:
+ if line.startswith('\n') or line.startswith('Start'):
+ obj = buffer.getvalue()
+ if obj != '':
+ obj = json.loads(obj)
+ obj[0]['article_id'] = int(obj[0]['article_id'])
+ for key, value in obj[0].iteritems():
+ if type(value) == type(dict()):
+ value['timestamp'] =
datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S')
+ obj[0][key] = value
+ obj = obj[0]
+ #print obj
+ #print len(obj)
+ try:
+ db.save(obj)
+ except bson.errors.InvalidDocument, error:
+ print error
+ buffer = cStringIO.StringIO()
+ else:
+ buffer.write(line)
+ fh.close()
+
+
def write_diff(fh, article, format):
if format == 'xml':
write_xml_diff(fh, article)
@@ -198,23 +251,47 @@
raise exceptions.OutputNotSupported()
-def diff_revision(rev1, rev2, xml_namespace):
- buffer = cStringIO.StringIO()
- if rev1.text != None and rev2.text != None:
- diff = difflib.unified_diff(rev1.text, rev2.text, n=0, lineterm='')
+def diff_revision(rev1, rev2):
+ if rev1 == None:
+ rev1 = ''
+ if rev2 == None:
+ rev2 = ''
+ if len(rev1) != len(rev2):
+ buffer = cStringIO.StringIO()
+ rev1 = rev1.splitlines(1)
+ rev2 = rev2.splitlines(2)
+
+ diff = difflib.unified_diff(rev1, rev2, n=0, lineterm='')
for line in diff:
if len(line) > 3:
- print line
- buffer.write(line)
+ #print line
+ buffer.write(line.encode('utf-8'))
- return buffer.getvalue()
+ diff = buffer.getvalue()
+ if diff == '':
+ return None
+ else:
+ return diff
+ else:
+ return None
+
+
+def store_diffs_debug(rts):
+ db = storage.init_database(rts)
+ files = os.listdir(rts.diffs)
+ for filename in files:
+ fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
'utf-8')
+ diffs = json.load(fh)
+ db.insert(diffs)
+ fh.close()
+
+
def stream_raw_xml(input_queue, process_id, rts, format):
'''
This function fetches an XML file from the queue and launches the
processor.
'''
t0 = datetime.now()
- file_id = 0
while True:
filename = input_queue.get()
@@ -225,7 +302,7 @@
print filename
fh = file_utils.create_streaming_buffer(filename)
- parse_xml(fh, format, process_id, rts.input_location)
+ parse_xml(fh, format, process_id, rts.diffs)
fh.close()
t1 = datetime.now()
@@ -266,7 +343,14 @@
input_queue.join()
+ store_json_diffs(rts)
+ db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
+ db.add_index('title')
+ db.add_index('timestamp')
+ db.add_index('username')
+ db.add_index('ns')
+
def launcher_simple():
location = 'c:\\wikimedia\\nl\\wiki\\'
output_location = 'c:\\wikimedia\\nl\\wiki\\diffs\\'
@@ -311,5 +395,6 @@
if __name__ == '__main__':
+ #read_json_diffs()
launcher_simple()
#debug()
Modified: trunk/tools/editor_trends/etl/extracter.py
===================================================================
--- trunk/tools/editor_trends/etl/extracter.py 2011-05-30 21:07:43 UTC (rev
89188)
+++ trunk/tools/editor_trends/etl/extracter.py 2011-05-30 21:50:45 UTC (rev
89189)
@@ -22,8 +22,8 @@
parsing the XML on the fly and extracting & constructing the variables that are
needed for subsequent analysis. The extract module is initialized using an
instance of RunTimeSettings and the most important parameters are:
-The name of project\n
-The language of the project\n
+The name of project
+The language of the project
The location where the dump files are stored
'''
Deleted: trunk/tools/editor_trends/etl/kaggle.py
===================================================================
--- trunk/tools/editor_trends/etl/kaggle.py 2011-05-30 21:07:43 UTC (rev
89188)
+++ trunk/tools/editor_trends/etl/kaggle.py 2011-05-30 21:50:45 UTC (rev
89189)
@@ -1,49 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-Copyright (C) 2010 by Diederik van Liere ([email protected])
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
-__email__ = 'dvanliere at gmail dot com'
-__date__ = '2011-04-12'
-__version__ = '0.1'
-
-import sys
-
-if '..' not in sys.path:
- sys.path.append('..')
-
-from utils import file_utils
-
-
-def launcher():
- location = '/home/diederik/wikimedia/en/wiki/kaggle_training/'
- #location = 'C:\\wikimedia\\en\\wiki\\txt'
- files = file_utils.retrieve_file_list(location, extension='csv')
- files.sort()
- dataset = file_utils.create_txt_filehandle(location, 'dataset.csv', 'w',
'utf-8')
- for filename in files:
- if not filename.startswith('comments') and \
- not filename.startswith('articles') and not
filename.startswith('dataset'):
- fh = file_utils.create_txt_filehandle(location, filename, 'r',
'utf-8')
- print fh
- for line in fh:
- data = line.split('\t')
- username = data[3].lower()
- if username.endswith('bot'):
- continue
- else:
- dataset.write(line)
- fh.close()
- dataset.close()
-
-launcher()
Modified: trunk/tools/editor_trends/etl/transformer.py
===================================================================
--- trunk/tools/editor_trends/etl/transformer.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/etl/transformer.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -338,11 +338,11 @@
db_dataset = storage.init_database(rts.storage, rts.dbname,
rts.editors_dataset)
db_dataset.drop_collection()
editors = db_raw.retrieve_editors()
- return editors
+ return editors, db_raw, db_dataset
def transform_editors_multi_launcher(rts):
- editors = setup_database(rts)
+ editors, db_raw, db_dataset = setup_database(rts)
n = editors.size()
result = queue.JoinableRetryQueue()
pbar = progressbar.ProgressBar(maxval=n).start()
@@ -372,7 +372,7 @@
def transform_editors_single_launcher(rts):
print rts.dbname, rts.editors_raw
- editors = setup_database(rts)
+ editors, db_raw, db_dataset = setup_database(rts)
n = editors.size()
pbar = progressbar.ProgressBar(maxval=n).start()
@@ -384,7 +384,7 @@
editors.task_done()
if editor == None:
break
- editor = Editor(rts, editor)
+ editor = Editor(rts, editor, db_raw, db_dataset)
editor()
pbar.update(pbar.currval + 1)
Modified: trunk/tools/editor_trends/etl/variables.py
===================================================================
--- trunk/tools/editor_trends/etl/variables.py 2011-05-30 21:07:43 UTC (rev
89188)
+++ trunk/tools/editor_trends/etl/variables.py 2011-05-30 21:50:45 UTC (rev
89189)
@@ -275,20 +275,20 @@
Determine the id of a revision
'''
if revision_id != None:
- return revision_id.text
+ return int(revision_id.text)
else:
return None
-def extract_comment_text(revision_id, revision):
+def extract_comment_text(revision, xml_namespace):
'''
Extract the comment associated with an edit.
'''
- comment = {}
- text = revision.find('comment')
- if text != None and text.text != None:
- comment[revision_id] = text.text.encode('utf-8')
- return comment
+ comment_text = revision.find('%s%s' % (xml_namespace, 'comment'))
+ if comment_text != None and comment_text.text != None:
+ return comment_text.text
+ else:
+ return None
def create_namespace_dict(siteinfo, xml_namespace):
Modified: trunk/tools/editor_trends/kaggle/training.py
===================================================================
--- trunk/tools/editor_trends/kaggle/training.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/kaggle/training.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -17,23 +17,35 @@
__date__ = '2011-04-12'
__version__ = '0.1'
+import os
+import sys
+import cPickle
import codecs
-import os
from datetime import datetime
-import json
+sys.path.append('../')
-location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction'
+from classes import storage
+
+location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution'
files = os.listdir(location)
files.reverse()
-dataset = codecs.open('training.tsv', 'w', 'utf-8')
+
+max_size = 2147483648
+max_size_reached = False
+
t0 = datetime.now()
-max_size = 2147483648
titles = {}
ids = set()
+dates = {}
+edits = {}
+ignore_ids = set()
size = 0
cnt_obs = 0
-max_size_reached = False
+cutoff_date = datetime(2010, 8, 31)
+print 'Constructing training dataset...'
+db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
+dataset = codecs.open('training.tsv', 'w', 'utf-8')
for filename in files:
if not filename.startswith('comments') and not
filename.startswith('articles'):
fh = codecs.open(os.path.join(location, filename))
@@ -46,13 +58,25 @@
continue
if line[10] == '1':
continue
+ timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
+ if timestamp > cutoff_date:
+ continue
username = line[3].lower()
- if username.endswith('bot'):
+ if username.endswith('bot') or username.find('script') > -1:
#line[10] = '1'
continue
+ id = line[2]
+ if id not in ids and id not in ignore_ids:
+ res = db.find_one('editor', id)
+ if res == None:
+ ignore_ids.add(id)
+ continue
cnt_obs += 1
title_id = line[1]
- ids.add(line[2])
+ ids.add(id)
+ simple_date = '%s-%s' % (timestamp.year, timestamp.month)
+ dates.setdefault(simple_date, 0)
+ dates[simple_date] += 1
title = line.pop(5)
titles[title_id] = title
line.append('\n')
@@ -64,20 +88,54 @@
dataset.close()
+print 'Constructing title dataset...'
fh = codecs.open('titles.tsv', 'w', 'utf-8')
for id, title in titles.iteritems():
fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
fh.close()
-fh = codecs.open('ids.json', 'w', 'utf-8')
-json.dump(ids, fh)
-#for id in ids:
-#fh.write('%s\n' % (id.decode('utf-8')))
-#fh.write('%s\n' % (json.du)
+
+print 'Constructing solution dataset...'
+x = 0
+fh = codecs.open('solutions.tsv', 'w', 'utf-8')
+for id in ids:
+ if id not in ignore_ids:
+ obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns')
+ if obs != None:
+ x += 1
+ n = obs['cum_edit_count_main_ns']
+ fh.write('%s,%s\n' % (id.decode('utf-8'), n))
+ edits.setdefault(n, 0)
+ edits[n] += 1
+ else:
+ print id
fh.close()
+print 'Storing date histogram'
+fh = open('histogram_dates.bin', 'wb')
+cPickle.dump(dates, fh)
+fh.close()
+
+
+fh = open('histogram_dates.tsv', 'w')
+for date, n in dates.iteritems():
+ fh.write('%s\t%s\n' % (date, n))
+fh.close()
+
+
+print 'Storing edit histogram'
+fh = open('histogram_edits.bin', 'wb')
+cPickle.dump(edits, fh)
+fh.close()
+
+fh = open('histogram_edits.tsv', 'w')
+for edit, n in edits.iteritems():
+ fh.write('%s\t%s\n' % (edit, n))
+fh.close()
+
+
t1 = datetime.now()
-print 'Descriptives:\n'
-print 'Number of editors: %s' % len(ids)
+print 'Descriptives:'
+print 'Number of editors: %s' % x
print 'Number of edits: %s' % cnt_obs
print 'It took %s to construct the Kaggle training set' % (t1 - t0)
Modified: trunk/tools/editor_trends/manage.py
===================================================================
--- trunk/tools/editor_trends/manage.py 2011-05-30 21:07:43 UTC (rev 89188)
+++ trunk/tools/editor_trends/manage.py 2011-05-30 21:50:45 UTC (rev 89189)
@@ -30,6 +30,7 @@
from classes import projects
from classes import runtime_settings
from utils import file_utils
+from utils import text_utils
from utils import ordered_dict
from utils import log
from utils import timer
@@ -43,14 +44,113 @@
from analyses import inventory
-def init_args_parser():
+
+def config_launcher(rts, logger):
'''
+ Config launcher is used to (re)configure Wikilytics.
+ '''
+
+ pc = projects.ProjectContainer()
+ if not os.path.exists('wiki.cfg') or rts.force:
+ config = ConfigParser.RawConfigParser()
+ project = None
+ language = None
+ db = None
+ valid_hostname = False
+ valid_storage = ['mongo', 'cassandra']
+ working_directory = raw_input('''Please indicate where you installed
+ Wikilytics.\nCurrent location is %s\nPress Enter to accept
default.\n''' % os.getcwd())
+
+ input_location = raw_input('''Please indicate where the Wikipedia dump
+ files are or will be located.\nDefault is: %s\nPress Enter to
+ accept default.\n''' % rts.input_location)
+
+ base_location = raw_input('''Please indicate where to store all
+ Wikilytics project files.\nDefault is: %s\nPress Enter to accept
+ default.\n''' % rts.base_location)
+
+ while db not in valid_storage:
+ db = raw_input('''Please indicate what database you are using for
storage.\nDefault is: Mongo\n''')
+ db = 'mongo' if len(db) == 0 else db.lower()
+ if db not in valid_storage:
+ print 'Valid choices are: %s' % ','.join(valid_storage)
+
+ while project not in pc.projects.keys():
+ project = raw_input('''Please indicate which project you would
like
+ to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' %
rts.project.full_name)
+ project = project if len(project) > 0 else rts.project.name
+ if project not in pc.projects.keys():
+ print 'Valid choices for a project are: %s' %
','.join(pc.projects.keys())
+
+ while language not in rts.project.valid_languages:
+ language = raw_input('''Please indicate which language of project
+ %s you would like to analyze.\nDefault is: %s\nPress Enter to
accept
+ default.\n''' % (rts.project.full_name, rts.language))
+ if len(language) == 0:
+ language = rts.language.code
+ language = language if language in rts.project.valid_languages \
+ else rts.language.default
+
+ while valid_hostname == False:
+ master = raw_input('''Please indicate the hostname master of your
database
+ cluster.\n Default is: %s\nPress Enter to accept default.\n'''
% ('localhost'))
+ master = 'localhost' if len(master) == 0 else master
+ valid_hostname = text_utils.validate_hostname(master)
+
+ if master != 'localhost':
+ valid_hostname = False
+ while valid_hostname == False:
+ slaves = raw_input('''Please indicate the hostnames of your
slaves
+ of your database cluster.Separate names using a
comma.\n''')
+ slaves = slaves.split(',')
+ results = []
+ for slave in slaves:
+ results.append(text_utils.validate_hostname(slave))
+ valid_hostname = True if all(results) else False
+
+ slaves = ','.join(slaves)
+ input_location = input_location if len(input_location) > 0 else \
+ rts.input_location
+ base_location = base_location if len(base_location) > 0 else \
+ rts.base_location
+ working_directory = working_directory if len(working_directory) > 0 \
+ else os.getcwd()
+
+ config = ConfigParser.RawConfigParser()
+ config.add_section('file_locations')
+ config.set('file_locations', 'working_directory', working_directory)
+ config.set('file_locations', 'input_location', input_location)
+ config.set('file_locations', 'base_location', base_location)
+ config.add_section('wiki')
+ config.set('wiki', 'project', project)
+ config.set('wiki', 'language', language)
+ config.add_section('storage')
+ config.set('storage', 'db', db)
+ config.add_section('cluster')
+ config.set('cluster', 'master', master)
+ config.set('cluster', 'slaves', slaves)
+
+ fh = file_utils.create_binary_filehandle(working_directory,
'wiki.cfg', 'wb')
+ config.write(fh)
+ fh.close()
+
+ log.to_csv(logger, rts, 'New configuration', 'Creating',
+ config_launcher,
+ working_directory=working_directory,
+ input_location=input_location,
+ base_location=base_location,
+ project=project,
+ language=language,)
+
+
+def init_args_parser(language_code=None, project=None):
+ '''
Entry point for parsing command line and launching the needed function(s).
'''
- language = languages.init()
- project = projects.init()
+ language = languages.init(language_code)
+ project = projects.init(project)
pjc = projects.ProjectContainer()
- rts = runtime_settings.RunTimeSettings(project, language)
+ #rts = runtime_settings.RunTimeSettings(project, language)
file_choices = {'meta-full': 'stub-meta-history.xml.gz',
'meta-current': 'stub-meta-current.xml.gz',
@@ -78,7 +178,7 @@
parser_config.set_defaults(func=config_launcher)
parser_config.add_argument('-f', '--force',
action='store_true',
- help='Reconfigure Editor Toolkit (this will replace wiki.cfg')
+ help='Reconfigure Wikilytics (this will replace wiki.cfg')
#DOWNLOAD
parser_download = subparsers.add_parser('download',
@@ -141,7 +241,7 @@
parser_diff = subparsers.add_parser('diff',
help='Create a Mongo collection containing the diffs between
revisions.')
parser_diff.set_defaults(func=diff_launcher)
-
+
#DJANGO
parser_django = subparsers.add_parser('django')
parser_django.add_argument('-e', '--except',
@@ -192,85 +292,9 @@
%s' % ''.join([f + ',\n' for f in file_choices]),
default=file_choices['meta-full'])
- return project, language, parser
+ return parser
-def config_launcher(rts, logger):
- '''
- Config launcher is used to reconfigure editor trends toolkit.
- '''
-
- pc = projects.ProjectContainer()
- if not os.path.exists('wiki.cfg') or rts.force:
- config = ConfigParser.RawConfigParser()
- project = None
- language = None
- db = None
- valid_storage = ['mongo', 'cassandra']
- working_directory = raw_input('''Please indicate where you installed
- Wikilytics.\nCurrent location is %s\nPress Enter to accept
default.\n''' % os.getcwd())
-
- input_location = raw_input('''Please indicate where the Wikipedia dump
- files are or will be located.\nDefault is: %s\nPress Enter to
- accept default.\n''' % rts.input_location)
-
- base_location = raw_input('''Please indicate where to store all
- Wikilytics project files.\nDefault is: %s\nPress Enter to accept
- default.\n''' % rts.base_location)
-
- while db not in valid_storage:
- db = raw_input('Please indicate what database you are using for
storage. \nDefault is: Mongo\n')
- db = 'mongo' if len(db) == 0 else db.lower()
- if db not in valid_storage:
- print 'Valid choices are: %s' % ','.join(valid_storage)
-
- while project not in pc.projects.keys():
- project = raw_input('''Please indicate which project you would
like
- to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' %
rts.project.full_name)
- project = project if len(project) > 0 else rts.project.name
- if project not in pc.projects.keys():
- print 'Valid choices for a project are: %s' %
','.join(pc.projects.keys())
-
- while language not in rts.project.valid_languages:
- language = raw_input('''Please indicate which language of project
- %s you would like to analyze.\nDefault is: %s\nPress Enter to
accept
- default.\n''' % (rts.project.full_name, rts.language))
- if len(language) == 0:
- language = rts.language.code
- language = language if language in rts.project.valid_languages \
- else rts.language.default
-
- input_location = input_location if len(input_location) > 0 else \
- rts.input_location
- base_location = base_location if len(base_location) > 0 else \
- rts.base_location
- working_directory = working_directory if len(working_directory) > 0 \
- else os.getcwd()
-
- config = ConfigParser.RawConfigParser()
- config.add_section('file_locations')
- config.set('file_locations', 'working_directory', working_directory)
- config.set('file_locations', 'input_location', input_location)
- config.set('file_locations', 'base_location', base_location)
- config.add_section('wiki')
- config.set('wiki', 'project', project)
- config.set('wiki', 'language', language)
- config.add_section('storage')
- config.set('storage', 'db', db)
-
- fh = file_utils.create_binary_filehandle(working_directory,
'wiki.cfg', 'wb')
- config.write(fh)
- fh.close()
-
- log.to_csv(logger, rts, 'New configuration', 'Creating',
- config_launcher,
- working_directory=working_directory,
- input_location=input_location,
- base_location=base_location,
- project=project,
- language=language,)
-
-
def downloader_launcher(rts, logger):
'''
This launcher calls the dump downloader to download a Wikimedia dump file.
@@ -343,7 +367,8 @@
stopwatch = timer.Timer()
log.to_db(rts, 'dataset', 'transform', stopwatch, event='start')
log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher)
- transformer.transform_editors_multi_launcher(rts)
+ #transformer.transform_editors_multi_launcher(rts)
+ transformer.transform_editors_single_launcher(rts)
stopwatch.elapsed()
log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish')
log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher)
@@ -359,8 +384,8 @@
log.to_db(rts, 'dataset', 'diff', stopwatch, event='finish')
log.to_csv(logger, rts, 'Finish', 'Diff', diff_launcher)
-
+
def dataset_launcher(rts, logger):
'''
Dataset launcher is the entry point to generate datasets from the command
@@ -414,8 +439,11 @@
'''
This function initializes the command line parser.
'''
- project, language, parser, = init_args_parser()
+ parser = init_args_parser()
args = parser.parse_args()
+ language = languages.init()
+ project = projects.init()
+
rts = runtime_settings.RunTimeSettings(project, language, args)
#initialize logger
logger = logging.getLogger('manager')
Modified: trunk/tools/editor_trends/statistics/stata/ppi.do
===================================================================
--- trunk/tools/editor_trends/statistics/stata/ppi.do 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/statistics/stata/ppi.do 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -1,5 +1,11 @@
clear
insheet using "C:\Users\diederik.vanliere\Desktop\ppi_quality.csv"
+
+gen diff_character_count = character_count_a - character_count_b
+gen diff_cum_edit_count_main_ns = cum_edit_count_main_ns_a-
cum_edit_count_main_ns_b
+gen diff_cum_edit_count_other_ns = cum_edit_count_other_ns_a-
cum_edit_count_other_ns_b
+gen diff_article_count = article_count_a- article_count_b
+
label var character_count_a "PPI editor"
label var character_count_b "Regular editor"
Modified: trunk/tools/editor_trends/utils/file_utils.py
===================================================================
--- trunk/tools/editor_trends/utils/file_utils.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/utils/file_utils.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -173,9 +173,10 @@
'''Create a filehandle for text file with utf-8 encoding'''
filename = str(filename)
if not filename.endswith('.csv'):
- filename = construct_filename(filename, '.csv')
+ if filename.find('.') == -1:
+ filename = construct_filename(filename, '.csv')
path = os.path.join(location, filename)
- return codecs.open(path, mode, encoding='utf-8')
+ return codecs.open(path, mode, encoding)
def create_streaming_buffer(path):
@@ -189,7 +190,8 @@
fh = subprocess.Popen('7z e -bd -so %s 2>/dev/null' % path, shell=True,
stdout=subprocess.PIPE, bufsize=65535).stdout
elif extension == '.xml':
- fh = create_txt_filehandle(path, None, 'r', 'utf-8')
+ location, filename = os.path.split(path)
+ fh = create_txt_filehandle(location, filename, 'r', 'utf-8')
else:
raise exceptions.CompressedFileNotSupported(extension)
return fh
@@ -247,6 +249,7 @@
os.utime(path, (mod_rem, mod_rem))
#sraise exceptions.NotYetImplementedError(set_modified_data)
+
def get_modified_date(location, filename):
'''determine the date the file was originally created'''
path = os.path.join(location, filename)
Modified: trunk/tools/editor_trends/utils/log.py
===================================================================
--- trunk/tools/editor_trends/utils/log.py 2011-05-30 21:07:43 UTC (rev
89188)
+++ trunk/tools/editor_trends/utils/log.py 2011-05-30 21:50:45 UTC (rev
89189)
@@ -31,11 +31,9 @@
def to_db(rts, jobtype, task, timer, event='start'):
db = storage.init_database(rts.storage, rts.dbname, 'jobs')
created = datetime.datetime.now()
- hash = '%s_%s' % (rts.project, rts.hash)
+ job = db.find_one('hash', rts.id)
- job = db.find_one('hash', hash)
-
- data = {'hash': hash,
+ data = {'hash': rts.id,
'created': created,
'jobtype': jobtype,
'in_progress': True,
@@ -60,7 +58,7 @@
t['start'] = timer.t0
t['in_progress'] = True
tasks[task] = t
- db.update('hash', hash, {'$set': {'tasks': tasks}})
+ db.update('hash', rts.id, {'$set': {'tasks': tasks}})
#coll.update({'hash': hash}, {'$set': {'tasks': tasks}})
elif event == 'finish':
t['finish'] = timer.t1
@@ -68,11 +66,11 @@
tasks[task] = t
if task == 'transform' or jobtype == 'chart':
#final task, set entire task to finished
- db.update('hash', hash, {'$set': {'tasks': tasks,
+ db.update('hash', rts.id, {'$set': {'tasks': tasks,
'in_progress': False,
'finished': True}})
else:
- db.update('hash', hash, {'$set': {'tasks': tasks}})
+ db.update('hash', rts.id, {'$set': {'tasks': tasks}})
def to_csv(logger, settings, message, verb, function, **kwargs):
Modified: trunk/tools/editor_trends/utils/text_utils.py
===================================================================
--- trunk/tools/editor_trends/utils/text_utils.py 2011-05-30 21:07:43 UTC
(rev 89188)
+++ trunk/tools/editor_trends/utils/text_utils.py 2011-05-30 21:50:45 UTC
(rev 89189)
@@ -20,6 +20,7 @@
import datetime
import time
import sys
+import re
if '..' not in sys.path:
sys.path.append('..')
@@ -52,6 +53,14 @@
return dict([[v, k] for k, v in dictionary.items()])
+def validate_hostname(hostname):
+ regex_hostname =
re.compile('^(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?)*\.?$')
+ res = re.match(regex_hostname, hostname)
+ if res == None:
+ return False
+ else:
+ return True
+
def get_max_width(table, index):
'''
Get the maximum width of the given column index
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs