http://www.mediawiki.org/wiki/Special:Code/MediaWiki/89189

Revision: 89189
Author:   diederik
Date:     2011-05-30 21:50:45 +0000 (Mon, 30 May 2011)
Log Message:
-----------
Preparing for Summer of Research

Modified Paths:
--------------
    trunk/tools/editor_trends/analyses/adhoc/bot_detector.py
    trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
    trunk/tools/editor_trends/analyses/analyzer.py
    trunk/tools/editor_trends/classes/languages.py
    trunk/tools/editor_trends/classes/projects.py
    trunk/tools/editor_trends/classes/runtime_settings.py
    trunk/tools/editor_trends/classes/settings.py
    trunk/tools/editor_trends/classes/storage.py
    trunk/tools/editor_trends/etl/differ.py
    trunk/tools/editor_trends/etl/extracter.py
    trunk/tools/editor_trends/etl/transformer.py
    trunk/tools/editor_trends/etl/variables.py
    trunk/tools/editor_trends/kaggle/training.py
    trunk/tools/editor_trends/manage.py
    trunk/tools/editor_trends/statistics/stata/ppi.do
    trunk/tools/editor_trends/utils/file_utils.py
    trunk/tools/editor_trends/utils/log.py
    trunk/tools/editor_trends/utils/text_utils.py

Removed Paths:
-------------
    trunk/tools/editor_trends/etl/kaggle.py

Modified: trunk/tools/editor_trends/analyses/adhoc/bot_detector.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/bot_detector.py    2011-05-30 
21:07:43 UTC (rev 89188)
+++ trunk/tools/editor_trends/analyses/adhoc/bot_detector.py    2011-05-30 
21:50:45 UTC (rev 89189)
@@ -244,8 +244,8 @@
     '''
     This is the launcher that uses multiprocesses.
     '''
-    consumers = [consumers.XMLFileConsumer(tasks, None) for i in 
xrange(settings.number_of_processes)]
-    for x in xrange(settings.number_of_processes):
+    consumers = [consumers.XMLFileConsumer(tasks, None) for i in 
xrange(multiprocessing.cpu_count())]
+    for x in xrange(multiprocessing.cpu_count()):
         tasks.put(None)
 
     for w in consumers:
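
The change above sizes the worker pool with multiprocessing.cpu_count() instead of
a value from settings, and still enqueues one None sentinel ("poison pill") per
worker so every consumer process terminates. A minimal, self-contained sketch of
that pattern; the worker body here is illustrative, not code from the repository:

    import multiprocessing

    def worker(tasks):
        '''Consume tasks until a None sentinel (poison pill) arrives.'''
        while True:
            task = tasks.get()
            if task is None:        # poison pill: one per worker
                tasks.task_done()
                break
            # ... process the task here ...
            tasks.task_done()

    if __name__ == '__main__':
        tasks = multiprocessing.JoinableQueue()
        n_workers = multiprocessing.cpu_count()
        for i in range(100):
            tasks.put(i)
        for _ in range(n_workers):  # one pill per worker
            tasks.put(None)
        workers = [multiprocessing.Process(target=worker, args=(tasks,))
                   for _ in range(n_workers)]
        for w in workers:
            w.start()
        tasks.join()                # returns once every task_done() was called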

Modified: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py     2011-05-30 
21:07:43 UTC (rev 89188)
+++ trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py     2011-05-30 
21:50:45 UTC (rev 89189)
@@ -159,7 +159,7 @@
         min_d = min(data.keys())
         max_d = max(data.keys())
         match = data[max_d]
-        matches.append((ppi_editor, match))
+        matches.append((ppi_editor, match, max_d))
         #remove match to make sure that every matched pair is unique
         for editor in distances:
             try:
@@ -177,11 +177,12 @@
     fh.write('_a\t'.join(vars))
     fh.write('\t%s\t' % ('editor_b'))
     fh.write('_b\t'.join(vars))
-    fh.write('\tdelta registration days\tid\n')
+    fh.write('\tdelta registration days\tid\teuclid_dist\n')
     for i, match in enumerate(matches):
         line = []
         editor_a = match[0]
         editor_b = match[1]
+        dist = match[2]
         line.append(editor_a)
         values_a = [str(obs_a[editor_a][v]) for v in vars]
         values_b = [str(obs_b[editor_b][v]) for v in vars]
@@ -191,6 +192,7 @@
         dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date']
         line.append(str(dt.days))
         line.append(str(i))
+        line.append(dist)
         line.append('\n')
         print line
         #line = '\t'.join([str(l).decode('utf-8') for l in line])

Modified: trunk/tools/editor_trends/analyses/analyzer.py
===================================================================
--- trunk/tools/editor_trends/analyses/analyzer.py      2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/analyses/analyzer.py      2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -17,7 +17,7 @@
 _date__ = '2010-12-10'
 __version__ = '0.1'
 
-from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process
+from multiprocessing import JoinableQueue, Queue, Manager, RLock, Process, 
cpu_count
 from multiprocessing.managers import BaseManager
 from Queue import Empty
 
@@ -141,10 +141,10 @@
     del editors
 
     analyzers = [analytics.Analyzer(rts, tasks, result, var, data, plugin, 
func) for
-                 x in xrange(rts.number_of_processes)]
+                 x in xrange(cpu_count())]
 
 
-    for x in xrange(rts.number_of_processes):
+    for x in xrange(cpu_count()):
         tasks.put(None)
 
     pbar = progressbar.ProgressBar(maxval=n).start()
@@ -152,7 +152,7 @@
         analyzer.start()
 
 
-    ppills = rts.number_of_processes
+    ppills = cpu_count()
     while True:
         while ppills > 0:
             try:
@@ -216,7 +216,7 @@
 
 
 def launcher():
-    project, language, parser = manage.init_args_parser()
+    project, language, parser = commandline.init_args_parser()
     args = parser.parse_args(['django'])
     rts = runtime_settings.init_environment('wiki', 'en', args)
     generate_chart_data(rts, 'taxonomy_burnout', time_unit='month')

Modified: trunk/tools/editor_trends/classes/languages.py
===================================================================
--- trunk/tools/editor_trends/classes/languages.py      2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/classes/languages.py      2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -679,9 +679,12 @@
         print abbr
         print len(abbr)
 
-def init():
+def init(language_code=None):
     lnc = LanguageContainer()
-    return lnc.languages[lnc.default]
+    if language_code:
+        return lnc.languages[language_code]
+    else:
+        return lnc.languages[lnc.default]
 
 if __name__ == '__main__':
     init()

Modified: trunk/tools/editor_trends/classes/projects.py
===================================================================
--- trunk/tools/editor_trends/classes/projects.py       2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/classes/projects.py       2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -107,9 +107,12 @@
     pc = ProjectContainer()
     pc.supported_projects()
 
-def init():
+def init(project=None):
     pc = ProjectContainer()
-    return pc.get_project('wiki')
+    if project:
+        return pc.get_project(project)
+    else:
+        return pc.get_project('wiki')
 
 if __name__ == '__main__':
     debug()

Modified: trunk/tools/editor_trends/classes/runtime_settings.py
===================================================================
--- trunk/tools/editor_trends/classes/runtime_settings.py       2011-05-30 
21:07:43 UTC (rev 89188)
+++ trunk/tools/editor_trends/classes/runtime_settings.py       2011-05-30 
21:50:45 UTC (rev 89189)
@@ -27,6 +27,9 @@
 import datetime
 import time
 
+if '..' not in sys.path:
+    sys.path.append('../')
+
 from settings import Settings
 from analyses import inventory
 from classes import exceptions
@@ -48,49 +51,49 @@
         self.language = language
         self.dbname = 'wikilytics'
 
-        if args:
-            self.args = args
-            self.hash = self.secs_since_epoch()
-            #print self.settings.input_location
-            #print self.get_value('location')
-            self.project = self.update_project_settings()
-            self.language = self.update_language_settings()
+        #if args:
+        self.args = args
+        self.id = '%s%s_%s' % (self.language.code, self.project.name, 
'current_month')
+        #print self.settings.input_location
+        #print self.get_value('location')
+        self.project = self.update_project_settings()
+        self.language = self.update_language_settings()
 
-            self.input_location = self.set_input_location()
-            self.output_location = self.set_output_location()
+        self.input_location = self.set_input_location()
+        self.output_location = self.set_output_location()
 
-            self.plugins = self.set_plugin()
-            self.keywords = self.split_keywords()
-            self.namespaces = self.get_namespaces()
+        self.plugins = self.set_plugin()
+        self.keywords = self.split_keywords()
+        self.namespaces = self.get_namespaces()
 
-            self.kaggle = self.get_value('kaggle')
-            self.function = self.get_value('func')
-            self.ignore = self.get_value('except')
-            self.force = self.get_value('force')
-            self.analyzer_collection = self.get_value('collection')
+        #self.kaggle = self.get_value('kaggle')
+        self.function = self.get_value('func')
+        self.ignore = self.get_value('except')
+        self.force = self.get_value('force')
+        self.analyzer_collection = self.get_value('collection')
 
-            self.dataset = os.path.join(self.dataset_location, 
self.project.name)
-            self.txt = os.path.join(self.output_location, 'txt')
-            self.sorted = os.path.join(self.output_location, 'sorted')
-            self.diffs = os.path.join(self.output_location, 'diffs')
+        self.dataset = os.path.join(self.dataset_location, self.project.name)
+        self.txt = os.path.join(self.output_location, 'txt')
+        self.sorted = os.path.join(self.output_location, 'sorted')
+        self.diffs = os.path.join(self.output_location, 'diffs')
 
-            self.directories = [self.output_location,
-                                self.txt,
-                                self.sorted,
-                                self.dataset,
-                                self.diffs]
-            self.verify_environment(self.directories)
+        self.directories = [self.output_location,
+                            self.txt,
+                            self.sorted,
+                            self.dataset,
+                            self.diffs]
+        self.verify_environment(self.directories)
 
-            #Wikidump file related variables
-            self.dump_filename = self.generate_wikidump_filename()
-            self.dump_relative_path = self.set_dump_path()
-            self.dump_absolute_path = self.set_dump_path(absolute=True)
+        #Wikidump file related variables
+        self.dump_filename = self.generate_wikidump_filename()
+        self.dump_relative_path = self.set_dump_path()
+        self.dump_absolute_path = self.set_dump_path(absolute=True)
 
-            #Collection names
-            self.editors_raw = '%s%s_editors_raw' % (self.language.code, 
self.project.name)
-            self.editors_dataset = '%s%s_editors_dataset' % 
(self.language.code, self.project.name)
-            self.articles_raw = '%s%s_articles_raw' % (self.language.code, 
self.project.name)
-            self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, 
self.project.name)
+        #Collection names
+        self.editors_raw = '%s%s_editors_raw' % (self.language.code, 
self.project.name)
+        self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, 
self.project.name)
+        self.articles_raw = '%s%s_articles_raw' % (self.language.code, 
self.project.name)
+        self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, 
self.project.name)
 
 
 
@@ -239,7 +242,7 @@
         '''
         default = self.project
         proj = self.get_value('project')
-        if proj != 'wiki':
+        if proj != default:
             pc = projects.ProjectContainer()
             proj = pc.get_project(proj)
             return proj
@@ -281,7 +284,7 @@
             return ['0']  #Assume that the mainspace is of interest
 
 
-def init_environment(project, language_code, args):
+def init_environment(project, language_code):
     '''
     Initialize an instance of RuntimeSettings. 
     '''
@@ -289,8 +292,9 @@
     project = pjc.get_project(project)
     lnc = languages.LanguageContainer()
     language = lnc.get_language(language_code)
-
-    args.language = language.name
-    args.project = project.name
+    parser = init_args_parser(language_code, project)
+    args = parser.parse_args(['django'])
+    #args.language = language.name
+    #args.project = project.name
     rts = RunTimeSettings(project, language, args)
     return rts
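
With the rewritten constructor, the run identifier and the MongoDB collection
names are all derived from the language code and project name. An illustration of
the naming scheme (the values below are examples):

    language_code, project_name = 'en', 'wiki'

    run_id = '%s%s_%s' % (language_code, project_name, 'current_month')
    editors_raw = '%s%s_editors_raw' % (language_code, project_name)
    editors_dataset = '%s%s_editors_dataset' % (language_code, project_name)

    print(run_id)            # enwiki_current_month
    print(editors_raw)       # enwiki_editors_raw
    print(editors_dataset)   # enwiki_editors_dataset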

Modified: trunk/tools/editor_trends/classes/settings.py
===================================================================
--- trunk/tools/editor_trends/classes/settings.py       2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/classes/settings.py       2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -17,11 +17,6 @@
 __date__ = '2010-10-21'
 __version__ = '0.1'
 
-'''
-This file contains settings that are used for constructing and analyzing
-the datasets as part of the Editor Dynamics and Anti-Vandalism projects.
-'''
-
 from multiprocessing import cpu_count
 import ConfigParser
 import os
@@ -73,7 +68,7 @@
         #Change this to match your computers configuration (RAM / CPU)
         # I want to get rid off these two variables.
         self.number_of_processes = cpu_count()
-        self.windows_register = {'7z.exe': 'Software\\7-Zip'}
+        #self.windows_register = {'7z.exe': 'Software\\7-Zip'}
 
         self.wp_dump_location = 'http://dumps.wikimedia.org'
 
@@ -107,6 +102,8 @@
             self.default_project = config.get('wiki', 'project')
             self.default_language = config.get('wiki', 'language')
             self.storage = config.get('storage', 'db')
+            self.master = config.get('cluster', 'master')
+            self.slaves = config.get('cluster', 'slaves')
             return True
         except Exception, error:
             #raise exceptions.GenericMessage('corrupted_config')
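
settings.py now also reads a [cluster] section from wiki.cfg, matching the
sections written by the updated config_launcher in manage.py further below. A
hypothetical wiki.cfg with that layout; every path and hostname here is a
placeholder, not a value from the repository:

    [file_locations]
    working_directory = /home/user/wikilytics
    input_location = /data/wikimedia/dumps
    base_location = /data/wikimedia

    [wiki]
    project = wiki
    language = en

    [storage]
    db = mongo

    [cluster]
    master = localhost
    slaves =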

Modified: trunk/tools/editor_trends/classes/storage.py
===================================================================
--- trunk/tools/editor_trends/classes/storage.py        2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/classes/storage.py        2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -102,9 +102,14 @@
     This class provides the functionality to talk to a MongoDB backend 
including
     inserting, finding, and updating data.
     '''
-    def __init__(self, dbname, collection):
+    def __init__(self, dbname, collection, master=None, slaves=[]):
+        if master == None:
+            self.master = 'localhost'
+        else:
+            self.master = master
+        self.slaves = slaves
+        self.port = 27017
         super(Mongo, self).__init__(dbname, collection)
-        self.port = 27017
 
     @classmethod
     def is_registrar_for(cls, storage):
@@ -114,8 +119,16 @@
         return storage == 'mongo'
 
     def connect(self):
-        db = pymongo.Connection()
-        return db[self.dbname]
+        master = pymongo.Connection(host=self.master, port=self.port)
+        if self.master == 'localhost':
+            return master[self.dbname]
+        else:
+            slave_connections = []
+            for slave in self.slaves:
+                slave = pymongo.Connection(host=slave, port=self.port)
+                slave_connections.append(slave)
+            master_slave_connection = pymongo.MasterSlaveConnection(master, 
slave_connections)
+            return master_slave_connection[self.dbname]
 
     def save(self, data):
         assert isinstance(data, dict), 'You need to feed me dictionaries.'
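
The Mongo class claims the 'mongo' backend name through its is_registrar_for
classmethod. A minimal sketch of how such a registry can dispatch on the backend
name; the init_database below is illustrative and simplified, not the
repository's implementation:

    class Backend(object):
        def __init__(self, dbname, collection):
            self.dbname = dbname
            self.collection = collection

    class Mongo(Backend):
        @classmethod
        def is_registrar_for(cls, storage):
            return storage == 'mongo'

    class Cassandra(Backend):
        @classmethod
        def is_registrar_for(cls, storage):
            return storage == 'cassandra'

    def init_database(storage, dbname, collection):
        '''Return an instance of the backend that claims the given name.'''
        for cls in Backend.__subclasses__():
            if cls.is_registrar_for(storage):
                return cls(dbname, collection)
        raise ValueError('Unsupported storage backend: %s' % storage)

    db = init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')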

Modified: trunk/tools/editor_trends/etl/differ.py
===================================================================
--- trunk/tools/editor_trends/etl/differ.py     2011-05-30 21:07:43 UTC (rev 
89188)
+++ trunk/tools/editor_trends/etl/differ.py     2011-05-30 21:50:45 UTC (rev 
89189)
@@ -17,23 +17,35 @@
 __date__ = '2011-04-10'
 __version__ = '0.1'
 
+
+'''
+This script generates diffs of edits for the Talk, User Talk and Wikipedia Talk
+pages of a Wikipedia project. These diffs are stored in json files and then
+imported in Mongo. 
+'''
+import pprint
 import json
 import cStringIO
 import codecs
 import sys
 import os
 import difflib
+import bson
 from xml.etree.cElementTree import iterparse, dump
 from multiprocessing import JoinableQueue, Process, cpu_count
 from datetime import datetime
+from copy import deepcopy
 
 
 if '..' not in sys.path:
     sys.path.append('../')
 
 from utils import file_utils
+from utils import text_utils
 from etl import variables
 from classes import exceptions
+from classes import storage
+from classes import runtime_settings
 
 
 def parse_xml(fh, format, process_id, location):
@@ -50,13 +62,13 @@
     context = iterparse(fh, events=(start, end))
     context = iter(context)
 
-    article = {}
+
+    revisions = []
     count_articles = 0
     id = False
     ns = False
     parse = False
-    rev1 = None
-    rev2 = None
+    prev_rev_text = None
     file_id, fh_output = None, None
 
     try:
@@ -80,11 +92,11 @@
                 parsing this article, else it will skip this article. 
                 '''
                 title = variables.parse_title(elem)
-                article['title'] = title
                 current_namespace = variables.determine_namespace(title, 
namespaces, include_ns)
                 if current_namespace == 1 or current_namespace == 3 or 
current_namespace == 5:
                     parse = True
-                    article['namespace'] = current_namespace
+                    #article['namespace'] = current_namespace
+                    title = title.replace(namespaces[current_namespace], '')
                     count_articles += 1
                     if count_articles % 10000 == 0:
                         print 'Worker %s parsed %s articles' % (process_id, 
count_articles)
@@ -105,23 +117,32 @@
                         timestamp = elem.find('%s%s' % (xml_namespace, 
'timestamp')).text
                         contributor = elem.find('%s%s' % (xml_namespace, 
'contributor'))
                         editor = variables.parse_contributor(contributor, 
None, xml_namespace)
+                        text = variables.extract_revision_text(elem, 
xml_namespace)
+                        comment = variables.extract_comment_text(elem, 
xml_namespace)
                         if editor:
                             rev_id = variables.extract_revision_id(rev_id)
+                            if prev_rev_text == None:
+                                diff = text
+                                prev_rev_text = deepcopy(text)
+                            if prev_rev_text != None:
+                                #print text[0:20], prev_rev_text[0:20]
+                                diff = diff_revision(prev_rev_text, text)
 
-                            if rev1 == None and rev2 == None:
-                                diff = variables.extract_revision_text(elem, 
xml_namespace)
-                                rev1 = elem
-                            if rev1 != None and rev2 != None:
-                                diff = diff_revision(rev1, rev2, xml_namespace)
+                                if diff != None:
+                                    timestamp = 
text_utils.convert_timestamp_to_datetime_utc(timestamp)
+                                    timestamp = timestamp.isoformat()
+                                    revision = dict(rev_id=rev_id, title=title,
+                                                    timestamp=timestamp,
+                                                    diff=diff, comment=comment,
+                                                    id=editor['id'],
+                                                    
username=editor['username'],
+                                                    article_id=article_id,
+                                                    ns=current_namespace)
+                                    revisions.append(revision)
 
-                            article[rev_id] = {}
-                            article[rev_id].update(editor)
-                            article[rev_id]['timestamp'] = timestamp
-                            article[rev_id]['diff'] = diff
-
                         clear = True
                     if clear:
-                        rev2 = rev1
+                        prev_rev_text = deepcopy(text)
                         elem.clear()
                 else:
                     elem.clear()
@@ -130,7 +151,7 @@
                 '''
                 Determine id of article
                 '''
-                article['article_id'] = elem.text
+                article_id = int(elem.text)
                 id = True
                 elem.clear()
 
@@ -140,17 +161,16 @@
                 memory. 
                 '''
                 elem.clear()
-                #write diff of text to file
+
                 if parse:
-                    #print article
-                    fh_output, file_id = assign_filehandle(fh_output, file_id, 
location, process_id, format)
-                    write_diff(fh_output, article, format)
+                    #write diff of text to file
+                    if len(revisions) > 0:
+                        fh_output, file_id = assign_filehandle(fh_output, 
file_id, location, process_id, format)
+                        write_diff(fh_output, revisions, format)
+
                 #Reset all variables for next article
-                article = {}
-                if rev1 != None:
-                    rev1.clear()
-                if rev2 != None:
-                    rev2.clear()
+                revisions = []
+                prev_rev_text = None
                 id = False
                 parse = False
 
@@ -181,14 +201,47 @@
 
     return fh, file_id
 
+
 def write_xml_diff(fh, article):
     pass
 
 
-def write_json_diff(fh, article):
-    json.dump(article, fh)
+def write_json_diff(fh, revisions):
+    fh.write('\nStart new JSON object\n')
+    json.dump(revisions, fh, indent=4, sort_keys=True)
 
 
+def store_json_diffs(rts):
+    files = os.listdir(rts.diffs)
+    print files, rts.diffs
+    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
+    buffer = cStringIO.StringIO()
+
+    for filename in files:
+        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 
'utf-8')
+        for line in fh:
+            if line.startswith('\n') or line.startswith('Start'):
+                obj = buffer.getvalue()
+                if obj != '':
+                    obj = json.loads(obj)
+                    obj[0]['article_id'] = int(obj[0]['article_id'])
+                    for key, value in obj[0].iteritems():
+                        if type(value) == type(dict()):
+                            value['timestamp'] = 
datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S')
+                        obj[0][key] = value
+                    obj = obj[0]
+                    #print obj
+                    #print len(obj)
+                    try:
+                        db.save(obj)
+                    except bson.errors.InvalidDocument, error:
+                        print error
+                buffer = cStringIO.StringIO()
+            else:
+                buffer.write(line)
+        fh.close()
+
+
 def write_diff(fh, article, format):
     if format == 'xml':
         write_xml_diff(fh, article)
@@ -198,23 +251,47 @@
         raise exceptions.OutputNotSupported()
 
 
-def diff_revision(rev1, rev2, xml_namespace):
-    buffer = cStringIO.StringIO()
-    if rev1.text != None and rev2.text != None:
-        diff = difflib.unified_diff(rev1.text, rev2.text, n=0, lineterm='')
+def diff_revision(rev1, rev2):
+    if rev1 == None:
+        rev1 = ''
+    if rev2 == None:
+        rev2 = ''
+    if len(rev1) != len(rev2):
+        buffer = cStringIO.StringIO()
+        rev1 = rev1.splitlines(1)
+        rev2 = rev2.splitlines(2)
+
+        diff = difflib.unified_diff(rev1, rev2, n=0, lineterm='')
         for line in diff:
             if len(line) > 3:
-                print line
-                buffer.write(line)
+                #print line
+                buffer.write(line.encode('utf-8'))
 
-        return buffer.getvalue()
+        diff = buffer.getvalue()
 
+        if diff == '':
+            return None
+        else:
+            return diff
+    else:
+        return None
+
+
+def store_diffs_debug(rts):
+    db = storage.init_database(rts)
+    files = os.listdir(rts.diffs)
+    for filename in files:
+        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 
'utf-8')
+        diffs = json.load(fh)
+        db.insert(diffs)
+        fh.close()
+
+
 def stream_raw_xml(input_queue, process_id, rts, format):
     '''
     This function fetches an XML file from the queue and launches the 
processor. 
     '''
     t0 = datetime.now()
-    file_id = 0
 
     while True:
         filename = input_queue.get()
@@ -225,7 +302,7 @@
 
         print filename
         fh = file_utils.create_streaming_buffer(filename)
-        parse_xml(fh, format, process_id, rts.input_location)
+        parse_xml(fh, format, process_id, rts.diffs)
         fh.close()
 
         t1 = datetime.now()
@@ -266,7 +343,14 @@
 
     input_queue.join()
 
+    store_json_diffs(rts)
+    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
+    db.add_index('title')
+    db.add_index('timestamp')
+    db.add_index('username')
+    db.add_index('ns')
 
+
 def launcher_simple():
     location = 'c:\\wikimedia\\nl\\wiki\\'
     output_location = 'c:\\wikimedia\\nl\\wiki\\diffs\\'
@@ -311,5 +395,6 @@
 
 
 if __name__ == '__main__':
+    #read_json_diffs()
     launcher_simple()
     #debug()
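
The reworked diff_revision compares two revision texts line by line with
difflib.unified_diff, using zero lines of context and no line terminator on the
headers. A small self-contained sketch of the same call; the revision strings are
made up for illustration:

    import difflib

    rev1 = 'Hello world.\nThis line stays.\n'
    rev2 = 'Hello wiki world.\nThis line stays.\nA new line.\n'

    # keep line endings, as diff_revision does via splitlines()
    lines1 = rev1.splitlines(True)
    lines2 = rev2.splitlines(True)

    # n=0 drops context lines, lineterm='' keeps the headers newline-free
    diff = difflib.unified_diff(lines1, lines2, n=0, lineterm='')
    changes = [line for line in diff if len(line) > 3]  # same filter as diff_revision
    diff_text = ''.join(changes)  # roughly what diff_revision stores per revision
    print(diff_text)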

Modified: trunk/tools/editor_trends/etl/extracter.py
===================================================================
--- trunk/tools/editor_trends/etl/extracter.py  2011-05-30 21:07:43 UTC (rev 
89188)
+++ trunk/tools/editor_trends/etl/extracter.py  2011-05-30 21:50:45 UTC (rev 
89189)
@@ -22,8 +22,8 @@
 parsing the XML on the fly and extracting & constructing the variables that are
 need for subsequent analysis. The extract module is initialized using an 
 instance of RunTimeSettings and the most important parameters are:
-The name of project\n
-The language of the project\n
+The name of project
+The language of the project
 The location where the dump files are stored
 '''
 

Deleted: trunk/tools/editor_trends/etl/kaggle.py
===================================================================
--- trunk/tools/editor_trends/etl/kaggle.py     2011-05-30 21:07:43 UTC (rev 
89188)
+++ trunk/tools/editor_trends/etl/kaggle.py     2011-05-30 21:50:45 UTC (rev 
89189)
@@ -1,49 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-'''
-Copyright (C) 2010 by Diederik van Liere ([email protected])
-This program is free software; you can redistribute it and/or
-modify it under the terms of the GNU General Public License version 2
-as published by the Free Software Foundation.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details, at
-http://www.fsf.org/licenses/gpl.html
-'''
-
-__author__ = '''\n'''.join(['Diederik van Liere ([email protected])', ])
-__email__ = 'dvanliere at gmail dot com'
-__date__ = '2011-04-12'
-__version__ = '0.1'
-
-import sys
-
-if '..' not in sys.path:
-    sys.path.append('..')
-
-from utils import file_utils
-
-
-def launcher():
-    location = '/home/diederik/wikimedia/en/wiki/kaggle_training/'
-    #location = 'C:\\wikimedia\\en\\wiki\\txt'
-    files = file_utils.retrieve_file_list(location, extension='csv')
-    files.sort()
-    dataset = file_utils.create_txt_filehandle(location, 'dataset.csv', 'w', 
'utf-8')
-    for filename in files:
-        if not filename.startswith('comments') and \
-            not filename.startswith('articles') and not 
filename.startswith('dataset'):
-            fh = file_utils.create_txt_filehandle(location, filename, 'r', 
'utf-8')
-            print fh
-            for line in fh:
-                data = line.split('\t')
-                username = data[3].lower()
-                if username.endswith('bot'):
-                    continue
-                else:
-                    dataset.write(line)
-            fh.close()
-    dataset.close()
-
-launcher()

Modified: trunk/tools/editor_trends/etl/transformer.py
===================================================================
--- trunk/tools/editor_trends/etl/transformer.py        2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/etl/transformer.py        2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -338,11 +338,11 @@
     db_dataset = storage.init_database(rts.storage, rts.dbname, 
rts.editors_dataset)
     db_dataset.drop_collection()
     editors = db_raw.retrieve_editors()
-    return editors
+    return editors, db_raw, db_dataset
 
 
 def transform_editors_multi_launcher(rts):
-    editors = setup_database(rts)
+    editors, db_raw, db_dataset = setup_database(rts)
     n = editors.size()
     result = queue.JoinableRetryQueue()
     pbar = progressbar.ProgressBar(maxval=n).start()
@@ -372,7 +372,7 @@
 
 def transform_editors_single_launcher(rts):
     print rts.dbname, rts.editors_raw
-    editors = setup_database(rts)
+    editors, db_raw, db_dataset = setup_database(rts)
     n = editors.size()
     pbar = progressbar.ProgressBar(maxval=n).start()
 
@@ -384,7 +384,7 @@
         editors.task_done()
         if editor == None:
             break
-        editor = Editor(rts, editor)
+        editor = Editor(rts, editor, db_raw, db_dataset)
         editor()
 
         pbar.update(pbar.currval + 1)

Modified: trunk/tools/editor_trends/etl/variables.py
===================================================================
--- trunk/tools/editor_trends/etl/variables.py  2011-05-30 21:07:43 UTC (rev 
89188)
+++ trunk/tools/editor_trends/etl/variables.py  2011-05-30 21:50:45 UTC (rev 
89189)
@@ -275,20 +275,20 @@
     Determine the id of a revision 
     '''
     if revision_id != None:
-        return revision_id.text
+        return int(revision_id.text)
     else:
         return None
 
 
-def extract_comment_text(revision_id, revision):
+def extract_comment_text(revision, xml_namespace):
     '''
     Extract the comment associated with an edit. 
     '''
-    comment = {}
-    text = revision.find('comment')
-    if text != None and text.text != None:
-        comment[revision_id] = text.text.encode('utf-8')
-    return comment
+    comment_text = revision.find('%s%s' % (xml_namespace, 'comment'))
+    if comment_text != None and comment_text.text != None:
+        return comment_text.text
+    else:
+        return None
 
 
 def create_namespace_dict(siteinfo, xml_namespace):
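
extract_comment_text now locates the <comment> element with ElementTree's
namespace-qualified tag syntax ('{namespace-uri}tag'). A small illustration; the
export-0.4 namespace URI is assumed here as the dump schema of that era:

    from xml.etree.cElementTree import fromstring

    xml_namespace = '{http://www.mediawiki.org/xml/export-0.4/}'

    revision = fromstring(
        '<revision xmlns="http://www.mediawiki.org/xml/export-0.4/">'
        '<comment>fixed a typo</comment>'
        '</revision>')

    comment = revision.find('%s%s' % (xml_namespace, 'comment'))
    if comment is not None and comment.text is not None:
        print(comment.text)   # fixed a typo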

Modified: trunk/tools/editor_trends/kaggle/training.py
===================================================================
--- trunk/tools/editor_trends/kaggle/training.py        2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/kaggle/training.py        2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -17,23 +17,35 @@
 __date__ = '2011-04-12'
 __version__ = '0.1'
 
+import os
+import sys
+import cPickle
 import codecs
-import os
 from datetime import datetime
-import json
+sys.path.append('../')
 
-location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction'
+from classes import storage
+
+location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution'
 files = os.listdir(location)
 files.reverse()
-dataset = codecs.open('training.tsv', 'w', 'utf-8')
+
+max_size = 2147483648
+max_size_reached = False
+
 t0 = datetime.now()
-max_size = 2147483648
 titles = {}
 ids = set()
+dates = {}
+edits = {}
+ignore_ids = set()
 size = 0
 cnt_obs = 0
-max_size_reached = False
+cutoff_date = datetime(2010, 8, 31)
 
+print 'Constructing training dataset...'
+db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
+dataset = codecs.open('training.tsv', 'w', 'utf-8')
 for filename in files:
     if not filename.startswith('comments') and not 
filename.startswith('articles'):
         fh = codecs.open(os.path.join(location, filename))
@@ -46,13 +58,25 @@
                 continue
             if line[10] == '1':
                 continue
+            timestamp = datetime.strptime(line[6], '%Y-%m-%dT%H:%M:%SZ')
+            if timestamp > cutoff_date:
+                continue
             username = line[3].lower()
-            if username.endswith('bot'):
+            if username.endswith('bot') or username.find('script') > -1:
                 #line[10] = '1'
                 continue
+            id = line[2]
+            if id not in ids and id not in ignore_ids:
+                res = db.find_one('editor', id)
+                if res == None:
+                    ignore_ids.add(id)
+                    continue
             cnt_obs += 1
             title_id = line[1]
-            ids.add(line[2])
+            ids.add(id)
+            simple_date = '%s-%s' % (timestamp.year, timestamp.month)
+            dates.setdefault(simple_date, 0)
+            dates[simple_date] += 1
             title = line.pop(5)
             titles[title_id] = title
             line.append('\n')
@@ -64,20 +88,54 @@
 
 dataset.close()
 
+print 'Constructing title dataset...'
 fh = codecs.open('titles.tsv', 'w', 'utf-8')
 for id, title in titles.iteritems():
     fh.write('%s\t%s\n' % (id, title.decode('utf-8')))
 fh.close()
 
-fh = codecs.open('ids.json', 'w', 'utf-8')
-json.dump(ids, fh)
-#for id in ids:
-#fh.write('%s\n' % (id.decode('utf-8')))
-#fh.write('%s\n' % (json.du)
+
+print 'Constructing solution dataset...'
+x = 0
+fh = codecs.open('solutions.tsv', 'w', 'utf-8')
+for id in ids:
+    if id not in ignore_ids:
+        obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns')
+        if obs != None:
+            x += 1
+            n = obs['cum_edit_count_main_ns']
+            fh.write('%s,%s\n' % (id.decode('utf-8'), n))
+            edits.setdefault(n, 0)
+            edits[n] += 1
+        else:
+            print id
 fh.close()
 
+print 'Storing date histogram'
+fh = open('histogram_dates.bin', 'wb')
+cPickle.dump(dates, fh)
+fh.close()
+
+
+fh = open('histogram_dates.tsv', 'w')
+for date, n in dates.iteritems():
+    fh.write('%s\t%s\n' % (date, n))
+fh.close()
+
+
+print 'Storing edit histogram'
+fh = open('histogram_edits.bin', 'wb')
+cPickle.dump(edits, fh)
+fh.close()
+
+fh = open('histogram_edits.tsv', 'w')
+for edit, n in edits.iteritems():
+    fh.write('%s\t%s\n' % (edit, n))
+fh.close()
+
+
 t1 = datetime.now()
-print 'Descriptives:\n'
-print 'Number of editors: %s' % len(ids)
+print 'Descriptives:'
+print 'Number of editors: %s' % x
 print 'Number of edits: %s' % cnt_obs
 print 'It took %s to construct the Kaggle training set' % (t1 - t0)
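
The training-set builder now drops observations after a cutoff date and keeps a
per-month histogram of the remaining edits before pickling it. A condensed sketch
of that bookkeeping; the timestamps are invented and pickle stands in for the
script's cPickle:

    import pickle
    from datetime import datetime

    cutoff_date = datetime(2010, 8, 31)
    timestamps = ['2010-07-01T12:00:00Z', '2010-07-15T08:30:00Z',
                  '2010-09-02T09:00:00Z']

    dates = {}
    for ts in timestamps:
        timestamp = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ')
        if timestamp > cutoff_date:
            continue                      # ignore edits after the cutoff
        simple_date = '%s-%s' % (timestamp.year, timestamp.month)
        dates.setdefault(simple_date, 0)
        dates[simple_date] += 1

    with open('histogram_dates.bin', 'wb') as fh:
        pickle.dump(dates, fh)

    print(dates)   # {'2010-7': 2}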

Modified: trunk/tools/editor_trends/manage.py
===================================================================
--- trunk/tools/editor_trends/manage.py 2011-05-30 21:07:43 UTC (rev 89188)
+++ trunk/tools/editor_trends/manage.py 2011-05-30 21:50:45 UTC (rev 89189)
@@ -30,6 +30,7 @@
 from classes import projects
 from classes import runtime_settings
 from utils import file_utils
+from utils import text_utils
 from utils import ordered_dict
 from utils import log
 from utils import timer
@@ -43,14 +44,113 @@
 from analyses import inventory
 
 
-def init_args_parser():
+
+def config_launcher(rts, logger):
     '''
+    Config launcher is used to (re)configure Wikilytics. 
+    '''
+
+    pc = projects.ProjectContainer()
+    if not os.path.exists('wiki.cfg') or rts.force:
+        config = ConfigParser.RawConfigParser()
+        project = None
+        language = None
+        db = None
+        valid_hostname = False
+        valid_storage = ['mongo', 'cassandra']
+        working_directory = raw_input('''Please indicate where you installed 
+        Wikilytics.\nCurrent location is %s\nPress Enter to accept 
default.\n''' % os.getcwd())
+
+        input_location = raw_input('''Please indicate where the Wikipedia dump 
+        files are or will be located.\nDefault is: %s\nPress Enter to 
+        accept default.\n''' % rts.input_location)
+
+        base_location = raw_input('''Please indicate where to store all 
+        Wikilytics project files.\nDefault is: %s\nPress Enter to accept 
+        default.\n''' % rts.base_location)
+
+        while db not in valid_storage:
+            db = raw_input('''Please indicate what database you are using for 
storage.\nDefault is: Mongo\n''')
+            db = 'mongo' if len(db) == 0 else db.lower()
+            if db not in valid_storage:
+                print 'Valid choices are: %s' % ','.join(valid_storage)
+
+        while project not in pc.projects.keys():
+            project = raw_input('''Please indicate which project you would 
like 
+            to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' % 
rts.project.full_name)
+            project = project if len(project) > 0 else rts.project.name
+            if project not in pc.projects.keys():
+                print 'Valid choices for a project are: %s' % 
','.join(pc.projects.keys())
+
+        while language not in rts.project.valid_languages:
+            language = raw_input('''Please indicate which language of project 
+            %s you would like to analyze.\nDefault is: %s\nPress Enter to 
accept 
+            default.\n''' % (rts.project.full_name, rts.language))
+            if len(language) == 0:
+                language = rts.language.code
+            language = language if language in rts.project.valid_languages \
+                else rts.language.default
+
+        while valid_hostname == False:
+            master = raw_input('''Please indicate the hostname master of your 
database 
+                cluster.\n Default is: %s\nPress Enter to accept default.\n''' 
% ('localhost'))
+            master = 'localhost' if len(master) == 0 else master
+            valid_hostname = text_utils.validate_hostname(master)
+
+        if master != 'localhost':
+            valid_hostname = False
+            while valid_hostname == False:
+                slaves = raw_input('''Please indicate the hostnames of your 
slaves 
+                    of your database cluster.Separate names using a 
comma.\n''')
+                slaves = slaves.split(',')
+                results = []
+                for slave in slaves:
+                    results.append(text_utils.validate_hostname(slave))
+                valid_hostname = True if all(results) else False
+
+        slaves = ','.join(slaves)
+        input_location = input_location if len(input_location) > 0 else \
+            rts.input_location
+        base_location = base_location if len(base_location) > 0 else \
+            rts.base_location
+        working_directory = working_directory if len(working_directory) > 0 \
+            else os.getcwd()
+
+        config = ConfigParser.RawConfigParser()
+        config.add_section('file_locations')
+        config.set('file_locations', 'working_directory', working_directory)
+        config.set('file_locations', 'input_location', input_location)
+        config.set('file_locations', 'base_location', base_location)
+        config.add_section('wiki')
+        config.set('wiki', 'project', project)
+        config.set('wiki', 'language', language)
+        config.add_section('storage')
+        config.set('storage', 'db', db)
+        config.add_section('cluster')
+        config.set('cluster', 'master', master)
+        config.set('cluster', 'slaves', slaves)
+
+        fh = file_utils.create_binary_filehandle(working_directory, 
'wiki.cfg', 'wb')
+        config.write(fh)
+        fh.close()
+
+        log.to_csv(logger, rts, 'New configuration', 'Creating',
+                       config_launcher,
+                       working_directory=working_directory,
+                       input_location=input_location,
+                       base_location=base_location,
+                       project=project,
+                       language=language,)
+
+
+def init_args_parser(language_code=None, project=None):
+    '''
     Entry point for parsing command line and launching the needed function(s).
     '''
-    language = languages.init()
-    project = projects.init()
+    language = languages.init(language_code)
+    project = projects.init(project)
     pjc = projects.ProjectContainer()
-    rts = runtime_settings.RunTimeSettings(project, language)
+    #rts = runtime_settings.RunTimeSettings(project, language)
 
     file_choices = {'meta-full': 'stub-meta-history.xml.gz',
                     'meta-current': 'stub-meta-current.xml.gz',
@@ -78,7 +178,7 @@
     parser_config.set_defaults(func=config_launcher)
     parser_config.add_argument('-f', '--force',
         action='store_true',
-        help='Reconfigure Editor Toolkit (this will replace wiki.cfg')
+        help='Reconfigure Wikilytics (this will replace wiki.cfg')
 
     #DOWNLOAD
     parser_download = subparsers.add_parser('download',
@@ -141,7 +241,7 @@
     parser_diff = subparsers.add_parser('diff',
         help='Create a Mongo collection containing the diffs between 
revisions.')
     parser_diff.set_defaults(func=diff_launcher)
-    
+
     #DJANGO
     parser_django = subparsers.add_parser('django')
     parser_django.add_argument('-e', '--except',
@@ -192,85 +292,9 @@
             %s' % ''.join([f + ',\n' for f in file_choices]),
         default=file_choices['meta-full'])
 
-    return project, language, parser
+    return parser
 
 
-def config_launcher(rts, logger):
-    '''
-    Config launcher is used to reconfigure editor trends toolkit. 
-    '''
-
-    pc = projects.ProjectContainer()
-    if not os.path.exists('wiki.cfg') or rts.force:
-        config = ConfigParser.RawConfigParser()
-        project = None
-        language = None
-        db = None
-        valid_storage = ['mongo', 'cassandra']
-        working_directory = raw_input('''Please indicate where you installed 
-        Wikilytics.\nCurrent location is %s\nPress Enter to accept 
default.\n''' % os.getcwd())
-
-        input_location = raw_input('''Please indicate where the Wikipedia dump 
-        files are or will be located.\nDefault is: %s\nPress Enter to 
-        accept default.\n''' % rts.input_location)
-
-        base_location = raw_input('''Please indicate where to store all 
-        Wikilytics project files.\nDefault is: %s\nPress Enter to accept 
-        default.\n''' % rts.base_location)
-
-        while db not in valid_storage:
-            db = raw_input('Please indicate what database you are using for 
storage. \nDefault is: Mongo\n')
-            db = 'mongo' if len(db) == 0 else db.lower()
-            if db not in valid_storage:
-                print 'Valid choices are: %s' % ','.join(valid_storage)
-
-        while project not in pc.projects.keys():
-            project = raw_input('''Please indicate which project you would 
like 
-            to analyze.\nDefault is: %s\nPress Enter to accept default.\n''' % 
rts.project.full_name)
-            project = project if len(project) > 0 else rts.project.name
-            if project not in pc.projects.keys():
-                print 'Valid choices for a project are: %s' % 
','.join(pc.projects.keys())
-
-        while language not in rts.project.valid_languages:
-            language = raw_input('''Please indicate which language of project 
-            %s you would like to analyze.\nDefault is: %s\nPress Enter to 
accept 
-            default.\n''' % (rts.project.full_name, rts.language))
-            if len(language) == 0:
-                language = rts.language.code
-            language = language if language in rts.project.valid_languages \
-                else rts.language.default
-
-        input_location = input_location if len(input_location) > 0 else \
-            rts.input_location
-        base_location = base_location if len(base_location) > 0 else \
-            rts.base_location
-        working_directory = working_directory if len(working_directory) > 0 \
-            else os.getcwd()
-
-        config = ConfigParser.RawConfigParser()
-        config.add_section('file_locations')
-        config.set('file_locations', 'working_directory', working_directory)
-        config.set('file_locations', 'input_location', input_location)
-        config.set('file_locations', 'base_location', base_location)
-        config.add_section('wiki')
-        config.set('wiki', 'project', project)
-        config.set('wiki', 'language', language)
-        config.add_section('storage')
-        config.set('storage', 'db', db)
-
-        fh = file_utils.create_binary_filehandle(working_directory, 
'wiki.cfg', 'wb')
-        config.write(fh)
-        fh.close()
-
-        log.to_csv(logger, rts, 'New configuration', 'Creating',
-                       config_launcher,
-                       working_directory=working_directory,
-                       input_location=input_location,
-                       base_location=base_location,
-                       project=project,
-                       language=language,)
-
-
 def downloader_launcher(rts, logger):
     '''
     This launcher calls the dump downloader to download a Wikimedia dump file.
@@ -343,7 +367,8 @@
     stopwatch = timer.Timer()
     log.to_db(rts, 'dataset', 'transform', stopwatch, event='start')
     log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher)
-    transformer.transform_editors_multi_launcher(rts)
+    #transformer.transform_editors_multi_launcher(rts)
+    transformer.transform_editors_single_launcher(rts)
     stopwatch.elapsed()
     log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish')
     log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher)
@@ -359,8 +384,8 @@
     log.to_db(rts, 'dataset', 'diff', stopwatch, event='finish')
     log.to_csv(logger, rts, 'Finish', 'Diff', diff_launcher)
 
-    
 
+
 def dataset_launcher(rts, logger):
     '''
     Dataset launcher is the entry point to generate datasets from the command
@@ -414,8 +439,11 @@
     '''
     This function initializes the command line parser. 
     '''
-    project, language, parser, = init_args_parser()
+    parser = init_args_parser()
     args = parser.parse_args()
+    language = languages.init()
+    project = projects.init()
+
     rts = runtime_settings.RunTimeSettings(project, language, args)
     #initialize logger
     logger = logging.getLogger('manager')
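
init_args_parser now returns only the parser; each sub-command binds its launcher
through set_defaults(func=...) and the entry point dispatches on the parsed
arguments. A minimal sketch of that argparse pattern; the two sub-commands and
launchers below are illustrative, not the full set that manage.py defines:

    import argparse

    def config_launcher(args):
        print('configuring, force=%s' % args.force)

    def download_launcher(args):
        print('downloading dump file')

    def init_args_parser():
        parser = argparse.ArgumentParser(prog='manage')
        subparsers = parser.add_subparsers(help='sub-commands')

        parser_config = subparsers.add_parser('config',
            help='(Re)configure Wikilytics.')
        parser_config.set_defaults(func=config_launcher)
        parser_config.add_argument('-f', '--force', action='store_true',
            help='Reconfigure Wikilytics (this will replace wiki.cfg)')

        parser_download = subparsers.add_parser('download',
            help='Download a Wikimedia dump file.')
        parser_download.set_defaults(func=download_launcher)

        return parser

    if __name__ == '__main__':
        parser = init_args_parser()
        args = parser.parse_args(['config', '--force'])
        args.func(args)   # dispatch to the launcher selected by the sub-command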

Modified: trunk/tools/editor_trends/statistics/stata/ppi.do
===================================================================
--- trunk/tools/editor_trends/statistics/stata/ppi.do   2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/statistics/stata/ppi.do   2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -1,5 +1,11 @@
 clear
 insheet using "C:\Users\diederik.vanliere\Desktop\ppi_quality.csv"
+
+gen diff_character_count =  character_count_a - character_count_b
+gen diff_cum_edit_count_main_ns =  cum_edit_count_main_ns_a- 
cum_edit_count_main_ns_b
+gen diff_cum_edit_count_other_ns =  cum_edit_count_other_ns_a- 
cum_edit_count_other_ns_b
+gen diff_article_count =  article_count_a- article_count_b
+
 label var character_count_a "PPI editor"
 label var character_count_b "Regular editor"
 

Modified: trunk/tools/editor_trends/utils/file_utils.py
===================================================================
--- trunk/tools/editor_trends/utils/file_utils.py       2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/utils/file_utils.py       2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -173,9 +173,10 @@
     '''Create a filehandle for text file with utf-8 encoding'''
     filename = str(filename)
     if not filename.endswith('.csv'):
-        filename = construct_filename(filename, '.csv')
+        if filename.find('.') == -1:
+            filename = construct_filename(filename, '.csv')
     path = os.path.join(location, filename)
-    return codecs.open(path, mode, encoding='utf-8')
+    return codecs.open(path, mode, encoding)
 
 
 def create_streaming_buffer(path):
@@ -189,7 +190,8 @@
         fh = subprocess.Popen('7z e -bd -so %s 2>/dev/null' % path, shell=True,
                               stdout=subprocess.PIPE, bufsize=65535).stdout
     elif extension == '.xml':
-        fh = create_txt_filehandle(path, None, 'r', 'utf-8')
+        location, filename = os.path.split(path)
+        fh = create_txt_filehandle(location, filename, 'r', 'utf-8')
     else:
         raise exceptions.CompressedFileNotSupported(extension)
     return fh
@@ -247,6 +249,7 @@
     os.utime(path, (mod_rem, mod_rem))
     #sraise exceptions.NotYetImplementedError(set_modified_data)
 
+
 def get_modified_date(location, filename):
     '''determine the date the file was originally created'''
     path = os.path.join(location, filename)

Modified: trunk/tools/editor_trends/utils/log.py
===================================================================
--- trunk/tools/editor_trends/utils/log.py      2011-05-30 21:07:43 UTC (rev 
89188)
+++ trunk/tools/editor_trends/utils/log.py      2011-05-30 21:50:45 UTC (rev 
89189)
@@ -31,11 +31,9 @@
 def to_db(rts, jobtype, task, timer, event='start'):
     db = storage.init_database(rts.storage, rts.dbname, 'jobs')
     created = datetime.datetime.now()
-    hash = '%s_%s' % (rts.project, rts.hash)
+    job = db.find_one('hash', rts.id)
 
-    job = db.find_one('hash', hash)
-
-    data = {'hash': hash,
+    data = {'hash': rts.id,
           'created': created,
           'jobtype': jobtype,
           'in_progress': True,
@@ -60,7 +58,7 @@
         t['start'] = timer.t0
         t['in_progress'] = True
         tasks[task] = t
-        db.update('hash', hash, {'$set': {'tasks': tasks}})
+        db.update('hash', rts.id, {'$set': {'tasks': tasks}})
         #coll.update({'hash': hash}, {'$set': {'tasks': tasks}})
     elif event == 'finish':
         t['finish'] = timer.t1
@@ -68,11 +66,11 @@
         tasks[task] = t
         if task == 'transform' or jobtype == 'chart':
             #final task, set entire task to finished
-            db.update('hash', hash, {'$set': {'tasks': tasks,
+            db.update('hash', rts.id, {'$set': {'tasks': tasks,
                                                  'in_progress': False,
                                                  'finished': True}})
         else:
-            db.update('hash', hash, {'$set': {'tasks': tasks}})
+            db.update('hash', rts.id, {'$set': {'tasks': tasks}})
 
 
 def to_csv(logger, settings, message, verb, function, **kwargs):

Modified: trunk/tools/editor_trends/utils/text_utils.py
===================================================================
--- trunk/tools/editor_trends/utils/text_utils.py       2011-05-30 21:07:43 UTC 
(rev 89188)
+++ trunk/tools/editor_trends/utils/text_utils.py       2011-05-30 21:50:45 UTC 
(rev 89189)
@@ -20,6 +20,7 @@
 import datetime
 import time
 import sys
+import re
 
 if '..' not in sys.path:
     sys.path.append('..')
@@ -52,6 +53,14 @@
     return dict([[v, k] for k, v in dictionary.items()])
 
 
+def validate_hostname(hostname):
+    regex_hostname = 
re.compile('^(?=.{1,255}$)[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?(?:\.[0-9A-Za-z](?:(?:[0-9A-Za-z]|\b-){0,61}[0-9A-Za-z])?)*\.?$')
+    res = re.match(regex_hostname, hostname)
+    if res == None:
+        return False
+    else:
+        return True
+
 def get_max_width(table, index):
     '''
     Get the maximum width of the given column index

