http://www.mediawiki.org/wiki/Special:Code/MediaWiki/76201
Revision: 76201
Author: diederik
Date: 2010-11-06 17:42:36 +0000 (Sat, 06 Nov 2010)
Log Message:
-----------
Various bugfixes
Modified Paths:
--------------
trunk/tools/editor_trends/config.py
trunk/tools/editor_trends/manage.py
trunk/tools/editor_trends/map_wiki_editors.py
trunk/tools/editor_trends/optimize_editors.py
trunk/tools/editor_trends/split_xml_file.py
trunk/tools/editor_trends/utils/process_constructor.py
trunk/tools/editor_trends/utils/utils.py
Modified: trunk/tools/editor_trends/config.py
===================================================================
--- trunk/tools/editor_trends/config.py 2010-11-06 17:35:15 UTC (rev 76200)
+++ trunk/tools/editor_trends/config.py 2010-11-06 17:42:36 UTC (rev 76201)
@@ -20,13 +20,14 @@
import os
import ConfigParser
-from _winreg import *
+
import settings
from utils import utils
def detect_windows_program(program):
+ from _winreg import *
entry = settings.WINDOWS_REGISTER[program]
try:
key = OpenKey(HKEY_CURRENT_USER, entry, 0, KEY_READ)
Modified: trunk/tools/editor_trends/manage.py
===================================================================
--- trunk/tools/editor_trends/manage.py 2010-11-06 17:35:15 UTC (rev 76200)
+++ trunk/tools/editor_trends/manage.py 2010-11-06 17:42:36 UTC (rev 76201)
@@ -22,8 +22,8 @@
import subprocess
from argparse import ArgumentParser
from argparse import RawTextHelpFormatter
+import locale
-
import progressbar
import settings
@@ -43,6 +43,11 @@
config.load_configuration(args)
+def determine_default_language():
+ language_code = locale.getdefaultlocale()[0]
+ return language_code.split('_')[0]
+
+
def retrieve_projectname(args):
language_code = retrieve_language(args)
if language_code == None:
@@ -53,13 +58,16 @@
if project == None:
print 'Entered project: %s is not valid Wikipedia project.' %
get_value(args, 'project')
sys.exit(-1)
+ if project == 'commonswiki':
+ return project
+ else:
+ return '%s%s' % (language_code, project)
- return '%s%s' % (language_code, project)
def retrieve_language(args):
language = get_value(args, 'language')
language = language.title()
- return languages.MAPPING.get(language, None)
+ return languages.MAPPING.get(language, 'en')
def retrieve_project(args):
@@ -75,13 +83,24 @@
def determine_file_locations(args):
locations = {}
+ location = get_value(args, 'location') if get_value(args, 'location') !=
None else settings.XML_FILE_LOCATION
locations['language_code'] = retrieve_language(args)
- locations['location'] = os.path.join(get_value(args, 'location'),
retrieve_language(args))
+ locations['location'] = os.path.join(location, retrieve_language(args))
locations['project'] = retrieve_projectname(args)
locations['filename'] = generate_wikidump_filename(args)
return locations
+def show_settings(args, location, filename, project, language_code):
+ project = settings.WIKIMEDIA_PROJECTS.get(project, 'wiki')
+ project = project.title()
+ language_map = utils.invert_dict(languages.MAPPING)
+ print 'Project: %s' % (project)
+ print 'Language: %s' % language_map[language_code]
+ print 'Input directory: %s' % location
+ print 'Output directory: TODO'
+
+
def dump_downloader_launcher(args, location, filename, project, language_code):
print 'dump downloader'
pbar = get_value(args, 'progress')
@@ -113,8 +132,8 @@
path = config.detect_installed_program('7zip')
source = os.path.join(location, file)
- retcode = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' %
location, '%s' % (source,)])
- return retcode
+ p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location,
'%s' % (source,)])
+ return p
def mongodb_script_launcher(args, location, filename, project, language_code):
@@ -153,6 +172,7 @@
def main():
+ default_language = determine_default_language()
file_choices = ('stub-meta-history.xml.gz',
'stub-meta-current.xml.gz',
'pages-meta-history.xml.7z',
@@ -188,7 +208,7 @@
parser.add_argument('-l', '--language', action='store',
help='Example of valid languages.',
choices=supported_languages(),
- default='Russian')
+ default=default_language)
parser.add_argument('-p', '--project', action='store',
help='Specify the Wikimedia project that you would
like to download',
@@ -210,6 +230,7 @@
args = parser.parse_args()
config.load_configuration(args)
locations = determine_file_locations(args)
+ show_settings(args, **locations)
args.func(args, **locations)
Modified: trunk/tools/editor_trends/map_wiki_editors.py
===================================================================
--- trunk/tools/editor_trends/map_wiki_editors.py 2010-11-06 17:35:15 UTC (rev 76200)
+++ trunk/tools/editor_trends/map_wiki_editors.py 2010-11-06 17:42:36 UTC (rev 76201)
@@ -244,6 +244,26 @@
print 'Time elapsed: %s and processed %s items.' %
(datetime.datetime.now() - editor_cache.init_time, editor_cache.cumulative_n)
+def load_cache_objects():
+ cache = {}
+ files = utils.retrieve_file_list(settings.BINARY_OBJECT_FILE_LOCATION,
'.bin')
+ for x, file in enumerate(files):
+ cache[x] = utils.load_object(settings.BINARY_OBJECT_FILE_LOCATION,
file)
+ return cache
+
+
+def search_cache_for_missed_editors(dbname):
+ mongo = db.init_mongo_db(dbname)
+ collection = mongo['editors']
+ editor_cache = cache.EditorCache(collection)
+ cache = load_cache_objects()
+ for c in cache:
+ for editor in cache[c]:
+ editor_cache.add(editor, cache[c][editor])
+ cache[c] = {}
+ editor_cache.add('NEXT', '')
+
+
def load_bot_ids():
'''
Loader function to retrieve list of id's of known Wikipedia bots.
@@ -267,7 +287,6 @@
'language': language,
}
chunks = {}
- #file_location = os.path.join(settings.XML_FILE_LOCATION, language)
files = utils.retrieve_file_list(location, 'xml')
parts = int(round(float(len(files)) / settings.NUMBER_OF_PROCESSES, 0))
a = 0
@@ -277,12 +296,14 @@
a = (x + 1) * parts
pc.build_scaffolding(pc.load_queue, parse_editors, chunks, store_editors,
True, **kwargs)
+ search_cache_for_missed_editors(dbname)
def debug_parse_editors(dbname):
q = JoinableQueue()
parse_editors('en\\522.xml', q, None, None, True)
store_editors(q, [], dbname)
+ search_cache_for_missed_editors(dbname)
if __name__ == "__main__":
Modified: trunk/tools/editor_trends/optimize_editors.py
===================================================================
--- trunk/tools/editor_trends/optimize_editors.py 2010-11-06 17:35:15 UTC (rev 76200)
+++ trunk/tools/editor_trends/optimize_editors.py 2010-11-06 17:42:36 UTC (rev 76201)
@@ -17,11 +17,15 @@
__date__ = '2010-11-02'
__version__ = '0.1'
+from multiprocessing import Queue
+from Queue import Empty
+from operator import itemgetter
+import datetime
-
import settings
from database import db
from utils import process_constructor as pc
+import construct_datasets
def create_datacontainer(init_value=0):
@@ -37,7 +41,7 @@
data[str(x)] = init_value
return data
-
+
def determine_edits_by_year(dates):
'''
This function counts the number of edits by year made by a particular
editor.
@@ -87,7 +91,7 @@
output.insert({'editor': id, 'edits': edits,
'edits_by_year': edits_by_year,
- 'year_joined': year,
+ 'year_joined': new_wikipedian,
'edit_count': edit_count,
'final_edit': final_edit,
'first_edit': first_edit,
@@ -101,20 +105,31 @@
kwargs = {'definition': 'traditional',
'pbar': True,
'dbname': 'enwiki',
- 'nr_input_processors': 2,
+ 'nr_input_processors': 1,
'nr_output_processors': 0,
}
- pc.build_scaffolding(pc.load_queue, optimize_editors, ids, False, False,
**kwargs)
+ chunks = {}
+ parts = int(round(float(len(ids)) / 1, 0))
+ a = 0
+ for x in xrange(settings.NUMBER_OF_PROCESSES):
+ b = a + parts
+ chunks[x] = ids[a:b]
+ a = (x + 1) * parts
+ if a >= len(ids):
+ break
+ pc.build_scaffolding(pc.load_queue, optimize_editors, chunks, False,
False, **kwargs)
+
def debug_optimize_editors(dbname):
ids = construct_datasets.retrieve_editor_ids_mongo(dbname, 'editors')
q = pc.load_queue(ids)
kwargs = {'definition': 'traditional',
- 'dbname': 'enwiki'
+ 'dbname': dbname
}
optimize_editors(q, False, True, kwargs)
if __name__ == '__main__':
- run_optimize_editors('enwiki')
\ No newline at end of file
+ debug_optimize_editors('test')
+ #run_optimize_editors('test')
Modified: trunk/tools/editor_trends/split_xml_file.py
===================================================================
--- trunk/tools/editor_trends/split_xml_file.py 2010-11-06 17:35:15 UTC (rev 76200)
+++ trunk/tools/editor_trends/split_xml_file.py 2010-11-06 17:42:36 UTC (rev 76201)
@@ -172,7 +172,7 @@
#elem = parse_comments(elem,
remove_ascii_control_characters)
#print cElementTree.tostring(elem)
except SyntaxError:
- fh = utils.create_txt_filehandle(ERROR_MESSAGE_FILE_LOCATION,
'split_xml', 'w', settings.ENCODING)
+ fh = utils.create_txt_filehandle(settings.ERROR_MESSAGE_FILE_LOCATION,
'split_xml', 'w', settings.ENCODING)
fh.write(cElementTree.tostring(elem))
fh.close()
Modified: trunk/tools/editor_trends/utils/process_constructor.py
===================================================================
--- trunk/tools/editor_trends/utils/process_constructor.py 2010-11-06 17:35:15 UTC (rev 76200)
+++ trunk/tools/editor_trends/utils/process_constructor.py 2010-11-06 17:42:36 UTC (rev 76201)
@@ -57,14 +57,16 @@
nr_output_processors = kwargs.pop('nr_output_processors')
input_queues = {}
result_queues = {}
- assert len(obj) == nr_input_processors
- if result_queue:
- assert len(obj)== nr_output_processors
+ #assert len(obj) == nr_input_processors
+ #if result_queue:
+ # assert len(obj)== nr_output_processors
for i, o in enumerate(obj):
input_queues[i] = load_input_queue(obj[o], poison_pill=True)
if result_queue:
result_queues[i] = JoinableQueue()
+ else:
+ result_queues[i] = False
if settings.PROGRESS_BAR:
size = sum([input_queues[q].qsize() for q in input_queues])
Modified: trunk/tools/editor_trends/utils/utils.py
===================================================================
--- trunk/tools/editor_trends/utils/utils.py 2010-11-06 17:35:15 UTC (rev 76200)
+++ trunk/tools/editor_trends/utils/utils.py 2010-11-06 17:42:36 UTC (rev 76201)
@@ -32,6 +32,7 @@
import ctypes
import settings
+import exceptions
try:
@@ -160,6 +161,7 @@
else:
return 'wb'
+
def write_list_to_csv(data, fh, recursive=False):
if recursive:
recursive = False
@@ -170,6 +172,7 @@
fh.write('%s\t' % d)
if recursive:
return True
+
def write_dict_to_csv(data, fh):
keys = data.keys()
@@ -225,7 +228,7 @@
if is_exe(exe_file):
return exe_file
- return None
+ raise exceptions.FileNotFoundException(program)
def store_object(object, location, filename):
@@ -254,6 +257,15 @@
return string
+def invert_dict(dictionary):
+ '''
+ @dictionary is a simple dictionary containing simple values, ie. no lists,
+ or other dictionaries
+ output: dictionary where key and value are swapped.
+ '''
+ return dict([[v,k] for k,v in dictionary.items()])
+
+
def create_dict_from_csv_file(filename, encoding):
d = {}
for line in read_data_from_csv(filename, encoding):
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs