http://www.mediawiki.org/wiki/Special:Code/MediaWiki/76211
Revision: 76211
Author: diederik
Date: 2010-11-06 19:37:32 +0000 (Sat, 06 Nov 2010)
Log Message:
-----------
Fixes include:
* utf8 support on console if proper fonts are installed
* separation of concerns
Modified Paths:
--------------
trunk/tools/editor_trends/construct_datasets.py
trunk/tools/editor_trends/manage.py
trunk/tools/editor_trends/map_wiki_editors.py
trunk/tools/editor_trends/optimize_editors.py
trunk/tools/editor_trends/utils/utils.py
Modified: trunk/tools/editor_trends/construct_datasets.py
===================================================================
--- trunk/tools/editor_trends/construct_datasets.py 2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/construct_datasets.py 2010-11-06 19:37:32 UTC (rev 76211)
@@ -134,33 +134,41 @@
input_queue = pc.load_queue(ids)
q = Queue()
generate_editor_dataset(input_queue, q, False, kwargs)
- #generate_editor_dataset_launcher()
- #retrieve_list_contributors()
- #retrieve_edits_by_contributor()
-def generate_editor_dataset_launcher():
+
+def generate_editor_dataset_launcher(dbname):
kwargs = {'nr_input_processors': 1,
'nr_output_processors': 1,
'debug': False,
- 'dbname': 'enwiki',
+ 'dbname': dbname,
}
- ids = retrieve_editor_ids_mongo('enwiki', 'editors')
- pc.build_scaffolding(pc.load_queue, generate_editor_dataset, ids, False,
False, **kwargs)
+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
+ chunks = {}
+ parts = int(round(float(len(ids)) / 1, 0))
+ a = 0
+ for x in xrange(settings.NUMBER_OF_PROCESSES):
+ b = a + parts
+ chunks[x] = ids[a:b]
+ a = (x + 1) * parts
+ if a >= len(ids):
+ break
+
+ pc.build_scaffolding(pc.load_queue, generate_editor_dataset, chunks,
False, False, **kwargs)
-def generate_editor_dataset_debug():
- ids = retrieve_editor_ids_mongo('enwiki', 'editors')
+def generate_editor_dataset_debug(dbname):
+ ids = retrieve_editor_ids_mongo(dbname, 'editors')
input_queue = pc.load_queue(ids)
#write_dataset(input_queue, [], 'enwiki')
kwargs = {'nr_input_processors': 1,
'nr_output_processors': 1,
'debug': True,
- 'dbname': 'enwiki',
+ 'dbname': dbname,
}
generate_editor_dataset(input_queue, False, False, kwargs)
if __name__ == '__main__':
- #generate_editor_dataset_debug()
- generate_editor_dataset_launcher()
+ #generate_editor_dataset_debug('test')
+ generate_editor_dataset_launcher('test')
#debug_retrieve_edits_by_contributor_launcher()
Modified: trunk/tools/editor_trends/manage.py
===================================================================
--- trunk/tools/editor_trends/manage.py 2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/manage.py 2010-11-06 19:37:32 UTC (rev 76211)
@@ -32,6 +32,8 @@
from utils import dump_downloader
import split_xml_file
import map_wiki_editors
+import optimize_editors
+import construct_datasets
import config
@@ -96,7 +98,7 @@
project = project.title()
language_map = utils.invert_dict(languages.MAPPING)
print 'Project: %s' % (project)
- print 'Language: %s' % language_map[language_code]
+ print 'Language: %s' % language_map[language_code].decode('utf-8')
print 'Input directory: %s' % location
print 'Output directory: TODO'
@@ -130,23 +132,37 @@
def extract_xml_file(args, location, file):
path = config.detect_installed_program('7zip')
-
source = os.path.join(location, file)
- p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' % location,
'%s' % (source,)])
+ p = None
+
+ if settings.OS == 'Windows':
+ p = subprocess.Popen(['%s%s' % (path, '7z.exe'), 'e', '-o%s\\' %
location, '%s' % (source,)], shell=True).wait()
+ elif settings.OS == 'Linux':
+ raise NotImplementedError
+ elif settings.OS == 'OSX':
+ raise NotImplementedError
+ else:
+ raise exceptions.PlatformNotSupportedError
return p
def mongodb_script_launcher(args, location, filename, project, language_code):
print 'mongodb_script_launcher'
map_wiki_editors.run_parse_editors(project, language_code, location)
- #print args
+def dataset_launcher(args, project):
+ print 'dataset launcher'
+ optimize_editors.run_optimize_editors(project)
+ construct_datasets.generate_editor_dataset_launcher(project)
+
+
def all_launcher(args, location, filename, project, language_code):
print 'all_launcher'
dump_downloader_launcher(args, location, filename, project, language_code)
split_xml_file_launcher(args, location, filename, project, language_code)
mongodb_script_launcher(args, location, filename, project, language_code)
+ dataset_launcher(args, location, filename, project, language_code)
def supported_languages():
@@ -165,23 +181,30 @@
languages.append(choice)
languages.sort()
for language in languages:
- if first == None:
+ try:
+ if first != None and language.startswith(first):
+ print '%s' % language.decode('utf-8')
+ elif first == None:
+ print '%s' % language.decode('utf-8')
+ except UnicodeEncodeError:
print '%s' % language
- elif first != None and language.startswith(first):
- print '%s' % language
+def about():
+ print 'Editor Trends Software is (c) 2010 by the Wikimedia Foundation.'
+ print 'Written by Diederik van Liere ([email protected]).'
+ print 'This software comes with ABSOLUTELY NO WARRANTY. This is free
software, and you are welcome to distribute it under certain conditions.'
+ print 'See the README.1ST file for more information.'
+ print ''
+
def main():
default_language = determine_default_language()
file_choices = ('stub-meta-history.xml.gz',
- 'stub-meta-current.xml.gz',
- 'pages-meta-history.xml.7z',
- 'pages-meta-current.xml.bz2')
+ 'stub-meta-current.xml.gz',
+ 'pages-meta-history.xml.7z',
+ 'pages-meta-current.xml.bz2')
parser = ArgumentParser(prog='manage',
formatter_class=RawTextHelpFormatter)
- #group = parser.add_mutually_exclusive_group()
- #group.add_argument('show_languages', action='store')
- #group.add_argument('language', action='store')
subparsers = parser.add_subparsers(help='sub-command help')
parser_languages = subparsers.add_parser('show_languages', help='Overview
of all valid languages.')
@@ -202,6 +225,9 @@
parser_create = subparsers.add_parser('store', help='The store sub command
parsers the XML chunk files, extracts the information and stores it in a
MongoDB.')
parser_create.set_defaults(func=mongodb_script_launcher)
+ parser_dataset = subparsers.add_parser('dataset', help='Create a dataset
from the MongoDB and write it to a csv file.')
+ parser_dataset.set_defaults(func=dataset_launcher)
+
parser_all = subparsers.add_parser('all', help='The all sub command runs
the download, split, store and dataset commands.\n\nWARNING: THIS COULD TAKE
DAYS DEPENDING ON THE CONFIGURATION OF YOUR MACHINE AND THE SIZE OF THE
WIKIMEDIA DUMP FILE.')
parser_all.set_defaults(func=all_launcher)
@@ -230,6 +256,7 @@
args = parser.parse_args()
config.load_configuration(args)
locations = determine_file_locations(args)
+ about()
show_settings(args, **locations)
args.func(args, **locations)
Modified: trunk/tools/editor_trends/map_wiki_editors.py
===================================================================
--- trunk/tools/editor_trends/map_wiki_editors.py 2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/map_wiki_editors.py 2010-11-06 19:37:32 UTC (rev 76211)
@@ -135,7 +135,7 @@
if settings.DEBUG:
messages = {}
vars = {}
-
+
while True:
try:
if debug:
@@ -261,7 +261,9 @@
for editor in cache[c]:
editor_cache.add(editor, cache[c][editor])
cache[c] = {}
- editor_cache.add('NEXT', '')
+ editor_cache.add('NEXT', '')
+ cache = {}
+
def load_bot_ids():
Modified: trunk/tools/editor_trends/optimize_editors.py
===================================================================
--- trunk/tools/editor_trends/optimize_editors.py 2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/optimize_editors.py 2010-11-06 19:37:32 UTC (rev 76211)
@@ -82,7 +82,7 @@
edits = editor['edits']
edits = sorted(edits, key=itemgetter('date'))
edit_count = len(edits)
- new_wikipedian = edits[9]['date'].year
+ new_wikipedian = edits[9]['date']
first_edit = edits[0]['date']
final_edit = edits[-1]['date']
edits_by_year = determine_edits_by_year(edits)
@@ -91,7 +91,7 @@
output.insert({'editor': id, 'edits': edits,
'edits_by_year': edits_by_year,
- 'year_joined': new_wikipedian,
+ 'new_wikipedian': new_wikipedian,
'edit_count': edit_count,
'final_edit': final_edit,
'first_edit': first_edit,
Modified: trunk/tools/editor_trends/utils/utils.py
===================================================================
--- trunk/tools/editor_trends/utils/utils.py 2010-11-06 19:22:16 UTC (rev 76210)
+++ trunk/tools/editor_trends/utils/utils.py 2010-11-06 19:37:32 UTC (rev 76211)
@@ -276,15 +276,22 @@
return d
-def retrieve_file_list(location, extension):
+def retrieve_file_list(location, extension, mask=''):
+ '''
+ Retrieve a list of files from a specified location.
+ @location: either an absolute or relative path
+ @extension: only include files with extension (optional)
+ @mask: only include files that start with mask (optional)
+
+ @return: a list of files matching the criteria
+ '''
all_files = os.listdir(location)
if not extension.startswith('.'):
extension = '.' + extension
files = []
for file in all_files:
- if file.endswith(extension):
+ if file.startswith(mask) and file.endswith(extension):
files.append(file)
-
return files
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs