http://www.mediawiki.org/wiki/Special:Code/MediaWiki/89242
Revision: 89242
Author: diederik
Date: 2011-06-01 00:02:16 +0000 (Wed, 01 Jun 2011)
Log Message:
-----------
Preparing for Summer of Research, part 2
Modified Paths:
--------------
trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
trunk/tools/editor_trends/analyses/analyzer.py
trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py
trunk/tools/editor_trends/classes/analytics.py
trunk/tools/editor_trends/classes/buffer.py
trunk/tools/editor_trends/classes/dataset.py
trunk/tools/editor_trends/classes/runtime_settings.py
trunk/tools/editor_trends/classes/storage.py
trunk/tools/editor_trends/etl/differ.py
trunk/tools/editor_trends/etl/downloader.py
trunk/tools/editor_trends/etl/extracter.py
trunk/tools/editor_trends/etl/store.py
trunk/tools/editor_trends/etl/variables.py
trunk/tools/editor_trends/kaggle/training.py
trunk/tools/editor_trends/manage.py
trunk/tools/editor_trends/utils/log.py
Modified: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -57,7 +57,7 @@
def retrieve_variables(obs, username, date):
- data = db.find_one('username', username)
+ data = db.find_one({'username': username})
year = str(date.year)
month = str(date.month)
if data:
Modified: trunk/tools/editor_trends/analyses/analyzer.py
===================================================================
--- trunk/tools/editor_trends/analyses/analyzer.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/analyses/analyzer.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -24,6 +24,7 @@
import types
import sys
import cPickle
+import pymongo
import gc
import os
import progressbar
@@ -65,7 +66,7 @@
def feedback(plugin, rts):
print 'Exporting data for chart: %s' % plugin
print 'Project: %s' % rts.dbname
- print 'Dataset: %s' % rts.editors_dataset
+ print 'Dataset: %s' % rts.collection
def write_output(ds, rts, stopwatch):
@@ -97,7 +98,8 @@
plugin = retrieve_plugin(func)
if not plugin:
- raise exceptions.UnknownPluginError(plugin, self.available_plugins)
+ available_plugins = inventory.available_analyses()
+ raise exceptions.UnknownPluginError(plugin, available_plugins)
plugin = getattr(plugin, func)
feedback(func, rts)
@@ -110,15 +112,16 @@
obs = dict()
obs_proxy = mgr.dict(obs)
- db = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
+ db = storage.init_database(rts.storage, rts.dbname, rts.collection)
editors = db.retrieve_distinct_keys('editor')
#editors = editors[:500]
- min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
+ if rts.collection.find('editors_dataset') > -1:
+ min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
+ kwargs['min_year'] = min_year
+ kwargs['max_year'] = max_year
fmt = kwargs.pop('format', 'long')
time_unit = kwargs.pop('time_unit', 'year')
- kwargs['min_year'] = min_year
- kwargs['max_year'] = max_year
var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)
@@ -153,24 +156,21 @@
ppills = cpu_count()
- while True:
- while ppills > 0:
- try:
- res = result.get()
- if res == True:
- pbar.update(pbar.currval + 1)
- else:
- ppills -= 1
- var = res
- print ppills
- except Empty:
- pass
- break
- print 'Waiting for tasks...'
+ while ppills > 0:
+ try:
+ res = result.get()
+ if res == True:
+ pbar.update(pbar.currval + 1)
+ else:
+ ppills -= 1
+ var = res
+ except Empty:
+ pass
+
tasks.join()
var = reconstruct_observations(var)
- ds = dataset.Dataset(plugin.func_name, rts, format=fmt, **kwargs)
+ ds = dataset.Dataset(func, rts, format=fmt, **kwargs)
ds.add_variable(var)
stopwatch.elapsed()
@@ -178,8 +178,8 @@
ds.summary()
- for n, c in get_refcounts()[:100]:
- print '%10d %s' % (n, c.__name__)
+ #for n, c in get_refcounts()[:100]:
+ # print '%10d %s' % (n, c.__name__)
def get_refcounts():
@@ -205,9 +205,12 @@
Determine the first and final year for the observed data
'''
try:
- obs = db.find(var, qualifier='max')
+ conditions = {var : {'$ne' : False}}
+
+ obs = db.find(conditions).sort(var, pymongo.ASCENDING).limit(1)[0]
max_year = obs[var].year + 1
- obs = db.find(var, qualifier='min')
+
+ obs = db.find(conditions).sort(var, pymongo.DESCENDING).limit(1)[0]
min_year = obs[var].year
except KeyError:
min_year = 2001
Modified: trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -71,4 +71,4 @@
cursor = db.find('category', 'List')
for c in cursor:
data[c['id']] = 1
- return data
+ return data, rts
Modified: trunk/tools/editor_trends/classes/analytics.py
===================================================================
--- trunk/tools/editor_trends/classes/analytics.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/analytics.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -64,10 +64,8 @@
def __call__(self):
project = 'wiki'
- #rts = runtime_settings.init_environment('wiki', 'en', args)
for lang in self.languages:
self.rts = runtime_settings.init_environment(project, lang,
self.args)
- #self.rts.editors_dataset = 'editors_dataset'
self.rts.dbname = '%s%s' % (lang, project)
for cum_cutoff in self.cum_cutoff:
@@ -91,15 +89,16 @@
Generic loop function that loops over all the editors of a Wikipedia
project and then calls the plugin that does the actual mapping.
'''
- db = storage.init_database(self.rts.storage, self.rts.dbname,
self.rts.editors_dataset)
+ db = storage.init_database(self.rts.storage, self.rts.dbname,
self.rts.collection)
while True:
try:
editor_id = self.tasks.get(block=False)
+ self.tasks.task_done()
if editor_id == None:
self.result.put(self.var)
break
- editor = db.find_one('editor', editor_id)
- self.plugin(self.var, editor, dbname=self.rts.dbname,
data=self.data)
+ editor = db.find_one({'editor': editor_id})
+ self.plugin(self.var, editor, rts=self.rts, data=self.data)
self.result.put(True)
except Empty:
pass
Modified: trunk/tools/editor_trends/classes/buffer.py
===================================================================
--- trunk/tools/editor_trends/classes/buffer.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/buffer.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -124,7 +124,9 @@
def simplify(self, revision):
row = []
for key in self.keys:
- row.append(revision[key].decode('utf-8'))
+ value = revision.get(key, None)
+ if value != None:
+ row.append(value.decode('utf-8'))
return row
def stringify(self, revision):
Modified: trunk/tools/editor_trends/classes/dataset.py
===================================================================
--- trunk/tools/editor_trends/classes/dataset.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/dataset.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -176,6 +176,7 @@
#self.date = date
self.data = 0
self.time_unit = time_unit
+ self.date = date
self.t1, self.t0 = self.set_date_range(date)
self.id = id
self.props = []
@@ -515,7 +516,7 @@
variable.max = get_max(data)
variable.num_obs = variable.number_of_obs()
variable.num_dates = len(variable)
- #variable.first_obs, variable.last_obs = variable.get_date_range()
+ variable.first_obs, variable.last_obs = variable.get_date_range()
def summary(self):
'''
Modified: trunk/tools/editor_trends/classes/runtime_settings.py
===================================================================
--- trunk/tools/editor_trends/classes/runtime_settings.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/runtime_settings.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -50,53 +50,57 @@
self.project = project
self.language = language
self.dbname = 'wikilytics'
+ self.file_choices = {'meta-full': 'stub-meta-history.xml.gz',
+ 'meta-current': 'stub-meta-current.xml.gz',
+ 'history-full': 'pages-meta-history.xml.7z',
+ 'history-current': 'pages-meta-current.xml.bz2'
+ }
+ if args:
+ self.args = args
+ self.id = '%s%s_%s' % (self.language.code, self.project.name,
'current_month')
+ #print self.settings.input_location
+ #print self.get_value('location')
+ self.project = self.update_project_settings()
+ self.language = self.update_language_settings()
- #if args:
- self.args = args
- self.id = '%s%s_%s' % (self.language.code, self.project.name,
'current_month')
- #print self.settings.input_location
- #print self.get_value('location')
- self.project = self.update_project_settings()
- self.language = self.update_language_settings()
+ self.input_location = self.set_input_location()
+ self.output_location = self.set_output_location()
- self.input_location = self.set_input_location()
- self.output_location = self.set_output_location()
+ self.plugins = self.set_plugin()
+ self.keywords = self.split_keywords()
+ self.namespaces = self.get_namespaces()
- self.plugins = self.set_plugin()
- self.keywords = self.split_keywords()
- self.namespaces = self.get_namespaces()
+ #self.kaggle = self.get_value('kaggle')
+ self.function = self.get_value('func')
+ self.ignore = self.get_value('except')
+ self.force = self.get_value('force')
+ self.analyzer_collection = self.get_value('collection')
- #self.kaggle = self.get_value('kaggle')
- self.function = self.get_value('func')
- self.ignore = self.get_value('except')
- self.force = self.get_value('force')
- self.analyzer_collection = self.get_value('collection')
+ self.dataset = os.path.join(self.dataset_location,
self.project.name)
+ self.txt = os.path.join(self.output_location, 'txt')
+ self.sorted = os.path.join(self.output_location, 'sorted')
+ self.diffs = os.path.join(self.output_location, 'diffs')
- self.dataset = os.path.join(self.dataset_location, self.project.name)
- self.txt = os.path.join(self.output_location, 'txt')
- self.sorted = os.path.join(self.output_location, 'sorted')
- self.diffs = os.path.join(self.output_location, 'diffs')
+ self.directories = [self.output_location,
+ self.txt,
+ self.sorted,
+ self.dataset,
+ self.diffs]
+ self.verify_environment(self.directories)
- self.directories = [self.output_location,
- self.txt,
- self.sorted,
- self.dataset,
- self.diffs]
- self.verify_environment(self.directories)
+ #Wikidump file related variables
+ self.dump_filename = self.generate_wikidump_filename()
+ self.dump_relative_path = self.set_dump_path()
+ self.dump_absolute_path = self.set_dump_path(absolute=True)
- #Wikidump file related variables
- self.dump_filename = self.generate_wikidump_filename()
- self.dump_relative_path = self.set_dump_path()
- self.dump_absolute_path = self.set_dump_path(absolute=True)
+ #Collection names
+ self.editors_raw = '%s%s_editors_raw' % (self.language.code,
self.project.name)
+ self.editors_dataset = '%s%s_editors_dataset' %
(self.language.code, self.project.name)
+ self.articles_raw = '%s%s_articles_raw' % (self.language.code,
self.project.name)
+ self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code,
self.project.name)
+ self.collection = self.set_collection()
- #Collection names
- self.editors_raw = '%s%s_editors_raw' % (self.language.code,
self.project.name)
- self.editors_dataset = '%s%s_editors_dataset' % (self.language.code,
self.project.name)
- self.articles_raw = '%s%s_articles_raw' % (self.language.code,
self.project.name)
- self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code,
self.project.name)
-
-
def __str__(self):
return 'Runtime Settings for project %s %s' % (self.language.name,
self.project.full_name)
@@ -105,14 +109,8 @@
for item in self.__dict__:
yield item
- def dict(self):
- '''
- Return a dictionary with all properties and their values
- '''
- props = {}
- for prop in self:
- props[prop] = getattr(self, prop)
- return props
+ def set_collection(self):
+ return getattr(self, self.get_value('collection'))
def split_keywords(self):
'''
@@ -141,7 +139,7 @@
'''
plugin = self.get_value('charts')
requested_plugins = []
- if plugin != None and isinstance(plugin, type('module')) == False:
+ if plugin != None:
plugins = plugin.split(',')
available_plugins = inventory.available_analyses()
for plugin in plugins:
@@ -220,8 +218,9 @@
'''
Generate the main name of the wikidump file to be downloaded.
'''
+ choice = self.get_value('file')
return '%s%s-latest-%s' % (self.language.code, self.project.name,
- self.get_value('file'))
+ self.file_choices[choice])
def update_language_settings(self):
'''
Modified: trunk/tools/editor_trends/classes/storage.py
===================================================================
--- trunk/tools/editor_trends/classes/storage.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/storage.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -160,29 +160,22 @@
assert isinstance(data, dict), 'You need to feed me dictionaries.'
self.db[self.collection].update({key: value}, {'$set': data})
- def find(self, key=None, qualifier=None):
- if qualifier == 'min':
- return self.db[self.collection].find({
- key : {'$ne' : False}}).sort(key,
pymongo.ASCENDING).limit(1)[0]
- elif qualifier == 'max':
- return self.db[self.collection].find({
- key : {'$ne' : False}}).sort(key,
pymongo.DESCENDING).limit(1)[0]
- elif qualifier:
- return self.db[self.collection].find({key : qualifier})
- elif key != None:
- return self.db[self.collection].find({}, fields=[key])
+ def find(self, conditions, vars=None):
+ if conditions:
+ return self.db[self.collection].find(conditions, fields=vars)
else:
return self.db[self.collection].find()
- def find_one(self, key, value, vars=None):
+ def find_one(self, conditions, vars=None):
if vars:
#if you only want to retrieve a specific variable(s) then you need
to
#specify vars, if vars is None then you will get the entire BSON
object
vars = vars.split(',')
vars = dict([(var, 1) for var in vars])
- return self.db[self.collection].find_one({key: value}, vars)
+ return self.db[self.collection].find_one(conditions, vars)
else:
- return self.db[self.collection].find_one({key: value})
+ #conditions should be a dictionary
+ return self.db[self.collection].find_one(conditions)
def drop_collection(self):
Modified: trunk/tools/editor_trends/etl/differ.py
===================================================================
--- trunk/tools/editor_trends/etl/differ.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/differ.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -213,7 +213,7 @@
def store_json_diffs(rts):
files = os.listdir(rts.diffs)
- print files, rts.diffs
+ #print files, rts.diffs
db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
buffer = cStringIO.StringIO()
@@ -226,12 +226,10 @@
obj = json.loads(obj)
obj[0]['article_id'] = int(obj[0]['article_id'])
for key, value in obj[0].iteritems():
- if type(value) == type(dict()):
- value['timestamp'] =
datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S')
+ if key == 'timestamp':
+ value = datetime.strptime(value,
'%Y-%m-%dT%H:%M:%S')
obj[0][key] = value
obj = obj[0]
- #print obj
- #print len(obj)
try:
db.save(obj)
except bson.errors.InvalidDocument, error:
@@ -279,6 +277,7 @@
def store_diffs_debug(rts):
db = storage.init_database(rts)
+ db.drop_collection()
files = os.listdir(rts.diffs)
for filename in files:
fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
'utf-8')
@@ -335,20 +334,22 @@
print 'Inserting poison pill %s...' % x
input_queue.put(None)
- extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id,
- rts, format])
- for process_id in xrange(processors)]
- for extracter in extracters:
- extracter.start()
+# extracters = [Process(target=stream_raw_xml, args=[input_queue,
process_id,
+# rts, format])
+# for process_id in xrange(processors)]
+# for extracter in extracters:
+# extracter.start()
+#
+# input_queue.join()
- input_queue.join()
-
store_json_diffs(rts)
db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
+
db.add_index('title')
db.add_index('timestamp')
db.add_index('username')
db.add_index('ns')
+ db.add_index('editor')
def launcher_simple():
Modified: trunk/tools/editor_trends/etl/downloader.py
===================================================================
--- trunk/tools/editor_trends/etl/downloader.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/downloader.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -29,7 +29,7 @@
from utils import log
-def download_wiki_file(task_queue, properties):
+def download_wiki_file(task_queue, rts):
'''
This is a very simple replacement for wget and curl because Windows does
not have these tools installed by default
@@ -46,34 +46,34 @@
widgets = log.init_progressbar_widgets(filename)
extension = os.path.splitext(filename)[1]
filemode = file_utils.determine_file_mode(extension)
- filesize =
http_utils.determine_remote_filesize(properties.wp_dump_location,
-
properties.dump_relative_path,
+ filesize = http_utils.determine_remote_filesize(rts.wp_dump_location,
+ rts.dump_relative_path,
filename)
- mod_date =
http_utils.determine_modified_date(properties.wp_dump_location,
- properties.dump_relative_path,
+ mod_date = http_utils.determine_modified_date(rts.wp_dump_location,
+ rts.dump_relative_path,
filename)
- mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date,
properties.timestamp_server)
- if file_utils.check_file_exists(properties.input_location, filename):
- mod_loc = file_utils.get_modified_date(properties.input_location,
filename)
- if mod_loc == mod_date and (properties.force == False or
properties.force == None):
- print 'You already have downloaded the most recent %s%s
dumpfile.' % (properties.language.code, properties.project.name)
+ mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date,
rts.timestamp_server)
+ if file_utils.check_file_exists(rts.input_location, filename):
+ mod_loc = file_utils.get_modified_date(rts.input_location,
filename)
+ if mod_loc == mod_date and (rts.force == False or rts.force ==
None):
+ print 'You already have downloaded the most recent %s%s
dumpfile.' % (rts.language.code, rts.project.name)
continue
if filemode == 'w':
- fh = file_utils.create_txt_filehandle(properties.input_location,
+ fh = file_utils.create_txt_filehandle(rts.input_location,
filename,
filemode,
- properties.encoding)
+ rts.encoding)
else:
- fh =
file_utils.create_binary_filehandle(properties.input_location, filename, 'wb')
+ fh = file_utils.create_binary_filehandle(rts.input_location,
filename, 'wb')
if filesize != -1:
pbar = progressbar.ProgressBar(widgets=widgets,
maxval=filesize).start()
else:
pbar = progressbar.ProgressBar(widgets=widgets).start()
try:
- path = '%s%s' % (properties.dump_absolute_path, filename)
+ path = '%s%s' % (rts.dump_absolute_path, filename)
req = urllib2.Request(path)
response = urllib2.urlopen(req)
while True:
@@ -94,24 +94,24 @@
print 'Error: %s' % error
finally:
fh.close()
- file_utils.set_modified_data(mod_date, properties.input_location,
filename)
+ file_utils.set_modified_data(mod_date, rts.input_location,
filename)
-def launcher(properties, logger):
+def launcher(rts, logger):
print 'Creating list of files to be downloaded...'
- tasks = http_utils.create_list_dumpfiles(properties.wp_dump_location,
- properties.dump_relative_path,
- properties.dump_filename)
+ tasks = http_utils.create_list_dumpfiles(rts.wp_dump_location,
+ rts.dump_relative_path,
+ rts.dump_filename)
#print tasks.qsize()
- #if tasks.qsize() < properties.settings.number_of_processes:
- # properties..number_of_processes = tasks.qsize()
+ #if tasks.qsize() < rts.settings.number_of_processes:
+ # rts..number_of_processes = tasks.qsize()
if tasks.qsize() > 2:
consumers = [multiprocessing.Process(target=download_wiki_file,
- args=(tasks, properties))
- for i in xrange(properties.number_of_processes)]
+ args=(tasks, rts))
+ for i in xrange(rts.number_of_processes)]
else: consumers = [multiprocessing.Process(target=download_wiki_file,
- args=(tasks, properties))
+ args=(tasks, rts))
for i in xrange(1)]
print 'Starting consumers to download files...'
for w in consumers:
Modified: trunk/tools/editor_trends/etl/extracter.py
===================================================================
--- trunk/tools/editor_trends/etl/extracter.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/extracter.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -72,8 +72,8 @@
text = variables.extract_revision_text(revision, xml_namespace)
article.update(contributor)
- comment = variables.extract_comment_text(revision_id, revision)
- cache.comments.update(comment)
+ #comment = variables.extract_comment_text(revision_id, revision)
+ #cache.comments.update(comment)
timestamp = revision.find('%s%s' % (xml_namespace, 'timestamp')).text
article['timestamp'] = timestamp
@@ -139,7 +139,7 @@
title = variables.parse_title(elem)
article['title'] = title
current_namespace = variables.determine_namespace(title,
namespaces, include_ns)
- title_meta = variables.parse_title_meta_data(title,
current_namespace)
+ title_meta = variables.parse_title_meta_data(title,
current_namespace, namespaces)
if current_namespace < 6:
parse = True
article['namespace'] = current_namespace
@@ -172,7 +172,7 @@
Determine id of article
'''
article['article_id'] = elem.text
- if isinstance(current_namespace, int):
+ if isinstance(current_namespace, int) and title_meta != {}:
cache.articles[article['article_id']] = title_meta
id = True
elem.clear()
Modified: trunk/tools/editor_trends/etl/store.py
===================================================================
--- trunk/tools/editor_trends/etl/store.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/store.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -44,32 +44,31 @@
while True:
try:
filename = self.tasks.get(block=False)
- except Empty:
- break
+ self.tasks.task_done()
+ if filename == None:
+ self.result.put(None)
+ break
- self.tasks.task_done()
- if filename == None:
- self.result.put(None)
- break
+ fh = file_utils.create_txt_filehandle(self.rts.sorted,
filename,
+ 'r', 'utf-8')
+ for line in file_utils.read_raw_data(fh):
+ if len(line) == 1 or len(line) == 4:
+ continue
+ editor = line[0]
+ #print 'Parsing %s' % editor
+ if prev_editor != editor and prev_editor != -1:
+ editor_cache.add(prev_editor, 'NEXT')
- fh = file_utils.create_txt_filehandle(self.rts.sorted, filename,
- 'r', 'utf-8')
- for line in file_utils.read_raw_data(fh):
- if len(line) == 1 or len(line) == 4:
- continue
- editor = line[0]
- #print 'Parsing %s' % editor
- if prev_editor != editor and prev_editor != -1:
- editor_cache.add(prev_editor, 'NEXT')
+ data = prepare_data(line)
+ #print editor, data['username']
+ editor_cache.add(editor, data)
+ prev_editor = editor
+ fh.close()
+ self.result.put(True)
+ except Empty:
+ pass
- data = prepare_data(line)
- #print editor, data['username']
- editor_cache.add(editor, data)
- prev_editor = editor
- fh.close()
- self.result.put(True)
-
def prepare_data(line):
'''
Prepare a single line to store in the database, this entails converting
@@ -103,34 +102,34 @@
while True:
try:
filename = tasks.get(block=False)
+ if filename == None:
+ self.result.put(None)
+ break
+ print 'Processing %s...' % filename
+ fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r',
'utf-8')
+ for line in fh:
+ line = line.strip()
+ line = line.split('\t')
+ data = {}
+ x, y = 0, 1
+ while y < len(line):
+ key, value = line[x], line[y]
+ if key == 'ns' or key == 'id':
+ data[key] = int(value)
+ else:
+ data[key] = value
+ x += 2
+ y += 2
+ db.insert(data)
+ fh.close()
except Empty:
- continue
-
- if filename == None:
- break
- print 'Processing %s...' % filename
- fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8')
- for line in fh:
- line = line.strip()
- line = line.split('\t')
- data = {}
- x, y = 0, 1
- while y < len(line):
- key, value = line[x], line[y]
- if key == 'ns' or key == 'id':
- data[key] = int(value)
- else:
- data[key] = value
- x += 2
- y += 2
- db.insert(data)
- fh.close()
+ pass
print 'Done storing articles...'
def launcher_articles(rts):
'''
- This function reads titles.csv and stores it in a separate collection.
+ This function reads articles.csv and stores it in a separate collection.
Besides containing the title of an article, it also includes:
* namespace
* category (if any)
@@ -172,7 +171,6 @@
This is the main entry point and creates a number of workers and launches
them.
'''
- #launcher_articles(rts)
print 'Input directory is: %s ' % rts.sorted
db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
db.drop_collection()
Modified: trunk/tools/editor_trends/etl/variables.py
===================================================================
--- trunk/tools/editor_trends/etl/variables.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/variables.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -68,21 +68,22 @@
return title.text
-def parse_title_meta_data(title, namespace):
+def parse_title_meta_data(title, ns, namespaces):
'''
This function categorizes an article to assist the Wikimedia Taxonomy
project. See
http://meta.wikimedia.org/wiki/Contribution_Taxonomy_Project/Research_Questions
'''
title_meta = {}
- if not namespace:
+ if not ns:
return title_meta
-
+ namespace = '%s:' % namespaces[ns]
+ title = title.replace(namespace, '')
title_meta['title'] = title
- title_meta['ns'] = namespace
+ title_meta['ns'] = ns
if title.startswith('List of'):
title_meta['category'] = 'List'
- elif namespace == 4 or namespace == 5:
+ elif ns == 4 or ns == 5:
if title.find('Articles for deletion') > -1:
title_meta['category'] = 'Deletion'
elif title.find('Mediation Committee') > -1:
@@ -105,6 +106,7 @@
title_meta['category'] = 'Featured Topic'
elif title.find('Good Article') > -1:
title_meta['category'] = 'Good Article'
+ #print title_meta
return title_meta
Modified: trunk/tools/editor_trends/kaggle/training.py
===================================================================
--- trunk/tools/editor_trends/kaggle/training.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/kaggle/training.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -26,7 +26,7 @@
from classes import storage
-location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution'
+location = '/home/diederik/wikimedia/en/wiki/kaggle'
files = os.listdir(location)
files.reverse()
@@ -67,7 +67,7 @@
continue
id = line[2]
if id not in ids and id not in ignore_ids:
- res = db.find_one('editor', id)
+ res = db.find_one({'editor': id})
if res == None:
ignore_ids.add(id)
continue
@@ -100,7 +100,7 @@
fh = codecs.open('solutions.tsv', 'w', 'utf-8')
for id in ids:
if id not in ignore_ids:
- obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns')
+ obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
if obs != None:
x += 1
n = obs['cum_edit_count_main_ns']
Modified: trunk/tools/editor_trends/manage.py
===================================================================
--- trunk/tools/editor_trends/manage.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/manage.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -150,14 +150,8 @@
language = languages.init(language_code)
project = projects.init(project)
pjc = projects.ProjectContainer()
- #rts = runtime_settings.RunTimeSettings(project, language)
+ rts = runtime_settings.RunTimeSettings(project, language)
- file_choices = {'meta-full': 'stub-meta-history.xml.gz',
- 'meta-current': 'stub-meta-current.xml.gz',
- 'history-full': 'pages-meta-history.xml.7z',
- 'history-current': 'pages-meta-current.xml.bz2'
- }
-
#Init Argument Parser
parser = ArgumentParser(prog='manage',
formatter_class=RawTextHelpFormatter)
subparsers = parser.add_subparsers(help='sub - command help')
@@ -218,7 +212,7 @@
parser_dataset.add_argument('-c', '--charts',
action='store',
help='Should be a valid function name that
matches one of the plugin functions',
-
default=inventory.available_analyses()['new_editor_count'])
+ default='new_editor_count')
parser_dataset.add_argument('-k', '--keywords',
action='store',
@@ -256,6 +250,13 @@
help='Indicate whether the output is for Kaggle or
not',
default=False)
+
+ parser.add_argument('-t', '--collection',
+ action='store',
+ help='Name of default collection',
+ default='editors_dataset'
+ )
+
parser.add_argument('-l', '--language',
action='store',
help='Example of valid languages.',
@@ -269,28 +270,17 @@
choices=pjc.supported_projects(),
default='wiki')
- parser.add_argument('-c', '--collection',
- action='store',
- help='Name of MongoDB collection',
- default='editors_raw')
-
-
parser.add_argument('-ns', '--namespace',
action='store',
help='A list of namespaces to include for analysis.',
default='0')
- parser.add_argument('-db', '--database',
- action='store',
- help='Specify the database that you want to use. Valid
choices are mongo and cassandra.',
- default='mongo')
-
parser.add_argument('-f', '--file',
action='store',
- choices=file_choices,
+ choices=rts.file_choices,
help='Indicate which dump you want to download. Valid choices are:\n \
- %s' % ''.join([f + ',\n' for f in file_choices]),
- default=file_choices['meta-full'])
+ %s' % ''.join([f + ',\n' for f in rts.file_choices]),
+ default='meta-full')
return parser
@@ -353,6 +343,7 @@
log.to_db(rts, 'dataset', 'store', stopwatch, event='start')
log.to_csv(logger, rts, 'Start', 'Store', store_launcher)
store.launcher(rts)
+ #store.launcher_articles(rts)
stopwatch.elapsed()
log.to_db(rts, 'dataset', 'store', stopwatch, event='finish')
log.to_csv(logger, rts, 'Finish', 'Store', store_launcher)
Modified: trunk/tools/editor_trends/utils/log.py
===================================================================
--- trunk/tools/editor_trends/utils/log.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/utils/log.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -31,8 +31,8 @@
def to_db(rts, jobtype, task, timer, event='start'):
db = storage.init_database(rts.storage, rts.dbname, 'jobs')
created = datetime.datetime.now()
- job = db.find_one('hash', rts.id)
-
+ job = db.find_one({'hash': rts.id})
+ #print job
data = {'hash': rts.id,
'created': created,
'jobtype': jobtype,
@@ -50,7 +50,7 @@
data['finished'] = True
_id = db.save(data)
- job = db.find_one('_id', _id)
+ job = db.find_one({'_id': _id})
tasks = job['tasks']
t = tasks.get(task, {})
_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs