editor_trends

diederik Tue, 31 May 2011 17:02:34 -0700

http://www.mediawiki.org/wiki/Special:Code/MediaWiki/89242


Revision: 89242
Author:   diederik
Date:     2011-06-01 00:02:16 +0000 (Wed, 01 Jun 2011)
Log Message:
-----------
Preparing for Summer of Research, part 2

Modified Paths:
--------------
    trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
    trunk/tools/editor_trends/analyses/analyzer.py
    trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py
    trunk/tools/editor_trends/classes/analytics.py
    trunk/tools/editor_trends/classes/buffer.py
    trunk/tools/editor_trends/classes/dataset.py
    trunk/tools/editor_trends/classes/runtime_settings.py
    trunk/tools/editor_trends/classes/storage.py
    trunk/tools/editor_trends/etl/differ.py
    trunk/tools/editor_trends/etl/downloader.py
    trunk/tools/editor_trends/etl/extracter.py
    trunk/tools/editor_trends/etl/store.py
    trunk/tools/editor_trends/etl/variables.py
    trunk/tools/editor_trends/kaggle/training.py
    trunk/tools/editor_trends/manage.py
    trunk/tools/editor_trends/utils/log.py

Modified: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py     2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py     2011-06-01 00:02:16 UTC (rev 89242)
@@ -57,7 +57,7 @@
 
 
 def retrieve_variables(obs, username, date):
-    data = db.find_one('username', username)
+    data = db.find_one({'username': username})
     year = str(date.year)
     month = str(date.month)
     if data:

Modified: trunk/tools/editor_trends/analyses/analyzer.py
===================================================================
--- trunk/tools/editor_trends/analyses/analyzer.py      2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/analyses/analyzer.py      2011-06-01 00:02:16 UTC (rev 89242)
@@ -24,6 +24,7 @@
 import types
 import sys
 import cPickle
+import pymongo
 import gc
 import os
 import progressbar
@@ -65,7 +66,7 @@
 def feedback(plugin, rts):
     print 'Exporting data for chart: %s' % plugin
     print 'Project: %s' % rts.dbname
-    print 'Dataset: %s' % rts.editors_dataset
+    print 'Dataset: %s' % rts.collection
 
 
 def write_output(ds, rts, stopwatch):
@@ -97,7 +98,8 @@
     plugin = retrieve_plugin(func)
 
     if not plugin:
-        raise exceptions.UnknownPluginError(plugin, self.available_plugins)
+        available_plugins = inventory.available_analyses()
+        raise exceptions.UnknownPluginError(plugin, available_plugins)
         plugin = getattr(plugin, func)
 
     feedback(func, rts)
@@ -110,15 +112,16 @@
     obs = dict()
     obs_proxy = mgr.dict(obs)
 
-    db = storage.init_database(rts.storage, rts.dbname, rts.editors_dataset)
+    db = storage.init_database(rts.storage, rts.dbname, rts.collection)
     editors = db.retrieve_distinct_keys('editor')
     #editors = editors[:500]
-    min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
+    if rts.collection.find('editors_dataset') > -1:
+        min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
+        kwargs['min_year'] = min_year
+        kwargs['max_year'] = max_year
 
     fmt = kwargs.pop('format', 'long')
     time_unit = kwargs.pop('time_unit', 'year')
-    kwargs['min_year'] = min_year
-    kwargs['max_year'] = max_year
 
 
     var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)
@@ -153,24 +156,21 @@
 
 
     ppills = cpu_count()
-    while True:
-        while ppills > 0:
-            try:
-                res = result.get()
-                if res == True:
-                    pbar.update(pbar.currval + 1)
-                else:
-                    ppills -= 1
-                    var = res
-                    print ppills
-            except Empty:
-                pass
-        break
-    print 'Waiting for tasks...'
+    while ppills > 0:
+        try:
+            res = result.get()
+            if res == True:
+                pbar.update(pbar.currval + 1)
+            else:
+                ppills -= 1
+                var = res
+        except Empty:
+            pass
+
     tasks.join()
 
     var = reconstruct_observations(var)
-    ds = dataset.Dataset(plugin.func_name, rts, format=fmt, **kwargs)
+    ds = dataset.Dataset(func, rts, format=fmt, **kwargs)
     ds.add_variable(var)
 
     stopwatch.elapsed()
@@ -178,8 +178,8 @@
 
     ds.summary()
 
-    for n, c in get_refcounts()[:100]:
-        print '%10d %s' % (n, c.__name__)
+    #for n, c in get_refcounts()[:100]:
+    #    print '%10d %s' % (n, c.__name__)
 
 
 def get_refcounts():
@@ -205,9 +205,12 @@
     Determine the first and final year for the observed data
     '''
     try:
-        obs = db.find(var, qualifier='max')
+        conditions = {var : {'$ne' : False}}
+
+        obs = db.find(conditions).sort(var, pymongo.ASCENDING).limit(1)[0]
         max_year = obs[var].year + 1
-        obs = db.find(var, qualifier='min')
+
+        obs = db.find(conditions).sort(var, pymongo.DESCENDING).limit(1)[0]
         min_year = obs[var].year
     except KeyError:
         min_year = 2001

Modified: trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py
===================================================================
--- trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py  2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/analyses/plugins/taxonomy_list_makers.py  2011-06-01 00:02:16 UTC (rev 89242)
@@ -71,4 +71,4 @@
     cursor = db.find('category', 'List')
     for c in cursor:
         data[c['id']] = 1
-    return data
+    return data, rts

Modified: trunk/tools/editor_trends/classes/analytics.py
===================================================================
--- trunk/tools/editor_trends/classes/analytics.py      2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/analytics.py      2011-06-01 00:02:16 UTC (rev 89242)
@@ -64,10 +64,8 @@
 
     def __call__(self):
         project = 'wiki'
-        #rts = runtime_settings.init_environment('wiki', 'en', args)
         for lang in self.languages:
             self.rts = runtime_settings.init_environment(project, lang, 
self.args)
-            #self.rts.editors_dataset = 'editors_dataset'
 
             self.rts.dbname = '%s%s' % (lang, project)
             for cum_cutoff in self.cum_cutoff:
@@ -91,15 +89,16 @@
         Generic loop function that loops over all the editors of a Wikipedia 
         project and then calls the plugin that does the actual mapping.
         '''
-        db = storage.init_database(self.rts.storage, self.rts.dbname, 
self.rts.editors_dataset)
+        db = storage.init_database(self.rts.storage, self.rts.dbname, 
self.rts.collection)
         while True:
             try:
                 editor_id = self.tasks.get(block=False)
+                self.tasks.task_done()
                 if editor_id == None:
                     self.result.put(self.var)
                     break
-                editor = db.find_one('editor', editor_id)
-                self.plugin(self.var, editor, dbname=self.rts.dbname, 
data=self.data)
+                editor = db.find_one({'editor': editor_id})
+                self.plugin(self.var, editor, rts=self.rts, data=self.data)
                 self.result.put(True)
             except Empty:
                 pass

Modified: trunk/tools/editor_trends/classes/buffer.py
===================================================================
--- trunk/tools/editor_trends/classes/buffer.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/buffer.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -124,7 +124,9 @@
     def simplify(self, revision):
         row = []
         for key in self.keys:
-            row.append(revision[key].decode('utf-8'))
+            value = revision.get(key, None)
+            if value != None:
+                row.append(value.decode('utf-8'))
         return row
 
     def stringify(self, revision):

Modified: trunk/tools/editor_trends/classes/dataset.py
===================================================================
--- trunk/tools/editor_trends/classes/dataset.py        2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/dataset.py        2011-06-01 00:02:16 UTC (rev 89242)
@@ -176,6 +176,7 @@
         #self.date = date
         self.data = 0
         self.time_unit = time_unit
+        self.date = date
         self.t1, self.t0 = self.set_date_range(date)
         self.id = id
         self.props = []
@@ -515,7 +516,7 @@
             variable.max = get_max(data)
             variable.num_obs = variable.number_of_obs()
             variable.num_dates = len(variable)
-            #variable.first_obs, variable.last_obs = variable.get_date_range()
+            variable.first_obs, variable.last_obs = variable.get_date_range()
 
     def summary(self):
         '''

Modified: trunk/tools/editor_trends/classes/runtime_settings.py
===================================================================
--- trunk/tools/editor_trends/classes/runtime_settings.py       2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/runtime_settings.py       2011-06-01 00:02:16 UTC (rev 89242)
@@ -50,53 +50,57 @@
         self.project = project
         self.language = language
         self.dbname = 'wikilytics'
+        self.file_choices = {'meta-full': 'stub-meta-history.xml.gz',
+                             'meta-current': 'stub-meta-current.xml.gz',
+                             'history-full': 'pages-meta-history.xml.7z',
+                             'history-current': 'pages-meta-current.xml.bz2'
+                             }
+        if args:
+            self.args = args
+            self.id = '%s%s_%s' % (self.language.code, self.project.name, 
'current_month')
+            #print self.settings.input_location
+            #print self.get_value('location')
+            self.project = self.update_project_settings()
+            self.language = self.update_language_settings()
 
-        #if args:
-        self.args = args
-        self.id = '%s%s_%s' % (self.language.code, self.project.name, 
'current_month')
-        #print self.settings.input_location
-        #print self.get_value('location')
-        self.project = self.update_project_settings()
-        self.language = self.update_language_settings()
+            self.input_location = self.set_input_location()
+            self.output_location = self.set_output_location()
 
-        self.input_location = self.set_input_location()
-        self.output_location = self.set_output_location()
+            self.plugins = self.set_plugin()
+            self.keywords = self.split_keywords()
+            self.namespaces = self.get_namespaces()
 
-        self.plugins = self.set_plugin()
-        self.keywords = self.split_keywords()
-        self.namespaces = self.get_namespaces()
+            #self.kaggle = self.get_value('kaggle')
+            self.function = self.get_value('func')
+            self.ignore = self.get_value('except')
+            self.force = self.get_value('force')
+            self.analyzer_collection = self.get_value('collection')
 
-        #self.kaggle = self.get_value('kaggle')
-        self.function = self.get_value('func')
-        self.ignore = self.get_value('except')
-        self.force = self.get_value('force')
-        self.analyzer_collection = self.get_value('collection')
+            self.dataset = os.path.join(self.dataset_location, 
self.project.name)
+            self.txt = os.path.join(self.output_location, 'txt')
+            self.sorted = os.path.join(self.output_location, 'sorted')
+            self.diffs = os.path.join(self.output_location, 'diffs')
 
-        self.dataset = os.path.join(self.dataset_location, self.project.name)
-        self.txt = os.path.join(self.output_location, 'txt')
-        self.sorted = os.path.join(self.output_location, 'sorted')
-        self.diffs = os.path.join(self.output_location, 'diffs')
+            self.directories = [self.output_location,
+                                self.txt,
+                                self.sorted,
+                                self.dataset,
+                                self.diffs]
+            self.verify_environment(self.directories)
 
-        self.directories = [self.output_location,
-                            self.txt,
-                            self.sorted,
-                            self.dataset,
-                            self.diffs]
-        self.verify_environment(self.directories)
+            #Wikidump file related variables
+            self.dump_filename = self.generate_wikidump_filename()
+            self.dump_relative_path = self.set_dump_path()
+            self.dump_absolute_path = self.set_dump_path(absolute=True)
 
-        #Wikidump file related variables
-        self.dump_filename = self.generate_wikidump_filename()
-        self.dump_relative_path = self.set_dump_path()
-        self.dump_absolute_path = self.set_dump_path(absolute=True)
+            #Collection names
+            self.editors_raw = '%s%s_editors_raw' % (self.language.code, 
self.project.name)
+            self.editors_dataset = '%s%s_editors_dataset' % 
(self.language.code, self.project.name)
+            self.articles_raw = '%s%s_articles_raw' % (self.language.code, 
self.project.name)
+            self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, 
self.project.name)
+            self.collection = self.set_collection()
 
-        #Collection names
-        self.editors_raw = '%s%s_editors_raw' % (self.language.code, 
self.project.name)
-        self.editors_dataset = '%s%s_editors_dataset' % (self.language.code, 
self.project.name)
-        self.articles_raw = '%s%s_articles_raw' % (self.language.code, 
self.project.name)
-        self.diffs_dataset = '%s%s_diffs_dataset' % (self.language.code, 
self.project.name)
 
-
-
     def __str__(self):
         return 'Runtime Settings for project %s %s' % (self.language.name,
                                                       self.project.full_name)
@@ -105,14 +109,8 @@
         for item in self.__dict__:
             yield item
 
-    def dict(self):
-        '''
-        Return a dictionary with all properties and their values
-        '''
-        props = {}
-        for prop in self:
-            props[prop] = getattr(self, prop)
-        return props
+    def set_collection(self):
+        return getattr(self, self.get_value('collection'))
 
     def split_keywords(self):
         '''
@@ -141,7 +139,7 @@
         '''
         plugin = self.get_value('charts')
         requested_plugins = []
-        if plugin != None and isinstance(plugin, type('module')) == False:
+        if plugin != None:
             plugins = plugin.split(',')
             available_plugins = inventory.available_analyses()
             for plugin in plugins:
@@ -220,8 +218,9 @@
         '''
         Generate the main name of the wikidump file to be downloaded.
         '''
+        choice = self.get_value('file')
         return '%s%s-latest-%s' % (self.language.code, self.project.name,
-                                   self.get_value('file'))
+                                   self.file_choices[choice])
 
     def update_language_settings(self):
         '''

Modified: trunk/tools/editor_trends/classes/storage.py
===================================================================
--- trunk/tools/editor_trends/classes/storage.py        2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/classes/storage.py        2011-06-01 00:02:16 UTC (rev 89242)
@@ -160,29 +160,22 @@
         assert isinstance(data, dict), 'You need to feed me dictionaries.'
         self.db[self.collection].update({key: value}, {'$set': data})
 
-    def find(self, key=None, qualifier=None):
-        if qualifier == 'min':
-            return self.db[self.collection].find({
-                key : {'$ne' : False}}).sort(key, 
pymongo.ASCENDING).limit(1)[0]
-        elif qualifier == 'max':
-            return self.db[self.collection].find({
-                key : {'$ne' : False}}).sort(key, 
pymongo.DESCENDING).limit(1)[0]
-        elif qualifier:
-            return self.db[self.collection].find({key : qualifier})
-        elif key != None:
-            return self.db[self.collection].find({}, fields=[key])
+    def find(self, conditions, vars=None):
+        if conditions:
+            return self.db[self.collection].find(conditions, fields=vars)
         else:
             return self.db[self.collection].find()
 
-    def find_one(self, key, value, vars=None):
+    def find_one(self, conditions, vars=None):
         if vars:
             #if you only want to retrieve a specific variable(s) then you need 
to
             #specify vars, if vars is None then you will get the entire BSON 
object
             vars = vars.split(',')
             vars = dict([(var, 1) for var in vars])
-            return self.db[self.collection].find_one({key: value}, vars)
+            return self.db[self.collection].find_one(conditions, vars)
         else:
-            return self.db[self.collection].find_one({key: value})
+            #conditions should be a dictionary
+            return self.db[self.collection].find_one(conditions)
 
 
     def drop_collection(self):

Modified: trunk/tools/editor_trends/etl/differ.py
===================================================================
--- trunk/tools/editor_trends/etl/differ.py     2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/differ.py     2011-06-01 00:02:16 UTC (rev 89242)
@@ -213,7 +213,7 @@
 
 def store_json_diffs(rts):
     files = os.listdir(rts.diffs)
-    print files, rts.diffs
+    #print files, rts.diffs
     db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
     buffer = cStringIO.StringIO()
 
@@ -226,12 +226,10 @@
                     obj = json.loads(obj)
                     obj[0]['article_id'] = int(obj[0]['article_id'])
                     for key, value in obj[0].iteritems():
-                        if type(value) == type(dict()):
-                            value['timestamp'] = 
datetime.strptime(value['timestamp'], '%Y-%m-%dT%H:%M:%S')
+                        if key == 'timestamp':
+                            value = datetime.strptime(value, 
'%Y-%m-%dT%H:%M:%S')
                         obj[0][key] = value
                     obj = obj[0]
-                    #print obj
-                    #print len(obj)
                     try:
                         db.save(obj)
                     except bson.errors.InvalidDocument, error:
@@ -279,6 +277,7 @@
 
 def store_diffs_debug(rts):
     db = storage.init_database(rts)
+    db.drop_collection()
     files = os.listdir(rts.diffs)
     for filename in files:
         fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r', 
'utf-8')
@@ -335,20 +334,22 @@
         print 'Inserting poison pill %s...' % x
         input_queue.put(None)
 
-    extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id,
-                                                       rts, format])
-                  for process_id in xrange(processors)]
-    for extracter in extracters:
-        extracter.start()
+#    extracters = [Process(target=stream_raw_xml, args=[input_queue, 
process_id,
+#                                                       rts, format])
+#                  for process_id in xrange(processors)]
+#    for extracter in extracters:
+#        extracter.start()
+#
+#    input_queue.join()
 
-    input_queue.join()
-
     store_json_diffs(rts)
     db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
+
     db.add_index('title')
     db.add_index('timestamp')
     db.add_index('username')
     db.add_index('ns')
+    db.add_index('editor')
 
 
 def launcher_simple():

Modified: trunk/tools/editor_trends/etl/downloader.py
===================================================================
--- trunk/tools/editor_trends/etl/downloader.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/downloader.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -29,7 +29,7 @@
 from utils import log
 
 
-def download_wiki_file(task_queue, properties):
+def download_wiki_file(task_queue, rts):
     '''
     This is a very simple replacement for wget and curl because Windows does
     not have these tools installed by default
@@ -46,34 +46,34 @@
         widgets = log.init_progressbar_widgets(filename)
         extension = os.path.splitext(filename)[1]
         filemode = file_utils.determine_file_mode(extension)
-        filesize = 
http_utils.determine_remote_filesize(properties.wp_dump_location,
-                                                        
properties.dump_relative_path,
+        filesize = http_utils.determine_remote_filesize(rts.wp_dump_location,
+                                                        rts.dump_relative_path,
                                                         filename)
 
-        mod_date = 
http_utils.determine_modified_date(properties.wp_dump_location,
-                                                properties.dump_relative_path,
+        mod_date = http_utils.determine_modified_date(rts.wp_dump_location,
+                                                rts.dump_relative_path,
                                                 filename)
-        mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, 
properties.timestamp_server)
-        if file_utils.check_file_exists(properties.input_location, filename):
-            mod_loc = file_utils.get_modified_date(properties.input_location, 
filename)
-            if mod_loc == mod_date and (properties.force == False or 
properties.force == None):
-                print 'You already have downloaded the most recent %s%s 
dumpfile.' % (properties.language.code, properties.project.name)
+        mod_date = text_utils.convert_timestamp_to_datetime_naive(mod_date, 
rts.timestamp_server)
+        if file_utils.check_file_exists(rts.input_location, filename):
+            mod_loc = file_utils.get_modified_date(rts.input_location, 
filename)
+            if mod_loc == mod_date and (rts.force == False or rts.force == 
None):
+                print 'You already have downloaded the most recent %s%s 
dumpfile.' % (rts.language.code, rts.project.name)
                 continue
 
         if filemode == 'w':
-            fh = file_utils.create_txt_filehandle(properties.input_location,
+            fh = file_utils.create_txt_filehandle(rts.input_location,
                                                   filename,
                                                   filemode,
-                                                  properties.encoding)
+                                                  rts.encoding)
         else:
-            fh = 
file_utils.create_binary_filehandle(properties.input_location, filename, 'wb')
+            fh = file_utils.create_binary_filehandle(rts.input_location, 
filename, 'wb')
 
         if filesize != -1:
             pbar = progressbar.ProgressBar(widgets=widgets, 
maxval=filesize).start()
         else:
             pbar = progressbar.ProgressBar(widgets=widgets).start()
         try:
-            path = '%s%s' % (properties.dump_absolute_path, filename)
+            path = '%s%s' % (rts.dump_absolute_path, filename)
             req = urllib2.Request(path)
             response = urllib2.urlopen(req)
             while True:
@@ -94,24 +94,24 @@
             print 'Error: %s' % error
         finally:
             fh.close()
-            file_utils.set_modified_data(mod_date, properties.input_location, 
filename)
+            file_utils.set_modified_data(mod_date, rts.input_location, 
filename)
 
 
 
-def launcher(properties, logger):
+def launcher(rts, logger):
     print 'Creating list of files to be downloaded...'
-    tasks = http_utils.create_list_dumpfiles(properties.wp_dump_location,
-                                  properties.dump_relative_path,
-                                  properties.dump_filename)
+    tasks = http_utils.create_list_dumpfiles(rts.wp_dump_location,
+                                  rts.dump_relative_path,
+                                  rts.dump_filename)
     #print tasks.qsize()
-    #if tasks.qsize() < properties.settings.number_of_processes:
-    #    properties..number_of_processes = tasks.qsize()
+    #if tasks.qsize() < rts.settings.number_of_processes:
+    #    rts..number_of_processes = tasks.qsize()
     if tasks.qsize() > 2:
         consumers = [multiprocessing.Process(target=download_wiki_file,
-                    args=(tasks, properties))
-                    for i in xrange(properties.number_of_processes)]
+                    args=(tasks, rts))
+                    for i in xrange(rts.number_of_processes)]
     else: consumers = [multiprocessing.Process(target=download_wiki_file,
-                    args=(tasks, properties))
+                    args=(tasks, rts))
                     for i in xrange(1)]
     print 'Starting consumers to download files...'
     for w in consumers:

Modified: trunk/tools/editor_trends/etl/extracter.py
===================================================================
--- trunk/tools/editor_trends/etl/extracter.py  2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/extracter.py  2011-06-01 00:02:16 UTC (rev 89242)
@@ -72,8 +72,8 @@
     text = variables.extract_revision_text(revision, xml_namespace)
     article.update(contributor)
 
-    comment = variables.extract_comment_text(revision_id, revision)
-    cache.comments.update(comment)
+    #comment = variables.extract_comment_text(revision_id, revision)
+    #cache.comments.update(comment)
 
     timestamp = revision.find('%s%s' % (xml_namespace, 'timestamp')).text
     article['timestamp'] = timestamp
@@ -139,7 +139,7 @@
                 title = variables.parse_title(elem)
                 article['title'] = title
                 current_namespace = variables.determine_namespace(title, 
namespaces, include_ns)
-                title_meta = variables.parse_title_meta_data(title, 
current_namespace)
+                title_meta = variables.parse_title_meta_data(title, 
current_namespace, namespaces)
                 if current_namespace < 6:
                     parse = True
                     article['namespace'] = current_namespace
@@ -172,7 +172,7 @@
                 Determine id of article
                 '''
                 article['article_id'] = elem.text
-                if isinstance(current_namespace, int):
+                if isinstance(current_namespace, int) and title_meta != {}:
                     cache.articles[article['article_id']] = title_meta
                 id = True
                 elem.clear()

Modified: trunk/tools/editor_trends/etl/store.py
===================================================================
--- trunk/tools/editor_trends/etl/store.py      2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/store.py      2011-06-01 00:02:16 UTC (rev 89242)
@@ -44,32 +44,31 @@
         while True:
             try:
                 filename = self.tasks.get(block=False)
-            except Empty:
-                break
+                self.tasks.task_done()
+                if filename == None:
+                    self.result.put(None)
+                    break
 
-            self.tasks.task_done()
-            if filename == None:
-                self.result.put(None)
-                break
+                fh = file_utils.create_txt_filehandle(self.rts.sorted, 
filename,
+                                                      'r', 'utf-8')
+                for line in file_utils.read_raw_data(fh):
+                    if len(line) == 1 or len(line) == 4:
+                        continue
+                    editor = line[0]
+                    #print 'Parsing %s' % editor
+                    if prev_editor != editor and prev_editor != -1:
+                        editor_cache.add(prev_editor, 'NEXT')
 
-            fh = file_utils.create_txt_filehandle(self.rts.sorted, filename,
-                                                  'r', 'utf-8')
-            for line in file_utils.read_raw_data(fh):
-                if len(line) == 1 or len(line) == 4:
-                    continue
-                editor = line[0]
-                #print 'Parsing %s' % editor
-                if prev_editor != editor and prev_editor != -1:
-                    editor_cache.add(prev_editor, 'NEXT')
+                    data = prepare_data(line)
+                    #print editor, data['username']
+                    editor_cache.add(editor, data)
+                    prev_editor = editor
+                fh.close()
+                self.result.put(True)
+            except Empty:
+                pass
 
-                data = prepare_data(line)
-                #print editor, data['username']
-                editor_cache.add(editor, data)
-                prev_editor = editor
-            fh.close()
-            self.result.put(True)
 
-
 def prepare_data(line):
     '''
     Prepare a single line to store in the database, this entails converting
@@ -103,34 +102,34 @@
     while True:
         try:
             filename = tasks.get(block=False)
+            if filename == None:
+                self.result.put(None)
+                break
+            print 'Processing %s...' % filename
+            fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 
'utf-8')
+            for line in fh:
+                line = line.strip()
+                line = line.split('\t')
+                data = {}
+                x, y = 0, 1
+                while y < len(line):
+                    key, value = line[x], line[y]
+                    if key == 'ns' or key == 'id':
+                        data[key] = int(value)
+                    else:
+                        data[key] = value
+                    x += 2
+                    y += 2
+                db.insert(data)
+            fh.close()
         except Empty:
-            continue
-
-        if filename == None:
-            break
-        print 'Processing %s...' % filename
-        fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r', 'utf-8')
-        for line in fh:
-            line = line.strip()
-            line = line.split('\t')
-            data = {}
-            x, y = 0, 1
-            while y < len(line):
-                key, value = line[x], line[y]
-                if key == 'ns' or key == 'id':
-                    data[key] = int(value)
-                else:
-                    data[key] = value
-                x += 2
-                y += 2
-            db.insert(data)
-        fh.close()
+            pass
     print 'Done storing articles...'
 
 
 def launcher_articles(rts):
     '''
-    This function reads titles.csv and stores it in a separate collection.
+    This function reads articles.csv and stores it in a separate collection.
     Besides containing the title of an article, it also includes:
     * namespace
     * category (if any)
@@ -172,7 +171,6 @@
     This is the main entry point and creates a number of workers and launches
     them. 
     '''
-    #launcher_articles(rts)
     print 'Input directory is: %s ' % rts.sorted
     db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
     db.drop_collection()

Modified: trunk/tools/editor_trends/etl/variables.py
===================================================================
--- trunk/tools/editor_trends/etl/variables.py  2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/etl/variables.py  2011-06-01 00:02:16 UTC (rev 89242)
@@ -68,21 +68,22 @@
     return title.text
 
 
-def parse_title_meta_data(title, namespace):
+def parse_title_meta_data(title, ns, namespaces):
     '''
     This function categorizes an article to assist the Wikimedia Taxonomy
     project. See 
     http://meta.wikimedia.org/wiki/Contribution_Taxonomy_Project/Research_Questions
     '''
     title_meta = {}
-    if not namespace:
+    if not ns:
         return title_meta
-
+    namespace = '%s:' % namespaces[ns]
+    title = title.replace(namespace, '')
     title_meta['title'] = title
-    title_meta['ns'] = namespace
+    title_meta['ns'] = ns
     if title.startswith('List of'):
         title_meta['category'] = 'List'
-    elif namespace == 4 or namespace == 5:
+    elif ns == 4 or ns == 5:
         if title.find('Articles for deletion') > -1:
             title_meta['category'] = 'Deletion'
         elif title.find('Mediation Committee') > -1:
@@ -105,6 +106,7 @@
             title_meta['category'] = 'Featured Topic'
         elif title.find('Good Article') > -1:
             title_meta['category'] = 'Good Article'
+    #print title_meta
     return title_meta
 
 

Modified: trunk/tools/editor_trends/kaggle/training.py
===================================================================
--- trunk/tools/editor_trends/kaggle/training.py        2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/kaggle/training.py        2011-06-01 00:02:16 UTC (rev 89242)
@@ -26,7 +26,7 @@
 
 from classes import storage
 
-location = '/home/diederik/wikimedia/en/wiki/kaggle_prediction_solution'
+location = '/home/diederik/wikimedia/en/wiki/kaggle'
 files = os.listdir(location)
 files.reverse()
 
@@ -67,7 +67,7 @@
                 continue
             id = line[2]
             if id not in ids and id not in ignore_ids:
-                res = db.find_one('editor', id)
+                res = db.find_one({'editor': id})
                 if res == None:
                     ignore_ids.add(id)
                     continue
@@ -100,7 +100,7 @@
 fh = codecs.open('solutions.tsv', 'w', 'utf-8')
 for id in ids:
     if id not in ignore_ids:
-        obs = db.find_one('editor', str(id), 'cum_edit_count_main_ns')
+        obs = db.find_one({'editor': str(id)}, 'cum_edit_count_main_ns')
         if obs != None:
             x += 1
             n = obs['cum_edit_count_main_ns']

Modified: trunk/tools/editor_trends/manage.py
===================================================================
--- trunk/tools/editor_trends/manage.py 2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/manage.py 2011-06-01 00:02:16 UTC (rev 89242)
@@ -150,14 +150,8 @@
     language = languages.init(language_code)
     project = projects.init(project)
     pjc = projects.ProjectContainer()
-    #rts = runtime_settings.RunTimeSettings(project, language)
+    rts = runtime_settings.RunTimeSettings(project, language)
 
-    file_choices = {'meta-full': 'stub-meta-history.xml.gz',
-                    'meta-current': 'stub-meta-current.xml.gz',
-                    'history-full': 'pages-meta-history.xml.7z',
-                    'history-current': 'pages-meta-current.xml.bz2'
-                    }
-
     #Init Argument Parser
     parser = ArgumentParser(prog='manage', formatter_class=RawTextHelpFormatter)
     subparsers = parser.add_subparsers(help='sub - command help')
@@ -218,7 +212,7 @@
     parser_dataset.add_argument('-c', '--charts',
                                 action='store',
                                 help='Should be a valid function name that matches one of the plugin functions',
-                                default=inventory.available_analyses()['new_editor_count'])
+                                default='new_editor_count')
 
     parser_dataset.add_argument('-k', '--keywords',
                                 action='store',
@@ -256,6 +250,13 @@
                         help='Indicate whether the output is for Kaggle or not',
                         default=False)
 
+
+    parser.add_argument('-t', '--collection',
+        action='store',
+        help='Name of default collection',
+        default='editors_dataset'
+        )
+
     parser.add_argument('-l', '--language',
         action='store',
         help='Example of valid languages.',
@@ -269,28 +270,17 @@
         choices=pjc.supported_projects(),
         default='wiki')
 
-    parser.add_argument('-c', '--collection',
-        action='store',
-        help='Name of MongoDB collection',
-        default='editors_raw')
-
-
     parser.add_argument('-ns', '--namespace',
         action='store',
         help='A list of namespaces to include for analysis.',
         default='0')
 
-    parser.add_argument('-db', '--database',
-                        action='store',
-                        help='Specify the database that you want to use. Valid choices are mongo and cassandra.',
-                        default='mongo')
-
     parser.add_argument('-f', '--file',
         action='store',
-        choices=file_choices,
+        choices=rts.file_choices,
         help='Indicate which dump you want to download. Valid choices are:\n \
-            %s' % ''.join([f + ',\n' for f in file_choices]),
-        default=file_choices['meta-full'])
+            %s' % ''.join([f + ',\n' for f in rts.file_choices]),
+        default='meta-full')
 
     return parser
 
@@ -353,6 +343,7 @@
     log.to_db(rts, 'dataset', 'store', stopwatch, event='start')
     log.to_csv(logger, rts, 'Start', 'Store', store_launcher)
     store.launcher(rts)
+    #store.launcher_articles(rts)
     stopwatch.elapsed()
     log.to_db(rts, 'dataset', 'store', stopwatch, event='finish')
     log.to_csv(logger, rts, 'Finish', 'Store', store_launcher)

Modified: trunk/tools/editor_trends/utils/log.py
===================================================================
--- trunk/tools/editor_trends/utils/log.py      2011-05-31 23:50:11 UTC (rev 89241)
+++ trunk/tools/editor_trends/utils/log.py      2011-06-01 00:02:16 UTC (rev 89242)
@@ -31,8 +31,8 @@
 def to_db(rts, jobtype, task, timer, event='start'):
     db = storage.init_database(rts.storage, rts.dbname, 'jobs')
     created = datetime.datetime.now()
-    job = db.find_one('hash', rts.id)
-
+    job = db.find_one({'hash': rts.id})
+    #print job
     data = {'hash': rts.id,
           'created': created,
           'jobtype': jobtype,
@@ -50,7 +50,7 @@
             data['finished'] = True
             _id = db.save(data)
 
-        job = db.find_one('_id', _id)
+        job = db.find_one({'_id': _id})
 
     tasks = job['tasks']
     t = tasks.get(task, {})


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

[MediaWiki-CVS] SVN: [89242] trunk/tools/editor_trends

Reply via email to