http://www.mediawiki.org/wiki/Special:Code/MediaWiki/84695

Revision: 84695
Author:   diederik
Date:     2011-03-24 19:58:20 +0000 (Thu, 24 Mar 2011)
Log Message:
-----------
Added counting of the number of edits for the prediction dataset.

Modified Paths:
--------------
    trunk/tools/editor_trends/etl/enricher.py

Modified: trunk/tools/editor_trends/etl/enricher.py
===================================================================
--- trunk/tools/editor_trends/etl/enricher.py   2011-03-24 19:26:32 UTC (rev 
84694)
+++ trunk/tools/editor_trends/etl/enricher.py   2011-03-24 19:58:20 UTC (rev 
84695)
@@ -34,6 +34,8 @@
 if '..' not in sys.path:
     sys.path.append('..')
 
+from utils import file_utils
+
 try:
     from database import cassandra
     import pycassa
@@ -265,7 +267,11 @@
         if ns == {}:
             for namespace in NAMESPACE.values():
                 if title.startswith(namespace):
-                    ns = False #article does not belong to either the main 
namespace, user, talk or user talk namespace.
+                    '''
+                    article does not belong to either the main namespace, 
user, 
+                    talk or user talk namespace.
+                    '''
+                    ns = False
                     return ns
             ns['namespace'] = 0
     else:
@@ -283,7 +289,7 @@
 
 def is_revision_reverted(hash_cur, hashes):
     revert = {}
-    if hash_cur in hashes:
+    if hash_cur in hashes and hash_cur != -1:
         revert['revert'] = 1
     else:
         revert['revert'] = 0
@@ -296,12 +302,34 @@
     if text != None and text.text != None:
         comment[revision_id] = text.text.encode('utf-8')
     return comment
-    
 
+
+
+def count_edits(article, counts, bots):
+    title = article['title'].text
+    namespace = determine_namespace(article['title'])
+
+    if namespace != False:
+        article_id = article['id'].text
+        revisions = article['revisions']
+        for revision in revisions:
+            if revision == None:
+                #the entire revision is empty, weird. 
+                continue
+            contributor = revision.find('contributor')
+            contributor = parse_contributor(contributor, bots)
+            if not contributor:
+                #editor is anonymous, ignore
+                continue
+            counts.setdefault(contributor['username'], 0)
+            counts[contributor['username']] += 1
+    return counts
+
+
 def create_variables(article, cache, bots):
     title = article['title'].text
     namespace = determine_namespace(article['title'])
-    
+
     if namespace != False:
         cache.stats.count_articles += 1
         article_id = article['id'].text
@@ -324,7 +352,7 @@
             if revision_id == None:
                 #revision_id is missing, which is weird
                 continue
-            
+
             row = prefill_row(title, article_id, namespace)
             row['revision_id'] = revision_id
             text = extract_revision_text(revision)
@@ -332,7 +360,7 @@
 
             comment = extract_comment_text(revision_id, revision)
             cache.comments.update(comment)
-            
+
             timestamp = revision.find('timestamp').text
             row['timestamp'] = timestamp
 
@@ -366,12 +394,17 @@
     return article
 
 
-def stream_raw_xml(input_queue, storage, id):
+def stream_raw_xml(input_queue, storage, id, dataset='training'):
     buffer = cStringIO.StringIO()
     parsing = False
+    i = 0
     bots = detector.retrieve_bots('en')
-    cache = Buffer(storage, id)
-    i = 0
+
+    if dataset == 'training':
+        cache = Buffer(storage, id)
+    else:
+        counts = {}
+
     while True:
         filename = input_queue.get()
         input_queue.task_done()
@@ -379,38 +412,32 @@
             break
 
         for data in unzip(filename):
-            if data.startswith('<page>'):
+            if data.find('<page>') > -1:
                 parsing = True
             if parsing:
                 buffer.write(data)
-                buffer.write('\n')
-                if data == '</page>':
+                if data.find('</page>') > -1:
                     i += 1
                     buffer.seek(0)
                     article = parse_xml(buffer)
-                    create_variables(article, cache, bots)
+                    if dataset == 'training':
+                        function(article, cache, bots)
+                    else:
+                        counts = function(article, counts, bots)
                     buffer = cStringIO.StringIO()
 
                     if i % 10000 == 0:
                         print 'Worker %s parsed %s articles' % (id, i)
 
-               
-    cache.empty()
-    print 'Finished parsing bz2 archives'
-    cache.stats.summary()
+    if dataset == 'training':
+        cache.empty()
+        print 'Finished parsing bz2 archives'
+        cache.stats.summary()
+    else:
+        location = os.getcwd()
+        file_utils.store_object(counts, location, 'counts.bin')
 
 
-def debug():
-    input_queue = JoinableQueue()
-    result_queue = JoinableQueue()
-    files = 
['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
-
-    for file in files:
-        input_queue.put(file)
-
-    stream_raw_xml(input_queue, result_queue)
-
-
 def unzip(filename):
     '''
     Filename should be a fully qualified path to the bz2 file that will be 
@@ -419,26 +446,26 @@
     '''
     fh = bz2.BZ2File(filename, 'r')
     for line in fh:
-        line = line.strip()
         yield line
     fh.close()
     print 'Reached end of BZ2 file.'
 
+
 def setup(storage):
     keyspace_name = 'enwiki'
     if storage == 'cassandra':
         cassandra.install_schema(keyspace_name, drop_first=True)
 
 
-def launcher():
+def launcher(function, path):
     storage = 'csv'
     setup(storage)
     input_queue = JoinableQueue()
     #files = 
['C:\\Users\\diederik.vanliere\\Downloads\\enwiki-latest-pages-articles1.xml.bz2']
     #files = 
['/home/diederik/kaggle/enwiki-20100904-pages-meta-history2.xml.bz2']
-    path = '/media/wikipedia_dumps/batch1/'
-    files = file_utils.retrieve_file_list(path, 'bz2', mask=None)
 
+    files = file_utils.retrieve_file_list(path, 'bz2')
+
     for file in files:
         filename = os.path.join(path, file)
         print filename
@@ -447,20 +474,19 @@
     for x in xrange(cpu_count()):
         input_queue.put(None)
 
-    extracters = [Process(target=stream_raw_xml, args=[input_queue, storage, 
x])
+    extracters = [Process(target=stream_raw_xml, args=[input_queue, function, 
storage, x])
                   for x in xrange(cpu_count())]
     for extracter in extracters:
         extracter.start()
 
-    #creators = [Process(target=create_variables, args=[result_queue, storage, 
x])
-    #            for x in xrange(cpu_count())]
-    #for creator in creators:
-    #    creator.start()
-
-
     input_queue.join()
 
 
 if __name__ == '__main__':
-    #debug()
-    launcher()
+    path1 = '/media/wikipedia_dumps/batch1/'
+    path2 = '/media/wikipedia_dumps/batch2/'
+    function1 = create_variables
+    function2 = count_edits
+
+    launcher(function1, path1) # launcher for creating training data
+    launcher(function2, path2) # launcher for creating test data


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to