http://www.mediawiki.org/wiki/Special:Code/MediaWiki/84672
Revision: 84672
Author: diederik
Date: 2011-03-24 12:56:10 +0000 (Thu, 24 Mar 2011)
Log Message:
-----------
A bit more detailed progress logging.
Modified Paths:
--------------
trunk/tools/editor_trends/etl/enricher.py
Modified: trunk/tools/editor_trends/etl/enricher.py
===================================================================
--- trunk/tools/editor_trends/etl/enricher.py 2011-03-24 11:11:15 UTC (rev 84671)
+++ trunk/tools/editor_trends/etl/enricher.py 2011-03-24 12:56:10 UTC (rev 84672)
@@ -25,10 +25,10 @@
import codecs
import re
import sys
+import datetime
import progressbar
from multiprocessing import JoinableQueue, Process, cpu_count, current_process
from xml.etree.cElementTree import fromstring, iterparse
-from lxml import objectify
from collections import deque
if '..' not in sys.path:
@@ -123,8 +123,8 @@
self.stringify(revision)
id = revision['revision_id']
self.revisions[id] = revision
- if len(self.revisions) > 1000:
- print 'Emptying buffer'
+ if len(self.revisions) > 10000:
+ print '%s: Emptying buffer' % (datetime.datetime.now())
self.store()
self.clear()
@@ -296,8 +296,7 @@
return comment
-def create_variables(article, cache, cache_comments, bots):
-
+def create_variables(article, cache, bots):
title = article['title'].text
namespace = determine_namespace(article['title'])
@@ -323,12 +322,15 @@
if revision_id == None:
#revision_id is missing, which is weird
continue
- comment = add_comment(revision_id, revision)
+
row = prefill_row(title, article_id, namespace)
row['revision_id'] = revision_id
text = extract_revision_text(revision)
row.update(contributor)
+ comment = add_comment(revision_id, revision)
+ cache.comments.update(comment)
+
timestamp = revision.find('timestamp').text
row['timestamp'] = timestamp
@@ -341,10 +343,8 @@
row.update(size)
row.update(revert)
cache.add(row)
-
-
def parse_xml(buffer):
context = iterparse(buffer, events=('end',))
context = iter(context)
@@ -369,7 +369,6 @@
parsing = False
bots = detector.retrieve_bots('en')
cache = Buffer(storage, id)
- cache_comments = Buffer(storage, id)
i = 0
while True:
filename = input_queue.get()
@@ -384,18 +383,17 @@
buffer.write(data)
buffer.write('\n')
if data == '</page>':
+ i += 1
buffer.seek(0)
article = parse_xml(buffer)
- create_variables(article, cache, cache_comments, bots)
+ create_variables(article, cache, bots)
buffer = cStringIO.StringIO()
- i += 1
+
if i % 10000 == 0:
print 'Parsed %s articles' % i
cache.empty()
- cache_comments.empty()
- print 'Buffer is empty'
print 'Finished parsing bz2 archives'
cache.stats.summary()
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs