http://www.mediawiki.org/wiki/Special:Code/MediaWiki/84672
Revision: 84672
Author: diederik
Date: 2011-03-24 12:56:10 +0000 (Thu, 24 Mar 2011)
Log Message:
-----------
A bit more detailed progress logging.
Modified Paths:
--------------
trunk/tools/editor_trends/etl/enricher.py
Modified: trunk/tools/editor_trends/etl/enricher.py
===================================================================
--- trunk/tools/editor_trends/etl/enricher.py 2011-03-24 11:11:15 UTC (rev 84671)
+++ trunk/tools/editor_trends/etl/enricher.py 2011-03-24 12:56:10 UTC (rev 84672)
@@ -25,10 +25,10 @@
import codecs
import re
import sys
+import datetime
import progressbar
from multiprocessing import JoinableQueue, Process, cpu_count, current_process
from xml.etree.cElementTree import fromstring, iterparse
-from lxml import objectify
from collections import deque
if '..' not in sys.path:
@@ -123,8 +123,8 @@
self.stringify(revision)
id = revision['revision_id']
self.revisions[id] = revision
- if len(self.revisions) > 1000:
- print 'Emptying buffer'
+ if len(self.revisions) > 10000:
+ print '%s: Emptying buffer' % (datetime.datetime.now())
self.store()
self.clear()
@@ -296,8 +296,7 @@
return comment
-def create_variables(article, cache, cache_comments, bots):
-
+def create_variables(article, cache, bots):
title = article['title'].text
namespace = determine_namespace(article['title'])
@@ -323,12 +322,15 @@
if revision_id == None:
#revision_id is missing, which is weird
continue
- comment = add_comment(revision_id, revision)
+
row = prefill_row(title, article_id, namespace)
row['revision_id'] = revision_id
text = extract_revision_text(revision)
row.update(contributor)
+ comment = add_comment(revision_id, revision)
+ cache.comments.update(comment)
+
timestamp = revision.find('timestamp').text
row['timestamp'] = timestamp
@@ -341,10 +343,8 @@
row.update(size)
row.update(revert)
cache.add(row)
-
-
def parse_xml(buffer):
context = iterparse(buffer, events=('end',))
context = iter(context)
@@ -369,7 +369,6 @@
parsing = False
bots = detector.retrieve_bots('en')
cache = Buffer(storage, id)
- cache_comments = Buffer(storage, id)
i = 0
while True:
filename = input_queue.get()
@@ -384,18 +383,17 @@
buffer.write(data)
buffer.write('\n')
if data == '</page>':
+ i += 1
buffer.seek(0)
article = parse_xml(buffer)
- create_variables(article, cache, cache_comments, bots)
+ create_variables(article, cache, bots)
buffer = cStringIO.StringIO()
- i += 1
+
if i % 10000 == 0:
print 'Parsed %s articles' % i
cache.empty()
- cache_comments.empty()
- print 'Buffer is empty'
print 'Finished parsing bz2 archives'
cache.stats.summary()
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs