http://www.mediawiki.org/wiki/Special:Code/MediaWiki/84696
Revision: 84696
Author: diederik
Date: 2011-03-24 20:00:50 +0000 (Thu, 24 Mar 2011)
Log Message:
-----------
Added counting of the number of edits for the prediction dataset.
Modified Paths:
--------------
trunk/tools/editor_trends/etl/enricher.py
Modified: trunk/tools/editor_trends/etl/enricher.py
===================================================================
--- trunk/tools/editor_trends/etl/enricher.py 2011-03-24 19:58:20 UTC (rev 84695)
+++ trunk/tools/editor_trends/etl/enricher.py 2011-03-24 20:00:50 UTC (rev 84696)
@@ -394,7 +394,7 @@
return article
-def stream_raw_xml(input_queue, storage, id, dataset='training'):
+def stream_raw_xml(input_queue, storage, id, function, dataset='training'):
buffer = cStringIO.StringIO()
parsing = False
i = 0
@@ -457,7 +457,7 @@
cassandra.install_schema(keyspace_name, drop_first=True)
-def launcher(function, path):
+def launcher(function, path, dataset):
storage = 'csv'
setup(storage)
input_queue = JoinableQueue()
@@ -474,7 +474,7 @@
for x in xrange(cpu_count()):
input_queue.put(None)
- extracters = [Process(target=stream_raw_xml, args=[input_queue, function,
storage, x])
+ extracters = [Process(target=stream_raw_xml, args=[input_queue, function,
storage, x, dataset])
for x in xrange(cpu_count())]
for extracter in extracters:
extracter.start()
@@ -483,10 +483,12 @@
if __name__ == '__main__':
- path1 = '/media/wikipedia_dumps/batch1/'
- path2 = '/media/wikipedia_dumps/batch2/'
+ path1 = '/media/wikipedia_dumps/batch2/'
+ path2 = '/media/wikipedia_dumps/batch1/'
function1 = create_variables
function2 = count_edits
- launcher(function1, path1) # launcher for creating training data
- launcher(function2, path2) # launcher for creating test data
+ dataset1 = 'training'
+ dataset2 = 'prediction'
+ #launcher(function1, path1, dataset1) # launcher for creating training data
+ launcher(function2, path2, dataset2) # launcher for creating test data
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs