http://www.mediawiki.org/wiki/Special:Code/MediaWiki/84696

Revision: 84696
Author:   diederik
Date:     2011-03-24 20:00:50 +0000 (Thu, 24 Mar 2011)
Log Message:
-----------
Added counting of the number of edits for the prediction dataset.

Modified Paths:
--------------
    trunk/tools/editor_trends/etl/enricher.py

Modified: trunk/tools/editor_trends/etl/enricher.py
===================================================================
--- trunk/tools/editor_trends/etl/enricher.py   2011-03-24 19:58:20 UTC (rev 84695)
+++ trunk/tools/editor_trends/etl/enricher.py   2011-03-24 20:00:50 UTC (rev 84696)
@@ -394,7 +394,7 @@
     return article
 
 
-def stream_raw_xml(input_queue, storage, id, dataset='training'):
+def stream_raw_xml(input_queue, storage, id, function, dataset='training'):
     buffer = cStringIO.StringIO()
     parsing = False
     i = 0
@@ -457,7 +457,7 @@
         cassandra.install_schema(keyspace_name, drop_first=True)
 
 
-def launcher(function, path):
+def launcher(function, path, dataset):
     storage = 'csv'
     setup(storage)
     input_queue = JoinableQueue()
@@ -474,7 +474,7 @@
     for x in xrange(cpu_count()):
         input_queue.put(None)
 
-    extracters = [Process(target=stream_raw_xml, args=[input_queue, function, storage, x])
+    extracters = [Process(target=stream_raw_xml, args=[input_queue, function, storage, x, dataset])
                   for x in xrange(cpu_count())]
     for extracter in extracters:
         extracter.start()
@@ -483,10 +483,12 @@
 
 
 if __name__ == '__main__':
-    path1 = '/media/wikipedia_dumps/batch1/'
-    path2 = '/media/wikipedia_dumps/batch2/'
+    path1 = '/media/wikipedia_dumps/batch2/'
+    path2 = '/media/wikipedia_dumps/batch1/'
     function1 = create_variables
     function2 = count_edits
 
-    launcher(function1, path1) # launcher for creating training data
-    launcher(function2, path2) # launcher for creating test data
+    dataset1 = 'training'
+    dataset2 = 'prediction'
+    #launcher(function1, path1, dataset1) # launcher for creating training data
+    launcher(function2, path2, dataset2) # launcher for creating test data


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to