http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95826
Revision: 95826
Author: swalker
Date: 2011-08-31 00:29:39 +0000 (Wed, 31 Aug 2011)
Log Message:
-----------
initial commit - process mobile logs for phone and browser versions (reducer)
Added Paths:
-----------
trunk/tools/wsor/mobile/reducer.py
Added: trunk/tools/wsor/mobile/reducer.py
===================================================================
--- trunk/tools/wsor/mobile/reducer.py (rev 0)
+++ trunk/tools/wsor/mobile/reducer.py 2011-08-31 00:29:39 UTC (rev 95826)
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+from operator import itemgetter
+import sys
+
+current_word = None
+current_count = 0
+word = None
+
+# input comes from STDIN
+for line in sys.stdin:
+ # remove leading and trailing whitespace
+ line = line.strip()
+
+ # parse the input we got from mapper.py
+ print line
+ try:
+ word, count = line.split('\t', 1)
+ except ValueError:
+ continue
+
+ # convert count (currently a string) to int
+ try:
+ count = int(count)
+ except ValueError:
+ # count was not a number, so silently
+ # ignore/discard this line
+ continue
+
+ # this IF-switch only works because Hadoop sorts map output
+ # by key (here: word) before it is passed to the reducer
+ if current_word == word:
+ current_count += count
+ else:
+ if current_word:
+ # write result to STDOUT
+ print '%s\t%s' % (current_word, current_count)
+ current_count = count
+ current_word = word
+
+# do not forget to output the last word if needed!
+if current_word == word:
+ print '%s\t%s' % (current_word, current_count)
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs