http://www.mediawiki.org/wiki/Special:Code/MediaWiki/95826

Revision: 95826
Author:   swalker
Date:     2011-08-31 00:29:39 +0000 (Wed, 31 Aug 2011)
Log Message:
-----------
initial commit - process mobile logs for phone and browser versions (reducer)

Added Paths:
-----------
    trunk/tools/wsor/mobile/reducer.py

Added: trunk/tools/wsor/mobile/reducer.py
===================================================================
--- trunk/tools/wsor/mobile/reducer.py                          (rev 0)
+++ trunk/tools/wsor/mobile/reducer.py  2011-08-31 00:29:39 UTC (rev 95826)
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+from operator import itemgetter
+import sys
+
+current_word = None
+current_count = 0
+word = None
+
+# input comes from STDIN
+for line in sys.stdin:
+    # remove leading and trailing whitespace
+    line = line.strip()
+
+    # parse the input we got from mapper.py
+    print line
+    try:
+        word, count = line.split('\t', 1)
+    except ValueError:
+        continue
+
+    # convert count (currently a string) to int
+    try:
+        count = int(count)
+    except ValueError:
+        # count was not a number, so silently
+        # ignore/discard this line
+        continue
+
+    # this IF-switch only works because Hadoop sorts map output
+    # by key (here: word) before it is passed to the reducer
+    if current_word == word:
+        current_count += count
+    else:
+        if current_word:
+            # write result to STDOUT
+            print '%s\t%s' % (current_word, current_count)
+        current_count = count
+        current_word = word
+
+# do not forget to output the last word if needed!
+if current_word == word:
+    print '%s\t%s' % (current_word, current_count)


_______________________________________________
MediaWiki-CVS mailing list
mediawiki-cvs@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to