https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113493

Revision: 113493
Author:   giovanni
Date:     2012-03-09 18:29:50 +0000 (Fri, 09 Mar 2012)
Log Message:
-----------
renamed mksamples -> groupbyday; groupbyday now writes into numpy binary array files

Added Paths:
-----------
    trunk/tools/wsor/editor_lifecycle/scripts/groupbyday

Removed Paths:
-------------
    trunk/tools/wsor/editor_lifecycle/scripts/mksamples

Added: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/groupbyday        (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/scripts/groupbyday        2012-03-09 18:29:50 UTC (rev 113493)
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+#:vim:ft=python
+# encoding:utf-8
+
+''' groups user counts by day since registration '''
+
+import os
+from argparse import ArgumentParser
+import numpy as np
+from scipy.sparse import coo_matrix
+from collections import deque
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('input_paths', metavar='file', nargs='+')
+parser.add_argument('-p', '--prefix', dest='out_prefix', default='daily_',
+        metavar='PREFIX', help="(default: %(default)s)")
+
+def group_by_day(counts):
+    '''
+    counts is a mapping between user IDs and edits-by-namespace count data
+    '''
+    # hold cohort daily counts in a mapping in memory
+    day_counts = {}
+
+    for uid in counts:
+        data = counts[uid].view(np.recarray)
+
+        # NS < 0 are virtual. Filter out those edits because they are junk
+        idx = data.ns >= 0
+        data = data[idx]
+
+        # Sparse matrix (num_days x namespaces) where num_days is the activity
+        # span in days. Summing along rows returns a dense matrix
+        counts_matrix = coo_matrix((data.edits, (data.day - data.day.min(), \
+                data.ns))).tocsc().sum(axis=1)
+        
+        # Add counts to cohort daily counts
+        for day in xrange(counts_matrix.shape[0]):
+            n = int(counts_matrix[day])
+            try:
+                day_counts[str(day)].append(n)
+            except KeyError:
+                day_counts[str(day)] = deque([n])
+
+    return day_counts
+
+def main(args):
+    for path in args.input_paths:
+        # if path is /a/b/c/whatever.npz, by default output will be in
+        # $WD/daily_whatever.npz where $WD is the working dir
+        out_path = args.out_prefix + os.path.basename(path)
+        out_path = os.path.splitext(out_path)[0] + '.npz'
+
+        # load input, group, save to file, tell user
+        user_counts = np.load(path)
+        N = len(user_counts.files)
+        print '%d users in %s' % (N, path)
+        day_counts = group_by_day(user_counts)
+        np.savez(out_path, **day_counts)
+        print '%s saved (%d days).' % (out_path, len(day_counts))
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    main(args)
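
A minimal standalone sketch of the grouping step above, for reference; the
data here is made up, and the 'day', 'ns' and 'edits' field names simply
mirror the recarray layout the script expects:

    import numpy as np
    from scipy.sparse import coo_matrix

    # one user's per-(day, namespace) edit counts: (day, ns, edits)
    data = np.array([(0, 0, 3), (0, 1, 1), (2, 0, 5)],
                    dtype=[('day', int), ('ns', int), ('edits', int)])
    data = data.view(np.recarray)

    # rows = days since the user's first edit, columns = namespaces;
    # summing along the columns collapses namespaces into one total per day
    matrix = coo_matrix((data.edits, (data.day - data.day.min(), data.ns)))
    daily_totals = matrix.tocsc().sum(axis=1)  # dense (num_days x 1) matrix

    print(daily_totals)  # totals for days 0, 1 and 2 are 4, 0 and 5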


Property changes on: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
___________________________________________________________________
Added: svn:executable
   + *
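
A hedged sketch of how the new .npz output might be read back: each key in
the archive is a day offset stored as a string, and each value holds one
count per user whose activity span covers that day ('daily_whatever.npz' is
a hypothetical output file name):

    import numpy as np

    archive = np.load('daily_whatever.npz')  # hypothetical output file
    for day in sorted(archive.files, key=int):
        counts = archive[day]
        print('day %s: %d users, mean %.1f edits'
              % (day, len(counts), counts.mean()))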

Deleted: trunk/tools/wsor/editor_lifecycle/scripts/mksamples
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/mksamples        2012-03-09 18:29:43 UTC (rev 113492)
+++ trunk/tools/wsor/editor_lifecycle/scripts/mksamples        2012-03-09 18:29:50 UTC (rev 113493)
@@ -1,47 +0,0 @@
-#!/usr/bin/python
-#:vim:ft=python
-# encoding:utf-8
-
-''' groups user counts by day since registration '''
-
-import os
-from argparse import ArgumentParser
-import numpy as np
-from scipy.sparse import coo_matrix
-from collections import deque
-from contextlib import closing
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('input_paths', metavar='file', nargs='+')
-parser.add_argument('-p', '--prefix', dest='output_prefix', default='',
-        metavar='PREFIX')
-
-def main(args):
-    for path in args.input_paths:
-        output_path = args.output_prefix + os.path.basename(path)
-        output_path = os.path.splitext(output_path)[0] + '.tsv'
-        day_counts = {}
-        archive = np.load(path)
-        N = len(archive.files)
-        print '%d users in %s' % (N, path)
-        with closing(open(output_path, 'w')) as out_file:
-            for uid in archive.files:
-                data = archive[uid].view(np.recarray)
-                idx = data.ns >= 0
-                data = data[idx]
-                counts = coo_matrix((data.edits, (data.day - data.day.min(),
-                    data.ns))).tocsr().sum(axis=1)
-                for day in xrange(counts.shape[0]):
-                    n = int(counts[day])
-                    try:
-                        day_counts[day].append(n)
-                    except KeyError:
-                        day_counts[day] = deque([n])
-            max_day = max(day_counts.keys())
-            for day in xrange(max_day):
-                print >> out_file, ' '.join(map(str, day_counts.get(day, [])))
-        print '%s saved.' % output_path
-
-if __name__ == '__main__':
-    args = parser.parse_args()
-    main(args)
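
For comparison, the removed script wrote one whitespace-separated row of
per-user counts per day into a plain text file; a minimal sketch of reading
that old format back ('whatever.tsv' is a hypothetical file name):

    # one row per day since registration; rows may have different lengths
    with open('whatever.tsv') as tsv:
        day_counts = [[int(n) for n in row.split()] for row in tsv]
    print('%d days read' % len(day_counts))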


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs
