https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113493
Revision: 113493
Author: giovanni
Date: 2012-03-09 18:29:50 +0000 (Fri, 09 Mar 2012)
Log Message:
-----------
renamed mksamples -> groupbyday; groupbyday now writes into numpy binary array files
Added Paths:
-----------
trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
Removed Paths:
-------------
trunk/tools/wsor/editor_lifecycle/scripts/mksamples
Added: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/groupbyday (rev 0)
+++ trunk/tools/wsor/editor_lifecycle/scripts/groupbyday 2012-03-09 18:29:50 UTC (rev 113493)
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+#:vim:ft=python
+# encoding:utf-8
+
+''' groups user counts by day since registration '''
+
+import os
+from argparse import ArgumentParser
+import numpy as np
+from scipy.sparse import coo_matrix
+from collections import deque
+
+parser = ArgumentParser(description=__doc__)
+parser.add_argument('input_paths', metavar='file', nargs='+')
+parser.add_argument('-p', '--prefix', dest='out_prefix', default='daily_',
+ metavar='PREFIX', help="(default: %(metavar)s)")
+
+def group_by_day(counts):
+ '''
+ counts is a mapping between user IDs and edits-by-namespace count data
+ '''
+ # hold cohort daily counts in a mapping in memory
+ day_counts = {}
+
+ for uid in counts:
+ data = counts[uid].view(np.recarray)
+
+ # NS < 0 are virtual. Filter out those edits because they are junk
+ idx = data.ns >= 0
+ data = data[idx]
+
+ # Sparse matrix (num_days x namespaces) where num_days is the activity
+ # span in days. Summing along rows returns a dense matrix
+ counts_matrix = coo_matrix((data.edits, (data.day - data.day.min(), \
+ data.ns))).tocsc().sum(axis=1)
+
+ # Add counts to cohort daily counts
+ for day in xrange(counts_matrix.shape[0]):
+ n = int(counts_matrix[day])
+ try:
+ day_counts[str(day)].append(n)
+ except KeyError:
+ day_counts[str(day)] = deque([n])
+
+ return day_counts
+
+def main(args):
+ for path in args.input_paths:
+ # if path is /a/b/c/whatever.npz, by default output will be in
+ # $WD/byday_whatever.npz where $WD is the working dir
+ out_path = args.out_prefix + os.path.basename(path)
+ out_path = os.path.splitext(out_path)[0] + '.npz'
+
+ # load input, group, save to file, tell user
+ user_counts = np.load(path)
+ N = len(user_counts.files)
+ print '%d users in %s' % (N, path)
+ day_counts = group_by_day(user_counts)
+ np.savez(out_path, **day_counts)
+ print '%s saved (%d days).' % (out_path, len(day_counts))
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ main(args)
Property changes on: trunk/tools/wsor/editor_lifecycle/scripts/groupbyday
___________________________________________________________________
Added: svn:executable
+ *
Deleted: trunk/tools/wsor/editor_lifecycle/scripts/mksamples
===================================================================
--- trunk/tools/wsor/editor_lifecycle/scripts/mksamples 2012-03-09 18:29:43 UTC (rev 113492)
+++ trunk/tools/wsor/editor_lifecycle/scripts/mksamples 2012-03-09 18:29:50 UTC (rev 113493)
@@ -1,47 +0,0 @@
-#!/usr/bin/python
-#:vim:ft=python
-# encoding:utf-8
-
-''' groups user counts by day since registration '''
-
-import os
-from argparse import ArgumentParser
-import numpy as np
-from scipy.sparse import coo_matrix
-from collections import deque
-from contextlib import closing
-
-parser = ArgumentParser(description=__doc__)
-parser.add_argument('input_paths', metavar='file', nargs='+')
-parser.add_argument('-p', '--prefix', dest='output_prefix', default='',
- metavar='PREFIX')
-
-def main(args):
- for path in args.input_paths:
- output_path = args.output_prefix + os.path.basename(path)
- output_path = os.path.splitext(output_path)[0] + '.tsv'
- day_counts = {}
- archive = np.load(path)
- N = len(archive.files)
- print '%d users in %s' % (N, path)
- with closing(open(output_path, 'w')) as out_file:
- for uid in archive.files:
- data = archive[uid].view(np.recarray)
- idx = data.ns >= 0
- data = data[idx]
- counts = coo_matrix((data.edits, (data.day - data.day.min(),
- data.ns))).tocsr().sum(axis=1)
- for day in xrange(counts.shape[0]):
- n = int(counts[day])
- try:
- day_counts[day].append(n)
- except KeyError:
- day_counts[day] = deque([n])
- max_day = max(day_counts.keys())
- for day in xrange(max_day):
- print >> out_file, ' '.join(map(str, day_counts.get(day, [])))
- print '%s saved.' % output_path
-
-if __name__ == '__main__':
- args = parser.parse_args()
- main(args)
_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs