http://www.mediawiki.org/wiki/Special:Code/MediaWiki/88349

Revision: 88349
Author:   diederik
Date:     2011-05-17 20:50:14 +0000 (Tue, 17 May 2011)
Log Message:
-----------
Compare quality of PPI editors vs. representative sample using a matching 
algorithm

Added Paths:
-----------
    trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py

Added: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py                     
        (rev 0)
+++ trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py     2011-05-17 
20:50:14 UTC (rev 88349)
@@ -0,0 +1,94 @@
+__author__ = '''\n'''.join(['Diederik van Liere ([email protected])'])
+__email__ = 'dvanliere at gmail dot com'
+__date__ = '2011-05-17'
+__version__ = '0.1'
+
+from datetime import datetime
+from math import pow
+import time
+import sys
+import os
+sys.path.append('../../')
+
+from classes import settings
+from classes import storage
+
+rts = settings.Settings()  # project settings; rts.csv_location is used below to locate ppi_editors.csv
+db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')  # shared handle queried below via db.find / db.find_one
+
+def create_sample_a():
+    editors = {}
+    location = os.path.join(rts.csv_location, 'ppi_editors.csv')
+    fh = open(location, 'r')
+    for line in fh:
+        line = line.strip()
+        username, chars, date = line.split('\t')
+        chars = int(chars)
+        date = datetime.strptime(date, '%Y-%m-%d')
+        editors.setdefault(username, [])
+        editors[username].append(date)
+    fh.close()
+    return editors
+
+
+def create_sample_b():
+    date = datetime(2010, 6, 30)
+    cursor = db.find('reg_date', {'$gte': date})
+    return cursor
+
+
+def create_dataset(editors):
+    obs = {}
+    print '%s\t%s\t%s\t%s\t%s' % ('username', 'date', 'number of reverts', 
'number of characters added', 'registration date')
+    for username in editors:
+        for date in editors[username]:
+            month = str(date.month)
+            year = str(date.year)
+            data = db.find_one('username', username)
+            if data:
+                revert_count = data['revert_count'].get(year, {}).get(month, 
{}).get('0', 0)
+                character_count = data['character_count'].get(year, 
{}).get(month, {}).get('0', {}).get('added', 0)
+                reg_date = data.get('reg_date', datetime(2001, 1, 1))
+                epoch = time.mktime(reg_date.timetuple())
+                cum_edit_count_main_ns = data.get('cum_edit_count_main_ns', 0)
+                cum_edit_count_other_ns = data.get('cum_edit_count_other_ns', 
0)
+                article_count = data['article_count'].get(year, {}).get(month, 
0)
+                print '%s\t%s\t%s\t%s\t%s' % (username, date, revert_count, 
character_count, reg_date)
+                obs.setdefault(username, {})
+                obs[username]['revert_count'] = revert_count
+                obs[username]['character_count'] = character_count
+                obs[username]['reg_date'] = epoch
+                obs[username]['cum_edit_count_main_ns'] = 
cum_edit_count_main_ns
+                obs[username]['cum_edit_count_other_ns'] = 
cum_edit_count_other_ns
+                obs[username]['article_count'] = article_count
+    return obs
+
+def euclidean_distance(vars, person1, person2):
+  sum_of_squares = sum([pow(person1[item] - person2[item], 2) for item in 
vars])
+  return 1 / (1 + sum_of_squares)
+
+
+def calculate_distance_matrix(obs_a, obs_b):
+    vars = ['character_count', 'reg_date', 'cum_edit_count_main_ns', 
'cum_edit_count_other_ns', 'article_count']
+    matches = {}
+    for person1 in obs_a:
+        for person2 in obs_b:
+            d = euclidean_distance(vars, person1, person2)
+            matches.setdefault(person1, {})
+            matches[person1][person2] = d
+    return matches
+
+def find_partner(matches):
+    pass
+
+def launcher():
+    editors_a = create_sample_a()
+    obs_a = create_dataset(editors_a)
+    editors_b = create_sample_b()
+    obs_b = create_dataset(editors_b)
+    matches = calculate_distance_matrix(obs_a, obs_b)
+    find_partner(matches)
+
+
+if __name__ == '__main__':  # run the analysis only when executed as a script
+    launcher()


Property changes on: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
___________________________________________________________________
Added: svn:eol-style
   + native


_______________________________________________
MediaWiki-CVS mailing list
mediawiki-cvs@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to