http://www.mediawiki.org/wiki/Special:Code/MediaWiki/88408
Revision: 88408
Author: diederik
Date: 2011-05-19 15:10:24 +0000 (Thu, 19 May 2011)
Log Message:
-----------
Fixed euclidean distance formula.
Modified Paths:
--------------
trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
Modified: trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py
===================================================================
--- trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py 2011-05-19
15:06:29 UTC (rev 88407)
+++ trunk/tools/editor_trends/analyses/adhoc/ppi_quality.py 2011-05-19
15:10:24 UTC (rev 88408)
@@ -4,7 +4,8 @@
__version__ = '0.1'
from datetime import datetime
-from math import pow
+from math import pow, log, sqrt
+import codecs
import time
import sys
import os
@@ -32,62 +33,166 @@
def create_sample_b():
    """Return the usernames of editors who registered in Sept/Oct 2010.

    Queries the editor collection on ``reg_date`` and collects the
    ``username`` field of every match.
    """
    start = datetime(2010, 9, 1)
    end = datetime(2010, 11, 1)
    cursor = db.find('reg_date', {'$gte': start, '$lt': end})
    return [editor['username'] for editor in cursor]
def retrieve_variables(obs, username, date):
    """Look up one editor in the database and add their activity counts to *obs*.

    Counts are read for the year/month of *date*; the '0' bucket appears to be
    an aggregate key in the stored document -- TODO confirm against the schema.
    Editors whose counts are all zero are skipped. Returns the updated *obs*.
    """
    data = db.find_one('username', username)
    year = str(date.year)
    month = str(date.month)
    if data:
        # Guard the top-level keys with .get() as well: previously
        # data['revert_count'] / ['character_count'] / ['article_count']
        # raised KeyError for documents missing those fields, while every
        # nested lookup was already .get()-protected.
        revert_count = data.get('revert_count', {}).get(year, {}).get(month, {}).get('0', 0)
        character_count = data.get('character_count', {}).get(year, {}).get(month, {}).get('0', {}).get('added', 0)
        reg_date = data.get('reg_date', datetime(2001, 1, 1))
        cum_edit_count_main_ns = data.get('cum_edit_count_main_ns', 0)
        cum_edit_count_other_ns = data.get('cum_edit_count_other_ns', 0)
        article_count = data.get('article_count', {}).get(year, {}).get(month, {}).get('0', 0)

        # Only keep editors with at least some measurable activity.
        if character_count + cum_edit_count_main_ns + cum_edit_count_other_ns + article_count > 0:
            obs.setdefault(username, {})
            obs[username]['revert_count'] = float(revert_count)
            obs[username]['character_count'] = float(character_count)
            obs[username]['reg_date'] = reg_date  # kept as datetime, not epoch
            obs[username]['cum_edit_count_main_ns'] = float(cum_edit_count_main_ns)
            obs[username]['cum_edit_count_other_ns'] = float(cum_edit_count_other_ns)
            obs[username]['article_count'] = float(article_count)
    return obs
+
+
def create_dataset(editors):
    """Build the observation dict for *editors* at each sampling date.

    Delegates the per-editor database lookup to retrieve_variables().
    """
    obs = {}
    sample_dates = [datetime(2010, 11, 30)]  # , datetime(2010, 12, 31)]
    for name in editors:
        for sample_date in sample_dates:
            obs = retrieve_variables(obs, name, sample_date)
    return obs
+
def euclidean_distance(vars, person1, person2):
    """Return a similarity score in (0, 1] between two observation dicts.

    Computes 1 / (1 + d) where d is the Euclidean distance over the
    numeric variables in *vars*; identical observations score 1.0.
    'reg_date' (a datetime) and 'revert_count' are deliberately excluded
    from the metric.
    """
    sum_of_squares = 0.0
    for item in vars:
        # Guard clause instead of the previous `if ...: pass / else:` shape.
        if item in ('reg_date', 'revert_count'):
            continue
        sum_of_squares += pow(person1[item] - person2[item], 2)
    return 1 / (1 + sqrt(sum_of_squares))
def calculate_distance_matrix(vars, obs_a, obs_b):
    """Build a nested dict of similarity scores between the two samples.

    distances[editor_a][editor_b] holds euclidean_distance() for every
    cross-sample pair of distinct editors.
    """
    print('Constructing distance matrix...')
    distances = {}
    for editor_a in obs_a:
        row = distances.setdefault(editor_a, {}) if obs_b else None
        for editor_b in obs_b:
            if editor_a == editor_b:
                continue
            distances.setdefault(editor_a, {})
            distances[editor_a][editor_b] = euclidean_distance(
                vars, obs_a[editor_a], obs_b[editor_b])
    return distances
+
+
def normalize_dataset(vars, obs):
    """Divide each variable in *obs* by its standard deviation, in place.

    Bug fixed: the sample passed to standard_deviation() is now rebuilt for
    every variable; previously ``data`` accumulated the values of *all*
    earlier variables, skewing the deviation of every variable after the
    first. A zero deviation maps the value to 0 instead of raising.
    NOTE(review): assumes every obs[editor][var] is numeric -- 'reg_date'
    is a datetime and would fail here; confirm callers exclude it.
    """
    for var in vars:
        data = [obs[editor][var] for editor in obs]
        sd = standard_deviation(data)
        for editor in obs:
            try:
                obs[editor][var] = obs[editor][var] / sd
            except ZeroDivisionError:
                obs[editor][var] = 0
    return obs
+
+
def standard_deviation(data):
    """Return the population standard deviation of the numbers in *data*.

    Bug fixed: the previous version squared the *sum* of the values
    ((sum x)^2 / n - (sum x / n)^2) instead of summing the squares, and never
    took a square root, so it returned neither the variance nor the SD.
    Returns 0.0 for an empty sequence instead of raising ZeroDivisionError.
    """
    n = len(data)
    if n == 0:
        return 0.0
    mean = sum(data) / float(n)  # float() guards Python 2 integer division
    variance = sum((x - mean) ** 2 for x in data) / n
    return sqrt(variance)
+
+
def inverse_dictionary(data):
    """Return a value -> key mapping of *data* (ties keep the last key seen)."""
    inverted = {}
    for key, value in data.items():
        inverted[value] = key
    return inverted
+
+
def find_partner(distances):
    """Greedily pair every PPI editor with its most similar candidate.

    euclidean_distance() returns a similarity (1 / (1 + d)), so the best
    partner has the *largest* score. A matched candidate is popped from every
    editor's row so it cannot be assigned twice; *distances* is mutated.
    NOTE(review): inverse_dictionary() keeps only one editor per score value,
    so candidates with tied scores are silently dropped -- confirm intended.
    """
    print('Finding similar partners...')
    matches = []
    for ppi_editor in distances.keys():
        by_score = inverse_dictionary(distances[ppi_editor])
        worst = min(by_score.keys())
        best = max(by_score.keys())
        partner = by_score[best]
        matches.append((ppi_editor, partner))
        # Retire the chosen partner from every remaining candidate row.
        for editor in distances:
            try:
                distances[editor].pop(partner)
            except KeyError:
                pass
        print('%s %s %s %s' % (ppi_editor, partner, worst, best))
    return matches
def write_dataset(vars, matches, obs_a, obs_b):
    """Write matched editor pairs and their variables to ppi_quality.csv (TSV).

    One row per match: editor_a, its variables (suffixed _a), editor_b, its
    variables (suffixed _b), the registration-date delta in days, and a row id.

    Bugs fixed: the header built with ``'_a\\t'.join(vars)`` left the *last*
    variable without its _a/_b suffix; appending '\\n' to the field list
    before ``'\\t'.join`` put a stray tab at the end of every data row.
    The per-row debug ``print`` was dropped.
    """
    print('Writing dataset to CSV file...')
    fh = codecs.open('ppi_quality.csv', 'w', 'utf-8')
    header = ['editor_a'] + ['%s_a' % v for v in vars]
    header += ['editor_b'] + ['%s_b' % v for v in vars]
    header += ['delta registration days', 'id']
    fh.write('\t'.join(header) + '\n')
    for i, (editor_a, editor_b) in enumerate(matches):
        fields = [editor_a]
        fields.extend(str(obs_a[editor_a][v]) for v in vars)
        fields.append(editor_b)
        fields.extend(str(obs_b[editor_b][v]) for v in vars)
        # reg_date values are datetimes, so the difference is a timedelta.
        dt = obs_a[editor_a]['reg_date'] - obs_b[editor_b]['reg_date']
        fields.append(str(dt.days))
        fields.append(str(i))
        fh.write('\t'.join(fields) + '\n')
    fh.close()
+
+
def launcher():
    """Run the full pipeline: sample, measure, match, and export to CSV."""
    print('Retrieving datasets...')
    vars = ['character_count', 'reg_date', 'cum_edit_count_main_ns',
            'cum_edit_count_other_ns', 'article_count', 'revert_count']
    obs_a = create_dataset(create_sample_a())
    # obs_a = normalize_dataset(vars, obs_a)
    obs_b = create_dataset(create_sample_b())
    # obs_b = normalize_dataset(vars, obs_b)
    distances = calculate_distance_matrix(vars, obs_a, obs_b)
    matches = find_partner(distances)
    write_dataset(vars, matches, obs_a, obs_b)
if __name__ == '__main__':
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs