http://www.mediawiki.org/wiki/Special:Code/MediaWiki/89424

Revision: 89424
Author:   halfak
Date:     2011-06-03 16:56:45 +0000 (Fri, 03 Jun 2011)
Log Message:
-----------
Added code for toolserver language samples for history coding

Added Paths:
-----------
    trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py
    trunk/tools/wsor/ts_samples/sample_talk_edits.py
    trunk/tools/wsor/ts_samples/testing.sql

Added: trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py
===================================================================
--- trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py	(rev 0)
+++ trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py	2011-06-03 16:56:45 UTC (rev 89424)
@@ -0,0 +1,152 @@
+import os, sys, logging, argparse, MySQLdb, urllib2, urllib, json
+
+def clean(v):
+       """
+       Encodes a value as one field of tab-separated output.
+
+       :Parameters:
+               v : any
+                       the value to encode
+
+       :Returns:
+               the NULL marker (a backslash followed by N) when v is None,
+               otherwise str(v) with backslash, tab and newline escaped
+       """
+       if v is None:
+               return "\\N"
+       else:
+               # Backslashes must be escaped first; otherwise the backslashes
+               # introduced by the tab/newline escapes get doubled as well.
+               return (
+                       str(v)
+                       .replace("\\", "\\\\")
+                       .replace("\t", "\\t")
+                       .replace("\n", "\\n")
+               )
+       
+
+def main(args):
+       """
+       Prints a two-column TSV (rev_id, diff) for every row of the input.
+
+       :Parameters:
+               args : argparse namespace
+                       args.input is the open TSV file to read (it must have a
+                       rev_id column) and args.uri is the API address handed to
+                       getSingleDiff()
+
+       Progress markers ("<" before and ">" after each request) are written
+       to stderr; the TSV itself goes to stdout.
+       """
+       LOGGING_STREAM = sys.stderr
+       logging.basicConfig(
+               level=logging.DEBUG,
+               stream=LOGGING_STREAM,
+               format='%(asctime)s %(levelname)-8s %(message)s',
+               datefmt='%b-%d %H:%M:%S'
+       )
+
+       logging.info("Reading from %s." % args.input)
+
+       #Print header
+       print("\t".join(['rev_id', 'diff']))
+
+       # One API request per row.  buildDiffMap() can fetch several
+       # revisions per request should batching ever be needed.
+       for row in readTSVFile(args.input):
+               LOGGING_STREAM.write("<")
+               # getSingleDiff() returns None when the response lists no
+               # revisions; emit an empty diff rather than crash on None.
+               diff = getSingleDiff(args.uri, row['rev_id']) or ""
+               print(
+                       "\t".join([
+                               row['rev_id'],
+                               diff.replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t")
+                       ])
+               )
+               LOGGING_STREAM.write(">")
+
+       LOGGING_STREAM.write("\n")
+                               
+                       
+
+def getSingleDiff(uri, revId):
+       """
+       Requests the diff between one revision and its previous revision.
+
+       :Parameters:
+               uri : str
+                       address of the API to send the query to
+               revId : str or int
+                       id of the revision to diff against its predecessor
+
+       :Returns:
+               the utf-8 encoded diff text of the first revision found in the
+               response ("" if its diff has no "*" entry), or None when the
+               response lists no revisions
+
+       :Raises:
+               whatever error is hit while walking the decoded response; the
+               raw and the decoded response are logged before re-raising
+       """
+       connection = urllib2.urlopen(
+               uri,
+               urllib.urlencode({
+                       'action':   'query',
+                       'prop':     'revisions',
+                       'revids':   revId,
+                       'rvprop':   'ids|content',
+                       'rvdiffto': 'prev',
+                       'format':   'json'
+               }),
+       )
+       try:
+               response = connection.read()
+       finally:
+               # Release the HTTP handle even if reading fails.
+               connection.close()
+
+       result = json.loads(response)
+       try:
+               for page in result['query']['pages'].values():
+                       for rev in page['revisions']:
+                               return rev['diff'].get("*", "").encode('utf-8')
+       except Exception:
+               logging.error(response)
+               logging.error(result)
+               # A bare raise keeps the original traceback.
+               raise
+
+       return None
+       
+
+def buildDiffMap(uri, revIds):
+       """
+       Requests the diffs of several revisions with a single API query.
+
+       :Parameters:
+               uri : str
+                       address of the API to send the query to
+               revIds : list
+                       ids (str or int) of the revisions to diff against their
+                       previous revisions
+
+       :Returns:
+               a dict mapping str(revid) to the diff text reported for that
+               revision ("" if its diff has no "*" entry); {} for no ids
+
+       :Raises:
+               whatever error is hit while walking the decoded response; the
+               raw and the decoded response are logged before re-raising
+       """
+       if not revIds:
+               return {}
+
+       connection = urllib2.urlopen(
+               uri,
+               urllib.urlencode({
+                       'action':   'query',
+                       'prop':     'revisions',
+                       # str() lets callers pass integer ids as well
+                       'revids':   '|'.join(str(revId) for revId in revIds),
+                       'rvprop':   'ids|content',
+                       'rvdiffto': 'prev',
+                       'format':   'json'
+               }),
+       )
+       try:
+               response = connection.read()
+       finally:
+               # Release the HTTP handle even if reading fails.
+               connection.close()
+
+       result = json.loads(response)
+       diffMap = {}
+       try:
+               for page in result['query']['pages'].values():
+                       for rev in page['revisions']:
+                               diffMap[str(rev['revid'])] = rev['diff'].get("*", "")
+       except Exception:
+               logging.error(response)
+               logging.error(result)
+               # A bare raise keeps the original traceback.
+               raise
+
+       return diffMap
+               
+                       
+
+def readTSVFile(f):
+       """
+       Generates one dict per data row of a tab-separated file.
+
+       :Parameters:
+               f : file
+                       an open, line-iterable file whose first line holds the
+                       column names
+
+       :Returns:
+               a generator of dicts mapping column name to the field text
+       """
+       headers = f.readline().rstrip("\r\n").split("\t")
+       for line in f:
+               # Only the line ending is removed: a plain strip() would also eat
+               # leading/trailing tabs and so drop empty first or last fields.
+               values = line.rstrip("\r\n").split("\t")
+               yield dict(zip(headers, values))
+       
+
+
+if __name__ == "__main__":
+       # Command line entry point: parse the options and hand them to main().
+       parser = argparse.ArgumentParser(
+               description=
+                       'Adds diff information to a sample of talk edits'
+       )
+       parser.add_argument(
+               '-u', '--uri',
+               type=str,
+               help='the uri of the api to connect to (defaults to enwp api)',
+               # NOTE(review): the enwp API is normally served from /w/api.php;
+               # confirm this default before relying on it.
+               default="http://en.wikipedia.org/api.php"
+       )
+       parser.add_argument(
+               '-i', '--input',
+               metavar="<path>",
+               # FileType reports an unreadable path as a usage error instead
+               # of an uncaught IOError traceback.
+               type=argparse.FileType("r"),
+               help='the sample file to read (defaults to standard in)',
+               default=sys.stdin
+       )
+       args = parser.parse_args()
+       main(args)

Added: trunk/tools/wsor/ts_samples/sample_talk_edits.py
===================================================================
--- trunk/tools/wsor/ts_samples/sample_talk_edits.py	(rev 0)
+++ trunk/tools/wsor/ts_samples/sample_talk_edits.py	2011-06-03 16:56:45 UTC (rev 89424)
@@ -0,0 +1,210 @@
+import os, sys, logging, argparse, MySQLdb
+
+def clean(v):
+       """
+       Encodes a value as one field of the tab-separated sample output.
+
+       :Parameters:
+               v : any
+                       the value to encode
+
+       :Returns:
+               the NULL marker (a backslash followed by N) when v is None,
+               otherwise str(v) with backslash, tab and newline escaped
+       """
+       if v is None:
+               return "\\N"
+       else:
+               # Escape backslashes before tabs/newlines so the backslashes
+               # added by those escapes are not escaped a second time.
+               return (
+                       str(v)
+                       .replace("\\", "\\\\")
+                       .replace("\t", "\\t")
+                       .replace("\n", "\\n")
+               )
+       
+
+def _sampleYears(conn, fetchConn, years, n, stream):
+       """
+       Prints up to n sampled talk page revisions for each year in years.
+
+       One progress character per user is written to stream: "." sampled,
+       "s" no talk page revision by another user found, "-" no revisions.
+       """
+       for year in years:
+               logging.info("Processing %s:" % year)
+               yearCount = 0
+               for user in getUsersByYear(fetchConn, year):
+                       initialRevs = list(getFirst10Revs(conn, user['user_id']))
+                       if not initialRevs:
+                               LOGGING_STREAM_MARK = "-"
+                               stream.write(LOGGING_STREAM_MARK)
+                               continue
+
+                       # The timestamp of the last of the (up to) ten first
+                       # revisions closes the sampling window.
+                       endOfNoob = initialRevs[-1]['rev_timestamp']
+                       talkRev = getRandNonSelfPostToTalkPage(
+                               conn,
+                               user['user_id'],
+                               user['user_name'],
+                               user['user_registration'],
+                               endOfNoob
+                       )
+                       if talkRev is None:
+                               stream.write("s")
+                               continue
+
+                       print(
+                               "\t".join(clean(v) for v in [
+                                       user['user_id'],
+                                       user['user_name'],
+                                       user['user_registration'],
+                                       endOfNoob,
+                                       talkRev['rev_id'],
+                                       talkRev['rev_timestamp'],
+                                       talkRev['rev_comment']
+                               ])
+                       )
+                       stream.write(".")
+                       yearCount += 1
+                       if yearCount >= n:
+                               break
+
+               stream.write("\n")
+
+
+def main(args):
+       """
+       Prints a TSV sample of talk page revisions received by new users.
+
+       :Parameters:
+               args : argparse namespace
+                       args.db names the database, args.cnf is the MySQL
+                       config file, args.year lists the years to sample and
+                       args.n is the number of rows wanted per year
+
+       The TSV goes to stdout; log lines and progress characters go to
+       stderr.
+       """
+       LOGGING_STREAM = sys.stderr
+       logging.basicConfig(
+               level=logging.DEBUG,
+               stream=LOGGING_STREAM,
+               format='%(asctime)s %(levelname)-8s %(message)s',
+               datefmt='%b-%d %H:%M:%S'
+       )
+
+       logging.info("Connecting to %s_p using %s." % (args.db, args.cnf))
+       # Two connections: fetchConn is only used to stream the user list
+       # (getUsersByYear reads it through an SSCursor) while conn serves the
+       # per-user lookups made during that iteration.
+       conn = MySQLdb.connect(
+               host="%s-p.rrdb.toolserver.org" % args.db,
+               db='%s_p' % args.db,
+               read_default_file=args.cnf
+       )
+       fetchConn = MySQLdb.connect(
+               host="%s-p.rrdb.toolserver.org" % args.db,
+               db='%s_p' % args.db,
+               read_default_file=args.cnf
+       )
+
+       try:
+               #Printing headers
+               print(
+                       "\t".join([
+                               'user_id',
+                               'username',
+                               'registration',
+                               'end_of_newbie',
+                               'rev_id',
+                               'timestamp',
+                               'comment'
+                       ])
+               )
+               _sampleYears(conn, fetchConn, args.year, args.n, LOGGING_STREAM)
+       finally:
+               # Release both connections even if sampling fails part way.
+               fetchConn.close()
+               conn.close()
+                       
+               
+               
+       
+def getUsersByYear(conn, year):
+       """
+       Generates the users registered in the given year, in random order.
+
+       :Parameters:
+               conn : MySQLdb connection
+                       connection to query; rows are read through an SSCursor
+               year : int
+                       the registration year to select
+
+       :Returns:
+               a generator of dicts mapping the user table's column names to
+               the row's values
+       """
+       year  = int(year)
+       cursor = conn.cursor(MySQLdb.cursors.SSCursor)
+       # Bounds are YYYYMMDDHHMMSS strings; the upper bound is the last
+       # second of December 31st (23:59:59), so the whole year is covered.
+       yearBegin = "%s0000000000" % year
+       yearEnd   = "%s1231235959" % year
+       cursor.execute("""
+               SELECT * FROM user
+               WHERE user_registration BETWEEN %(year_begin)s AND %(year_end)s
+               ORDER BY RAND()
+               """,
+               {
+                       'year_begin': yearBegin,
+                       'year_end': yearEnd
+               }
+       )
+       for row in cursor:
+               yield dict(
+                       zip(
+                               (d[0] for d in cursor.description),
+                               row
+                       )
+               )
+                       
+       
+
+
+def getFirst10Revs(conn, userId):
+       """
+       Generates the (up to) ten earliest revisions saved by a user.
+
+       :Parameters:
+               conn : MySQLdb connection
+                       connection to query
+               userId : int
+                       id of the user; anything int() accepts
+
+       :Returns:
+               a generator of dicts mapping the revision table's column names
+               to the row's values, ordered by ascending rev_timestamp
+       """
+       user_id = int(userId)
+       cursor = conn.cursor()
+       cursor.execute("""
+               SELECT * FROM revision
+               WHERE rev_user = %(user_id)s
+               ORDER BY rev_timestamp ASC
+               LIMIT 10
+               """,
+               {
+                       # bind the validated integer, not the raw argument
+                       'user_id': user_id
+               }
+       )
+       for row in cursor:
+               yield dict(
+                       zip(
+                               (d[0] for d in cursor.description),
+                               row
+                       )
+               )
+
+def getRandNonSelfPostToTalkPage(conn, userId, username, start, end):
+       """
+       Picks one random revision that someone else made to a user's talk
+       page within a time window.
+
+       :Parameters:
+               conn : MySQLdb connection
+                       connection to query
+               userId : int
+                       id of the user; revisions by this user are excluded
+               username : str
+                       name of the user, used to look up the talk page
+               start : str
+                       lower rev_timestamp bound (inclusive)
+               end : str
+                       upper rev_timestamp bound (inclusive)
+
+       :Returns:
+               a dict mapping the revision table's column names to the row's
+               values, or None if the user has no talk page or no revision
+               matches
+       """
+       pageId = getTalkPageId(conn, username)
+       if pageId is None:
+               return None
+
+       cursor = conn.cursor()
+       cursor.execute("""
+               SELECT * FROM revision
+               WHERE rev_page = %(page_id)s
+               AND rev_user != %(user_id)s
+               AND rev_timestamp BETWEEN %(start)s AND %(end)s
+               ORDER BY RAND()
+               LIMIT 1
+               """,
+               {
+                       'page_id': pageId,
+                       'user_id': userId,
+                       'start':   start,
+                       'end':     end
+               }
+       )
+       # LIMIT 1: there is at most a single row to read.
+       rev = cursor.fetchone()
+       if rev is None:
+               return None
+
+       return dict(
+               zip(
+                       (d[0] for d in cursor.description),
+                       rev
+               )
+       )
+       
+
+def getTalkPageId(conn, title):
+       """
+       Looks up the id of the page in namespace 3 with the given title.
+
+       :Parameters:
+               conn : MySQLdb connection
+                       connection to query
+               title : str
+                       page title or user name; spaces are converted to
+                       underscores before the lookup
+
+       :Returns:
+               the page_id of the first matching page, or None if none matches
+       """
+       cursor = conn.cursor()
+       cursor.execute("""
+               SELECT page_id FROM page 
+               WHERE page_title = %(title)s
+               AND page_namespace = 3
+               """,
+               {
+                       # page_title is stored with underscores where user names
+                       # have spaces, so a raw user name would never match.
+                       'title': title.replace(" ", "_")
+               }
+       )
+       page = cursor.fetchone()
+       if page is None:
+               return None
+
+       return page[0]
+       
+if __name__ == "__main__":
+       # Script entry point: declare the command line, parse it, run main().
+       parser = argparse.ArgumentParser(
+               description='Samples editors by the year they made their first edit.'
+       )
+
+       # Positional arguments: sample size, then one or more years.
+       parser.add_argument(
+               'n', type=int,
+               help='the number of editors to sample from each year'
+       )
+       parser.add_argument(
+               'year', type=int, nargs="+",
+               help='year(s) to sample from'
+       )
+
+       # Optional arguments: MySQL credentials file and database to query.
+       parser.add_argument(
+               '-c', '--cnf', metavar="<path>", type=str,
+               default=os.path.expanduser("~/.my.cnf"),
+               help='the path to MySQL config info (defaults to ~/.my.cnf)'
+       )
+       parser.add_argument(
+               '-d', '--db', type=str,
+               default="enwiki",
+               help='the language db to run the query in (defaults to enwiki)'
+       )
+
+       args = parser.parse_args()
+       main(args)

Added: trunk/tools/wsor/ts_samples/testing.sql
===================================================================
--- trunk/tools/wsor/ts_samples/testing.sql	(rev 0)
+++ trunk/tools/wsor/ts_samples/testing.sql	2011-06-03 16:56:45 UTC (rev 89424)
@@ -0,0 +1,25 @@
+-- Scratch queries comparing two ways of sampling users by registration year.
+
+-- Copy of the user table with the registration year split into its own column.
+CREATE TABLE u_grphack.user_meta (
+       user_id      INT,
+       username     VARCHAR(255),
+       registration VARCHAR(14),
+       reg_year     INT
+);
+
+-- reg_year is the first four characters of the registration timestamp.
+INSERT INTO u_grphack.user_meta
+SELECT user_id, user_name, user_registration, SUBSTRING(user_registration, 1,4)
+FROM user;
+
+CREATE INDEX user_meta_pkey ON u_grphack.user_meta (user_id) USING BTREE;
+CREATE INDEX user_meta_reg_year ON u_grphack.user_meta (reg_year) USING BTREE;
+
+
+
+
+-- Plan for sampling through the indexed reg_year column.
+explain SELECT * FROM u_grphack.user_meta
+WHERE reg_year = 2004
+ORDER BY RAND();
+
+-- Plan for sampling straight from user with a timestamp range; the range
+-- ends at the last second of the year (23:59:59).
+explain SELECT * FROM user
+WHERE user_registration BETWEEN "20040000000000" AND "20041231235959"
+ORDER BY RAND()
+LIMIT 10;


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to