http://www.mediawiki.org/wiki/Special:Code/MediaWiki/89424
Revision: 89424
Author: halfak
Date: 2011-06-03 16:56:45 +0000 (Fri, 03 Jun 2011)
Log Message:
-----------
Added code for toolserver language samples for history coding
Added Paths:
-----------
trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py
trunk/tools/wsor/ts_samples/sample_talk_edits.py
trunk/tools/wsor/ts_samples/testing.sql
Added: trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py
===================================================================
--- trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py
(rev 0)
+++ trunk/tools/wsor/ts_samples/add_diff_for_talk_edits.py 2011-06-03
16:56:45 UTC (rev 89424)
@@ -0,0 +1,152 @@
+import os, sys, logging, argparse, MySQLdb, urllib2, urllib, json
+
def clean(v):
    """Serialize *v* as a single field of a tab-separated output row.

    None becomes the two-character MySQL NULL marker (backslash + N);
    anything else is str()'d with backslashes, tabs and newlines escaped.
    """
    if v is None:
        # Written as "\\N" explicitly: the original "\N" only means
        # backslash-N on Python 2 and is a syntax error on Python 3
        # (reserved for \N{...} named escapes).
        return "\\N"
    # Escape backslashes FIRST.  The original escaped them last, which
    # doubled the backslashes just introduced by the \t and \n escapes.
    return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
+
def main(args):
    """Read a TSV of revisions from args.input and print, as TSV on
    stdout, one (rev_id, diff) pair per input row, fetching each diff
    individually from the MediaWiki API at args.uri.

    Progress markers go to stderr: "<" before each API call, ">" after.
    """
    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    logging.info("Reading from %s." % args.input)

    #Print header
    print(
        "\t".join([
            'rev_id',
            'diff'
        ])
    )

    # rowBuffer is unused by the live code path; it is a leftover of the
    # batched implementation kept commented out below.
    rowBuffer = []
    for row in readTSVFile(args.input):
        LOGGING_STREAM.write("<")
        # Escape backslash/newline/tab so the diff fits in one TSV field
        # (backslash first, which is the correct order).
        print(
            "\t".join([
                row['rev_id'],
                getSingleDiff(args.uri, row['rev_id']).replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t")
            ])
        )
        LOGGING_STREAM.write(">")
        # Batched variant (50 revisions per API request) kept for reference:
        #rowBuffer.append(row)
        #if len(rowBuffer) == 50:
        #    LOGGING_STREAM.write("\n")
        #    diffMap = buildDiffMap(args.uri, list(r['rev_id'] for r in rowBuffer))
        #    for row in rowBuffer:
        #        LOGGING_STREAM.write(">")
        #        print(
        #            "\t".join([
        #                row['rev_id'],
        #                diffMap.get(row['rev_id'], '').replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t").encode('utf-8')
        #            ])
        #        )
        #
        #    rowBuffer = []
        #
        #    LOGGING_STREAM.write("\n")

    LOGGING_STREAM.write("\n")
    # Final partial-buffer flush for the batched variant above:
    #diffMap = buildDiffMap(args.uri, list(r['rev_id'] for r in rowBuffer))
    #for row in rowBuffer:
    #    LOGGING_STREAM.write(">")
    #    print(
    #        "\t".join([
    #            row['rev_id'],
    #            diffMap.get(row['rev_id'], '').replace("\\", "\\\\").replace("\n", "\\n").replace("\t", "\\t").encode('utf-8')
    #        ])
    #    )
    #
    #LOGGING_STREAM.write("\n")
+
+
def getSingleDiff(uri, revId):
    """Fetch the diff-to-previous for one revision from the MediaWiki API.

    POSTs an action=query request for *revId* with rvdiffto=prev and
    returns the UTF-8 encoded diff body ("" when the API omits the "*"
    key).  Returns None when the response contains no revisions at all.
    On an unexpected response shape, logs the raw payload and re-raises.
    """
    response = urllib2.urlopen(
        uri,
        urllib.urlencode({
            'action': 'query',
            'prop': 'revisions',
            'revids': revId,
            'rvprop': 'ids|content',
            'rvdiffto': 'prev',
            'format': 'json'
        }),
    ).read()
    result = json.loads(response)
    try:
        for page in result['query']['pages'].values():
            for rev in page['revisions']:
                return rev['diff'].get("*", "").encode('utf-8')
    except Exception:
        # Log the raw payload for debugging, then re-raise with the
        # original traceback (bare `raise`, not `raise e`).
        logging.error(response)
        logging.error(result)
        raise
+
def buildDiffMap(uri, revIds):
    """Fetch diffs-to-previous for many revisions in one API request.

    Returns a dict mapping str(revid) -> diff text ("" when the API omits
    the "*" key).  An empty *revIds* yields {} without any network call.
    On an unexpected response shape, logs the raw payload and re-raises.
    """
    # Guard clause: nothing to fetch.
    if not revIds:
        return {}
    response = urllib2.urlopen(
        uri,
        urllib.urlencode({
            'action': 'query',
            'prop': 'revisions',
            'revids': '|'.join(revIds),
            'rvprop': 'ids|content',
            'rvdiffto': 'prev',
            'format': 'json'
        }),
    ).read()
    result = json.loads(response)
    diffMap = {}
    try:
        for page in result['query']['pages'].values():
            for rev in page['revisions']:
                diffMap[str(rev['revid'])] = rev['diff'].get("*", "")
    except Exception:
        # Log the raw payload for debugging, then re-raise with the
        # original traceback (bare `raise`, not `raise e`).
        logging.error(response)
        logging.error(result)
        raise

    return diffMap
+
+
def readTSVFile(f):
    """Yield one dict per data line of a tab-separated file.

    The first line of *f* names the columns; every subsequent line is
    split on tabs and zipped against those names.
    """
    columns = f.readline().strip().split("\t")
    for record in f:
        fields = record.strip().split("\t")
        yield dict(zip(columns, fields))
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description=
+ 'Adds diff information to a sample of talk edits'
+ )
+ parser.add_argument(
+ '-u', '--uri',
+ type=str,
+ help='the uri of the api to connect to (defaults to enwp api)',
+ default="http://en.wikipedia.org/api.php"
+ )
+ parser.add_argument(
+ '-i', '--input',
+ metavar="<path>",
+ type=lambda fn:open(fn, "r"),
+ help='the sample file to read (defaults to standard in)',
+ default=sys.stdin
+ )
+ args = parser.parse_args()
+ main(args)
Added: trunk/tools/wsor/ts_samples/sample_talk_edits.py
===================================================================
--- trunk/tools/wsor/ts_samples/sample_talk_edits.py
(rev 0)
+++ trunk/tools/wsor/ts_samples/sample_talk_edits.py 2011-06-03 16:56:45 UTC
(rev 89424)
@@ -0,0 +1,210 @@
+import os, sys, logging, argparse, MySQLdb
+
def clean(v):
    """Serialize *v* as a single field of a tab-separated output row.

    None becomes the two-character MySQL NULL marker (backslash + N);
    anything else is str()'d with backslashes, tabs and newlines escaped.
    """
    if v is None:
        # Written as "\\N" explicitly: the original "\N" only means
        # backslash-N on Python 2 and is a syntax error on Python 3
        # (reserved for \N{...} named escapes).
        return "\\N"
    # Escape backslashes FIRST.  The original escaped them last, which
    # doubled the backslashes just introduced by the \t and \n escapes.
    return str(v).replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
+
def main(args):
    """For each requested year, sample up to args.n users registered in
    that year and print, as TSV on stdout, one random non-self post to
    each user's talk page made during the user's "newbie" window
    (registration through the timestamp of their 10th edit).

    One-character progress markers go to stderr: "." user sampled,
    "s" user had no qualifying talk post, "-" user had no revisions.
    """
    LOGGING_STREAM = sys.stderr
    logging.basicConfig(
        level=logging.DEBUG,
        stream=LOGGING_STREAM,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%b-%d %H:%M:%S'
    )

    logging.info("Connecting to %s_p using %s." % (args.db, args.cnf))
    # Two connections: fetchConn streams the randomized user sample
    # (server-side cursor inside getUsersByYear) while conn runs the
    # nested per-user queries concurrently with that stream.
    conn = MySQLdb.connect(
        host="%s-p.rrdb.toolserver.org" % args.db,
        db='%s_p' % args.db,
        read_default_file=args.cnf
    )
    fetchConn = MySQLdb.connect(
        host="%s-p.rrdb.toolserver.org" % args.db,
        db='%s_p' % args.db,
        read_default_file=args.cnf
    )

    #Printing headers
    print(
        "\t".join([
            'user_id',
            'username',
            'registration',
            'end_of_newbie',
            'rev_id',
            'timestamp',
            'comment'
        ])
    )
    for year in args.year:
        logging.info("Processing %s:" % year)
        yearCount = 0
        for user in getUsersByYear(fetchConn, year):
            initialRevs = list(getFirst10Revs(conn, user['user_id']))
            if len(initialRevs) > 0:
                # Timestamp of the user's 10th (or last, if fewer)
                # edit marks the end of the newbie window.
                endOfNoob = initialRevs[-1]['rev_timestamp']
                talkRev = getRandNonSelfPostToTalkPage(
                    conn,
                    user['user_id'],
                    user['user_name'],
                    user['user_registration'],
                    endOfNoob
                )
                if talkRev != None:
                    print(
                        "\t".join(clean(v) for v in [
                            user['user_id'],
                            user['user_name'],
                            user['user_registration'],
                            endOfNoob,
                            talkRev['rev_id'],
                            talkRev['rev_timestamp'],
                            talkRev['rev_comment']
                        ])
                    )
                    LOGGING_STREAM.write(".")
                    yearCount += 1
                    if yearCount >= args.n:
                        break
                else:
                    LOGGING_STREAM.write("s")
                    # NOTE(review): 'username' below would KeyError --
                    # the row key is 'user_name' -- if ever re-enabled.
                    #logging.debug("User %s has no talk page revisions by other users. Skipping..." % user['username'])

            else:
                LOGGING_STREAM.write("-")
                #logging.debug("User %s has no revisions. Skipping..." % user['username'])

        LOGGING_STREAM.write("\n")
+
+
def getUsersByYear(conn, year):
    """Yield user rows (as column-name -> value dicts) whose
    user_registration timestamp falls within *year*, in random order.

    Uses a server-side cursor so the randomized result set streams
    instead of being materialized client-side.
    """
    year = int(year)  # normalize/validate before embedding in timestamps
    cursor = conn.cursor(MySQLdb.cursors.SSCursor)
    # MediaWiki timestamps are YYYYMMDDHHMMSS strings.
    yearBegin = "%s0000000000" % year
    # End of Dec 31 is 23:59:59.  The original bound "1231115959"
    # (11:59:59) silently excluded registrations from the second half
    # of New Year's Eve.
    yearEnd = "%s1231235959" % year
    cursor.execute("""
        SELECT * FROM user
        WHERE user_registration BETWEEN %(year_begin)s AND %(year_end)s
        ORDER BY RAND()
        """,
        {
            'year_begin': yearBegin,
            'year_end': yearEnd
        }
    )
    # Hoist the column names out of the per-row loop.
    columns = [d[0] for d in cursor.description]
    for row in cursor:
        yield dict(zip(columns, row))
+
+
def getFirst10Revs(conn, userId):
    """Yield up to the 10 earliest revisions made by *userId*, each as a
    column-name -> value dict, ordered by rev_timestamp ascending."""
    # Normalize/validate the id.  The original converted to int but then
    # passed the raw value to the query, so the conversion had no effect.
    userId = int(userId)
    cursor = conn.cursor()
    cursor.execute("""
        SELECT * FROM revision
        WHERE rev_user = %(user_id)s
        ORDER BY rev_timestamp ASC
        LIMIT 10
        """,
        {
            'user_id': userId
        }
    )
    # Hoist the column names out of the per-row loop.
    columns = [d[0] for d in cursor.description]
    for row in cursor:
        yield dict(zip(columns, row))
+
def getRandNonSelfPostToTalkPage(conn, userId, username, start, end):
    """Return one uniformly-random revision (as a column-name -> value
    dict) made to *username*'s user-talk page by a user other than
    *userId*, with a timestamp in [start, end].

    Returns None when the user has no talk page or no such revision
    exists.
    """
    pageId = getTalkPageId(conn, username)
    # Guard clause: no talk page means no talk posts.
    if pageId is None:
        return None
    cursor = conn.cursor()
    cursor.execute("""
        SELECT * FROM revision
        WHERE rev_page = %(page_id)s
        AND rev_user != %(user_id)s
        AND rev_timestamp BETWEEN %(start)s AND %(end)s
        ORDER BY RAND()
        LIMIT 1
        """,
        {
            'page_id': pageId,
            'user_id': userId,
            'start': start,
            'end': end
        }
    )
    for rev in cursor:
        return dict(
            zip(
                (d[0] for d in cursor.description),
                rev
            )
        )

    return None
+
def getTalkPageId(conn, title):
    """Return the page_id of the User_talk page (namespace 3) whose
    title is *title*, or None when no such page exists."""
    cursor = conn.cursor()
    cursor.execute("""
        SELECT page_id FROM page
        WHERE page_title = %(title)s
        AND page_namespace = 3
        """,
        {
            'title': title
        }
    )
    first = next(iter(cursor), None)
    if first is not None:
        return first[0]
    return None
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description=
+ 'Samples editors by the year they made their first
edit.'
+ )
+ parser.add_argument(
+ 'n',
+ type=int,
+ help='the number of editors to sample from each year'
+ )
+ parser.add_argument(
+ 'year',
+ type=int,
+ help='year(s) to sample from',
+ nargs="+"
+ )
+ parser.add_argument(
+ '-c', '--cnf',
+ metavar="<path>",
+ type=str,
+ help='the path to MySQL config info (defaults to ~/.my.cnf)',
+ default=os.path.expanduser("~/.my.cnf")
+ )
+ parser.add_argument(
+ '-d', '--db',
+ type=str,
+ help='the language db to run the query in (defaults to enwiki)',
+ default="enwiki"
+ )
+ args = parser.parse_args()
+ main(args)
Added: trunk/tools/wsor/ts_samples/testing.sql
===================================================================
--- trunk/tools/wsor/ts_samples/testing.sql (rev 0)
+++ trunk/tools/wsor/ts_samples/testing.sql 2011-06-03 16:56:45 UTC (rev
89424)
@@ -0,0 +1,25 @@
-- Scratch work: compare query plans for sampling users by registration
-- year using a denormalized helper table vs. a range scan on `user`.

-- Helper table with the registration year pre-extracted so it can be
-- filtered by indexed equality.
CREATE TABLE u_grphack.user_meta (
	user_id INT,
	username VARCHAR(255),
	registration VARCHAR(14),
	reg_year INT
);

INSERT INTO u_grphack.user_meta
SELECT user_id, user_name, user_registration, SUBSTRING(user_registration, 1,4)
FROM user;

CREATE INDEX user_meta_pkey ON u_grphack.user_meta (user_id) USING BTREE;
CREATE INDEX user_meta_reg_year ON u_grphack.user_meta (reg_year) USING BTREE;

-- Plan for the helper-table strategy.
explain SELECT * FROM u_grphack.user_meta
WHERE reg_year = 2004
ORDER BY RAND();

-- Plan for the direct range-scan strategy.
-- NOTE(review): "115959" is 11:59:59; end of day would be "235959", so
-- this upper bound drops registrations from the afternoon of Dec 31.
explain SELECT * FROM user
WHERE user_registration BETWEEN "20040000000000" AND "20041231115959"
ORDER BY RAND()
LIMIT 10;
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs