postings.py

rfaulk Thu, 03 May 2012 07:48:41 -0700

https://www.mediawiki.org/wiki/Special:Code/MediaWiki/115127


Revision: 115127
Author:   rfaulk
Date:     2012-05-03 14:48:35 +0000 (Thu, 03 May 2012)
Log Message:
-----------
Modified WPAPI to search the content of the page at the revision if the diff is
missing - this is indicative of the case where the revision was used to create
the user talk page.

Modified Paths:
--------------
    trunk/tools/wsor/message_templates/umetrics/postings.py

Modified: trunk/tools/wsor/message_templates/umetrics/postings.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/postings.py     2012-05-03 
11:57:26 UTC (rev 115126)
+++ trunk/tools/wsor/message_templates/umetrics/postings.py     2012-05-03 
14:48:35 UTC (rev 115127)
@@ -24,49 +24,49 @@
 import time, datetime
 import MySQLdb, MySQLdb.cursors
 import urllib, urllib2, json, htmlentitydefs
-import wmf, settings
+import settings
 
 class MissingRevError(Exception):pass
 
 def encode(v):
-       if v == None: return "\N"
-       
-       if type(v) == types.LongType:     v = int(v)
-       elif type(v) == types.UnicodeType: v = v.encode('utf-8')
-       
-       return str(v).encode("string-escape")
+    if v == None: return "\N"
 
+    if type(v) == types.LongType:     v = int(v)
+    elif type(v) == types.UnicodeType: v = v.encode('utf-8')
+
+    return str(v).encode("string-escape")
+
 HEADERS = [
-       'recipient_name',
-       'timestamp',
-       'rev_id',
-       'poster_id',
-       'poster_name',
-       'message_match'
+    'recipient_name',
+    'timestamp',
+    'rev_id',
+    'poster_id',
+    'poster_name',
+    'message_match'
 ]
 
 def emit(rev):
-       if use_file:
-               postings_file.write("\t".join(encode(rev[h]) for h in HEADERS) 
+ "\n")
-       else:
-               print("\t".join(encode(rev[h]) for h in HEADERS))
+    if use_file:
+        postings_file.write("\t".join(encode(rev[h]) for h in HEADERS) + "\n")
+    else:
+        print("\t".join(encode(rev[h]) for h in HEADERS))
 
 #  MediaWiki Date format
 #
 #                      |  year |   month |     day |    hour |  minute |  
second |
 MW_DATE = 
re.compile(r"^[0-9]{4}[0-1][0-9][0-3][0-9][0-2][0-9][0-5][0-9][0-5][0-9]$")
 def mwDate(string):
-       if MW_DATE.match(string) == None:
-               raise ValueError("%r is not a valid date.  Expected 
YYMMDDHHmmSS" % string)
-       else:
-               return string
+    if MW_DATE.match(string) == None:
+        raise ValueError("%r is not a valid date.  Expected YYMMDDHHmmSS" % 
string)
+    else:
+        return string
 
 def main():
-       parser = argparse.ArgumentParser(
-               description="""
+    parser = argparse.ArgumentParser(
+        description="""
                Gathers experimental message postings from user_talk messages.
                """,
-               epilog="""
+        epilog="""
                python message_postings.py 
                -h db42 
                --start=20111222000000 
@@ -74,217 +74,216 @@
                --comment="\(\[\[WP:HG\|HG\]\]\)" 
                --message="Template:uw-vandalism1"
                """,
-               conflict_handler="resolve"
-       )
-       parser.add_argument(
-               '-c', '--cnf',
-               metavar="<path>",
-               type=str, 
-               help='the path to MySQL config info (defaults to ~/.my.cnf)',
-               default=os.path.expanduser("~/.my.cnf")
-       )
-       parser.add_argument(
-               '-h', '--host',
-               type=str, 
-               help='the database host to connect to (defaults to localhost)',
-               default="localhost"
-       )
-       parser.add_argument(
-               '-d', '--db',
-               type=str, 
-               help='the language db to run the query in (defaults to enwiki)',
-               default="enwiki"
-       )
-       parser.add_argument(
-               '-a', '--api_uri',
-               type=str, 
-               help='the mediawiki API to connect to in order to retrieve 
message content (defaults to http://en.wikipedia.org/w/api.php)',
-               default="http://pt.wikipedia.org/w/api.php";
-       )
-       parser.add_argument(
-               '--start',
-               type=mwDate,
-               help='the start of the experimental period. (Required)',
-               required=True
-       )
-       parser.add_argument(
-               '--end',
-               type=mwDate, 
-               help='the end of the experimental period.  (defaults to NOW())',
-               default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
-       )
-       parser.add_argument(
-               '--user_name',
-               type=str, 
-               help='the user_name to further filter postings by (useful for 
tracking bots)'
-       )
-       parser.add_argument(
-               '--comment',
-               type=re.compile,
-               help='regular expression to match against message posting 
comment'
-       )
-       parser.add_argument(
-               '--message',
-               type=re.compile,
-               help='regular expression to match against message content 
(required)',
-               required=True
-       )
-       parser.add_argument(
-               '--header',
-               action="store_true",
-               default=False
-       )
-       parser.add_argument(
-               '--debug',
-               action="store_true",
-               default=False
-       )
-       parser.add_argument(
-               '--outfilename',
-               type=str, 
-               help='the output file name.',
-               default=''
-       )
-       parser.add_argument(
-               '--use_in_file',
-               type=str, 
-               help='indicates that revisions should be read from a file.  
Name is to be specified.',
-               default=''
-       )
+        conflict_handler="resolve"
+    )
+    parser.add_argument(
+        '-c', '--cnf',
+        metavar="<path>",
+        type=str,
+        help='the path to MySQL config info (defaults to ~/.my.cnf)',
+        default=os.path.expanduser("~/.my.cnf")
+    )
+    parser.add_argument(
+        '-h', '--host',
+        type=str,
+        help='the database host to connect to (defaults to localhost)',
+        default="localhost"
+    )
+    parser.add_argument(
+        '-d', '--db',
+        type=str,
+        help='the language db to run the query in (defaults to enwiki)',
+        default="enwiki"
+    )
+    parser.add_argument(
+        '-a', '--api_uri',
+        type=str,
+        help='the mediawiki API to connect to in order to retrieve message 
content (defaults to http://en.wikipedia.org/w/api.php)',
+        default="http://en.wikipedia.org/w/api.php";
+    )
+    parser.add_argument(
+        '--start',
+        type=mwDate,
+        help='the start of the experimental period. (Required)',
+        required=True
+    )
+    parser.add_argument(
+        '--end',
+        type=mwDate,
+        help='the end of the experimental period.  (defaults to NOW())',
+        default=datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
+    )
+    parser.add_argument(
+        '--user_name',
+        type=str,
+        help='the user_name to further filter postings by (useful for tracking 
bots)'
+    )
+    parser.add_argument(
+        '--comment',
+        type=re.compile,
+        help='regular expression to match against message posting comment'
+    )
+    parser.add_argument(
+        '--message',
+        type=re.compile,
+        help='regular expression to match against message content (required)',
+        required=True
+    )
+    parser.add_argument(
+        '--header',
+        action="store_true",
+        default=False
+    )
+    parser.add_argument(
+        '--debug',
+        action="store_true",
+        default=False
+    )
+    parser.add_argument(
+        '--outfilename',
+        type=str,
+        help='the output file name.',
+        default=''
+    )
+    parser.add_argument(
+        '--use_in_file',
+        type=str,
+        help='indicates that revisions should be read from a file.  Name is to 
be specified.',
+        default=''
+    )
 
-       args = parser.parse_args()
-       
-       LOGGING_STREAM = sys.stderr
-       if args.debug:
-               logLevel = logging.DEBUG
-       else:
-               logLevel = logging.INFO
-       
-       logging.basicConfig(
-               level=logLevel,
-               stream=LOGGING_STREAM,
-               format='%(asctime)s %(levelname)-8s %(message)s',
-               datefmt='%b-%d %H:%M:%S'
-       )
-       logging.debug("Comment pattern is %r." % args.comment.pattern)
-       logging.debug("Message pattern is %r." % args.message.pattern)
-       
-       logging.info("Connecting to %s:%s using %s." % (args.host, args.db, 
args.cnf))
+    args = parser.parse_args()
 
-       db = Database(
-               host=args.host, 
-               db=args.db, 
-               read_default_file=args.cnf
-       )
-       
-       # Initialize the output File
-       FileHandler(outfilename=args.outfilename)
-       
-       logging.info("Connecting to API @ %s." % args.api_uri)
-       api = WPAPI(args.api_uri)
+    LOGGING_STREAM = sys.stderr
+    if args.debug:
+        logLevel = logging.DEBUG
+    else:
+        logLevel = logging.INFO
 
-       if args.header:
-               if use_file:
-                       postings_file.write("\t".join(HEADERS) + '\n')
-               else:
-                       print("\t".join(HEADERS))
-       
-       logging.info("Querying for matching revisions:")
-       revs = []
-       count = 0
-       
-       # Process input from file if args.use_in_file is not an empty string - 
otherwise use the enwiki slave
-       if cmp(args.use_in_file,'') != 0:
-               in_file = open(args.use_in_file, 'rb')
-               
-               line = in_file.readline()
-               cols = line.split('\t')
-               cols[len(cols) - 1] = cols[len(cols) - 1][:-1]
-               
-               line = in_file.readline()
-               while(line):                    
-                       entry = dict()
-       
-                       elems = line.split('\t')
-                       elems[len(cols) - 1] = elems[len(cols) - 1][:-1]
-                       index = 0
-                       try:
-                               for col_name in cols:
-                                       entry[col_name] = elems[index]
-                                       index = index + 1
-                               
-                               revs.append(entry)
-                       except:
-                               logging.info('Could not add row: %s' % 
str(elems))
-                               
-                       line = in_file.readline()
-       else:
-               for rev in db.getPostings(args.start, args.end, args.user_name, 
args.comment):
-                       count += 1
-                       revs.append(rev)
-                       if count % 100 == 0: LOGGING_STREAM.write("|")
-       
-       LOGGING_STREAM.write("\n")
+    logging.basicConfig(
+        level=logLevel,
+        stream=LOGGING_STREAM,
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        datefmt='%b-%d %H:%M:%S'
+    )
+    # logging.debug("Comment pattern is %r." % str(args.comment.pattern))
+    # logging.debug("Message pattern is %r." % str(args.message.pattern))
 
-       logging.info("Checking for message templates")
-       count = {"matched": 0, "missed": 0}
-       for rev in revs:
-               logging.debug("Matching revision %(rev_id)s peformed by 
%(poster_name)s @ %(timestamp)s: %(rev_comment)s" % rev)
+    logging.info("Connecting to %s:%s using %s." % (args.host, args.db, 
args.cnf))
 
-               message = api.getAdded(rev['rev_id'])
-               
-               match = args.message.search(message)
-               if match != None:
-                       rev['message_match'] = match.group(0)
-                       
-                       emit(rev)
-                       LOGGING_STREAM.write("|")
-                       count['matched'] += 1
-               else:
-                       LOGGING_STREAM.write("o")
-                       count['missed'] += 1
-       
-       # Close the output file
-       if use_file:
-               postings_file.close()
-       
-       LOGGING_STREAM.write("\n")
-       logging.info("Process completed. %(matched)s messages matched, 
%(missed)s messages missed." % count)
+    db = Database(
+        host=args.host,
+        db=args.db,
+        read_default_file=args.cnf
+    )
 
+    # Initialize the output File
+    FileHandler(outfilename=args.outfilename)
 
+    logging.info("Connecting to API @ %s." % args.api_uri)
+    api = WPAPI(args.api_uri)
+
+    if args.header:
+        if use_file:
+            postings_file.write("\t".join(HEADERS) + '\n')
+        else:
+            print("\t".join(HEADERS))
+
+    logging.info("Querying for matching revisions:")
+    revs = []
+    count = 0
+
+    # Process input from file if args.use_in_file is not an empty string - 
otherwise use the enwiki slave
+    if cmp(args.use_in_file,'') != 0:
+        in_file = open(args.use_in_file, 'rb')
+
+        line = in_file.readline()
+        cols = line.split('\t')
+        cols[len(cols) - 1] = cols[len(cols) - 1][:-1]
+
+        line = in_file.readline()
+        while(line):
+            entry = dict()
+
+            elems = line.split('\t')
+            elems[len(cols) - 1] = elems[len(cols) - 1][:-1]
+            index = 0
+            try:
+                for col_name in cols:
+                    entry[col_name] = elems[index]
+                    index = index + 1
+
+                revs.append(entry)
+            except:
+                logging.info('Could not add row: %s' % str(elems))
+
+            line = in_file.readline()
+    else:
+
+        for rev in db.getPostings(args.start, args.end, 
userName=args.user_name, commentRE=args.comment):
+            count += 1
+            revs.append(rev)
+            if count % 100 == 0: LOGGING_STREAM.write("|")
+
+    LOGGING_STREAM.write("\n")
+
+    logging.info("Checking for message templates")
+    count = {"matched": 0, "missed": 0}
+    for rev in revs:
+        logging.debug("Matching revision %(rev_id)s peformed by 
%(poster_name)s @ %(timestamp)s: %(rev_comment)s" % rev)
+
+        message = api.getAdded(rev['rev_id'])
+
+        match = args.message.search(message)
+        if match != None:
+            rev['message_match'] = match.group(0)
+
+            emit(rev)
+            LOGGING_STREAM.write("|")
+            count['matched'] += 1
+        else:
+            LOGGING_STREAM.write("o")
+            count['missed'] += 1
+
+    # Close the output file
+    if use_file:
+        postings_file.close()
+
+    LOGGING_STREAM.write("\n")
+    logging.info("Process completed. %(matched)s messages matched, %(missed)s 
messages missed." % count)
+
+
 """
        File initialization
 """
 class FileHandler:
-       
-       def __init__(self, **kwargs):
-               
-               self.kwargs   = kwargs
-               
-               # Open the output file
-               global use_file
-               use_file = False
-               if(kwargs['outfilename']):
-                       global postings_file
-                       postings_file = open(settings.__output_directory__ + 
kwargs['outfilename'], 'w')
-                       
-                       global use_file
-                       use_file = True
 
+    def __init__(self, **kwargs):
 
+        self.kwargs   = kwargs
+
+        # Open the output file
+        global use_file
+        use_file = False
+        if(kwargs['outfilename']):
+            global postings_file
+            postings_file = open(settings.__output_directory__ + 
kwargs['outfilename'], 'w')
+
+            global use_file
+            use_file = True
+
+
 class Database:
-       
-       def __init__(self, *args, **kwargs):
-               self.args   = args
-               self.kwargs = kwargs
-               self.conn   = MySQLdb.connect(*args, **kwargs)
-       
-       def getPostings(self, start, end, userName=None, commentRE=None):
-               if (userName, commentRE) == (None, None):
-                       raise TypeError("Must specify at at least one of 
userName or commentRE.")
-               
-               cursor = self.conn.cursor(MySQLdb.cursors.SSDictCursor)
-               query = """
+
+    def __init__(self, *args, **kwargs):
+        self.args   = args
+        self.kwargs = kwargs
+        self.conn   = MySQLdb.connect(*args, **kwargs)
+
+    def getPostings(self, start, end, userName=None, commentRE=None):
+
+        cursor = self.conn.cursor(MySQLdb.cursors.SSDictCursor)
+        query = """
                        SELECT 
                                r.rev_id,
                                r.rev_timestamp as timestamp,
@@ -297,91 +296,120 @@
                        WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
                        AND page_namespace = 3
                        """
-               if userName != None:
-                       query += "AND rev_user_text = %(user_name)s\n"
-               if commentRE != None:
-                       query += "AND rev_comment REGEXP %(comment_pattern)s\n"
-               
-               cursor.execute(
-                       query,
-                       {
-                               'start': start,
-                               'end': end,
-                               'user_name': userName,
-                               'comment_pattern': commentRE.pattern
-                       }
-               )
-               
-               return cursor
-               
-       
 
+        if userName != None:
+            query += 'AND rev_user_text = %(user_name)s\n'
+        if commentRE != None:
+            query += 'AND rev_comment REGEXP %(comment_pattern)s\n'
+
+        cursor.execute(
+            query,
+                {
+                'start': start,
+                'end': end,
+                'user_name': userName,
+                'comment_pattern': commentRE.pattern
+            }
+        )
+
+        return cursor
+
+
+
 class WPAPI:
-       DIFF_ADD_RE = re.compile(r'<td 
class="diff-addedline"><div>(.+)</div></td>')
-       
-       def __init__(self, uri):
-               self.uri = uri
-       
-       def getDiff(self, revId, retries=20):
-               attempt = 0
-               while attempt < retries:
-                       try:
-                               response = urllib2.urlopen(
-                                       self.uri,
-                                       urllib.urlencode({
-                                               'action': 'query',
-                                               'prop': 'revisions',
-                                               'revids': revId,
-                                               'rvprop': 'ids',
-                                               'rvdiffto': 'prev',
-                                               'format': 'json'
-                                       })
-                               )
-                               result = json.load(response)
-                               
-                               diff = 
result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
-                               if type(diff) not in types.StringTypes: diff = 
''
-                               
-                               return diff
-                       except urllib2.HTTPError as e:
-                               time.sleep(2**attempt)
-                               attempt += 1
-                               logging.error("HTTP Error: %s.  Retry #%s in %s 
seconds..." % (e, attempt, 2**attempt))
-                               
-               
-                       
-       
-       def getAdded(self, revId):
-               diff = self.getDiff(revId)
-               
-               return self.unescape(
-                               "\n".join(
-                               match.group(1) 
-                               for match in WPAPI.DIFF_ADD_RE.finditer(diff)
-                       )
-               )
-               
-       def unescape(self, text):
-               def fixup(m):
-                       text = m.group(0)
-                       if text[:2] == "&#":
-                               # character reference
-                               try:
-                                       if text[:3] == "&#x":
-                                               return unichr(int(text[3:-1], 
16))
-                                       else:
-                                               return unichr(int(text[2:-1]))
-                               except ValueError:
-                                       pass
-                       else:
-                               # named entity
-                               try:
-                                       text = 
unichr(htmlentitydefs.name2codepoint[text[1:-1]])
-                               except KeyError:
-                                       pass
-                       return text # leave as is
-               return re.sub("&#?\w+;", fixup, text)
-       
-       
-if __name__ == "__main__": 
-       main()
+    DIFF_ADD_RE = re.compile(r'<td 
class="diff-addedline"><div>(.+)</div></td>')
+
+    def __init__(self, uri):
+        self.uri = uri
+
+    def getDiff(self, revId, retries=20):
+        attempt = 0
+        is_content = False
+
+        while attempt < retries:
+            try:
+                # e.g. url: 
http://en.wikipedia.org/w/api.php?format=xml&action=query&prop=revisions&revids=472419240&rvprop=ids&rvdiffto=prev&format=json
+                response = urllib2.urlopen(
+                    self.uri,
+                    urllib.urlencode({
+                        'action': 'query',
+                        'prop': 'revisions',
+                        'revids': revId,
+                        'rvprop': 'ids',
+                        'rvdiffto': 'prev',
+                        'format': 'json'
+                    })
+                )
+
+                result = json.load(response)
+
+                diff = 
result['query']['pages'].values()[0]['revisions'][0]['diff']['*']
+
+                # The diff will not exist if it included the creation of the 
user talk page
+                # in this case simply load the content of the page at this 
revision
+                if type(diff) not in types.StringTypes or diff == '':
+
+                    # e.g. url: 
http://en.wikipedia.org/w/api.php?format=xml&action=query&prop=revisions&revids=474338555&format=json&rvprop=content
+                    response = urllib2.urlopen(
+                        self.uri,
+                        urllib.urlencode({
+                            'action': 'query',
+                            'prop': 'revisions',
+                            'revids': revId,
+                            'rvprop': 'content',
+                            'format': 'json'
+                        })
+                    )
+
+                    result = json.load(response)
+                    diff = 
result['query']['pages'].values()[0]['revisions'][0]['*']
+
+                    # Add the diff tags such that the content is parsed as if 
it were a diff
+                    if type(diff) not in types.StringTypes: diff = ''
+
+                    is_content = True
+
+                return diff, is_content
+            except urllib2.HTTPError as e:
+                time.sleep(2**attempt)
+                attempt += 1
+                logging.error("HTTP Error: %s.  Retry #%s in %s seconds..." % 
(e, attempt, 2**attempt))
+
+
+    def getAdded(self, revId):
+        diff, is_content = self.getDiff(revId)
+
+        if is_content:
+            return diff
+        else:
+            return self.unescape(
+                "\n".join(
+                    match.group(1)
+                    for match in WPAPI.DIFF_ADD_RE.finditer(diff)
+                )
+        )
+
+    def unescape(self, text):
+        def fixup(m):
+            text = m.group(0)
+            if text[:2] == "&#":
+                # character reference
+                try:
+                    if text[:3] == "&#x":
+                        return unichr(int(text[3:-1], 16))
+                    else:
+                        return unichr(int(text[2:-1]))
+                except ValueError:
+                    pass
+            else:
+                # named entity
+                try:
+                    text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+                except KeyError:
+                    pass
+            return text # leave as is
+        return re.sub("&#?\w+;", fixup, text)
+
+
+if __name__ == "__main__":
+    main()


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

[MediaWiki-CVS] SVN: [115127] trunk/tools/wsor/message_templates/umetrics/postings.py

Reply via email to