https://www.mediawiki.org/wiki/Special:Code/MediaWiki/115161

Revision: 115161
Author:   rfaulk
Date:     2012-05-08 09:38:21 +0000 (Tue, 08 May 2012)
Log Message:
-----------
Added revision filters to postings which allow the revision data object to be 
filtered or permuted

Modified Paths:
--------------
    trunk/tools/wsor/message_templates/run_postings_and_metrics.py
    trunk/tools/wsor/message_templates/umetrics/postings.py

Modified: trunk/tools/wsor/message_templates/run_postings_and_metrics.py
===================================================================
--- trunk/tools/wsor/message_templates/run_postings_and_metrics.py      
2012-05-08 07:35:29 UTC (rev 115160)
+++ trunk/tools/wsor/message_templates/run_postings_and_metrics.py      
2012-05-08 09:38:21 UTC (rev 115161)
@@ -48,17 +48,20 @@
                         78 : False, 79 : False, 81 : False, 82 : False, # 
TWINKLE
                         4 : False, 5 : False,  # Welcome templates - chico
                         143 : False, 144 : False, 145 : False, 146 : False,  # 
28 bot
-                        147 : True # Rcsprinter bot
+                        1001 : True # Rcsprinter bot
     }
     #    template_indices = {78 : True}
     # Run postings and metrics
 
 
+    no_template = True
     generator = 'warnings'
     postings = False
 
-    # postings_cmd = './postings -h db1047 --start=%(start_time)s 
--end=%(end_time)s --comment="%(rev_comment)s" --message="{{%(template)s}}" 
--outfilename postings_%(file_name)s.tsv'
-    postings_cmd = './postings -h db42 --start=%(start_time)s 
--end=%(end_time)s --message="{{%(template)s}}" --outfilename 
postings_%(file_name)s.tsv'
+    if no_template:
+        postings_cmd = './postings -h db42 --start=%(start_time)s 
--end=%(end_time)s --message="" --outfilename postings_%(file_name)s.tsv'
+    else:
+        postings_cmd = './postings -h db42 --start=%(start_time)s 
--end=%(end_time)s --message="{{%(template)s}}" --outfilename 
postings_%(file_name)s.tsv'
 
     # metrics_cmd = 'cat ./output/postings_%(file_name)s.tsv | ./metrics -h 
db1047 --header --outfilename metrics_%(file_name)s_%(fname_generator)s.tsv 
%(generator)s'
     metrics_cmd = 'cat ./output/postings_%(file_name)s.tsv | ./metrics -h db42 
--header --outfilename metrics_%(file_name)s_%(fname_generator)s.tsv 
%(generator)s'
@@ -70,7 +73,7 @@
             template_name = 'z' + str(key)
             logging.info('Generating postings for %s' % template_name)
 
-            name, start_ts, end_ts, comment, user, api_uri, use_rev_file, 
namespace  = get_experiment(key)
+            name, start_ts, end_ts, comment, user, api_uri, use_rev_file, 
namespace, rev_filter  = get_experiment(key)
 
             if key >= 60 and key <= 116:
                 filename_part = start_ts[4:8] + '_' + end_ts[4:8] + '_' + 
template_name
@@ -95,6 +98,8 @@
                     cmd += ' --use_in_file %s' % use_rev_file
                 if namespace != None:
                     cmd += ' --namespace %s' % namespace
+                if rev_filter != None:
+                    cmd += ' --rev_filter %s' % rev_filter
 
             else:
                 cmd = metrics_cmd % {'file_name' : filename_part, 'generator' 
: generator, 'fname_generator' : generator}
@@ -118,6 +123,7 @@
     api_uri = None
     use_rev_file = None
     namespace = None
+    rev_filter = None
 
     if index >= 60 and index <= 77:
         test_handle = 'Huggle_3'
@@ -183,17 +189,21 @@
         user = '28bot'
         comment = '.*'
 
-    elif index == 147:
-        test_handle = 'RcsprinterBot'
+    elif index == 1001:
+        test_handle = 'RscprinterBot'
+
         start_ts = '20120119000000'
         end_ts = '20120501000000'
-        user = 'RcsprinterBot'
+
+        user = 'RscprinterBot'
+
         comment = '.*'
         namespace = 0
+        rev_filter = 'rscprinterbot'
 
     logging.info('Processing %(test_handle)s from %(start_ts)s to %(end_ts)s 
on comment "%(comment)s" for user "%(user)s" ...' % {'test_handle' : 
test_handle, 'start_ts' : start_ts, 'end_ts' : end_ts, 'comment' : comment, 
'user' : user})
 
-    return test_handle, start_ts, end_ts, comment, user, api_uri, 
use_rev_file, namespace
+    return test_handle, start_ts, end_ts, comment, user, api_uri, 
use_rev_file, namespace, rev_filter
 
 """
     Call main, exit when execution is complete

Modified: trunk/tools/wsor/message_templates/umetrics/postings.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/postings.py     2012-05-08 
07:35:29 UTC (rev 115160)
+++ trunk/tools/wsor/message_templates/umetrics/postings.py     2012-05-08 
09:38:21 UTC (rev 115161)
@@ -61,6 +61,20 @@
     else:
         return string
 
+# return a filter function to modify revs - each of these functions takes a 
single rev (collection) as an argument
+#
+
+RSCPRINTERBOT_REV_FILTER_HANDLE = 'rscprinterbot'
+
+def get_rev_filter(filter_handle):
+
+    if filter_handle == RSCPRINTERBOT_REV_FILTER_HANDLE:
+        rev_filter = RscprinterbotFilter().extract_user_from_comment
+    else:
+        rev_filter = None
+
+    return rev_filter
+
 def main():
     parser = argparse.ArgumentParser(
         description="""
@@ -157,9 +171,17 @@
         help='Page namespace on which to read revisions.',
         default=3
     )
+    parser.add_argument(
+        '--rev_filter',
+        type=str,
+        help='Filter or permute the data in the revision records.',
+        default=''
+    )
 
     args = parser.parse_args()
 
+    rev_filter = get_rev_filter(args.rev_filter)
+
     LOGGING_STREAM = sys.stderr
     if args.debug:
         logLevel = logging.DEBUG
@@ -240,6 +262,11 @@
 
         message = api.getAdded(rev['rev_id'])
 
+        # If a revision filter was specified execute it
+        if rev_filter:
+            if not(rev_filter(rev)):
+                continue
+
         match = args.message.search(message)
         if match != None:
             rev['message_match'] = match.group(0)
@@ -299,20 +326,18 @@
                                REPLACE(p.page_title, "_", " ") AS 
recipient_name
                        FROM revision r
                        INNER JOIN page p ON r.rev_page = p.page_id
-                       WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
+                       WHERE rev_timestamp BETWEEN "%(start)s" AND "%(end)s"
                        AND page_namespace = %(page_namespace)s
                        """
 
         if userName != None:
-            query += 'AND rev_user_text = %(user_name)s\n'
+            query += 'AND rev_user_text = "%(user_name)s"\n'
         if commentRE != None:
-            query += 'AND rev_comment REGEXP %(comment_pattern)s\n'
+            query += 'AND rev_comment REGEXP "%(comment_pattern)s"\n'
 
         query = query % {'start': start, 'end': end, 'user_name': userName, 
'comment_pattern': commentRE.pattern, 'page_namespace' : namespace}
 
-        cursor.execute(
-            query
-        )
+        cursor.execute(query)
 
         return cursor
 
@@ -418,5 +443,36 @@
         return re.sub("&#?\w+;", fixup, text)
 
 
+# Revision Filter Classes
+#
+# These classes provide a way to permute or filter the revision data via 
methods
+#
+# The method itself will modify the revision data structure and return TRUE or 
FALSE depending on the
+# outcome of the modification
+
+# Revision Filter Class - RscprinterBot class, extracts the recipient name 
from the revision comment
+#
+class RscprinterbotFilter:
+
+    COMMENT_PATT = re.compile(r'\[\[Special:Contributions/.*\|')
+
+    def extract_user_from_comment(self, rev):
+
+        rev_comment = rev['rev_comment']
+
+        if self.COMMENT_PATT.search(rev_comment):
+            user = rev_comment.split('[[Special:Contributions/')[1]
+            user = user.split('|')[0]
+
+            logging.debug('MATCHED:  ' + rev_comment)
+
+        else:
+            logging.debug('NOT MATCHED:  ' + rev_comment)
+            return False
+
+        rev['recipient_name'] = user
+        return True
+
+
 if __name__ == "__main__":
     main()


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to