https://www.mediawiki.org/wiki/Special:Code/MediaWiki/115161
Revision: 115161
Author: rfaulk
Date: 2012-05-08 09:38:21 +0000 (Tue, 08 May 2012)
Log Message:
-----------
Added revision filters to postings which allow the revision data object to be
filtered or permuted
Modified Paths:
--------------
trunk/tools/wsor/message_templates/run_postings_and_metrics.py
trunk/tools/wsor/message_templates/umetrics/postings.py
Modified: trunk/tools/wsor/message_templates/run_postings_and_metrics.py
===================================================================
--- trunk/tools/wsor/message_templates/run_postings_and_metrics.py
2012-05-08 07:35:29 UTC (rev 115160)
+++ trunk/tools/wsor/message_templates/run_postings_and_metrics.py
2012-05-08 09:38:21 UTC (rev 115161)
@@ -48,17 +48,20 @@
78 : False, 79 : False, 81 : False, 82 : False, #
TWINKLE
4 : False, 5 : False, # Welcome templates - chico
143 : False, 144 : False, 145 : False, 146 : False, #
28 bot
- 147 : True # Rcsprinter bot
+ 1001 : True # Rcsprinter bot
}
# template_indices = {78 : True}
# Run postings and metrics
+ no_template = True
generator = 'warnings'
postings = False
- # postings_cmd = './postings -h db1047 --start=%(start_time)s
--end=%(end_time)s --comment="%(rev_comment)s" --message="{{%(template)s}}"
--outfilename postings_%(file_name)s.tsv'
- postings_cmd = './postings -h db42 --start=%(start_time)s
--end=%(end_time)s --message="{{%(template)s}}" --outfilename
postings_%(file_name)s.tsv'
+ if no_template:
+ postings_cmd = './postings -h db42 --start=%(start_time)s
--end=%(end_time)s --message="" --outfilename postings_%(file_name)s.tsv'
+ else:
+ postings_cmd = './postings -h db42 --start=%(start_time)s
--end=%(end_time)s --message="{{%(template)s}}" --outfilename
postings_%(file_name)s.tsv'
# metrics_cmd = 'cat ./output/postings_%(file_name)s.tsv | ./metrics -h
db1047 --header --outfilename metrics_%(file_name)s_%(fname_generator)s.tsv
%(generator)s'
metrics_cmd = 'cat ./output/postings_%(file_name)s.tsv | ./metrics -h db42
--header --outfilename metrics_%(file_name)s_%(fname_generator)s.tsv
%(generator)s'
@@ -70,7 +73,7 @@
template_name = 'z' + str(key)
logging.info('Generating postings for %s' % template_name)
- name, start_ts, end_ts, comment, user, api_uri, use_rev_file,
namespace = get_experiment(key)
+ name, start_ts, end_ts, comment, user, api_uri, use_rev_file,
namespace, rev_filter = get_experiment(key)
if key >= 60 and key <= 116:
filename_part = start_ts[4:8] + '_' + end_ts[4:8] + '_' +
template_name
@@ -95,6 +98,8 @@
cmd += ' --use_in_file %s' % use_rev_file
if namespace != None:
cmd += ' --namespace %s' % namespace
+ if rev_filter != None:
+ cmd += ' --rev_filter %s' % rev_filter
else:
cmd = metrics_cmd % {'file_name' : filename_part, 'generator'
: generator, 'fname_generator' : generator}
@@ -118,6 +123,7 @@
api_uri = None
use_rev_file = None
namespace = None
+ rev_filter = None
if index >= 60 and index <= 77:
test_handle = 'Huggle_3'
@@ -183,17 +189,21 @@
user = '28bot'
comment = '.*'
- elif index == 147:
- test_handle = 'RcsprinterBot'
+ elif index == 1001:
+ test_handle = 'RscprinterBot'
+
start_ts = '20120119000000'
end_ts = '20120501000000'
- user = 'RcsprinterBot'
+
+ user = 'RscprinterBot'
+
comment = '.*'
namespace = 0
+ rev_filter = 'rscprinterbot'
logging.info('Processing %(test_handle)s from %(start_ts)s to %(end_ts)s
on comment "%(comment)s" for user "%(user)s" ...' % {'test_handle' :
test_handle, 'start_ts' : start_ts, 'end_ts' : end_ts, 'comment' : comment,
'user' : user})
- return test_handle, start_ts, end_ts, comment, user, api_uri,
use_rev_file, namespace
+ return test_handle, start_ts, end_ts, comment, user, api_uri,
use_rev_file, namespace, rev_filter
"""
Call main, exit when execution is complete
Modified: trunk/tools/wsor/message_templates/umetrics/postings.py
===================================================================
--- trunk/tools/wsor/message_templates/umetrics/postings.py 2012-05-08
07:35:29 UTC (rev 115160)
+++ trunk/tools/wsor/message_templates/umetrics/postings.py 2012-05-08
09:38:21 UTC (rev 115161)
@@ -61,6 +61,20 @@
else:
return string
+# Return a filter function to modify revs - each of these functions takes a
single rev (collection) as an argument
+#
+
+RSCPRINTERBOT_REV_FILTER_HANDLE = 'rscprinterbot'
+
+def get_rev_filter(filter_handle):
+
+ if filter_handle == RSCPRINTERBOT_REV_FILTER_HANDLE:
+ rev_filter = RscprinterbotFilter().extract_user_from_comment
+ else:
+ rev_filter = None
+
+ return rev_filter
+
def main():
parser = argparse.ArgumentParser(
description="""
@@ -157,9 +171,17 @@
help='Page namespace on which to read revisions.',
default=3
)
+ parser.add_argument(
+ '--rev_filter',
+ type=str,
+ help='Filter or permute the data in the revision records.',
+ default=''
+ )
args = parser.parse_args()
+ rev_filter = get_rev_filter(args.rev_filter)
+
LOGGING_STREAM = sys.stderr
if args.debug:
logLevel = logging.DEBUG
@@ -240,6 +262,11 @@
message = api.getAdded(rev['rev_id'])
+ # If a revision filter was specified execute it
+ if rev_filter:
+ if not(rev_filter(rev)):
+ continue
+
match = args.message.search(message)
if match != None:
rev['message_match'] = match.group(0)
@@ -299,20 +326,18 @@
REPLACE(p.page_title, "_", " ") AS
recipient_name
FROM revision r
INNER JOIN page p ON r.rev_page = p.page_id
- WHERE rev_timestamp BETWEEN %(start)s AND %(end)s
+ WHERE rev_timestamp BETWEEN "%(start)s" AND "%(end)s"
AND page_namespace = %(page_namespace)s
"""
if userName != None:
- query += 'AND rev_user_text = %(user_name)s\n'
+ query += 'AND rev_user_text = "%(user_name)s"\n'
if commentRE != None:
- query += 'AND rev_comment REGEXP %(comment_pattern)s\n'
+ query += 'AND rev_comment REGEXP "%(comment_pattern)s"\n'
query = query % {'start': start, 'end': end, 'user_name': userName,
'comment_pattern': commentRE.pattern, 'page_namespace' : namespace}
- cursor.execute(
- query
- )
+ cursor.execute(query)
return cursor
@@ -418,5 +443,36 @@
return re.sub("&#?\w+;", fixup, text)
+# Revision Filter Classes
+#
+# These classes provide a way to permute or filter the revision data via
methods
+#
+# The method itself will modify the revision data structure and return True or
False depending on the
+# outcome of the modification
+
+# Revision Filter Class - RscprinterBot class, extracts the recipient name
from the revision comment
+#
+class RscprinterbotFilter:
+
+ COMMENT_PATT = re.compile(r'\[\[Special:Contributions/.*\|')
+
+ def extract_user_from_comment(self, rev):
+
+ rev_comment = rev['rev_comment']
+
+ if self.COMMENT_PATT.search(rev_comment):
+ user = rev_comment.split('[[Special:Contributions/')[1]
+ user = user.split('|')[0]
+
+ logging.debug('MATCHED: ' + rev_comment)
+
+ else:
+ logging.debug('NOT MATCHED: ' + rev_comment)
+ return False
+
+ rev['recipient_name'] = user
+ return True
+
+
if __name__ == "__main__":
main()
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs