[MediaWiki-commits] [Gerrit] operations/puppet[production]: openstack: add wikitech-grep as utility for adminscripts
Rush has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/363896 ) Change subject: openstack: add wikitech-grep as utility for adminscripts .. openstack: add wikitech-grep as utility for adminscripts * now with pep8 friendlyness * very simple utility script we have been using for rebrand cleanup. Bug: T169820 Change-Id: I867c7dae8e1ee064deed1276c705048890131c39 --- A modules/openstack/files/utils/wikitech-grep.py M modules/openstack/manifests/adminscripts.pp 2 files changed, 173 insertions(+), 0 deletions(-) Approvals: Rush: Looks good to me, approved jenkins-bot: Verified diff --git a/modules/openstack/files/utils/wikitech-grep.py b/modules/openstack/files/utils/wikitech-grep.py new file mode 100644 index 000..43db7ef --- /dev/null +++ b/modules/openstack/files/utils/wikitech-grep.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + usage: mwgrep [-h] [--max-results N] [--timeout N] [--user | --module] +[--title TITLE] regex + + Grep for Lua or CSS and JS code fragments +on (per default) MediaWiki wiki pages + + positional arguments: +regex regex to search for + + optional arguments: +-h, --help show this help message and exit +--max-results N show at most this many results (default: 100) +--timeout N abort search after this many seconds (default: 30) +--user search NS_USER rather than NS_MEDIAWIKI +--module search NS_MODULE rather than NS_MEDIAWIKI +--title TITLE restrict search to pages with this title + + mwgrep will grep the MediaWiki namespace across Wikimedia wikis. specify + --user to search the user namespace instead. See the lucene documentation + for org.apache.lucene.util.automaton.RegExp for supported syntax. The current + lucene version is available from `curl search.svc.eqiad.wmnet:9200`. 
+ +""" +import sys +reload(sys) +sys.setdefaultencoding('utf-8') + +import argparse +import bisect +import json +import urllib +import urllib2 + + +TIMEOUT = 30 +BASE_URI = 'http://search.svc.eqiad.wmnet:9200/_all/page/_search' +NS_MEDIAWIKI = 8 +NS_USER = 2 +NS_MODULE = 828 +PREFIX_NS = { +NS_MEDIAWIKI: 'MediaWiki:', +NS_USER: 'User:', +NS_MODULE: 'Module:' +} + +ap = argparse.ArgumentParser( +prog='mwgrep', +description='Grep for CSS and JS code fragments in MediaWiki wiki pages', +epilog='mwgrep will grep the MediaWiki namespace across Wikimedia wikis. ' + 'specify --user to search the user namespace instead.' +) +ap.add_argument('term', help='text to search for') + +ap.add_argument( +'--max-results', +metavar='N', +type=int, default=1000, +help='show at most this many results (default: 1000)' +) + +ap.add_argument( +'--timeout', +metavar='N', +type='{0}s'.format, +default='30', +help='abort search after this many seconds (default: 30)' +) + +args = ap.parse_args() + +filters = [ +{'bool': { +'must': [ +{'term': {'wiki': 'labswiki'}}, +{'source_regex': { +'regex': args.term, +'field': 'source_text', +'ngram_field': 'source_text.trigram', +'max_determinized_states': 2, +'max_expand': 10, +'case_sensitive': True, +'locale': 'en', +'timeout': args.timeout, +}}, +], +'must_not': [ +{'term': {'namespace': '2'}}, # User +{'term': {'namespace': '3'}}, # User talk +], +}}, +] + +search = { +'size': args.max_results, +'_source': ['namespace', 'title', 'namespace_text'], +'sort': ['_doc'], +'query': {'bool': {'filter': filters}}, +'stats': ['mwgrep'], +} + +query = { +'timeout': args.timeout, +} + +matches = {'public': [], 'private': []} +uri = BASE_URI + '?' 
+ urllib.urlencode(query) +try: +req = urllib2.urlopen(uri, json.dumps(search)) +full_result = json.load(req) +result = full_result['hits'] + +for hit in result['hits']: +db_name = hit['_index'].rsplit('_', 2)[0] +title = hit['_source']['title'] +ns = hit['_source']['namespace_text'] +if ns != '': +ns = '%s:' % ns +page_name = '%s%s' % (ns, title) +bisect.insort(matches['public'], (db_name, page_name)) + +if matches['public']: +print('## Public wiki results') +for db_name, page_name in matches['public']: +print('{:<20}{}'.format(db_name, page_name)) + +print('') +print('(total: %s, shown: %s)' % (result['total'], len(result['hits']))) +if full_result['timed_out']: +print(""" +The query was unable to complete within the alloted time. Only partial results +are shown here, and the reported total hits is <= the true value. To speed up +the query: + +* Ensure the regular expression
[MediaWiki-commits] [Gerrit] operations/puppet[production]: openstack: add wikitech-grep as utility for adminscripts
Rush has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/363896 ) Change subject: openstack: add wikitech-grep as utility for adminscripts .. openstack: add wikitech-grep as utility for adminscripts Bug: T169820 Change-Id: I867c7dae8e1ee064deed1276c705048890131c39 --- A modules/openstack/files/utils/wikitech-grep.py M modules/openstack/manifests/adminscripts.pp 2 files changed, 168 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/96/363896/1 diff --git a/modules/openstack/files/utils/wikitech-grep.py b/modules/openstack/files/utils/wikitech-grep.py new file mode 100644 index 000..927bc72 --- /dev/null +++ b/modules/openstack/files/utils/wikitech-grep.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + usage: mwgrep [-h] [--max-results N] [--timeout N] [--user | --module] +[--title TITLE] regex + + Grep for Lua or CSS and JS code fragments on (per default) MediaWiki wiki pages + + positional arguments: +regex regex to search for + + optional arguments: +-h, --help show this help message and exit +--max-results N show at most this many results (default: 100) +--timeout N abort search after this many seconds (default: 30) +--user search NS_USER rather than NS_MEDIAWIKI +--module search NS_MODULE rather than NS_MEDIAWIKI +--title TITLE restrict search to pages with this title + + mwgrep will grep the MediaWiki namespace across Wikimedia wikis. specify + --user to search the user namespace instead. See the lucene documentation + for org.apache.lucene.util.automaton.RegExp for supported syntax. The current + lucene version is available from `curl search.svc.eqiad.wmnet:9200`. 
+ +""" +import sys +reload(sys) +sys.setdefaultencoding('utf-8') + +import argparse +import bisect +import collections +import json +import urllib +import urllib2 + + +TIMEOUT = 30 +BASE_URI = 'http://search.svc.eqiad.wmnet:9200/_all/page/_search' +NS_MEDIAWIKI = 8 +NS_USER = 2 +NS_MODULE = 828 +PREFIX_NS = { +NS_MEDIAWIKI: 'MediaWiki:', +NS_USER: 'User:', +NS_MODULE: 'Module:' +} + +ap = argparse.ArgumentParser( +prog='mwgrep', +description='Grep for CSS and JS code fragments in MediaWiki wiki pages', +epilog='mwgrep will grep the MediaWiki namespace across Wikimedia wikis. ' + 'specify --user to search the user namespace instead.' +) +ap.add_argument('term', help='text to search for') + +ap.add_argument( +'--max-results', +metavar='N', +type=int, default=1000, +help='show at most this many results (default: 1000)' +) + +ap.add_argument( +'--timeout', +metavar='N', +type='{0}s'.format, +default='30', +help='abort search after this many seconds (default: 30)' +) + +args = ap.parse_args() + +filters = [ +{'bool': { +'must': [ +{'term': {'wiki': 'labswiki'}}, +{'source_regex': { +'regex': args.term, +'field': 'source_text', +'ngram_field': 'source_text.trigram', +'max_determinized_states': 2, +'max_expand': 10, +'case_sensitive': True, +'locale': 'en', +'timeout': args.timeout, +}}, +], +'must_not': [ +{'term': {'namespace': '2'}}, # User +{'term': {'namespace': '3'}}, # User talk +], +}}, +] + +search = { +'size': args.max_results, +'_source': ['namespace', 'title', 'namespace_text'], +'sort': ['_doc'], +'query': {'bool': {'filter': filters}}, +'stats': ['mwgrep'], +} + +query = { +'timeout': args.timeout, +} + +matches = {'public': [], 'private': []} +uri = BASE_URI + '?' 
+ urllib.urlencode(query) +try: +req = urllib2.urlopen(uri, json.dumps(search)) +full_result = json.load(req) +result = full_result['hits'] + +for hit in result['hits']: +db_name = hit['_index'].rsplit('_', 2)[0] +title = hit['_source']['title'] +ns = hit['_source']['namespace_text'] +if ns != '': +ns = '%s:' % ns +page_name = '%s%s' % (ns, title) +bisect.insort(matches['public'], (db_name, page_name)) + +if matches['public']: +print('## Public wiki results') +for db_name, page_name in matches['public']: +print('{:<20}{}'.format(db_name, page_name)) + +print('') +print('(total: %s, shown: %s)' % (result['total'], len(result['hits']))) +if full_result['timed_out']: +print(""" +The query was unable to complete within the alloted time. Only partial results +are shown here, and the reported total hits is <= the true value. To speed up +the query: + +* Ensure the regular expression contains one or more sets of 3 contiguous + characters. A character range