Eranroz has uploaded a new change for review. https://gerrit.wikimedia.org/r/206091
Change subject: Improve performance for replaceExcept ...................................................................... Improve performance for replaceExcept Improving performance for replaceExcept in textlib: * Avoid recompiling regexes. This is done using a cache. * Terminate the method early if the replacement isn't relevant. This avoids the large overhead of checking exceptions, parsing the new etc, in the common case where the replacement isn't relevant. Change-Id: I65196e3a5748f950dce2037d2cb72c775a4c07dc --- M pywikibot/textlib.py 1 file changed, 49 insertions(+), 36 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core refs/changes/91/206091/1 diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py index 6a57668..b55c2be 100644 --- a/pywikibot/textlib.py +++ b/pywikibot/textlib.py @@ -46,6 +46,9 @@ 'or': u'୦୧୨୩୪୫୬୭୮୯', } +# cache for replaceExcept to avoid recompile or regexes each call +EXCEPTION_REGEXES_CACHE = None +EXCEPTION_REGEXES_CACHE_SITE = None def to_local_digits(phrase, lang): """ @@ -102,44 +105,49 @@ if nothing is changed, it is added at the end """ + global EXCEPTION_REGEXES_CACHE, EXCEPTION_REGEXES_CACHE_SITE + if site is None: site = pywikibot.Site() - exceptionRegexes = { - 'comment': re.compile(r'(?s)<!--.*?-->'), - # section headers - 'header': re.compile(r'\r?\n=+.+=+ *\r?\n'), - # preformatted text - 'pre': re.compile(r'(?ism)<pre>.*?</pre>'), - 'source': re.compile(r'(?is)<source .*?</source>'), - # inline references - 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'), - # lines that start with a space are shown in a monospace font and - # have whitespace preserved. - 'startspace': re.compile(r'(?m)^ (.*?)$'), - # tables often have whitespace that is used to improve wiki - # source code readability. - # TODO: handle nested tables. 
- 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'), - 'hyperlink': compileLinkR(), - 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'), - # this matches internal wikilinks, but also interwiki, categories, and - # images. - 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'), - # also finds links to foreign sites with preleading ":" - 'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*' - % '|'.join(site.validLanguageLinks() + - list(site.family.obsolete.keys()))), - # Wikibase property inclusions - 'property': re.compile(r'(?i)\{\{\s*#property:\s*p\d+\s*\}\}'), - # Module invocations (currently only Lua) - 'invoke': re.compile(r'(?i)\{\{\s*#invoke:.*?}\}'), - # categories - 'category': re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % u'|'.join(site.namespace(14, all=True))), - # files - 'file': re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % u'|'.join(site.namespace(6, all=True))), + if not EXCEPTION_REGEXES_CACHE_SITE or EXCEPTION_REGEXES_CACHE_SITE != site: + EXCEPTION_REGEXES_CACHE = { + 'comment': re.compile(r'(?s)<!--.*?-->'), + # section headers + 'header': re.compile(r'\r?\n=+.+=+ *\r?\n'), + # preformatted text + 'pre': re.compile(r'(?ism)<pre>.*?</pre>'), + 'source': re.compile(r'(?is)<source .*?</source>'), + # inline references + 'ref': re.compile(r'(?ism)<ref[ >].*?</ref>'), + # lines that start with a space are shown in a monospace font and + # have whitespace preserved. + 'startspace': re.compile(r'(?m)^ (.*?)$'), + # tables often have whitespace that is used to improve wiki + # source code readability. + # TODO: handle nested tables. + 'table': re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'), + 'hyperlink': compileLinkR(), + 'gallery': re.compile(r'(?is)<gallery.*?>.*?</gallery>'), + # this matches internal wikilinks, but also interwiki, categories, and + # images. 
+ 'link': re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'), + # also finds links to foreign sites with preleading ":" + 'interwiki': re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*' + % '|'.join(site.validLanguageLinks() + + list(site.family.obsolete.keys()))), + # Wikibase property inclusions + 'property': re.compile(r'(?i)\{\{\s*#property:\s*p\d+\s*\}\}'), + # Module invocations (currently only Lua) + 'invoke': re.compile(r'(?i)\{\{\s*#invoke:.*?}\}'), + # categories + 'category': re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % u'|'.join(site.namespace(14, all=True))), + # files + 'file': re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % u'|'.join(site.namespace(6, all=True))), + } + EXCEPTION_REGEXES_CACHE_SITE = site - } + exceptionRegexes = EXCEPTION_REGEXES_CACHE # if we got a string, compile it as a regular expression if isinstance(old, basestring): @@ -147,6 +155,10 @@ old = re.compile(old, re.IGNORECASE | re.UNICODE) else: old = re.compile(old) + + # early termination if not relevant + if not old.search(text): + return text dontTouchRegexes = [] except_templates = False @@ -161,8 +173,9 @@ else: # nowiki, noinclude, includeonly, timeline, math ond other # extensions - dontTouchRegexes.append(re.compile(r'(?is)<%s>.*?</%s>' - % (exc, exc))) + EXCEPTION_REGEXES_CACHE[exc] = re.compile(r'(?is)<%s>.*?</%s>' + % (exc, exc)) + dontTouchRegexes.append(EXCEPTION_REGEXES_CACHE[exc]) # handle alias if exc == 'source': dontTouchRegexes.append(re.compile( -- To view, visit https://gerrit.wikimedia.org/r/206091 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I65196e3a5748f950dce2037d2cb72c775a4c07dc Gerrit-PatchSet: 1 Gerrit-Project: pywikibot/core Gerrit-Branch: master Gerrit-Owner: Eranroz <eranro...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits