Eranroz has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/206091

Change subject: Improve performance for replaceExcept
......................................................................

Improve performance for replaceExcept

Improving performance for replaceExcept in textlib:
* Avoid recompiling regexes on every call. This is done using a cache.
* Terminate the method early if the replacement isn't relevant.
  This avoids the large overhead of checking exceptions, parsing `new`, etc.,
  in the common case where the replacement isn't relevant.

Change-Id: I65196e3a5748f950dce2037d2cb72c775a4c07dc
---
M pywikibot/textlib.py
1 file changed, 49 insertions(+), 36 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/pywikibot/core 
refs/changes/91/206091/1

diff --git a/pywikibot/textlib.py b/pywikibot/textlib.py
index 6a57668..b55c2be 100644
--- a/pywikibot/textlib.py
+++ b/pywikibot/textlib.py
@@ -46,6 +46,9 @@
     'or': u'୦୧୨୩୪୫୬୭୮୯',
 }
 
+# cache for replaceExcept to avoid recompiling regexes on each call
+EXCEPTION_REGEXES_CACHE = None
+EXCEPTION_REGEXES_CACHE_SITE = None
 
 def to_local_digits(phrase, lang):
     """
@@ -102,44 +105,49 @@
         if nothing is changed, it is added at the end
 
     """
+    global EXCEPTION_REGEXES_CACHE, EXCEPTION_REGEXES_CACHE_SITE
+
     if site is None:
         site = pywikibot.Site()
 
-    exceptionRegexes = {
-        'comment':      re.compile(r'(?s)<!--.*?-->'),
-        # section headers
-        'header':       re.compile(r'\r?\n=+.+=+ *\r?\n'),
-        # preformatted text
-        'pre':          re.compile(r'(?ism)<pre>.*?</pre>'),
-        'source':       re.compile(r'(?is)<source .*?</source>'),
-        # inline references
-        'ref':          re.compile(r'(?ism)<ref[ >].*?</ref>'),
-        # lines that start with a space are shown in a monospace font and
-        # have whitespace preserved.
-        'startspace':   re.compile(r'(?m)^ (.*?)$'),
-        # tables often have whitespace that is used to improve wiki
-        # source code readability.
-        # TODO: handle nested tables.
-        'table':        re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
-        'hyperlink':    compileLinkR(),
-        'gallery':      re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
-        # this matches internal wikilinks, but also interwiki, categories, and
-        # images.
-        'link':         re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
-        # also finds links to foreign sites with preleading ":"
-        'interwiki':    re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
-                                   % '|'.join(site.validLanguageLinks() +
-                                              
list(site.family.obsolete.keys()))),
-        # Wikibase property inclusions
-        'property':     re.compile(r'(?i)\{\{\s*#property:\s*p\d+\s*\}\}'),
-        # Module invocations (currently only Lua)
-        'invoke':       re.compile(r'(?i)\{\{\s*#invoke:.*?}\}'),
-        # categories
-        'category':     re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % 
u'|'.join(site.namespace(14, all=True))),
-        # files
-        'file':         re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % 
u'|'.join(site.namespace(6, all=True))),
+    if not EXCEPTION_REGEXES_CACHE_SITE or EXCEPTION_REGEXES_CACHE_SITE != 
site:
+        EXCEPTION_REGEXES_CACHE = {
+            'comment':      re.compile(r'(?s)<!--.*?-->'),
+            # section headers
+            'header':       re.compile(r'\r?\n=+.+=+ *\r?\n'),
+            # preformatted text
+            'pre':          re.compile(r'(?ism)<pre>.*?</pre>'),
+            'source':       re.compile(r'(?is)<source .*?</source>'),
+            # inline references
+            'ref':          re.compile(r'(?ism)<ref[ >].*?</ref>'),
+            # lines that start with a space are shown in a monospace font and
+            # have whitespace preserved.
+            'startspace':   re.compile(r'(?m)^ (.*?)$'),
+            # tables often have whitespace that is used to improve wiki
+            # source code readability.
+            # TODO: handle nested tables.
+            'table':        
re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
+            'hyperlink':    compileLinkR(),
+            'gallery':      re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
+            # this matches internal wikilinks, but also interwiki, categories, 
and
+            # images.
+            'link':         re.compile(r'\[\[[^\]\|]*(\|[^\]]*)?\]\]'),
+            # also finds links to foreign sites with preleading ":"
+            'interwiki':    re.compile(r'(?i)\[\[:?(%s)\s?:[^\]]*\]\][\s]*'
+                                       % '|'.join(site.validLanguageLinks() +
+                                                  
list(site.family.obsolete.keys()))),
+            # Wikibase property inclusions
+            'property':     re.compile(r'(?i)\{\{\s*#property:\s*p\d+\s*\}\}'),
+            # Module invocations (currently only Lua)
+            'invoke':       re.compile(r'(?i)\{\{\s*#invoke:.*?}\}'),
+            # categories
+            'category':     re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % 
u'|'.join(site.namespace(14, all=True))),
+            # files
+            'file':         re.compile(u'\[\[ *(?:%s)\s*:.*?\]\]' % 
u'|'.join(site.namespace(6, all=True))),
+        }
+        EXCEPTION_REGEXES_CACHE_SITE = site
 
-    }
+    exceptionRegexes = EXCEPTION_REGEXES_CACHE
 
     # if we got a string, compile it as a regular expression
     if isinstance(old, basestring):
@@ -147,6 +155,10 @@
             old = re.compile(old, re.IGNORECASE | re.UNICODE)
         else:
             old = re.compile(old)
+
+    # early termination if not relevant
+    if not old.search(text):
+        return text
 
     dontTouchRegexes = []
     except_templates = False
@@ -161,8 +173,9 @@
             else:
                 # nowiki, noinclude, includeonly, timeline, math ond other
                 # extensions
-                dontTouchRegexes.append(re.compile(r'(?is)<%s>.*?</%s>'
-                                                   % (exc, exc)))
+                EXCEPTION_REGEXES_CACHE[exc] = re.compile(r'(?is)<%s>.*?</%s>'
+                                                   % (exc, exc))
+                dontTouchRegexes.append(EXCEPTION_REGEXES_CACHE[exc])
             # handle alias
             if exc == 'source':
                 dontTouchRegexes.append(re.compile(

-- 
To view, visit https://gerrit.wikimedia.org/r/206091
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I65196e3a5748f950dce2037d2cb72c775a4c07dc
Gerrit-PatchSet: 1
Gerrit-Project: pywikibot/core
Gerrit-Branch: master
Gerrit-Owner: Eranroz <eranro...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to