wikipedia.py

purodha Sun, 21 Dec 2008 11:43:36 -0800

Revision: 6179
Author:   purodha
Date:     2008-12-21 19:43:24 +0000 (Sun, 21 Dec 2008)


Log Message:
-----------
Error corrections.
Allow separator before categories and interlanguage links
to be real strings other than cr/lf. Note that processing
was not altered for the current cr/lf separator, although
part of it is not really logical.

Modified Paths:
--------------
    trunk/pywikipedia/wikipedia.py

Modified: trunk/pywikipedia/wikipedia.py
===================================================================
--- trunk/pywikipedia/wikipedia.py      2008-12-21 14:34:34 UTC (rev 6178)
+++ trunk/pywikipedia/wikipedia.py      2008-12-21 19:43:24 UTC (rev 6179)
@@ -67,12 +67,18 @@
     decodeEsperantoX: decode Esperanto text using the x convention.
     encodeEsperantoX: convert wikitext to the Esperanto x-encoding.
     sectionencode: encode text for use as a section title in wiki-links.
+    findmarker(text, startwith, append): return a string which is not part
+        of text
+    expandmarker(text, marker, separator): return marker string expanded
+        backwards to include separator occurrences plus whitespace
 
 Wikitext manipulation functions for interlanguage links:
 
     getLanguageLinks(text,xx): extract interlanguage links from text and
         return in a dict
     removeLanguageLinks(text): remove all interlanguage links from text
+    removeLanguageLinksAndSeparator(text, site, marker, separator = ''):
+        remove language links, whitespace, preceding separators from text
     replaceLanguageLinks(oldtext, new): remove the language links and
         replace them with links from a dict like the one returned by
         getLanguageLinks
@@ -87,6 +93,8 @@
     getCategoryLinks(text): return list of Category objects corresponding
         to links in text
     removeCategoryLinks(text): remove all category links from text
+    removeCategoryLinksAndSeparator(text, site, marker, separator = ''):
+        remove category links, whitespace, preceding separators from text
     replaceCategoryLinks(oldtext,new): replace the category links in oldtext by
         those in a list of Category objects
     replaceCategoryInPlace(text,oldcat,newtitle): replace a single link to
@@ -1853,19 +1861,13 @@
         thistxt  = removeDisabledParts(thistxt)
 
         # marker for inside templates or parameters
-        marker = u'@@'
-        while marker in thistxt:
-            marker += u'@'
+        marker = findmarker(thistxt,  u'@@', u'@')
 
         # marker for links
-        marker2 = u'##'
-        while marker2 in thistxt:
-            marker2 += u'#'
+        marker2 = findmarker(thistxt,  u'##', u'#')
 
         # marker for math
-        marker3 = u'%%'
-        while marker2 in thistxt:
-            marker3 += u'%'
+        marker3 = findmarker(thistxt,  u'%%', u'%')
 
         result = []
         inside = {}
@@ -3452,13 +3454,40 @@
     For the tags parameter, see removeDisabledParts() above.
     """
     # Find a marker that is not already in the text.
-    marker = '@@'
-    while marker in text:
-        marker += '@'
+    marker = findmarker(text, '@@', '@')
     text = text[:index] + marker + text[index:]
     text = removeDisabledParts(text, tags)
     return (marker not in text)
 
+def findmarker(text, startwith = u'@', append = u'@'):
+    # find a string which is not part of text
+    if len(append) <= 0:
+        append = u'@'
+    mymarker = startwith
+    while mymarker in text:
+        mymarker += append
+    return mymarker
+
+def expandmarker(text, marker = '', separator = ''):
+    # Extend marker backwards so that it also covers any number of separator
+    # occurrences directly preceding it, together with the control characters
+    # (newlines, tabs, ...) before, after, and between them.
+    if separator:
+        firstinmarker = text.find(marker)
+        firstinseparator = firstinmarker
+        lenseparator = len(separator)
+        striploopcontinue = True
+        while firstinseparator > 0 and striploopcontinue:
+            striploopcontinue = False
+            if (firstinseparator >= lenseparator) and (separator == 
text[firstinseparator-lenseparator:firstinseparator]):
+                firstinseparator -= lenseparator
+                striploopcontinue = True
+            elif text[firstinseparator-1] < ' ':
+                firstinseparator -= 1
+                striploopcontinue = True
+        marker = text[firstinseparator:firstinmarker] + marker
+    return marker
+
 # Part of library dealing with interwiki language links
 
 # Note - MediaWiki supports two kinds of interwiki links; interlanguage and
@@ -3532,6 +3561,24 @@
                          ['nowiki', 'comment', 'math', 'pre', 'source'], 
marker=marker)
     return text.strip()
 
+def removeLanguageLinksAndSeparator(text, site = None, marker = '', separator 
= ''):
+    """Return text with all interlanguage links, plus any preceding whitespace
+       and separator occurrences removed.
+
+    If a link to an unknown language is encountered, a warning is printed.
+    If a marker is defined, that string is placed at the location of the
+    last occurrence of an interwiki link (at the end if there are no
+    interwiki links).
+
+    """
+    if separator:
+        mymarker = findmarker(text, u'@L@')
+        newtext = removeLanguageLinks(text, site, mymarker)
+        mymarker = expandmarker(newtext, mymarker, separator)
+        return newtext.replace(mymarker, marker)
+    else:
+        return removeLanguageLinks(text, site, marker)
+
 def replaceLanguageLinks(oldtext, new, site = None, addOnly = False):
     """Replace interlanguage links in the text with a new set of links.
 
@@ -3540,17 +3587,18 @@
     function).
     """
     # Find a marker that is not already in the text.
-    marker = '@@'
-    while marker in oldtext:
-        marker += '@'
+    marker = findmarker( oldtext, u'@@')
     if site == None:
         site = getSite()
     separator = site.family.interwiki_text_separator
-    s = interwikiFormat(new, insite = site)
+    cseparator = site.family.category_text_separator
+    separatorstripped = separator.strip()
+    cseparatorstripped = cseparator.strip()
     if addOnly:
         s2 = oldtext
     else:
-        s2 = removeLanguageLinks(oldtext, site = site, marker = marker)
+        s2 = removeLanguageLinksAndSeparator(oldtext, site = site, marker = 
marker, separator = separatorstripped)
+    s = interwikiFormat(new, insite = site)
     if s:
         if site.language() in site.family.interwiki_attop:
             newtext = s + separator + s2.replace(marker,'').strip()
@@ -3563,11 +3611,13 @@
                 firstafter += len(marker)
             # Is there any text in the 'after' part that means we should keep 
it after?
             if "</noinclude>" in s2[firstafter:]:
+                if separatorstripped:
+                    s = separator + s
                 newtext = s2[:firstafter].replace(marker,'') + s + 
s2[firstafter:]
             elif site.language() in site.family.categories_last:
                 cats = getCategoryLinks(s2, site = site)
-                s2 = removeCategoryLinks(s2.replace(marker,'').strip(), site) 
+ separator + s
-                newtext = replaceCategoryLinks(s2, cats, site=site)
+                s2 = 
removeCategoryLinksAndSeparator(s2.replace(marker,'',cseparatorstripped).strip(),
 site) + separator + s
+                newtext = replaceCategoryLinks(s2, cats, site=site, 
addOnly=True)
             else:
                 newtext = s2.replace(marker,'').strip() + separator + s
     else:
@@ -3654,7 +3704,7 @@
     """Return text with all category links removed.
 
     Put the string marker after the last replacement (at the end of the text
-    if  there is no replacement).
+    if there is no replacement).
 
     """
     # This regular expression will find every link that is possibly an
@@ -3669,6 +3719,22 @@
         text = re.sub('\s*%s' % re.escape(marker), '\r\n' + marker, 
text.strip())
     return text.strip()
 
+def removeCategoryLinksAndSeparator(text, site = None, marker = '', separator 
= ''):
+    """Return text with all category links, plus any preceding whitespace
+       and separator occurrences removed.
+
+    Put the string marker after the last replacement (at the end of the text
+    if there is no replacement).
+
+    """
+    if separator:
+        mymarker = findmarker(text, u'@C@')
+        newtext = removeCategoryLinks(text, site, mymarker)
+        mymarker = expandmarker(newtext, mymarker, separator)
+        return newtext.replace(mymarker, marker)
+    else:
+        return removeCategoryLinks(text, site, marker)
+
 def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
     """Replace the category oldcat with the category newcat and return
        the modified text.
@@ -3712,40 +3778,41 @@
     """
 
     # Find a marker that is not already in the text.
-    marker = '@@'
-    while marker in oldtext:
-        marker += '@'
-
+    marker = findmarker( oldtext, u'@@')
     if site is None:
         site = getSite()
     if site.sitename() == 'wikipedia:de' and "{{Personendaten" in oldtext:
         raise Error('The PyWikipediaBot is no longer allowed to touch 
categories on the German Wikipedia on pages that contain the person data 
template because of the non-standard placement of that template. See 
http://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/bis_2006#Position_der_Personendaten_am_.22Artikelende.22')
-
     separator = site.family.category_text_separator
-    s = categoryFormat(new, insite = site)
+    iseparator = site.family.interwiki_text_separator
+    separatorstripped = separator.strip()
+    iseparatorstripped = iseparator.strip()
     if addOnly:
         s2 = oldtext
     else:
-        s2 = removeCategoryLinks(oldtext, site = site, marker = marker)
-
+        s2 = removeCategoryLinksAndSeparator(oldtext, site = site, marker = 
marker, separator = separatorstripped)
+    s = categoryFormat(new, insite = site)
     if s:
         if site.language() in site.family.category_attop:
             newtext = s + separator + s2
         else:
             # calculate what was after the categories links on the page
+            firstafter = s2.find(marker)
             if firstafter < 0:
                 firstafter = len(s2)
             else:
                 firstafter += len(marker)
             # Is there any text in the 'after' part that means we should keep 
it after?
             if "</noinclude>" in s2[firstafter:]:
+                if separatorstripped:
+                    s = separator + s
                 newtext = s2[:firstafter].replace(marker,'') + s + 
s2[firstafter:]
             elif site.language() in site.family.categories_last:
                 newtext = s2.replace(marker,'').strip() + separator + s
             else:
                 interwiki = getLanguageLinks(s2)
-                s2 = removeLanguageLinks(s2.replace(marker,''), site) + 
separator + s
-                newtext = replaceLanguageLinks(s2, interwiki, site)
+                s2 = removeLanguageLinksAndSeparator(s2.replace(marker,''), 
site, '', iseparatorstripped) + separator + s
+                newtext = replaceLanguageLinks(s2, interwiki, site = site, 
addOnly = True)
     else:
         newtext = s2.replace(marker,'')
     return newtext.strip()



_______________________________________________
Pywikipedia-l mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-l

[Pywikipedia-l] SVN: [6179] trunk/pywikipedia/wikipedia.py

Reply via email to