http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11156
Revision: 11156
Author: xqt
Date: 2013-03-02 12:57:54 +0000 (Sat, 02 Mar 2013)
Log Message:
-----------
some PEP8 changes
Modified Paths:
--------------
trunk/pywikipedia/BeautifulSoup.py
Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py 2013-03-02 10:39:02 UTC (rev 11155)
+++ trunk/pywikipedia/BeautifulSoup.py 2013-03-02 12:57:54 UTC (rev 11156)
@@ -89,9 +89,9 @@
import re
import sgmllib
try:
- from htmlentitydefs import name2codepoint
+ from htmlentitydefs import name2codepoint
except ImportError:
- name2codepoint = {}
+ name2codepoint = {}
try:
set
except NameError:
@@ -103,12 +103,13 @@
DEFAULT_OUTPUT_ENCODING = "utf-8"
+
def _match_css_class(str):
"""Build a RE to match the given CSS class."""
return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
# First, the classes that represent markup elements.
-
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -128,8 +129,8 @@
def replaceWith(self, replaceWith):
oldParent = self.parent
myIndex = self.parent.index(self)
- if hasattr(replaceWith, "parent")\
- and replaceWith.parent is self.parent:
+ if hasattr(replaceWith, "parent") and \
+ replaceWith.parent is self.parent:
# We're replacing this element with one of its siblings.
index = replaceWith.parent.index(replaceWith)
if index and index < myIndex:
@@ -186,11 +187,11 @@
return lastChild
def insert(self, position, newChild):
- if isinstance(newChild, basestring) \
- and not isinstance(newChild, NavigableString):
+ if isinstance(newChild, basestring) and not \
+ isinstance(newChild, NavigableString):
newChild = NavigableString(newChild)
- position = min(position, len(self.contents))
+ position = min(position, len(self.contents))
if hasattr(newChild, 'parent') and newChild.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
@@ -227,7 +228,7 @@
while not parentsNextSibling:
parentsNextSibling = parent.nextSibling
parent = parent.parent
- if not parent: # This is the last element in the document.
+ if not parent: # This is the last element in the document.
break
if parentsNextSibling:
newChildsLastElement.next = parentsNextSibling
@@ -272,8 +273,9 @@
criteria and appear after this Tag in the document."""
return self._findAll(name, attrs, text, limit,
self.nextSiblingGenerator, **kwargs)
- fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+ fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears before this Tag in the document."""
@@ -284,8 +286,8 @@
"""Returns all items that match the given criteria and appear
before this Tag in the document."""
return self._findAll(name, attrs, text, limit, self.previousGenerator,
- **kwargs)
- fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+ **kwargs)
+ fetchPrevious = findAllPrevious # Compatibility with pre-3.x
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
@@ -299,7 +301,7 @@
criteria and appear before this Tag in the document."""
return self._findAll(name, attrs, text, limit,
self.previousSiblingGenerator, **kwargs)
- fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+ fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
def findParent(self, name=None, attrs={}, **kwargs):
"""Returns the closest parent of this Tag that matches the given
@@ -318,8 +320,9 @@
return self._findAll(name, attrs, None, limit, self.parentGenerator,
**kwargs)
- fetchParents = findParents # Compatibility with pre-3.x
+ fetchParents = findParents # Compatibility with pre-3.x
+
#These methods do the real heavy lifting.
def _findOne(self, method, name, attrs, text, **kwargs):
@@ -415,11 +418,12 @@
s = unicode(s)
else:
if encoding:
- s = self.toEncoding(str(s), encoding)
+ s = self.toEncoding(str(s), encoding)
else:
s = unicode(s)
return s
+
class NavigableString(unicode, PageElement):
def __new__(cls, value):
@@ -444,7 +448,8 @@
if attr == 'string':
return self
else:
-            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+ raise AttributeError("'%s' object has no attribute '%s'"
+ % (self.__class__.__name__, attr))
def __unicode__(self):
return str(self).decode(DEFAULT_OUTPUT_ENCODING)
@@ -455,11 +460,13 @@
else:
return self
+
class CData(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+
class ProcessingInstruction(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
output = self
@@ -467,14 +474,17 @@
output = self.substituteEncoding(output, encoding)
return "<?%s?>" % self.toEncoding(output, encoding)
+
class Comment(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
class Declaration(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<!%s>" % NavigableString.__str__(self, encoding)
+
class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
@@ -482,15 +492,15 @@
def _invert(h):
"Cheap function to invert a hash."
i = {}
- for k,v in h.items():
+ for k, v in h.items():
i[v] = k
return i
- XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
- "quot" : '"',
- "amp" : "&",
- "lt" : "<",
- "gt" : ">" }
+ XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'",
+ "quot": '"',
+ "amp": "&",
+ "lt": "<",
+ "gt": ">"}
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
@@ -549,8 +559,8 @@
self.attrs = map(convert, self.attrs)
def getString(self):
- if (len(self.contents) == 1
- and isinstance(self.contents[0], NavigableString)):
+ if (len(self.contents) == 1 and isinstance(self.contents[0],
+ NavigableString)):
return self.contents[0]
def setString(self, string):
@@ -592,7 +602,7 @@
raise ValueError("Tag.index: element not in tag")
def has_key(self, key):
- return self._getAttrMap().has_key(key)
+ return key in self._getAttrMap()
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
@@ -636,7 +646,7 @@
#We don't break because bad HTML can define the same
#attribute multiple times.
self._getAttrMap()
- if self.attrMap.has_key(key):
+ if key in self.attrMap:
del self.attrMap[key]
def __call__(self, *args, **kwargs):
@@ -651,7 +661,8 @@
return self.find(tag[:-3])
elif tag.find('__') != 0:
return self.find(tag)
-            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+ raise AttributeError("'%s' object has no attribute '%s'"
+ % (self.__class__, tag))
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
@@ -661,7 +672,9 @@
same attributes in a different order. Should this be fixed?"""
if other is self:
return True
-        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ if not hasattr(other, 'name') or not hasattr(other, 'attrs') or \
+ not hasattr(other, 'contents') or self.name != other.name or \
+ self.attrs != other.attrs or len(self) != len(other):
return False
for i in range(0, len(self.contents)):
if self.contents[i] != other.contents[i]:
@@ -734,7 +747,8 @@
# value might also contain angle brackets, or
# ampersands that aren't part of entities. We need
# to escape those to XML entities too.
-                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+ val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity,
+ val)
attrs.append(fmt % (self.toEncoding(key, encoding),
self.toEncoding(val, encoding)))
@@ -798,7 +812,7 @@
prettyPrint=False, indentLevel=0):
"""Renders the contents of this tag as a string in the given
encoding. If encoding is None, returns a Unicode string.."""
- s=[]
+ s = []
for c in self:
text = None
if isinstance(c, NavigableString):
@@ -912,13 +926,13 @@
if isinstance(markupName, Tag):
markup = markupName
markupAttrs = markup
- callFunctionWithTagData = callable(self.name) \
- and not isinstance(markupName, Tag)
+ callFunctionWithTagData = callable(self.name) and \
+ not isinstance(markupName, Tag)
if (not self.name) \
- or callFunctionWithTagData \
- or (markup and self._matches(markup, self.name)) \
- or (not markup and self._matches(markupName, self.name)):
+ or callFunctionWithTagData \
+ or (markup and self._matches(markup, self.name)) \
+ or (not markup and self._matches(markupName, self.name)):
if callFunctionWithTagData:
match = self.name(markupName, markupAttrs)
else:
@@ -926,11 +940,11 @@
markupAttrMap = None
for attr, matchAgainst in self.attrs.items():
if not markupAttrMap:
- if hasattr(markupAttrs, 'get'):
+ if hasattr(markupAttrs, 'get'):
markupAttrMap = markupAttrs
- else:
+ else:
markupAttrMap = {}
- for k,v in markupAttrs:
+ for k, v in markupAttrs:
markupAttrMap[k] = v
attrValue = markupAttrMap.get(attr)
if not self._matches(attrValue, matchAgainst):
@@ -948,11 +962,10 @@
found = None
# If given a list of items, scan it for a text element that
# matches.
- if hasattr(markup, "__iter__") \
- and not isinstance(markup, Tag):
+ if hasattr(markup, "__iter__") and not isinstance(markup, Tag):
for element in markup:
- if isinstance(element, NavigableString) \
- and self.search(element):
+ if isinstance(element, NavigableString) and \
+ self.search(element):
found = element
break
# If it's a Tag, make sure its name or attributes match.
@@ -961,13 +974,13 @@
if not self.text:
found = self.searchTag(markup)
# If it's text, make sure the text matches.
- elif isinstance(markup, NavigableString) or \
- isinstance(markup, basestring):
+ elif isinstance(markup, NavigableString) or isinstance(markup,
+ basestring):
if self._matches(markup, self.text):
found = markup
else:
- raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
+ raise Exception("I don't know how to match against a %s"
+ % markup.__class__)
return found
def _matches(self, markup, matchAgainst):
@@ -988,10 +1001,10 @@
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
result = markup and matchAgainst.search(markup)
- elif hasattr(matchAgainst, '__iter__'): # list-like
+ elif hasattr(matchAgainst, '__iter__'): # list-like
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
- result = markup.has_key(matchAgainst)
+ result = matchAgainst in markup
elif matchAgainst and isinstance(markup, basestring):
if isinstance(markup, unicode):
matchAgainst = unicode(matchAgainst)
@@ -1002,6 +1015,7 @@
result = matchAgainst == markup
return result
+
class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
@@ -1009,6 +1023,7 @@
list.__init__([])
self.source = source
+
# Now, some helper functions.
def buildTagMap(default, *args):
@@ -1019,9 +1034,9 @@
for portion in args:
if hasattr(portion, 'items'):
#It's a map. Merge it.
- for k,v in portion.items():
+ for k, v in portion.items():
built[k] = v
- elif hasattr(portion, '__iter__'): # is a list
+ elif hasattr(portion, '__iter__'): # is a list
#It's a list. Map each item to the default.
for k in portion:
built[k] = default
@@ -1030,6 +1045,7 @@
built[portion] = default
return built
+
# Now, the parser classes.
class BeautifulStoneSoup(Tag, SGMLParser):
@@ -1074,7 +1090,7 @@
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+ STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
@@ -1151,7 +1167,7 @@
n = int(name)
except ValueError:
return
- if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+ if not 0 <= n <= 127: # ASCII ends at 127, not 255
return
return self.convert_codepoint(n)
@@ -1162,9 +1178,10 @@
if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None
else:
- dammit = UnicodeDammit\
- (markup, [self.fromEncoding, inDocumentEncoding],
- smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
+ dammit = UnicodeDammit(markup,
+ [self.fromEncoding, inDocumentEncoding],
+ smartQuotesTo=self.smartQuotesTo,
+ isHTML=isHTML)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
@@ -1194,7 +1211,7 @@
#print "__getattr__ called on %s.%s" % (self.__class__, methodName)
if methodName.startswith('start_') or methodName.startswith('end_') \
- or methodName.startswith('do_'):
+ or methodName.startswith('do_'):
return SGMLParser.__getattr__(self, methodName)
elif not methodName.startswith('__'):
return Tag.__getattr__(self, methodName)
@@ -1204,8 +1221,8 @@
def isSelfClosingTag(self, name):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
- return self.SELF_CLOSING_TAGS.has_key(name) \
- or self.instanceSelfClosingTags.has_key(name)
+ return name in self.SELF_CLOSING_TAGS or \
+ name in self.instanceSelfClosingTags
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
@@ -1244,8 +1261,8 @@
currentData = ' '
self.currentData = []
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
- (not self.parseOnlyThese.text or \
- not self.parseOnlyThese.search(currentData)):
+ (not self.parseOnlyThese.text or not
+ self.parseOnlyThese.search(currentData)):
return
o = containerClass(currentData)
o.setup(self.currentTag, self.previous)
@@ -1254,7 +1271,6 @@
self.previous = o
self.currentTag.contents.append(o)
-
def _popToTag(self, name, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
@@ -1296,8 +1312,8 @@
"""
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
- isNestable = nestingResetTriggers != None
- isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ isNestable = nestingResetTriggers is not None
+ isResetNesting = name in self.RESET_NESTING_TAGS
popTo = None
inclusive = True
for i in range(len(self.tagStack)-1, 0, -1):
@@ -1310,7 +1326,7 @@
if (nestingResetTriggers is not None
and p.name in nestingResetTriggers) \
or (nestingResetTriggers is None and isResetNesting
- and self.RESET_NESTING_TAGS.has_key(p.name)):
+ and p.name in self.RESET_NESTING_TAGS):
#If we encounter one of the nesting reset triggers
#peculiar to this tag, or we encounter another tag
@@ -1337,7 +1353,8 @@
self._smartPop(name)
if self.parseOnlyThese and len(self.tagStack) <= 1 \
-           and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ and (self.parseOnlyThese.text or
+ not self.parseOnlyThese.searchTag(name, attrs)):
return
tag = Tag(self, name, attrs, self.currentTag, self.previous)
@@ -1411,7 +1428,7 @@
data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
if not data and self.convertHTMLEntities and \
- not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
# TODO: We've got a problem here. We're told this is
# an entity reference, but it's not an XML entity
# reference or an HTML entity reference. Nonetheless,
@@ -1448,12 +1465,12 @@
declaration as a CData object."""
j = None
if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k + 3
+ self._toStringSubclass(data, CData)
else:
try:
j = SGMLParser.parse_declaration(self, i)
@@ -1463,6 +1480,7 @@
j = i + len(toHandle)
return j
+
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
@@ -1512,18 +1530,18 @@
BeautifulStoneSoup before writing your own subclass."""
def __init__(self, *args, **kwargs):
- if not kwargs.has_key('smartQuotesTo'):
+ if not 'smartQuotesTo' in kwargs:
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
kwargs['isHTML'] = True
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
- ('br' , 'hr', 'input', 'img', 'meta',
+ ('br', 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base', 'col'))
PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
- QUOTE_TAGS = {'script' : None, 'textarea' : None}
+ QUOTE_TAGS = {'script': None, 'textarea': None}
#According to the HTML standard, each of these inline tags can
#contain another tag of the same type. Furthermore, it's common
@@ -1537,21 +1555,21 @@
NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
#Lists can contain other lists, but there are restrictions.
- NESTABLE_LIST_TAGS = { 'ol' : [],
- 'ul' : [],
- 'li' : ['ul', 'ol'],
- 'dl' : [],
- 'dd' : ['dl'],
- 'dt' : ['dl'] }
+ NESTABLE_LIST_TAGS = {'ol': [],
+ 'ul': [],
+ 'li': ['ul', 'ol'],
+ 'dl': [],
+ 'dd': ['dl'],
+ 'dt': ['dl']}
#Tables can contain other tables, but there are restrictions.
- NESTABLE_TABLE_TAGS = {'table' : [],
- 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
- 'td' : ['tr'],
- 'th' : ['tr'],
- 'thead' : ['table'],
- 'tbody' : ['table'],
- 'tfoot' : ['table'],
+ NESTABLE_TABLE_TAGS = {'table': [],
+ 'tr': ['table', 'tbody', 'tfoot', 'thead'],
+ 'td': ['tr'],
+ 'th': ['tr'],
+ 'thead': ['table'],
+ 'tbody': ['table'],
+ 'tfoot': ['table'],
}
NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
@@ -1587,11 +1605,11 @@
contentType = value
contentTypeIndex = i
- if httpEquiv and contentType: # It's an interesting meta tag.
+ if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
if match:
if (self.declaredHTMLEncoding is not None or
- self.originalEncoding == self.fromEncoding):
+ self.originalEncoding == self.fromEncoding):
# An HTML encoding was sniffed while converting
# the document to Unicode, or an HTML encoding was
# sniffed during a previous pass through the
@@ -1616,9 +1634,11 @@
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
+
class StopParsing(Exception):
pass
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over
@@ -1644,10 +1664,10 @@
it's valid HTML and BeautifulSoup screwed up by assuming it
wouldn't be."""
- I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
- ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
- 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
- 'big')
+ I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = (
+ 'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 'cite',
+ 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 'big'
+ )
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
@@ -1655,6 +1675,7 @@
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
class MinimalSoup(BeautifulSoup):
"""The MinimalSoup class is for parsing HTML that contains
pathologically bad markup. It makes no assumptions about tag
@@ -1668,6 +1689,7 @@
RESET_NESTING_TAGS = buildTagMap('noscript')
NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
"""This class will push a tag with only a single string child into
the tag's parent as an attribute. The attribute's name is the tag
@@ -1695,10 +1717,11 @@
parent._getAttrMap()
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
isinstance(tag.contents[0], NavigableString) and
- not parent.attrMap.has_key(tag.name)):
+ not tag.name in parent.attrMap):
parent[tag.name] = tag.contents[0]
BeautifulStoneSoup.popTag(self)
+
#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
@@ -1749,6 +1772,7 @@
except ImportError:
pass
+
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
@@ -1759,14 +1783,14 @@
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
- CHARSET_ALIASES = { "macintosh" : "mac-roman",
- "x-sjis" : "shift-jis" }
+ CHARSET_ALIASES = {"macintosh": "mac-roman",
+ "x-sjis": "shift-jis"}
def __init__(self, markup, overrideEncodings=[],
smartQuotesTo='xml', isHTML=False):
self.declaredHTMLEncoding = None
self.markup, documentEncoding, sniffedEncoding = \
- self._detectEncoding(markup, isHTML)
+ self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
@@ -1819,9 +1843,8 @@
if self.smartQuotesTo and proposed.lower() in("windows-1252",
"iso-8859-1",
"iso-8859-2"):
- markup = re.compile("([\x80-\x9f])").sub \
- (lambda(x): self._subMSChar(x.group(1)),
- markup)
+ markup = re.compile("([\x80-\x9f])").sub(
+ lambda(x): self._subMSChar(x.group(1)), markup)
try:
# print "Trying to convert document to %s" % proposed
@@ -1841,11 +1864,11 @@
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
- and (data[2:4] != '\x00\x00'):
+ and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
- elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
- and (data[2:4] != '\x00\x00'):
+ elif (len(data) >= 4) and \
+ (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
@@ -1871,8 +1894,8 @@
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
- and (xml_data[2:4] != '\x00\x00'):
+ elif (len(xml_data) >= 4) and \
+                (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
@@ -1881,7 +1904,7 @@
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
- (xml_data[2:4] != '\x00\x00'):
+ (xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
@@ -1927,7 +1950,6 @@
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
-
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
@@ -1945,63 +1967,70 @@
return codec
EBCDIC_TO_ASCII_MAP = None
+
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
- emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
- 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
- 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
- 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
- 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
- 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
- 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
- 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
- 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
- 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
- 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
- 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
- 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
- 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
- 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
- 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
- 250,251,252,253,254,255)
+            emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14,
+                    15, 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28,
+                    29, 30, 31, 128, 129, 130, 131, 132, 10, 23, 27, 136, 137,
+                    138, 139, 140, 5, 6, 7, 144, 145, 22, 147, 148, 149, 150, 4,
+                    152, 153, 154, 155, 20, 21, 158, 26, 32, 160, 161, 162, 163,
+                    164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33, 38, 169,
+                    170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59,
+                    94, 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44,
+                    37, 95, 62, 63, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+                    96, 58, 35, 64, 39, 61, 34, 195, 97, 98, 99, 100, 101, 102,
+                    103, 104, 105, 196, 197, 198, 199, 200, 201, 202, 106, 107,
+                    108, 109, 110, 111, 112, 113, 114, 203, 204, 205, 206, 207,
+                    208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210,
+                    211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
+                    223, 224, 225, 226, 227, 228, 229, 230, 231, 123, 65, 66,
+                    67, 68, 69, 70, 71, 72, 73, 232, 233, 234, 235, 236, 237,
+                    125, 74, 75, 76, 77, 78, 79, 80, 81, 82, 238, 239, 240, 241,
+                    242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89, 90, 244, 245,
+                    246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+                    250, 251, 252, 253, 254, 255)
import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans(''.join(map(chr,
+ range(256))),
+ ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
- MS_CHARS = { '\x80' : ('euro', '20AC'),
- '\x81' : ' ',
- '\x82' : ('sbquo', '201A'),
- '\x83' : ('fnof', '192'),
- '\x84' : ('bdquo', '201E'),
- '\x85' : ('hellip', '2026'),
- '\x86' : ('dagger', '2020'),
- '\x87' : ('Dagger', '2021'),
- '\x88' : ('circ', '2C6'),
- '\x89' : ('permil', '2030'),
- '\x8A' : ('Scaron', '160'),
- '\x8B' : ('lsaquo', '2039'),
- '\x8C' : ('OElig', '152'),
- '\x8D' : '?',
- '\x8E' : ('#x17D', '17D'),
- '\x8F' : '?',
- '\x90' : '?',
- '\x91' : ('lsquo', '2018'),
- '\x92' : ('rsquo', '2019'),
- '\x93' : ('ldquo', '201C'),
- '\x94' : ('rdquo', '201D'),
- '\x95' : ('bull', '2022'),
- '\x96' : ('ndash', '2013'),
- '\x97' : ('mdash', '2014'),
- '\x98' : ('tilde', '2DC'),
- '\x99' : ('trade', '2122'),
- '\x9a' : ('scaron', '161'),
- '\x9b' : ('rsaquo', '203A'),
- '\x9c' : ('oelig', '153'),
- '\x9d' : '?',
- '\x9e' : ('#x17E', '17E'),
- '\x9f' : ('Yuml', ''),}
+ MS_CHARS = {
+ '\x80': ('euro', '20AC'),
+ '\x81': ' ',
+ '\x82': ('sbquo', '201A'),
+ '\x83': ('fnof', '192'),
+ '\x84': ('bdquo', '201E'),
+ '\x85': ('hellip', '2026'),
+ '\x86': ('dagger', '2020'),
+ '\x87': ('Dagger', '2021'),
+ '\x88': ('circ', '2C6'),
+ '\x89': ('permil', '2030'),
+ '\x8A': ('Scaron', '160'),
+ '\x8B': ('lsaquo', '2039'),
+ '\x8C': ('OElig', '152'),
+ '\x8D': '?',
+ '\x8E': ('#x17D', '17D'),
+ '\x8F': '?',
+ '\x90': '?',
+ '\x91': ('lsquo', '2018'),
+ '\x92': ('rsquo', '2019'),
+ '\x93': ('ldquo', '201C'),
+ '\x94': ('rdquo', '201D'),
+ '\x95': ('bull', '2022'),
+ '\x96': ('ndash', '2013'),
+ '\x97': ('mdash', '2014'),
+ '\x98': ('tilde', '2DC'),
+ '\x99': ('trade', '2122'),
+ '\x9a': ('scaron', '161'),
+ '\x9b': ('rsaquo', '203A'),
+ '\x9c': ('oelig', '153'),
+ '\x9d': '?',
+ '\x9e': ('#x17E', '17E'),
+ '\x9f': ('Yuml', ''),
+ }
#######################################################################
_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn