BeautifulSoup.py

multichill Fri, 08 May 2009 00:52:56 -0700

Revision: 6857
Author:   multichill
Date:     2009-05-08 07:52:49 +0000 (Fri, 08 May 2009)


Log Message:
-----------
Reverting changes in r6842 (update BeautifulSoup to 3.1.0.1). This seems to 
make the bot unstable, see [ pywikipediabot-Bugs-2788812 ] TypeError:       
unsubscriptable object

Modified Paths:
--------------
    trunk/pywikipedia/BeautifulSoup.py

Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py  2009-05-07 19:03:06 UTC (rev 6856)
+++ trunk/pywikipedia/BeautifulSoup.py  2009-05-08 07:52:49 UTC (rev 6857)
@@ -42,7 +42,7 @@
 
 Here, have some legalese:
 
-Copyright (c) 2004-2009, Leonard Richardson
+Copyright (c) 2004-2007, Leonard Richardson
 
 All rights reserved.
 
@@ -79,38 +79,27 @@
 from __future__ import generators
 
 __author__ = "Leonard Richardson ([email protected])"
-__version__ = "3.1.0.1"
-__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
+__version__ = "3.0.6"
+__copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
 __license__ = "New-style BSD"
 
+from sgmllib import SGMLParser, SGMLParseError
 import codecs
-import markupbase
 import types
 import re
-from HTMLParser import HTMLParser, HTMLParseError
+import sgmllib
 try:
-    from htmlentitydefs import name2codepoint
+  from htmlentitydefs import name2codepoint
 except ImportError:
-    name2codepoint = {}
-try:
-    set
-except NameError:
-    from sets import Set as set
+  name2codepoint = {}
 
-#These hacks make Beautiful Soup able to parse XML with namespaces
-markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
+#This hack makes Beautiful Soup able to parse XML with namespaces
+sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
 
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 
 # First, the classes that represent markup elements.
 
-def sob(unicode, encoding):
-    """Returns either the given Unicode string or its encoding."""
-    if encoding is None:
-        return unicode
-    else:
-        return unicode.encode(encoding)
-
 class PageElement:
     """Contains the navigational information for some part of the page
     (either a tag or a piece of text)"""
@@ -402,20 +391,8 @@
 
 class NavigableString(unicode, PageElement):
 
-    def __new__(cls, value):
-        """Create a new NavigableString.
-
-        When unpickling a NavigableString, this method is called with
-        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
-        passed in to the superclass's __new__ or the superclass won't know
-        how to handle non-ASCII characters.
-        """
-        if isinstance(value, unicode):
-            return unicode.__new__(cls, value)
-        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
-
     def __getnewargs__(self):
-        return (unicode(self),)
+        return (NavigableString.__str__(self),)
 
     def __getattr__(self, attr):
         """text.string gives you text. This is for backwards
@@ -426,32 +403,34 @@
         else:
             raise AttributeError, "'%s' object has no attribute '%s'" % 
(self.__class__.__name__, attr)
 
-    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return self.decode().encode(encoding)
+    def __unicode__(self):
+        return str(self).decode(DEFAULT_OUTPUT_ENCODING)
 
-    def decodeGivenEventualEncoding(self, eventualEncoding):
-        return self
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        if encoding:
+            return self.encode(encoding)
+        else:
+            return self
 
 class CData(NavigableString):
 
-    def decodeGivenEventualEncoding(self, eventualEncoding):
-        return u'<![CDATA[' + self + u']]>'
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
 
 class ProcessingInstruction(NavigableString):
-
-    def decodeGivenEventualEncoding(self, eventualEncoding):
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         output = self
-        if u'%SOUP-ENCODING%' in output:
-            output = self.substituteEncoding(output, eventualEncoding)
-        return u'<?' + output + u'?>'
+        if "%SOUP-ENCODING%" in output:
+            output = self.substituteEncoding(output, encoding)
+        return "<?%s?>" % self.toEncoding(output, encoding)
 
 class Comment(NavigableString):
-    def decodeGivenEventualEncoding(self, eventualEncoding):
-        return u'<!--' + self + u'-->'
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<!--%s-->" % NavigableString.__str__(self, encoding)
 
 class Declaration(NavigableString):
-    def decodeGivenEventualEncoding(self, eventualEncoding):
-        return u'<!' + self + u'>'
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+        return "<!%s>" % NavigableString.__str__(self, encoding)
 
 class Tag(PageElement):
 
@@ -517,13 +496,11 @@
         self.convertXMLEntities = parser.convertXMLEntities
         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
 
-        def convert(kval):
-            "Converts HTML, XML and numeric entities in the attribute value."
-            k, val = kval
-            if val is None:
-                return kval
-            return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
-                              self._convertEntities, val))
+        # Convert any HTML, XML, or numeric entities in the attribute values.
+        convert = lambda(k, val): (k,
+                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
+                                          self._convertEntities,
+                                          val))
         self.attrs = map(convert, self.attrs)
 
     def get(self, key, default=None):
@@ -614,8 +591,11 @@
 
     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
         """Renders this tag as a string."""
-        return self.decode(eventualEncoding=encoding)
+        return self.__str__(encoding)
 
+    def __unicode__(self):
+        return self.__str__(None)
+
     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                            + ")")
@@ -625,30 +605,24 @@
         appropriate XML entity for an XML special character."""
         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
 
-    def __unicode__(self):
-        return self.decode()
+    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
+                prettyPrint=False, indentLevel=0):
+        """Returns a string or Unicode representation of this tag and
+        its contents. To get Unicode, pass None for encoding.
 
-    def __str__(self):
-        return self.encode()
+        NOTE: since Python's HTML parser consumes whitespace, this
+        method is not certain to reproduce the whitespace present in
+        the original string."""
 
-    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
-               prettyPrint=False, indentLevel=0):
-        return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
+        encodedName = self.toEncoding(self.name, encoding)
 
-    def decode(self, prettyPrint=False, indentLevel=0,
-               eventualEncoding=DEFAULT_OUTPUT_ENCODING):
-        """Returns a string or Unicode representation of this tag and
-        its contents. To get Unicode, pass None for encoding."""
-
         attrs = []
         if self.attrs:
             for key, val in self.attrs:
                 fmt = '%s="%s"'
                 if isString(val):
-                    if (self.containsSubstitutions
-                        and eventualEncoding is not None
-                        and '%SOUP-ENCODING%' in val):
-                        val = self.substituteEncoding(val, eventualEncoding)
+                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
+                        val = self.substituteEncoding(val, encoding)
 
                     # The attribute value either:
                     #
@@ -677,26 +651,22 @@
                     # ampersands that aren't part of entities. We need
                     # to escape those to XML entities too.
                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, 
val)
-                if val is None:
-                    # Handle boolean attributes.
-                    decoded = key
-                else:
-                    decoded = fmt % (key, val)
-                attrs.append(decoded)
+
+                attrs.append(fmt % (self.toEncoding(key, encoding),
+                                    self.toEncoding(val, encoding)))
         close = ''
         closeTag = ''
         if self.isSelfClosing:
             close = ' /'
         else:
-            closeTag = '</%s>' % self.name
+            closeTag = '</%s>' % encodedName
 
         indentTag, indentContents = 0, 0
         if prettyPrint:
             indentTag = indentLevel
             space = (' ' * (indentTag-1))
             indentContents = indentTag + 1
-        contents = self.decodeContents(prettyPrint, indentContents,
-                                       eventualEncoding)
+        contents = self.renderContents(encoding, prettyPrint, indentContents)
         if self.hidden:
             s = contents
         else:
@@ -706,7 +676,7 @@
                 attributeString = ' ' + ' '.join(attrs)
             if prettyPrint:
                 s.append(space)
-            s.append('<%s%s%s>' % (self.name, attributeString, close))
+            s.append('<%s%s%s>' % (encodedName, attributeString, close))
             if prettyPrint:
                 s.append("\n")
             s.append(contents)
@@ -731,23 +701,19 @@
         self.extract()
 
     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
-        return self.encode(encoding, True)
+        return self.__str__(encoding, True)
 
-    def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
+    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                        prettyPrint=False, indentLevel=0):
-        return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
-
-    def decodeContents(self, prettyPrint=False, indentLevel=0,
-                       eventualEncoding=DEFAULT_OUTPUT_ENCODING):
         """Renders the contents of this tag as a string in the given
         encoding. If encoding is None, returns a Unicode string.."""
         s=[]
         for c in self:
             text = None
             if isinstance(c, NavigableString):
-                text = c.decodeGivenEventualEncoding(eventualEncoding)
+                text = c.__str__(encoding)
             elif isinstance(c, Tag):
-                s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
+                s.append(c.__str__(encoding, prettyPrint, indentLevel))
             if text and prettyPrint:
                 text = text.strip()
             if text:
@@ -788,7 +754,7 @@
         return self._findAll(name, attrs, text, limit, generator, **kwargs)
     findChildren = findAll
 
-    # Pre-3.x compatibility methods. Will go away in 4.0.
+    # Pre-3.x compatibility methods
     first = find
     fetch = findAll
 
@@ -798,15 +764,6 @@
     def firstText(self, text=None, recursive=True):
         return self.find(text=text, recursive=recursive)
 
-    # 3.x compatibility methods. Will go away in 4.0.
-    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                       prettyPrint=False, indentLevel=0):
-        if encoding is None:
-            return self.decodeContents(prettyPrint, indentLevel, encoding)
-        else:
-            return self.encodeContents(encoding, prettyPrint, indentLevel)
-
-
     #Private methods
 
     def _getAttrMap(self):
@@ -819,24 +776,26 @@
         return self.attrMap
 
     #Generator methods
-    def recursiveChildGenerator(self):
-        if not len(self.contents):
-            raise StopIteration
-        stopNode = self._lastRecursiveChild().next
-        current = self.contents[0]
-        while current is not stopNode:
-            yield current
-            current = current.next
-
     def childGenerator(self):
-        if not len(self.contents):
-            raise StopIteration
-        current = self.contents[0]
-        while current:
-            yield current
-            current = current.nextSibling
+        for i in range(0, len(self.contents)):
+            yield self.contents[i]
         raise StopIteration
 
+    def recursiveChildGenerator(self):
+        stack = [(self, 0)]
+        while stack:
+            tag, start = stack.pop()
+            if isinstance(tag, Tag):
+                for i in range(start, len(tag.contents)):
+                    a = tag.contents[i]
+                    yield a
+                    if isinstance(a, Tag) and tag.contents:
+                        if i < len(tag.contents) - 1:
+                            stack.append((tag, i+1))
+                        stack.append((a, 0))
+                        break
+        raise StopIteration
+
 # Next, a couple classes to represent queries and their results.
 class SoupStrainer:
     """Encapsulates a number of ways of matching a markup element (tag or
@@ -937,14 +896,13 @@
             #other ways of matching match the tag name as a string.
             if isinstance(markup, Tag):
                 markup = markup.name
-            if markup is not None and not isString(markup):
+            if markup and not isString(markup):
                 markup = unicode(markup)
             #Now we know that chunk is either a string, or None.
             if hasattr(matchAgainst, 'match'):
                 # It's a regexp object.
                 result = markup and matchAgainst.search(markup)
-            elif (isList(matchAgainst)
-                  and (markup is not None or not isString(matchAgainst))):
+            elif isList(matchAgainst):
                 result = markup in matchAgainst
             elif hasattr(matchAgainst, 'items'):
                 result = markup.has_key(matchAgainst)
@@ -970,8 +928,8 @@
 def isList(l):
     """Convenience method that works with all 2.x versions of Python
     to determine whether or not something is listlike."""
-    return ((hasattr(l, '__iter__') and not isString(l))
-            or (type(l) in (types.ListType, types.TupleType)))
+    return hasattr(l, '__iter__') \
+           or (type(l) in (types.ListType, types.TupleType))
 
 def isString(s):
     """Convenience method that works with all 2.x versions of Python
@@ -991,7 +949,7 @@
             #It's a map. Merge it.
             for k,v in portion.items():
                 built[k] = v
-        elif isList(portion) and not isString(portion):
+        elif isList(portion):
             #It's a list. Map each item to the default.
             for k in portion:
                 built[k] = default
@@ -1002,123 +960,8 @@
 
 # Now, the parser classes.
 
-class HTMLParserBuilder(HTMLParser):
+class BeautifulStoneSoup(Tag, SGMLParser):
 
-    def __init__(self, soup):
-        HTMLParser.__init__(self)
-        self.soup = soup
-
-    # We inherit feed() and reset().
-
-    def handle_starttag(self, name, attrs):
-        if name == 'meta':
-            self.soup.extractCharsetFromMeta(attrs)
-        else:
-            self.soup.unknown_starttag(name, attrs)
-
-    def handle_endtag(self, name):
-        self.soup.unknown_endtag(name)
-
-    def handle_data(self, content):
-        self.soup.handle_data(content)
-
-    def _toStringSubclass(self, text, subclass):
-        """Adds a certain piece of text to the tree as a NavigableString
-        subclass."""
-        self.soup.endData()
-        self.handle_data(text)
-        self.soup.endData(subclass)
-
-    def handle_pi(self, text):
-        """Handle a processing instruction as a ProcessingInstruction
-        object, possibly one with a %SOUP-ENCODING% slot into which an
-        encoding will be plugged later."""
-        if text[:3] == "xml":
-            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
-        self._toStringSubclass(text, ProcessingInstruction)
-
-    def handle_comment(self, text):
-        "Handle comments as Comment objects."
-        self._toStringSubclass(text, Comment)
-
-    def handle_charref(self, ref):
-        "Handle character references as data."
-        if self.soup.convertEntities:
-            data = unichr(int(ref))
-        else:
-            data = '&#%s;' % ref
-        self.handle_data(data)
-
-    def handle_entityref(self, ref):
-        """Handle entity references as data, possibly converting known
-        HTML and/or XML entity references to the corresponding Unicode
-        characters."""
-        data = None
-        if self.soup.convertHTMLEntities:
-            try:
-                data = unichr(name2codepoint[ref])
-            except KeyError:
-                pass
-
-        if not data and self.soup.convertXMLEntities:
-                data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
-
-        if not data and self.soup.convertHTMLEntities and \
-            not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
-                # TODO: We've got a problem here. We're told this is
-                # an entity reference, but it's not an XML entity
-                # reference or an HTML entity reference. Nonetheless,
-                # the logical thing to do is to pass it through as an
-                # unrecognized entity reference.
-                #
-                # Except: when the input is "&carol;" this function
-                # will be called with input "carol". When the input is
-                # "AT&T", this function will be called with input
-                # "T". We have no way of knowing whether a semicolon
-                # was present originally, so we don't know whether
-                # this is an unknown entity or just a misplaced
-                # ampersand.
-                #
-                # The more common case is a misplaced ampersand, so I
-                # escape the ampersand and omit the trailing semicolon.
-                data = "&amp;%s" % ref
-        if not data:
-            # This case is different from the one above, because we
-            # haven't already gone through a supposedly comprehensive
-            # mapping of entities to Unicode characters. We might not
-            # have gone through any mapping at all. So the chances are
-            # very high that this is a real entity, and not a
-            # misplaced ampersand.
-            data = "&%s;" % ref
-        self.handle_data(data)
-
-    def handle_decl(self, data):
-        "Handle DOCTYPEs and the like as Declaration objects."
-        self._toStringSubclass(data, Declaration)
-
-    def parse_declaration(self, i):
-        """Treat a bogus SGML declaration as raw data. Treat a CDATA
-        declaration as a CData object."""
-        j = None
-        if self.rawdata[i:i+9] == '<![CDATA[':
-             k = self.rawdata.find(']]>', i)
-             if k == -1:
-                 k = len(self.rawdata)
-             data = self.rawdata[i+9:k]
-             j = k+3
-             self._toStringSubclass(data, CData)
-        else:
-            try:
-                j = HTMLParser.parse_declaration(self, i)
-            except HTMLParseError:
-                toHandle = self.rawdata[i:]
-                self.handle_data(toHandle)
-                j = i + len(toHandle)
-        return j
-
-
-class BeautifulStoneSoup(Tag):
-
     """This class contains the basic parser and search code. It defines
     a parser that knows nothing about tag behavior except for the
     following:
@@ -1139,7 +982,6 @@
     NESTABLE_TAGS = {}
     RESET_NESTING_TAGS = {}
     QUOTE_TAGS = {}
-    PRESERVE_WHITESPACE_TAGS = []
 
     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                        lambda x: x.group(1) + ' />'),
@@ -1163,15 +1005,14 @@
 
     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
-                 convertEntities=None, selfClosingTags=None, isHTML=False,
-                 builder=HTMLParserBuilder):
+                 convertEntities=None, selfClosingTags=None):
         """The Soup object is initialized as the 'root tag', and the
         provided markup (which can be a string or a file-like object)
         is fed into the underlying parser.
 
-        HTMLParser will process most bad HTML, and the BeautifulSoup
+        sgmllib will process most bad HTML, and the BeautifulSoup
         class has some tricks for dealing with some HTML that kills
-        HTMLParser, but Beautiful Soup can nonetheless choke or lose data
+        sgmllib, but Beautiful Soup can nonetheless choke or lose data
         if your data uses self-closing tags or declarations
         incorrectly.
 
@@ -1181,7 +1022,7 @@
         you'll get better performance.
 
         The default parser massage techniques fix the two most common
-        instances of invalid HTML that choke HTMLParser:
+        instances of invalid HTML that choke sgmllib:
 
          <br/> (No space between name of closing tag and tag close)
          <! --Comment--> (Extraneous whitespace in declaration)
@@ -1219,21 +1060,29 @@
             self.escapeUnrecognizedEntities = False
 
         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
-        self.builder = builder(self)
-        self.reset()
+        SGMLParser.__init__(self)
 
         if hasattr(markup, 'read'):        # It's a file-type object.
             markup = markup.read()
         self.markup = markup
         self.markupMassage = markupMassage
         try:
-            self._feed(isHTML=isHTML)
+            self._feed()
         except StopParsing:
             pass
-        self.markup = None                 # The markup can now be GCed.
-        self.builder = None                # So can the builder.
+        self.markup = None                 # The markup can now be GCed
 
-    def _feed(self, inDocumentEncoding=None, isHTML=False):
+    def convert_charref(self, name):
+        """This method fixes a bug in Python's SGMLParser."""
+        try:
+            n = int(name)
+        except ValueError:
+            return
+        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+            return
+        return self.convert_codepoint(n)
+
+    def _feed(self, inDocumentEncoding=None):
         # Convert the document to Unicode.
         markup = self.markup
         if isinstance(markup, unicode):
@@ -1242,10 +1091,9 @@
         else:
             dammit = UnicodeDammit\
                      (markup, [self.fromEncoding, inDocumentEncoding],
-                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
+                      smartQuotesTo=self.smartQuotesTo)
             markup = dammit.unicode
             self.originalEncoding = dammit.originalEncoding
-            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
         if markup:
             if self.markupMassage:
                 if not isList(self.markupMassage):
@@ -1258,14 +1106,27 @@
                 # was relying on the existence of markupMassage, this
                 # might cause problems.
                 del(self.markupMassage)
-        self.builder.reset()
+        self.reset()
 
-        self.builder.feed(markup)
+        SGMLParser.feed(self, markup)
         # Close out any unfinished strings and close all the open tags.
         self.endData()
         while self.currentTag.name != self.ROOT_TAG_NAME:
             self.popTag()
 
+    def __getattr__(self, methodName):
+        """This method routes method call requests to either the SGMLParser
+        superclass or the Tag superclass, depending on the method name."""
+        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
+
+        if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
+               or methodName.find('do_') == 0:
+            return SGMLParser.__getattr__(self, methodName)
+        elif methodName.find('__') != 0:
+            return Tag.__getattr__(self, methodName)
+        else:
+            raise AttributeError
+
     def isSelfClosingTag(self, name):
         """Returns true iff the given string is the name of a
         self-closing tag according to this parser."""
@@ -1275,7 +1136,7 @@
     def reset(self):
         Tag.__init__(self, self, self.ROOT_TAG_NAME)
         self.hidden = 1
-        self.builder.reset()
+        SGMLParser.reset(self)
         self.currentData = []
         self.currentTag = None
         self.tagStack = []
@@ -1305,10 +1166,8 @@
 
     def endData(self, containerClass=NavigableString):
         if self.currentData:
-            currentData = u''.join(self.currentData)
-            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
-                not set([tag.name for tag in self.tagStack]).intersection(
-                    self.PRESERVE_WHITESPACE_TAGS)):
+            currentData = ''.join(self.currentData)
+            if not currentData.translate(self.STRIP_ASCII_SPACES):
                 if '\n' in currentData:
                     currentData = '\n'
                 else:
@@ -1440,10 +1299,100 @@
     def handle_data(self, data):
         self.currentData.append(data)
 
-    def extractCharsetFromMeta(self, attrs):
-        self.unknown_starttag('meta', attrs)
+    def _toStringSubclass(self, text, subclass):
+        """Adds a certain piece of text to the tree as a NavigableString
+        subclass."""
+        self.endData()
+        self.handle_data(text)
+        self.endData(subclass)
 
+    def handle_pi(self, text):
+        """Handle a processing instruction as a ProcessingInstruction
+        object, possibly one with a %SOUP-ENCODING% slot into which an
+        encoding will be plugged later."""
+        if text[:3] == "xml":
+            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
+        self._toStringSubclass(text, ProcessingInstruction)
 
+    def handle_comment(self, text):
+        "Handle comments as Comment objects."
+        self._toStringSubclass(text, Comment)
+
+    def handle_charref(self, ref):
+        "Handle character references as data."
+        if self.convertEntities:
+            data = unichr(int(ref))
+        else:
+            data = '&#%s;' % ref
+        self.handle_data(data)
+
+    def handle_entityref(self, ref):
+        """Handle entity references as data, possibly converting known
+        HTML and/or XML entity references to the corresponding Unicode
+        characters."""
+        data = None
+        if self.convertHTMLEntities:
+            try:
+                data = unichr(name2codepoint[ref])
+            except KeyError:
+                pass
+
+        if not data and self.convertXMLEntities:
+                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
+
+        if not data and self.convertHTMLEntities and \
+            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+                # TODO: We've got a problem here. We're told this is
+                # an entity reference, but it's not an XML entity
+                # reference or an HTML entity reference. Nonetheless,
+                # the logical thing to do is to pass it through as an
+                # unrecognized entity reference.
+                #
+                # Except: when the input is "&carol;" this function
+                # will be called with input "carol". When the input is
+                # "AT&T", this function will be called with input
+                # "T". We have no way of knowing whether a semicolon
+                # was present originally, so we don't know whether
+                # this is an unknown entity or just a misplaced
+                # ampersand.
+                #
+                # The more common case is a misplaced ampersand, so I
+                # escape the ampersand and omit the trailing semicolon.
+                data = "&amp;%s" % ref
+        if not data:
+            # This case is different from the one above, because we
+            # haven't already gone through a supposedly comprehensive
+            # mapping of entities to Unicode characters. We might not
+            # have gone through any mapping at all. So the chances are
+            # very high that this is a real entity, and not a
+            # misplaced ampersand.
+            data = "&%s;" % ref
+        self.handle_data(data)
+
+    def handle_decl(self, data):
+        "Handle DOCTYPEs and the like as Declaration objects."
+        self._toStringSubclass(data, Declaration)
+
+    def parse_declaration(self, i):
+        """Treat a bogus SGML declaration as raw data. Treat a CDATA
+        declaration as a CData object."""
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+             k = self.rawdata.find(']]>', i)
+             if k == -1:
+                 k = len(self.rawdata)
+             data = self.rawdata[i+9:k]
+             j = k+3
+             self._toStringSubclass(data, CData)
+        else:
+            try:
+                j = SGMLParser.parse_declaration(self, i)
+            except SGMLParseError:
+                toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
+        return j
+
 class BeautifulSoup(BeautifulStoneSoup):
 
     """This parser knows the following facts about HTML:
@@ -1495,15 +1444,12 @@
     def __init__(self, *args, **kwargs):
         if not kwargs.has_key('smartQuotesTo'):
             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
-        kwargs['isHTML'] = True
         BeautifulStoneSoup.__init__(self, *args, **kwargs)
 
     SELF_CLOSING_TAGS = buildTagMap(None,
                                     ['br' , 'hr', 'input', 'img', 'meta',
                                     'spacer', 'link', 'frame', 'base'])
 
-    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
-
     QUOTE_TAGS = {'script' : None, 'textarea' : None}
 
     #According to the HTML standard, each of these inline tags can
@@ -1548,9 +1494,9 @@
                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
 
     # Used to detect the charset in a META tag; see start_meta
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
 
-    def extractCharsetFromMeta(self, attrs):
+    def start_meta(self, attrs):
         """Beautiful Soup can detect a charset included in a META tag,
         try to convert the document to that charset, and re-parse the
         document from the beginning."""
@@ -1571,33 +1517,29 @@
         if httpEquiv and contentType: # It's an interesting meta tag.
             match = self.CHARSET_RE.search(contentType)
             if match:
-                if (self.declaredHTMLEncoding is not None or
-                    self.originalEncoding == self.fromEncoding):
-                    # An HTML encoding was sniffed while converting
-                    # the document to Unicode, or an HTML encoding was
-                    # sniffed during a previous pass through the
-                    # document, or an encoding was specified
-                    # explicitly and it worked. Rewrite the meta tag.
-                    def rewrite(match):
-                        return match.group(1) + "%SOUP-ENCODING%"
-                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
+                if getattr(self, 'declaredHTMLEncoding') or \
+                       (self.originalEncoding == self.fromEncoding):
+                    # This is our second pass through the document, or
+                    # else an encoding was specified explicitly and it
+                    # worked. Rewrite the meta tag.
+                    newAttr = self.CHARSET_RE.sub\
+                              (lambda(match):match.group(1) +
+                               "%SOUP-ENCODING%", contentType)
                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                                newAttr)
                     tagNeedsEncodingSubstitution = True
                 else:
                     # This is our first pass through the document.
-                    # Go through it again with the encoding information.
+                    # Go through it again with the new information.
                     newCharset = match.group(3)
                     if newCharset and newCharset != self.originalEncoding:
                         self.declaredHTMLEncoding = newCharset
                         self._feed(self.declaredHTMLEncoding)
                         raise StopParsing
-                    pass
         tag = self.unknown_starttag("meta", attrs)
         if tag and tagNeedsEncodingSubstitution:
             tag.containsSubstitutions = True
 
-
 class StopParsing(Exception):
     pass
 
@@ -1745,10 +1687,9 @@
                         "x-sjis" : "shift-jis" }
 
     def __init__(self, markup, overrideEncodings=[],
-                 smartQuotesTo='xml', isHTML=False):
-        self.declaredHTMLEncoding = None
+                 smartQuotesTo='xml'):
         self.markup, documentEncoding, sniffedEncoding = \
-                     self._detectEncoding(markup, isHTML)
+                     self._detectEncoding(markup)
         self.smartQuotesTo = smartQuotesTo
         self.triedEncodings = []
         if markup == '' or isinstance(markup, unicode):
@@ -1774,22 +1715,18 @@
             for proposed_encoding in ("utf-8", "windows-1252"):
                 u = self._convertFrom(proposed_encoding)
                 if u: break
-
         self.unicode = u
         if not u: self.originalEncoding = None
 
-    def _subMSChar(self, match):
+    def _subMSChar(self, orig):
         """Changes a MS smart quote character to an XML or HTML
         entity."""
-        orig = match.group(1)
         sub = self.MS_CHARS.get(orig)
         if type(sub) == types.TupleType:
             if self.smartQuotesTo == 'xml':
-                sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
+                sub = '&#x%s;' % sub[1]
             else:
-                sub = '&'.encode() + sub[0].encode() + ';'.encode()
-        else:
-            sub = sub.encode()
+                sub = '&%s;' % sub[0]
         return sub
 
     def _convertFrom(self, proposed):
@@ -1804,9 +1741,9 @@
         if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                       "iso-8859-1",
                                                       "iso-8859-2"):
-            smart_quotes_re = "([\x80-\x9f])"
-            smart_quotes_compiled = re.compile(smart_quotes_re)
-            markup = smart_quotes_compiled.sub(self._subMSChar, markup)
+            markup = re.compile("([\x80-\x9f])").sub \
+                     (lambda(x): self._subMSChar(x.group(1)),
+                      markup)
 
         try:
             # print "Trying to convert document to %s" % proposed
@@ -1845,7 +1782,7 @@
         newdata = unicode(data, encoding)
         return newdata
 
-    def _detectEncoding(self, xml_data, isHTML=False):
+    def _detectEncoding(self, xml_data):
         """Given a document, tries to detect its XML encoding."""
         xml_encoding = sniffed_xml_encoding = None
         try:
@@ -1893,19 +1830,13 @@
             else:
                 sniffed_xml_encoding = 'ascii'
                 pass
+            xml_encoding_match = re.compile \
+                                 ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
+                                 .match(xml_data)
         except:
             xml_encoding_match = None
-        xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
-        xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
-        if not xml_encoding_match and isHTML:
-            meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
-            regexp = re.compile(meta_re, re.I)
-            xml_encoding_match = regexp.search(xml_data)
-        if xml_encoding_match is not None:
-            xml_encoding = xml_encoding_match.groups()[0].decode(
-                'ascii').lower()
-            if isHTML:
-                self.declaredHTMLEncoding = xml_encoding
+        if xml_encoding_match:
+            xml_encoding = xml_encoding_match.groups()[0].lower()
             if sniffed_xml_encoding and \
                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
@@ -1996,5 +1927,5 @@
 #By default, act as an HTML pretty-printer.
 if __name__ == '__main__':
     import sys
-    soup = BeautifulSoup(sys.stdin)
+    soup = BeautifulSoup(sys.stdin.read())
     print soup.prettify()



_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn

[Pywikipedia-svn] SVN: [6857] trunk/pywikipedia/BeautifulSoup.py

Reply via email to