The branch, frodo has been updated
       via  daea5ff13406f01c3e99ec1369ed74460eff91b9 (commit)
      from  ad42d4c5647dd07974e5d18d31947b06cf739705 (commit)

- Log -----------------------------------------------------------------
http://xbmc.git.sourceforge.net/git/gitweb.cgi?p=xbmc/scripts;a=commit;h=daea5ff13406f01c3e99ec1369ed74460eff91b9

commit daea5ff13406f01c3e99ec1369ed74460eff91b9
Author: Martijn Kaijser <[email protected]>
Date:   Tue Sep 24 22:30:03 2013 +0200

    [script.module.feedparser] 5.1.3

diff --git a/script.module.feedparser/addon.xml b/script.module.feedparser/addon.xml
index 8b653ab..7cbdcb8 100644
--- a/script.module.feedparser/addon.xml
+++ b/script.module.feedparser/addon.xml
@@ -1,19 +1,17 @@
 <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-<addon id="script.module.feedparser"
-       name="feedparser"
-       version="5.1.2"
-       provider-name="Tristan Fischer ([email protected])">
-  <requires>
-    <import addon="xbmc.python" 
-            version="2.0"/>
-  </requires>
-  <extension point="xbmc.python.module"
-             library="lib" />
-  <extension point="xbmc.addon.metadata">
-    <language></language>
-    <summary lang="en">Python helper module</summary>
-    <description lang="en">feedparser is a Python library that parses feeds in all known formats, including Atom, RSS, and RDF. It runs on Python 2.4 all the way up to 3.2.</description>
-    <license>MIT</license>
-    <platform>all</platform>
-  </extension>
+<addon id="script.module.feedparser" name="feedparser" version="5.1.3" provider-name="Tristan Fischer ([email protected])">
+    <requires>
+        <import addon="xbmc.python" version="2.1.0"/>
+    </requires>
+    <extension point="xbmc.python.module" library="lib" />
+    <extension point="xbmc.addon.metadata">
+        <language></language>
+        <summary lang="en">Python helper module</summary>
+        <description lang="en">feedparser is a Python library that parses feeds in all known formats, including Atom, RSS, and RDF. It runs on Python 2.4 all the way up to 3.2.</description>
+        <license>MIT</license>
+        <platform>all</platform>
+        <website>https://code.google.com/p/feedparser/</website>
+        <source>https://code.google.com/p/feedparser/source/browse/</source>
+        <forum></forum>
+    </extension>
 </addon>
\ No newline at end of file
diff --git a/script.module.feedparser/changelog.txt b/script.module.feedparser/changelog.txt
index 7d8846b..281a3d2 100644
--- a/script.module.feedparser/changelog.txt
+++ b/script.module.feedparser/changelog.txt
@@ -1,2 +1,6 @@
+5.1.3 (24.09.2013)
+ Sync with upstream
+ From: https://feedparser.googlecode.com/files/feedparser-5.1.3.zip
+
 0.5.8
- Original: http://code.google.com/p/feedparser/
\ No newline at end of file
+ Original: http://code.google.com/p/feedparser/
diff --git a/script.module.feedparser/lib/feedparser.py b/script.module.feedparser/lib/feedparser.py
index 45555c9..c78e6a3 100644
--- a/script.module.feedparser/lib/feedparser.py
+++ b/script.module.feedparser/lib/feedparser.py
@@ -9,7 +9,7 @@ Required: Python 2.4 or later
 Recommended: iconv_codec <http://cjkpython.i18n.org/>
 """
 
-__version__ = "5.1.2"
+__version__ = "5.1.3"
 __license__ = """
 Copyright (c) 2010-2012 Kurt McKee <[email protected]>
 Copyright (c) 2002-2008 Mark Pilgrim
@@ -44,7 +44,8 @@ __contributors__ = ["Jason Diamond <http://injektilo.org/>",
                     "Sam Ruby <http://intertwingly.net/>",
                     "Ade Oshineye <http://blog.oshineye.com/>",
                     "Martin Pool <http://sourcefrog.net/>",
-                    "Kurt McKee <http://kurtmckee.org/>"]
+                    "Kurt McKee <http://kurtmckee.org/>",
+                    "Bernd Schlapsi <https://github.com/brot>",]
 
 # HTTP "User-Agent" header to send to servers when downloading feeds.
 # If you are embedding feedparser in a larger application, you should
@@ -284,15 +285,6 @@ except ImportError:
     BeautifulSoup = None
     PARSE_MICROFORMATS = False
 
-try:
-    # the utf_32 codec was introduced in Python 2.6; it's necessary to
-    # check this as long as feedparser supports Python 2.4 and 2.5
-    codecs.lookup('utf_32')
-except LookupError:
-    _UTF32_AVAILABLE = False
-else:
-    _UTF32_AVAILABLE = True
-
 # ---------- don't touch these ----------
 class ThingsNobodyCaresAboutButMe(Exception): pass
 class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
@@ -1980,6 +1972,7 @@ class _BaseHTMLProcessor(sgmllib.SGMLParser):
     def handle_charref(self, ref):
         # called for each character reference, e.g. for '&#160;', ref will be '160'
         # Reconstruct the original character reference.
+        ref = ref.lower()
         if ref.startswith('x'):
             value = int(ref[1:], 16)
         else:
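
The new ref.lower() call matters because hex character references may use an
uppercase X, as in '&#X41;'; such a reference previously fell through to the
decimal branch and raised ValueError. A minimal standalone sketch of the
handler's logic under Python 2 (decode_charref is a hypothetical helper, not
part of feedparser):

    def decode_charref(ref):
        # Same normalization handle_charref() now performs.
        ref = ref.lower()
        if ref.startswith('x'):
            value = int(ref[1:], 16)   # hex reference: 'x41' or, now, 'X41'
        else:
            value = int(ref)           # decimal reference: '65'
        return unichr(value)

    assert decode_charref('X41') == u'A'   # raised ValueError before 5.1.3
    assert decode_charref('x41') == u'A'
    assert decode_charref('65') == u'A'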
@@ -2464,7 +2457,10 @@ class _MicroformatsParser:
            linktype.startswith('video/') or \
           (linktype.startswith('application/') and not linktype.endswith('xml')):
             return 1
-        path = urlparse.urlparse(attrsD['href'])[2]
+        try:
+            path = urlparse.urlparse(attrsD['href'])[2]
+        except ValueError:
+            return 0
         if path.find('.') == -1:
             return 0
         fileext = path.split('.').pop().lower()
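
The try/except guard is needed because urlparse can raise ValueError on
malformed hrefs rather than returning a parse result; an unclosed IPv6
literal is one input known to do so on Python 2.7:

    import urlparse
    try:
        path = urlparse.urlparse('http://[malformed')[2]
    except ValueError, e:
        print e   # 'Invalid IPv6 URL'; the method now treats this as no match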
@@ -2550,7 +2546,8 @@ class _RelativeURIResolver(_BaseHTMLProcessor):
                      ('object', 'data'),
                      ('object', 'usemap'),
                      ('q', 'cite'),
-                     ('script', 'src')])
+                     ('script', 'src'),
+                     ('video', 'poster')])
 
     def __init__(self, baseuri, encoding, _type):
         _BaseHTMLProcessor.__init__(self, encoding, _type)
@@ -2627,13 +2624,13 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
       'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc', 'max',
       'maxlength', 'media', 'method', 'min', 'multiple', 'name', 'nohref',
       'noshade', 'nowrap', 'open', 'optimum', 'pattern', 'ping', 'point-size',
-      'prompt', 'pqg', 'radiogroup', 'readonly', 'rel', 'repeat-max',
-      'repeat-min', 'replace', 'required', 'rev', 'rightspacing', 'rows',
-      'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src',
-      'start', 'step', 'summary', 'suppress', 'tabindex', 'target', 'template',
-      'title', 'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
-      'value', 'variable', 'volume', 'vspace', 'vrml', 'width', 'wrap',
-      'xml:lang'])
+      'poster', 'pqg', 'preload', 'prompt', 'radiogroup', 'readonly', 'rel',
+      'repeat-max', 'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
+      'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', 'span',
+      'src', 'start', 'step', 'summary', 'suppress', 'tabindex', 'target',
+      'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
+      'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
+      'width', 'wrap', 'xml:lang'])
 
     unacceptable_elements_with_end_tag = set(['script', 'applet', 'style'])
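
Together with the ('video', 'poster') entry added to the relative-URI map
above, the new 'poster' and 'preload' attributes let sanitized HTML5 video
markup keep its cover image. A quick check, assuming this module imports as
feedparser and that 'video' is among its acceptable elements:

    import feedparser
    html = '<video poster="/cover.jpg" preload="none"></video>'
    print feedparser._sanitizeHTML(html, 'utf-8', u'text/html')
    # Before 5.1.3 the sanitizer stripped both attributes.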
 
@@ -2985,9 +2982,9 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
             url_file_stream_or_string = 'http:' + url_file_stream_or_string[5:]
         if not agent:
             agent = USER_AGENT
-        # test for inline user:password for basic auth
+        # Test for inline user:password credentials for HTTP basic auth
         auth = None
-        if base64:
+        if base64 and not url_file_stream_or_string.startswith('ftp:'):
             urltype, rest = urllib.splittype(url_file_stream_or_string)
             realhost, rest = urllib.splithost(rest)
             if realhost:
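
The ftp: guard keeps inline credentials in FTP URLs instead of stripping them
into an HTTP Basic Authorization header, which broke FTP logins. The
splittype call it now bypasses behaves like this:

    import urllib
    urltype, rest = urllib.splittype('ftp://user:[email protected]/feed.xml')
    # urltype -> 'ftp', rest -> '//user:[email protected]/feed.xml'
    # With the guard, urllib receives the URL intact and performs the
    # FTP login itself.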
@@ -3480,15 +3477,7 @@ _rfc822_match = re.compile(
     "(?:%s, )?%s(?: %s)?" % (_rfc822_dayname, _rfc822_date, _rfc822_time)
 ).match
 
-def _parse_date_rfc822(dt):
-    """Parse RFC 822 dates and times, with one minor
-    difference: years may be 4DIGIT or 2DIGIT.
-    http://tools.ietf.org/html/rfc822#section-5"""
-    try:
-        m = _rfc822_match(dt.lower()).groupdict(0)
-    except AttributeError:
-        return None
-
+def _parse_date_group_rfc822(m):
     # Calculate a date and timestamp
     for k in ('year', 'day', 'hour', 'minute', 'second'):
         m[k] = int(m[k])
@@ -3521,8 +3510,36 @@ def _parse_date_rfc822(dt):
 
     # Return the date and timestamp in UTC
     return (stamp - delta).utctimetuple()
+
+def _parse_date_rfc822(dt):
+    """Parse RFC 822 dates and times, with one minor
+    difference: years may be 4DIGIT or 2DIGIT.
+    http://tools.ietf.org/html/rfc822#section-5"""
+    try:
+        m = _rfc822_match(dt.lower()).groupdict(0)
+    except AttributeError:
+        return None
+
+    return _parse_date_group_rfc822(m)
 registerDateHandler(_parse_date_rfc822)
 
+def _parse_date_rfc822_grubby(dt):
+    """Parse date format similar to RFC 822, but 
+    the comma after the dayname is optional and
+    month/day are inverted"""
+    _rfc822_date_grubby = "%s %s %s" % (_rfc822_month, _rfc822_day, _rfc822_year)
+    _rfc822_match_grubby = re.compile(
+        "(?:%s[,]? )?%s(?: %s)?" % (_rfc822_dayname, _rfc822_date_grubby, _rfc822_time)
+    ).match
+
+    try:
+        m = _rfc822_match_grubby(dt.lower()).groupdict(0)
+    except AttributeError:
+        return None
+
+    return _parse_date_group_rfc822(m)
+registerDateHandler(_parse_date_rfc822_grubby)
+
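
The 'grubby' variant accepts dates such as 'Sun Dec 16 2012 11:15:01 GMT',
where the weekday comma is missing and month/day are swapped relative to
RFC 822. A hypothetical call, assuming the module imports as feedparser:

    import feedparser
    # Strict RFC 822 form would be 'Sun, 16 Dec 2012 11:15:01 GMT'
    when = feedparser._parse_date_rfc822_grubby('Sun Dec 16 2012 11:15:01 GMT')
    # when is a UTC time 9-tuple on success, None if the format differs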
 def _parse_date_asctime(dt):
     """Parse asctime-style dates"""
     dayname, month, day, remainder = dt.split(None, 3)
@@ -3566,217 +3583,283 @@ def _parse_date(dateString):
         return date9tuple
     return None
 
-def _getCharacterEncoding(http_headers, xml_data):
-    '''Get the character encoding of the XML document
+# Each marker represents some of the characters of the opening XML
+# processing instruction ('<?xm') in the specified encoding.
+EBCDIC_MARKER = _l2bytes([0x4C, 0x6F, 0xA7, 0x94])
+UTF16BE_MARKER = _l2bytes([0x00, 0x3C, 0x00, 0x3F])
+UTF16LE_MARKER = _l2bytes([0x3C, 0x00, 0x3F, 0x00])
+UTF32BE_MARKER = _l2bytes([0x00, 0x00, 0x00, 0x3C])
+UTF32LE_MARKER = _l2bytes([0x3C, 0x00, 0x00, 0x00])
+
+ZERO_BYTES = _l2bytes([0x00, 0x00])
+
+# Match the opening XML declaration.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_DECLARATION = re.compile('^<\?xml[^>]*?>')
+
+# Capture the value of the XML processing instruction's encoding attribute.
+# Example: <?xml version="1.0" encoding="utf-8"?>
+RE_XML_PI_ENCODING = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>'))
+
+def convert_to_utf8(http_headers, data):
+    '''Detect and convert the character encoding to UTF-8.
 
     http_headers is a dictionary
-    xml_data is a raw string (not Unicode)
-
-    This is so much trickier than it sounds, it's not even funny.
-    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
-    is application/xml, application/*+xml,
-    application/xml-external-parsed-entity, or application/xml-dtd,
-    the encoding given in the charset parameter of the HTTP Content-Type
-    takes precedence over the encoding given in the XML prefix within the
-    document, and defaults to 'utf-8' if neither are specified.  But, if
-    the HTTP Content-Type is text/xml, text/*+xml, or
-    text/xml-external-parsed-entity, the encoding given in the XML prefix
-    within the document is ALWAYS IGNORED and only the encoding given in
-    the charset parameter of the HTTP Content-Type header should be
-    respected, and it defaults to 'us-ascii' if not specified.
-
-    Furthermore, discussion on the atom-syntax mailing list with the
-    author of RFC 3023 leads me to the conclusion that any document
-    served with a Content-Type of text/* and no charset parameter
-    must be treated as us-ascii.  (We now do this.)  And also that it
-    must always be flagged as non-well-formed.  (We now do this too.)
-
-    If Content-Type is unspecified (input was local file or non-HTTP source)
-    or unrecognized (server just got it totally wrong), then go by the
-    encoding given in the XML prefix of the document and default to
-    'iso-8859-1' as per the HTTP specification (RFC 2616).
-
-    Then, assuming we didn't find a character encoding in the HTTP headers
-    (and the HTTP Content-type allowed us to look in the body), we need
-    to sniff the first few bytes of the XML data and try to determine
-    whether the encoding is ASCII-compatible.  Section F of the XML
-    specification shows the way here:
-    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
-
-    If the sniffed encoding is not ASCII-compatible, we need to make it
-    ASCII compatible so that we can sniff further into the XML declaration
-    to find the encoding attribute, which will tell us the true encoding.
-
-    Of course, none of this guarantees that we will be able to parse the
-    feed in the declared character encoding (assuming it was declared
-    correctly, which many are not).  iconv_codec can help a lot;
-    you should definitely install it if you can.
-    http://cjkpython.i18n.org/
-    '''
+    data is a raw string (not Unicode)'''
+
+    # This is so much trickier than it sounds, it's not even funny.
+    # According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
+    # is application/xml, application/*+xml,
+    # application/xml-external-parsed-entity, or application/xml-dtd,
+    # the encoding given in the charset parameter of the HTTP Content-Type
+    # takes precedence over the encoding given in the XML prefix within the
+    # document, and defaults to 'utf-8' if neither are specified.  But, if
+    # the HTTP Content-Type is text/xml, text/*+xml, or
+    # text/xml-external-parsed-entity, the encoding given in the XML prefix
+    # within the document is ALWAYS IGNORED and only the encoding given in
+    # the charset parameter of the HTTP Content-Type header should be
+    # respected, and it defaults to 'us-ascii' if not specified.
+
+    # Furthermore, discussion on the atom-syntax mailing list with the
+    # author of RFC 3023 leads me to the conclusion that any document
+    # served with a Content-Type of text/* and no charset parameter
+    # must be treated as us-ascii.  (We now do this.)  And also that it
+    # must always be flagged as non-well-formed.  (We now do this too.)
+
+    # If Content-Type is unspecified (input was local file or non-HTTP source)
+    # or unrecognized (server just got it totally wrong), then go by the
+    # encoding given in the XML prefix of the document and default to
+    # 'iso-8859-1' as per the HTTP specification (RFC 2616).
+
+    # Then, assuming we didn't find a character encoding in the HTTP headers
+    # (and the HTTP Content-type allowed us to look in the body), we need
+    # to sniff the first few bytes of the XML data and try to determine
+    # whether the encoding is ASCII-compatible.  Section F of the XML
+    # specification shows the way here:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+
+    # If the sniffed encoding is not ASCII-compatible, we need to make it
+    # ASCII compatible so that we can sniff further into the XML declaration
+    # to find the encoding attribute, which will tell us the true encoding.
 
-    def _parseHTTPContentType(content_type):
-        '''takes HTTP Content-Type header and returns (content type, charset)
-
-        If no charset is specified, returns (content type, '')
-        If no content type is specified, returns ('', '')
-        Both return parameters are guaranteed to be lowercase strings
-        '''
-        content_type = content_type or ''
-        content_type, params = cgi.parse_header(content_type)
-        charset = params.get('charset', '').replace("'", "")
-        if not isinstance(charset, unicode):
-            charset = charset.decode('utf-8', 'ignore')
-        return content_type, charset
-
-    sniffed_xml_encoding = u''
+    # Of course, none of this guarantees that we will be able to parse the
+    # feed in the declared character encoding (assuming it was declared
+    # correctly, which many are not).  iconv_codec can help a lot;
+    # you should definitely install it if you can.
+    # http://cjkpython.i18n.org/
+
+    bom_encoding = u''
     xml_encoding = u''
-    true_encoding = u''
-    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
-    # Must sniff for non-ASCII-compatible character encodings before
-    # searching for XML declaration.  This heuristic is defined in
-    # section F of the XML specification:
+    rfc3023_encoding = u''
+
+    # Look at the first few bytes of the document to guess what
+    # its encoding may be. We only need to decode enough of the
+    # document that we can use an ASCII-compatible regular
+    # expression to search for an XML encoding declaration.
+    # The heuristic follows the XML specification, section F:
     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+    # Check for BOMs first.
+    if data[:4] == codecs.BOM_UTF32_BE:
+        bom_encoding = u'utf-32be'
+        data = data[4:]
+    elif data[:4] == codecs.BOM_UTF32_LE:
+        bom_encoding = u'utf-32le'
+        data = data[4:]
+    elif data[:2] == codecs.BOM_UTF16_BE and data[2:4] != ZERO_BYTES:
+        bom_encoding = u'utf-16be'
+        data = data[2:]
+    elif data[:2] == codecs.BOM_UTF16_LE and data[2:4] != ZERO_BYTES:
+        bom_encoding = u'utf-16le'
+        data = data[2:]
+    elif data[:3] == codecs.BOM_UTF8:
+        bom_encoding = u'utf-8'
+        data = data[3:]
+    # Check for the characters '<?xm' in several encodings.
+    elif data[:4] == EBCDIC_MARKER:
+        bom_encoding = u'cp037'
+    elif data[:4] == UTF16BE_MARKER:
+        bom_encoding = u'utf-16be'
+    elif data[:4] == UTF16LE_MARKER:
+        bom_encoding = u'utf-16le'
+    elif data[:4] == UTF32BE_MARKER:
+        bom_encoding = u'utf-32be'
+    elif data[:4] == UTF32LE_MARKER:
+        bom_encoding = u'utf-32le'
+
+    tempdata = data
     try:
-        if xml_data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
-            # In all forms of EBCDIC, these four bytes correspond
-            # to the string '<?xm'; try decoding using CP037
-            sniffed_xml_encoding = u'cp037'
-            xml_data = xml_data.decode('cp037').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
-            # UTF-16BE
-            sniffed_xml_encoding = u'utf-16be'
-            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xfe, 0xff])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
-            # UTF-16BE with BOM
-            sniffed_xml_encoding = u'utf-16be'
-            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
-            # UTF-16LE
-            sniffed_xml_encoding = u'utf-16le'
-            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-        elif (len(xml_data) >= 4) and (xml_data[:2] == _l2bytes([0xff, 0xfe])) and (xml_data[2:4] != _l2bytes([0x00, 0x00])):
-            # UTF-16LE with BOM
-            sniffed_xml_encoding = u'utf-16le'
-            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
-            # UTF-32BE
-            sniffed_xml_encoding = u'utf-32be'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
-            # UTF-32LE
-            sniffed_xml_encoding = u'utf-32le'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
-            # UTF-32BE with BOM
-            sniffed_xml_encoding = u'utf-32be'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-        elif xml_data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
-            # UTF-32LE with BOM
-            sniffed_xml_encoding = u'utf-32le'
-            if _UTF32_AVAILABLE:
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-        elif xml_data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
-            # UTF-8 with BOM
-            sniffed_xml_encoding = u'utf-8'
-            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-        else:
-            # ASCII-compatible
-            pass
-        xml_encoding_match = re.compile(_s2bytes('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')).match(xml_data)
-    except UnicodeDecodeError:
+        if bom_encoding:
+            tempdata = data.decode(bom_encoding).encode('utf-8')
+    except (UnicodeDecodeError, LookupError):
+        # feedparser recognizes UTF-32 encodings that aren't
+        # available in Python 2.4 and 2.5, so it's possible to
+        # encounter a LookupError during decoding.
         xml_encoding_match = None
+    else:
+        xml_encoding_match = RE_XML_PI_ENCODING.match(tempdata)
+
     if xml_encoding_match:
         xml_encoding = xml_encoding_match.groups()[0].decode('utf-8').lower()
-        if sniffed_xml_encoding and (xml_encoding in (u'iso-10646-ucs-2', u'ucs-2', u'csunicode', u'iso-10646-ucs-4', u'ucs-4', u'csucs4', u'utf-16', u'utf-32', u'utf_16', u'utf_32', u'utf16', u'u16')):
-            xml_encoding = sniffed_xml_encoding
+        # Normalize the xml_encoding if necessary.
+        if bom_encoding and (xml_encoding in (
+            u'u16', u'utf-16', u'utf16', u'utf_16',
+            u'u32', u'utf-32', u'utf32', u'utf_32',
+            u'iso-10646-ucs-2', u'iso-10646-ucs-4',
+            u'csucs4', u'csunicode', u'ucs-2', u'ucs-4'
+        )):
+            xml_encoding = bom_encoding
+
+    # Find the HTTP Content-Type and, hopefully, a character
+    # encoding provided by the server. The Content-Type is used
+    # to choose the "correct" encoding among the BOM encoding,
+    # XML declaration encoding, and HTTP encoding, following the
+    # heuristic defined in RFC 3023.
+    http_content_type = http_headers.get('content-type') or ''
+    http_content_type, params = cgi.parse_header(http_content_type)
+    http_encoding = params.get('charset', '').replace("'", "")
+    if not isinstance(http_encoding, unicode):
+        http_encoding = http_encoding.decode('utf-8', 'ignore')
+
     acceptable_content_type = 0
-    application_content_types = (u'application/xml', u'application/xml-dtd', u'application/xml-external-parsed-entity')
+    application_content_types = (u'application/xml', u'application/xml-dtd',
+                                 u'application/xml-external-parsed-entity')
     text_content_types = (u'text/xml', u'text/xml-external-parsed-entity')
     if (http_content_type in application_content_types) or \
-       (http_content_type.startswith(u'application/') and http_content_type.endswith(u'+xml')):
+       (http_content_type.startswith(u'application/') and 
+        http_content_type.endswith(u'+xml')):
         acceptable_content_type = 1
-        true_encoding = http_encoding or xml_encoding or u'utf-8'
+        rfc3023_encoding = http_encoding or xml_encoding or u'utf-8'
     elif (http_content_type in text_content_types) or \
-         (http_content_type.startswith(u'text/')) and http_content_type.endswith(u'+xml'):
+         (http_content_type.startswith(u'text/') and
+          http_content_type.endswith(u'+xml')):
         acceptable_content_type = 1
-        true_encoding = http_encoding or u'us-ascii'
+        rfc3023_encoding = http_encoding or u'us-ascii'
     elif http_content_type.startswith(u'text/'):
-        true_encoding = http_encoding or u'us-ascii'
+        rfc3023_encoding = http_encoding or u'us-ascii'
     elif http_headers and 'content-type' not in http_headers:
-        true_encoding = xml_encoding or u'iso-8859-1'
+        rfc3023_encoding = xml_encoding or u'iso-8859-1'
     else:
-        true_encoding = xml_encoding or u'utf-8'
-    # some feeds claim to be gb2312 but are actually gb18030.
-    # apparently MSIE and Firefox both do the following switch:
-    if true_encoding.lower() == u'gb2312':
-        true_encoding = u'gb18030'
-    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
-
-def _toUTF8(data, encoding):
-    '''Changes an XML data stream on the fly to specify a new encoding
-
-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-    '''
-    # strip Byte Order Mark (if present)
-    if (len(data) >= 4) and (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
-        encoding = 'utf-16be'
-        data = data[2:]
-    elif (len(data) >= 4) and (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
-        encoding = 'utf-16le'
-        data = data[2:]
-    elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
-        encoding = 'utf-8'
-        data = data[3:]
-    elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
-        encoding = 'utf-32be'
-        data = data[4:]
-    elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
-        encoding = 'utf-32le'
-        data = data[4:]
-    newdata = unicode(data, encoding)
-    declmatch = re.compile('^<\?xml[^>]*?>')
-    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
-    if declmatch.search(newdata):
-        newdata = declmatch.sub(newdecl, newdata)
-    else:
-        newdata = newdecl + u'\n' + newdata
-    return newdata.encode('utf-8')
+        rfc3023_encoding = xml_encoding or u'utf-8'
+    # gb18030 is a superset of gb2312, so always replace gb2312
+    # with gb18030 for greater compatibility.
+    if rfc3023_encoding.lower() == u'gb2312':
+        rfc3023_encoding = u'gb18030'
+    if xml_encoding.lower() == u'gb2312':
+        xml_encoding = u'gb18030'
 
-def _stripDoctype(data):
-    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+    # there are four encodings to keep track of:
+    # - http_encoding is the encoding declared in the Content-Type HTTP header
+    # - xml_encoding is the encoding declared in the <?xml declaration
+    # - bom_encoding is the encoding sniffed from the first 4 bytes of the XML data
+    # - rfc3023_encoding is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
+    error = None
+
+    if http_headers and (not acceptable_content_type):
+        if 'content-type' in http_headers:
+            msg = '%s is not an XML media type' % http_headers['content-type']
+        else:
+            msg = 'no Content-type specified'
+        error = NonXMLContentType(msg)
+
+    # determine character encoding
+    known_encoding = 0
+    chardet_encoding = None
+    tried_encodings = []
+    if chardet:
+        chardet_encoding = unicode(chardet.detect(data)['encoding'] or '', 'ascii', 'ignore')
+    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
+    for proposed_encoding in (rfc3023_encoding, xml_encoding, bom_encoding,
+                              chardet_encoding, u'utf-8', u'windows-1252', u'iso-8859-2'):
+        if not proposed_encoding:
+            continue
+        if proposed_encoding in tried_encodings:
+            continue
+        tried_encodings.append(proposed_encoding)
+        try:
+            data = data.decode(proposed_encoding)
+        except (UnicodeDecodeError, LookupError):
+            pass
+        else:
+            known_encoding = 1
+            # Update the encoding in the opening XML processing instruction.
+            new_declaration = '''<?xml version='1.0' encoding='utf-8'?>'''
+            if RE_XML_DECLARATION.search(data):
+                data = RE_XML_DECLARATION.sub(new_declaration, data)
+            else:
+                data = new_declaration + u'\n' + data
+            data = data.encode('utf-8')
+            break
+    # if still no luck, give up
+    if not known_encoding:
+        error = CharacterEncodingUnknown(
+            'document encoding unknown, I tried ' +
+            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' %
+            (rfc3023_encoding, xml_encoding))
+        rfc3023_encoding = u''
+    elif proposed_encoding != rfc3023_encoding:
+        error = CharacterEncodingOverride(
+            'document declared as %s, but parsed as %s' %
+            (rfc3023_encoding, proposed_encoding))
+        rfc3023_encoding = proposed_encoding
+
+    return data, rfc3023_encoding, error
+
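
convert_to_utf8() thus folds the old _getCharacterEncoding/_toUTF8 pair into
one call that returns the re-encoded document, the RFC 3023 encoding, and any
bozo error. A hypothetical invocation, assuming the module imports as
feedparser:

    import feedparser
    raw = '<?xml version="1.0"?><feed><title>caf\xe9</title></feed>'
    headers = {'content-type': 'application/xml; charset=iso-8859-1'}
    data, encoding, error = feedparser.convert_to_utf8(headers, raw)
    # encoding -> u'iso-8859-1' (the HTTP charset wins for application/xml)
    # data is UTF-8 bytes with a rewritten <?xml ...?> declaration
    # error -> None; a mismatch would yield CharacterEncodingOverride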
+# Match XML entity declarations.
+# Example: <!ENTITY copyright "(C)">
+RE_ENTITY_PATTERN = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
+
+# Match XML DOCTYPE declarations.
+# Example: <!DOCTYPE feed [ ]>
+RE_DOCTYPE_PATTERN = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
+
+# Match safe entity declarations.
+# This will allow hexadecimal character references through,
+# as well as text, but not arbitrary nested entities.
+# Example: cubed "&#179;"
+# Example: copyright "(C)"
+# Forbidden: explode1 "&explode2;&explode2;"
+RE_SAFE_ENTITY_PATTERN = re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
+
+def replace_doctype(data):
+    '''Strips and replaces the DOCTYPE, returns (rss_version, stripped_data)
 
     rss_version may be 'rss091n' or None
-    stripped_data is the same XML document, minus the DOCTYPE
+    stripped_data is the same XML document with a replaced DOCTYPE
     '''
+
+    # Divide the document into two groups by finding the location
+    # of the first element that doesn't begin with '<?' or '<!'.
     start = re.search(_s2bytes('<\w'), data)
     start = start and start.start() or -1
-    head,data = data[:start+1], data[start+1:]
+    head, data = data[:start+1], data[start+1:]
+
+    # Save and then remove all of the ENTITY declarations.
+    entity_results = RE_ENTITY_PATTERN.findall(head)
+    head = RE_ENTITY_PATTERN.sub(_s2bytes(''), head)
 
-    entity_pattern = re.compile(_s2bytes(r'^\s*<!ENTITY([^>]*?)>'), re.MULTILINE)
-    entity_results=entity_pattern.findall(head)
-    head = entity_pattern.sub(_s2bytes(''), head)
-    doctype_pattern = re.compile(_s2bytes(r'^\s*<!DOCTYPE([^>]*?)>'), re.MULTILINE)
-    doctype_results = doctype_pattern.findall(head)
+    # Find the DOCTYPE declaration and check the feed type.
+    doctype_results = RE_DOCTYPE_PATTERN.findall(head)
     doctype = doctype_results and doctype_results[0] or _s2bytes('')
-    if doctype.lower().count(_s2bytes('netscape')):
+    if _s2bytes('netscape') in doctype.lower():
         version = u'rss091n'
     else:
         version = None
 
-    # only allow in 'safe' inline entity definitions
-    replacement=_s2bytes('')
-    if len(doctype_results)==1 and entity_results:
-        safe_pattern=re.compile(_s2bytes('\s+(\w+)\s+"(&#\w+;|[^&"]*)"'))
-        safe_entities=filter(lambda e: safe_pattern.match(e),entity_results)
+    # Re-insert the safe ENTITY declarations if a DOCTYPE was found.
+    replacement = _s2bytes('')
+    if len(doctype_results) == 1 and entity_results:
+        match_safe_entities = lambda e: RE_SAFE_ENTITY_PATTERN.match(e)
+        safe_entities = filter(match_safe_entities, entity_results)
         if safe_entities:
-            replacement=_s2bytes('<!DOCTYPE feed [\n  <!ENTITY') + _s2bytes('>\n  <!ENTITY ').join(safe_entities) + _s2bytes('>\n]>')
-    data = doctype_pattern.sub(replacement, head) + data
+            replacement = _s2bytes('<!DOCTYPE feed [\n<!ENTITY') \
+                        + _s2bytes('>\n<!ENTITY ').join(safe_entities) \
+                        + _s2bytes('>\n]>')
+    data = RE_DOCTYPE_PATTERN.sub(replacement, head) + data
 
-    return version, data, dict(replacement and [(k.decode('utf-8'), v.decode('utf-8')) for k, v in safe_pattern.findall(replacement)])
+    # Precompute the safe entities for the loose parser.
+    safe_entities = dict((k.decode('utf-8'), v.decode('utf-8'))
+                      for k, v in RE_SAFE_ENTITY_PATTERN.findall(replacement))
+    return version, data, safe_entities
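
replace_doctype() keeps only entity definitions matching
RE_SAFE_ENTITY_PATTERN and hands them to the loose parser. A hypothetical
round trip, assuming the module imports as feedparser:

    import feedparser
    doc = '<!DOCTYPE feed [\n  <!ENTITY copyright "(C)">\n]>\n<feed/>'
    version, data, entities = feedparser.replace_doctype(doc)
    # version  -> None (only a Netscape DOCTYPE yields u'rss091n')
    # entities -> {u'copyright': u'(C)'}
    # A nested definition such as <!ENTITY a "&b;&b;"> would be dropped.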
 
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, response_headers=None):
     '''Parse a feed from a URL, file, stream, or string.
@@ -3874,22 +3957,22 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     if data is None:
         return result
 
-    # there are four encodings to keep track of:
-    # - http_encoding is the encoding declared in the Content-Type HTTP header
-    # - xml_encoding is the encoding declared in the <?xml declaration
-    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the 
XML data
-    # - result['encoding'] is the actual encoding, as per RFC 3023 and a 
variety of other conflicting specifications
-    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, 
acceptable_content_type = \
-        _getCharacterEncoding(http_headers, data)
-    if http_headers and (not acceptable_content_type):
-        if 'content-type' in http_headers:
-            bozo_message = '%s is not an XML media type' % http_headers['content-type']
-        else:
-            bozo_message = 'no Content-type specified'
+    # Stop processing if the server sent HTTP 304 Not Modified.
+    if getattr(f, 'code', 0) == 304:
+        result['version'] = u''
+        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
+            'so the server sent no data.  This is a feature, not a bug!'
+        return result
+
+    data, result['encoding'], error = convert_to_utf8(http_headers, data)
+    use_strict_parser = result['encoding'] and True or False
+    if error is not None:
         result['bozo'] = 1
-        result['bozo_exception'] = NonXMLContentType(bozo_message)
+        result['bozo_exception'] = error
+
+    result['version'], data, entities = replace_doctype(data)
 
-    # ensure that baseuri is an absolute uri using an acceptable URI scheme
+    # Ensure that baseuri is an absolute URI using an acceptable URI scheme.
     contentloc = http_headers.get('content-location', u'')
     href = result.get('href', u'')
    baseuri = _makeSafeAbsoluteURI(href, contentloc) or _makeSafeAbsoluteURI(contentloc) or href
@@ -3898,93 +3981,6 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     if not isinstance(baselang, unicode) and baselang is not None:
         baselang = baselang.decode('utf-8', 'ignore')
 
-    # if server sent 304, we're done
-    if getattr(f, 'code', 0) == 304:
-        result['version'] = u''
-        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
-            'so the server sent no data.  This is a feature, not a bug!'
-        return result
-
-    # if there was a problem downloading, we're done
-    if data is None:
-        return result
-
-    # determine character encoding
-    use_strict_parser = 0
-    known_encoding = 0
-    tried_encodings = []
-    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
-    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
-        if not proposed_encoding:
-            continue
-        if proposed_encoding in tried_encodings:
-            continue
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = _toUTF8(data, proposed_encoding)
-        except (UnicodeDecodeError, LookupError):
-            pass
-        else:
-            known_encoding = use_strict_parser = 1
-            break
-    # if no luck and we have auto-detection library, try that
-    if (not known_encoding) and chardet:
-        proposed_encoding = unicode(chardet.detect(data)['encoding'], 'ascii', 'ignore')
-        if proposed_encoding and (proposed_encoding not in tried_encodings):
-            tried_encodings.append(proposed_encoding)
-            try:
-                data = _toUTF8(data, proposed_encoding)
-            except (UnicodeDecodeError, LookupError):
-                pass
-            else:
-                known_encoding = use_strict_parser = 1
-    # if still no luck and we haven't tried utf-8 yet, try that
-    if (not known_encoding) and (u'utf-8' not in tried_encodings):
-        proposed_encoding = u'utf-8'
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = _toUTF8(data, proposed_encoding)
-        except UnicodeDecodeError:
-            pass
-        else:
-            known_encoding = use_strict_parser = 1
-    # if still no luck and we haven't tried windows-1252 yet, try that
-    if (not known_encoding) and (u'windows-1252' not in tried_encodings):
-        proposed_encoding = u'windows-1252'
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = _toUTF8(data, proposed_encoding)
-        except UnicodeDecodeError:
-            pass
-        else:
-            known_encoding = use_strict_parser = 1
-    # if still no luck and we haven't tried iso-8859-2 yet, try that.
-    if (not known_encoding) and (u'iso-8859-2' not in tried_encodings):
-        proposed_encoding = u'iso-8859-2'
-        tried_encodings.append(proposed_encoding)
-        try:
-            data = _toUTF8(data, proposed_encoding)
-        except UnicodeDecodeError:
-            pass
-        else:
-            known_encoding = use_strict_parser = 1
-    # if still no luck, give up
-    if not known_encoding:
-        result['bozo'] = 1
-        result['bozo_exception'] = CharacterEncodingUnknown( \
-            'document encoding unknown, I tried ' + \
-            '%s, %s, utf-8, windows-1252, and iso-8859-2 but nothing worked' % \
-            (result['encoding'], xml_encoding))
-        result['encoding'] = u''
-    elif proposed_encoding != result['encoding']:
-        result['bozo'] = 1
-        result['bozo_exception'] = CharacterEncodingOverride( \
-            'document declared as %s, but parsed as %s' % \
-            (result['encoding'], proposed_encoding))
-        result['encoding'] = proposed_encoding
-
-    result['version'], data, entities = _stripDoctype(data)
-
     if not _XML_AVAILABLE:
         use_strict_parser = 0
     if use_strict_parser:
@@ -4003,7 +3999,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
         source.setByteStream(_StringIO(data))
         try:
             saxparser.parse(source)
-        except xml.sax.SAXParseException, e:
+        except xml.sax.SAXException, e:
             result['bozo'] = 1
             result['bozo_exception'] = feedparser.exc or e
             use_strict_parser = 0
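
Catching SAXException rather than SAXParseException is strictly broader,
since the parse-time variant subclasses it; SAX errors raised without a
document locator now also fall back to the loose parser instead of escaping
from parse():

    import xml.sax
    assert issubclass(xml.sax.SAXParseException, xml.sax.SAXException)
    # SAXNotRecognizedException and SAXNotSupportedException are
    # SAXException subclasses too, so they now trigger the fallback.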

-----------------------------------------------------------------------

Summary of changes:
 script.module.feedparser/addon.xml         |   32 +-
 script.module.feedparser/changelog.txt     |    6 +-
 script.module.feedparser/lib/feedparser.py |  620 ++++++++++++++--------------
 3 files changed, 328 insertions(+), 330 deletions(-)


hooks/post-receive
-- 
Scripts
