http://www.mediawiki.org/wiki/Special:Code/pywikipedia/11156
Revision: 11156
Author: xqt
Date: 2013-03-02 12:57:54 +0000 (Sat, 02 Mar 2013)
Log Message:
-----------
some PEP8 changes
Modified Paths:
--------------
trunk/pywikipedia/BeautifulSoup.py
Modified: trunk/pywikipedia/BeautifulSoup.py
===================================================================
--- trunk/pywikipedia/BeautifulSoup.py 2013-03-02 10:39:02 UTC (rev 11155)
+++ trunk/pywikipedia/BeautifulSoup.py 2013-03-02 12:57:54 UTC (rev 11156)
@@ -89,9 +89,9 @@
import re
import sgmllib
try:
- from htmlentitydefs import name2codepoint
+ from htmlentitydefs import name2codepoint
except ImportError:
- name2codepoint = {}
+ name2codepoint = {}
try:
set
except NameError:
@@ -103,12 +103,13 @@
DEFAULT_OUTPUT_ENCODING = "utf-8"
+
def _match_css_class(str):
"""Build a RE to match the given CSS class."""
return re.compile(r"(^|.*\s)%s($|\s)" % str)
+
# First, the classes that represent markup elements.
-
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -128,8 +129,8 @@
def replaceWith(self, replaceWith):
oldParent = self.parent
myIndex = self.parent.index(self)
- if hasattr(replaceWith, "parent")\
- and replaceWith.parent is self.parent:
+ if hasattr(replaceWith, "parent") and \
+ replaceWith.parent is self.parent:
# We're replacing this element with one of its siblings.
index = replaceWith.parent.index(replaceWith)
if index and index < myIndex:
@@ -186,11 +187,11 @@
return lastChild
def insert(self, position, newChild):
- if isinstance(newChild, basestring) \
- and not isinstance(newChild, NavigableString):
+ if isinstance(newChild, basestring) and not \
+ isinstance(newChild, NavigableString):
newChild = NavigableString(newChild)
- position = min(position, len(self.contents))
+ position = min(position, len(self.contents))
if hasattr(newChild, 'parent') and newChild.parent is not None:
# We're 'inserting' an element that's already one
# of this object's children.
@@ -227,7 +228,7 @@
while not parentsNextSibling:
parentsNextSibling = parent.nextSibling
parent = parent.parent
- if not parent: # This is the last element in the document.
+ if not parent: # This is the last element in the document.
break
if parentsNextSibling:
newChildsLastElement.next = parentsNextSibling
@@ -272,8 +273,9 @@
criteria and appear after this Tag in the document."""
return self._findAll(name, attrs, text, limit,
self.nextSiblingGenerator, **kwargs)
- fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+ fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
+
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and
appears before this Tag in the document."""
@@ -284,8 +286,8 @@
"""Returns all items that match the given criteria and appear
before this Tag in the document."""
return self._findAll(name, attrs, text, limit, self.previousGenerator,
- **kwargs)
- fetchPrevious = findAllPrevious # Compatibility with pre-3.x
+ **kwargs)
+ fetchPrevious = findAllPrevious # Compatibility with pre-3.x
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the closest sibling to this Tag that matches the
@@ -299,7 +301,7 @@
criteria and appear before this Tag in the document."""
return self._findAll(name, attrs, text, limit,
self.previousSiblingGenerator, **kwargs)
- fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
+ fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
def findParent(self, name=None, attrs={}, **kwargs):
"""Returns the closest parent of this Tag that matches the given
@@ -318,8 +320,9 @@
return self._findAll(name, attrs, None, limit, self.parentGenerator,
**kwargs)
- fetchParents = findParents # Compatibility with pre-3.x
+ fetchParents = findParents # Compatibility with pre-3.x
+
#These methods do the real heavy lifting.
def _findOne(self, method, name, attrs, text, **kwargs):
@@ -415,11 +418,12 @@
s = unicode(s)
else:
if encoding:
- s = self.toEncoding(str(s), encoding)
+ s = self.toEncoding(str(s), encoding)
else:
s = unicode(s)
return s
+
class NavigableString(unicode, PageElement):
def __new__(cls, value):
@@ -444,7 +448,8 @@
if attr == 'string':
return self
else:
-            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
+ raise AttributeError("'%s' object has no attribute '%s'"
+ % (self.__class__.__name__, attr))
def __unicode__(self):
return str(self).decode(DEFAULT_OUTPUT_ENCODING)
@@ -455,11 +460,13 @@
else:
return self
+
class CData(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
+
class ProcessingInstruction(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
output = self
@@ -467,14 +474,17 @@
output = self.substituteEncoding(output, encoding)
return "<?%s?>" % self.toEncoding(output, encoding)
+
class Comment(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<!--%s-->" % NavigableString.__str__(self, encoding)
+
class Declaration(NavigableString):
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
return "<!%s>" % NavigableString.__str__(self, encoding)
+
class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
@@ -482,15 +492,15 @@
def _invert(h):
"Cheap function to invert a hash."
i = {}
- for k,v in h.items():
+ for k, v in h.items():
i[v] = k
return i
- XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
- "quot" : '"',
- "amp" : "&",
- "lt" : "<",
- "gt" : ">" }
+ XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'",
+ "quot": '"',
+ "amp": "&",
+ "lt": "<",
+ "gt": ">"}
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
@@ -549,8 +559,8 @@
self.attrs = map(convert, self.attrs)
def getString(self):
- if (len(self.contents) == 1
- and isinstance(self.contents[0], NavigableString)):
+ if (len(self.contents) == 1 and isinstance(self.contents[0],
+ NavigableString)):
return self.contents[0]
def setString(self, string):
@@ -592,7 +602,7 @@
raise ValueError("Tag.index: element not in tag")
def has_key(self, key):
- return self._getAttrMap().has_key(key)
+ return key in self._getAttrMap()
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
@@ -636,7 +646,7 @@
#We don't break because bad HTML can define the same
#attribute multiple times.
self._getAttrMap()
- if self.attrMap.has_key(key):
+ if key in self.attrMap:
del self.attrMap[key]
def __call__(self, *args, **kwargs):
@@ -651,7 +661,8 @@
return self.find(tag[:-3])
elif tag.find('__') != 0:
return self.find(tag)
-            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+ raise AttributeError("'%s' object has no attribute '%s'"
+ % (self.__class__, tag))
def __eq__(self, other):
"""Returns true iff this tag has the same name, the same attributes,
@@ -661,7 +672,9 @@
same attributes in a different order. Should this be fixed?"""
if other is self:
return True
-        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ if not hasattr(other, 'name') or not hasattr(other, 'attrs') or \
+ not hasattr(other, 'contents') or self.name != other.name or \
+ self.attrs != other.attrs or len(self) != len(other):
return False
for i in range(0, len(self.contents)):
if self.contents[i] != other.contents[i]:
@@ -734,7 +747,8 @@
# value might also contain angle brackets, or
# ampersands that aren't part of entities. We need
# to escape those to XML entities too.
-                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
+ val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity,
+ val)
attrs.append(fmt % (self.toEncoding(key, encoding),
self.toEncoding(val, encoding)))
@@ -798,7 +812,7 @@
prettyPrint=False, indentLevel=0):
"""Renders the contents of this tag as a string in the given
encoding. If encoding is None, returns a Unicode string.."""
- s=[]
+ s = []
for c in self:
text = None
if isinstance(c, NavigableString):
@@ -912,13 +926,13 @@
if isinstance(markupName, Tag):
markup = markupName
markupAttrs = markup
- callFunctionWithTagData = callable(self.name) \
- and not isinstance(markupName, Tag)
+ callFunctionWithTagData = callable(self.name) and \
+ not isinstance(markupName, Tag)
if (not self.name) \
- or callFunctionWithTagData \
- or (markup and self._matches(markup, self.name)) \
- or (not markup and self._matches(markupName, self.name)):
+ or callFunctionWithTagData \
+ or (markup and self._matches(markup, self.name)) \
+ or (not markup and self._matches(markupName, self.name)):
if callFunctionWithTagData:
match = self.name(markupName, markupAttrs)
else:
@@ -926,11 +940,11 @@
markupAttrMap = None
for attr, matchAgainst in self.attrs.items():
if not markupAttrMap:
- if hasattr(markupAttrs, 'get'):
+ if hasattr(markupAttrs, 'get'):
markupAttrMap = markupAttrs
- else:
+ else:
markupAttrMap = {}
- for k,v in markupAttrs:
+ for k, v in markupAttrs:
markupAttrMap[k] = v
attrValue = markupAttrMap.get(attr)
if not self._matches(attrValue, matchAgainst):
@@ -948,11 +962,10 @@
found = None
# If given a list of items, scan it for a text element that
# matches.
- if hasattr(markup, "__iter__") \
- and not isinstance(markup, Tag):
+ if hasattr(markup, "__iter__") and not isinstance(markup, Tag):
for element in markup:
- if isinstance(element, NavigableString) \
- and self.search(element):
+ if isinstance(element, NavigableString) and \
+ self.search(element):
found = element
break
# If it's a Tag, make sure its name or attributes match.
@@ -961,13 +974,13 @@
if not self.text:
found = self.searchTag(markup)
# If it's text, make sure the text matches.
- elif isinstance(markup, NavigableString) or \
- isinstance(markup, basestring):
+ elif isinstance(markup, NavigableString) or isinstance(markup,
+ basestring):
if self._matches(markup, self.text):
found = markup
else:
- raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
+ raise Exception("I don't know how to match against a %s"
+ % markup.__class__)
return found
def _matches(self, markup, matchAgainst):
@@ -988,10 +1001,10 @@
if hasattr(matchAgainst, 'match'):
# It's a regexp object.
result = markup and matchAgainst.search(markup)
- elif hasattr(matchAgainst, '__iter__'): # list-like
+ elif hasattr(matchAgainst, '__iter__'): # list-like
result = markup in matchAgainst
elif hasattr(matchAgainst, 'items'):
- result = markup.has_key(matchAgainst)
+ result = matchAgainst in markup
elif matchAgainst and isinstance(markup, basestring):
if isinstance(markup, unicode):
matchAgainst = unicode(matchAgainst)
@@ -1002,6 +1015,7 @@
result = matchAgainst == markup
return result
+
class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
@@ -1009,6 +1023,7 @@
list.__init__([])
self.source = source
+
# Now, some helper functions.
def buildTagMap(default, *args):
@@ -1019,9 +1034,9 @@
for portion in args:
if hasattr(portion, 'items'):
#It's a map. Merge it.
- for k,v in portion.items():
+ for k, v in portion.items():
built[k] = v
- elif hasattr(portion, '__iter__'): # is a list
+ elif hasattr(portion, '__iter__'): # is a list
#It's a list. Map each item to the default.
for k in portion:
built[k] = default
@@ -1030,6 +1045,7 @@
built[portion] = default
return built
+
# Now, the parser classes.
class BeautifulStoneSoup(Tag, SGMLParser):
@@ -1074,7 +1090,7 @@
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+ STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
@@ -1151,7 +1167,7 @@
n = int(name)
except ValueError:
return
- if not 0 <= n <= 127 : # ASCII ends at 127, not 255
+ if not 0 <= n <= 127: # ASCII ends at 127, not 255
return
return self.convert_codepoint(n)
@@ -1162,9 +1178,10 @@
if not hasattr(self, 'originalEncoding'):
self.originalEncoding = None
else:
- dammit = UnicodeDammit\
- (markup, [self.fromEncoding, inDocumentEncoding],
- smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
+ dammit = UnicodeDammit(markup,
+ [self.fromEncoding, inDocumentEncoding],
+ smartQuotesTo=self.smartQuotesTo,
+ isHTML=isHTML)
markup = dammit.unicode
self.originalEncoding = dammit.originalEncoding
self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
@@ -1194,7 +1211,7 @@
#print "__getattr__ called on %s.%s" % (self.__class__, methodName)
if methodName.startswith('start_') or methodName.startswith('end_') \
- or methodName.startswith('do_'):
+ or methodName.startswith('do_'):
return SGMLParser.__getattr__(self, methodName)
elif not methodName.startswith('__'):
return Tag.__getattr__(self, methodName)
@@ -1204,8 +1221,8 @@
def isSelfClosingTag(self, name):
"""Returns true iff the given string is the name of a
self-closing tag according to this parser."""
- return self.SELF_CLOSING_TAGS.has_key(name) \
- or self.instanceSelfClosingTags.has_key(name)
+ return name in self.SELF_CLOSING_TAGS or \
+ name in self.instanceSelfClosingTags
def reset(self):
Tag.__init__(self, self, self.ROOT_TAG_NAME)
@@ -1244,8 +1261,8 @@
currentData = ' '
self.currentData = []
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
- (not self.parseOnlyThese.text or \
- not self.parseOnlyThese.search(currentData)):
+ (not self.parseOnlyThese.text or not
+ self.parseOnlyThese.search(currentData)):
return
o = containerClass(currentData)
o.setup(self.currentTag, self.previous)
@@ -1254,7 +1271,6 @@
self.previous = o
self.currentTag.contents.append(o)
-
def _popToTag(self, name, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
@@ -1296,8 +1312,8 @@
"""
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
- isNestable = nestingResetTriggers != None
- isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
+ isNestable = nestingResetTriggers is not None
+ isResetNesting = name in self.RESET_NESTING_TAGS
popTo = None
inclusive = True
for i in range(len(self.tagStack)-1, 0, -1):
@@ -1310,7 +1326,7 @@
if (nestingResetTriggers is not None
and p.name in nestingResetTriggers) \
or (nestingResetTriggers is None and isResetNesting
- and self.RESET_NESTING_TAGS.has_key(p.name)):
+ and p.name in self.RESET_NESTING_TAGS):
#If we encounter one of the nesting reset triggers
#peculiar to this tag, or we encounter another tag
@@ -1337,7 +1353,8 @@
self._smartPop(name)
if self.parseOnlyThese and len(self.tagStack) <= 1 \
-           and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
+ and (self.parseOnlyThese.text or
+ not self.parseOnlyThese.searchTag(name, attrs)):
return
tag = Tag(self, name, attrs, self.currentTag, self.previous)
@@ -1411,7 +1428,7 @@
data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
if not data and self.convertHTMLEntities and \
- not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
+ not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
# TODO: We've got a problem here. We're told this is
# an entity reference, but it's not an XML entity
# reference or an HTML entity reference. Nonetheless,
@@ -1448,12 +1465,12 @@
declaration as a CData object."""
j = None
if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
+ k = self.rawdata.find(']]>', i)
+ if k == -1:
+ k = len(self.rawdata)
+ data = self.rawdata[i+9:k]
+ j = k + 3
+ self._toStringSubclass(data, CData)
else:
try:
j = SGMLParser.parse_declaration(self, i)
@@ -1463,6 +1480,7 @@
j = i + len(toHandle)
return j
+
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
@@ -1512,18 +1530,18 @@
BeautifulStoneSoup before writing your own subclass."""
def __init__(self, *args, **kwargs):
- if not kwargs.has_key('smartQuotesTo'):
+ if not 'smartQuotesTo' in kwargs:
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
kwargs['isHTML'] = True
BeautifulStoneSoup.__init__(self, *args, **kwargs)
SELF_CLOSING_TAGS = buildTagMap(None,
- ('br' , 'hr', 'input', 'img', 'meta',
+ ('br', 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base', 'col'))
PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
- QUOTE_TAGS = {'script' : None, 'textarea' : None}
+ QUOTE_TAGS = {'script': None, 'textarea': None}
#According to the HTML standard, each of these inline tags can
#contain another tag of the same type. Furthermore, it's common
@@ -1537,21 +1555,21 @@
NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
#Lists can contain other lists, but there are restrictions.
- NESTABLE_LIST_TAGS = { 'ol' : [],
- 'ul' : [],
- 'li' : ['ul', 'ol'],
- 'dl' : [],
- 'dd' : ['dl'],
- 'dt' : ['dl'] }
+ NESTABLE_LIST_TAGS = {'ol': [],
+ 'ul': [],
+ 'li': ['ul', 'ol'],
+ 'dl': [],
+ 'dd': ['dl'],
+ 'dt': ['dl']}
#Tables can contain other tables, but there are restrictions.
- NESTABLE_TABLE_TAGS = {'table' : [],
- 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
- 'td' : ['tr'],
- 'th' : ['tr'],
- 'thead' : ['table'],
- 'tbody' : ['table'],
- 'tfoot' : ['table'],
+ NESTABLE_TABLE_TAGS = {'table': [],
+ 'tr': ['table', 'tbody', 'tfoot', 'thead'],
+ 'td': ['tr'],
+ 'th': ['tr'],
+ 'thead': ['table'],
+ 'tbody': ['table'],
+ 'tfoot': ['table'],
}
NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
@@ -1587,11 +1605,11 @@
contentType = value
contentTypeIndex = i
- if httpEquiv and contentType: # It's an interesting meta tag.
+ if httpEquiv and contentType: # It's an interesting meta tag.
match = self.CHARSET_RE.search(contentType)
if match:
if (self.declaredHTMLEncoding is not None or
- self.originalEncoding == self.fromEncoding):
+ self.originalEncoding == self.fromEncoding):
# An HTML encoding was sniffed while converting
# the document to Unicode, or an HTML encoding was
# sniffed during a previous pass through the
@@ -1616,9 +1634,11 @@
if tag and tagNeedsEncodingSubstitution:
tag.containsSubstitutions = True
+
class StopParsing(Exception):
pass
+
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
"""The BeautifulSoup class is oriented towards skipping over
@@ -1644,10 +1664,10 @@
it's valid HTML and BeautifulSoup screwed up by assuming it
wouldn't be."""
- I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
- ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
- 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
- 'big')
+ I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = (
+ 'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 'cite',
+ 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 'big'
+ )
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
@@ -1655,6 +1675,7 @@
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
+
class MinimalSoup(BeautifulSoup):
"""The MinimalSoup class is for parsing HTML that contains
pathologically bad markup. It makes no assumptions about tag
@@ -1668,6 +1689,7 @@
RESET_NESTING_TAGS = buildTagMap('noscript')
NESTABLE_TAGS = {}
+
class BeautifulSOAP(BeautifulStoneSoup):
"""This class will push a tag with only a single string child into
the tag's parent as an attribute. The attribute's name is the tag
@@ -1695,10 +1717,11 @@
parent._getAttrMap()
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
isinstance(tag.contents[0], NavigableString) and
- not parent.attrMap.has_key(tag.name)):
+ not tag.name in parent.attrMap):
parent[tag.name] = tag.contents[0]
BeautifulStoneSoup.popTag(self)
+
#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
@@ -1749,6 +1772,7 @@
except ImportError:
pass
+
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
@@ -1759,14 +1783,14 @@
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
- CHARSET_ALIASES = { "macintosh" : "mac-roman",
- "x-sjis" : "shift-jis" }
+ CHARSET_ALIASES = {"macintosh": "mac-roman",
+ "x-sjis": "shift-jis"}
def __init__(self, markup, overrideEncodings=[],
smartQuotesTo='xml', isHTML=False):
self.declaredHTMLEncoding = None
self.markup, documentEncoding, sniffedEncoding = \
- self._detectEncoding(markup, isHTML)
+ self._detectEncoding(markup, isHTML)
self.smartQuotesTo = smartQuotesTo
self.triedEncodings = []
if markup == '' or isinstance(markup, unicode):
@@ -1819,9 +1843,8 @@
if self.smartQuotesTo and proposed.lower() in("windows-1252",
"iso-8859-1",
"iso-8859-2"):
- markup = re.compile("([\x80-\x9f])").sub \
- (lambda(x): self._subMSChar(x.group(1)),
- markup)
+ markup = re.compile("([\x80-\x9f])").sub(
+ lambda(x): self._subMSChar(x.group(1)), markup)
try:
# print "Trying to convert document to %s" % proposed
@@ -1841,11 +1864,11 @@
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
- and (data[2:4] != '\x00\x00'):
+ and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
- elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
- and (data[2:4] != '\x00\x00'):
+ elif (len(data) >= 4) and \
+ (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
@@ -1871,8 +1894,8 @@
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
- and (xml_data[2:4] != '\x00\x00'):
+ elif (len(xml_data) >= 4) and \
+                (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
@@ -1881,7 +1904,7 @@
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
- (xml_data[2:4] != '\x00\x00'):
+ (xml_data[2:4] != '\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
@@ -1927,7 +1950,6 @@
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
-
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
@@ -1945,63 +1967,70 @@
return codec
EBCDIC_TO_ASCII_MAP = None
+
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
- emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
- 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
- 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
- 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
- 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
- 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
- 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
- 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
- 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
- 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
- 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
- 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
- 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
- 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
- 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
- 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
- 250,251,252,253,254,255)
+            emap = (0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14,
+                    15, 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28,
+                    29, 30, 31, 128, 129, 130, 131, 132, 10, 23, 27, 136, 137,
+                    138, 139, 140, 5, 6, 7, 144, 145, 22, 147, 148, 149, 150, 4,
+                    152, 153, 154, 155, 20, 21, 158, 26, 32, 160, 161, 162, 163,
+                    164, 165, 166, 167, 168, 91, 46, 60, 40, 43, 33, 38, 169,
+                    170, 171, 172, 173, 174, 175, 176, 177, 93, 36, 42, 41, 59,
+                    94, 45, 47, 178, 179, 180, 181, 182, 183, 184, 185, 124, 44,
+                    37, 95, 62, 63, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+                    96, 58, 35, 64, 39, 61, 34, 195, 97, 98, 99, 100, 101, 102,
+                    103, 104, 105, 196, 197, 198, 199, 200, 201, 202, 106, 107,
+                    108, 109, 110, 111, 112, 113, 114, 203, 204, 205, 206, 207,
+                    208, 209, 126, 115, 116, 117, 118, 119, 120, 121, 122, 210,
+                    211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222,
+                    223, 224, 225, 226, 227, 228, 229, 230, 231, 123, 65, 66,
+                    67, 68, 69, 70, 71, 72, 73, 232, 233, 234, 235, 236, 237,
+                    125, 74, 75, 76, 77, 78, 79, 80, 81, 82, 238, 239, 240, 241,
+                    242, 243, 92, 159, 83, 84, 85, 86, 87, 88, 89, 90, 244, 245,
+                    246, 247, 248, 249, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+                    250, 251, 252, 253, 254, 255)
import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
+ c.EBCDIC_TO_ASCII_MAP = string.maketrans(''.join(map(chr,
+ range(256))),
+ ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
- MS_CHARS = { '\x80' : ('euro', '20AC'),
- '\x81' : ' ',
- '\x82' : ('sbquo', '201A'),
- '\x83' : ('fnof', '192'),
- '\x84' : ('bdquo', '201E'),
- '\x85' : ('hellip', '2026'),
- '\x86' : ('dagger', '2020'),
- '\x87' : ('Dagger', '2021'),
- '\x88' : ('circ', '2C6'),
- '\x89' : ('permil', '2030'),
- '\x8A' : ('Scaron', '160'),
- '\x8B' : ('lsaquo', '2039'),
- '\x8C' : ('OElig', '152'),
- '\x8D' : '?',
- '\x8E' : ('#x17D', '17D'),
- '\x8F' : '?',
- '\x90' : '?',
- '\x91' : ('lsquo', '2018'),
- '\x92' : ('rsquo', '2019'),
- '\x93' : ('ldquo', '201C'),
- '\x94' : ('rdquo', '201D'),
- '\x95' : ('bull', '2022'),
- '\x96' : ('ndash', '2013'),
- '\x97' : ('mdash', '2014'),
- '\x98' : ('tilde', '2DC'),
- '\x99' : ('trade', '2122'),
- '\x9a' : ('scaron', '161'),
- '\x9b' : ('rsaquo', '203A'),
- '\x9c' : ('oelig', '153'),
- '\x9d' : '?',
- '\x9e' : ('#x17E', '17E'),
- '\x9f' : ('Yuml', ''),}
+ MS_CHARS = {
+ '\x80': ('euro', '20AC'),
+ '\x81': ' ',
+ '\x82': ('sbquo', '201A'),
+ '\x83': ('fnof', '192'),
+ '\x84': ('bdquo', '201E'),
+ '\x85': ('hellip', '2026'),
+ '\x86': ('dagger', '2020'),
+ '\x87': ('Dagger', '2021'),
+ '\x88': ('circ', '2C6'),
+ '\x89': ('permil', '2030'),
+ '\x8A': ('Scaron', '160'),
+ '\x8B': ('lsaquo', '2039'),
+ '\x8C': ('OElig', '152'),
+ '\x8D': '?',
+ '\x8E': ('#x17D', '17D'),
+ '\x8F': '?',
+ '\x90': '?',
+ '\x91': ('lsquo', '2018'),
+ '\x92': ('rsquo', '2019'),
+ '\x93': ('ldquo', '201C'),
+ '\x94': ('rdquo', '201D'),
+ '\x95': ('bull', '2022'),
+ '\x96': ('ndash', '2013'),
+ '\x97': ('mdash', '2014'),
+ '\x98': ('tilde', '2DC'),
+ '\x99': ('trade', '2122'),
+ '\x9a': ('scaron', '161'),
+ '\x9b': ('rsaquo', '203A'),
+ '\x9c': ('oelig', '153'),
+ '\x9d': '?',
+ '\x9e': ('#x17E', '17E'),
+ '\x9f': ('Yuml', ''),
+ }
#######################################################################
_______________________________________________
Pywikipedia-svn mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-svn