Revision: 166ba7bc2fdd
Author: Pekka Klärck
Date: Mon Sep 12 13:19:13 2011
Log: Reverted changes to use lxml module for HTML parsing when
available.
Update issue 942
Status: WontFix
Owner:
Cc:
Although lxml turned out to be quite a bit faster than the standard
HTMLParser (parsing a larger HTML file was about 70% faster) we faced
several problems with the change and decided to revert it. The biggest
problems are itemized below. We can think about this again if we ever
decide to support lxml for XML parsing.
Problems:
- After the refactorings needed to support different parsers, the standard
parser got about 5% slower. That was rather surprising considering the
changes, but apparently the parser is used so heavily that even small
changes matter. Nevertheless, using the standard parser is the common case
and we don't want to slow it.
- lxml isn't readily and easily available for all platforms. For example we
failed to install it into our CI server which has Python 2.5 on 64bit Linux.
- In some cases lxml parsed malformed HTML slightly differently than the
standard parser. This could have caused backwards compatibility problems.
- Documenting the lxml support would have required some additional work.
- Parsing the same data in a plain text file was still 50% faster than
parsing it in HTML with lxml. People with large test suites should switch
to plain text format.
http://code.google.com/p/robotframework/source/detail?r=166ba7bc2fdd
Deleted:
/src/robot/parsing/lxmlhtmlparser.py
/src/robot/parsing/stdhtmlparser.py
Modified:
/src/robot/parsing/htmlreader.py
/utest/parsing/test_htmlreader.py
=======================================
--- /src/robot/parsing/lxmlhtmlparser.py Wed Aug 31 16:28:16 2011
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright 2008-2011 Nokia Siemens Networks Oyj
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-if os.environ.get('ROBOT_NO_LXML'):
- raise ImportError('Using lxml is disabled')
-from lxml import etree
-
-
-class RobotHtmlParser(object):
-
- def __init__(self, reader):
- self._reader = reader
-
- def parse(self, htmlfile):
- parser = etree.HTMLParser(target=self)
- etree.parse(htmlfile, parser)
-
- def start(self, tag, attrs):
- self._reader.start(tag)
-
- def end(self, tag):
- self._reader.end(tag)
-
- def data(self, data):
- self._reader.data(self._normalize_nbsp_and_tilde(data))
-
- def _normalize_nbsp_and_tilde(self, data):
- return data.replace(u'\xa0', u' ').replace(u'\u02dc', u'~')
-
- def close(self):
- pass
=======================================
--- /src/robot/parsing/stdhtmlparser.py Wed Aug 31 16:18:32 2011
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright 2008-2011 Nokia Siemens Networks Oyj
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import HTMLParser
-from htmlentitydefs import entitydefs
-import sys
-
-extra_entitydefs = {'nbsp': ' ', 'apos': "'", 'tilde': '~'}
-
-
-# Workaround for following bug in Python 2.6: http://bugs.python.org/issue3932
-if sys.version_info[:2] > (2, 5):
- def _unescape_from_py25(self, s):
- if '&' not in s:
- return s
- s = s.replace("&lt;", "<")
- s = s.replace("&gt;", ">")
- s = s.replace("&apos;", "'")
- s = s.replace("&quot;", '"')
- s = s.replace("&amp;", "&") # Must be last
- return s
-
- HTMLParser.HTMLParser.unescape = _unescape_from_py25
-
-
-class RobotHtmlParser(HTMLParser.HTMLParser):
-
- def __init__(self, reader):
- HTMLParser.HTMLParser.__init__(self)
- self._reader = reader
- self._encoding = 'ISO-8859-1'
-
- def parse(self, htmlfile):
- for line in htmlfile:
- self.feed(line)
- self.close()
-
- def handle_starttag(self, tag, attrs):
- if tag == 'meta':
- self._set_encoding(self._get_encoding_from_meta(attrs))
- else:
- self._reader.start(tag)
-
- def _set_encoding(self, encoding):
- if encoding:
- self._encoding = encoding
-
- def handle_endtag(self, tag):
- self._reader.end(tag)
-
- def handle_data(self, data, decode=True):
- if decode:
- data = data.decode(self._encoding)
- self._reader.data(data)
-
- def handle_entityref(self, name):
- value = self._handle_entityref(name)
- self.handle_data(value, decode=False)
-
- def _handle_entityref(self, name):
- if extra_entitydefs.has_key(name):
- return extra_entitydefs[name]
- try:
- value = entitydefs[name]
- except KeyError:
- return '&'+name+';'
- if value.startswith('&#'):
- return unichr(int(value[2:-1]))
- return value.decode('ISO-8859-1')
-
- def handle_charref(self, number):
- value = self._handle_charref(number)
- self.handle_data(value, decode=False)
-
- def _handle_charref(self, number):
- try:
- return unichr(int(number))
- except ValueError:
- return '&#'+number+';'
-
- def handle_pi(self, data):
- self._set_encoding(self._get_encoding_from_pi(data))
-
- def _get_encoding_from_pi(self, data):
- data = data.strip()
- if not data.lower().startswith('xml '):
- return None
- if data.endswith('?'):
- data = data[:-1]
- for token in data.split():
- if token.lower().startswith('encoding='):
- encoding = token[9:]
- if encoding.startswith("'") or encoding.startswith('"'):
- encoding = encoding[1:-1]
- return encoding
- return None
-
- def unknown_decl(self, data):
- # Ignore everything even if it's invalid. This kind of stuff comes
- # at least from MS Excel
- pass
-
- def _get_encoding_from_meta(self, attrs):
- valid_http_equiv = False
- encoding = None
- for name, value in attrs:
- name = name.lower()
- if name == 'http-equiv' and value.lower() == 'content-type':
- valid_http_equiv = True
- if name == 'content':
- for token in value.split(';'):
- token = token.strip()
- if token.lower().startswith('charset='):
- encoding = token[8:]
- return encoding if valid_http_equiv else None
-
=======================================
--- /src/robot/parsing/htmlreader.py Wed Aug 31 16:18:32 2011
+++ /src/robot/parsing/htmlreader.py Mon Sep 12 13:19:13 2011
@@ -12,48 +12,99 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-try:
- from lxmlhtmlparser import RobotHtmlParser
-except ImportError:
- from stdhtmlparser import RobotHtmlParser
-
-
-class HtmlReader(object):
+
+import HTMLParser
+import sys
+from htmlentitydefs import entitydefs
+
+extra_entitydefs = {'nbsp': ' ', 'apos': "'", 'tilde': '~'}
+
+
+class HtmlReader(HTMLParser.HTMLParser):
IGNORE = 0
INITIAL = 1
PROCESS = 2
- def __init__(self, parser=RobotHtmlParser):
- self._parser = parser(self)
- self._start_handlers = {'table': self.table_start,
- 'tr': self.tr_start,
- 'td': self.td_start,
- 'th': self.td_start,
- 'br': self.br_start}
- self._end_handlers = {'table': self.table_end,
- 'tr': self.tr_end,
- 'td': self.td_end,
- 'th': self.td_end}
+ def __init__(self):
+ HTMLParser.HTMLParser.__init__(self)
+ self._encoding = 'ISO-8859-1'
+ self._handlers = { 'table_start' : self.table_start,
+ 'table_end' : self.table_end,
+ 'tr_start' : self.tr_start,
+ 'tr_end' : self.tr_end,
+ 'td_start' : self.td_start,
+ 'td_end' : self.td_end,
+ 'th_start' : self.td_start,
+ 'th_end' : self.td_end,
+ 'br_start' : self.br_start,
+ 'meta_start' : self.meta_start }
def read(self, htmlfile, populator):
self.populator = populator
self.state = self.IGNORE
self.current_row = None
self.current_cell = None
- self._parser.parse(htmlfile)
+ for line in htmlfile.readlines():
+ self.feed(line)
+ # Calling close is required by the HTMLParser but may cause problems
+ # if the same instance of our HtmlParser is reused. Currently it's
+ # used only once so there's no problem.
+ self.close()
self.populator.eof()
- def start(self, tag):
- handler = self._start_handlers.get(tag)
- if handler:
- handler()
-
- def end(self, tag):
- handler = self._end_handlers.get(tag)
- if handler:
+ def handle_starttag(self, tag, attrs):
+ handler = self._handlers.get(tag+'_start')
+ if handler is not None:
+ handler(attrs)
+
+ def handle_endtag(self, tag):
+ handler = self._handlers.get(tag+'_end')
+ if handler is not None:
handler()
- def table_start(self):
+ def handle_data(self, data, decode=True):
+ if self.state == self.IGNORE or self.current_cell is None:
+ return
+ if decode:
+ data = data.decode(self._encoding)
+ self.current_cell.append(data)
+
+ def handle_entityref(self, name):
+ value = self._handle_entityref(name)
+ self.handle_data(value, decode=False)
+
+ def _handle_entityref(self, name):
+ if extra_entitydefs.has_key(name):
+ return extra_entitydefs[name]
+ try:
+ value = entitydefs[name]
+ except KeyError:
+ return '&'+name+';'
+ if value.startswith('&#'):
+ return unichr(int(value[2:-1]))
+ return value.decode('ISO-8859-1')
+
+ def handle_charref(self, number):
+ value = self._handle_charref(number)
+ self.handle_data(value, decode=False)
+
+ def _handle_charref(self, number):
+ try:
+ return unichr(int(number))
+ except ValueError:
+ return '&#'+number+';'
+
+ def handle_pi(self, data):
+ encoding = self._get_encoding_from_pi(data)
+ if encoding:
+ self._encoding = encoding
+
+ def unknown_decl(self, data):
+ # Ignore everything even if it's invalid. This kind of stuff comes
+ # at least from MS Excel
+ pass
+
+ def table_start(self, attrs=None):
self.state = self.INITIAL
self.current_row = None
self.current_cell = None
@@ -63,7 +114,7 @@
self.tr_end()
self.state = self.IGNORE
- def tr_start(self):
+ def tr_start(self, attrs=None):
if self.current_row is not None:
self.tr_end()
self.current_row = []
@@ -74,7 +125,7 @@
if self.current_cell is not None:
self.td_end()
if self.state == self.INITIAL:
- if self.current_row:
+ if len(self.current_row) > 0:
if self.populator.start_table(self.current_row):
self.state = self.PROCESS
else:
@@ -85,7 +136,7 @@
self.populator.add(self.current_row)
self.current_row = None
- def td_start(self):
+ def td_start(self, attrs=None):
if self.current_cell is not None:
self.td_end()
if self.current_row is None:
@@ -98,10 +149,54 @@
self.current_row.append(cell)
self.current_cell = None
- def br_start(self):
+ def br_start(self, attrs=None):
if self.current_cell is not None and self.state != self.IGNORE:
self.current_cell.append('\n')
- def data(self, data):
- if self.current_cell is not None and self.state != self.IGNORE:
- self.current_cell.append(data)
+ def meta_start(self, attrs):
+ encoding = self._get_encoding_from_meta(attrs)
+ if encoding:
+ self._encoding = encoding
+
+ def _get_encoding_from_meta(self, attrs):
+ valid_http_equiv = False
+ encoding = None
+ for name, value in attrs:
+ name = name.lower()
+ if name == 'http-equiv' and value.lower() == 'content-type':
+ valid_http_equiv = True
+ if name == 'content':
+ for token in value.split(';'):
+ token = token.strip()
+ if token.lower().startswith('charset='):
+ encoding = token[8:]
+ return valid_http_equiv and encoding or None
+
+ def _get_encoding_from_pi(self, data):
+ data = data.strip()
+ if not data.lower().startswith('xml '):
+ return None
+ if data.endswith('?'):
+ data = data[:-1]
+ for token in data.split():
+ if token.lower().startswith('encoding='):
+ encoding = token[9:]
+ if encoding.startswith("'") or encoding.startswith('"'):
+ encoding = encoding[1:-1]
+ return encoding
+ return None
+
+
+# Workaround for following bug in Python 2.6: http://bugs.python.org/issue3932
+if sys.version_info[:2] > (2, 5):
+ def unescape_from_py25(self, s):
+ if '&' not in s:
+ return s
+ s = s.replace("&lt;", "<")
+ s = s.replace("&gt;", ">")
+ s = s.replace("&apos;", "'")
+ s = s.replace("&quot;", '"')
+ s = s.replace("&amp;", "&") # Must be last
+ return s
+
+ HTMLParser.HTMLParser.unescape = unescape_from_py25
=======================================
--- /utest/parsing/test_htmlreader.py Wed Aug 31 16:15:39 2011
+++ /utest/parsing/test_htmlreader.py Mon Sep 12 13:19:13 2011
@@ -1,15 +1,9 @@
+import sys
import unittest
-from StringIO import StringIO
+from types import UnicodeType
from robot.parsing.htmlreader import HtmlReader
-from robot.parsing.stdhtmlparser import RobotHtmlParser as StdHtmlParser
-try:
- from robot.parsing.lxmlhtmlparser import RobotHtmlParser as
LxmlHtmlParser
-except ImportError:
- def LxmlHtmlParser(*args):
- raise RuntimeError('This test requires lxml module to be
installed')
-
-from robot.utils.asserts import assert_equals
+from robot.utils.asserts import *
VALID_TABLES = [ "Variable", "Setting", "Test Case", "Test
Suite", "Keyword" ]
@@ -35,50 +29,50 @@
def add(self, cells):
self.tables[self.current].append(cells)
- def eof(self):
- pass
-
-
-class TestHtmlReaderWithStdHtmlParser(unittest.TestCase):
- parser = StdHtmlParser
+
+class TestHtmlReader(unittest.TestCase):
def setUp(self):
- self.reader = HtmlReader(parser=self.parser)
-
- def _read(self, *data):
- self.reader.read(StringIO('\n'.join(data)), PopulatorMock())
-
- def test_empty_table(self):
- self._read('<table></table>')
+ self.reader = HtmlReader()
+ self.reader.populator = PopulatorMock()
+
+ def test_initial_state(self):
+ self.reader.state = self.reader.IGNORE
+ self.reader.feed('<table>')
+ assert_equals(self.reader.state, self.reader.INITIAL)
+ self.reader.feed('</table>')
assert_equals(self.reader.state, self.reader.IGNORE)
def test_start_valid_table(self):
for name in VALID_TABLES:
- self._read('<table>', ROW_TEMPLATE % (name, 'Value
1', 'Value2'))
+ self.reader.feed('<table>')
+ self.reader.feed(ROW_TEMPLATE % (name, 'Value 1', 'Value2'))
+ assert_equals(self.reader.state, self.reader.PROCESS)
assert_equals(self.reader.populator.current, name)
-
- def test_start_invalid_table(self):
- for name in ["Foo", "VariableTable"]:
- self._read('<table>', ROW_TEMPLATE % (name, 'Value
1', 'Value2'),
- ROW_TEMPLATE % ('This', 'row', 'is ignored'))
+ self.reader.feed('</table>')
+ assert_equals(self.reader.state, self.reader.IGNORE)
+
+ def test_process_invalid_table(self):
+ for name in [ "Foo", "VaribleTable" ]:
+ self.reader.feed('<table>')
+ self.reader.feed(ROW_TEMPLATE % (name, 'Value 1', 'Value2'))
assert_equals(self.reader.state, self.reader.IGNORE)
- assert_equals(self.reader.populator.current, None)
- assert_equals(self.reader.populator.tables, {})
+ assert_none(self.reader.populator.current)
+ self.reader.feed(ROW_TEMPLATE % ('This', 'row', 'is ignored'))
+ assert_equals(self.reader.state, self.reader.IGNORE)
+ assert_equals(len(self.reader.populator.tables.values()), 0)
+ self.reader.feed('</table>')
+ assert_equals(self.reader.state, self.reader.IGNORE)
def test_br(self):
- inp = ['x<br>y', '1<br />2', '<br><br>']
+ inp = ('x<br>y', '1<br />2', '<br><br>')
exp = ['x\ny', '1\n2', '\n\n']
for name in VALID_TABLES:
- self._read('<table>', ROW_TEMPLATE % (name, 'Value
1', 'Value2'),
- ROW_TEMPLATE % tuple(inp), '</table>')
- assert_equals(self.reader.populator.tables[name], [exp])
-
- def test_comment(self):
- self._read('<table>', ROW_TEMPLATE % ('Setting', 'Value
1', 'Value2'),
- '<!-- ignore me please -->', ROW_TEMPLATE %
tuple('ABC'),
- ROW_TEMPLATE % tuple('123'), '</table>')
- assert_equals(self.reader.populator.tables['Setting'],
- [['A', 'B', 'C'], ['1', '2', '3']])
+ self.reader.feed('<table>')
+ self.reader.feed(ROW_TEMPLATE % (name, 'Value 1', 'Value2'))
+ self.reader.feed(ROW_TEMPLATE % inp)
+ self.reader.feed('</table>')
+ assert_equals(self.reader.populator.tables[name], [ exp ])
def test_processing(self):
self._row_processing(ROW_TEMPLATE)
@@ -94,101 +88,101 @@
self._row_processing('<tr><td>%s<td>%s</td><td>%s</td></tr></tr>')
def test_missing_start_tr(self):
- self._row_processing('<td>%s<td>%s</td><td>%s</td></tr>')
+ self._row_processing('<td>%s<td>%s</td><td>%s</td></tr></tr>')
def _row_processing(self, row_template):
- row_data = [['Just', 'some', 'data'],
- ['here', '', 'for'],
- ['', 'these', 'rows']]
for name in VALID_TABLES:
- rows = ['<table>', row_template % (name, 'Value 1', 'Value2')]
+ \
- [row_template % tuple(row) for row in row_data] +
['</table>']
- self._read(*rows)
- assert_equals(self.reader.populator.tables[name], row_data)
+ self.reader.feed('<table>')
+ self.reader.feed(row_template % (name, 'Value 1', 'Value2'))
+ row_data = [ ['Just', 'some', 'data'],
+ ['here', '', 'for'],
+ ['', 'these', 'rows'] ]
+ for data in row_data:
+ self.reader.feed(row_template % tuple(data))
+ assert_equals(self.reader.state, self.reader.PROCESS)
+ self.reader.feed('</table>')
assert_equals(self.reader.state, self.reader.IGNORE)
+ assert_equals(self.reader.populator.tables[name], row_data)
-class TestHtmlReaderWithLxmlParser(TestHtmlReaderWithStdHtmlParser):
- parser=LxmlHtmlParser
-
- def test_missing_start_tr(self):
- # lxml doens't handle this: it ignores also </tr> if there's no
<tr>
- pass
-
-
-
-class TestEntityAndCharRefsWithStdHtmlParser(unittest.TestCase):
- parser = StdHtmlParser
-
- def test_entityrefs(self):
- for inp, exp in [('nbsp', ' '),
- ('apos', "'"),
- ('tilde', '~'),
- ('lt', '<'),
- ('gt', '>'),
- ('amp', '&'),
- ('quot', '"'),
- ('auml', u'\u00E4'),
- ('ouml', u'\u00F6'),
- ('uuml', u'\u00FC'),
- ('aring', u'\u00E5'),
- ('ntilde', u'\u00F1'),
- ('Auml', u'\u00C4'),
- ('Ouml', u'\u00D6'),
- ('Uuml', u'\u00DC'),
- ('Aring', u'\u00C5'),
- ('Ntilde', u'\u00D1'),
- ('nabla', u'\u2207'),
- ('ldquo', u'\u201c'),
- ('invalid', '&invalid;')]:
- self._test('&%s;' % inp, exp)
-
- def test_charrefs(self):
- for inp, exp in [('82', 'R'), ('228', u'\u00E4')]:
- self._test('&#%s;' % inp, exp)
-
- def test_invalid_charref(self):
- self._test('&#invalid;', '&#invalid;')
-
- def _test(self, input, expected):
- result = []
- def collect_data(data):
- result.append(data)
- reader = HtmlReader(parser=self.parser)
- reader.data = collect_data
- reader.read(StringIO(input), PopulatorMock())
- msg = "'%s': %r != %r" % (input, ''.join(result), expected)
- assert_equals(''.join(result), expected, msg, values=False)
-
-
-class
TestEntityAndCharRefsWithLxmlParser(TestEntityAndCharRefsWithStdHtmlParser):
- parser = LxmlHtmlParser
-
- def test_invalid_charref(self):
- self._test('&#invalid;', 'invalid;')
-
-
-class TestEncodingWithStdHtmlParser(unittest.TestCase):
+class TestEntityAndCharRefs(unittest.TestCase):
+
+ def setUp(self):
+ self.reader = HtmlReader()
+ self.reader.handle_data = self._handle_response
+
+ def _handle_response(self, value, decode):
+ self.response = value
+
+ def test_handle_entiryrefs(self):
+ for inp, exp in [ ('nbsp', ' '),
+ ('apos', "'"),
+ ('tilde', '~'),
+ ('lt', '<'),
+ ('gt', '>'),
+ ('amp', '&'),
+ ('quot', '"'),
+ ('auml', u'\u00E4'),
+ ('ouml', u'\u00F6'),
+ ('uuml', u'\u00FC'),
+ ('aring', u'\u00E5'),
+ ('ntilde', u'\u00F1'),
+ ('Auml', u'\u00C4'),
+ ('Ouml', u'\u00D6'),
+ ('Uuml', u'\u00DC'),
+ ('Aring', u'\u00C5'),
+ ('Ntilde', u'\u00D1'),
+ ('nabla', u'\u2207'),
+ ('ldquo', u'\u201c'),
+ ('invalid', '&invalid;') ]:
+ self.reader.handle_entityref(inp)
+ msg = '%s: %r != %r' % (inp, self.response, exp)
+ assert_equals(self.response, exp, msg, False)
+
+ def test_handle_charefs(self):
+ for inp, exp in [ ('82', 'R'),
+ ('228', u'\u00E4'),
+ ('invalid', '&#invalid;') ]:
+ self.reader.handle_charref(inp)
+ msg = '%s: %r != %r' % (inp, self.response, exp)
+ assert_equals(self.response, exp, msg, False)
+
+
+class TestEncoding(unittest.TestCase):
def test_default_encoding(self):
- assert_equals(StdHtmlParser(reader=None)._encoding, 'ISO-8859-1')
+ assert_equals(HtmlReader()._encoding, 'ISO-8859-1')
def test_encoding_is_read_from_meta_tag(self):
- self._test_encoding('<meta http-equiv="Content-Type"
content="text/html; charset=utf-8" />', 'utf-8')
- self._test_encoding('<META HTTP-EQUIV="CONTENT-TYPE"
CONTENT="TEXT/HTML; CHARSET=UTF-8">', 'UTF-8')
-
- def test_valid_http_equiv_is_required_in_meta(self):
- self._test_encoding('<meta content="text/html; charset=utf-8"
/>', 'ISO-8859-1')
- self._test_encoding('<meta http-equiv="Invalid"
content="text/html; charset=utf-8" />', 'ISO-8859-1')
-
- def test_encoding_is_read_from_pi(self):
- self._test_encoding('<?xml version="1.0"
encoding="UTF-8"?>', 'UTF-8')
- self._test_encoding('<?xml encoding=US-ASCII
version="1.0"?>', 'US-ASCII')
-
- def _test_encoding(self, data, expected):
- parser = StdHtmlParser(reader=HtmlReader())
- parser.parse(StringIO(data))
- assert_equals(parser._encoding, expected)
+ reader = HtmlReader()
+ reader.feed('<meta http-equiv="Content-Type" content="text/html;
charset=utf-8" />')
+ assert_equals(reader._encoding, 'utf-8')
+ reader.feed('<meta http-equiv="Content-Type" content="text/html;
charset=UTF-8">')
+ assert_equals(reader._encoding, 'UTF-8')
+
+ def test_valid_http_equiv_is_required(self):
+ reader = HtmlReader()
+ reader.feed('<meta content="text/html; charset=utf-8" />')
+ assert_equals(reader._encoding, 'ISO-8859-1')
+ reader.feed('<meta http-equiv="Invalid" content="text/html;
charset=utf-8" />')
+ assert_equals(reader._encoding, 'ISO-8859-1')
+
+ def test_encoding_is_set_from_xml_preamble(self):
+ reader = HtmlReader()
+ reader.feed('<?xml version="1.0" encoding="UTF-8"?>')
+ assert_equals(reader._encoding, 'UTF-8')
+ reader.feed('<?xml encoding=US-ASCII version="1.0"?>')
+ assert_equals(reader._encoding, 'US-ASCII')
+
+ def test_encoding_and_entityrefs(self):
+ reader = HtmlReader()
+ reader.populator = PopulatorMock()
+ reader.feed('<meta content="text/html; charset=utf-8" />')
+ reader.feed('<table><tr><td>Setting</td></tr>')
+ reader.feed('<tr><td>&auml;iti')
+ assert_equals(reader.current_cell, [u'\xe4', u'iti'])
+ reader.feed('</tr>')
+ assert_equals(reader.populator.tables['Setting'][0], [u'\xe4iti'])
if __name__ == '__main__':