Hi, I am using a simple subclass of HTMLParser like this:

class LinkCollector(HTMLParser):
    def reset(self):
        self.links = []
        HTMLParser.reset(self)

    def handle_starttag(self,tag,attr):
        if tag in ("a","link"):
            key = "href"
        elif tag in ("img","script"):
            key = "src"
        else:
            return
        self.links.extend([v for k,v in attr if k == key])

This gives the following error:

Traceback (most recent call last):
  File "downloader.py", line 209, in <module>
    if __name__ == "__main__": main()
  File "downloader.py", line 201, in main
    link_collect.feed(response)
  File "C:\Python27\lib\HTMLParser.py", line 108, in feed
    self.goahead(0)
  File "C:\Python27\lib\HTMLParser.py", line 148, in goahead
    k = self.parse_starttag(i)
  File "C:\Python27\lib\HTMLParser.py", line 252, in parse_starttag
    attrvalue = self.unescape(attrvalue)
  File "C:\Python27\lib\HTMLParser.py", line 393, in unescape
    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
  File "C:\Python27\lib\re.py", line 151, in sub
    return _compile(pattern, flags).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 13: ordinal not in range(128)

Rest of the code available as attachment. Does anyone know how to solve this?

-- http://yasar.serveblog.net/
downloader.py
Description: Binary data
-- http://mail.python.org/mailman/listinfo/python-list