I'm trying to read my iTunes library in Python using iterparse. My current stub is:
---- Snip ----
import sys
import datetime
import xml.etree.ElementTree as ET
import argparse
import re

class Library:

    unmarshallers = {
        # collections
        "array": lambda x: [v.text for v in x],
        "dict": lambda x: dict((x[i].text, x[i+1].text) for i in range(0, len(x), 2)),
        "key": lambda x: x.text or "",

        # simple types
        "string": lambda x: x.text or "",
        "data": lambda x: base64.decodestring(x.text or ""),
        "date": lambda x: datetime.datetime(*map(int, re.findall("\d+", x.text))),
        "true": lambda x: True,
        "false": lambda x: False,
        "real": lambda x: float(x.text),
        "integer": lambda x: int(x.text)
    }

    def load(self, file):
        print('Starting...')
        parser = ET.iterparse(file)
        for action, elem in parser:
            unmarshal = self.unmarshallers.get(elem.tag)
            if unmarshal:
                data = unmarshal(elem)
                elem.clear()
                elem.text = data
                print(elem.text)
            elif elem.tag != "plist":
                raise IOError("unknown plist type: %r" % elem.tag)
        return parser.root[0].text

    def __init__(self, infile):
        self.root = self.load(infile)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description = "Parse an iTunes library file to a set of CSV files suitable for import to a database.")
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    args=parser.parse_args()
    print('Infile = ', args.infile)
    library = Library(args.infile)

My input file (reduced to home in on the error) is:

---- snip -----
<?xml version="1.0" encoding="UTF-8"?>
<plist version="1.0">
  <dict>
    <dict>
      <key>15078</key>
      <dict>
        <key>Name</key><string>Part 2. The Death Of Enkidu. Skon Přitele Mého Mne Zdeptal Težče</string>
      </dict>
    </dict>
  </dict>
</plist>
---- snip ----
<?xml version="1.0" encoding="UTF-8"?>
<plist version="1.0">
  <dict>
    <dict>
      <key>15078</key>
      <dict>
        <key>Name</key><string>Part 2. The Death Of Enkidu. 
Skon Přitele Mého Mne Zdeptal Težče</string>
      </dict>
    </dict>
  </dict>
</plist>

I'm getting an error on one part of the XML:

  File "C:\Users\digit\Anaconda3\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 202: character maps to <undefined>

I suspect the issue is that the input is being decoded with cp1252 (cp1252.py), which is not the UTF-8 encoding specified in the XML prolog. Is this an iterparse problem, or am I using it incorrectly?

Thanks.

--
https://mail.python.org/mailman/listinfo/python-list