Hi friends, I've been off Python for quite a while now and am glad to be back -- at least to the extent that work permits.
# Q: What's a good way to encode and decode entities like &euro; or
# &#8364;? I need isolated functions to process lines. Looking at the xml
# and sgmllib stuff I didn't really get a clue as to what's the most
# pythonic way. Are there library functions I didn't see?
#
# FYI, here is what I hacked down and what will probably (hopefully...) do
# the job. Feel free to comment.

# -*- coding: iso-8859-1 -*-
"""\
entity_stuff.py, mb, 2008-03-14, 2008-03-18

Convert between HTML character entities (``&euro;``, ``&#8364;``,
``&#x20ac;``) and unicode characters, one line at a time.

The module runs unchanged on Python 2.6+ and Python 3.
"""

import io
import re

try:
    import htmlentitydefs  # Python 2
except ImportError:
    import html.entities as htmlentitydefs  # Python 3

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3: chr() already returns unicode text

# An entity candidate is '&', an optional '#', one or more word characters
# and a closing ';'.  The previous pattern '(&.+?;)' let a stray ampersand
# swallow everything up to the next ';' (e.g. '& b &euro;' became a single
# bogus "entity"), which prevented the real entity from being converted.
# The capturing group makes re.split() return the candidates as well.
RE_OBJ_entity = re.compile(r'(&#?\w+;)')


def entity2uc(entity):
    r"""Convert a single entity like ``&#123;`` to a unicode character.

    Decimal (``&#8364;``), hexadecimal (``&#x20ac;`` / ``&#X20AC;``) and
    named (``&euro;``) references are understood.

    Returns ``(character, True)`` on success and ``(entity, False)`` --
    the unchanged input -- when the text is not a well-formed reference,
    names an unknown entity, or denotes a code point that cannot be
    turned into a character.

    Example:
        entity2uc('&#8364;')   -> (u'\u20ac', True)
        entity2uc('&#x20ac;')  -> (u'\u20ac', True)
        entity2uc('&euro;')    -> (u'\u20ac', True)
        entity2uc('&foobar;')  -> ('&foobar;', False)
    """
    # Without both delimiters the slicing below would silently drop real
    # characters and could match an entity name by accident.
    if not (entity.startswith('&') and entity.endswith(';')):
        return entity, False

    body = entity[1:-1]
    if body.startswith('#'):
        # body[1:2] is '' (never an IndexError) when nothing follows '#'.
        if body[1:2] in ('x', 'X'):
            digits, base = body[2:], 16
        else:
            digits, base = body[1:], 10
        try:
            codepoint = int(digits, base)
        except ValueError:
            return entity, False
    else:
        codepoint = htmlentitydefs.name2codepoint.get(body)
        if codepoint is None:
            return entity, False

    # Surrogates are not characters; a lone one cannot be written as UTF-8.
    if 0xD800 <= codepoint <= 0xDFFF:
        return entity, False

    try:
        return _unichr(codepoint), True
    except (ValueError, OverflowError):
        # Negative or beyond the range the interpreter supports.
        return entity, False


def line_entities_to_uc(line):
    """Replace every convertible entity in *line* by its character.

    Returns ``(converted_line, problem_count)`` where ``problem_count`` is
    the number of entity candidates that had to be left unconverted.
    """
    pieces = []
    problems = 0
    for index, piece in enumerate(RE_OBJ_entity.split(line)):
        # With a capturing group, re.split() puts the matched candidates
        # at the odd positions; even positions are plain text and must be
        # passed through even when they happen to start with '&'.
        if index % 2:
            piece, success = entity2uc(piece)
            if not success:
                problems += 1
        pieces.append(piece)
    return u''.join(pieces), problems


def uc2entity(uc):
    """Return the entity representation of the single character *uc*.

    Characters up to code point 127 are returned unchanged; this includes
    '&', '<' and '>', so the encoding is not a strict inverse of
    line_entities_to_uc().  Other characters become a named entity when
    htmlentitydefs knows one, otherwise a hexadecimal reference.
    """
    cp = ord(uc)
    if cp <= 127:
        return chr(cp)
    name = htmlentitydefs.codepoint2name.get(cp)
    if name:
        return '&%s;' % name
    return '&#x%x;' % cp


def encode_line(line):
    """Return *line* with every non-ASCII character replaced by an entity."""
    return ''.join(uc2entity(u) for u in line)


def main(infile='temp.ascii.xml', outfile='temp.utf8.xml'):
    """Convert the entities in *infile* and write the result as UTF-8.

    The input is read as ASCII, converted line by line and written to
    *outfile*; a short summary is printed afterwards.
    """
    total_problems = 0
    total_lines = 0
    # newline='\n' / newline='' keep line endings exactly as they are in
    # the input, like the binary modes of the original script did.
    with io.open(infile, 'r', encoding='ascii', newline='\n') as src:
        with io.open(outfile, 'w', encoding='utf-8', newline='') as dst:
            for line in src:
                converted, problems = line_entities_to_uc(line)
                dst.write(converted)
                total_lines += 1
                total_problems += problems

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print('')
    print("Summary:")
    print(" Infile : %s" % (infile,))
    print(" Outfile: %s" % (outfile,))
    print(' %8d %s %s' % (total_lines,
                          ['lines', 'line'][total_lines == 1],
                          'written.'))
    print(' %8d %s %s' % (total_problems,
                          ['entities', 'entity'][total_problems == 1],
                          'left unconverted.'))
    print('%s' % ('Done.',))


if __name__ == "__main__":
    main()

# Have a nice day and ru,
# Martin
# (read you, ;-)
# --
# http://mail.python.org/mailman/listinfo/python-list