James G. Sack (jim) wrote: > For anyone messing with unicode, here's a little python ditty to search > for unicode name patterns, and show various info (including utf-8 encoding).
# I seem to have gotten confused about attachments again. :-[
# Here it is pasted in:
# ---------------------
#!/usr/bin/env python
# (c) 2007 jgsack. non-exclusive rights granted to everybody
# inspired by an example code from:
#   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/251871
#   comment at: unicodedata is your friend, Tiago Henriques, 2003/11/17
"""Search Unicode character names in the BMP by regular expression.

For every matching code point the script prints the decimal and hex value,
the character itself, its Unicode name and its UTF-8 byte sequence.
"""

import codecs
import os
import re
import sys
import unicodedata

# Python 2 / Python 3 compatibility shims: the original script targeted
# Python 2 only (unichr, basestring, print statement).
try:
    _unichr = unichr
    _string_types = basestring
except NameError:
    _unichr = chr
    _string_types = str

usage = """
Usage: %s <match-pattern>
  gives information about unicode codepoints
  <match-pattern> is a regex (case insensitive)
  sample patterns (quoting as required):
     LATIN
     '.*greek'
     '.* M$'
     'CIRCLED DIGIT ONE'
  output format is
     codepoint-in-decimal (hex): character unicode-name <utf8>
  (output-encoding is UTF-8)
  (char may not be present in your font)
  only searches the 64K chars of the BMP (Basic Multilingual Plane)
"""


def findmatches(pat):
    """Return a dict mapping each matching BMP character to its Unicode name.

    pat is either a regex string (compiled case-insensitively) or an
    already-compiled pattern object, which is used as given.  Matching uses
    ``match()``, i.e. it is anchored at the start of the name, so the caller
    must insert a leading ".*" if required.

    Code points without a Unicode name (controls, surrogates, unassigned,
    noncharacters) are skipped.
    """
    if isinstance(pat, _string_types):
        matcher = re.compile(pat, re.IGNORECASE)
    else:
        matcher = pat

    chars = {}
    # 0x10000 (not 0xffff) so that the scan really covers the whole BMP,
    # U+0000 through U+FFFF inclusive.
    for codepoint in range(0x10000):
        char = _unichr(codepoint)
        try:
            name = unicodedata.name(char)
        except ValueError:
            # unicodedata.name() raises ValueError for unnamed code points.
            continue
        if matcher.match(name):
            chars[char] = name
    return chars


def utf8_hex(char):
    """Return the UTF-8 encoding of char as space-separated lowercase hex.

    Every byte is formatted explicitly, so ASCII characters yield e.g. "41"
    (parsing repr() for "\\x" escapes produced an empty string for them).
    """
    # bytearray yields integers on both Python 2 and Python 3.
    return " ".join("%02x" % byte for byte in bytearray(char.encode("utf-8")))


def main(argv):
    """Run the command-line tool; argv is the argument list without argv[0].

    Returns the process exit status: 1 when the usage text was shown
    (no argument or --help), 0 otherwise.
    """
    if not argv or argv[0] == "--help":
        sys.stdout.write(usage % os.path.basename(__file__) + "\n")
        return 1

    # Always emit UTF-8 regardless of the terminal's locale.  On Python 3 the
    # encoded bytes must go to the underlying binary buffer.
    raw_out = getattr(sys.stdout, "buffer", sys.stdout)
    outf = codecs.getwriter("utf-8")(raw_out)

    matches = findmatches(argv[0])
    for char in sorted(matches):
        outf.write(u"%5d (0x%0.4X): %s %s <%s>\n" % (
            ord(char), ord(char), char, matches[char], utf8_hex(char)))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

#===eof===
# -----------------------------------------------------------------------
# Regards, ..jim
# --
# [email protected]
# http://www.kernel-panic.org/cgi-bin/mailman/listinfo/kplug-lpsg
