The following ticket and commits could use some review from somebody who understands the IRI/URI conversion stuff well. I read the relevant RFCs and implemented something that passes all our tests and appears to work properly for me. But I've done very little real-world development with non-ASCII charset stuff in URLs.
The ticket is: http://trac.turbogears.org/ticket/2115 The first, failed attempt at a wrapper is checked in here: http://trac.turbogears.org/changeset/5990 And the second is here: http://trac.turbogears.org/changeset/6000 The critical difference between the two is that I discovered that routes does not handle UTF-8 encoded strings either. The reason there's a separate encoder (rather than using the quote function in urllib) is that UTF-8 strings were not properly handled there either, so I stole Joe Gregorio's encoder from httplib2, adapted it to our needs, and put it in tg.util. This solution seems to work, but it feels a bit weird, so I'd love to hear if anybody has any better thoughts. --Mark ---------- Forwarded message ---------- From: <[email protected]> Date: Sat, Jan 3, 2009 at 10:57 AM Subject: [turbogears-commits] r6000 - in trunk/tg: . tests To: [email protected] Author: mramm Date: Sat Jan 3 09:57:49 2009 New Revision: 6000 URL: http://trac.turbogears.org/changeset/6000 Log: Lots of url/iri updates. We now generate url's using a combination of Joe Gregario's iri2uri encoder, and routes. Seems to work ok, but does provide some crazy ugly url's for non asci characters. 
Modified: trunk/tg/controllers.py trunk/tg/tests/test_controllers.py trunk/tg/util.py Modified: trunk/tg/controllers.py ============================================================================== --- trunk/tg/controllers.py (original) +++ trunk/tg/controllers.py Sat Jan 3 09:57:49 2009 @@ -22,6 +22,7 @@ from pylons import url as pylons_url from pylons.controllers import WSGIController +from tg.util import iri2uri from tg.exceptions import (HTTPFound, HTTPNotFound, HTTPException, HTTPClientError) from tg.render import render as tg_render @@ -560,13 +561,13 @@ #Next we do utf8 encoding for everything for arg in args: if isinstance(arg, unicode): - new_args.append(arg.encode('utf8')) + new_args.append(iri2uri(arg)) else: new_args.append(arg) for key, value in kwargs.iteritems(): if isinstance(value, unicode): - new_kwargs[key] = value.encode('utf8') + new_kwargs[key] = iri2uri(value) else: new_kwargs[key] = value Modified: trunk/tg/tests/test_controllers.py ============================================================================== --- trunk/tg/tests/test_controllers.py (original) +++ trunk/tg/tests/test_controllers.py Sat Jan 3 09:57:49 2009 @@ -40,12 +40,14 @@ def test_unicode(): """url() can handle unicode parameters""" create_request("/") - eq_(url('/', x=u'\N{LATIN SMALL LETTER A WITH GRAVE}' + unicodestring = (u'\N{LATIN SMALL LETTER A WITH GRAVE}' u'\N{LATIN SMALL LETTER E WITH GRAVE}' u'\N{LATIN SMALL LETTER I WITH GRAVE}' u'\N{LATIN SMALL LETTER O WITH GRAVE}' - u'\N{LATIN SMALL LETTER U WITH GRAVE}'), - '/?x=%C3%A0%C3%A8%C3%AC%C3%B2%C3%B9' + u'\N{LATIN SMALL LETTER U WITH GRAVE}') + print unicodestring.encode('utf8') + eq_(url('/', x=unicodestring), + '/?x=%25C3%25A0%25C3%25A8%25C3%25AC%25C3%25B2%25C3%25B9' ) def test_list(): Modified: trunk/tg/util.py ============================================================================== --- trunk/tg/util.py (original) +++ trunk/tg/util.py Sat Jan 3 09:57:49 2009 @@ -2,6 +2,7 @@ from pylons import config 
import os, sys import pkg_resources +import urlparse from pkg_resources import resource_filename def get_project_meta(name): @@ -137,3 +138,58 @@ result = template_name return result + + +# The characters we need to enocde and escape are defined in the spec: +# +# iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD +# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF +# / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD +# / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD +# / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD +# / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD +# / %xD0000-DFFFD / %xE1000-EFFFD + +escape_range = [ + (0xA0, 0xD7FF ), + (0xE000, 0xF8FF ), + (0xF900, 0xFDCF ), + (0xFDF0, 0xFFEF), + (0x10000, 0x1FFFD ), + (0x20000, 0x2FFFD ), + (0x30000, 0x3FFFD), + (0x40000, 0x4FFFD ), + (0x50000, 0x5FFFD ), + (0x60000, 0x6FFFD), + (0x70000, 0x7FFFD ), + (0x80000, 0x8FFFD ), + (0x90000, 0x9FFFD), + (0xA0000, 0xAFFFD ), + (0xB0000, 0xBFFFD ), + (0xC0000, 0xCFFFD), + (0xD0000, 0xDFFFD ), + (0xE1000, 0xEFFFD), + (0xF0000, 0xFFFFD ), + (0x100000, 0x10FFFD) +] + +def encode(c): + retval = c + i = ord(c) + for low, high in escape_range: + if i < low: + break + if i >= low and i <= high: + retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')]) + break + return retval + + +def iri2uri(uri): + """Convert an IRI to a URI. Note that IRIs must be + passed in a unicode strings. That is, do not utf-8 encode + the IRI before passing it into the function.""" + if isinstance(uri ,unicode): + + uri = "".join([encode(c) for c in uri]).encode('utf8') + return uri \ No newline at end of file -- Mark Ramm-Christensen email: mark at compoundthinking dot com blog: www.compoundthinking.com/blog --~--~---------~--~----~------------~-------~--~----~ You received this message because you are subscribed to the Google Groups "TurboGears Trunk" group. 
To post to this group, send email to [email protected] To unsubscribe from this group, send email to [email protected] For more options, visit this group at http://groups.google.com/group/turbogears-trunk?hl=en -~----------~----~----~----~------~----~------~--~---
