The following ticket and commits could use some review from somebody
who understands the IRI/URI conversion stuff well.  I read the
relevant RFCs and implemented something that passes all our tests
and appears to work properly for me.  But I've done very little
real-world development with non-ASCII charset stuff in URLs.

The ticket is:

http://trac.turbogears.org/ticket/2115

The first (failed) attempt at a wrapper is checked in here:

http://trac.turbogears.org/changeset/5990

And the second is here:

http://trac.turbogears.org/changeset/6000

The critical difference between the two is that I discovered that
Routes does not handle UTF-8-encoded strings either.

The reason there's a separate encoder (rather than using the quote
function in urllib) is that UTF-8 strings were not properly handled
there either, so I stole Joe Gregorio's encoder from httplib2,
adapted it to our needs, and put it in tg.util.

This solution seems to work, but it feels a bit weird, so I'd love to
hear if anybody has any better thoughts.

--Mark

---------- Forwarded message ----------
From:  <[email protected]>
Date: Sat, Jan 3, 2009 at 10:57 AM
Subject: [turbogears-commits] r6000 - in trunk/tg: . tests
To: [email protected]



Author: mramm
Date: Sat Jan  3 09:57:49 2009
New Revision: 6000
URL: http://trac.turbogears.org/changeset/6000

Log:
Lots of url/iri updates.   We now generate url's using a combination
of Joe Gregario's iri2uri encoder, and routes.  Seems to work ok, but
does provide some crazy ugly url's for non asci characters.

Modified:
  trunk/tg/controllers.py
  trunk/tg/tests/test_controllers.py
  trunk/tg/util.py

Modified: trunk/tg/controllers.py
==============================================================================
--- trunk/tg/controllers.py     (original)
+++ trunk/tg/controllers.py     Sat Jan  3 09:57:49 2009
@@ -22,6 +22,7 @@
 from pylons import url as pylons_url
 from pylons.controllers import WSGIController

+from tg.util import iri2uri
 from tg.exceptions import (HTTPFound, HTTPNotFound, HTTPException,
    HTTPClientError)
 from tg.render import render as tg_render
@@ -560,13 +561,13 @@
    #Next we do utf8 encoding for everything
    for arg in args:
        if isinstance(arg, unicode):
-            new_args.append(arg.encode('utf8'))
+            new_args.append(iri2uri(arg))
        else:
            new_args.append(arg)

    for key, value in kwargs.iteritems():
        if isinstance(value, unicode):
-            new_kwargs[key] = value.encode('utf8')
+            new_kwargs[key] = iri2uri(value)
        else:
            new_kwargs[key] = value


Modified: trunk/tg/tests/test_controllers.py
==============================================================================
--- trunk/tg/tests/test_controllers.py  (original)
+++ trunk/tg/tests/test_controllers.py  Sat Jan  3 09:57:49 2009
@@ -40,12 +40,14 @@
 def test_unicode():
    """url() can handle unicode parameters"""
    create_request("/")
-    eq_(url('/', x=u'\N{LATIN SMALL LETTER A WITH GRAVE}'
+    unicodestring = (u'\N{LATIN SMALL LETTER A WITH GRAVE}'
        u'\N{LATIN SMALL LETTER E WITH GRAVE}'
        u'\N{LATIN SMALL LETTER I WITH GRAVE}'
        u'\N{LATIN SMALL LETTER O WITH GRAVE}'
-        u'\N{LATIN SMALL LETTER U WITH GRAVE}'),
-        '/?x=%C3%A0%C3%A8%C3%AC%C3%B2%C3%B9'
+        u'\N{LATIN SMALL LETTER U WITH GRAVE}')
+    print unicodestring.encode('utf8')
+    eq_(url('/', x=unicodestring),
+        '/?x=%25C3%25A0%25C3%25A8%25C3%25AC%25C3%25B2%25C3%25B9'
        )

 def test_list():

Modified: trunk/tg/util.py
==============================================================================
--- trunk/tg/util.py    (original)
+++ trunk/tg/util.py    Sat Jan  3 09:57:49 2009
@@ -2,6 +2,7 @@
 from pylons import config
 import os, sys
 import pkg_resources
+import urlparse
 from pkg_resources import resource_filename

 def get_project_meta(name):
@@ -137,3 +138,58 @@
        result = template_name

    return result
+
+
+# The characters we need to enocde and escape are defined in the spec:
+#
+# iprivate =  %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
+# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
+#         / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
+#         / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
+#         / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
+#         / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
+#         / %xD0000-DFFFD / %xE1000-EFFFD
+
+escape_range = [
+   (0xA0, 0xD7FF ),
+   (0xE000, 0xF8FF ),
+   (0xF900, 0xFDCF ),
+   (0xFDF0, 0xFFEF),
+   (0x10000, 0x1FFFD ),
+   (0x20000, 0x2FFFD ),
+   (0x30000, 0x3FFFD),
+   (0x40000, 0x4FFFD ),
+   (0x50000, 0x5FFFD ),
+   (0x60000, 0x6FFFD),
+   (0x70000, 0x7FFFD ),
+   (0x80000, 0x8FFFD ),
+   (0x90000, 0x9FFFD),
+   (0xA0000, 0xAFFFD ),
+   (0xB0000, 0xBFFFD ),
+   (0xC0000, 0xCFFFD),
+   (0xD0000, 0xDFFFD ),
+   (0xE1000, 0xEFFFD),
+   (0xF0000, 0xFFFFD ),
+   (0x100000, 0x10FFFD)
+]
+
+def encode(c):
+    retval = c
+    i = ord(c)
+    for low, high in escape_range:
+        if i < low:
+            break
+        if i >= low and i <= high:
+            retval = "".join(["%%%2X" % ord(o) for o in c.encode('utf-8')])
+            break
+    return retval
+
+
+def iri2uri(uri):
+    """Convert an IRI to a URI. Note that IRIs must be
+    passed in a unicode strings. That is, do not utf-8 encode
+    the IRI before passing it into the function."""
+    if isinstance(uri ,unicode):
+
+        uri = "".join([encode(c) for c in uri]).encode('utf8')
+    return uri
\ No newline at end of file



-- 
Mark Ramm-Christensen
email: mark at compoundthinking dot com
blog: www.compoundthinking.com/blog

--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google Groups 
"TurboGears Trunk" group.
To post to this group, send email to [email protected]
To unsubscribe from this group, send email to 
[email protected]
For more options, visit this group at 
http://groups.google.com/group/turbogears-trunk?hl=en
-~----------~----~----~----~------~----~------~--~---

Reply via email to