Author: Amaury Forgeot d'Arc <amaur...@gmail.com>
Branch: 
Changeset: r54045:f3a0dbfc3c3a
Date: 2012-03-28 22:48 +0200
http://bitbucket.org/pypy/pypy/changeset/f3a0dbfc3c3a/

Log:    Improve documentation of parsestr()

diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -2,17 +2,25 @@
 from pypy.interpreter import unicodehelper
 from pypy.rlib.rstring import StringBuilder
 
-def parsestr(space, encoding, s, unicode_literals=False):
-    # compiler.transformer.Transformer.decode_literal depends on what 
-    # might seem like minor details of this function -- changes here 
-    # must be reflected there.
+def parsestr(space, encoding, s, unicode_literal=False):
+    """Parse a string or unicode literal, and return a wrapped value.
+
+    If encoding=iso-8859-1, the source string is also in this encoding.
+    If encoding=None, the source string is ascii only.
+    In other cases, the source string is in utf-8 encoding.
+
+    When a bytes string is returned, it will be encoded with the
+    original encoding.
+
+    Yes, it's very inefficient.
+    Yes, CPython has very similar code.
+    """
 
     # we use ps as "pointer to s"
     # q is the virtual last char index of the string
     ps = 0
     quote = s[ps]
     rawmode = False
-    unicode = unicode_literals
 
     # string decoration handling
     o = ord(quote)
@@ -21,11 +29,11 @@
         if quote == 'b' or quote == 'B':
             ps += 1
             quote = s[ps]
-            unicode = False
+            unicode_literal = False
         elif quote == 'u' or quote == 'U':
             ps += 1
             quote = s[ps]
-            unicode = True
+            unicode_literal = True
         if quote == 'r' or quote == 'R':
             ps += 1
             quote = s[ps]
@@ -46,21 +54,28 @@
                                         'unmatched triple quotes in literal')
         q -= 2
 
-    if unicode: # XXX Py_UnicodeFlag is ignored for now
+    if unicode_literal: # XXX Py_UnicodeFlag is ignored for now
         if encoding is None or encoding == "iso-8859-1":
+            # 'unicode_escape' expects latin-1 bytes, string is ready.
             buf = s
             bufp = ps
             bufq = q
             u = None
         else:
-            # "\XX" may become "\u005c\uHHLL" (12 bytes)
+            # String is utf8-encoded, but 'unicode_escape' expects
+            # latin-1, so multibyte sequences must be escaped.
             lis = [] # using a list to assemble the value
             end = q
+            # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
             while ps < end:
                 if s[ps] == '\\':
                     lis.append(s[ps])
                     ps += 1
                     if ord(s[ps]) & 0x80:
+                        # A multibyte sequence will follow, it will be
+                        # escaped like \u1234. To avoid confusion with
+                        # the backslash we just wrote, we emit "\u005c"
+                        # instead.
                         lis.append("u005c")
                 if ord(s[ps]) & 0x80: # XXX inefficient
                     w, ps = decode_utf8(space, s, ps, end, "utf-16-be")
@@ -86,13 +101,11 @@
 
     need_encoding = (encoding is not None and
                      encoding != "utf-8" and encoding != "iso-8859-1")
-    # XXX add strchr like interface to rtyper
     assert 0 <= ps <= q
     substr = s[ps : q]
     if rawmode or '\\' not in s[ps:]:
         if need_encoding:
             w_u = space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, substr))
-            #w_v = space.wrap(space.unwrap(w_u).encode(encoding)) this works
             w_v = unicodehelper.PyUnicode_AsEncodedString(space, w_u, space.wrap(encoding))
             return w_v
         else:
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to