Author: Amaury Forgeot d'Arc <[email protected]>
Branch:
Changeset: r54045:f3a0dbfc3c3a
Date: 2012-03-28 22:48 +0200
http://bitbucket.org/pypy/pypy/changeset/f3a0dbfc3c3a/
Log: Improve documentation of parsestr()
diff --git a/pypy/interpreter/pyparser/parsestring.py
b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -2,17 +2,25 @@
from pypy.interpreter import unicodehelper
from pypy.rlib.rstring import StringBuilder
-def parsestr(space, encoding, s, unicode_literals=False):
- # compiler.transformer.Transformer.decode_literal depends on what
- # might seem like minor details of this function -- changes here
- # must be reflected there.
+def parsestr(space, encoding, s, unicode_literal=False):
+ """Parses a string or unicode literal, and return a wrapped value.
+
+ If encoding=iso8859-1, the source string is also in this encoding.
+ If encoding=None, the source string is ascii only.
+ In other cases, the source string is in utf-8 encoding.
+
+ When a bytes string is returned, it will be encoded with the
+ original encoding.
+
+ Yes, it's very inefficient.
+ Yes, CPython has very similar code.
+ """
# we use ps as "pointer to s"
# q is the virtual last char index of the string
ps = 0
quote = s[ps]
rawmode = False
- unicode = unicode_literals
# string decoration handling
o = ord(quote)
@@ -21,11 +29,11 @@
if quote == 'b' or quote == 'B':
ps += 1
quote = s[ps]
- unicode = False
+ unicode_literal = False
elif quote == 'u' or quote == 'U':
ps += 1
quote = s[ps]
- unicode = True
+ unicode_literal = True
if quote == 'r' or quote == 'R':
ps += 1
quote = s[ps]
@@ -46,21 +54,28 @@
'unmatched triple quotes in literal')
q -= 2
- if unicode: # XXX Py_UnicodeFlag is ignored for now
+ if unicode_literal: # XXX Py_UnicodeFlag is ignored for now
if encoding is None or encoding == "iso-8859-1":
+ # 'unicode_escape' expects latin-1 bytes, string is ready.
buf = s
bufp = ps
bufq = q
u = None
else:
- # "\XX" may become "\u005c\uHHLL" (12 bytes)
+ # String is utf8-encoded, but 'unicode_escape' expects
+ # latin-1; So multibyte sequences must be escaped.
lis = [] # using a list to assemble the value
end = q
+ # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes)
while ps < end:
if s[ps] == '\\':
lis.append(s[ps])
ps += 1
if ord(s[ps]) & 0x80:
+ # A multibyte sequence will follow, it will be
+ # escaped like \u1234. To avoid confusion with
+ # the backslash we just wrote, we emit "\u005c"
+ # instead.
lis.append("u005c")
if ord(s[ps]) & 0x80: # XXX inefficient
w, ps = decode_utf8(space, s, ps, end, "utf-16-be")
@@ -86,13 +101,11 @@
need_encoding = (encoding is not None and
encoding != "utf-8" and encoding != "iso-8859-1")
- # XXX add strchr like interface to rtyper
assert 0 <= ps <= q
substr = s[ps : q]
if rawmode or '\\' not in s[ps:]:
if need_encoding:
w_u = space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, substr))
- #w_v = space.wrap(space.unwrap(w_u).encode(encoding)) this works
w_v = unicodehelper.PyUnicode_AsEncodedString(space, w_u,
space.wrap(encoding))
return w_v
else:
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit