Author: Amaury Forgeot d'Arc <amaur...@gmail.com> Branch: Changeset: r54045:f3a0dbfc3c3a Date: 2012-03-28 22:48 +0200 http://bitbucket.org/pypy/pypy/changeset/f3a0dbfc3c3a/
Log: Improve documentation of parsestr() diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -2,17 +2,25 @@ from pypy.interpreter import unicodehelper from pypy.rlib.rstring import StringBuilder -def parsestr(space, encoding, s, unicode_literals=False): - # compiler.transformer.Transformer.decode_literal depends on what - # might seem like minor details of this function -- changes here - # must be reflected there. +def parsestr(space, encoding, s, unicode_literal=False): + """Parses a string or unicode literal, and return a wrapped value. + + If encoding=iso8859-1, the source string is also in this encoding. + If encoding=None, the source string is ascii only. + In other cases, the source string is in utf-8 encoding. + + When a bytes string is returned, it will be encoded with the + original encoding. + + Yes, it's very inefficient. + Yes, CPython has very similar code. + """ # we use ps as "pointer to s" # q is the virtual last char index of the string ps = 0 quote = s[ps] rawmode = False - unicode = unicode_literals # string decoration handling o = ord(quote) @@ -21,11 +29,11 @@ if quote == 'b' or quote == 'B': ps += 1 quote = s[ps] - unicode = False + unicode_literal = False elif quote == 'u' or quote == 'U': ps += 1 quote = s[ps] - unicode = True + unicode_literal = True if quote == 'r' or quote == 'R': ps += 1 quote = s[ps] @@ -46,21 +54,28 @@ 'unmatched triple quotes in literal') q -= 2 - if unicode: # XXX Py_UnicodeFlag is ignored for now + if unicode_literal: # XXX Py_UnicodeFlag is ignored for now if encoding is None or encoding == "iso-8859-1": + # 'unicode_escape' expects latin-1 bytes, string is ready. buf = s bufp = ps bufq = q u = None else: - # "\XX" may become "\u005c\uHHLL" (12 bytes) + # String is utf8-encoded, but 'unicode_escape' expects + # latin-1; So multibyte sequences must be escaped. lis = [] # using a list to assemble the value end = q + # Worst case: "\XX" may become "\u005c\uHHLL" (12 bytes) while ps < end: if s[ps] == '\\': lis.append(s[ps]) ps += 1 if ord(s[ps]) & 0x80: + # A multibyte sequence will follow, it will be + # escaped like \u1234. To avoid confusion with + # the backslash we just wrote, we emit "\u005c" + # instead. lis.append("u005c") if ord(s[ps]) & 0x80: # XXX inefficient w, ps = decode_utf8(space, s, ps, end, "utf-16-be") @@ -86,13 +101,11 @@ need_encoding = (encoding is not None and encoding != "utf-8" and encoding != "iso-8859-1") - # XXX add strchr like interface to rtyper assert 0 <= ps <= q substr = s[ps : q] if rawmode or '\\' not in s[ps:]: if need_encoding: w_u = space.wrap(unicodehelper.PyUnicode_DecodeUTF8(space, substr)) - #w_v = space.wrap(space.unwrap(w_u).encode(encoding)) this works w_v = unicodehelper.PyUnicode_AsEncodedString(space, w_u, space.wrap(encoding)) return w_v else: _______________________________________________ pypy-commit mailing list pypy-commit@python.org http://mail.python.org/mailman/listinfo/pypy-commit