[pypy-commit] pypy py3.5: fix and clean up of f-strings: now internally handled as utf-8, instead of as unicode-that-can-only-contain-ascii-anyway

arigo Tue, 07 Feb 2017 06:23:14 -0800

Author: Armin Rigo <[email protected]>
Branch: py3.5
Changeset: r89999:4fd6ad6559bf
Date: 2017-02-07 13:39 +0100
http://bitbucket.org/pypy/pypy/changeset/4fd6ad6559bf/


Log:    fix and clean up of f-strings: now internally handled as utf-8,
        instead of as unicode-that-can-only-contain-ascii-anyway

diff --git a/pypy/interpreter/astcompiler/fstring.py b/pypy/interpreter/astcompiler/fstring.py
--- a/pypy/interpreter/astcompiler/fstring.py
+++ b/pypy/interpreter/astcompiler/fstring.py
@@ -2,7 +2,7 @@
 from pypy.interpreter.pyparser import parsestring
 from pypy.interpreter import error
 from pypy.interpreter import unicodehelper
-from rpython.rlib.rstring import UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
 
 
 def add_constant_string(astbuilder, joined_pieces, w_string, atom_node):
@@ -44,7 +44,8 @@
     if astbuilder.recursive_parser is None:
         astbuilder.error("internal error: parser not available for parsing "
                    "the expressions inside the f-string", atom_node)
-    source = '(%s)' % source.encode('utf-8')
+    assert isinstance(source, str)    # utf-8 encoded
+    source = '(%s)' % source
 
     info = pyparse.CompileInfo("<fstring>", "eval",
                                consts.PyCF_SOURCE_IS_UTF8 |
@@ -85,13 +86,13 @@
 
     # The first char must be a left brace, or we wouldn't have gotten
     # here. Skip over it.
-    u = fstr.unparsed
+    s = fstr.unparsed
     i = fstr.current_index
-    assert u[i] == u'{'
+    assert s[i] == '{'
     i += 1
 
     expr_start = i
-    while i < len(u):
+    while i < len(s):
 
         # Loop invariants.
         assert nested_depth >= 0
@@ -100,9 +101,9 @@
         else:
             assert string_type == 0
 
-        ch = u[i]
+        ch = s[i]
         # Nowhere inside an expression is a backslash allowed.
-        if ch == u'\\':
+        if ch == '\\':
             # Error: can't include a backslash character, inside
             # parens or strings or not.
             astbuilder.error("f-string expression part "
@@ -116,7 +117,7 @@
                 # Does this match the string_type (single or triple
                 # quoted)?
                 if string_type == 3:
-                    if i + 2 < len(u) and u[i + 1] == u[i + 2] == ch:
+                    if i + 2 < len(s) and s[i + 1] == s[i + 2] == ch:
                         # We're at the end of a triple quoted string.
                         i += 3
                         string_type = 0
@@ -128,9 +129,9 @@
                     string_type = 0
                     quote_char = 0
                     continue
-        elif ch == u"'" or ch == u'"':
+        elif ch == "'" or ch == '"':
             # Is this a triple quoted string?
-            if i + 2 < len(u) and u[i + 1] == u[i + 2] == ch:
+            if i + 2 < len(s) and s[i + 1] == s[i + 2] == ch:
                 string_type = 3
                 i += 2
             else:
@@ -138,20 +139,20 @@
                 string_type = 1
             # Start looking for the end of the string.
             quote_char = ord(ch)
-        elif ch in u"[{(":
+        elif ch in "[{(":
             nested_depth += 1
-        elif nested_depth != 0 and ch in u"]})":
+        elif nested_depth != 0 and ch in "]})":
             nested_depth -= 1
-        elif ch == u'#':
+        elif ch == '#':
             # Error: can't include a comment character, inside parens
             # or not.
             astbuilder.error("f-string expression part cannot include '#'",
                              atom_node)
-        elif nested_depth == 0 and ch in u"!:}":
+        elif nested_depth == 0 and ch in "!:}":
             # First, test for the special case of "!=". Since '=' is
             # not an allowed conversion character, nothing is lost in
             # this test.
-            if ch == '!' and i + 1 < len(u) and u[i+1] == u'=':
+            if ch == '!' and i + 1 < len(s) and s[i+1] == '=':
                 # This isn't a conversion character, just continue.
                 i += 1
                 continue
@@ -171,33 +172,33 @@
     if nested_depth:
         astbuilder.error("f-string: mismatched '(', '{' or '['", atom_node)
 
-    if i >= len(u):
+    if i >= len(s):
         unexpected_end_of_string(astbuilder, atom_node)
 
     # Compile the expression as soon as possible, so we show errors
     # related to the expression before errors related to the
     # conversion or format_spec.
-    expr = f_string_compile(astbuilder, u[expr_start:i], atom_node)
+    expr = f_string_compile(astbuilder, s[expr_start:i], atom_node)
     assert isinstance(expr, ast.Expression)
 
     # Check for a conversion char, if present.
-    if u[i] == u'!':
+    if s[i] == '!':
         i += 1
-        if i >= len(u):
+        if i >= len(s):
             unexpected_end_of_string(astbuilder, atom_node)
 
-        conversion = ord(u[i])
+        conversion = ord(s[i])
         i += 1
         if conversion not in (ord('s'), ord('r'), ord('a')):
             astbuilder.error("f-string: invalid conversion character: "
                              "expected 's', 'r', or 'a'", atom_node)
 
     # Check for the format spec, if present.
-    if i >= len(u):
+    if i >= len(s):
         unexpected_end_of_string(astbuilder, atom_node)
-    if u[i] == u':':
+    if s[i] == ':':
         i += 1
-        if i >= len(u):
+        if i >= len(s):
             unexpected_end_of_string(astbuilder, atom_node)
         fstr.current_index = i
         subpieces = []
@@ -205,7 +206,7 @@
         format_spec = f_string_to_ast_node(astbuilder, subpieces, atom_node)
         i = fstr.current_index
 
-    if i >= len(u) or u[i] != u'}':
+    if i >= len(s) or s[i] != '}':
         unexpected_end_of_string(astbuilder, atom_node)
 
     # We're at a right brace. Consume it.
@@ -222,7 +223,7 @@
 def fstring_find_literal(astbuilder, fstr, atom_node, rec):
     # Return the next literal part.  Updates the current index inside 'fstr'.
     # Differs from CPython: this version handles double-braces on its own.
-    u = fstr.unparsed
+    s = fstr.unparsed
     literal_start = fstr.current_index
     in_named_escape = False
 
@@ -230,21 +231,21 @@
     # brace (which isn't part of a unicode name escape such as
     # "\N{EULER CONSTANT}"), or the end of the string.
     i = literal_start
-    builder = UnicodeBuilder()
-    while i < len(u):
-        ch = u[i]
-        if (not in_named_escape and ch == u'{' and i - literal_start >= 2
-                and u[i - 2] == u'\\' and u[i - 1] == u'N'):
+    builder = StringBuilder()
+    while i < len(s):
+        ch = s[i]
+        if (not in_named_escape and ch == '{' and i - literal_start >= 2
+                and s[i - 2] == '\\' and s[i - 1] == 'N'):
             in_named_escape = True
-        elif in_named_escape and ch == u'}':
+        elif in_named_escape and ch == '}':
             in_named_escape = False
-        elif ch == u'{' or ch == u'}':
+        elif ch == '{' or ch == '}':
             # Check for doubled braces, but only at the top level. If
             # we checked at every level, then f'{0:{3}}' would fail
             # with the two closing braces.
-            if rec == 0 and i + 1 < len(u) and u[i + 1] == ch:
+            if rec == 0 and i + 1 < len(s) and s[i + 1] == ch:
                 i += 1   # skip over the second brace
-            elif rec == 0 and ch == u'}':
+            elif rec == 0 and ch == '}':
                 # Where a single '{' is the start of a new expression, a
                 # single '}' is not allowed.
                 astbuilder.error("f-string: single '}' is not allowed",
@@ -259,14 +260,13 @@
 
     fstr.current_index = i
     literal = builder.build()
-    if not fstr.raw_mode and u'\\' in literal:
-        # xxx messy
+    if not fstr.raw_mode and '\\' in literal:
         space = astbuilder.space
-        literal = literal.encode('utf-8')
         literal = parsestring.decode_unicode_utf8(space, literal, 0,
                                                   len(literal))
-        literal = unicodehelper.decode_unicode_escape(space, literal)
-    return literal
+        return unicodehelper.decode_unicode_escape(space, literal)
+    else:
+        return literal.decode('utf-8')
 
 
 def fstring_find_literal_and_expr(astbuilder, fstr, atom_node, rec):
@@ -274,15 +274,15 @@
     # following expression node.  Updates the current index inside 'fstr'.
     literal = fstring_find_literal(astbuilder, fstr, atom_node, rec)
 
-    u = fstr.unparsed
+    s = fstr.unparsed
     i = fstr.current_index
-    if i >= len(u) or u[i] == u'}':
+    if i >= len(s) or s[i] == '}':
         # We're at the end of the string or the end of a nested
         # f-string: no expression.
         expr = None
     else:
         # We must now be the start of an expression, on a '{'.
-        assert u[i] == u'{'
+        assert s[i] == '{'
         expr = fstring_find_expr(astbuilder, fstr, atom_node, rec)
     return literal, expr
 
@@ -320,14 +320,16 @@
         astbuilder.error("f-string: unexpected end of string", atom_node)
 
     if rec != 0 and (fstr.current_index >= len(fstr.unparsed) or
-                     fstr.unparsed[fstr.current_index] != u'}'):
+                     fstr.unparsed[fstr.current_index] != '}'):
         astbuilder.error("f-string: expecting '}'", atom_node)
 
 
 def f_string_to_ast_node(astbuilder, joined_pieces, atom_node):
     # remove empty Strs
+    space = astbuilder.space
     values = [node for node in joined_pieces
-                   if not (isinstance(node, ast.Str) and not node.s)]
+                   if not isinstance(node, ast.Str)
+                      or space.is_true(node.s)]
     if len(values) > 1:
         return ast.JoinedStr(values, atom_node.get_lineno(),
                                      atom_node.get_column())
diff --git a/pypy/interpreter/astcompiler/test/test_compiler.py b/pypy/interpreter/astcompiler/test/test_compiler.py
--- a/pypy/interpreter/astcompiler/test/test_compiler.py
+++ b/pypy/interpreter/astcompiler/test/test_compiler.py
@@ -1194,6 +1194,38 @@
         raises(SyntaxError, self.run, "f'''{5)\n#}'''")
         raises(SyntaxError, self.run, "f'\\x'")
 
+    def test_fstring_encoding(self):
+        src = """# -*- coding: latin-1 -*-\nz=ord(f'{"\xd8"}')\n"""
+        yield self.st, src, 'z', 0xd8
+        src = """# -*- coding: utf-8 -*-\nz=ord(f'{"\xc3\x98"}')\n"""
+        yield self.st, src, 'z', 0xd8
+
+        src = """z=ord(f'\\xd8')"""
+        yield self.st, src, 'z', 0xd8
+        src = """z=ord(f'\\u00d8')"""
+        yield self.st, src, 'z', 0xd8
+
+        src = """# -*- coding: latin-1 -*-\nz=ord(f'\xd8')\n"""
+        yield self.st, src, 'z', 0xd8
+        src = """# -*- coding: utf-8 -*-\nz=ord(f'\xc3\x98')\n"""
+        yield self.st, src, 'z', 0xd8
+
+    def test_fstring_encoding_r(self):
+        src = """# -*- coding: latin-1 -*-\nz=ord(fr'{"\xd8"}')\n"""
+        yield self.st, src, 'z', 0xd8
+        src = """# -*- coding: utf-8 -*-\nz=ord(rf'{"\xc3\x98"}')\n"""
+        yield self.st, src, 'z', 0xd8
+
+        src = """z=fr'\\xd8'"""
+        yield self.st, src, 'z', "\\xd8"
+        src = """z=rf'\\u00d8'"""
+        yield self.st, src, 'z', "\\u00d8"
+
+        src = """# -*- coding: latin-1 -*-\nz=ord(rf'\xd8')\n"""
+        yield self.st, src, 'z', 0xd8
+        src = """# -*- coding: utf-8 -*-\nz=ord(fr'\xc3\x98')\n"""
+        yield self.st, src, 'z', 0xd8
+
 
 class AppTestCompiler:
 
diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -7,7 +7,7 @@
 
 class W_FString(W_Root):
     def __init__(self, unparsed, raw_mode):
-        assert isinstance(unparsed, unicode)
+        assert isinstance(unparsed, str)    # utf-8 encoded string
         self.unparsed = unparsed     # but the quotes are removed
         self.raw_mode = raw_mode
         self.current_index = 0       # for astcompiler.fstring
@@ -85,14 +85,13 @@
         q -= 2
 
     if unicode_literal and not rawmode: # XXX Py_UnicodeFlag is ignored for now
+        assert 0 <= ps <= q
+        if saw_f:
+            return W_FString(s[ps:q], rawmode)
         if encoding is None:
-            assert 0 <= ps <= q
             substr = s[ps:q]
         else:
             substr = decode_unicode_utf8(space, s, ps, q)
-        if saw_f:
-            v = unicodehelper.decode_utf8(space, substr)
-            return W_FString(v, rawmode)
         v = unicodehelper.decode_unicode_escape(space, substr)
         return space.wrap(v)
 
@@ -109,10 +108,10 @@
     if rawmode or '\\' not in substr:
         if not unicode_literal:
             return space.newbytes(substr)
+        elif saw_f:
+            return W_FString(substr, rawmode)
         else:
             v = unicodehelper.decode_utf8(space, substr)
-            if saw_f:
-                return W_FString(v, rawmode)
             return space.wrap(v)
 
     v = PyString_DecodeEscape(space, substr, 'strict', encoding)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3.5: fix and clean up of f-strings: now internally handled as utf-8, instead of as unicode-that-can-only-contain-ascii-anyway

Reply via email to