[pypy-commit] pypy py3k: try hard to give good error messages when we are unable to convert a string to int() or float(). To do so, we do the parsing directly in unicode instead of trying to convert to ASCII and do the parsing there.

antocuni Wed, 18 Jul 2012 07:19:11 -0700

Author: Antonio Cuni <[email protected]>
Branch: py3k
Changeset: r56159:3e5f50aa9403
Date: 2012-07-18 16:09 +0200
http://bitbucket.org/pypy/pypy/changeset/3e5f50aa9403/


Log:    try hard to give good error messages when we are unable to convert a
        string to int() or float(). To do so, we do the parsing directly in
        unicode instead of trying to convert to ASCII and do the parsing
        there.

diff --git a/pypy/objspace/std/longtype.py b/pypy/objspace/std/longtype.py
--- a/pypy/objspace/std/longtype.py
+++ b/pypy/objspace/std/longtype.py
@@ -34,6 +34,7 @@
             return string_to_w_long(space, w_longtype,
                                     unicode_to_decimal_w(space, w_value))
         elif space.isinstance_w(w_value, space.w_bytearray):
+            # XXX: convert to unicode
             return string_to_w_long(space, w_longtype, space.bufferstr_w(w_value))
         else:
             # otherwise, use the __int__() or the __trunc__ methods
diff --git a/pypy/objspace/std/strutil.py b/pypy/objspace/std/strutil.py
--- a/pypy/objspace/std/strutil.py
+++ b/pypy/objspace/std/strutil.py
@@ -2,6 +2,7 @@
 Pure Python implementation of string utilities.
 """
 
+from pypy.rlib.objectmodel import enforceargs
 from pypy.rlib.rarithmetic import ovfcheck
 from pypy.rlib.rfloat import rstring_to_float, INFINITY, NAN
 from pypy.rlib.rbigint import rbigint, parse_digit_string
@@ -11,18 +12,20 @@
 # XXX factor more functions out of stringobject.py.
 # This module is independent from PyPy.
 
+@enforceargs(unicode)
 def strip_spaces(s):
     # XXX this is not locale-dependent
     p = 0
     q = len(s)
-    while p < q and s[p] in ' \f\n\r\t\v':
+    while p < q and s[p] in u' \f\n\r\t\v':
         p += 1
-    while p < q and s[q-1] in ' \f\n\r\t\v':
+    while p < q and s[q-1] in u' \f\n\r\t\v':
         q -= 1
     assert q >= p     # annotator hint, don't remove
     return s[p:q]
 
 class ParseStringError(Exception):
+    @enforceargs(None, unicode)
     def __init__(self, msg):
         self.msg = msg
 
@@ -34,39 +37,40 @@
 class NumberStringParser:
 
     def error(self):
-        raise ParseStringError("invalid literal for %s() with base %d: '%s'" %
+        raise ParseStringError(u"invalid literal for %s() with base %d: '%s'" %
                                (self.fname, self.original_base, self.literal))
 
+    @enforceargs(None, unicode, unicode, int, unicode)
     def __init__(self, s, literal, base, fname):
         self.literal = literal
         self.fname = fname
         sign = 1
-        if s.startswith('-'):
+        if s.startswith(u'-'):
             sign = -1
             s = strip_spaces(s[1:])
-        elif s.startswith('+'):
+        elif s.startswith(u'+'):
             s = strip_spaces(s[1:])
         self.sign = sign
         self.original_base = base
 
         if base == 0:
-            if s.startswith('0x') or s.startswith('0X'):
+            if s.startswith(u'0x') or s.startswith(u'0X'):
                 base = 16
-            elif s.startswith('0b') or s.startswith('0B'):
+            elif s.startswith(u'0b') or s.startswith(u'0B'):
                 base = 2
-            elif s.startswith('0'): # also covers the '0o' case
+            elif s.startswith(u'0'): # also covers the '0o' case
                 base = 8
             else:
                 base = 10
         elif base < 2 or base > 36:
-            raise ParseStringError, "%s() base must be >= 2 and <= 36" % (fname,)
+            raise ParseStringError, u"%s() base must be >= 2 and <= 36" % (fname,)
         self.base = base
 
-        if base == 16 and (s.startswith('0x') or s.startswith('0X')):
+        if base == 16 and (s.startswith(u'0x') or s.startswith(u'0X')):
             s = s[2:]
-        if base == 8 and (s.startswith('0o') or s.startswith('0O')):
+        if base == 8 and (s.startswith(u'0o') or s.startswith(u'0O')):
             s = s[2:]
-        if base == 2 and (s.startswith('0b') or s.startswith('0B')):
+        if base == 2 and (s.startswith(u'0b') or s.startswith(u'0B')):
             s = s[2:]
         if not s:
             self.error()
@@ -81,12 +85,12 @@
         if self.i < self.n:
             c = self.s[self.i]
             digit = ord(c)
-            if '0' <= c <= '9':
-                digit -= ord('0')
-            elif 'A' <= c <= 'Z':
-                digit = (digit - ord('A')) + 10
-            elif 'a' <= c <= 'z':
-                digit = (digit - ord('a')) + 10
+            if u'0' <= c <= u'9':
+                digit -= ord(u'0')
+            elif u'A' <= c <= u'Z':
+                digit = (digit - ord(u'A')) + 10
+            elif u'a' <= c <= u'z':
+                digit = (digit - ord(u'a')) + 10
             else:
                 self.error()
             if digit >= self.base:
@@ -103,7 +107,7 @@
     Raises ParseStringOverflowError in case the result does not fit.
     """
     s = literal = strip_spaces(s)
-    p = NumberStringParser(s, literal, base, 'int')
+    p = NumberStringParser(s, literal, base, u'int')
     base = p.base
     result = 0
     while True:
@@ -125,10 +129,10 @@
     and returns an rbigint."""
     if parser is None:
         s = literal = strip_spaces(s)
-        if (s.endswith('l') or s.endswith('L')) and base < 22:
+        if (s.endswith(u'l') or s.endswith(u'L')) and base < 22:
             # in base 22 and above, 'L' is a valid digit!  try: long('L',22)
             s = s[:-1]
-        p = NumberStringParser(s, literal, base, 'long')
+        p = NumberStringParser(s, literal, base, u'long')
     else:
         p = parser
     return parse_digit_string(p)
@@ -155,6 +159,7 @@
 del calc_mantissa_bits
 MANTISSA_DIGITS = len(str( (1L << MANTISSA_BITS)-1 )) + 1
 
+@enforceargs(unicode)
 def string_to_float(s):
     """
     Conversion of string to float.
@@ -167,22 +172,25 @@
     s = strip_spaces(s)
 
     if not s:
-        raise ParseStringError("empty string for float()")
+        raise ParseStringError(u"empty string for float()")
 
 
     low = s.lower()
-    if low == "-inf" or low == "-infinity":
+    if low == u"-inf" or low == u"-infinity":
         return -INFINITY
-    elif low == "inf" or low == "+inf":
+    elif low == u"inf" or low == u"+inf":
         return INFINITY
-    elif low == "infinity" or low == "+infinity":
+    elif low == u"infinity" or low == u"+infinity":
         return INFINITY
-    elif low == "nan" or low == "+nan":
+    elif low == u"nan" or low == u"+nan":
         return NAN
-    elif low == "-nan":
+    elif low == u"-nan":
         return -NAN
 
+    # rstring_to_float only supports byte strings, but we have a unicode
+    # string here. Do as CPython does: encode it as UTF-8.
+    mystring = s.encode('utf-8')
     try:
-        return rstring_to_float(s)
+        return rstring_to_float(mystring)
     except ValueError:
-        raise ParseStringError("invalid literal for float()")
+        raise ParseStringError(u"invalid literal for float()")
diff --git a/pypy/objspace/std/test/test_floatobject.py b/pypy/objspace/std/test/test_floatobject.py
--- a/pypy/objspace/std/test/test_floatobject.py
+++ b/pypy/objspace/std/test/test_floatobject.py
@@ -1,3 +1,5 @@
+# -*- encoding: utf-8 -*-
+
 from pypy.objspace.std import floatobject as fobj
 from pypy.objspace.std.multimethod import FailedToImplement
 import py, sys
@@ -439,6 +441,10 @@
         b = A(5).real
         assert type(b) is float
 
+    def test_float_from_unicode(self):
+        s = '\U0001D7CF\U0001D7CE.4' # 𝟏𝟎.4
+        assert float(s) == 10.4
+
 
 class AppTestFloatHex:
     def w_identical(self, x, y):
diff --git a/pypy/objspace/std/test/test_longobject.py b/pypy/objspace/std/test/test_longobject.py
--- a/pypy/objspace/std/test/test_longobject.py
+++ b/pypy/objspace/std/test/test_longobject.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
 import py
 import sys
 from pypy.objspace.std import longobject as lobj
@@ -318,3 +319,7 @@
         class A(int): pass
         b = A(5).real
         assert type(b) is int
+
+    def test_long_from_unicode(self):
+        s = '\U0001D7CF\U0001D7CE' # 𝟏𝟎
+        assert int(s) == 10
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -66,33 +66,31 @@
 
 registerimplementation(W_UnicodeObject)
 
-# Helper for converting int/long
+# Helper for converting int/long. This is called only from
+# {int,long,float}type.descr__new__. In the default branch this is implemented
+# using the same logic as PyUnicode_EncodeDecimal, as CPython 2.7 does.
+#
+# In CPython 3 the call to PyUnicode_EncodeDecimal has been replaced by a call
+# to PyUnicode_TransformDecimalToASCII, which is much simpler. Here, we do the
+# equivalent.
+#
+# Note that, unlike the default branch, we return a *unicode* RPython string.
 def unicode_to_decimal_w(space, w_unistr):
     if not isinstance(w_unistr, W_UnicodeObject):
         raise operationerrfmt(space.w_TypeError,
                               "expected unicode, got '%s'",
                               space.type(w_unistr).getname(space))
     unistr = w_unistr._value
-    result = ['\0'] * len(unistr)
-    digits = [ '0', '1', '2', '3', '4',
-               '5', '6', '7', '8', '9']
+    result = [u'\0'] * len(unistr)
     for i in xrange(len(unistr)):
         uchr = ord(unistr[i])
-        if unicodedb.isspace(uchr):
-            result[i] = ' '
-            continue
-        try:
-            result[i] = digits[unicodedb.decimal(uchr)]
-        except KeyError:
-            if 0 < uchr < 256:
-                result[i] = chr(uchr)
-            else:
-                w_encoding = space.wrap('decimal')
-                w_start = space.wrap(i)
-                w_end = space.wrap(i+1)
-                w_reason = space.wrap('invalid decimal Unicode string')
-                raise OperationError(space.w_UnicodeEncodeError, space.newtuple([w_encoding, w_unistr, w_start, w_end, w_reason]))
-    return ''.join(result)
+        if uchr > 127:
+            try:
+                uchr = ord(u'0') + unicodedb.decimal(uchr)
+            except KeyError:
+                pass
+        result[i] = unichr(uchr)
+    return u''.join(result)
 
 def str__Unicode(space, w_uni):
     if space.is_w(space.type(w_uni), space.w_unicode):
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy py3k: try hard to give good error messages when we are unable to convert a string to int() or float(). To do so, we do the parsing directly in unicode instead of trying to convert to ASCII and do the parsing there.

Reply via email to