Author: Antonio Cuni <[email protected]>
Branch: py3k
Changeset: r56159:3e5f50aa9403
Date: 2012-07-18 16:09 +0200
http://bitbucket.org/pypy/pypy/changeset/3e5f50aa9403/

Log:    try hard to give good error messages when we are unable to convert a
        string to int() or float(). To do so, we do the parsing directly in
        unicode instead of trying to convert to ASCII and do the parsing
        there.

diff --git a/pypy/objspace/std/longtype.py b/pypy/objspace/std/longtype.py
--- a/pypy/objspace/std/longtype.py
+++ b/pypy/objspace/std/longtype.py
@@ -34,6 +34,7 @@
             return string_to_w_long(space, w_longtype,
                                     unicode_to_decimal_w(space, w_value))
         elif space.isinstance_w(w_value, space.w_bytearray):
+            # XXX: convert to unicode
             return string_to_w_long(space, w_longtype, 
space.bufferstr_w(w_value))
         else:
             # otherwise, use the __int__() or the __trunc__ methods
diff --git a/pypy/objspace/std/strutil.py b/pypy/objspace/std/strutil.py
--- a/pypy/objspace/std/strutil.py
+++ b/pypy/objspace/std/strutil.py
@@ -2,6 +2,7 @@
 Pure Python implementation of string utilities.
 """
 
+from pypy.rlib.objectmodel import enforceargs
 from pypy.rlib.rarithmetic import ovfcheck
 from pypy.rlib.rfloat import rstring_to_float, INFINITY, NAN
 from pypy.rlib.rbigint import rbigint, parse_digit_string
@@ -11,18 +12,20 @@
 # XXX factor more functions out of stringobject.py.
 # This module is independent from PyPy.
 
+@enforceargs(unicode)
 def strip_spaces(s):
     # XXX this is not locale-dependent
     p = 0
     q = len(s)
-    while p < q and s[p] in ' \f\n\r\t\v':
+    while p < q and s[p] in u' \f\n\r\t\v':
         p += 1
-    while p < q and s[q-1] in ' \f\n\r\t\v':
+    while p < q and s[q-1] in u' \f\n\r\t\v':
         q -= 1
     assert q >= p     # annotator hint, don't remove
     return s[p:q]
 
 class ParseStringError(Exception):
+    @enforceargs(None, unicode)
     def __init__(self, msg):
         self.msg = msg
 
@@ -34,39 +37,40 @@
 class NumberStringParser:
 
     def error(self):
-        raise ParseStringError("invalid literal for %s() with base %d: '%s'" %
+        raise ParseStringError(u"invalid literal for %s() with base %d: '%s'" %
                                (self.fname, self.original_base, self.literal))
 
+    @enforceargs(None, unicode, unicode, int, unicode)
     def __init__(self, s, literal, base, fname):
         self.literal = literal
         self.fname = fname
         sign = 1
-        if s.startswith('-'):
+        if s.startswith(u'-'):
             sign = -1
             s = strip_spaces(s[1:])
-        elif s.startswith('+'):
+        elif s.startswith(u'+'):
             s = strip_spaces(s[1:])
         self.sign = sign
         self.original_base = base
 
         if base == 0:
-            if s.startswith('0x') or s.startswith('0X'):
+            if s.startswith(u'0x') or s.startswith(u'0X'):
                 base = 16
-            elif s.startswith('0b') or s.startswith('0B'):
+            elif s.startswith(u'0b') or s.startswith(u'0B'):
                 base = 2
-            elif s.startswith('0'): # also covers the '0o' case
+            elif s.startswith(u'0'): # also covers the '0o' case
                 base = 8
             else:
                 base = 10
         elif base < 2 or base > 36:
-            raise ParseStringError, "%s() base must be >= 2 and <= 36" % 
(fname,)
+            raise ParseStringError, u"%s() base must be >= 2 and <= 36" % 
(fname,)
         self.base = base
 
-        if base == 16 and (s.startswith('0x') or s.startswith('0X')):
+        if base == 16 and (s.startswith(u'0x') or s.startswith(u'0X')):
             s = s[2:]
-        if base == 8 and (s.startswith('0o') or s.startswith('0O')):
+        if base == 8 and (s.startswith(u'0o') or s.startswith(u'0O')):
             s = s[2:]
-        if base == 2 and (s.startswith('0b') or s.startswith('0B')):
+        if base == 2 and (s.startswith(u'0b') or s.startswith(u'0B')):
             s = s[2:]
         if not s:
             self.error()
@@ -81,12 +85,12 @@
         if self.i < self.n:
             c = self.s[self.i]
             digit = ord(c)
-            if '0' <= c <= '9':
-                digit -= ord('0')
-            elif 'A' <= c <= 'Z':
-                digit = (digit - ord('A')) + 10
-            elif 'a' <= c <= 'z':
-                digit = (digit - ord('a')) + 10
+            if u'0' <= c <= u'9':
+                digit -= ord(u'0')
+            elif u'A' <= c <= u'Z':
+                digit = (digit - ord(u'A')) + 10
+            elif u'a' <= c <= u'z':
+                digit = (digit - ord(u'a')) + 10
             else:
                 self.error()
             if digit >= self.base:
@@ -103,7 +107,7 @@
     Raises ParseStringOverflowError in case the result does not fit.
     """
     s = literal = strip_spaces(s)
-    p = NumberStringParser(s, literal, base, 'int')
+    p = NumberStringParser(s, literal, base, u'int')
     base = p.base
     result = 0
     while True:
@@ -125,10 +129,10 @@
     and returns an rbigint."""
     if parser is None:
         s = literal = strip_spaces(s)
-        if (s.endswith('l') or s.endswith('L')) and base < 22:
+        if (s.endswith(u'l') or s.endswith(u'L')) and base < 22:
             # in base 22 and above, 'L' is a valid digit!  try: long('L',22)
             s = s[:-1]
-        p = NumberStringParser(s, literal, base, 'long')
+        p = NumberStringParser(s, literal, base, u'long')
     else:
         p = parser
     return parse_digit_string(p)
@@ -155,6 +159,7 @@
 del calc_mantissa_bits
 MANTISSA_DIGITS = len(str( (1L << MANTISSA_BITS)-1 )) + 1
 
+@enforceargs(unicode)
 def string_to_float(s):
     """
     Conversion of string to float.
@@ -167,22 +172,25 @@
     s = strip_spaces(s)
 
     if not s:
-        raise ParseStringError("empty string for float()")
+        raise ParseStringError(u"empty string for float()")
 
 
     low = s.lower()
-    if low == "-inf" or low == "-infinity":
+    if low == u"-inf" or low == u"-infinity":
         return -INFINITY
-    elif low == "inf" or low == "+inf":
+    elif low == u"inf" or low == u"+inf":
         return INFINITY
-    elif low == "infinity" or low == "+infinity":
+    elif low == u"infinity" or low == u"+infinity":
         return INFINITY
-    elif low == "nan" or low == "+nan":
+    elif low == u"nan" or low == u"+nan":
         return NAN
-    elif low == "-nan":
+    elif low == u"-nan":
         return -NAN
 
+    # rstring_to_float only supports byte strings, but we have a unicode
+    # string here. Do as CPython does: encode it as UTF-8.
+    mystring = s.encode('utf-8')
     try:
-        return rstring_to_float(s)
+        return rstring_to_float(mystring)
     except ValueError:
-        raise ParseStringError("invalid literal for float()")
+        raise ParseStringError(u"invalid literal for float()")
diff --git a/pypy/objspace/std/test/test_floatobject.py b/pypy/objspace/std/test/test_floatobject.py
--- a/pypy/objspace/std/test/test_floatobject.py
+++ b/pypy/objspace/std/test/test_floatobject.py
@@ -1,3 +1,5 @@
+# -*- encoding: utf-8 -*-
+
 from pypy.objspace.std import floatobject as fobj
 from pypy.objspace.std.multimethod import FailedToImplement
 import py, sys
@@ -439,6 +441,10 @@
         b = A(5).real
         assert type(b) is float
 
+    def test_float_from_unicode(self):
+        s = '\U0001D7CF\U0001D7CE.4' # 𝟏𝟎.4
+        assert float(s) == 10.4
+
 
 class AppTestFloatHex:
     def w_identical(self, x, y):
diff --git a/pypy/objspace/std/test/test_longobject.py b/pypy/objspace/std/test/test_longobject.py
--- a/pypy/objspace/std/test/test_longobject.py
+++ b/pypy/objspace/std/test/test_longobject.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
 import py
 import sys
 from pypy.objspace.std import longobject as lobj
@@ -318,3 +319,7 @@
         class A(int): pass
         b = A(5).real
         assert type(b) is int
+
+    def test_long_from_unicode(self):
+        s = '\U0001D7CF\U0001D7CE' # 𝟏𝟎
+        assert int(s) == 10
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -66,33 +66,31 @@
 
 registerimplementation(W_UnicodeObject)
 
-# Helper for converting int/long
+# Helper for converting int/long. This is called only from
+# {int,long,float}type.descr__new__. In the default branch it is implemented
+# using the same logic as PyUnicode_EncodeDecimal, as CPython 2.7 does.
+#
+# In CPython 3 the call to PyUnicode_EncodeDecimal has been replaced by a call
+# to PyUnicode_TransformDecimalToASCII, which is much simpler. Here, we do the
+# equivalent.
+#
+# Note that, unlike the default branch, we return a *unicode* RPython string.
 def unicode_to_decimal_w(space, w_unistr):
     if not isinstance(w_unistr, W_UnicodeObject):
         raise operationerrfmt(space.w_TypeError,
                               "expected unicode, got '%s'",
                               space.type(w_unistr).getname(space))
     unistr = w_unistr._value
-    result = ['\0'] * len(unistr)
-    digits = [ '0', '1', '2', '3', '4',
-               '5', '6', '7', '8', '9']
+    result = [u'\0'] * len(unistr)
     for i in xrange(len(unistr)):
         uchr = ord(unistr[i])
-        if unicodedb.isspace(uchr):
-            result[i] = ' '
-            continue
-        try:
-            result[i] = digits[unicodedb.decimal(uchr)]
-        except KeyError:
-            if 0 < uchr < 256:
-                result[i] = chr(uchr)
-            else:
-                w_encoding = space.wrap('decimal')
-                w_start = space.wrap(i)
-                w_end = space.wrap(i+1)
-                w_reason = space.wrap('invalid decimal Unicode string')
-                raise OperationError(space.w_UnicodeEncodeError, 
space.newtuple([w_encoding, w_unistr, w_start, w_end, w_reason]))
-    return ''.join(result)
+        if uchr > 127:
+            try:
+                uchr = ord(u'0') + unicodedb.decimal(uchr)
+            except KeyError:
+                pass
+        result[i] = unichr(uchr)
+    return u''.join(result)
 
 def str__Unicode(space, w_uni):
     if space.is_w(space.type(w_uni), space.w_unicode):
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to