Author: Antonio Cuni <[email protected]>
Branch: py3k
Changeset: r56159:3e5f50aa9403
Date: 2012-07-18 16:09 +0200
http://bitbucket.org/pypy/pypy/changeset/3e5f50aa9403/
Log: try hard to give good error messages when we are unable to convert a
string to int() or float(). To do so, we do the parsing directly in
unicode instead of trying to convert to ASCII and do the parsing
there.
diff --git a/pypy/objspace/std/longtype.py b/pypy/objspace/std/longtype.py
--- a/pypy/objspace/std/longtype.py
+++ b/pypy/objspace/std/longtype.py
@@ -34,6 +34,7 @@
return string_to_w_long(space, w_longtype,
unicode_to_decimal_w(space, w_value))
elif space.isinstance_w(w_value, space.w_bytearray):
+ # XXX: convert to unicode
return string_to_w_long(space, w_longtype,
space.bufferstr_w(w_value))
else:
# otherwise, use the __int__() or the __trunc__ methods
diff --git a/pypy/objspace/std/strutil.py b/pypy/objspace/std/strutil.py
--- a/pypy/objspace/std/strutil.py
+++ b/pypy/objspace/std/strutil.py
@@ -2,6 +2,7 @@
Pure Python implementation of string utilities.
"""
+from pypy.rlib.objectmodel import enforceargs
from pypy.rlib.rarithmetic import ovfcheck
from pypy.rlib.rfloat import rstring_to_float, INFINITY, NAN
from pypy.rlib.rbigint import rbigint, parse_digit_string
@@ -11,18 +12,20 @@
# XXX factor more functions out of stringobject.py.
# This module is independent from PyPy.
+@enforceargs(unicode)
def strip_spaces(s):
# XXX this is not locale-dependent
p = 0
q = len(s)
- while p < q and s[p] in ' \f\n\r\t\v':
+ while p < q and s[p] in u' \f\n\r\t\v':
p += 1
- while p < q and s[q-1] in ' \f\n\r\t\v':
+ while p < q and s[q-1] in u' \f\n\r\t\v':
q -= 1
assert q >= p # annotator hint, don't remove
return s[p:q]
class ParseStringError(Exception):
+ @enforceargs(None, unicode)
def __init__(self, msg):
self.msg = msg
@@ -34,39 +37,40 @@
class NumberStringParser:
def error(self):
- raise ParseStringError("invalid literal for %s() with base %d: '%s'" %
+ raise ParseStringError(u"invalid literal for %s() with base %d: '%s'" %
(self.fname, self.original_base, self.literal))
+ @enforceargs(None, unicode, unicode, int, unicode)
def __init__(self, s, literal, base, fname):
self.literal = literal
self.fname = fname
sign = 1
- if s.startswith('-'):
+ if s.startswith(u'-'):
sign = -1
s = strip_spaces(s[1:])
- elif s.startswith('+'):
+ elif s.startswith(u'+'):
s = strip_spaces(s[1:])
self.sign = sign
self.original_base = base
if base == 0:
- if s.startswith('0x') or s.startswith('0X'):
+ if s.startswith(u'0x') or s.startswith(u'0X'):
base = 16
- elif s.startswith('0b') or s.startswith('0B'):
+ elif s.startswith(u'0b') or s.startswith(u'0B'):
base = 2
- elif s.startswith('0'): # also covers the '0o' case
+ elif s.startswith(u'0'): # also covers the '0o' case
base = 8
else:
base = 10
elif base < 2 or base > 36:
- raise ParseStringError, "%s() base must be >= 2 and <= 36" %
(fname,)
+ raise ParseStringError, u"%s() base must be >= 2 and <= 36" %
(fname,)
self.base = base
- if base == 16 and (s.startswith('0x') or s.startswith('0X')):
+ if base == 16 and (s.startswith(u'0x') or s.startswith(u'0X')):
s = s[2:]
- if base == 8 and (s.startswith('0o') or s.startswith('0O')):
+ if base == 8 and (s.startswith(u'0o') or s.startswith(u'0O')):
s = s[2:]
- if base == 2 and (s.startswith('0b') or s.startswith('0B')):
+ if base == 2 and (s.startswith(u'0b') or s.startswith(u'0B')):
s = s[2:]
if not s:
self.error()
@@ -81,12 +85,12 @@
if self.i < self.n:
c = self.s[self.i]
digit = ord(c)
- if '0' <= c <= '9':
- digit -= ord('0')
- elif 'A' <= c <= 'Z':
- digit = (digit - ord('A')) + 10
- elif 'a' <= c <= 'z':
- digit = (digit - ord('a')) + 10
+ if u'0' <= c <= u'9':
+ digit -= ord(u'0')
+ elif u'A' <= c <= u'Z':
+ digit = (digit - ord(u'A')) + 10
+ elif u'a' <= c <= u'z':
+ digit = (digit - ord(u'a')) + 10
else:
self.error()
if digit >= self.base:
@@ -103,7 +107,7 @@
Raises ParseStringOverflowError in case the result does not fit.
"""
s = literal = strip_spaces(s)
- p = NumberStringParser(s, literal, base, 'int')
+ p = NumberStringParser(s, literal, base, u'int')
base = p.base
result = 0
while True:
@@ -125,10 +129,10 @@
and returns an rbigint."""
if parser is None:
s = literal = strip_spaces(s)
- if (s.endswith('l') or s.endswith('L')) and base < 22:
+ if (s.endswith(u'l') or s.endswith(u'L')) and base < 22:
# in base 22 and above, 'L' is a valid digit! try: long('L',22)
s = s[:-1]
- p = NumberStringParser(s, literal, base, 'long')
+ p = NumberStringParser(s, literal, base, u'long')
else:
p = parser
return parse_digit_string(p)
@@ -155,6 +159,7 @@
del calc_mantissa_bits
MANTISSA_DIGITS = len(str( (1L << MANTISSA_BITS)-1 )) + 1
+@enforceargs(unicode)
def string_to_float(s):
"""
Conversion of string to float.
@@ -167,22 +172,25 @@
s = strip_spaces(s)
if not s:
- raise ParseStringError("empty string for float()")
+ raise ParseStringError(u"empty string for float()")
low = s.lower()
- if low == "-inf" or low == "-infinity":
+ if low == u"-inf" or low == u"-infinity":
return -INFINITY
- elif low == "inf" or low == "+inf":
+ elif low == u"inf" or low == u"+inf":
return INFINITY
- elif low == "infinity" or low == "+infinity":
+ elif low == u"infinity" or low == u"+infinity":
return INFINITY
- elif low == "nan" or low == "+nan":
+ elif low == u"nan" or low == u"+nan":
return NAN
- elif low == "-nan":
+ elif low == u"-nan":
return -NAN
+ # rstring_to_float only supports byte strings, but we have a unicode
+ # string here. Do as CPython does: encode it as UTF-8
+ mystring = s.encode('utf-8')
try:
- return rstring_to_float(s)
+ return rstring_to_float(mystring)
except ValueError:
- raise ParseStringError("invalid literal for float()")
+ raise ParseStringError(u"invalid literal for float()")
diff --git a/pypy/objspace/std/test/test_floatobject.py
b/pypy/objspace/std/test/test_floatobject.py
--- a/pypy/objspace/std/test/test_floatobject.py
+++ b/pypy/objspace/std/test/test_floatobject.py
@@ -1,3 +1,5 @@
+# -*- encoding: utf-8 -*-
+
from pypy.objspace.std import floatobject as fobj
from pypy.objspace.std.multimethod import FailedToImplement
import py, sys
@@ -439,6 +441,10 @@
b = A(5).real
assert type(b) is float
+ def test_float_from_unicode(self):
+ s = '\U0001D7CF\U0001D7CE.4' # 𝟏𝟎.4
+ assert float(s) == 10.4
+
class AppTestFloatHex:
def w_identical(self, x, y):
diff --git a/pypy/objspace/std/test/test_longobject.py
b/pypy/objspace/std/test/test_longobject.py
--- a/pypy/objspace/std/test/test_longobject.py
+++ b/pypy/objspace/std/test/test_longobject.py
@@ -1,3 +1,4 @@
+# -*- encoding: utf-8 -*-
import py
import sys
from pypy.objspace.std import longobject as lobj
@@ -318,3 +319,7 @@
class A(int): pass
b = A(5).real
assert type(b) is int
+
+ def test_long_from_unicode(self):
+ s = '\U0001D7CF\U0001D7CE' # 𝟏𝟎
+ assert int(s) == 10
diff --git a/pypy/objspace/std/unicodeobject.py
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -66,33 +66,31 @@
registerimplementation(W_UnicodeObject)
-# Helper for converting int/long
+# Helper for converting int/long. This is called only from
+# {int,long,float}type.descr__new__. In the default branch it is implemented
+# using the same logic as PyUnicode_EncodeDecimal, as CPython 2.7 does.
+#
+# In CPython 3 the call to PyUnicode_EncodeDecimal has been replaced by a call
+# to PyUnicode_TransformDecimalToASCII, which is much simpler. Here, we do the
+# equivalent.
+#
+# Note that, unlike the default branch, we return a *unicode* RPython string.
def unicode_to_decimal_w(space, w_unistr):
if not isinstance(w_unistr, W_UnicodeObject):
raise operationerrfmt(space.w_TypeError,
"expected unicode, got '%s'",
space.type(w_unistr).getname(space))
unistr = w_unistr._value
- result = ['\0'] * len(unistr)
- digits = [ '0', '1', '2', '3', '4',
- '5', '6', '7', '8', '9']
+ result = [u'\0'] * len(unistr)
for i in xrange(len(unistr)):
uchr = ord(unistr[i])
- if unicodedb.isspace(uchr):
- result[i] = ' '
- continue
- try:
- result[i] = digits[unicodedb.decimal(uchr)]
- except KeyError:
- if 0 < uchr < 256:
- result[i] = chr(uchr)
- else:
- w_encoding = space.wrap('decimal')
- w_start = space.wrap(i)
- w_end = space.wrap(i+1)
- w_reason = space.wrap('invalid decimal Unicode string')
- raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([w_encoding, w_unistr, w_start, w_end, w_reason]))
- return ''.join(result)
+ if uchr > 127:
+ try:
+ uchr = ord(u'0') + unicodedb.decimal(uchr)
+ except KeyError:
+ pass
+ result[i] = unichr(uchr)
+ return u''.join(result)
def str__Unicode(space, w_uni):
if space.is_w(space.type(w_uni), space.w_unicode):
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit