Author: fijal
Branch: unicode-utf8
Changeset: r93298:8d468e08f3fe
Date: 2017-12-07 18:03 +0200
http://bitbucket.org/pypy/pypy/changeset/8d468e08f3fe/
Log: whack a few more places, handle surrogates correctly
diff --git a/pypy/objspace/std/formatting.py b/pypy/objspace/std/formatting.py
--- a/pypy/objspace/std/formatting.py
+++ b/pypy/objspace/std/formatting.py
@@ -330,8 +330,7 @@
space = self.space
if do_unicode:
cp = rutf8.codepoint_at_pos(self.fmt, self.fmtpos - 1)
- flag = rutf8.get_flag_from_code(cp)
- w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1, flag)
+ w_s = space.newutf8(rutf8.unichr_as_utf8(cp), 1)
else:
cp = ord(self.fmt[self.fmtpos - 1])
w_s = space.newbytes(chr(cp))
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -170,7 +170,8 @@
def _istitle(self, ch):
return unicodedb.isupper(ch) or unicodedb.istitle(ch)
- def _isspace(self, ch):
+ @staticmethod
+ def _isspace(ch):
return unicodedb.isspace(ch)
def _isalpha(self, ch):
@@ -188,8 +189,8 @@
def _iscased(self, ch):
return unicodedb.iscased(ch)
- def _islinebreak(self, s, pos):
- return rutf8.islinebreak(s, pos)
+ def _islinebreak(self, ch):
+ return unicodedb.islinebreak(ch)
@staticmethod
@unwrap_spec(w_string=WrappedDefault(""))
@@ -610,7 +611,7 @@
while pos < length:
sol = pos
lgt = 0
- while pos < length and not self._islinebreak(value, pos):
+ while pos < length and not self._islinebreak(rutf8.codepoint_at_pos(value, pos)):
pos = rutf8.next_codepoint_pos(value, pos)
lgt += 1
eol = pos
@@ -792,7 +793,7 @@
if pos < 0:
return space.newtuple([self, self._empty(), self._empty()])
else:
- lgt, _ = rutf8.check_utf8(value, True, stop=pos)
+ lgt = rutf8.check_utf8(value, True, stop=pos)
return space.newtuple(
[W_UnicodeObject(value[0:pos], lgt), w_sub,
W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
@@ -810,7 +811,7 @@
if pos < 0:
return space.newtuple([self._empty(), self._empty(), self])
else:
- lgt, _ = rutf8.check_utf8(value, True, stop=pos)
+ lgt = rutf8.check_utf8(value, True, stop=pos)
return space.newtuple(
[W_UnicodeObject(value[0:pos], lgt), w_sub,
W_UnicodeObject(value[pos + len(sub._utf8):len(value)],
@@ -1087,7 +1088,10 @@
return space.newbytes(s)
if ((encoding is None and space.sys.defaultencoding == 'utf8') or
encoding == 'utf-8' or encoding == 'utf8' or encoding == 'UTF-8'):
- return space.newbytes(space.utf8_w(w_object))
+ utf8 = space.utf8_w(w_object)
+ if rutf8.has_surrogates(utf8):
+ utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+ return space.newbytes(utf8)
if w_encoder is None:
from pypy.module._codecs.interp_codecs import lookup_codec
w_encoder = space.getitem(lookup_codec(space, encoding),
space.newint(0))
@@ -1728,14 +1732,12 @@
result = ['\0'] * w_unistr._length
digits = ['0', '1', '2', '3', '4',
'5', '6', '7', '8', '9']
- i = 0
res_pos = 0
- while i < len(unistr):
- uchr = rutf8.codepoint_at_pos(unistr, i)
- if rutf8.isspace(unistr, i):
+ iter = rutf8.Utf8StringIterator(unistr)
+ for uchr in iter:
+ if W_UnicodeObject._isspace(uchr):
result[res_pos] = ' '
res_pos += 1
- i = rutf8.next_codepoint_pos(unistr, i)
continue
try:
result[res_pos] = digits[unicodedb.decimal(uchr)]
@@ -1744,14 +1746,14 @@
result[res_pos] = chr(uchr)
else:
w_encoding = space.newtext('decimal')
- w_start = space.newint(i)
- w_end = space.newint(i+1)
+ pos = iter.get_pos()
+ w_start = space.newint(pos)
+ w_end = space.newint(pos+1)
w_reason = space.newtext('invalid decimal Unicode string')
raise OperationError(space.w_UnicodeEncodeError,
space.newtuple([w_encoding, w_unistr,
w_start, w_end,
w_reason]))
- i = rutf8.next_codepoint_pos(unistr, i)
res_pos += 1
return ''.join(result)
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -410,6 +410,13 @@
assert pos - continuation_bytes >= 0
return pos - continuation_bytes
+def has_surrogates(utf8):
+ # XXX write a faster version maybe
+ for ch in Utf8StringIterator(utf8):
+ if 0xD800 <= ch <= 0xDBFF:
+ return True
+ return False
+
def reencode_utf8_with_surrogates(utf8):
""" Receiving valid UTF8 which contains surrogates, combine surrogate
pairs into correct UTF8 with pairs collpased. This is a rare case
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit