Author: Ronan Lamy <[email protected]>
Branch: unicode-utf8
Changeset: r93203:290c2d5ff0bb
Date: 2017-11-28 19:23 +0000
http://bitbucket.org/pypy/pypy/changeset/290c2d5ff0bb/
Log: Merge branch 'utf8-io': fix the _io module
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -17,20 +17,20 @@
if len(self.data) > newlength:
self.data = self.data[:newlength]
if len(self.data) < newlength:
- self.data.extend([u'\0'] * (newlength - len(self.data)))
+ self.data.extend(['\0'] * (newlength - len(self.data)))
def read(self, size):
start = self.pos
available = len(self.data) - start
if available <= 0:
- return u''
+ return ''
if size >= 0 and size <= available:
end = start + size
else:
end = len(self.data)
assert 0 <= start <= end
self.pos = end
- return u''.join(self.data[start:end])
+ return ''.join(self.data[start:end])
def _convert_limit(self, limit):
if limit < 0 or limit > len(self.data) - self.pos:
@@ -58,7 +58,7 @@
else:
break
self.pos = pos
- result = u''.join(self.data[start:pos])
+ result = ''.join(self.data[start:pos])
return result
def readline(self, marker, limit):
@@ -79,7 +79,7 @@
if not found:
pos = end
self.pos = pos
- result = u''.join(self.data[start:pos])
+ result = ''.join(self.data[start:pos])
return result
def write(self, string):
@@ -99,7 +99,7 @@
self.resize(size)
def getvalue(self):
- return u''.join(self.data)
+ return ''.join(self.data)
class W_StringIO(W_TextIOBase):
@@ -118,10 +118,10 @@
if space.is_w(w_newline, space.w_None):
newline = None
else:
- newline = space.unicode_w(w_newline)
+ newline = space.utf8_w(w_newline)
- if (newline is not None and newline != u"" and newline != u"\n" and
- newline != u"\r" and newline != u"\r\n"):
+ if (newline is not None and newline != "" and newline != "\n" and
+ newline != "\r" and newline != "\r\n"):
# Not using oefmt() because I don't know how to use it
# with unicode
raise OperationError(space.w_ValueError,
@@ -131,9 +131,9 @@
)
if newline is not None:
self.readnl = newline
- self.readuniversal = newline is None or newline == u""
+ self.readuniversal = newline is None or newline == ""
self.readtranslate = newline is None
- if newline and newline[0] == u"\r":
+ if newline and newline[0] == "\r":
self.writenl = newline
if self.readuniversal:
self.w_decoder = space.call_function(
@@ -152,7 +152,7 @@
if self.readnl is None:
w_readnl = space.w_None
else:
- w_readnl = space.str(space.newunicode(self.readnl)) # YYY
+ w_readnl = space.str(space.new_from_utf8(self.readnl)) # YYY
return space.newtuple([
w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
])
@@ -179,7 +179,7 @@
# because the string value in the state tuple has already been
# translated once by __init__. So we do not take any chance and replace
# object's buffer completely
- initval = space.unicode_w(w_initval)
+ initval = space.utf8_w(w_initval)
pos = space.getindex_w(w_pos, space.w_TypeError)
if pos < 0:
raise oefmt(space.w_ValueError,
@@ -215,8 +215,8 @@
if self.writenl:
w_decoded = space.call_method(
w_decoded, "replace",
- space.newtext("\n"), space.newunicode(self.writenl))
- string = space.unicode_w(w_decoded)
+ space.newtext("\n"), space.new_from_utf8(self.writenl))
+ string = space.utf8_w(w_decoded)
if string:
self.buf.write(string)
@@ -225,7 +225,7 @@
def read_w(self, space, w_size=None):
self._check_closed(space)
size = convert_size(space, w_size)
- return space.newunicode(self.buf.read(size))
+ return space.new_from_utf8(self.buf.read(size))
def readline_w(self, space, w_limit=None):
self._check_closed(space)
@@ -235,11 +235,11 @@
else:
if self.readtranslate:
# Newlines are already translated, only search for \n
- newline = u'\n'
+ newline = '\n'
else:
newline = self.readnl
result = self.buf.readline(newline, limit)
- return space.newunicode(result)
+ return space.new_from_utf8(result)
@unwrap_spec(pos=int, mode=int)
@@ -276,7 +276,7 @@
def getvalue_w(self, space):
self._check_closed(space)
- return space.newunicode(self.buf.getvalue())
+ return space.new_from_utf8(self.buf.getvalue())
def readable_w(self, space):
self._check_closed(space)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -11,7 +11,8 @@
from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong
from rpython.rlib.rbigint import rbigint
from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import FLAG_ASCII, check_utf8
+from rpython.rlib.rutf8 import (
+ FLAG_ASCII, check_utf8, next_codepoint_pos, codepoints_in_utf8)
STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -303,7 +304,7 @@
def set(self, space, w_decoded):
check_decoded(space, w_decoded)
- self.text = space.unicode_w(w_decoded)
+ self.text = space.utf8_w(w_decoded)
self.pos = 0
def reset(self):
@@ -312,7 +313,7 @@
def get_chars(self, size):
if self.text is None:
- return u""
+ return ""
available = len(self.text) - self.pos
if size < 0 or size > available:
@@ -341,7 +342,7 @@
if self.exhausted():
raise StopIteration
ch = self.text[self.pos]
- self.pos += 1
+ self.pos = next_codepoint_pos(self.text, self.pos)
return ch
def peek_char(self):
@@ -362,16 +363,16 @@
ch = self.next_char()
except StopIteration:
return False
- if ch == u'\n':
+ if ch == '\n':
return True
- if ch == u'\r':
+ if ch == '\r':
if scanned >= limit:
return False
try:
ch = self.peek_char()
except StopIteration:
return False
- if ch == u'\n':
+ if ch == '\n':
self.next_char()
return True
else:
@@ -388,11 +389,11 @@
except StopIteration:
return False
scanned += 1
- if ch == u'\r':
+ if ch == '\r':
if scanned >= limit:
return False
try:
- if self.peek_char() == u'\n':
+ if self.peek_char() == '\n':
self.next_char()
return True
except StopIteration:
@@ -420,6 +421,7 @@
if not space.isinstance_w(w_decoded, space.w_unicode):
msg = "decoder should return a string result, not '%T'"
raise oefmt(space.w_TypeError, msg, w_decoded)
+ return w_decoded
class W_TextIOWrapper(W_TextIOBase):
@@ -705,11 +707,11 @@
else:
if self.readtranslate:
# Newlines are already translated, only search for \n
- newline = u'\n'
+ newline = '\n'
else:
# Non-universal mode.
newline = self.readnl
- if newline == u'\r\n':
+ if newline == '\r\n':
return self.decoded.find_crlf(limit)
else:
return self.decoded.find_char(newline[0], limit)
@@ -945,13 +947,14 @@
w_decoded = space.call_method(self.w_decoder, "decode",
w_chunk,
space.newbool(bool(cookie.need_eof)))
- self.decoded.set(space, w_decoded)
+ w_decoded = check_decoded(space, w_decoded)
# Skip chars_to_skip of the decoded characters
- if len(self.decoded.text) < cookie.chars_to_skip:
+ if space.len_w(w_decoded) < cookie.chars_to_skip:
raise oefmt(space.w_IOError,
"can't restore logical file position")
- self.decoded.pos = cookie.chars_to_skip
+ self.decoded.set(space, w_decoded)
+ self.decoded.pos = w_decoded._index_to_byte(cookie.chars_to_skip)
else:
self.snapshot = PositionSnapshot(cookie.dec_flags, "")
@@ -963,10 +966,8 @@
def tell_w(self, space):
self._check_closed(space)
-
if not self.seekable:
raise oefmt(space.w_IOError, "underlying stream is not seekable")
-
if not self.telling:
raise oefmt(space.w_IOError,
"telling position disabled by next() call")
@@ -992,7 +993,8 @@
# We haven't moved from the snapshot point.
return space.newlong_from_rbigint(cookie.pack())
- chars_to_skip = self.decoded.pos
+ chars_to_skip = codepoints_in_utf8(
+ self.decoded.text, end=self.decoded.pos)
# Starting from the snapshot position, we will walk the decoder
# forward until it gives us enough decoded characters.
@@ -1036,14 +1038,14 @@
# We didn't get enough decoded data; signal EOF to get more.
w_decoded = space.call_method(self.w_decoder, "decode",
space.newbytes(""),
- space.newint(1)) # final=1
+ space.newint(1)) # final=1
check_decoded(space, w_decoded)
- chars_decoded += len(space.unicode_w(w_decoded))
+ chars_decoded += space.len_w(w_decoded)
cookie.need_eof = 1
if chars_decoded < chars_to_skip:
raise oefmt(space.w_IOError,
- "can't reconstruct logical file position")
+ "can't reconstruct logical file position")
finally:
space.call_method(self.w_decoder, "setstate", w_saved_state)
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -27,7 +27,8 @@
w_newline=space.newtext(mode))
lines = []
while True:
- line = space.unicode_w(w_textio.readline_w(space, space.newint(limit)))
+ w_line = w_textio.readline_w(space, space.newint(limit))
+ line = space.utf8_w(w_line).decode('utf-8')
if limit > 0:
assert len(line) <= limit
if line:
@@ -38,31 +39,27 @@
@given(st.text())
def test_read_buffer(text):
- buf = DecodeBuffer(text)
- assert buf.get_chars(-1) == text
+ buf = DecodeBuffer(text.encode('utf-8'))
+ assert buf.get_chars(-1) == text.encode('utf-8')
assert buf.exhausted()
@given(st.text(), st.lists(st.integers(min_value=0)))
def test_readn_buffer(text, sizes):
- buf = DecodeBuffer(text)
+ buf = DecodeBuffer(text.encode('utf-8'))
strings = []
for n in sizes:
s = buf.get_chars(n)
if not buf.exhausted():
- assert len(s) == n
+ assert len(s.decode('utf-8')) == n
else:
- assert len(s) <= n
+ assert len(s.decode('utf-8')) <= n
strings.append(s)
- assert ''.join(strings) == text[:sum(sizes)]
+ assert ''.join(strings) == text[:sum(sizes)].encode('utf-8')
@given(st.text())
def test_next_char(text):
- buf = DecodeBuffer(text)
- chars = []
- try:
- while True:
- chars.append(buf.next_char())
- except StopIteration:
- pass
+ buf = DecodeBuffer(text.encode('utf-8'))
+ for i in range(len(text)):
+ ch = buf.next_char()
+ assert ch == text[i].encode('utf-8')[0]
assert buf.exhausted()
- assert u''.join(chars) == text
diff --git a/pypy/module/_io/test/test_ztranslation.py b/pypy/module/_io/test/test_ztranslation.py
deleted file mode 100644
--- a/pypy/module/_io/test/test_ztranslation.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from pypy.objspace.fake.checkmodule import checkmodule
-
-def test_checkmodule():
- checkmodule('_io')
diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py
--- a/pypy/objspace/fake/objspace.py
+++ b/pypy/objspace/fake/objspace.py
@@ -212,6 +212,12 @@
def newutf8(self, x, l, f):
return w_some_obj()
+ def new_from_utf8(self, a):
+ return w_some_obj()
+
+ def newunicode(self, a):
+ return w_some_obj()
+
newtext = newbytes
newtext_or_none = newbytes
newfilename = newbytes
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -367,6 +367,12 @@
assert isinstance(utf8s, str)
return W_UnicodeObject(utf8s, length, flag)
+ def new_from_utf8(self, utf8s):
+ # XXX: kill me!
+ assert isinstance(utf8s, str)
+ length, flag = rutf8.check_utf8(utf8s, True)
+ return W_UnicodeObject(utf8s, length, flag)
+
def newfilename(self, s):
assert isinstance(s, str) # on pypy3, this decodes the byte string
return W_BytesObject(s) # with the filesystem encoding
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit