Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: unicode-utf8
Changeset: r93203:290c2d5ff0bb
Date: 2017-11-28 19:23 +0000
http://bitbucket.org/pypy/pypy/changeset/290c2d5ff0bb/
Log: Merge branch 'utf8-io': fix the _io module diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py --- a/pypy/module/_io/interp_stringio.py +++ b/pypy/module/_io/interp_stringio.py @@ -17,20 +17,20 @@ if len(self.data) > newlength: self.data = self.data[:newlength] if len(self.data) < newlength: - self.data.extend([u'\0'] * (newlength - len(self.data))) + self.data.extend(['\0'] * (newlength - len(self.data))) def read(self, size): start = self.pos available = len(self.data) - start if available <= 0: - return u'' + return '' if size >= 0 and size <= available: end = start + size else: end = len(self.data) assert 0 <= start <= end self.pos = end - return u''.join(self.data[start:end]) + return ''.join(self.data[start:end]) def _convert_limit(self, limit): if limit < 0 or limit > len(self.data) - self.pos: @@ -58,7 +58,7 @@ else: break self.pos = pos - result = u''.join(self.data[start:pos]) + result = ''.join(self.data[start:pos]) return result def readline(self, marker, limit): @@ -79,7 +79,7 @@ if not found: pos = end self.pos = pos - result = u''.join(self.data[start:pos]) + result = ''.join(self.data[start:pos]) return result def write(self, string): @@ -99,7 +99,7 @@ self.resize(size) def getvalue(self): - return u''.join(self.data) + return ''.join(self.data) class W_StringIO(W_TextIOBase): @@ -118,10 +118,10 @@ if space.is_w(w_newline, space.w_None): newline = None else: - newline = space.unicode_w(w_newline) + newline = space.utf8_w(w_newline) - if (newline is not None and newline != u"" and newline != u"\n" and - newline != u"\r" and newline != u"\r\n"): + if (newline is not None and newline != "" and newline != "\n" and + newline != "\r" and newline != "\r\n"): # Not using oefmt() because I don't know how to use it # with unicode raise OperationError(space.w_ValueError, @@ -131,9 +131,9 @@ ) if newline is not None: self.readnl = newline - self.readuniversal = newline is None or newline == u"" + self.readuniversal = 
newline is None or newline == "" self.readtranslate = newline is None - if newline and newline[0] == u"\r": + if newline and newline[0] == "\r": self.writenl = newline if self.readuniversal: self.w_decoder = space.call_function( @@ -152,7 +152,7 @@ if self.readnl is None: w_readnl = space.w_None else: - w_readnl = space.str(space.newunicode(self.readnl)) # YYY + w_readnl = space.str(space.new_from_utf8(self.readnl)) # YYY return space.newtuple([ w_initialval, w_readnl, space.newint(self.buf.pos), w_dict ]) @@ -179,7 +179,7 @@ # because the string value in the state tuple has already been # translated once by __init__. So we do not take any chance and replace # object's buffer completely - initval = space.unicode_w(w_initval) + initval = space.utf8_w(w_initval) pos = space.getindex_w(w_pos, space.w_TypeError) if pos < 0: raise oefmt(space.w_ValueError, @@ -215,8 +215,8 @@ if self.writenl: w_decoded = space.call_method( w_decoded, "replace", - space.newtext("\n"), space.newunicode(self.writenl)) - string = space.unicode_w(w_decoded) + space.newtext("\n"), space.new_from_utf8(self.writenl)) + string = space.utf8_w(w_decoded) if string: self.buf.write(string) @@ -225,7 +225,7 @@ def read_w(self, space, w_size=None): self._check_closed(space) size = convert_size(space, w_size) - return space.newunicode(self.buf.read(size)) + return space.new_from_utf8(self.buf.read(size)) def readline_w(self, space, w_limit=None): self._check_closed(space) @@ -235,11 +235,11 @@ else: if self.readtranslate: # Newlines are already translated, only search for \n - newline = u'\n' + newline = '\n' else: newline = self.readnl result = self.buf.readline(newline, limit) - return space.newunicode(result) + return space.new_from_utf8(result) @unwrap_spec(pos=int, mode=int) @@ -276,7 +276,7 @@ def getvalue_w(self, space): self._check_closed(space) - return space.newunicode(self.buf.getvalue()) + return space.new_from_utf8(self.buf.getvalue()) def readable_w(self, space): self._check_closed(space) 
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -11,7 +11,8 @@ from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong from rpython.rlib.rbigint import rbigint from rpython.rlib.rstring import StringBuilder -from rpython.rlib.rutf8 import FLAG_ASCII, check_utf8 +from rpython.rlib.rutf8 import ( + FLAG_ASCII, check_utf8, next_codepoint_pos, codepoints_in_utf8) STATE_ZERO, STATE_OK, STATE_DETACHED = range(3) @@ -303,7 +304,7 @@ def set(self, space, w_decoded): check_decoded(space, w_decoded) - self.text = space.unicode_w(w_decoded) + self.text = space.utf8_w(w_decoded) self.pos = 0 def reset(self): @@ -312,7 +313,7 @@ def get_chars(self, size): if self.text is None: - return u"" + return "" available = len(self.text) - self.pos if size < 0 or size > available: @@ -341,7 +342,7 @@ if self.exhausted(): raise StopIteration ch = self.text[self.pos] - self.pos += 1 + self.pos = next_codepoint_pos(self.text, self.pos) return ch def peek_char(self): @@ -362,16 +363,16 @@ ch = self.next_char() except StopIteration: return False - if ch == u'\n': + if ch == '\n': return True - if ch == u'\r': + if ch == '\r': if scanned >= limit: return False try: ch = self.peek_char() except StopIteration: return False - if ch == u'\n': + if ch == '\n': self.next_char() return True else: @@ -388,11 +389,11 @@ except StopIteration: return False scanned += 1 - if ch == u'\r': + if ch == '\r': if scanned >= limit: return False try: - if self.peek_char() == u'\n': + if self.peek_char() == '\n': self.next_char() return True except StopIteration: @@ -420,6 +421,7 @@ if not space.isinstance_w(w_decoded, space.w_unicode): msg = "decoder should return a string result, not '%T'" raise oefmt(space.w_TypeError, msg, w_decoded) + return w_decoded class W_TextIOWrapper(W_TextIOBase): @@ -705,11 +707,11 @@ else: if self.readtranslate: # Newlines are already translated, only search for 
\n - newline = u'\n' + newline = '\n' else: # Non-universal mode. newline = self.readnl - if newline == u'\r\n': + if newline == '\r\n': return self.decoded.find_crlf(limit) else: return self.decoded.find_char(newline[0], limit) @@ -945,13 +947,14 @@ w_decoded = space.call_method(self.w_decoder, "decode", w_chunk, space.newbool(bool(cookie.need_eof))) - self.decoded.set(space, w_decoded) + w_decoded = check_decoded(space, w_decoded) # Skip chars_to_skip of the decoded characters - if len(self.decoded.text) < cookie.chars_to_skip: + if space.len_w(w_decoded) < cookie.chars_to_skip: raise oefmt(space.w_IOError, "can't restore logical file position") - self.decoded.pos = cookie.chars_to_skip + self.decoded.set(space, w_decoded) + self.decoded.pos = w_decoded._index_to_byte(cookie.chars_to_skip) else: self.snapshot = PositionSnapshot(cookie.dec_flags, "") @@ -963,10 +966,8 @@ def tell_w(self, space): self._check_closed(space) - if not self.seekable: raise oefmt(space.w_IOError, "underlying stream is not seekable") - if not self.telling: raise oefmt(space.w_IOError, "telling position disabled by next() call") @@ -992,7 +993,8 @@ # We haven't moved from the snapshot point. return space.newlong_from_rbigint(cookie.pack()) - chars_to_skip = self.decoded.pos + chars_to_skip = codepoints_in_utf8( + self.decoded.text, end=self.decoded.pos) # Starting from the snapshot position, we will walk the decoder # forward until it gives us enough decoded characters. @@ -1036,14 +1038,14 @@ # We didn't get enough decoded data; signal EOF to get more. 
w_decoded = space.call_method(self.w_decoder, "decode", space.newbytes(""), - space.newint(1)) # final=1 + space.newint(1)) # final=1 check_decoded(space, w_decoded) - chars_decoded += len(space.unicode_w(w_decoded)) + chars_decoded += space.len_w(w_decoded) cookie.need_eof = 1 if chars_decoded < chars_to_skip: raise oefmt(space.w_IOError, - "can't reconstruct logical file position") + "can't reconstruct logical file position") finally: space.call_method(self.w_decoder, "setstate", w_saved_state) diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py --- a/pypy/module/_io/test/test_interp_textio.py +++ b/pypy/module/_io/test/test_interp_textio.py @@ -27,7 +27,8 @@ w_newline=space.newtext(mode)) lines = [] while True: - line = space.unicode_w(w_textio.readline_w(space, space.newint(limit))) + w_line = w_textio.readline_w(space, space.newint(limit)) + line = space.utf8_w(w_line).decode('utf-8') if limit > 0: assert len(line) <= limit if line: @@ -38,31 +39,27 @@ @given(st.text()) def test_read_buffer(text): - buf = DecodeBuffer(text) - assert buf.get_chars(-1) == text + buf = DecodeBuffer(text.encode('utf-8')) + assert buf.get_chars(-1) == text.encode('utf-8') assert buf.exhausted() @given(st.text(), st.lists(st.integers(min_value=0))) def test_readn_buffer(text, sizes): - buf = DecodeBuffer(text) + buf = DecodeBuffer(text.encode('utf-8')) strings = [] for n in sizes: s = buf.get_chars(n) if not buf.exhausted(): - assert len(s) == n + assert len(s.decode('utf-8')) == n else: - assert len(s) <= n + assert len(s.decode('utf-8')) <= n strings.append(s) - assert ''.join(strings) == text[:sum(sizes)] + assert ''.join(strings) == text[:sum(sizes)].encode('utf-8') @given(st.text()) def test_next_char(text): - buf = DecodeBuffer(text) - chars = [] - try: - while True: - chars.append(buf.next_char()) - except StopIteration: - pass + buf = DecodeBuffer(text.encode('utf-8')) + for i in range(len(text)): + ch = buf.next_char() + 
assert ch == text[i].encode('utf-8')[0] assert buf.exhausted() - assert u''.join(chars) == text diff --git a/pypy/module/_io/test/test_ztranslation.py b/pypy/module/_io/test/test_ztranslation.py deleted file mode 100644 --- a/pypy/module/_io/test/test_ztranslation.py +++ /dev/null @@ -1,4 +0,0 @@ -from pypy.objspace.fake.checkmodule import checkmodule - -def test_checkmodule(): - checkmodule('_io') diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py --- a/pypy/objspace/fake/objspace.py +++ b/pypy/objspace/fake/objspace.py @@ -212,6 +212,12 @@ def newutf8(self, x, l, f): return w_some_obj() + def new_from_utf8(self, a): + return w_some_obj() + + def newunicode(self, a): + return w_some_obj() + newtext = newbytes newtext_or_none = newbytes newfilename = newbytes diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -367,6 +367,12 @@ assert isinstance(utf8s, str) return W_UnicodeObject(utf8s, length, flag) + def new_from_utf8(self, utf8s): + # XXX: kill me! + assert isinstance(utf8s, str) + length, flag = rutf8.check_utf8(utf8s, True) + return W_UnicodeObject(utf8s, length, flag) + def newfilename(self, s): assert isinstance(s, str) # on pypy3, this decodes the byte string return W_BytesObject(s) # with the filesystem encoding _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit