[pypy-commit] pypy unicode-utf8: Merge branch 'utf8-io': fix the _io module

rlamy Tue, 28 Nov 2017 11:26:13 -0800

Author: Ronan Lamy <ronan.l...@gmail.com>
Branch: unicode-utf8
Changeset: r93203:290c2d5ff0bb
Date: 2017-11-28 19:23 +0000
http://bitbucket.org/pypy/pypy/changeset/290c2d5ff0bb/


Log:    Merge branch 'utf8-io': fix the _io module

diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -17,20 +17,20 @@
         if len(self.data) > newlength:
             self.data = self.data[:newlength]
         if len(self.data) < newlength:
-            self.data.extend([u'\0'] * (newlength - len(self.data)))
+            self.data.extend(['\0'] * (newlength - len(self.data)))
 
     def read(self, size):
         start = self.pos
         available = len(self.data) - start
         if available <= 0:
-            return u''
+            return ''
         if size >= 0 and size <= available:
             end = start + size
         else:
             end = len(self.data)
         assert 0 <= start <= end
         self.pos = end
-        return u''.join(self.data[start:end])
+        return ''.join(self.data[start:end])
 
     def _convert_limit(self, limit):
         if limit < 0 or limit > len(self.data) - self.pos:
@@ -58,7 +58,7 @@
                 else:
                     break
         self.pos = pos
-        result = u''.join(self.data[start:pos])
+        result = ''.join(self.data[start:pos])
         return result
 
     def readline(self, marker, limit):
@@ -79,7 +79,7 @@
         if not found:
             pos = end
         self.pos = pos
-        result = u''.join(self.data[start:pos])
+        result = ''.join(self.data[start:pos])
         return result
 
     def write(self, string):
@@ -99,7 +99,7 @@
             self.resize(size)
 
     def getvalue(self):
-        return u''.join(self.data)
+        return ''.join(self.data)
 
 
 class W_StringIO(W_TextIOBase):
@@ -118,10 +118,10 @@
         if space.is_w(w_newline, space.w_None):
             newline = None
         else:
-            newline = space.unicode_w(w_newline)
+            newline = space.utf8_w(w_newline)
 
-        if (newline is not None and newline != u"" and newline != u"\n" and
-                newline != u"\r" and newline != u"\r\n"):
+        if (newline is not None and newline != "" and newline != "\n" and
+                newline != "\r" and newline != "\r\n"):
             # Not using oefmt() because I don't know how to use it
             # with unicode
             raise OperationError(space.w_ValueError,
@@ -131,9 +131,9 @@
             )
         if newline is not None:
             self.readnl = newline
-        self.readuniversal = newline is None or newline == u""
+        self.readuniversal = newline is None or newline == ""
         self.readtranslate = newline is None
-        if newline and newline[0] == u"\r":
+        if newline and newline[0] == "\r":
             self.writenl = newline
         if self.readuniversal:
             self.w_decoder = space.call_function(
@@ -152,7 +152,7 @@
         if self.readnl is None:
             w_readnl = space.w_None
         else:
-            w_readnl = space.str(space.newunicode(self.readnl))  # YYY
+            w_readnl = space.str(space.new_from_utf8(self.readnl))  # YYY
         return space.newtuple([
             w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
         ])
@@ -179,7 +179,7 @@
         # because the string value in the state tuple has already been
         # translated once by __init__. So we do not take any chance and replace
         # object's buffer completely
-        initval = space.unicode_w(w_initval)
+        initval = space.utf8_w(w_initval)
         pos = space.getindex_w(w_pos, space.w_TypeError)
         if pos < 0:
             raise oefmt(space.w_ValueError,
@@ -215,8 +215,8 @@
         if self.writenl:
             w_decoded = space.call_method(
                 w_decoded, "replace",
-                space.newtext("\n"), space.newunicode(self.writenl))
-        string = space.unicode_w(w_decoded)
+                space.newtext("\n"), space.new_from_utf8(self.writenl))
+        string = space.utf8_w(w_decoded)
         if string:
             self.buf.write(string)
 
@@ -225,7 +225,7 @@
     def read_w(self, space, w_size=None):
         self._check_closed(space)
         size = convert_size(space, w_size)
-        return space.newunicode(self.buf.read(size))
+        return space.new_from_utf8(self.buf.read(size))
 
     def readline_w(self, space, w_limit=None):
         self._check_closed(space)
@@ -235,11 +235,11 @@
         else:
             if self.readtranslate:
                 # Newlines are already translated, only search for \n
-                newline = u'\n'
+                newline = '\n'
             else:
                 newline = self.readnl
             result = self.buf.readline(newline, limit)
-        return space.newunicode(result)
+        return space.new_from_utf8(result)
 
 
     @unwrap_spec(pos=int, mode=int)
@@ -276,7 +276,7 @@
 
     def getvalue_w(self, space):
         self._check_closed(space)
-        return space.newunicode(self.buf.getvalue())
+        return space.new_from_utf8(self.buf.getvalue())
 
     def readable_w(self, space):
         self._check_closed(space)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -11,7 +11,8 @@
 from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong
 from rpython.rlib.rbigint import rbigint
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import FLAG_ASCII, check_utf8
+from rpython.rlib.rutf8 import (
+    FLAG_ASCII, check_utf8, next_codepoint_pos, codepoints_in_utf8)
 
 
 STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -303,7 +304,7 @@
 
     def set(self, space, w_decoded):
         check_decoded(space, w_decoded)
-        self.text = space.unicode_w(w_decoded)
+        self.text = space.utf8_w(w_decoded)
         self.pos = 0
 
     def reset(self):
@@ -312,7 +313,7 @@
 
     def get_chars(self, size):
         if self.text is None:
-            return u""
+            return ""
 
         available = len(self.text) - self.pos
         if size < 0 or size > available:
@@ -341,7 +342,7 @@
         if self.exhausted():
             raise StopIteration
         ch = self.text[self.pos]
-        self.pos += 1
+        self.pos = next_codepoint_pos(self.text, self.pos)
         return ch
 
     def peek_char(self):
@@ -362,16 +363,16 @@
                 ch = self.next_char()
             except StopIteration:
                 return False
-            if ch == u'\n':
+            if ch == '\n':
                 return True
-            if ch == u'\r':
+            if ch == '\r':
                 if scanned >= limit:
                     return False
                 try:
                     ch = self.peek_char()
                 except StopIteration:
                     return False
-                if ch == u'\n':
+                if ch == '\n':
                     self.next_char()
                     return True
                 else:
@@ -388,11 +389,11 @@
             except StopIteration:
                 return False
             scanned += 1
-            if ch == u'\r':
+            if ch == '\r':
                 if scanned >= limit:
                     return False
                 try:
-                    if self.peek_char() == u'\n':
+                    if self.peek_char() == '\n':
                         self.next_char()
                         return True
                 except StopIteration:
@@ -420,6 +421,7 @@
     if not space.isinstance_w(w_decoded, space.w_unicode):
         msg = "decoder should return a string result, not '%T'"
         raise oefmt(space.w_TypeError, msg, w_decoded)
+    return w_decoded
 
 
 class W_TextIOWrapper(W_TextIOBase):
@@ -705,11 +707,11 @@
         else:
             if self.readtranslate:
                 # Newlines are already translated, only search for \n
-                newline = u'\n'
+                newline = '\n'
             else:
                 # Non-universal mode.
                 newline = self.readnl
-            if newline == u'\r\n':
+            if newline == '\r\n':
                 return self.decoded.find_crlf(limit)
             else:
                 return self.decoded.find_char(newline[0], limit)
@@ -945,13 +947,14 @@
 
             w_decoded = space.call_method(self.w_decoder, "decode",
                                           w_chunk, space.newbool(bool(cookie.need_eof)))
-            self.decoded.set(space, w_decoded)
+            w_decoded = check_decoded(space, w_decoded)
 
             # Skip chars_to_skip of the decoded characters
-            if len(self.decoded.text) < cookie.chars_to_skip:
+            if space.len_w(w_decoded) < cookie.chars_to_skip:
                 raise oefmt(space.w_IOError,
                             "can't restore logical file position")
-            self.decoded.pos = cookie.chars_to_skip
+            self.decoded.set(space, w_decoded)
+            self.decoded.pos = w_decoded._index_to_byte(cookie.chars_to_skip)
         else:
             self.snapshot = PositionSnapshot(cookie.dec_flags, "")
 
@@ -963,10 +966,8 @@
 
     def tell_w(self, space):
         self._check_closed(space)
-
         if not self.seekable:
             raise oefmt(space.w_IOError, "underlying stream is not seekable")
-
         if not self.telling:
             raise oefmt(space.w_IOError,
                         "telling position disabled by next() call")
@@ -992,7 +993,8 @@
             # We haven't moved from the snapshot point.
             return space.newlong_from_rbigint(cookie.pack())
 
-        chars_to_skip = self.decoded.pos
+        chars_to_skip = codepoints_in_utf8(
+            self.decoded.text, end=self.decoded.pos)
 
         # Starting from the snapshot position, we will walk the decoder
         # forward until it gives us enough decoded characters.
@@ -1036,14 +1038,14 @@
                 # We didn't get enough decoded data; signal EOF to get more.
                 w_decoded = space.call_method(self.w_decoder, "decode",
                                               space.newbytes(""),
-                                              space.newint(1)) # final=1
+                                              space.newint(1))  # final=1
                 check_decoded(space, w_decoded)
-                chars_decoded += len(space.unicode_w(w_decoded))
+                chars_decoded += space.len_w(w_decoded)
                 cookie.need_eof = 1
 
                 if chars_decoded < chars_to_skip:
                     raise oefmt(space.w_IOError,
-                                "can't reconstruct logical file position")
+                        "can't reconstruct logical file position")
         finally:
             space.call_method(self.w_decoder, "setstate", w_saved_state)
 
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -27,7 +27,8 @@
         w_newline=space.newtext(mode))
     lines = []
     while True:
-        line = space.unicode_w(w_textio.readline_w(space, space.newint(limit)))
+        w_line = w_textio.readline_w(space, space.newint(limit))
+        line = space.utf8_w(w_line).decode('utf-8')
         if limit > 0:
             assert len(line) <= limit
         if line:
@@ -38,31 +39,27 @@
 
 @given(st.text())
 def test_read_buffer(text):
-    buf = DecodeBuffer(text)
-    assert buf.get_chars(-1) == text
+    buf = DecodeBuffer(text.encode('utf-8'))
+    assert buf.get_chars(-1) == text.encode('utf-8')
     assert buf.exhausted()
 
 @given(st.text(), st.lists(st.integers(min_value=0)))
 def test_readn_buffer(text, sizes):
-    buf = DecodeBuffer(text)
+    buf = DecodeBuffer(text.encode('utf-8'))
     strings = []
     for n in sizes:
         s = buf.get_chars(n)
         if not buf.exhausted():
-            assert len(s) == n
+            assert len(s.decode('utf-8')) == n
         else:
-            assert len(s) <= n
+            assert len(s.decode('utf-8')) <= n
         strings.append(s)
-    assert ''.join(strings) == text[:sum(sizes)]
+    assert ''.join(strings) == text[:sum(sizes)].encode('utf-8')
 
 @given(st.text())
 def test_next_char(text):
-    buf = DecodeBuffer(text)
-    chars = []
-    try:
-        while True:
-            chars.append(buf.next_char())
-    except StopIteration:
-        pass
+    buf = DecodeBuffer(text.encode('utf-8'))
+    for i in range(len(text)):
+        ch = buf.next_char()
+        assert ch == text[i].encode('utf-8')[0]
     assert buf.exhausted()
-    assert u''.join(chars) == text
diff --git a/pypy/module/_io/test/test_ztranslation.py b/pypy/module/_io/test/test_ztranslation.py
deleted file mode 100644
--- a/pypy/module/_io/test/test_ztranslation.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from pypy.objspace.fake.checkmodule import checkmodule
-
-def test_checkmodule():
-    checkmodule('_io')
diff --git a/pypy/objspace/fake/objspace.py b/pypy/objspace/fake/objspace.py
--- a/pypy/objspace/fake/objspace.py
+++ b/pypy/objspace/fake/objspace.py
@@ -212,6 +212,12 @@
     def newutf8(self, x, l, f):
         return w_some_obj()
 
+    def new_from_utf8(self, a):
+        return w_some_obj()
+
+    def newunicode(self, a):
+        return w_some_obj()
+
     newtext = newbytes
     newtext_or_none = newbytes
     newfilename = newbytes
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -367,6 +367,12 @@
         assert isinstance(utf8s, str)
         return W_UnicodeObject(utf8s, length, flag)
 
+    def new_from_utf8(self, utf8s):
+        # XXX: kill me!
+        assert isinstance(utf8s, str)
+        length, flag = rutf8.check_utf8(utf8s, True)
+        return W_UnicodeObject(utf8s, length, flag)
+
     def newfilename(self, s):
         assert isinstance(s, str) # on pypy3, this decodes the byte string
         return W_BytesObject(s)   # with the filesystem encoding
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy unicode-utf8: Merge branch 'utf8-io': fix the _io module

Reply via email to