Author: Matti Picus <[email protected]>
Branch: unicode-utf8-py3
Changeset: r95574:2cb8cf7a5047
Date: 2019-01-03 07:55 +0200
http://bitbucket.org/pypy/pypy/changeset/2cb8cf7a5047/
Log: take tests from py3.5, code from unicode-utf8
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -317,35 +317,44 @@
def __init__(self, text=None):
self.text = text
self.pos = 0
+ self.upos = 0
def set(self, space, w_decoded):
check_decoded(space, w_decoded)
self.text = space.utf8_w(w_decoded)
self.pos = 0
+ self.upos = 0
def reset(self):
self.text = None
self.pos = 0
+ self.upos = 0
def get_chars(self, size):
- if self.text is None:
+ if self.text is None or size == 0:
return ""
- available = len(self.text) - self.pos
+ lgt = codepoints_in_utf8(self.text)
+ available = lgt - self.upos
if size < 0 or size > available:
size = available
assert size >= 0
if self.pos > 0 or size < available:
start = self.pos
- end = self.pos + size
+ pos = start
+ for i in range(size):
+ pos = next_codepoint_pos(self.text, pos)
+ self.upos += 1
assert start >= 0
- assert end >= 0
- chars = self.text[start:end]
+ assert pos >= 0
+ chars = self.text[start:pos]
+ self.pos = pos
else:
chars = self.text
+ self.pos = len(self.text)
+ self.upos = lgt
- self.pos += size
return chars
def has_data(self):
@@ -357,16 +366,24 @@
def next_char(self):
if self.exhausted():
raise StopIteration
- ch = self.text[self.pos]
- self.pos = next_codepoint_pos(self.text, self.pos)
+ newpos = next_codepoint_pos(self.text, self.pos)
+ pos = self.pos
+ assert pos >= 0
+ assert newpos >= 0
+ ch = self.text[pos:newpos]
+ self.pos = newpos
+ self.upos += 1
return ch
def peek_char(self):
# like next_char, but doesn't advance pos
if self.exhausted():
raise StopIteration
- ch = self.text[self.pos]
- return ch
+ newpos = next_codepoint_pos(self.text, self.pos)
+ pos = self.pos
+ assert pos >= 0
+ assert newpos >= 0
+ return self.text[pos:newpos]
def find_newline_universal(self, limit):
# Universal newline search. Find any of \r, \r\n, \n
@@ -416,6 +433,7 @@
except StopIteration:
# This is the tricky case: we found a \r right at the end
self.pos -= 1
+ self.upos -= 1
return False
return False
diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py
--- a/pypy/module/_io/test/test_interp_textio.py
+++ b/pypy/module/_io/test/test_interp_textio.py
@@ -58,28 +58,34 @@
@given(st.text())
def test_read_buffer(text):
- buf = DecodeBuffer(text.encode('utf-8'))
- assert buf.get_chars(-1) == text.encode('utf-8')
+ buf = DecodeBuffer(text)
+ assert buf.get_chars(-1) == text
assert buf.exhausted()
@given(st.text(), st.lists(st.integers(min_value=0)))
@example(u'\x80', [1])
def test_readn_buffer(text, sizes):
- buf = DecodeBuffer(text.encode('utf-8'))
+ buf = DecodeBuffer(text)
strings = []
for n in sizes:
s = buf.get_chars(n)
if not buf.exhausted():
- assert len(s.decode('utf-8')) == n
+ assert len(s) == n
else:
- assert len(s.decode('utf-8')) <= n
+ assert len(s) <= n
strings.append(s)
- assert ''.join(strings) == text[:sum(sizes)].encode('utf-8')
+ assert ''.join(strings) == text[:sum(sizes)]
@given(st.text())
+@example(u'\x800')
def test_next_char(text):
- buf = DecodeBuffer(text.encode('utf-8'))
- for i in range(len(text)):
- ch = buf.next_char()
- assert ch == text[i].encode('utf-8')
+ buf = DecodeBuffer(text)
+ chars = []
+ try:
+ while True:
+ ch = buf.next_char()
+ chars.append(ch)
+ except StopIteration:
+ pass
assert buf.exhausted()
+ assert u''.join(chars) == text
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit