Author: Ronan Lamy <[email protected]>
Branch: StringIO-perf
Changeset: r98669:c83626075169
Date: 2020-02-05 21:42 +0000
http://bitbucket.org/pypy/pypy/changeset/c83626075169/
Log: Add new operation mode for W_StringIO, backed by a W_UnicodeObject,
for read-only operations
diff --git a/pypy/module/_io/interp_stringio.py
b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -1,12 +1,24 @@
-from rpython.rlib.rutf8 import codepoints_in_utf8, next_codepoint_pos
+from rpython.rlib.rutf8 import (
+ codepoints_in_utf8, codepoint_at_pos, Utf8StringIterator)
from pypy.interpreter.error import OperationError, oefmt
from pypy.interpreter.typedef import (
TypeDef, generic_new_descr, GetSetProperty)
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.module._io.interp_textio import (
- W_TextIOBase, W_IncrementalNewlineDecoder)
+ W_TextIOBase, W_IncrementalNewlineDecoder)
from pypy.module._io.interp_iobase import convert_size
+from pypy.objspace.std.unicodeobject import W_UnicodeObject
+
+
+def _find_end(start, size, total):
+ available = total - start
+ if size >= 0 and size <= available:
+ end = start + size
+ else:
+ end = total
+ assert 0 <= start and 0 <= end
+ return end
class UnicodeIO(object):
def __init__(self, data=None):
@@ -22,14 +34,7 @@
self.data.extend([u'\0'] * (newlength - len(self.data)))
def read(self, start, size):
- available = len(self.data) - start
- if available <= 0:
- return ''
- if size >= 0 and size <= available:
- end = start + size
- else:
- end = len(self.data)
- assert 0 <= start <= end
+ end = _find_end(start, size, len(self.data))
return u''.join(self.data[start:end])
def _convert_limit(self, limit, start):
@@ -95,20 +100,27 @@
def getvalue(self):
return u''.join(self.data).encode('utf-8')
-INITIAL, CLOSED = range(2)
-
+READING, RWBUFFER, CLOSED = range(3)
class W_StringIO(W_TextIOBase):
def __init__(self, space):
W_TextIOBase.__init__(self, space)
- self.buf = UnicodeIO()
+ self.buf = None
+ self.w_value = W_UnicodeObject.EMPTY
self.pos = 0
- self.state = INITIAL
+ self.state = READING
+
+ def get_length(self):
+ """Return the total size (in codepoints) of the object"""
+ if self.state == READING:
+ return self.w_value._len()
+ else:
+ return len(self.buf.data)
@unwrap_spec(w_newline=WrappedDefault("\n"))
def descr_init(self, space, w_initvalue=None, w_newline=None):
# In case __init__ is called multiple times
- self.buf = UnicodeIO()
+ self.buf = None
self.pos = 0
self.w_decoder = None
self.readnl = None
@@ -142,8 +154,11 @@
)
if not space.is_none(w_initvalue):
- self.write_w(space, w_initvalue)
- self.pos = 0
+ self.w_value = self._decode_string(space, w_initvalue)
+ else:
+ self.w_value = W_UnicodeObject.EMPTY
+ self.pos = 0
+ self.state = READING
def descr_getstate(self, space):
w_initialval = self.getvalue_w(space)
@@ -173,18 +188,19 @@
raise oefmt(space.w_TypeError,
"unicode argument expected, got '%T'", w_initval)
# Initialize state
- self.descr_init(space, None, w_readnl)
+ self.descr_init(space, w_initval, w_readnl)
# Restore the buffer state. We're not doing it via __init__
# because the string value in the state tuple has already been
# translated once by __init__. So we do not take any chance and replace
# object's buffer completely
initval = space.utf8_w(w_initval)
+ self.buf = UnicodeIO(initval)
+
pos = space.getindex_w(w_pos, space.w_TypeError)
if pos < 0:
raise oefmt(space.w_ValueError,
"position value cannot be negative")
- self.buf = UnicodeIO(initval)
self.pos = pos
if not space.is_w(w_dict, space.w_None):
if not space.isinstance_w(w_dict, space.w_dict):
@@ -201,12 +217,11 @@
message = "I/O operation on closed file"
raise OperationError(space.w_ValueError, space.newtext(message))
- def write_w(self, space, w_obj):
+ def _decode_string(self, space, w_obj):
if not space.isinstance_w(w_obj, space.w_unicode):
raise oefmt(space.w_TypeError,
"unicode argument expected, got '%T'", w_obj)
self._check_closed(space)
- orig_size = space.len_w(w_obj)
if self.w_decoder is not None:
w_decoded = space.call_method(
@@ -220,6 +235,16 @@
space.newtext("\n"),
space.newutf8(writenl, codepoints_in_utf8(writenl)),
)
+ return w_decoded
+
+ def write_w(self, space, w_obj):
+ w_decoded = self._decode_string(space, w_obj)
+ orig_size = space.len_w(w_obj)
+ if self.state == READING:
+ self.buf = UnicodeIO(space.utf8_w(self.w_value))
+ self.w_value = None
+ self.state = RWBUFFER
+
string = space.utf8_w(w_decoded)
if string:
written = self.buf.write(string, self.pos)
@@ -230,6 +255,15 @@
def read_w(self, space, w_size=None):
self._check_closed(space)
size = convert_size(space, w_size)
+ if self.state == READING:
+ length = self.w_value._len()
+ end = _find_end(self.pos, size, length)
+ if self.pos > end:
+ return space.newutf8('', 0)
+ w_res = self.w_value._unicode_sliced(space, self.pos, end)
+ self.pos = end
+ return w_res
+ assert self.state == RWBUFFER
result_u = self.buf.read(self.pos, size)
self.pos += len(result_u)
return space.newutf8(result_u.encode('utf-8'), len(result_u))
@@ -237,6 +271,61 @@
def readline_w(self, space, w_limit=None):
self._check_closed(space)
limit = convert_size(space, w_limit)
+ if self.state == READING:
+ length = self.w_value._len()
+ end = _find_end(self.pos, limit, length)
+ if self.readuniversal:
+ start = self.pos
+ start_offset = self.w_value._index_to_byte(start)
+ it = Utf8StringIterator(self.w_value._utf8)
+ it._pos = start_offset
+ for ch in it:
+ if ch == ord(u'\n'):
+ self.pos += 1
+ break
+ elif ch == ord(u'\r'):
+ self.pos += 1
+ if self.pos >= end:
+ break
+ if it.next() == ord(u'\n'):
+ self.pos += 1
+ break
+ else:
+ # `it` has gone one char too far, but we don't care
+ break
+ self.pos += 1
+ if self.pos >= end:
+ break
+ w_res = self.w_value._unicode_sliced(space, start, self.pos)
+ return w_res
+ else:
+ if self.readtranslate:
+ # Newlines are already translated, only search for \n
+ newline = '\n'
+ else:
+ newline = self.readnl
+
+ start = self.pos
+ start_offset = self.w_value._index_to_byte(start)
+ it = Utf8StringIterator(self.w_value._utf8)
+ it._pos = start_offset
+ for ch in it:
+ self.pos += 1
+ if ch == ord(newline[0]):
+ if len(newline) == 1 or self.pos >= end:
+ break
+ else:
+ ch = codepoint_at_pos(self.w_value._utf8,
it.get_pos())
+ if ch == ord(newline[1]):
+ self.pos += 1
+ break
+ else:
+ continue
+ if self.pos >= end:
+ break
+ w_res = self.w_value._unicode_sliced(space, start, self.pos)
+ return w_res
+
if self.readuniversal:
result_u = self.buf.readline_universal(self.pos, limit)
else:
@@ -265,7 +354,7 @@
if mode == 1:
pos = self.pos
elif mode == 2:
- pos = len(self.buf.data)
+ pos = self.get_length()
assert pos >= 0
self.pos = pos
return space.newint(pos)
@@ -278,11 +367,17 @@
size = space.int_w(w_size)
if size < 0:
raise oefmt(space.w_ValueError, "Negative size value %d", size)
- self.buf.truncate(size)
+ if self.state == READING:
+ if size < self.w_value._len():
+ self.w_value = self.w_value._unicode_sliced(space, 0, size)
+ else:
+ self.buf.truncate(size)
return space.newint(size)
def getvalue_w(self, space):
self._check_closed(space)
+ if self.state == READING:
+ return self.w_value
v = self.buf.getvalue()
lgt = codepoints_in_utf8(v)
return space.newutf8(v, lgt)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit