Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r93134:25ac6121d03c
Date: 2017-11-23 10:26 +0100
http://bitbucket.org/pypy/pypy/changeset/25ac6121d03c/
Log: merge heads
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -272,7 +272,7 @@
self._typed_unwrap_error(space, "unicode")
def convert_to_w_unicode(self, space):
- self._typed_unwrap_error(space, "unicode")
+ self._typed_unwrap_error(space, "unicode")
def bytearray_list_of_chars_w(self, space):
self._typed_unwrap_error(space, "bytearray")
@@ -1759,6 +1759,11 @@
def utf8_w(self, w_obj):
return w_obj.utf8_w(self)
+
+ def unicode_w(self, w_obj):
+ # XXX: kill me!
+ return w_obj.utf8_w(self).decode('utf-8')
+
def convert_to_w_unicode(self, w_obj):
return w_obj.convert_to_w_unicode(self)
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -184,9 +184,7 @@
start,
end
)
- if endpos >= 0:
- endpos += start
- else:
+ if endpos < 0:
endpos = end
assert endpos >= 0
self.pos = endpos
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -97,7 +97,7 @@
output_len -= 1
if output_len == 0:
- return space.newutf8("", 1, FLAG_ASCII)
+ return space.newutf8("", 0, FLAG_ASCII)
# Record which newlines are read and do newline translation if
# desired, all in one pass.
@@ -224,30 +224,28 @@
def _find_line_ending(self, line, start, end):
size = end - start
if self.readtranslate:
-
# Newlines are already translated, only search for \n
- pos = line.find(u'\n', start, end)
+ pos = line.find('\n', start, end)
if pos >= 0:
- return pos - start + 1, 0
+ return pos + 1, 0
else:
return -1, size
elif self.readuniversal:
# Universal newline search. Find any of \r, \r\n, \n
# The decoder ensures that \r\n are not split in two pieces
- i = 0
+ i = start
while True:
- # Fast path for non-control chars. The loop always ends
- # since the Py_UNICODE storage is NUL-terminated.
- while i < size and line[start + i] > '\r':
+ # Fast path for non-control chars.
+ while i < end and line[i] > '\r':
i += 1
- if i >= size:
+ if i >= end:
return -1, size
- ch = line[start + i]
+ ch = line[i]
i += 1
if ch == '\n':
return i, 0
if ch == '\r':
- if line[start + i] == '\n':
+ if line[i] == '\n':
return i + 1, 0
else:
return i, 0
@@ -255,7 +253,7 @@
# Non-universal mode.
pos = line.find(self.readnl, start, end)
if pos >= 0:
- return pos - start + len(self.readnl), 0
+ return pos + len(self.readnl), 0
else:
pos = line.find(self.readnl[0], start, end)
if pos >= 0:
@@ -520,8 +518,13 @@
# _____________________________________________________________
# read methods
- def _set_decoded_chars(self, chars):
- self.decoded_chars = chars
+ def _unset_decoded(self):
+ self.decoded_chars = None
+ self.decoded_chars_used = 0
+
+ def _set_decoded(self, space, w_decoded):
+ check_decoded(space, w_decoded)
+ self.decoded_chars = space.utf8_w(w_decoded)
self.decoded_chars_used = 0
def _get_decoded_chars(self, size):
@@ -580,8 +583,7 @@
eof = space.len_w(w_input) == 0
w_decoded = space.call_method(self.w_decoder, "decode",
w_input, space.newbool(eof))
- check_decoded(space, w_decoded)
- self._set_decoded_chars(space.utf8_w(w_decoded))
+ self._set_decoded(space, w_decoded)
if space.len_w(w_decoded) > 0:
eof = False
@@ -617,13 +619,13 @@
w_bytes = space.call_method(self.w_buffer, "read")
w_decoded = space.call_method(self.w_decoder, "decode", w_bytes,
space.w_True)
check_decoded(space, w_decoded)
- w_result = space.newunicode(self._get_decoded_chars(-1))
+ w_result = space.new_from_utf8(self._get_decoded_chars(-1))
w_final = space.add(w_result, w_decoded)
self.snapshot = None
return w_final
remaining = size
- builder = UnicodeBuilder(size)
+ builder = StringBuilder(size)
# Keep reading chunks until we have n characters to return
while True:
@@ -643,7 +645,7 @@
continue
raise
- return space.newunicode(builder.build())
+ return space.new_from_utf8(builder.build())
def readline_w(self, space, w_limit=None):
self._check_attached(space)
@@ -651,11 +653,10 @@
self._writeflush(space)
limit = convert_size(space, w_limit)
- chunked = 0
line = None
remaining = None
- chunks = []
+ builder = StringBuilder()
while True:
# First, get some data if necessary
@@ -671,7 +672,7 @@
raise
if not has_data:
# end of file
- self._set_decoded_chars(None)
+ self._unset_decoded()
self.snapshot = None
start = endpos = offset_to_buffer = 0
break
@@ -689,8 +690,8 @@
line_len = len(line)
endpos, consumed = self._find_line_ending(line, start, line_len)
+ chunked = builder.getlength()
if endpos >= 0:
- endpos += start
if limit >= 0 and endpos >= start + limit - chunked:
endpos = start + limit - chunked
assert endpos >= 0
@@ -708,15 +709,15 @@
# No line ending seen yet - put aside current data
if endpos > start:
s = line[start:endpos]
- chunks.append(s)
- chunked += len(s)
+ builder.append(s)
+
# There may be some remaining bytes we'll have to prepend to the
# next chunk of data
if endpos < line_len:
remaining = line[endpos:]
line = None
# We have consumed the buffer
- self._set_decoded_chars(None)
+ self._unset_decoded()
if line:
# Our line ends in the current buffer
@@ -725,18 +726,12 @@
self.decoded_chars_used = decoded_chars_used
if start > 0 or endpos < len(line):
line = line[start:endpos]
- if remaining:
- chunks.append(remaining)
- remaining = None
- if chunks:
- if line:
- chunks.append(line)
- line = u''.join(chunks)
+ builder.append(line)
+ elif remaining:
+ builder.append(remaining)
- if line:
- return space.newunicode(line)
- else:
- return space.newunicode(u'')
+ result = builder.build()
+ return space.new_from_utf8(result)
# _____________________________________________________________
# write methods
@@ -759,8 +754,8 @@
if text.find('\n') >= 0:
haslf = True
if haslf and self.writetranslate and self.writenl:
- w_text = space.call_method(w_text, "replace", space.newunicode(u'\n'),
- space.newunicode(self.writenl))
+ w_text = space.call_method(w_text, "replace", space.new_from_utf8('\n'),
+ space.new_from_utf8(self.writenl))
text = space.utf8_w(w_text)
needflush = False
@@ -867,7 +862,7 @@
raise oefmt(space.w_IOError,
"can't do nonzero end-relative seeks")
space.call_method(self, "flush")
- self._set_decoded_chars(None)
+ self._unset_decoded()
self.snapshot = None
if self.w_decoder:
space.call_method(self.w_decoder, "reset")
@@ -892,7 +887,7 @@
# Seek back to the safe start point
space.call_method(self.w_buffer, "seek",
space.newint(cookie.start_pos))
- self._set_decoded_chars(None)
+ self._unset_decoded()
self.snapshot = None
# Restore the decoder to its state from the safe start point.
@@ -913,8 +908,7 @@
w_decoded = space.call_method(self.w_decoder, "decode",
w_chunk,
space.newbool(bool(cookie.need_eof)))
- check_decoded(space, w_decoded)
- self._set_decoded_chars(space.unicode_w(w_decoded))
+ self._set_decoded(space, w_decoded)
# Skip chars_to_skip of the decoded characters
if len(self.decoded_chars) < cookie.chars_to_skip:
@@ -982,7 +976,7 @@
w_decoded = space.call_method(self.w_decoder, "decode",
space.newbytes(input[i]))
check_decoded(space, w_decoded)
- chars_decoded += len(space.unicode_w(w_decoded))
+ chars_decoded += space.len_w(w_decoded)
cookie.bytes_to_feed += 1
diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -367,10 +367,23 @@
assert isinstance(utf8s, str)
return W_UnicodeObject(utf8s, length, flag)
+ def new_from_utf8(self, utf8s):
+ # XXX: kill me!
+ assert isinstance(utf8s, str)
+ length, flag = rutf8.check_utf8(utf8s, True)
+ return W_UnicodeObject(utf8s, length, flag)
+
def newfilename(self, s):
assert isinstance(s, str) # on pypy3, this decodes the byte string
return W_BytesObject(s) # with the filesystem encoding
+ def newunicode(self, unistr):
+ # XXX: kill me!
+ assert isinstance(unistr, unicode)
+ utf8s = unistr.encode("utf-8")
+ length, flag = rutf8.check_utf8(utf8s, True)
+ return self.newutf8(utf8s, length, flag)
+
def type(self, w_obj):
jit.promote(w_obj.__class__)
return w_obj.getclass(self)
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit