Author: Armin Rigo <[email protected]>
Branch: unicode-utf8
Changeset: r93186:350cb9b5b92b
Date: 2017-11-27 22:16 +0100
http://bitbucket.org/pypy/pypy/changeset/350cb9b5b92b/
Log:	merge heads

diff too long, truncating to 2000 out of 2094 lines

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -9,3 +9,5 @@
 * remove assertions from W_UnicodeObject.__init__ if all the builders pass
 * what to do with error handlers that go backwards. There were tests
   in test_codecs that would check for that
+
+* fix _pypyjson to not use a wrapped dict when decoding an object
diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_textio.py
@@ -0,0 +1,29 @@
+from hypothesis import assume, given, strategies as st
+
+from io import BytesIO, TextIOWrapper
+
+LINESEP = ['', '\r', '\n', '\r\n']
+
+@st.composite
+def text_with_newlines(draw):
+    sep = draw(st.sampled_from(LINESEP))
+    lines = draw(st.lists(st.text(max_size=10), max_size=10))
+    return sep.join(lines)
+
+@given(txt=text_with_newlines(),
+       mode=st.sampled_from(['\r', '\n', '\r\n', '']),
+       limit=st.integers(min_value=-1))
+def test_readline(txt, mode, limit):
+    assume(limit != 0)
+    textio = TextIOWrapper(
+        BytesIO(txt.encode('utf-8')), encoding='utf-8', newline=mode)
+    lines = []
+    while True:
+        line = textio.readline(limit)
+        if limit > 0:
+            assert len(line) <= limit
+        if line:
+            lines.append(line)
+        else:
+            break
+    assert u''.join(lines) == txt
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1760,10 +1760,6 @@
     def utf8_w(self, w_obj):
         return w_obj.utf8_w(self)
 
-    def unicode_w(self, w_obj):
-        # XXX: kill me!
-        return w_obj.utf8_w(self).decode('utf-8')
-
     def convert_to_w_unicode(self, w_obj):
         return w_obj.convert_to_w_unicode(self)
 
diff --git a/pypy/module/_continuation/test/conftest.py b/pypy/module/_continuation/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+import sys
+
+def pytest_configure(config):
+    if sys.platform.startswith('linux'):
+        from rpython.rlib.rvmprof.cintf import configure_libbacktrace_linux
+        configure_libbacktrace_linux()
diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -2,21 +2,115 @@
 from pypy.interpreter.typedef import (
     TypeDef, generic_new_descr, GetSetProperty)
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
-from pypy.module._io.interp_textio import W_TextIOBase, W_IncrementalNewlineDecoder
+from pypy.module._io.interp_textio import (
+    W_TextIOBase, W_IncrementalNewlineDecoder)
 from pypy.module._io.interp_iobase import convert_size
 
+
+class UnicodeIO(object):
+    def __init__(self, data=None, pos=0):
+        if data is None:
+            data = []
+        self.data = data
+        self.pos = pos
+
+    def resize(self, newlength):
+        if len(self.data) > newlength:
+            self.data = self.data[:newlength]
+        if len(self.data) < newlength:
+            self.data.extend([u'\0'] * (newlength - len(self.data)))
+
+    def read(self, size):
+        start = self.pos
+        available = len(self.data) - start
+        if available <= 0:
+            return u''
+        if size >= 0 and size <= available:
+            end = start + size
+        else:
+            end = len(self.data)
+        assert 0 <= start <= end
+        self.pos = end
+        return u''.join(self.data[start:end])
+
+    def _convert_limit(self, limit):
+        if limit < 0 or limit > len(self.data) - self.pos:
+            limit = len(self.data) - self.pos
+        assert limit >= 0
+        return limit
+
+    def readline_universal(self, limit):
+        # Universal newline search. 
Find any of \r, \r\n, \n + limit = self._convert_limit(limit) + start = self.pos + end = start + limit + pos = start + while pos < end: + ch = self.data[pos] + pos += 1 + if ch == '\n': + break + if ch == '\r': + if pos >= end: + break + if self.data[pos] == '\n': + pos += 1 + break + else: + break + self.pos = pos + result = u''.join(self.data[start:pos]) + return result + + def readline(self, marker, limit): + start = self.pos + limit = self._convert_limit(limit) + end = start + limit + found = False + for pos in range(start, end - len(marker) + 1): + ch = self.data[pos] + if ch == marker[0]: + for j in range(1, len(marker)): + if self.data[pos + j] != marker[j]: + break # from inner loop + else: + pos += len(marker) + found = True + break + if not found: + pos = end + self.pos = pos + result = u''.join(self.data[start:pos]) + return result + + def write(self, string): + length = len(string) + if self.pos + length > len(self.data): + self.resize(self.pos + length) + + for i in range(length): + self.data[self.pos + i] = string[i] + self.pos += length + + def seek(self, pos): + self.pos = pos + + def truncate(self, size): + if size < len(self.data): + self.resize(size) + + def getvalue(self): + return u''.join(self.data) + class W_StringIO(W_TextIOBase): def __init__(self, space): W_TextIOBase.__init__(self, space) - self.buf = [] - self.pos = 0 + self.buf = UnicodeIO() - @unwrap_spec(w_newline = WrappedDefault("\n")) + @unwrap_spec(w_newline=WrappedDefault("\n")) def descr_init(self, space, w_initvalue=None, w_newline=None): # In case __init__ is called multiple times - self.buf = [] - self.pos = 0 + self.buf = UnicodeIO() self.w_decoder = None self.readnl = None self.writenl = None @@ -27,7 +121,7 @@ newline = space.unicode_w(w_newline) if (newline is not None and newline != u"" and newline != u"\n" and - newline != u"\r" and newline != u"\r\n"): + newline != u"\r" and newline != u"\r\n"): # Not using oefmt() because I don't know how to use it # with unicode raise OperationError(space.w_ValueError, @@ -50,7 +144,7 @@ if not space.is_none(w_initvalue): self.write_w(space, w_initvalue) - self.pos = 0 + self.buf.pos = 0 def descr_getstate(self, space): w_initialval = self.getvalue_w(space) @@ -58,9 +152,9 @@ if self.readnl is None: w_readnl = space.w_None else: - w_readnl = space.str(space.newunicode(self.readnl)) # YYY + w_readnl = space.str(space.newunicode(self.readnl)) # YYY return space.newtuple([ - w_initialval, w_readnl, space.newint(self.pos), w_dict + w_initialval, w_readnl, space.newint(self.buf.pos), w_dict ]) def descr_setstate(self, space, w_state): @@ -69,34 +163,33 @@ # We allow the state tuple to be longer than 4, because we may need # someday to extend the object's state without breaking # backwards-compatibility - if not space.isinstance_w(w_state, space.w_tuple) or space.len_w(w_state) < 4: + if (not space.isinstance_w(w_state, space.w_tuple) + or space.len_w(w_state) < 4): raise oefmt(space.w_TypeError, "%T.__setstate__ argument should be a 4-tuple, got %T", self, w_state) w_initval, w_readnl, w_pos, w_dict = space.unpackiterable(w_state, 4) + if not space.isinstance_w(w_initval, space.w_unicode): + raise oefmt(space.w_TypeError, + "unicode argument expected, got '%T'", w_initval) # Initialize state - self.descr_init(space, w_initval, w_readnl) + self.descr_init(space, None, w_readnl) - # Restore the buffer state. Even if __init__ did initialize the buffer, - # we have to initialize it again since __init__ may translates the - # newlines in the inital_value string. 
We clearly do not want that + # Restore the buffer state. We're not doing it via __init__ # because the string value in the state tuple has already been # translated once by __init__. So we do not take any chance and replace # object's buffer completely initval = space.unicode_w(w_initval) - size = len(initval) - self.resize_buffer(size) - self.buf = list(initval) pos = space.getindex_w(w_pos, space.w_TypeError) if pos < 0: raise oefmt(space.w_ValueError, "position value cannot be negative") - self.pos = pos + self.buf = UnicodeIO(list(initval), pos) if not space.is_w(w_dict, space.w_None): if not space.isinstance_w(w_dict, space.w_dict): - raise oefmt(space.w_TypeError, - "fourth item of state should be a dict, got a %T", - w_dict) + raise oefmt( + space.w_TypeError, + "fourth item of state should be a dict, got a %T", w_dict) # Alternatively, we could replace the internal dictionary # completely. However, it seems more practical to just update it. space.call_method(self.w_dict, "update", w_dict) @@ -107,88 +200,47 @@ message = "I/O operation on closed file" raise OperationError(space.w_ValueError, space.newtext(message)) - def resize_buffer(self, newlength): - if len(self.buf) > newlength: - self.buf = self.buf[:newlength] - if len(self.buf) < newlength: - self.buf.extend([u'\0'] * (newlength - len(self.buf))) - - def write(self, string): - length = len(string) - if self.pos + length > len(self.buf): - self.resize_buffer(self.pos + length) - - for i in range(length): - self.buf[self.pos + i] = string[i] - self.pos += length - def write_w(self, space, w_obj): if not space.isinstance_w(w_obj, space.w_unicode): raise oefmt(space.w_TypeError, "unicode argument expected, got '%T'", w_obj) self._check_closed(space) - orig_size = space.len_w(w_obj) if self.w_decoder is not None: w_decoded = space.call_method( - self.w_decoder, "decode", w_obj, space.w_True - ) + self.w_decoder, "decode", w_obj, space.w_True) else: w_decoded = w_obj - if self.writenl: w_decoded = space.call_method( - w_decoded, "replace", space.newtext("\n"), space.newunicode(self.writenl) - ) + w_decoded, "replace", + space.newtext("\n"), space.newunicode(self.writenl)) + string = space.unicode_w(w_decoded) + if string: + self.buf.write(string) - string = space.unicode_w(w_decoded) - size = len(string) - - if size: - self.write(string) return space.newint(orig_size) def read_w(self, space, w_size=None): self._check_closed(space) size = convert_size(space, w_size) - start = self.pos - available = len(self.buf) - start - if available <= 0: - return space.newunicode(u"") - if size >= 0 and size <= available: - end = start + size - else: - end = len(self.buf) - assert 0 <= start <= end - self.pos = end - return space.newunicode(u''.join(self.buf[start:end])) + return space.newunicode(self.buf.read(size)) def readline_w(self, space, w_limit=None): self._check_closed(space) limit = convert_size(space, w_limit) + if self.readuniversal: + result = self.buf.readline_universal(limit) + else: + if self.readtranslate: + # Newlines are already translated, only search for \n + newline = u'\n' + else: + newline = self.readnl + result = self.buf.readline(newline, limit) + return space.newunicode(result) - if self.pos >= len(self.buf): - return space.newunicode(u"") - - start = self.pos - if limit < 0 or limit > len(self.buf) - self.pos: - limit = len(self.buf) - self.pos - - assert limit >= 0 - end = start + limit - - endpos, consumed = self._find_line_ending( - # XXX: super inefficient, makes a copy of the entire contents. 
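The deleted readline_w here had to join the entire character buffer just to search it; the UnicodeIO methods introduced earlier in this patch scan the list in place instead. A minimal standalone sketch of that kind of in-place scan follows; the function name and the data/pos/limit parameters are illustrative only, not part of the patch:

    # Illustrative sketch: in-place universal-newline scan over a list of
    # 1-char strings, mirroring UnicodeIO.readline_universal (the limit
    # clamping inlines what _convert_limit does).
    def scan_line_universal(data, pos, limit):
        if limit < 0 or limit > len(data) - pos:
            limit = len(data) - pos
        start = pos
        end = start + limit
        while pos < end:
            ch = data[pos]
            pos += 1
            if ch == u'\n':
                break
            if ch == u'\r':
                # \r\n counts as one ending; a bare \r also ends the line
                if pos < end and data[pos] == u'\n':
                    pos += 1
                break
        return u''.join(data[start:pos]), pos

For example, scan_line_universal(list(u'ab\r\ncd'), 0, -1) gives (u'ab\r\n', 4) without ever copying the tail of the buffer.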
- u"".join(self.buf), - start, - end - ) - if endpos < 0: - endpos = end - assert endpos >= 0 - self.pos = endpos - return space.newunicode(u"".join(self.buf[start:endpos])) @unwrap_spec(pos=int, mode=int) def seek_w(self, space, pos, mode=0): @@ -204,32 +256,27 @@ # XXX: this makes almost no sense, but its how CPython does it. if mode == 1: - pos = self.pos + pos = self.buf.pos elif mode == 2: - pos = len(self.buf) - + pos = len(self.buf.data) assert pos >= 0 - self.pos = pos + self.buf.seek(pos) return space.newint(pos) def truncate_w(self, space, w_size=None): self._check_closed(space) if space.is_none(w_size): - size = self.pos + size = self.buf.pos else: size = space.int_w(w_size) - if size < 0: raise oefmt(space.w_ValueError, "Negative size value %d", size) - - if size < len(self.buf): - self.resize_buffer(size) - + self.buf.truncate(size) return space.newint(size) def getvalue_w(self, space): self._check_closed(space) - return space.newunicode(u''.join(self.buf)) + return space.newunicode(self.buf.getvalue()) def readable_w(self, space): self._check_closed(space) diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -221,46 +221,6 @@ def newlines_get_w(self, space): return space.w_None - def _find_line_ending(self, line, start, end): - size = end - start - if self.readtranslate: - # Newlines are already translated, only search for \n - pos = line.find('\n', start, end) - if pos >= 0: - return pos + 1, 0 - else: - return -1, size - elif self.readuniversal: - # Universal newline search. Find any of \r, \r\n, \n - # The decoder ensures that \r\n are not split in two pieces - i = start - while True: - # Fast path for non-control chars. - while i < end and line[i] > '\r': - i += 1 - if i >= end: - return -1, size - ch = line[i] - i += 1 - if ch == '\n': - return i, 0 - if ch == '\r': - if line[i] == '\n': - return i + 1, 0 - else: - return i, 0 - else: - # Non-universal mode. - pos = line.find(self.readnl, start, end) - if pos >= 0: - return pos + len(self.readnl), 0 - else: - pos = line.find(self.readnl[0], start, end) - if pos >= 0: - return -1, pos - start - return -1, size - - W_TextIOBase.typedef = TypeDef( '_io._TextIOBase', W_IOBase.typedef, __new__ = generic_new_descr(W_TextIOBase), @@ -336,6 +296,126 @@ self.input = input +class DecodeBuffer(object): + def __init__(self, text=None): + self.text = text + self.pos = 0 + + def set(self, space, w_decoded): + check_decoded(space, w_decoded) + self.text = space.unicode_w(w_decoded) + self.pos = 0 + + def reset(self): + self.text = None + self.pos = 0 + + def get_chars(self, size): + if self.text is None: + return u"" + + available = len(self.text) - self.pos + if size < 0 or size > available: + size = available + assert size >= 0 + + if self.pos > 0 or size < available: + start = self.pos + end = self.pos + size + assert start >= 0 + assert end >= 0 + chars = self.text[start:end] + else: + chars = self.text + + self.pos += size + return chars + + def has_data(self): + return (self.text is not None and not self.exhausted()) + + def exhausted(self): + return self.pos >= len(self.text) + + def next_char(self): + if self.exhausted(): + raise StopIteration + ch = self.text[self.pos] + self.pos += 1 + return ch + + def peek_char(self): + # like next_char, but doesn't advance pos + if self.exhausted(): + raise StopIteration + ch = self.text[self.pos] + return ch + + def find_newline_universal(self, limit): + # Universal newline search. 
Find any of \r, \r\n, \n + # The decoder ensures that \r\n are not split in two pieces + if limit < 0: + limit = sys.maxint + scanned = 0 + while scanned < limit: + try: + ch = self.next_char() + except StopIteration: + return False + if ch == u'\n': + return True + if ch == u'\r': + if scanned >= limit: + return False + try: + ch = self.peek_char() + except StopIteration: + return False + if ch == u'\n': + self.next_char() + return True + else: + return True + return False + + def find_crlf(self, limit): + if limit < 0: + limit = sys.maxint + scanned = 0 + while scanned < limit: + try: + ch = self.next_char() + except StopIteration: + return False + scanned += 1 + if ch == u'\r': + if scanned >= limit: + return False + try: + if self.peek_char() == u'\n': + self.next_char() + return True + except StopIteration: + # This is the tricky case: we found a \r right at the end + self.pos -= 1 + return False + return False + + def find_char(self, marker, limit): + if limit < 0: + limit = sys.maxint + scanned = 0 + while scanned < limit: + try: + ch = self.next_char() + except StopIteration: + return False + if ch == marker: + return True + scanned += 1 + return False + + def check_decoded(space, w_decoded): if not space.isinstance_w(w_decoded, space.w_unicode): msg = "decoder should return a string result, not '%T'" @@ -349,8 +429,7 @@ self.w_encoder = None self.w_decoder = None - self.decoded_chars = None # buffer for text returned from decoder - self.decoded_chars_used = 0 # offset into _decoded_chars for read() + self.decoded = DecodeBuffer() self.pending_bytes = None # list of bytes objects waiting to be # written, or NULL self.chunk_size = 8192 @@ -518,40 +597,10 @@ # _____________________________________________________________ # read methods - def _unset_decoded(self): - self.decoded_chars = None - self.decoded_chars_used = 0 - - def _set_decoded(self, space, w_decoded): - check_decoded(space, w_decoded) - self.decoded_chars = space.utf8_w(w_decoded) - self.decoded_chars_used = 0 - - def _get_decoded_chars(self, size): - if self.decoded_chars is None: - return "" - - available = len(self.decoded_chars) - self.decoded_chars_used - if size < 0 or size > available: - size = available - assert size >= 0 - - if self.decoded_chars_used > 0 or size < available: - start = self.decoded_chars_used - end = self.decoded_chars_used + size - assert start >= 0 - assert end >= 0 - chars = self.decoded_chars[start:end] - else: - chars = self.decoded_chars - - self.decoded_chars_used += size - return chars - def _read_chunk(self, space): """Read and decode the next chunk of data from the BufferedReader. The return value is True unless EOF was reached. The decoded string - is placed in self._decoded_chars (replacing its previous value). + is placed in self.decoded (replacing its previous value). 
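(A quick illustration, not part of the patch but in the spirit of the new tests added further down, of the DecodeBuffer protocol just defined; note the zero-copy fast path in get_chars when the whole text is consumed from position 0:)

    buf = DecodeBuffer(u'hello')
    assert buf.get_chars(2) == u'he'      # slices and advances pos
    assert buf.get_chars(-1) == u'llo'    # -1 means "everything left"
    assert buf.exhausted()

    buf = DecodeBuffer(u'hello')
    assert buf.get_chars(-1) is buf.text  # pos == 0: no copy is made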
The entire input chunk is sent to the decoder, though some of it may remain buffered in the decoder, yet to be converted.""" @@ -571,7 +620,7 @@ dec_buffer = None dec_flags = 0 - # Read a chunk, decode it, and put the result in self._decoded_chars + # Read a chunk, decode it, and put the result in self.decoded w_input = space.call_method(self.w_buffer, "read1", space.newint(self.chunk_size)) @@ -583,7 +632,7 @@ eof = space.len_w(w_input) == 0 w_decoded = space.call_method(self.w_decoder, "decode", w_input, space.newbool(eof)) - self._set_decoded(space, w_decoded) + self.decoded.set(space, w_decoded) if space.len_w(w_decoded) > 0: eof = False @@ -595,6 +644,19 @@ return not eof + def _ensure_data(self, space): + while not self.decoded.has_data(): + try: + if not self._read_chunk(space): + self.decoded.reset() + self.snapshot = None + return False + except OperationError as e: + if trap_eintr(space, e): + continue + raise + return True + def next_w(self, space): self._check_attached(space) self.telling = False @@ -619,7 +681,7 @@ w_bytes = space.call_method(self.w_buffer, "read") w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True) check_decoded(space, w_decoded) - w_result = space.new_from_utf8(self._get_decoded_chars(-1)) + w_result = space.new_from_utf8(self.decoded.get_chars(-1)) w_final = space.add(w_result, w_decoded) self.snapshot = None return w_final @@ -628,24 +690,29 @@ builder = StringBuilder(size) # Keep reading chunks until we have n characters to return - while True: - data = self._get_decoded_chars(remaining) + while remaining > 0: + if not self._ensure_data(space): + break + data = self.decoded.get_chars(remaining) builder.append(data) remaining -= len(data) - if remaining <= 0: # Done - break + return space.new_from_utf8(builder.build()) - try: - if not self._read_chunk(space): - # EOF - break - except OperationError as e: - if trap_eintr(space, e): - continue - raise - - return space.new_from_utf8(builder.build()) + def _scan_line_ending(self, limit): + if self.readuniversal: + return self.decoded.find_newline_universal(limit) + else: + if self.readtranslate: + # Newlines are already translated, only search for \n + newline = u'\n' + else: + # Non-universal mode. 
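(The trickiest part of the non-universal '\r\n' search is a decoded chunk that ends exactly on '\r': DecodeBuffer.find_crlf un-reads that character and reports "not found", so readline_w below can stash it as a remnant and retry after the next chunk is decoded. A hedged standalone sketch of just that corner case; MiniBuffer is a stand-in, and the limit handling of the real method is omitted:)

    # Stand-in illustrating DecodeBuffer.find_crlf's end-of-chunk handling.
    class MiniBuffer(object):
        def __init__(self, text):
            self.text = text
            self.pos = 0

        def find_crlf(self):
            while self.pos < len(self.text):
                ch = self.text[self.pos]
                self.pos += 1
                if ch == u'\r':
                    if self.pos == len(self.text):
                        # the \n may arrive in the next chunk: un-read the \r
                        self.pos -= 1
                        return False
                    if self.text[self.pos] == u'\n':
                        self.pos += 1
                        return True
            return False

MiniBuffer(u'abc\r').find_crlf() returns False and leaves pos at 3, so the pending u'\r' survives as a remnant for the next read.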
+ newline = self.readnl + if newline == u'\r\n': + return self.decoded.find_crlf(limit) + else: + return self.decoded.find_char(newline[0], limit) def readline_w(self, space, w_limit=None): self._check_attached(space) @@ -653,82 +720,52 @@ self._writeflush(space) limit = convert_size(space, w_limit) - - line = None - remaining = None + remnant = None builder = StringBuilder() - while True: # First, get some data if necessary - has_data = True - while not self.decoded_chars: - try: - if not self._read_chunk(space): - has_data = False - break - except OperationError as e: - if trap_eintr(space, e): - continue - raise + has_data = self._ensure_data(space) if not has_data: # end of file - self._unset_decoded() - self.snapshot = None - start = endpos = offset_to_buffer = 0 + if remnant: + builder.append(remnant) break - if not remaining: - line = self.decoded_chars - start = self.decoded_chars_used - offset_to_buffer = 0 + if remnant: + assert not self.readtranslate and self.readnl == '\r\n' + assert self.decoded.pos == 0 + if remnant == '\r' and self.decoded.text[0] == '\n': + builder.append('\r\n') + self.decoded.pos = 1 + remnant = None + break + else: + builder.append(remnant) + remnant = None + continue + + if limit > 0: + remaining = limit - builder.getlength() + assert remaining >= 0 else: - assert self.decoded_chars_used == 0 - line = remaining + self.decoded_chars - start = 0 - offset_to_buffer = len(remaining) - remaining = None + remaining = -1 + start = self.decoded.pos + assert start >= 0 + found = self._scan_line_ending(remaining) + end_scan = self.decoded.pos + if end_scan > start: + s = self.decoded.text[start:end_scan] + builder.append(s) - line_len = len(line) - endpos, consumed = self._find_line_ending(line, start, line_len) - chunked = builder.getlength() - if endpos >= 0: - if limit >= 0 and endpos >= start + limit - chunked: - endpos = start + limit - chunked - assert endpos >= 0 - break - assert consumed >= 0 - - # We can put aside up to `endpos` - endpos = consumed + start - if limit >= 0 and endpos >= start + limit - chunked: - # Didn't find line ending, but reached length limit - endpos = start + limit - chunked - assert endpos >= 0 + if found or (limit >= 0 and builder.getlength() >= limit): break - # No line ending seen yet - put aside current data - if endpos > start: - s = line[start:endpos] - builder.append(s) - - # There may be some remaining bytes we'll have to prepend to the + # There may be some remaining chars we'll have to prepend to the # next chunk of data - if endpos < line_len: - remaining = line[endpos:] - line = None + if not self.decoded.exhausted(): + remnant = self.decoded.get_chars(-1) # We have consumed the buffer - self._unset_decoded() - - if line: - # Our line ends in the current buffer - decoded_chars_used = endpos - offset_to_buffer - assert decoded_chars_used >= 0 - self.decoded_chars_used = decoded_chars_used - if start > 0 or endpos < len(line): - line = line[start:endpos] - builder.append(line) - elif remaining: - builder.append(remaining) + self.decoded.reset() result = builder.build() return space.new_from_utf8(result) @@ -862,7 +899,7 @@ raise oefmt(space.w_IOError, "can't do nonzero end-relative seeks") space.call_method(self, "flush") - self._unset_decoded() + self.decoded.reset() self.snapshot = None if self.w_decoder: space.call_method(self.w_decoder, "reset") @@ -887,7 +924,7 @@ # Seek back to the safe start point space.call_method(self.w_buffer, "seek", space.newint(cookie.start_pos)) - self._unset_decoded() + 
self.decoded.reset() self.snapshot = None # Restore the decoder to its state from the safe start point. @@ -908,13 +945,13 @@ w_decoded = space.call_method(self.w_decoder, "decode", w_chunk, space.newbool(bool(cookie.need_eof))) - self._set_decoded(space, w_decoded) + self.decoded.set(space, w_decoded) # Skip chars_to_skip of the decoded characters - if len(self.decoded_chars) < cookie.chars_to_skip: + if len(self.decoded.text) < cookie.chars_to_skip: raise oefmt(space.w_IOError, "can't restore logical file position") - self.decoded_chars_used = cookie.chars_to_skip + self.decoded.pos = cookie.chars_to_skip else: self.snapshot = PositionSnapshot(cookie.dec_flags, "") @@ -940,7 +977,7 @@ w_pos = space.call_method(self.w_buffer, "tell") if self.w_decoder is None or self.snapshot is None: - assert not self.decoded_chars + assert not self.decoded.text return w_pos cookie = PositionCookie(space.bigint_w(w_pos)) @@ -951,11 +988,11 @@ cookie.start_pos -= len(input) # How many decoded characters have been used up since the snapshot? - if not self.decoded_chars_used: + if not self.decoded.pos: # We haven't moved from the snapshot point. return space.newlong_from_rbigint(cookie.pack()) - chars_to_skip = self.decoded_chars_used + chars_to_skip = self.decoded.pos # Starting from the snapshot position, we will walk the decoder # forward until it gives us enough decoded characters. diff --git a/pypy/module/_io/test/test_interp_textio.py b/pypy/module/_io/test/test_interp_textio.py new file mode 100644 --- /dev/null +++ b/pypy/module/_io/test/test_interp_textio.py @@ -0,0 +1,68 @@ +import pytest +try: + from hypothesis import given, strategies as st, assume +except ImportError: + pytest.skip("hypothesis required") +from pypy.module._io.interp_bytesio import W_BytesIO +from pypy.module._io.interp_textio import W_TextIOWrapper, DecodeBuffer + +LINESEP = ['', '\r', '\n', '\r\n'] + [email protected] +def text_with_newlines(draw): + sep = draw(st.sampled_from(LINESEP)) + lines = draw(st.lists(st.text(max_size=10), max_size=10)) + return sep.join(lines) + +@given(txt=text_with_newlines(), + mode=st.sampled_from(['\r', '\n', '\r\n', '']), + limit=st.integers(min_value=-1)) +def test_readline(space, txt, mode, limit): + assume(limit != 0) + w_stream = W_BytesIO(space) + w_stream.descr_init(space, space.newbytes(txt.encode('utf-8'))) + w_textio = W_TextIOWrapper(space) + w_textio.descr_init( + space, w_stream, encoding='utf-8', + w_newline=space.newtext(mode)) + lines = [] + while True: + line = space.unicode_w(w_textio.readline_w(space, space.newint(limit))) + if limit > 0: + assert len(line) <= limit + if line: + lines.append(line) + else: + break + assert u''.join(lines) == txt + +@given(st.text()) +def test_read_buffer(text): + buf = DecodeBuffer(text) + assert buf.get_chars(-1) == text + assert buf.exhausted() + +@given(st.text(), st.lists(st.integers(min_value=0))) +def test_readn_buffer(text, sizes): + buf = DecodeBuffer(text) + strings = [] + for n in sizes: + s = buf.get_chars(n) + if not buf.exhausted(): + assert len(s) == n + else: + assert len(s) <= n + strings.append(s) + assert ''.join(strings) == text[:sum(sizes)] + +@given(st.text()) +def test_next_char(text): + buf = DecodeBuffer(text) + chars = [] + try: + while True: + chars.append(buf.next_char()) + except StopIteration: + pass + assert buf.exhausted() + assert u''.join(chars) == text diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ 
b/pypy/module/_multibytecodec/c_codecs.py @@ -197,19 +197,21 @@ MBENC_FLUSH = 1 MBENC_RESET = 2 -def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None): +def encode(codec, unicodedata, length, errors="strict", errorcb=None, + namecb=None): encodebuf = pypy_cjk_enc_new(codec) if not encodebuf: raise MemoryError try: - return encodeex(encodebuf, unicodedata, errors, errorcb, namecb) + return encodeex(encodebuf, unicodedata, length, errors, errorcb, namecb) finally: pypy_cjk_enc_free(encodebuf) -def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None, +def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None, namecb=None, ignore_error=0): - inleft = len(unicodedata) - with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as inbuf: + inleft = length + inbuf = rffi.utf82wcharp(utf8data, length) + try: if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0: raise MemoryError if ignore_error == 0: @@ -221,16 +223,18 @@ if r == 0 or r == ignore_error: break multibytecodec_encerror(encodebuf, r, errors, - errorcb, namecb, unicodedata) + errorcb, namecb, utf8data) while flags & MBENC_RESET: r = pypy_cjk_enc_reset(encodebuf) if r == 0: break multibytecodec_encerror(encodebuf, r, errors, - errorcb, namecb, unicodedata) + errorcb, namecb, utf8data) src = pypy_cjk_enc_outbuf(encodebuf) length = pypy_cjk_enc_outlen(encodebuf) return rffi.charpsize2str(src, length) + finally: + lltype.free(inbuf, flavor='raw') def multibytecodec_encerror(encodebuf, e, errors, errorcb, namecb, unicodedata): @@ -256,21 +260,16 @@ elif errors == "replace": codec = pypy_cjk_enc_getcodec(encodebuf) try: - replace = encode(codec, u"?") + replace = encode(codec, "?", 1) except EncodeDecodeError: replace = "?" else: assert errorcb - XXX - retu, rets, end = errorcb(errors, namecb, reason, - unicodedata.encode("utf8"), start, end) - if rets is not None: - # py3k only - replace = rets - else: - assert retu is not None - codec = pypy_cjk_enc_getcodec(encodebuf) - replace = encode(codec, retu, "strict", errorcb, namecb) + rets, end = errorcb(errors, namecb, reason, + unicodedata, start, end) + codec = pypy_cjk_enc_getcodec(encodebuf) + lgt, _ = rutf8.get_utf8_length_flag(rets) + replace = encode(codec, rets, lgt, "strict", errorcb, namecb) with rffi.scoped_nonmovingbuffer(replace) as inbuf: r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end) if r == MBERR_NOMEMORY: diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py --- a/pypy/module/_multibytecodec/interp_incremental.py +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -1,4 +1,5 @@ from rpython.rtyper.lltypesystem import lltype +from rpython.rlib import rutf8 from pypy.module._multibytecodec import c_codecs from pypy.module._multibytecodec.interp_multibytecodec import ( MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror, @@ -65,7 +66,8 @@ pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf) assert 0 <= pos <= len(object) self.pending = object[pos:] - return space.newunicode(output) + lgt, flag = rutf8.get_utf8_length_flag(output) + return space.newutf8(output, lgt, flag) @unwrap_spec(errors="text_or_none") @@ -88,7 +90,8 @@ def _initialize(self): self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec) - self.pending = u"" + self.pending = "" + self.pending_len = 0 def _free(self): self.pending = None @@ -96,25 +99,37 @@ c_codecs.pypy_cjk_enc_free(self.encodebuf) self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO) - 
@unwrap_spec(object='utf8', final=bool) - def encode_w(self, object, final=False): - u_object = object.decode('utf8') + @unwrap_spec(final=bool) + def encode_w(self, space, w_object, final=False): + utf8data, length = space.utf8_len_w(w_object) space = self.space state = space.fromcache(CodecState) if len(self.pending) > 0: - u_object = self.pending + u_object + utf8data = self.pending + utf8data + length += self.pending_len try: - output = c_codecs.encodeex(self.encodebuf, u_object, self.errors, + output = c_codecs.encodeex(self.encodebuf, utf8data, length, + self.errors, state.encode_error_handler, self.name, get_ignore_error(final)) except c_codecs.EncodeDecodeError as e: - raise wrap_unicodeencodeerror(space, e, object, len(u_object), + raise wrap_unicodeencodeerror(space, e, utf8data, length, self.name) except RuntimeError: raise wrap_runtimeerror(space) pos = c_codecs.pypy_cjk_enc_inbuf_consumed(self.encodebuf) - assert 0 <= pos <= len(u_object) - self.pending = u_object[pos:] + assert 0 <= pos <= length + # scan the utf8 string until we hit pos + i = 0 + stop = length - pos + self.pending_len = stop + if stop > 0: + while pos > 0: + i = rutf8.next_codepoint_pos(utf8data, i) + pos -= 1 + self.pending = utf8data[i:] + else: + self.pending = "" return space.newbytes(output) diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -31,23 +31,23 @@ return space.newtuple([space.newutf8(utf8_output, lgt, flag), space.newint(len(input))]) - @unwrap_spec(input='utf8', errors="text_or_none") - def encode(self, space, input, errors=None): + @unwrap_spec(errors="text_or_none") + def encode(self, space, w_input, errors=None): if errors is None: errors = 'strict' state = space.fromcache(CodecState) + input, length = space.utf8_len_w(w_input) # - u_input = input.decode('utf8') try: - output = c_codecs.encode(self.codec, u_input, errors, + output = c_codecs.encode(self.codec, input, length, errors, state.encode_error_handler, self.name) except c_codecs.EncodeDecodeError as e: - raise wrap_unicodeencodeerror(space, e, input, len(u_input), + raise wrap_unicodeencodeerror(space, e, input, length, self.name) except RuntimeError: raise wrap_runtimeerror(space) return space.newtuple([space.newbytes(output), - space.newint(len(u_input))]) + space.newint(length)]) MultibyteCodec.typedef = TypeDef( diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py --- a/pypy/module/_multibytecodec/test/test_c_codecs.py +++ b/pypy/module/_multibytecodec/test/test_c_codecs.py @@ -14,27 +14,27 @@ def test_decode_gbk(): c = getcodec("gbk") u = decode(c, "\xA1\xAA") - assert u == unichr(0x2014) + assert u == unichr(0x2014).encode('utf8') u = decode(c, "foobar") - assert u == u"foobar" + assert u == "foobar" def test_decode_hz(): # stateful c = getcodec("hz") u = decode(c, "~{abc}") - assert u == u'\u5f95\u6cef' + assert u == u'\u5f95\u6cef'.encode('utf8') u = decode(c, "~{") - assert u == u'' + assert u == '' def test_decodeex_hz(): c = getcodec("hz") decodebuf = c_codecs.pypy_cjk_dec_new(c) u = c_codecs.decodeex(decodebuf, "~{abcd~}") - assert u == u'\u5f95\u6c85' + assert u == u'\u5f95\u6c85'.encode('utf8') u = c_codecs.decodeex(decodebuf, "~{efgh~}") - assert u == u'\u5f50\u73b7' + assert u == u'\u5f50\u73b7'.encode('utf8') u = c_codecs.decodeex(decodebuf, "!~{abcd~}xyz~{efgh") - 
assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7' + assert u == u'!\u5f95\u6c85xyz\u5f50\u73b7'.encode('utf8') c_codecs.pypy_cjk_dec_free(decodebuf) def test_decodeex_hz_incomplete(): @@ -64,7 +64,7 @@ buf += c u = c_codecs.decodeex(decodebuf, buf, ignore_error = c_codecs.MBERR_TOOFEW) - assert u == output + assert u == output.encode('utf8') incompletepos = c_codecs.pypy_cjk_dec_inbuf_consumed(decodebuf) buf = buf[incompletepos:] assert buf == '' @@ -86,46 +86,47 @@ def test_decode_hz_ignore(): c = getcodec("hz") u = decode(c, 'def~{}abc', 'ignore') - assert u == u'def\u5fcf' + assert u == u'def\u5fcf'.encode('utf8') def test_decode_hz_replace(): c = getcodec("hz") u = decode(c, 'def~{}abc', 'replace') - assert u == u'def\ufffd\u5fcf' + assert u == u'def\ufffd\u5fcf'.encode('utf8') def test_encode_hz(): c = getcodec("hz") - s = encode(c, u'foobar') + s = encode(c, u'foobar'.encode('utf8'), 6) assert s == 'foobar' and type(s) is str - s = encode(c, u'\u5f95\u6cef') + s = encode(c, u'\u5f95\u6cef'.encode('utf8'), 2) assert s == '~{abc}~}' def test_encode_hz_error(): # error c = getcodec("hz") - e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value + e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def'.encode('utf8'), 7).value assert e.start == 3 assert e.end == 4 assert e.reason == "illegal multibyte sequence" def test_encode_hz_ignore(): c = getcodec("hz") - s = encode(c, u'abc\u1234def', 'ignore') + s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'ignore') assert s == 'abcdef' def test_encode_hz_replace(): c = getcodec("hz") - s = encode(c, u'abc\u1234def', 'replace') + s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'replace') assert s == 'abc?def' def test_encode_jisx0208(): c = getcodec('iso2022_jp') - s = encode(c, u'\u83ca\u5730\u6642\u592b') + s = encode(c, u'\u83ca\u5730\u6642\u592b'.encode('utf8'), 4) assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str def test_encode_custom_error_handler_bytes(): + py.test.skip("needs revamping in py3k") c = getcodec("hz") def errorhandler(errors, enc, msg, t, startingpos, endingpos): - return None, '\xc3', endingpos - s = encode(c, u'abc\u1234def', 'foo', errorhandler) + return u'\xc3'.encode('utf8'), endingpos + s = encode(c, u'abc\u1234def'.encode('utf8'), 7, 'foo', errorhandler) assert '\xc3' in s diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py --- a/pypy/module/_multibytecodec/test/test_translation.py +++ b/pypy/module/_multibytecodec/test/test_translation.py @@ -1,6 +1,7 @@ from pypy.module._multibytecodec import c_codecs from rpython.translator.c.test import test_standalone from rpython.config.translationoption import get_combined_translation_config +from rpython.rlib import rutf8 class TestTranslation(test_standalone.StandaloneTests): @@ -13,7 +14,8 @@ codecname, string = argv[1], argv[2] c = c_codecs.getcodec(codecname) u = c_codecs.decode(c, string) - r = c_codecs.encode(c, u) + lgt, _ = rutf8.get_utf8_length_flag(u) + r = c_codecs.encode(c, u, lgt) print r return 0 # diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py --- a/pypy/module/_pypyjson/interp_decoder.py +++ b/pypy/module/_pypyjson/interp_decoder.py @@ -1,7 +1,7 @@ import sys from rpython.rlib.rstring import StringBuilder from rpython.rlib.objectmodel import specialize, always_inline, r_dict -from rpython.rlib import rfloat, runicode +from rpython.rlib import rfloat, runicode, rutf8 from rpython.rtyper.lltypesystem import lltype, rffi 
from pypy.interpreter.error import oefmt from pypy.interpreter import unicodehelper @@ -19,29 +19,6 @@ return 0.0 return x * NEG_POW_10[exp] -def strslice2unicode_latin1(s, start, end): - """ - Convert s[start:end] to unicode. s is supposed to be an RPython string - encoded in latin-1, which means that the numeric value of each char is the - same as the corresponding unicode code point. - - Internally it's implemented at the level of low-level helpers, to avoid - the extra copy we would need if we take the actual slice first. - - No bound checking is done, use carefully. - """ - from rpython.rtyper.annlowlevel import llstr, hlunicode - from rpython.rtyper.lltypesystem.rstr import malloc, UNICODE - from rpython.rtyper.lltypesystem.lltype import cast_primitive, UniChar - length = end-start - ll_s = llstr(s) - ll_res = malloc(UNICODE, length) - ll_res.hash = 0 - for i in range(length): - ch = ll_s.chars[start+i] - ll_res.chars[i] = cast_primitive(UniChar, ch) - return hlunicode(ll_res) - def slice_eq(a, b): (ll_chars1, start1, length1, _) = a (ll_chars2, start2, length2, _) = b @@ -270,10 +247,11 @@ self.pos = i+1 return self.space.newdict() - d = {} + # XXX this should be improved to use an unwrapped dict + w_dict = self.space.newdict() while True: # parse a key: value - name = self.decode_key(i) + w_name = self.decode_key(i) i = self.skip_whitespace(self.pos) ch = self.ll_chars[i] if ch != ':': @@ -282,13 +260,13 @@ i = self.skip_whitespace(i) # w_value = self.decode_any(i) - d[name] = w_value + self.space.setitem(w_dict, w_name, w_value) i = self.skip_whitespace(self.pos) ch = self.ll_chars[i] i += 1 if ch == '}': self.pos = i - return self._create_dict(d) + return w_dict elif ch == ',': pass elif ch == '\0': @@ -297,10 +275,6 @@ self._raise("Unexpected '%s' when decoding object (char %d)", ch, i-1) - def _create_dict(self, d): - from pypy.objspace.std.dictmultiobject import from_unicode_key_dict - return from_unicode_key_dict(self.space, d) - def decode_string(self, i): start = i bits = 0 @@ -312,8 +286,7 @@ bits |= ord(ch) if ch == '"': self.pos = i - return self.space.newunicode( - self._create_string(start, i - 1, bits)) + return self._create_string(start, i - 1, bits) elif ch == '\\' or ch < '\x20': self.pos = i-1 return self.decode_string_escaped(start) @@ -322,12 +295,15 @@ if bits & 0x80: # the 8th bit is set, it's an utf8 string content_utf8 = self.getslice(start, end) - return unicodehelper.decode_utf8(self.space, content_utf8) + lgt, flag = unicodehelper.check_utf8_or_raise(self.space, + content_utf8) + return self.space.newutf8(content_utf8, lgt, flag) else: # ascii only, fast path (ascii is a strict subset of # latin1, and we already checked that all the chars are < # 128) - return strslice2unicode_latin1(self.s, start, end) + return self.space.newutf8(self.getslice(start, end), + end - start, rutf8.FLAG_ASCII) def decode_string_escaped(self, start): i = self.pos @@ -340,9 +316,10 @@ i += 1 if ch == '"': content_utf8 = builder.build() - content_unicode = unicodehelper.decode_utf8(self.space, content_utf8) + lgt, f = unicodehelper.check_utf8_or_raise(self.space, + content_utf8) self.pos = i - return self.space.newunicode(content_unicode) + return self.space.newutf8(content_utf8, lgt, f) elif ch == '\\': i = self.decode_escape_sequence(i, builder) elif ch < '\x20': @@ -389,8 +366,7 @@ return # help the annotator to know that we'll never go beyond # this point # - uchr = runicode.code_to_unichr(val) # may be a surrogate pair again - utf8_ch = unicodehelper.encode_utf8(self.space, 
uchr) + utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True) builder.append(utf8_ch) return i @@ -404,7 +380,7 @@ return 0x10000 + (((highsurr - 0xd800) << 10) | (lowsurr - 0xdc00)) def decode_key(self, i): - """ returns an unwrapped unicode """ + """ returns a wrapped unicode """ from rpython.rlib.rarithmetic import intmask i = self.skip_whitespace(i) diff --git a/pypy/module/_pypyjson/interp_encoder.py b/pypy/module/_pypyjson/interp_encoder.py --- a/pypy/module/_pypyjson/interp_encoder.py +++ b/pypy/module/_pypyjson/interp_encoder.py @@ -1,5 +1,5 @@ from rpython.rlib.rstring import StringBuilder -from rpython.rlib.runicode import str_decode_utf_8 +from rpython.rlib import rutf8 from pypy.interpreter import unicodehelper @@ -30,11 +30,8 @@ # the input is a string with only non-special ascii chars return w_string - eh = unicodehelper.decode_error_handler(space) - u = str_decode_utf_8( - s, len(s), None, final=True, errorhandler=eh, - allow_surrogates=True)[0] - sb = StringBuilder(len(u)) + unicodehelper.check_utf8_or_raise(space, s) + sb = StringBuilder(len(s)) sb.append_slice(s, 0, first) else: # We used to check if 'u' contains only safe characters, and return @@ -44,29 +41,31 @@ # a string (with the ascii encoding). This requires two passes # over the characters. So we may as well directly turn it into a # string here --- only one pass. - u = space.unicode_w(w_string) - sb = StringBuilder(len(u)) + s = space.utf8_w(w_string) + sb = StringBuilder(len(s)) first = 0 - for i in range(first, len(u)): - c = u[i] - if c <= u'~': - if c == u'"' or c == u'\\': + it = rutf8.Utf8StringIterator(s) + for i in range(first): + it.next() + for c in it: + if c <= ord('~'): + if c == ord('"') or c == ord('\\'): sb.append('\\') - elif c < u' ': - sb.append(ESCAPE_BEFORE_SPACE[ord(c)]) + elif c < ord(' '): + sb.append(ESCAPE_BEFORE_SPACE[c]) continue - sb.append(chr(ord(c))) + sb.append(chr(c)) else: - if c <= u'\uffff': + if c <= ord(u'\uffff'): sb.append('\\u') - sb.append(HEX[ord(c) >> 12]) - sb.append(HEX[(ord(c) >> 8) & 0x0f]) - sb.append(HEX[(ord(c) >> 4) & 0x0f]) - sb.append(HEX[ord(c) & 0x0f]) + sb.append(HEX[c >> 12]) + sb.append(HEX[(c >> 8) & 0x0f]) + sb.append(HEX[(c >> 4) & 0x0f]) + sb.append(HEX[c & 0x0f]) else: # surrogate pair - n = ord(c) - 0x10000 + n = c - 0x10000 s1 = 0xd800 | ((n >> 10) & 0x3ff) sb.append('\\ud') sb.append(HEX[(s1 >> 8) & 0x0f]) diff --git a/pypy/module/_pypyjson/test/test__pypyjson.py b/pypy/module/_pypyjson/test/test__pypyjson.py --- a/pypy/module/_pypyjson/test/test__pypyjson.py +++ b/pypy/module/_pypyjson/test/test__pypyjson.py @@ -10,10 +10,14 @@ assert dec.skip_whitespace(8) == len(s) dec.close() +class FakeSpace(object): + def newutf8(self, s, l, f): + return s + def test_decode_key(): s1 = "123" * 100 s = ' "%s" "%s" ' % (s1, s1) - dec = JSONDecoder('fake space', s) + dec = JSONDecoder(FakeSpace(), s) assert dec.pos == 0 x = dec.decode_key(0) assert x == s1 diff --git a/pypy/module/_rawffi/alt/interp_funcptr.py b/pypy/module/_rawffi/alt/interp_funcptr.py --- a/pypy/module/_rawffi/alt/interp_funcptr.py +++ b/pypy/module/_rawffi/alt/interp_funcptr.py @@ -167,8 +167,8 @@ addr = rffi.cast(rffi.ULONG, buf) self.argchain.arg(addr) - def handle_unichar_p(self, w_ffitype, w_obj, unicodeval): - buf = rffi.unicode2wcharp(unicodeval) + def handle_unichar_p(self, w_ffitype, w_obj, utf8val, utf8len): + buf = rffi.utf82wcharp(utf8val, utf8len) self.w_func.to_free.append(rffi.cast(rffi.VOIDP, buf)) addr = rffi.cast(rffi.ULONG, buf) self.argchain.arg(addr) diff --git 
a/pypy/module/_rawffi/alt/test/test_type_converter.py b/pypy/module/_rawffi/alt/test/test_type_converter.py --- a/pypy/module/_rawffi/alt/test/test_type_converter.py +++ b/pypy/module/_rawffi/alt/test/test_type_converter.py @@ -6,7 +6,7 @@ class DummyFromAppLevelConverter(FromAppLevelConverter): - def handle_all(self, w_ffitype, w_obj, val): + def handle_all(self, w_ffitype, w_obj, val, lgt=None): self.lastval = val handle_signed = handle_all @@ -120,8 +120,8 @@ def test_strings(self): # first, try automatic conversion from applevel self.check(app_types.char_p, self.space.newbytes('foo'), 'foo') - self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'), u'foo\u1234') - self.check(app_types.unichar_p, self.space.wrap('foo'), u'foo') + self.check(app_types.unichar_p, self.space.wrap(u'foo\u1234'), u'foo\u1234'.encode('utf8')) + self.check(app_types.unichar_p, self.space.wrap('foo'), 'foo') # then, try to pass explicit pointers self.check(app_types.char_p, self.space.wrap(42), 42) self.check(app_types.unichar_p, self.space.wrap(42), 42) diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py --- a/pypy/module/_rawffi/alt/type_converter.py +++ b/pypy/module/_rawffi/alt/type_converter.py @@ -1,6 +1,6 @@ from rpython.rlib import libffi -from rpython.rlib import jit -from rpython.rlib.rarithmetic import r_uint +from rpython.rlib import jit, rutf8 +from rpython.rlib.rarithmetic import r_uint, intmask from pypy.interpreter.error import oefmt from pypy.module._rawffi.structure import W_StructureInstance, W_Structure from pypy.module._rawffi.alt.interp_ffitype import app_types @@ -85,8 +85,8 @@ return True elif w_ffitype.is_unichar_p() and (w_type is self.space.w_bytes or w_type is self.space.w_unicode): - unicodeval = self.space.unicode_w(w_obj) - self.handle_unichar_p(w_ffitype, w_obj, unicodeval) + utf8, lgt = self.space.utf8_len_w(w_obj) + self.handle_unichar_p(w_ffitype, w_obj, utf8, lgt) return True return False @@ -147,7 +147,7 @@ """ self.error(w_ffitype, w_obj) - def handle_unichar_p(self, w_ffitype, w_obj, unicodeval): + def handle_unichar_p(self, w_ffitype, w_obj, utf8val, utf8len): """ unicodeval: interp-level unicode """ @@ -228,7 +228,8 @@ return space.newbytes(chr(ucharval)) elif w_ffitype.is_unichar(): wcharval = self.get_unichar(w_ffitype) - return space.newunicode(unichr(wcharval)) + return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1, + rutf8.get_flag_from_code(intmask(wcharval))) elif w_ffitype.is_double(): return self._float(w_ffitype) elif w_ffitype.is_singlefloat(): diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py --- a/pypy/module/_rawffi/interp_rawffi.py +++ b/pypy/module/_rawffi/interp_rawffi.py @@ -10,6 +10,7 @@ from rpython.rtyper.lltypesystem import lltype, rffi from rpython.rtyper.tool import rffi_platform from rpython.rlib.unroll import unrolling_iterable +from rpython.rlib import rutf8 from rpython.rlib.objectmodel import specialize import rpython.rlib.rposix as rposix @@ -416,13 +417,13 @@ val = s[0] push_func(add_arg, argdesc, val) elif letter == 'u': - s = space.unicode_w(w_arg) - if len(s) != 1: + s, lgt = space.utf8_len_w(w_arg) + if lgt != 1: raise oefmt(space.w_TypeError, "Expected unicode string of length one as wide " "character") - val = s[0] - push_func(add_arg, argdesc, val) + val = rutf8.codepoint_at_pos(s, 0) + push_func(add_arg, argdesc, rffi.cast(rffi.WCHAR_T, val)) else: for c in unroll_letters_for_numbers: if letter == c: diff --git 
a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -7,7 +7,8 @@ from pypy.interpreter.error import OperationError, oefmt from rpython.rlib.rarithmetic import intmask from rpython.rlib import jit -from rpython.rlib.rstring import StringBuilder, UnicodeBuilder +from rpython.rlib.rstring import StringBuilder +from rpython.rlib.rutf8 import Utf8StringBuilder # ____________________________________________________________ # @@ -237,8 +238,8 @@ filter_is_callable = True else: if space.isinstance_w(w_ptemplate, space.w_unicode): - filter_as_unicode = space.unicode_w(w_ptemplate) - literal = u'\\' not in filter_as_unicode + filter_as_unicode = space.utf8_w(w_ptemplate) + literal = '\\' not in filter_as_unicode use_builder = ( space.isinstance_w(w_string, space.w_unicode) and literal) else: @@ -267,7 +268,7 @@ sublist_w = strbuilder = unicodebuilder = None if use_builder: if filter_as_unicode is not None: - unicodebuilder = UnicodeBuilder(ctx.end) + unicodebuilder = Utf8StringBuilder(ctx.end) else: assert filter_as_string is not None strbuilder = StringBuilder(ctx.end) @@ -335,7 +336,9 @@ return space.newbytes(strbuilder.build()), n else: assert unicodebuilder is not None - return space.newunicode(unicodebuilder.build()), n + return space.newutf8(unicodebuilder.build(), + unicodebuilder.get_length(), + unicodebuilder.get_flag()), n else: if space.isinstance_w(w_string, space.w_unicode): w_emptystr = space.newunicode(u'') diff --git a/pypy/module/_ssl/interp_ssl.py b/pypy/module/_ssl/interp_ssl.py --- a/pypy/module/_ssl/interp_ssl.py +++ b/pypy/module/_ssl/interp_ssl.py @@ -1566,12 +1566,13 @@ cadata = space.bufferstr_w(w_cadata) else: ca_file_type = SSL_FILETYPE_PEM - try: - cadata = space.unicode_w(w_cadata).encode('ascii') - except UnicodeEncodeError: + w_uni = space.convert_arg_to_w_unicode(w_cadata) + if not w_uni.is_ascii(): raise oefmt(space.w_TypeError, "cadata should be a ASCII string or a " "bytes-like object") + cadata = space.utf8_w(w_uni) + if cafile is None and capath is None and cadata is None: raise oefmt(space.w_TypeError, "cafile and capath cannot be both omitted") diff --git a/pypy/objspace/std/dictmultiobject.py b/pypy/objspace/std/dictmultiobject.py --- a/pypy/objspace/std/dictmultiobject.py +++ b/pypy/objspace/std/dictmultiobject.py @@ -1257,12 +1257,6 @@ create_iterator_classes(UnicodeDictStrategy) -def from_unicode_key_dict(space, d): - strategy = space.fromcache(UnicodeDictStrategy) - storage = strategy.erase(d) - return W_DictObject(space, strategy, storage) - - class IntDictStrategy(AbstractTypedStrategy, DictStrategy): erase, unerase = rerased.new_erasing_pair("int") erase = staticmethod(erase) diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -367,23 +367,10 @@ assert isinstance(utf8s, str) return W_UnicodeObject(utf8s, length, flag) - def new_from_utf8(self, utf8s): - # XXX: kill me! - assert isinstance(utf8s, str) - length, flag = rutf8.check_utf8(utf8s, True) - return W_UnicodeObject(utf8s, length, flag) - def newfilename(self, s): assert isinstance(s, str) # on pypy3, this decodes the byte string return W_BytesObject(s) # with the filesystem encoding - def newunicode(self, unistr): - # XXX: kill me! 
- assert isinstance(unistr, unicode) - utf8s = unistr.encode("utf-8") - length, flag = rutf8.check_utf8(utf8s, True) - return self.newutf8(utf8s, length, flag) - def type(self, w_obj): jit.promote(w_obj.__class__) return w_obj.getclass(self) diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py --- a/pypy/objspace/std/unicodeobject.py +++ b/pypy/objspace/std/unicodeobject.py @@ -64,6 +64,11 @@ # - malloced object, which means it has index, then # _index_storage.flags determines the kind + @staticmethod + def from_utf8builder(builder): + return W_UnicodeObject( + builder.build(), builder.get_length(), builder.get_flag()) + def __repr__(self): """representation for debugging purposes""" return "%s(%r)" % (self.__class__.__name__, self._utf8) @@ -344,57 +349,38 @@ return mod_format(space, w_values, self, do_unicode=True) def descr_swapcase(self, space): - selfvalue = self._utf8 - builder = StringBuilder(len(selfvalue)) - flag = self._get_flag() - i = 0 - while i < len(selfvalue): - ch = rutf8.codepoint_at_pos(selfvalue, i) - i = rutf8.next_codepoint_pos(selfvalue, i) + input = self._utf8 + builder = rutf8.Utf8StringBuilder(len(input)) + for ch in rutf8.Utf8StringIterator(input): if unicodedb.isupper(ch): ch = unicodedb.tolower(ch) elif unicodedb.islower(ch): ch = unicodedb.toupper(ch) - if ch >= 0x80: - flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) - rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True) - return W_UnicodeObject(builder.build(), self._length, flag) + builder.append_code(ch) + return self.from_utf8builder(builder) def descr_title(self, space): if len(self._utf8) == 0: return self - utf8, flag = self.title_unicode(self._utf8) - return W_UnicodeObject(utf8, self._len(), flag) + return self.title_unicode(self._utf8) @jit.elidable def title_unicode(self, value): input = self._utf8 - builder = StringBuilder(len(input)) - i = 0 + builder = rutf8.Utf8StringBuilder(len(input)) previous_is_cased = False - flag = self._get_flag() - while i < len(input): - ch = rutf8.codepoint_at_pos(input, i) - i = rutf8.next_codepoint_pos(input, i) + for ch in rutf8.Utf8StringIterator(input): if not previous_is_cased: ch = unicodedb.totitle(ch) else: ch = unicodedb.tolower(ch) - if ch >= 0x80: - flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) - rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True) + builder.append_code(ch) previous_is_cased = unicodedb.iscased(ch) - return builder.build(), flag + return self.from_utf8builder(builder) def descr_translate(self, space, w_table): - input = self._utf8 - result = StringBuilder(len(input)) - result_length = 0 - flag = self._get_flag() - i = 0 - while i < len(input): - codepoint = rutf8.codepoint_at_pos(input, i) - i = rutf8.next_codepoint_pos(input, i) + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + for codepoint in rutf8.Utf8StringIterator(self._utf8): try: w_newval = space.getitem(w_table, space.newint(codepoint)) except OperationError as e: @@ -406,24 +392,19 @@ elif space.isinstance_w(w_newval, space.w_int): codepoint = space.int_w(w_newval) elif isinstance(w_newval, W_UnicodeObject): - result.append(w_newval._utf8) - flag = rutf8.combine_flags(flag, w_newval._get_flag()) - result_length += w_newval._length + builder.append_utf8( + w_newval._utf8, w_newval._length, w_newval._get_flag()) continue else: raise oefmt(space.w_TypeError, "character mapping must return integer, None " "or unicode") try: - if codepoint >= 0x80: - flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) - 
rutf8.unichr_as_utf8_append(result, codepoint, - allow_surrogates=True) - result_length += 1 + builder.append_code(codepoint) except ValueError: raise oefmt(space.w_TypeError, "character mapping must be in range(0x110000)") - return W_UnicodeObject(result.build(), result_length, flag) + return self.from_utf8builder(builder) def descr_find(self, space, w_sub, w_start=None, w_end=None): w_result = self._unwrap_and_search(space, w_sub, w_start, w_end) @@ -517,12 +498,6 @@ def _join_return_one(self, space, w_obj): return space.is_w(space.type(w_obj), space.w_unicode) - def _join_check_item(self, space, w_obj): - if (space.isinstance_w(w_obj, space.w_bytes) or - space.isinstance_w(w_obj, space.w_unicode)): - return 0 - return 1 - def descr_formatter_parser(self, space): from pypy.objspace.std.newformat import unicode_template_formatter tformat = unicode_template_formatter(space, space.utf8_w(self)) @@ -534,16 +509,11 @@ return tformat.formatter_field_name_split() def descr_lower(self, space): - builder = StringBuilder(len(self._utf8)) - pos = 0 - flag = self._get_flag() - while pos < len(self._utf8): - lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos)) - if lower >= 0x80: - flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR) - rutf8.unichr_as_utf8_append(builder, lower, allow_surrogates=True) - pos = rutf8.next_codepoint_pos(self._utf8, pos) - return W_UnicodeObject(builder.build(), self._len(), flag) + builder = rutf8.Utf8StringBuilder(len(self._utf8)) + for ch in rutf8.Utf8StringIterator(self._utf8): + lower = unicodedb.tolower(ch) + builder.append_code(lower) + return self.from_utf8builder(builder) def descr_isdecimal(self, space): return self._is_generic(space, '_isdecimal') @@ -657,13 +627,11 @@ flag = self._get_flag() for i in range(size): w_s = list_w[i] - check_item = self._join_check_item(space, w_s) - if check_item == 1: + if not (space.isinstance_w(w_s, space.w_bytes) or + space.isinstance_w(w_s, space.w_unicode)): raise oefmt(space.w_TypeError, - "sequence item %d: expected string, %T found", + "sequence item %d: expected string or unicode, %T found", i, w_s) - elif check_item == 2: - return self._join_autoconvert(space, list_w) # XXX Maybe the extra copy here is okay? 
It was basically going to
            # happen anyway, what with being placed into the builder
            w_u = self.convert_arg_to_w_unicode(space, w_s)
@@ -711,18 +679,11 @@
         return space.newlist(strs_w)
 
     def descr_upper(self, space):
-        value = self._utf8
-        builder = StringBuilder(len(value))
-        flag = self._get_flag()
-        i = 0
-        while i < len(value):
-            uchar = rutf8.codepoint_at_pos(value, i)
-            uchar = unicodedb.toupper(uchar)
-            if uchar >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-            i = rutf8.next_codepoint_pos(value, i)
-            rutf8.unichr_as_utf8_append(builder, uchar, allow_surrogates=True)
-        return W_UnicodeObject(builder.build(), self._length, flag)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        for ch in rutf8.Utf8StringIterator(self._utf8):
+            ch = unicodedb.toupper(ch)
+            builder.append_code(ch)
+        return self.from_utf8builder(builder)
 
     @unwrap_spec(width=int)
     def descr_zfill(self, space, width):
@@ -826,22 +787,15 @@
         if len(value) == 0:
             return self._empty()
 
-        flag = self._get_flag()
-        builder = StringBuilder(len(value))
-        uchar = rutf8.codepoint_at_pos(value, 0)
-        i = rutf8.next_codepoint_pos(value, 0)
+        builder = rutf8.Utf8StringBuilder(len(self._utf8))
+        it = rutf8.Utf8StringIterator(self._utf8)
+        uchar = it.next()
         ch = unicodedb.toupper(uchar)
-        rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
-        if ch >= 0x80:
-            flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-        while i < len(value):
-            uchar = rutf8.codepoint_at_pos(value, i)
-            i = rutf8.next_codepoint_pos(value, i)
-            ch = unicodedb.tolower(uchar)
-            rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
-            if ch >= 0x80:
-                flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-        return W_UnicodeObject(builder.build(), self._len(), flag)
+        builder.append_code(ch)
+        for ch in it:
+            ch = unicodedb.tolower(ch)
+            builder.append_code(ch)
+        return self.from_utf8builder(builder)
 
     @unwrap_spec(width=int, w_fillchar=WrappedDefault(' '))
     def descr_center(self, space, width, w_fillchar):
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 cffi>=1.4.0
-vmprof>=0.4.10 # required to parse log files in rvmprof tests
+
+# parse log files in rvmprof tests
+vmprof>=0.4.10; 'x86' in platform.machine #skip arm, s390x
 
 # hypothesis is used for test generation on untranslated tests
 hypothesis
diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -687,6 +687,11 @@
             self._lgt += 1
         unichr_as_utf8_append(self._s, code, True)
 
+    def append_utf8(self, utf8, length, flag):
+        self._flag = combine_flags(self._flag, flag)
+        self._lgt += length
+        self._s.append(utf8)
+
     def build(self):
         return self._s.build()
 
@@ -702,10 +707,12 @@
         self._end = len(utf8s)
         self._pos = 0
 
-    def done(self):
-        return self._pos == self._end
+    def __iter__(self):
+        return self
 
     def next(self):
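The unicodeobject.py rewrites above all share one pattern: iterate integer codepoints with rutf8.Utf8StringIterator (made a real iterator by the change just above) and emit them through Utf8StringBuilder.append_code, which tracks length and flag as it goes. A toy standalone sketch of that pattern; the ASCII-only case change stands in for unicodedb and is not part of the patch:

    from rpython.rlib import rutf8

    def toupper_utf8(s):
        # s is a utf8-encoded str; the iterator yields integer codepoints
        builder = rutf8.Utf8StringBuilder(len(s))
        for ch in rutf8.Utf8StringIterator(s):
            if ord('a') <= ch <= ord('z'):
                ch -= 32              # toy stand-in for unicodedb.toupper
            builder.append_code(ch)
        return builder.build()

For instance, toupper_utf8('h\xc3\xa9llo') returns 'H\xc3\xa9LLO': the non-ASCII codepoint passes through append_code unchanged while the builder keeps the length and flag bookkeeping out of the caller's hands.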
