Author: fijal Branch: unicode-utf8 Changeset: r93126:559a0a0bb302 Date: 2017-11-22 23:50 +0100 http://bitbucket.org/pypy/pypy/changeset/559a0a0bb302/
Log: in progress io diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -1779,6 +1779,9 @@ assert not hasattr(self, 'is_fake_objspace') return W_UnicodeObject.convert_arg_to_w_unicode(self, w_obj, strict) + def utf8_len_w(self, w_obj): + w_obj = self.convert_arg_to_w_unicode(w_obj) + return w_obj._utf8, w_obj._len() def realutf8_w(self, w_obj): # Like utf8_w(), but only works if w_obj is really of type diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -10,7 +10,8 @@ from pypy.module._io.interp_iobase import W_IOBase, convert_size, trap_eintr from rpython.rlib.rarithmetic import intmask, r_uint, r_ulonglong from rpython.rlib.rbigint import rbigint -from rpython.rlib.rstring import UnicodeBuilder +from rpython.rlib.rstring import StringBuilder +from rpython.rlib.rutf8 import FLAG_ASCII, check_utf8 STATE_ZERO, STATE_OK, STATE_DETACHED = range(3) @@ -29,17 +30,22 @@ def __init__(self, space): self.w_newlines_dict = { - SEEN_CR: space.newunicode(u"\r"), - SEEN_LF: space.newunicode(u"\n"), - SEEN_CRLF: space.newunicode(u"\r\n"), + SEEN_CR: space.newutf8("\r", 1, FLAG_ASCII), + SEEN_LF: space.newutf8("\n", 1, FLAG_ASCII), + SEEN_CRLF: space.newutf8("\r\n", 2, FLAG_ASCII), SEEN_CR | SEEN_LF: space.newtuple( - [space.newunicode(u"\r"), space.newunicode(u"\n")]), + [space.newutf8("\r", 1, FLAG_ASCII), + space.newutf8("\n", 1, FLAG_ASCII)]), SEEN_CR | SEEN_CRLF: space.newtuple( - [space.newunicode(u"\r"), space.newunicode(u"\r\n")]), + [space.newutf8("\r", 1, FLAG_ASCII), + space.newutf8("\r\n", 2, FLAG_ASCII)]), SEEN_LF | SEEN_CRLF: space.newtuple( - [space.newunicode(u"\n"), space.newunicode(u"\r\n")]), + [space.newutf8("\n", 1, FLAG_ASCII), + space.newutf8("\r\n", 2, FLAG_ASCII)]), SEEN_CR | SEEN_LF | SEEN_CRLF: space.newtuple( - [space.newunicode(u"\r"), space.newunicode(u"\n"), space.newunicode(u"\r\n")]), + [space.newutf8("\r", 1, FLAG_ASCII), + space.newutf8("\n", 1, FLAG_ASCII), + space.newutf8("\r\n", 2, FLAG_ASCII)]), } @unwrap_spec(translate=int) @@ -73,25 +79,25 @@ raise oefmt(space.w_TypeError, "decoder should return a string result") - output = space.unicode_w(w_output) + output, output_len = space.utf8_len_w(w_output) output_len = len(output) if self.pendingcr and (final or output_len): - output = u'\r' + output + output = '\r' + output self.pendingcr = False output_len += 1 # retain last \r even when not translating data: # then readline() is sure to get \r\n in one pass if not final and output_len > 0: - last = output_len - 1 + last = len(output) - 1 assert last >= 0 - if output[last] == u'\r': + if output[last] == '\r': output = output[:last] self.pendingcr = True output_len -= 1 if output_len == 0: - return space.newunicode(u"") + return space.newutf8("", 1, FLAG_ASCII) # Record which newlines are read and do newline translation if # desired, all in one pass. @@ -101,52 +107,53 @@ # for the \r only_lf = False if seennl == SEEN_LF or seennl == 0: - only_lf = (output.find(u'\r') < 0) + only_lf = (output.find('\r') < 0) if only_lf: # If not already seen, quick scan for a possible "\n" character. # (there's nothing else to be done, even when in translation mode) - if seennl == 0 and output.find(u'\n') >= 0: + if seennl == 0 and output.find('\n') >= 0: seennl |= SEEN_LF # Finished: we have scanned for newlines, and none of them # need translating. elif not self.translate: i = 0 - while i < output_len: + while i < len(output): if seennl == SEEN_ALL: break c = output[i] i += 1 - if c == u'\n': + if c == '\n': seennl |= SEEN_LF - elif c == u'\r': - if i < output_len and output[i] == u'\n': + elif c == '\r': + if i < len(output) and output[i] == '\n': seennl |= SEEN_CRLF i += 1 else: seennl |= SEEN_CR - elif output.find(u'\r') >= 0: + elif output.find('\r') >= 0: # Translate! - builder = UnicodeBuilder(output_len) + builder = StringBuilder(len(output)) i = 0 while i < output_len: c = output[i] i += 1 - if c == u'\n': + if c == '\n': seennl |= SEEN_LF - elif c == u'\r': - if i < output_len and output[i] == u'\n': + elif c == '\r': + if i < len(output) and output[i] == '\n': seennl |= SEEN_CRLF i += 1 else: seennl |= SEEN_CR - builder.append(u'\n') + builder.append('\n') continue builder.append(c) output = builder.build() self.seennl |= seennl - return space.newunicode(output) + lgt, flag = check_utf8(output, True) + return space.newutf8(output, lgt, flag) def reset_w(self, space): self.seennl = 0 @@ -373,8 +380,8 @@ if space.is_none(w_newline): newline = None else: - newline = space.unicode_w(w_newline) - if newline and newline not in (u'\n', u'\r\n', u'\r'): + newline = space.utf8_w(w_newline) + if newline and newline not in ('\n', '\r\n', '\r'): raise oefmt(space.w_ValueError, "illegal newline value: %R", w_newline) @@ -384,13 +391,13 @@ self.readtranslate = newline is None self.readnl = newline - self.writetranslate = (newline != u'') + self.writetranslate = (newline != '') if not self.readuniversal: self.writenl = self.readnl - if self.writenl == u'\n': + if self.writenl == '\n': self.writenl = None elif _WINDOWS: - self.writenl = u"\r\n" + self.writenl = "\r\n" else: self.writenl = None @@ -519,7 +526,7 @@ def _get_decoded_chars(self, size): if self.decoded_chars is None: - return u"" + return "" available = len(self.decoded_chars) - self.decoded_chars_used if size < 0 or size > available: @@ -574,7 +581,7 @@ w_decoded = space.call_method(self.w_decoder, "decode", w_input, space.newbool(eof)) check_decoded(space, w_decoded) - self._set_decoded_chars(space.unicode_w(w_decoded)) + self._set_decoded_chars(space.utf8_w(w_decoded)) if space.len_w(w_decoded) > 0: eof = False @@ -745,20 +752,19 @@ raise oefmt(space.w_TypeError, "unicode argument expected, got '%T'", w_text) - text = space.unicode_w(w_text) - textlen = len(text) + text, textlen = space.utf8_len_w(w_text) haslf = False if (self.writetranslate and self.writenl) or self.line_buffering: - if text.find(u'\n') >= 0: + if text.find('\n') >= 0: haslf = True if haslf and self.writetranslate and self.writenl: w_text = space.call_method(w_text, "replace", space.newunicode(u'\n'), space.newunicode(self.writenl)) - text = space.unicode_w(w_text) + text = space.utf8_w(w_text) needflush = False - if self.line_buffering and (haslf or text.find(u'\r') >= 0): + if self.line_buffering and (haslf or text.find('\r') >= 0): needflush = True # XXX What if we were just reading? _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit