[pypy-commit] pypy unicode-utf8: merge heads

2017-11-23 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8
Changeset: r93134:25ac6121d03c
Date: 2017-11-23 10:26 +0100
http://bitbucket.org/pypy/pypy/changeset/25ac6121d03c/

Log:merge heads

diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -272,7 +272,7 @@
 self._typed_unwrap_error(space, "unicode")
 
 def convert_to_w_unicode(self, space):
-self._typed_unwrap_error(space, "unicode")
+self._typed_unwrap_error(space, "unicode")
 
 def bytearray_list_of_chars_w(self, space):
 self._typed_unwrap_error(space, "bytearray")
@@ -1759,6 +1759,11 @@
 
 def utf8_w(self, w_obj):
 return w_obj.utf8_w(self)
+
+def unicode_w(self, w_obj):
+# XXX: kill me!
+return w_obj.utf8_w(self).decode('utf-8')
+
 def convert_to_w_unicode(self, w_obj):
 return w_obj.convert_to_w_unicode(self)
 
diff --git a/pypy/module/_io/interp_stringio.py 
b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -184,9 +184,7 @@
 start,
 end
 )
-if endpos >= 0:
-endpos += start
-else:
+if endpos < 0:
 endpos = end
 assert endpos >= 0
 self.pos = endpos
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -97,7 +97,7 @@
 output_len -= 1
 
 if output_len == 0:
-return space.newutf8("", 1, FLAG_ASCII)
+return space.newutf8("", 0, FLAG_ASCII)
 
 # Record which newlines are read and do newline translation if
 # desired, all in one pass.
@@ -224,30 +224,28 @@
 def _find_line_ending(self, line, start, end):
 size = end - start
 if self.readtranslate:
-
 # Newlines are already translated, only search for \n
-pos = line.find(u'\n', start, end)
+pos = line.find('\n', start, end)
 if pos >= 0:
-return pos - start + 1, 0
+return pos + 1, 0
 else:
 return -1, size
 elif self.readuniversal:
 # Universal newline search. Find any of \r, \r\n, \n
 # The decoder ensures that \r\n are not split in two pieces
-i = 0
+i = start
 while True:
-# Fast path for non-control chars. The loop always ends
-# since the Py_UNICODE storage is NUL-terminated.
-while i < size and line[start + i] > '\r':
+# Fast path for non-control chars.
+while i < end and line[i] > '\r':
 i += 1
-if i >= size:
+if i >= end:
 return -1, size
-ch = line[start + i]
+ch = line[i]
 i += 1
 if ch == '\n':
 return i, 0
 if ch == '\r':
-if line[start + i] == '\n':
+if line[i] == '\n':
 return i + 1, 0
 else:
 return i, 0
@@ -255,7 +253,7 @@
 # Non-universal mode.
 pos = line.find(self.readnl, start, end)
 if pos >= 0:
-return pos - start + len(self.readnl), 0
+return pos + len(self.readnl), 0
 else:
 pos = line.find(self.readnl[0], start, end)
 if pos >= 0:
@@ -520,8 +518,13 @@
 # _
 # read methods
 
-def _set_decoded_chars(self, chars):
-self.decoded_chars = chars
+def _unset_decoded(self):
+self.decoded_chars = None
+self.decoded_chars_used = 0
+
+def _set_decoded(self, space, w_decoded):
+check_decoded(space, w_decoded)
+self.decoded_chars = space.utf8_w(w_decoded)
 self.decoded_chars_used = 0
 
 def _get_decoded_chars(self, size):
@@ -580,8 +583,7 @@
 eof = space.len_w(w_input) == 0
 w_decoded = space.call_method(self.w_decoder, "decode",
   w_input, space.newbool(eof))
-check_decoded(space, w_decoded)
-self._set_decoded_chars(space.utf8_w(w_decoded))
+self._set_decoded(space, w_decoded)
 if space.len_w(w_decoded) > 0:
 eof = False
 
@@ -617,13 +619,13 @@
 w_bytes = space.call_method(self.w_buffer, "read")
 w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, 
space.w_True)
 check_decoded(space, w_decoded)
-w_result = space.newunicode(self._get_decoded_chars(-1))
+w_result = space.new_from_utf8(self._get_decoded_chars(-1))

[pypy-commit] pypy unicode-utf8: Tweak the unicode FLAG_xx values for performance; collapse two identical helpers; move combine_flags() to rutf8

2017-11-23 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8
Changeset: r93133:a1cf21d7a124
Date: 2017-11-23 10:24 +0100
http://bitbucket.org/pypy/pypy/changeset/a1cf21d7a124/

Log:Tweak the unicode FLAG_xx values for performance; collapse two
identical helpers; move combine_flags() to rutf8

diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -3,6 +3,7 @@
 from pypy.interpreter.error import OperationError
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib import rutf8
+from rpython.rlib.rutf8 import combine_flags
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rstring import StringBuilder
 from pypy.module._codecs import interp_codecs
@@ -43,14 +44,6 @@
 from pypy.objspace.std.unicodeobject import encode_object
 return encode_object(space, w_data, encoding, errors)
 
-def combine_flags(one, two):
-if one == rutf8.FLAG_ASCII and two == rutf8.FLAG_ASCII:
-return rutf8.FLAG_ASCII
-elif (one == rutf8.FLAG_HAS_SURROGATES or
-  two == rutf8.FLAG_HAS_SURROGATES):
-return rutf8.FLAG_HAS_SURROGATES
-return rutf8.FLAG_REGULAR
-
 
 def _has_surrogate(u):
 for c in u:
@@ -788,7 +781,8 @@
 # first surrogate
 surrogate = outCh
 else:
-flag = combine_flags(flag, rutf8.unichr_to_flag(outCh))
+flag = combine_flags(flag,
+ rutf8.get_flag_from_code(outCh))
 outsize += 1
 assert outCh >= 0
 rutf8.unichr_as_utf8_append(result, outCh, True)
diff --git a/pypy/objspace/std/unicodeobject.py 
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -356,7 +356,7 @@
 elif unicodedb.islower(ch):
 ch = unicodedb.toupper(ch)
 if ch >= 0x80:
-flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
 rutf8.unichr_as_utf8_append(builder, ch)
 return W_UnicodeObject(builder.build(), self._length, flag)
 
@@ -381,7 +381,7 @@
 else:
 ch = unicodedb.tolower(ch)
 if ch >= 0x80:
-flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
 rutf8.unichr_as_utf8_append(builder, ch)
 previous_is_cased = unicodedb.iscased(ch)
 return builder.build(), flag
@@ -407,7 +407,7 @@
 codepoint = space.int_w(w_newval)
 elif isinstance(w_newval, W_UnicodeObject):
 result.append(w_newval._utf8)
-flag = unicodehelper.combine_flags(flag, 
w_newval._get_flag())
+flag = rutf8.combine_flags(flag, w_newval._get_flag())
 result_length += w_newval._length
 continue
 else:
@@ -416,7 +416,7 @@
 "or unicode")
 try:
 if codepoint >= 0x80:
-flag = unicodehelper.combine_flags(flag, 
rutf8.FLAG_REGULAR)
+flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
 rutf8.unichr_as_utf8_append(result, codepoint,
 allow_surrogates=True)
 result_length += 1
@@ -540,7 +540,7 @@
 while pos < len(self._utf8):
 lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
 if lower >= 0x80:
-flag = unicodehelper.combine_flags(flag, rutf8.FLAG_REGULAR)
+flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
 rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates?
 pos = rutf8.next_codepoint_pos(self._utf8, pos)
 return W_UnicodeObject(builder.build(), self._len(), flag)
@@ -642,7 +642,7 @@
 if e.match(space, space.w_TypeError):
 return space.w_NotImplemented
 raise
-flag = unicodehelper.combine_flags(self._get_flag(), 
w_other._get_flag())
+flag = rutf8.combine_flags(self._get_flag(), w_other._get_flag())
 return W_UnicodeObject(self._utf8 + w_other._utf8,
self._len() + w_other._len(), flag)
 
@@ -667,7 +667,7 @@
 # XXX Maybe the extra copy here is okay? It was basically going to
 # happen anyway, what with being placed into the builder
 w_u = self.convert_arg_to_w_unicode(space, w_s)
-flag = unicodehelper.combine_flags(flag, w_u._get_flag())
+flag = rutf8.combine_flags(flag, w_u._get_flag())
 unwrappe

[pypy-commit] pypy unicode-utf8: Tests and fixes for 'allow_surrogates=True' in various unicode methods

2017-11-23 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8
Changeset: r93135:16bfad77e3d5
Date: 2017-11-23 10:33 +0100
http://bitbucket.org/pypy/pypy/changeset/16bfad77e3d5/

Log:Tests and fixes for 'allow_surrogates=True' in various unicode
methods

diff --git a/pypy/objspace/std/test/test_unicodeobject.py 
b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -299,6 +299,7 @@
 assert u"Brown Fox".title() == u"Brown Fox"
 assert u"bro!wn fox".title() == u"Bro!Wn Fox"
 assert u"brow\u4321n fox".title() == u"Brow\u4321N Fox"
+assert u'\ud800'.title() == u'\ud800'
 
 def test_istitle(self):
 assert u"".istitle() == False
@@ -328,10 +329,12 @@
 assert u'A'.lower() == u'a'
 assert u'\u0105'.lower() == u'\u0105'
 assert u'\u0104'.lower() == u'\u0105'
+assert u'\ud800'.lower() == u'\ud800'
 assert u'a'.upper() == u'A'
 assert u'A'.upper() == u'A'
 assert u'\u0105'.upper() == u'\u0104'
 assert u'\u0104'.upper() == u'\u0104'
+assert u'\ud800'.upper() == u'\ud800'
 
 def test_capitalize(self):
 assert u"brown fox".capitalize() == u"Brown fox"
@@ -354,6 +357,8 @@
 # check with Ll chars with no upper - nothing changes here
 assert (u'\u019b\u1d00\u1d86\u0221\u1fb7'.capitalize() ==
 u'\u019b\u1d00\u1d86\u0221\u1fb7')
+assert u'\ud800'.capitalize() == u'\ud800'
+assert u'xx\ud800'.capitalize() == u'Xx\ud800'
 
 def test_rjust(self):
 s = u"abc"
@@ -844,6 +849,7 @@
 
 def test_swapcase(self):
 assert u'\xe4\xc4\xdf'.swapcase() == u'\xc4\xe4\xdf'
+assert u'\ud800'.swapcase() == u'\ud800'
 
 def test_buffer(self):
 buf = buffer(u'XY')
diff --git a/pypy/objspace/std/unicodeobject.py 
b/pypy/objspace/std/unicodeobject.py
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -357,7 +357,7 @@
 ch = unicodedb.toupper(ch)
 if ch >= 0x80:
 flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-rutf8.unichr_as_utf8_append(builder, ch)
+rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
 return W_UnicodeObject(builder.build(), self._length, flag)
 
 def descr_title(self, space):
@@ -382,7 +382,7 @@
 ch = unicodedb.tolower(ch)
 if ch >= 0x80:
 flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-rutf8.unichr_as_utf8_append(builder, ch)
+rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
 previous_is_cased = unicodedb.iscased(ch)
 return builder.build(), flag
 
@@ -541,7 +541,7 @@
 lower = unicodedb.tolower(rutf8.codepoint_at_pos(self._utf8, pos))
 if lower >= 0x80:
 flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
-rutf8.unichr_as_utf8_append(builder, lower) # XXX allow surrogates?
+rutf8.unichr_as_utf8_append(builder, lower, allow_surrogates=True)
 pos = rutf8.next_codepoint_pos(self._utf8, pos)
 return W_UnicodeObject(builder.build(), self._len(), flag)
 
@@ -721,7 +721,7 @@
 if uchar >= 0x80:
 flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
 i = rutf8.next_codepoint_pos(value, i)
-rutf8.unichr_as_utf8_append(builder, uchar)
+rutf8.unichr_as_utf8_append(builder, uchar, allow_surrogates=True)
 return W_UnicodeObject(builder.build(), self._length, flag)
 
 @unwrap_spec(width=int)
@@ -831,14 +831,14 @@
 uchar = rutf8.codepoint_at_pos(value, 0)
 i = rutf8.next_codepoint_pos(value, 0)
 ch = unicodedb.toupper(uchar)
-rutf8.unichr_as_utf8_append(builder, ch)
+rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
 if ch >= 0x80:
 flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
 while i < len(value):
 uchar = rutf8.codepoint_at_pos(value, i)
 i = rutf8.next_codepoint_pos(value, i)
 ch = unicodedb.tolower(uchar)
-rutf8.unichr_as_utf8_append(builder, ch)
+rutf8.unichr_as_utf8_append(builder, ch, allow_surrogates=True)
 if ch >= 0x80:
 flag = rutf8.combine_flags(flag, rutf8.FLAG_REGULAR)
 return W_UnicodeObject(builder.build(), self._len(), flag)
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: Review for surrogates

2017-11-23 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8
Changeset: r93136:dc6582a05b85
Date: 2017-11-23 10:48 +0100
http://bitbucket.org/pypy/pypy/changeset/dc6582a05b85/

Log:Review for surrogates

diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -370,14 +370,15 @@
 builder.append(res)
 else:
 # when we get here, chr is a 32-bit unicode character
-if chr > 0x10ffff:
+try:
+rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
+except ValueError:
 message = "illegal Unicode character"
 res, pos = errorhandler(errors, encoding,
 message, s, pos-2, pos+digits)
 size, flag = rutf8.check_utf8(res, True)
 builder.append(res)
 else:
-rutf8.unichr_as_utf8_append(builder, intmask(chr), True)
 flag = rutf8.get_flag_from_code(intmask(chr))
 pos += digits
 size = 1
@@ -466,7 +467,7 @@
 pos += 1
 x = (x<<3) + ord(ch) - ord('0')
 outsize += 1
-if x >= 0x7F:
+if x > 0x7F:
 rutf8.unichr_as_utf8_append(builder, x)
 flag = combine_flags(rutf8.FLAG_REGULAR, flag)
 else:
@@ -524,7 +525,9 @@
 pos = look + 1
 outsize += 1
 flag = combine_flags(flag, rutf8.get_flag_from_code(code))
-rutf8.unichr_as_utf8_append(builder, code)
+rutf8.unichr_as_utf8_append(builder, code,
+allow_surrogates=True)
+# xxx 'code' is probably always within range here...
 else:
 res, pos = errorhandler(errors, "unicodeescape",
 message, s, pos-1, look+1)
@@ -772,7 +775,8 @@
 surrogate = 0
 continue
 else:
-rutf8.unichr_as_utf8_append(result, surrogate)
+rutf8.unichr_as_utf8_append(result, surrogate,
+allow_surrogates=True)
 flag = rutf8.FLAG_HAS_SURROGATES
 outsize += 1
 surrogate = 0
@@ -1236,7 +1240,7 @@
 result.append(r)
 continue
 
-rutf8.unichr_as_utf8_append(result, ch)
+rutf8.unichr_as_utf8_append(result, ch, allow_surrogates=True)
 pos += 4
 r = result.build()
 lgt, flag = rutf8.check_utf8(r, True)
@@ -1360,7 +1364,7 @@
 s, pos, pos + unicode_bytes)
 result.append(res)
 continue
-rutf8.unichr_as_utf8_append(result, intmask(t))
+rutf8.unichr_as_utf8_append(result, intmask(t), allow_surrogates=True)
 pos += unicode_bytes
 r = result.build()
 lgt, flag = rutf8.check_utf8(r, True)
diff --git a/pypy/module/_multibytecodec/c_codecs.py 
b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -127,7 +127,7 @@
 errorcb, namecb, stringdata)
 src = pypy_cjk_dec_outbuf(decodebuf)
 length = pypy_cjk_dec_outlen(decodebuf)
-return rffi.wcharpsize2utf8(src, length)
+return rffi.wcharpsize2utf8(src, length) # assumes no out-of-range 
chars
 
 def multibytecodec_decerror(decodebuf, e, errors,
 errorcb, namecb, stringdata):
diff --git a/rpython/rtyper/lltypesystem/rffi.py 
b/rpython/rtyper/lltypesystem/rffi.py
--- a/rpython/rtyper/lltypesystem/rffi.py
+++ b/rpython/rtyper/lltypesystem/rffi.py
@@ -1012,6 +1012,7 @@
 def wcharpsize2utf8(w, size):
 """ Helper to convert WCHARP pointer to utf8 in one go.
 Equivalent to wcharpsize2unicode().encode("utf8")
+Raises ValueError if characters are outside range(0x110000)!
 """
 from rpython.rlib import rutf8
 
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: Fixes for _cffi_backend

2017-11-23 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8
Changeset: r93137:a94b5860dbb3
Date: 2017-11-23 15:40 +0100
http://bitbucket.org/pypy/pypy/changeset/a94b5860dbb3/

Log:Fixes for _cffi_backend

diff --git a/pypy/module/_cffi_backend/ctypearray.py 
b/pypy/module/_cffi_backend/ctypearray.py
--- a/pypy/module/_cffi_backend/ctypearray.py
+++ b/pypy/module/_cffi_backend/ctypearray.py
@@ -64,13 +64,10 @@
 elif space.isinstance_w(w_value, space.w_unicode):
 from pypy.module._cffi_backend import wchar_helper
 w_u = space.convert_arg_to_w_unicode(w_value)
-if self.citem.size == 4:
+if self.ctitem.size == 2:
+length = wchar_helper.utf8_size_as_char16(w_u._utf8)
+else:
 length = w_u._len()
-else:
-if not w_u._has_surrogates():
-length = w_u._len()
-else:
-length = wchar_helper.unicode_size_as_char16(w_u._utf8, 
w_u._len())
 return (w_value, length + 1)
 else:
 explicitlength = space.getindex_w(w_value, space.w_OverflowError)
diff --git a/pypy/module/_cffi_backend/ctypeprim.py 
b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -40,16 +40,13 @@
 return ord(s[0])
 
 def cast_unicode(self, w_ob):
-import pdb
-pdb.set_trace()
 space = self.space
 w_u = space.convert_arg_to_w_unicode(w_ob)
 if w_u._len() != 1:
 raise oefmt(space.w_TypeError,
 "cannot cast unicode string of length %d to ctype 
'%s'",
 w_u._len(), self.name)
-ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
-return intmask(ordinal)
+return rutf8.codepoint_at_pos(w_u._utf8, 0)
 
 def cast(self, w_ob):
 from pypy.module._cffi_backend import ctypeptr
@@ -175,21 +172,19 @@
 return self.space.newint(value)# r_uint => 'long' object
 
 def convert_to_object(self, cdata):
-if self.is_signed_wchar:
-code = ord(rffi.cast(rffi.CWCHARP, cdata)[0])
-return self.space.newutf8(
-rutf8.unichr_as_utf8(code), 1,
-rutf8.get_flag_from_code(code))
-else:
-value = misc.read_raw_ulong_data(cdata, self.size)   # r_uint
-try:
-u = wchar_helper.ordinal_to_unicode(value)
-except wchar_helper.OutOfRange as e:
-raise oefmt(self.space.w_ValueError,
-"char32_t out of range for "
-"conversion to unicode: %s", hex(e.ordinal))
-return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1,
-rutf8.get_flag_from_code(ord(u)))
+value = misc.read_raw_ulong_data(cdata, self.size)   # r_uint
+try:
+utf8 = rutf8.unichr_as_utf8(value, allow_surrogates=True)
+except ValueError:
+if self.is_signed_wchar:
+s = hex(intmask(value))
+else:
+s = hex(value)
+raise oefmt(self.space.w_ValueError,
+"%s out of range for conversion to unicode: %s",
+self.name, s)
+flag = rutf8.get_flag_from_code(intmask(value))
+return self.space.newutf8(utf8, 1, flag)
 
 def string(self, cdataobj, maxlen):
 with cdataobj as ptr:
@@ -200,7 +195,13 @@
 # returns a r_uint.  If self.size == 2, it is smaller than 0x10000
 space = self.space
 if space.isinstance_w(w_ob, space.w_unicode):
-return rutf8.codepoint_at_pos(space.utf8_w(w_ob), 0)
+w_u = space.convert_arg_to_w_unicode(w_ob)
+if w_u._len() != 1:
+raise self._convert_error("single character", w_ob)
+ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
+if self.size == 2 and ordinal > 0xffff:
+raise self._convert_error("single character <= 0xffff", w_ob)
+return r_uint(ordinal)
 elif (isinstance(w_ob, cdataobj.W_CData) and
isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
w_ob.ctype.size == self.size):
@@ -214,15 +215,15 @@
 
 def unpack_ptr(self, w_ctypeptr, ptr, length):
 if self.size == 2:
-u = wchar_helper.unicode_from_char16(ptr, length)
+utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length)
 else:
 try:
-u = wchar_helper.unicode_from_char32(ptr, length)
+utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length)
 except wchar_helper.OutOfRange as e:
 raise oefmt(self.space.w_ValueError,
-"char32_t out of range for "
-"conversion to unicode: %s", hex(e.ordinal))
-return self.s

[pypy-commit] pypy unicode-utf8: Utf8StringBuilder

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93138:9ede67aee27e
Date: 2017-11-23 15:49 +0100
http://bitbucket.org/pypy/pypy/changeset/9ede67aee27e/

Log:Utf8StringBuilder

diff --git a/rpython/rlib/rutf8.py b/rpython/rlib/rutf8.py
--- a/rpython/rlib/rutf8.py
+++ b/rpython/rlib/rutf8.py
@@ -16,9 +16,11 @@
 """
 
 import sys
-from rpython.rlib.objectmodel import enforceargs, we_are_translated
+from rpython.rlib.objectmodel import enforceargs, we_are_translated, specialize
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib import jit
+from rpython.rlib.signature import signature
+from rpython.rlib.types import char, none
 from rpython.rlib.rarithmetic import r_uint
 from rpython.rlib.unicodedata import unicodedb
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -316,6 +318,11 @@
 return res, flag
 raise CheckError(~res)
 
+def get_utf8_length_flag(s):
+""" Get the length and flag out of valid utf8. For now just calls 
check_utf8
+"""
+return check_utf8(s, True)
+
 @jit.elidable
 def _check_utf8(s, allow_surrogates, start, stop):
 pos = start
@@ -655,6 +662,53 @@
 
 return unicode_escape #, char_escape_helper
 
+class Utf8StringBuilder(object):
+def __init__(self, size=0):
+self._s = StringBuilder(size)
+self._lgt = 0
+self._flag = FLAG_ASCII
+
+def append(self, s):
+# for strings
+self._s.append(s)
+newlgt, newflag = get_utf8_length_flag(s)
+self._lgt += newlgt
+self._flag = combine_flags(self._flag, newflag)
+
+@signature(char(), returns=none())
+def append_char(self, s):
+# for characters, ascii
+self._lgt += 1
+self._s.append(s)
+
+def append_code(self, code):
+self._flag = combine_flags(self._flag, get_flag_from_code(code))
+self._lgt += 1
+unichr_as_utf8_append(self._s, code, True)
+
+def build(self):
+return self._s.build()
+
+def get_flag(self):
+return self._flag
+
+def get_length(self):
+return self._lgt
+
+class Utf8StringIterator(object):
+def __init__(self, utf8s):
+self._utf8 = utf8s
+self._end = len(utf8s)
+self._pos = 0
+
+def done(self):
+return self._pos == self._end
+
+def next(self):
+ret = codepoint_at_pos(self._utf8, self._pos)
+self._pos = next_codepoint_pos(self._utf8, self._pos)
+return ret
+
 def decode_latin_1(s):
 if len(s) == 0:
 return s
diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -139,3 +139,39 @@
 result = rutf8.surrogate_in_utf8(uni)
 expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
 assert result == expected
+
+@given(strategies.text())
+def test_get_utf8_length_flag(u):
+exp_lgt = len(u)
+exp_flag = rutf8.FLAG_ASCII
+for c in u:
+if ord(c) > 0x7F:
+exp_flag = rutf8.FLAG_REGULAR
+lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
+assert lgt == exp_lgt
+assert flag == exp_flag
+
+def test_utf8_string_builder():
+s = rutf8.Utf8StringBuilder()
+s.append("foo")
+s.append_char("x")
+assert s.get_flag() == rutf8.FLAG_ASCII
+assert s.get_length() == 4
+assert s.build() == "foox"
+s.append(u"\u1234".encode("utf8"))
+assert s.get_flag() == rutf8.FLAG_REGULAR
+assert s.get_length() == 5
+assert s.build().decode("utf8") == u"foox\u1234"
+s.append("foo")
+s.append_char("x")
+assert s.get_flag() == rutf8.FLAG_REGULAR
+assert s.get_length() == 9
+assert s.build().decode("utf8") == u"foox\u1234foox"
+s = rutf8.Utf8StringBuilder()
+s.append_code(0x1234)
+assert s.build().decode("utf8") == u"\u1234"
+assert s.get_flag() == rutf8.FLAG_REGULAR
+assert s.get_length() == 1
+s.append_code(0xD800)
+assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
+assert s.get_length() == 2
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: merge

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93139:3e45feebc910
Date: 2017-11-23 15:49 +0100
http://bitbucket.org/pypy/pypy/changeset/3e45feebc910/

Log:merge

diff --git a/pypy/module/_cffi_backend/ctypearray.py 
b/pypy/module/_cffi_backend/ctypearray.py
--- a/pypy/module/_cffi_backend/ctypearray.py
+++ b/pypy/module/_cffi_backend/ctypearray.py
@@ -64,13 +64,10 @@
 elif space.isinstance_w(w_value, space.w_unicode):
 from pypy.module._cffi_backend import wchar_helper
 w_u = space.convert_arg_to_w_unicode(w_value)
-if self.citem.size == 4:
+if self.ctitem.size == 2:
+length = wchar_helper.utf8_size_as_char16(w_u._utf8)
+else:
 length = w_u._len()
-else:
-if not w_u._has_surrogates():
-length = w_u._len()
-else:
-length = wchar_helper.unicode_size_as_char16(w_u._utf8, 
w_u._len())
 return (w_value, length + 1)
 else:
 explicitlength = space.getindex_w(w_value, space.w_OverflowError)
diff --git a/pypy/module/_cffi_backend/ctypeprim.py 
b/pypy/module/_cffi_backend/ctypeprim.py
--- a/pypy/module/_cffi_backend/ctypeprim.py
+++ b/pypy/module/_cffi_backend/ctypeprim.py
@@ -40,16 +40,13 @@
 return ord(s[0])
 
 def cast_unicode(self, w_ob):
-import pdb
-pdb.set_trace()
 space = self.space
 w_u = space.convert_arg_to_w_unicode(w_ob)
 if w_u._len() != 1:
 raise oefmt(space.w_TypeError,
 "cannot cast unicode string of length %d to ctype 
'%s'",
 w_u._len(), self.name)
-ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
-return intmask(ordinal)
+return rutf8.codepoint_at_pos(w_u._utf8, 0)
 
 def cast(self, w_ob):
 from pypy.module._cffi_backend import ctypeptr
@@ -175,21 +172,19 @@
 return self.space.newint(value)# r_uint => 'long' object
 
 def convert_to_object(self, cdata):
-if self.is_signed_wchar:
-code = ord(rffi.cast(rffi.CWCHARP, cdata)[0])
-return self.space.newutf8(
-rutf8.unichr_as_utf8(code), 1,
-rutf8.get_flag_from_code(code))
-else:
-value = misc.read_raw_ulong_data(cdata, self.size)   # r_uint
-try:
-u = wchar_helper.ordinal_to_unicode(value)
-except wchar_helper.OutOfRange as e:
-raise oefmt(self.space.w_ValueError,
-"char32_t out of range for "
-"conversion to unicode: %s", hex(e.ordinal))
-return self.space.newutf8(rutf8.unichr_as_utf8(ord(u)), 1,
-rutf8.get_flag_from_code(ord(u)))
+value = misc.read_raw_ulong_data(cdata, self.size)   # r_uint
+try:
+utf8 = rutf8.unichr_as_utf8(value, allow_surrogates=True)
+except ValueError:
+if self.is_signed_wchar:
+s = hex(intmask(value))
+else:
+s = hex(value)
+raise oefmt(self.space.w_ValueError,
+"%s out of range for conversion to unicode: %s",
+self.name, s)
+flag = rutf8.get_flag_from_code(intmask(value))
+return self.space.newutf8(utf8, 1, flag)
 
 def string(self, cdataobj, maxlen):
 with cdataobj as ptr:
@@ -200,7 +195,13 @@
 # returns a r_uint.  If self.size == 2, it is smaller than 0x10000
 space = self.space
 if space.isinstance_w(w_ob, space.w_unicode):
-return rutf8.codepoint_at_pos(space.utf8_w(w_ob), 0)
+w_u = space.convert_arg_to_w_unicode(w_ob)
+if w_u._len() != 1:
+raise self._convert_error("single character", w_ob)
+ordinal = rutf8.codepoint_at_pos(w_u._utf8, 0)
+if self.size == 2 and ordinal > 0xffff:
+raise self._convert_error("single character <= 0xffff", w_ob)
+return r_uint(ordinal)
 elif (isinstance(w_ob, cdataobj.W_CData) and
isinstance(w_ob.ctype, W_CTypePrimitiveUniChar) and
w_ob.ctype.size == self.size):
@@ -214,15 +215,15 @@
 
 def unpack_ptr(self, w_ctypeptr, ptr, length):
 if self.size == 2:
-u = wchar_helper.unicode_from_char16(ptr, length)
+utf8, lgt, flag = wchar_helper.utf8_from_char16(ptr, length)
 else:
 try:
-u = wchar_helper.unicode_from_char32(ptr, length)
+utf8, lgt, flag = wchar_helper.utf8_from_char32(ptr, length)
 except wchar_helper.OutOfRange as e:
 raise oefmt(self.space.w_ValueError,
-"char32_t out of range for "
-"conversion to unicode: %s", hex(e.ordinal))
-return self.space.newunicode(u)
+

[pypy-commit] pypy unicode-utf8: provide explicit examples

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93140:d24fe4f59c96
Date: 2017-11-23 15:57 +0100
http://bitbucket.org/pypy/pypy/changeset/d24fe4f59c96/

Log:provide explicit examples

diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -30,6 +30,7 @@
 
 @settings(max_examples=1)
 @given(strategies.binary(), strategies.booleans())
+@example('\xf1\x80\x80\x80', False)
 def test_check_utf8(s, allow_surrogates):
 _test_check_utf8(s, allow_surrogates)
 
@@ -134,19 +135,23 @@
 assert repr(u) == repr_func(u.encode('utf8'))
 
 @given(strategies.lists(strategies.characters()))
+@example([u'\ud800', u'\udc00'])
 def test_surrogate_in_utf8(unichars):
 uni = u''.join(unichars).encode('utf-8')
 result = rutf8.surrogate_in_utf8(uni)
 expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
 assert result == expected
 
-@given(strategies.text())
-def test_get_utf8_length_flag(u):
+@given(strategies.lists(strategies.characters()))
+def test_get_utf8_length_flag(unichars):
+u = u''.join(unichars)
 exp_lgt = len(u)
 exp_flag = rutf8.FLAG_ASCII
 for c in u:
 if ord(c) > 0x7F:
 exp_flag = rutf8.FLAG_REGULAR
+if 0xD800 <= ord(c) <= 0xDFFF:
+exp_flag = rutf8.FLAG_HAS_SURROGATES
 lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
 assert lgt == exp_lgt
 assert flag == exp_flag
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: fix test on narrow host

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93141:eb564d44a7c8
Date: 2017-11-23 16:15 +0100
http://bitbucket.org/pypy/pypy/changeset/eb564d44a7c8/

Log:fix test on narrow host

diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -57,12 +57,13 @@
 assert ~(length) == e.start
 else:
 assert valid
-assert length == len(u)
 if flag == rutf8.FLAG_ASCII:
 s.decode('ascii') # assert did not raise
 elif flag == rutf8.FLAG_HAS_SURROGATES:
 assert allow_surrogates
 assert _has_surrogates(s)
+if sys.maxunicode == 0x10ffff or not _has_surrogates(s):
+assert length == len(u)
 
 @given(strategies.characters())
 def test_next_pos(uni):
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: fix tests on narrow host

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93142:fa3bcbe5b09f
Date: 2017-11-23 16:17 +0100
http://bitbucket.org/pypy/pypy/changeset/fa3bcbe5b09f/

Log:fix tests on narrow host

diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -138,7 +138,7 @@
 @given(strategies.lists(strategies.characters()))
 @example([u'\ud800', u'\udc00'])
 def test_surrogate_in_utf8(unichars):
-uni = u''.join(unichars).encode('utf-8')
+uni = ''.join([u.encode('utf8') for u in unichars])
 result = rutf8.surrogate_in_utf8(uni)
 expected = any(uch for uch in unichars if u'\ud800' <= uch <= u'\udfff')
 assert result == expected
@@ -153,6 +153,7 @@
 exp_flag = rutf8.FLAG_REGULAR
 if 0xD800 <= ord(c) <= 0xDFFF:
 exp_flag = rutf8.FLAG_HAS_SURROGATES
+break
 lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
 assert lgt == exp_lgt
 assert flag == exp_flag
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: more tests

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93143:e4a568e4514c
Date: 2017-11-23 16:32 +0100
http://bitbucket.org/pypy/pypy/changeset/e4a568e4514c/

Log:more tests

diff --git a/rpython/rlib/test/test_rutf8.py b/rpython/rlib/test/test_rutf8.py
--- a/rpython/rlib/test/test_rutf8.py
+++ b/rpython/rlib/test/test_rutf8.py
@@ -154,8 +154,9 @@
 if 0xD800 <= ord(c) <= 0xDFFF:
 exp_flag = rutf8.FLAG_HAS_SURROGATES
 break
-lgt, flag = rutf8.get_utf8_length_flag(u.encode('utf8'))
-assert lgt == exp_lgt
+lgt, flag = rutf8.get_utf8_length_flag(''.join([c.encode('utf8') for c in 
u]))
+if exp_flag != rutf8.FLAG_HAS_SURROGATES:
+assert lgt == exp_lgt
 assert flag == exp_flag
 
 def test_utf8_string_builder():
@@ -182,3 +183,11 @@
 s.append_code(0xD800)
 assert s.get_flag() == rutf8.FLAG_HAS_SURROGATES
 assert s.get_length() == 2
+
+@given(strategies.text())
+def test_utf8_iterator(arg):
+u = rutf8.Utf8StringIterator(arg.encode('utf8'))
+l = []
+while not u.done():
+l.append(unichr(u.next()))
+assert list(arg) == l
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: merge default

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93144:177352fb8cf4
Date: 2017-11-23 16:46 +0100
http://bitbucket.org/pypy/pypy/changeset/177352fb8cf4/

Log:merge default

diff too long, truncating to 2000 out of 7577 lines

diff --git a/.hgignore b/.hgignore
--- a/.hgignore
+++ b/.hgignore
@@ -71,6 +71,8 @@
 ^lib_pypy/.+.c$
 ^lib_pypy/.+.o$
 ^lib_pypy/.+.so$
+^lib_pypy/.+.pyd$
+^lib_pypy/Release/
 ^pypy/doc/discussion/.+\.html$
 ^include/.+\.h$
 ^include/.+\.inl$
diff --git a/extra_tests/requirements.txt b/extra_tests/requirements.txt
new file mode 100644
--- /dev/null
+++ b/extra_tests/requirements.txt
@@ -0,0 +1,2 @@
+pytest
+hypothesis
diff --git a/extra_tests/test_bytes.py b/extra_tests/test_bytes.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_bytes.py
@@ -0,0 +1,84 @@
+from hypothesis import strategies as st
+from hypothesis import given, example
+
+st_bytestring = st.binary() | st.binary().map(bytearray)
+
+@given(st_bytestring, st_bytestring, st_bytestring)
+def test_find(u, prefix, suffix):
+s = prefix + u + suffix
+assert 0 <= s.find(u) <= len(prefix)
+assert s.find(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st_bytestring, st_bytestring, st_bytestring)
+def test_index(u, prefix, suffix):
+s = prefix + u + suffix
+assert 0 <= s.index(u) <= len(prefix)
+assert s.index(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st_bytestring, st_bytestring, st_bytestring)
+def test_rfind(u, prefix, suffix):
+s = prefix + u + suffix
+assert s.rfind(u) >= len(prefix)
+assert s.rfind(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st_bytestring, st_bytestring, st_bytestring)
+def test_rindex(u, prefix, suffix):
+s = prefix + u + suffix
+assert s.rindex(u) >= len(prefix)
+assert s.rindex(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+def adjust_indices(u, start, end):
+if end < 0:
+end = max(end + len(u), 0)
+else:
+end = min(end, len(u))
+if start < 0:
+start = max(start + len(u), 0)
+return start, end
+
+@given(st_bytestring, st_bytestring)
+def test_startswith_basic(u, v):
+assert u.startswith(v) is (u[:len(v)] == v)
+
+@example(b'x', b'', 1)
+@example(b'x', b'', 2)
+@given(st_bytestring, st_bytestring, st.integers())
+def test_startswith_start(u, v, start):
+expected = u[start:].startswith(v) if v else (start <= len(u))
+assert u.startswith(v, start) is expected
+
+@example(b'x', b'', 1, 0)
+@example(b'xx', b'', -1, 0)
+@given(st_bytestring, st_bytestring, st.integers(), st.integers())
+def test_startswith_3(u, v, start, end):
+if v:
+expected = u[start:end].startswith(v)
+else:  # CPython leaks implementation details in this case
+start0, end0 = adjust_indices(u, start, end)
+expected = start0 <= len(u) and start0 <= end0
+assert u.startswith(v, start, end) is expected
+
+@given(st_bytestring, st_bytestring)
+def test_endswith_basic(u, v):
+if len(v) > len(u):
+assert u.endswith(v) is False
+else:
+assert u.endswith(v) is (u[len(u) - len(v):] == v)
+
+@example(b'x', b'', 1)
+@example(b'x', b'', 2)
+@given(st_bytestring, st_bytestring, st.integers())
+def test_endswith_2(u, v, start):
+expected = u[start:].endswith(v) if v else (start <= len(u))
+assert u.endswith(v, start) is expected
+
+@example(b'x', b'', 1, 0)
+@example(b'xx', b'', -1, 0)
+@given(st_bytestring, st_bytestring, st.integers(), st.integers())
+def test_endswith_3(u, v, start, end):
+if v:
+expected = u[start:end].endswith(v)
+else:  # CPython leaks implementation details in this case
+start0, end0 = adjust_indices(u, start, end)
+expected = start0 <= len(u) and start0 <= end0
+assert u.endswith(v, start, end) is expected
diff --git a/extra_tests/test_unicode.py b/extra_tests/test_unicode.py
--- a/extra_tests/test_unicode.py
+++ b/extra_tests/test_unicode.py
@@ -1,3 +1,4 @@
+import sys
 import pytest
 from hypothesis import strategies as st
 from hypothesis import given, settings, example
@@ -32,3 +33,89 @@
 @given(s=st.text())
 def test_composition(s, norm1, norm2, norm3):
 assert normalize(norm2, normalize(norm1, s)) == normalize(norm3, s)
+
+@given(st.text(), st.text(), st.text())
+def test_find(u, prefix, suffix):
+s = prefix + u + suffix
+assert 0 <= s.find(u) <= len(prefix)
+assert s.find(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st.text(), st.text(), st.text())
+def test_index(u, prefix, suffix):
+s = prefix + u + suffix
+assert 0 <= s.index(u) <= len(prefix)
+assert s.index(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st.text(), st.text(), st.text())
+def test_rfind(u, prefix, suffix):
+s = prefix + u + suffix
+assert s.rfind(u) >= len(prefix)
+assert s.rfind(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st.text(), st.text(), st.text())
+def test_rindex(u, prefix, suffi

[pypy-commit] pypy default: refactor

2017-11-23 Thread rlamy
Author: Ronan Lamy 
Branch: 
Changeset: r93145:ff05ee1c4b6a
Date: 2017-11-23 16:48 +
http://bitbucket.org/pypy/pypy/changeset/ff05ee1c4b6a/

Log:refactor

diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -541,6 +541,10 @@
 self.decoded_chars_used += size
 return chars
 
+def _has_data(self):
+return (self.decoded_chars is not None and
+self.decoded_chars_used < len(self.decoded_chars))
+
 def _read_chunk(self, space):
 """Read and decode the next chunk of data from the BufferedReader.
 The return value is True unless EOF was reached.  The decoded string
@@ -588,6 +592,19 @@
 
 return not eof
 
+def _ensure_data(self, space):
+while not self._has_data():
+try:
+if not self._read_chunk(space):
+self._unset_decoded()
+self.snapshot = None
+return False
+except OperationError as e:
+if trap_eintr(space, e):
+continue
+raise
+return True
+
 def next_w(self, space):
 self._check_attached(space)
 self.telling = False
@@ -621,23 +638,13 @@
 builder = UnicodeBuilder(size)
 
 # Keep reading chunks until we have n characters to return
-while True:
+while remaining > 0:
+if not self._ensure_data(space):
+break
 data = self._get_decoded_chars(remaining)
 builder.append(data)
 remaining -= len(data)
 
-if remaining <= 0: # Done
-break
-
-try:
-if not self._read_chunk(space):
-# EOF
-break
-except OperationError as e:
-if trap_eintr(space, e):
-continue
-raise
-
 return space.newunicode(builder.build())
 
 def readline_w(self, space, w_limit=None):
@@ -653,20 +660,9 @@
 
 while True:
 # First, get some data if necessary
-has_data = True
-while not self.decoded_chars:
-try:
-if not self._read_chunk(space):
-has_data = False
-break
-except OperationError as e:
-if trap_eintr(space, e):
-continue
-raise
+has_data = self._ensure_data(space)
 if not has_data:
 # end of file
-self._unset_decoded()
-self.snapshot = None
 start = endpos = offset_to_buffer = 0
 break
 
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: fix multibytecodec

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93146:99ca8cf9bbc4
Date: 2017-11-23 18:30 +0100
http://bitbucket.org/pypy/pypy/changeset/99ca8cf9bbc4/

Log:fix multibytecodec

diff --git a/pypy/module/_multibytecodec/c_codecs.py 
b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -197,19 +197,21 @@
 MBENC_FLUSH = 1
 MBENC_RESET = 2
 
-def encode(codec, unicodedata, errors="strict", errorcb=None, namecb=None):
+def encode(codec, unicodedata, length, errors="strict", errorcb=None,
+   namecb=None):
 encodebuf = pypy_cjk_enc_new(codec)
 if not encodebuf:
 raise MemoryError
 try:
-return encodeex(encodebuf, unicodedata, errors, errorcb, namecb)
+return encodeex(encodebuf, unicodedata, length, errors, errorcb, 
namecb)
 finally:
 pypy_cjk_enc_free(encodebuf)
 
-def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
+def encodeex(encodebuf, utf8data, length, errors="strict", errorcb=None,
  namecb=None, ignore_error=0):
-inleft = len(unicodedata)
-with rffi.scoped_nonmoving_unicodebuffer(unicodedata) as inbuf:
+inleft = length
+inbuf = rffi.utf82wcharp(utf8data, length)
+try:
 if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
 raise MemoryError
 if ignore_error == 0:
@@ -221,16 +223,18 @@
 if r == 0 or r == ignore_error:
 break
 multibytecodec_encerror(encodebuf, r, errors,
-errorcb, namecb, unicodedata)
+errorcb, namecb, utf8data)
 while flags & MBENC_RESET:
 r = pypy_cjk_enc_reset(encodebuf)
 if r == 0:
 break
 multibytecodec_encerror(encodebuf, r, errors,
-errorcb, namecb, unicodedata)
+errorcb, namecb, utf8data)
 src = pypy_cjk_enc_outbuf(encodebuf)
 length = pypy_cjk_enc_outlen(encodebuf)
 return rffi.charpsize2str(src, length)
+finally:
+lltype.free(inbuf, flavor='raw')
 
 def multibytecodec_encerror(encodebuf, e, errors,
 errorcb, namecb, unicodedata):
@@ -256,21 +260,16 @@
 elif errors == "replace":
 codec = pypy_cjk_enc_getcodec(encodebuf)
 try:
-replace = encode(codec, u"?")
+replace = encode(codec, "?", 1)
 except EncodeDecodeError:
 replace = "?"
 else:
 assert errorcb
-XXX
-retu, rets, end = errorcb(errors, namecb, reason,
-  unicodedata.encode("utf8"), start, end)
-if rets is not None:
-# py3k only
-replace = rets
-else:
-assert retu is not None
-codec = pypy_cjk_enc_getcodec(encodebuf)
-replace = encode(codec, retu, "strict", errorcb, namecb)
+rets, end = errorcb(errors, namecb, reason,
+unicodedata, start, end)
+codec = pypy_cjk_enc_getcodec(encodebuf)
+lgt, _ = rutf8.get_utf8_length_flag(rets)
+replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
 with rffi.scoped_nonmovingbuffer(replace) as inbuf:
 r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
 if r == MBERR_NOMEMORY:
diff --git a/pypy/module/_multibytecodec/interp_incremental.py 
b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -1,4 +1,5 @@
 from rpython.rtyper.lltypesystem import lltype
+from rpython.rlib import rutf8
 from pypy.module._multibytecodec import c_codecs
 from pypy.module._multibytecodec.interp_multibytecodec import (
 MultibyteCodec, wrap_unicodedecodeerror, wrap_runtimeerror,
@@ -65,7 +66,8 @@
 pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
 assert 0 <= pos <= len(object)
 self.pending = object[pos:]
-return space.newunicode(output)
+lgt, flag = rutf8.get_utf8_length_flag(output)
+return space.newutf8(output, lgt, flag)
 
 
 @unwrap_spec(errors="text_or_none")
@@ -88,7 +90,8 @@
 
 def _initialize(self):
 self.encodebuf = c_codecs.pypy_cjk_enc_new(self.codec)
-self.pending = u""
+self.pending = ""
+self.pending_len = 0
 
 def _free(self):
 self.pending = None
@@ -96,25 +99,37 @@
 c_codecs.pypy_cjk_enc_free(self.encodebuf)
 self.encodebuf = lltype.nullptr(c_codecs.ENCODEBUF_P.TO)
 
-@unwrap_spec(object='utf8', final=bool)
-def encode_w(self, object, final=False):
-u_object = object.decode('utf8')
+@unwrap_spec(final=bool)
+def encode_w(self, space, w_object, final=False):
+utf8data, length = space.utf8_len_w(w_object)
 

[pypy-commit] pypy default: Simplify _find_line_ending() and fix logic in the case of embedded \r and self.readnl=='\r\n'

2017-11-23 Thread rlamy
Author: Ronan Lamy 
Branch: 
Changeset: r93147:8369cd92f7d0
Date: 2017-11-23 17:52 +
http://bitbucket.org/pypy/pypy/changeset/8369cd92f7d0/

Log:Simplify _find_line_ending() and fix logic in the case of embedded
\r and self.readnl=='\r\n'

diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -216,14 +216,7 @@
 
 def _find_line_ending(self, line, start, end):
 size = end - start
-if self.readtranslate:
-# Newlines are already translated, only search for \n
-pos = line.find(u'\n', start, end)
-if pos >= 0:
-return pos + 1, 0
-else:
-return -1, size
-elif self.readuniversal:
+if self.readuniversal:
 # Universal newline search. Find any of \r, \r\n, \n
 # The decoder ensures that \r\n are not split in two pieces
 i = start
@@ -242,16 +235,22 @@
 return i + 1, 0
 else:
 return i, 0
+if self.readtranslate:
+# Newlines are already translated, only search for \n
+newline = u'\n'
 else:
 # Non-universal mode.
-pos = line.find(self.readnl, start, end)
-if pos >= 0:
-return pos + len(self.readnl), 0
-else:
-pos = line.find(self.readnl[0], start, end)
-if pos >= 0:
-return -1, pos - start
-return -1, size
+newline = self.readnl
+end_scan = end - len(newline) + 1
+for i in range(start, end_scan):
+ch = line[i]
+if ch == newline[0]:
+for j in range(1, len(newline)):
+if line[i + j] != newline[j]:
+break
+else:
+return i + len(newline), 0
+return -1, end_scan
 
 
 W_TextIOBase.typedef = TypeDef(
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: one part of interp_sre

2017-11-23 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93148:5a057586add0
Date: 2017-11-23 19:02 +0100
http://bitbucket.org/pypy/pypy/changeset/5a057586add0/

Log:one part of interp_sre

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -7,7 +7,8 @@
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.rarithmetic import intmask
 from rpython.rlib import jit
-from rpython.rlib.rstring import StringBuilder, UnicodeBuilder
+from rpython.rlib.rstring import StringBuilder
+from rpython.rlib.rutf8 import Utf8StringBuilder
 
 # 
 #
@@ -237,8 +238,8 @@
 filter_is_callable = True
 else:
 if space.isinstance_w(w_ptemplate, space.w_unicode):
-filter_as_unicode = space.unicode_w(w_ptemplate)
-literal = u'\\' not in filter_as_unicode
+filter_as_unicode = space.utf8_w(w_ptemplate)
+literal = '\\' not in filter_as_unicode
 use_builder = (
 space.isinstance_w(w_string, space.w_unicode) and literal)
 else:
@@ -267,7 +268,7 @@
 sublist_w = strbuilder = unicodebuilder = None
 if use_builder:
 if filter_as_unicode is not None:
-unicodebuilder = UnicodeBuilder(ctx.end)
+unicodebuilder = Utf8StringBuilder(ctx.end)
 else:
 assert filter_as_string is not None
 strbuilder = StringBuilder(ctx.end)
@@ -335,7 +336,9 @@
 return space.newbytes(strbuilder.build()), n
 else:
 assert unicodebuilder is not None
-return space.newunicode(unicodebuilder.build()), n
+return space.newutf8(unicodebuilder.build(),
+ unicodebuilder.get_length(),
+ unicodebuilder.get_flag()), n
 else:
 if space.isinstance_w(w_string, space.w_unicode):
 w_emptystr = space.newunicode(u'')
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: hg merge default

2017-11-23 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8
Changeset: r93149:0797bb6394b6
Date: 2017-11-23 18:07 +
http://bitbucket.org/pypy/pypy/changeset/0797bb6394b6/

Log:hg merge default

diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -223,14 +223,7 @@
 
 def _find_line_ending(self, line, start, end):
 size = end - start
-if self.readtranslate:
-# Newlines are already translated, only search for \n
-pos = line.find('\n', start, end)
-if pos >= 0:
-return pos + 1, 0
-else:
-return -1, size
-elif self.readuniversal:
+if self.readuniversal:
 # Universal newline search. Find any of \r, \r\n, \n
 # The decoder ensures that \r\n are not split in two pieces
 i = start
@@ -249,16 +242,22 @@
 return i + 1, 0
 else:
 return i, 0
+if self.readtranslate:
+# Newlines are already translated, only search for \n
+newline = '\n'
 else:
 # Non-universal mode.
-pos = line.find(self.readnl, start, end)
-if pos >= 0:
-return pos + len(self.readnl), 0
-else:
-pos = line.find(self.readnl[0], start, end)
-if pos >= 0:
-return -1, pos - start
-return -1, size
+newline = self.readnl
+end_scan = end - len(newline) + 1
+for i in range(start, end_scan):
+ch = line[i]
+if ch == newline[0]:
+for j in range(1, len(newline)):
+if line[i + j] != newline[j]:
+break
+else:
+return i + len(newline), 0
+return -1, end_scan
 
 
 W_TextIOBase.typedef = TypeDef(
@@ -548,6 +547,10 @@
 self.decoded_chars_used += size
 return chars
 
+def _has_data(self):
+return (self.decoded_chars is not None and
+self.decoded_chars_used < len(self.decoded_chars))
+
 def _read_chunk(self, space):
 """Read and decode the next chunk of data from the BufferedReader.
 The return value is True unless EOF was reached.  The decoded string
@@ -595,6 +598,19 @@
 
 return not eof
 
+def _ensure_data(self, space):
+while not self._has_data():
+try:
+if not self._read_chunk(space):
+self._unset_decoded()
+self.snapshot = None
+return False
+except OperationError as e:
+if trap_eintr(space, e):
+continue
+raise
+return True
+
 def next_w(self, space):
 self._check_attached(space)
 self.telling = False
@@ -628,23 +644,13 @@
 builder = StringBuilder(size)
 
 # Keep reading chunks until we have n characters to return
-while True:
+while remaining > 0:
+if not self._ensure_data(space):
+break
 data = self._get_decoded_chars(remaining)
 builder.append(data)
 remaining -= len(data)
 
-if remaining <= 0: # Done
-break
-
-try:
-if not self._read_chunk(space):
-# EOF
-break
-except OperationError as e:
-if trap_eintr(space, e):
-continue
-raise
-
 return space.new_from_utf8(builder.build())
 
 def readline_w(self, space, w_limit=None):
@@ -660,20 +666,9 @@
 
 while True:
 # First, get some data if necessary
-has_data = True
-while not self.decoded_chars:
-try:
-if not self._read_chunk(space):
-has_data = False
-break
-except OperationError as e:
-if trap_eintr(space, e):
-continue
-raise
+has_data = self._ensure_data(space)
 if not has_data:
 # end of file
-self._unset_decoded()
-self.snapshot = None
 start = endpos = offset_to_buffer = 0
 break
 
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy default: fix test use of eci for vmprof_start_sampling, vmprof_start_sampling

2017-11-23 Thread mattip
Author: Matti Picus 
Branch: 
Changeset: r93151:72001f56a97f
Date: 2017-11-23 20:28 +0200
http://bitbucket.org/pypy/pypy/changeset/72001f56a97f/

Log:fix test use of eci for vmprof_stop_sampling, vmprof_start_sampling

diff --git a/rpython/rlib/rvmprof/cintf.py b/rpython/rlib/rvmprof/cintf.py
--- a/rpython/rlib/rvmprof/cintf.py
+++ b/rpython/rlib/rvmprof/cintf.py
@@ -9,6 +9,7 @@
 from rpython.rtyper.tool import rffi_platform as platform
 from rpython.rlib import rthread, jit
 from rpython.rlib.objectmodel import we_are_translated
+from rpython.config.translationoption import get_translation_config
 
 class VMProfPlatformUnsupported(Exception):
 pass
@@ -133,11 +134,17 @@
 #endif
 """])
 
+if get_translation_config() is None:
+# tests need the full eci here
+_eci = global_eci
+else:
+_eci = auto_eci
+
 vmprof_stop_sampling = rffi.llexternal("vmprof_stop_sampling", [],
-   rffi.INT, compilation_info=auto_eci,
+   rffi.INT, compilation_info=_eci,
_nowrapper=True)
 vmprof_start_sampling = rffi.llexternal("vmprof_start_sampling", [],
-lltype.Void, compilation_info=auto_eci,
+lltype.Void, compilation_info=_eci,
 _nowrapper=True)
 
 
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy default: cannot pip install vmprof on arm, s390x

2017-11-23 Thread mattip
Author: Matti Picus 
Branch: 
Changeset: r93150:8c42f0f755c0
Date: 2017-11-23 18:48 +0200
http://bitbucket.org/pypy/pypy/changeset/8c42f0f755c0/

Log:cannot pip install vmprof on arm, s390x

diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
 cffi>=1.4.0
-vmprof>=0.4.10  # required to parse log files in rvmprof tests
+
+# parse log files in rvmprof tests
+vmprof>=0.4.10; 'x86' in platform.machine #skip arm, s390x
 
 # hypothesis is used for test generation on untranslated tests
 hypothesis
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy py3.5: merge default into py3.5

2017-11-23 Thread mattip
Author: Matti Picus 
Branch: py3.5
Changeset: r93152:ce6402cbdf3c
Date: 2017-11-23 22:08 +0200
http://bitbucket.org/pypy/pypy/changeset/ce6402cbdf3c/

Log:merge default into py3.5

diff --git a/pypy/module/_io/interp_stringio.py 
b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -184,9 +184,7 @@
 start,
 end
 )
-if endpos >= 0:
-endpos += start
-else:
+if endpos < 0:
 endpos = end
 assert endpos >= 0
 self.pos = endpos
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -216,44 +216,41 @@
 
 def _find_line_ending(self, line, start, end):
 size = end - start
-if self.readtranslate:
-
-# Newlines are already translated, only search for \n
-pos = line.find(u'\n', start, end)
-if pos >= 0:
-return pos - start + 1, 0
-else:
-return -1, size
-elif self.readuniversal:
+if self.readuniversal:
 # Universal newline search. Find any of \r, \r\n, \n
 # The decoder ensures that \r\n are not split in two pieces
-i = 0
+i = start
 while True:
-# Fast path for non-control chars. The loop always ends
-# since the Py_UNICODE storage is NUL-terminated.
-while i < size and line[start + i] > '\r':
+# Fast path for non-control chars.
+while i < end and line[i] > '\r':
 i += 1
-if i >= size:
+if i >= end:
 return -1, size
-ch = line[start + i]
+ch = line[i]
 i += 1
 if ch == '\n':
 return i, 0
 if ch == '\r':
-if line[start + i] == '\n':
+if line[i] == '\n':
 return i + 1, 0
 else:
 return i, 0
+if self.readtranslate:
+# Newlines are already translated, only search for \n
+newline = u'\n'
 else:
 # Non-universal mode.
-pos = line.find(self.readnl, start, end)
-if pos >= 0:
-return pos - start + len(self.readnl), 0
-else:
-pos = line.find(self.readnl[0], start, end)
-if pos >= 0:
-return -1, pos - start
-return -1, size
+newline = self.readnl
+end_scan = end - len(newline) + 1
+for i in range(start, end_scan):
+ch = line[i]
+if ch == newline[0]:
+for j in range(1, len(newline)):
+if line[i + j] != newline[j]:
+break
+else:
+return i + len(newline), 0
+return -1, end_scan
 
 
 W_TextIOBase.typedef = TypeDef(
@@ -549,8 +546,13 @@
 # _
 # read methods
 
-def _set_decoded_chars(self, chars):
-self.decoded_chars = chars
+def _unset_decoded(self):
+self.decoded_chars = None
+self.decoded_chars_used = 0
+
+def _set_decoded(self, space, w_decoded):
+check_decoded(space, w_decoded)
+self.decoded_chars = space.unicode_w(w_decoded)
 self.decoded_chars_used = 0
 
 def _get_decoded_chars(self, size):
@@ -574,6 +576,10 @@
 self.decoded_chars_used += size
 return chars
 
+def _has_data(self):
+return (self.decoded_chars is not None and
+self.decoded_chars_used < len(self.decoded_chars))
+
 def _read_chunk(self, space):
 """Read and decode the next chunk of data from the BufferedReader.
 The return value is True unless EOF was reached.  The decoded string
@@ -616,8 +622,7 @@
 eof = input_buf.getlength() == 0
 w_decoded = space.call_method(self.w_decoder, "decode",
   w_input, space.newbool(eof))
-check_decoded(space, w_decoded)
-self._set_decoded_chars(space.unicode_w(w_decoded))
+self._set_decoded(space, w_decoded)
 if space.len_w(w_decoded) > 0:
 eof = False
 
@@ -629,6 +634,19 @@
 
 return not eof
 
+def _ensure_data(self, space):
+while not self._has_data():
+try:
+if not self._read_chunk(space):
+self._unset_decoded()
+self.snapshot = None
+return False
+except OperationError as e:
+if trap_eintr(space, e):
+continue
+raise
+return True
+
 def next_w(sel

[pypy-commit] pypy default: generate conf.h for tests

2017-11-23 Thread mattip
Author: Matti Picus 
Branch: 
Changeset: r93153:d7c94a4970dd
Date: 2017-11-24 09:16 +0200
http://bitbucket.org/pypy/pypy/changeset/d7c94a4970dd/

Log:generate conftest.py for tests

diff --git a/pypy/module/_continuation/test/conftest.py 
b/pypy/module/_continuation/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+import sys
+
+def pytest_configure(config):
+if sys.platform.startswith('linux'):
+from rpython.rlib.rvmprof.cintf import configure_libbacktrace_linux
+configure_libbacktrace_linux()
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy py3.5: merge default into py3.5

2017-11-23 Thread mattip
Author: Matti Picus 
Branch: py3.5
Changeset: r93154:d2807ddb8178
Date: 2017-11-24 09:17 +0200
http://bitbucket.org/pypy/pypy/changeset/d2807ddb8178/

Log:merge default into py3.5

diff --git a/pypy/module/_continuation/test/conftest.py 
b/pypy/module/_continuation/test/conftest.py
new file mode 100644
--- /dev/null
+++ b/pypy/module/_continuation/test/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+import sys
+
+def pytest_configure(config):
+if sys.platform.startswith('linux'):
+from rpython.rlib.rvmprof.cintf import configure_libbacktrace_linux
+configure_libbacktrace_linux()
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit