[pypy-commit] pypy unicode-utf8-test: fix space.newunicode

2017-12-08 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8-test
Changeset: r93323:9fe5f582087d
Date: 2017-12-08 13:37 +0000
http://bitbucket.org/pypy/pypy/changeset/9fe5f582087d/

Log:fix space.newunicode

diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py
--- a/pypy/objspace/std/objspace.py
+++ b/pypy/objspace/std/objspace.py
@@ -375,8 +375,8 @@
 # XXX: kill me!
 assert isinstance(unistr, unicode)
 utf8s = unistr.encode("utf-8")
-length, flag = rutf8.check_utf8(utf8s, True)
-return self.newutf8(utf8s, length, flag)
+length = rutf8.check_utf8(utf8s, True)
+return self.newutf8(utf8s, length)
 
 def type(self, w_obj):
 jit.promote(w_obj.__class__)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy py3.6: hg merge py3.5 (+ fixes)

2017-12-08 Thread mjacob
Author: Manuel Jacob 
Branch: py3.6
Changeset: r93320:f04d4604c7e3
Date: 2017-12-09 03:14 +0100
http://bitbucket.org/pypy/pypy/changeset/f04d4604c7e3/

Log:hg merge py3.5 (+ fixes)

I'm not 100% sure about the merge in test_dis.py, but most of the
tests are failing anyway.

diff too long, truncating to 2000 out of 12565 lines

diff --git a/.hgignore b/.hgignore
--- a/.hgignore
+++ b/.hgignore
@@ -59,6 +59,7 @@
 ^rpython/rlib/rvmprof/src/shared/libbacktrace/config.h$
 ^rpython/rlib/rvmprof/src/shared/libbacktrace/config.log$
 ^rpython/rlib/rvmprof/src/shared/libbacktrace/config.status$
+^pypy/tool/dest$
 ^pypy/goal/pypy-translation-snapshot$
 ^pypy/goal/pypy-c
 ^pypy/goal/pypy3-c
diff --git a/_pytest/terminal.py b/_pytest/terminal.py
--- a/_pytest/terminal.py
+++ b/_pytest/terminal.py
@@ -366,11 +366,11 @@
 EXIT_OK, EXIT_TESTSFAILED, EXIT_INTERRUPTED, EXIT_USAGEERROR,
 EXIT_NOTESTSCOLLECTED)
 if exitstatus in summary_exit_codes:
-self.config.hook.pytest_terminal_summary(terminalreporter=self)
 self.summary_errors()
 self.summary_failures()
 self.summary_warnings()
 self.summary_passes()
+self.config.hook.pytest_terminal_summary(terminalreporter=self)
 if exitstatus == EXIT_INTERRUPTED:
 self._report_keyboardinterrupt()
 del self._keyboardinterrupt_memo
diff --git a/extra_tests/requirements.txt b/extra_tests/requirements.txt
new file mode 100644
--- /dev/null
+++ b/extra_tests/requirements.txt
@@ -0,0 +1,2 @@
+pytest
+hypothesis
diff --git a/extra_tests/test_bytes.py b/extra_tests/test_bytes.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_bytes.py
@@ -0,0 +1,84 @@
+from hypothesis import strategies as st
+from hypothesis import given, example
+
+st_bytestring = st.binary() | st.binary().map(bytearray)
+
+@given(st_bytestring, st_bytestring, st_bytestring)
+def test_find(u, prefix, suffix):
+s = prefix + u + suffix
+assert 0 <= s.find(u) <= len(prefix)
+assert s.find(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st_bytestring, st_bytestring, st_bytestring)
+def test_index(u, prefix, suffix):
+s = prefix + u + suffix
+assert 0 <= s.index(u) <= len(prefix)
+assert s.index(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st_bytestring, st_bytestring, st_bytestring)
+def test_rfind(u, prefix, suffix):
+s = prefix + u + suffix
+assert s.rfind(u) >= len(prefix)
+assert s.rfind(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+@given(st_bytestring, st_bytestring, st_bytestring)
+def test_rindex(u, prefix, suffix):
+s = prefix + u + suffix
+assert s.rindex(u) >= len(prefix)
+assert s.rindex(u, len(prefix), len(s) - len(suffix)) == len(prefix)
+
+def adjust_indices(u, start, end):
+if end < 0:
+end = max(end + len(u), 0)
+else:
+end = min(end, len(u))
+if start < 0:
+start = max(start + len(u), 0)
+return start, end
+
+@given(st_bytestring, st_bytestring)
+def test_startswith_basic(u, v):
+assert u.startswith(v) is (u[:len(v)] == v)
+
+@example(b'x', b'', 1)
+@example(b'x', b'', 2)
+@given(st_bytestring, st_bytestring, st.integers())
+def test_startswith_start(u, v, start):
+expected = u[start:].startswith(v) if v else (start <= len(u))
+assert u.startswith(v, start) is expected
+
+@example(b'x', b'', 1, 0)
+@example(b'xx', b'', -1, 0)
+@given(st_bytestring, st_bytestring, st.integers(), st.integers())
+def test_startswith_3(u, v, start, end):
+if v:
+expected = u[start:end].startswith(v)
+else:  # CPython leaks implementation details in this case
+start0, end0 = adjust_indices(u, start, end)
+expected = start0 <= len(u) and start0 <= end0
+assert u.startswith(v, start, end) is expected
+
+@given(st_bytestring, st_bytestring)
+def test_endswith_basic(u, v):
+if len(v) > len(u):
+assert u.endswith(v) is False
+else:
+assert u.endswith(v) is (u[len(u) - len(v):] == v)
+
+@example(b'x', b'', 1)
+@example(b'x', b'', 2)
+@given(st_bytestring, st_bytestring, st.integers())
+def test_endswith_2(u, v, start):
+expected = u[start:].endswith(v) if v else (start <= len(u))
+assert u.endswith(v, start) is expected
+
+@example(b'x', b'', 1, 0)
+@example(b'xx', b'', -1, 0)
+@given(st_bytestring, st_bytestring, st.integers(), st.integers())
+def test_endswith_3(u, v, start, end):
+if v:
+expected = u[start:end].endswith(v)
+else:  # CPython leaks implementation details in this case
+start0, end0 = adjust_indices(u, start, end)
+expected = start0 <= len(u) and start0 <= end0
+assert u.endswith(v, start, end) is expected
diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py
new file mode 100644
--- /dev/null
+++ b/extra_tests/test_textio.py
@@ -0,0 +1,48 @@
+from hypothesis import given, 

[pypy-commit] pypy unicode-utf8-test: hg merge unicode-utf8

2017-12-08 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8-test
Changeset: r93322:33d09fc56c08
Date: 2017-12-08 13:28 +0000
http://bitbucket.org/pypy/pypy/changeset/33d09fc56c08/

Log:hg merge unicode-utf8

diff too long, truncating to 2000 out of 3186 lines

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -9,5 +9,6 @@
 * remove assertions from W_UnicodeObject.__init__ if all the builders pass
 * what to do with error handlers that go backwards. There were tests
   in test_codecs that would check for that
+* improve performance of splitlines
 
 * fix _pypyjson to not use a wrapped dict when decoding an object
diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py
--- a/pypy/interpreter/baseobjspace.py
+++ b/pypy/interpreter/baseobjspace.py
@@ -1087,8 +1087,11 @@
 def newlist_utf8(self, list_u, is_ascii):
 l_w = [None] * len(list_u)
 for i, item in enumerate(list_u):
-length, flag = rutf8.check_utf8(item, True)
-l_w[i] = self.newutf8(item, length, flag)
+if not is_ascii:
+length = rutf8.check_utf8(item, True)
+else:
+length = len(item)
+l_w[i] = self.newutf8(item, length)
 return self.newlist(l_w)
 
 def newlist_int(self, list_i):
diff --git a/pypy/interpreter/pyparser/parsestring.py 
b/pypy/interpreter/pyparser/parsestring.py
--- a/pypy/interpreter/pyparser/parsestring.py
+++ b/pypy/interpreter/pyparser/parsestring.py
@@ -64,8 +64,8 @@
 r = unicodehelper.decode_raw_unicode_escape(space, substr)
 else:
 r = unicodehelper.decode_unicode_escape(space, substr)
-v, length, flag = r
-return space.newutf8(v, length, flag)
+v, length = r
+return space.newutf8(v, length)
 
 need_encoding = (encoding is not None and
  encoding != "utf-8" and encoding != "utf8" and
@@ -74,8 +74,8 @@
 substr = s[ps : q]
 if rawmode or '\\' not in s[ps:]:
 if need_encoding:
-lgt, flag = unicodehelper.check_utf8_or_raise(space, substr)
-w_u = space.newutf8(substr, lgt, flag)
+lgt = unicodehelper.check_utf8_or_raise(space, substr)
+w_u = space.newutf8(substr, lgt)
 w_v = unicodehelper.encode(space, w_u, encoding)
 return w_v
 else:
@@ -234,8 +234,8 @@
 p = ps
 while p < end and ord(s[p]) & 0x80:
 p += 1
-lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, p)
-w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt, flag),
+lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p)
+w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt),
recode_encoding)
 v = space.bytes_w(w_v)
 return v, p
diff --git a/pypy/interpreter/test/test_unicodehelper.py 
b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -10,13 +10,13 @@
 return str_decode_utf8(u, True, "strict", None)
 
 def test_decode_utf8():
-assert decode_utf8("abc") == ("abc", 3, 3, rutf8.FLAG_ASCII)
-assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1, 
rutf8.FLAG_REGULAR)
-assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1, 
rutf8.FLAG_HAS_SURROGATES)
-assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1, 
rutf8.FLAG_HAS_SURROGATES)
+assert decode_utf8("abc") == ("abc", 3, 3)
+assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1)
+assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1)
+assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1)
 assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == (
-"\xed\xa0\x80\xed\xb0\x80", 6, 2, rutf8.FLAG_HAS_SURROGATES)
-assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1, 
rutf8.FLAG_REGULAR)
+"\xed\xa0\x80\xed\xb0\x80", 6, 2)
+assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1)
 
 def test_utf8_encode_ascii():
 assert utf8_encode_ascii("abc", "??", "??") == "abc"
@@ -41,19 +41,19 @@
 assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == 
u.encode("ascii", "replace")
 
 def test_str_decode_ascii():
-assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3, 
rutf8.FLAG_ASCII)
+assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3)
 def eh(errors, encoding, reason, p, start, end):
 lst.append((errors, encoding, p, start, end))
 return u"\u1234\u5678".encode("utf8"), end
 lst = []
 input = "\xe8"
 exp = u"\u1234\u5678".encode("utf8")
-assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2, 
rutf8.FLAG_REGULAR)
+assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2)
 assert lst == [("??", "ascii", input, 0, 1)]
 lst = []
 input = "\xe8\xe9abc\xea\xeb"
 assert str_decode_ascii(input, "??", 

[pypy-commit] pypy unicode-utf8-test: hg merge unicode-utf8

2017-12-08 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8-test
Changeset: r93324:e6db8eec731a
Date: 2017-12-09 02:46 +0000
http://bitbucket.org/pypy/pypy/changeset/e6db8eec731a/

Log:hg merge unicode-utf8

diff --git a/pypy/interpreter/test/test_unicodehelper.py 
b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,4 @@
+import pytest
 from hypothesis import given, strategies
 
 from rpython.rlib import rutf8
@@ -5,6 +6,7 @@
 from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
 from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
 
 def decode_utf8(u):
 return str_decode_utf8(u, True, "strict", None)
@@ -68,3 +70,16 @@
 def test_unicode_escape(u):
 r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
 assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+with pytest.raises(ValueError):
+uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+state = space.fromcache(CodecState)
+handler = state.encode_error_handler
+assert uh.unicode_encode_decimal(
+u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+result = uh.unicode_encode_decimal(
+u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+assert result == '12'
diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -7,6 +7,7 @@
 from rpython.rlib.rstring import StringBuilder
 from rpython.rtyper.lltypesystem import rffi
 from pypy.module._codecs import interp_codecs
+from pypy.module.unicodedata import unicodedb
 
 @specialize.memo()
 def decode_error_handler(space):
@@ -35,6 +36,16 @@
  space.newtext(msg)]))
 return raise_unicode_exception_encode
 
+def default_error_encode(
+errors, encoding, msg, u, startingpos, endingpos):
+"""A default handler, for tests"""
+assert endingpos >= 0
+if errors == 'replace':
+return '?', endingpos
+if errors == 'ignore':
+return '', endingpos
+raise ValueError
+
 def convert_arg_to_w_unicode(space, w_arg, strict=None):
 return space.convert_arg_to_w_unicode(w_arg)
 
@@ -1458,3 +1469,70 @@
 pos = rutf8.next_codepoint_pos(s, pos)
 return result.build()
 
+# 
+# Decimal Encoder
+def unicode_encode_decimal(s, errors, errorhandler=None):
+"""Converts whitespace to ' ', decimal characters to their
+corresponding ASCII digit and all other Latin-1 characters except
+\0 as-is. Characters outside this range (Unicode ordinals 1-256)
+are treated as errors. This includes embedded NULL bytes.
+"""
+if errorhandler is None:
+errorhandler = default_error_encode
+result = StringBuilder(len(s))
+pos = 0
+i = 0
+it = rutf8.Utf8StringIterator(s)
+for ch in it:
+if unicodedb.isspace(ch):
+result.append(' ')
+i += 1
+continue
+try:
+decimal = unicodedb.decimal(ch)
+except KeyError:
+pass
+else:
+result.append(chr(48 + decimal))
+i += 1
+continue
+if 0 < ch < 256:
+result.append(chr(ch))
+i += 1
+continue
+# All other characters are considered unencodable
+start_index = i
+i += 1
+while not it.done():
+ch = rutf8.codepoint_at_pos(s, it.get_pos())
+try:
+if (0 < ch < 256 or unicodedb.isspace(ch) or
+unicodedb.decimal(ch) >= 0):
+break
+except KeyError:
+# not a decimal
+pass
+if it.done():
+break
+ch = next(it)
+i += 1
+end_index = i
+msg = "invalid decimal Unicode string"
+r, pos = errorhandler(
+errors, 'decimal', msg, s, start_index, end_index)
+for ch in rutf8.Utf8StringIterator(r):
+if unicodedb.isspace(ch):
+result.append(' ')
+continue
+try:
+decimal = unicodedb.decimal(ch)
+except KeyError:
+pass
+else:
+result.append(chr(48 + decimal))
+continue
+if 0 < ch < 256:
+result.append(chr(ch))
+continue
+errorhandler('strict', 'decimal', msg, s, start_index, end_index)
+return result.build()
diff --git a/pypy/module/_pypyjson/interp_decoder.py 

[pypy-commit] pypy unicode-utf8: translation fixes

2017-12-08 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8
Changeset: r93321:598f10607a50
Date: 2017-12-09 02:44 +0000
http://bitbucket.org/pypy/pypy/changeset/598f10607a50/

Log:translation fixes

diff --git a/pypy/module/_pypyjson/interp_decoder.py 
b/pypy/module/_pypyjson/interp_decoder.py
--- a/pypy/module/_pypyjson/interp_decoder.py
+++ b/pypy/module/_pypyjson/interp_decoder.py
@@ -3,6 +3,7 @@
 from rpython.rlib.objectmodel import specialize, always_inline, r_dict
 from rpython.rlib import rfloat, runicode, rutf8
 from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rlib.rarithmetic import r_uint
 from pypy.interpreter.error import oefmt
 from pypy.interpreter import unicodehelper
 
@@ -366,7 +367,7 @@
 return # help the annotator to know that we'll never go beyond
# this point
 #
-utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True)
+utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True)
 builder.append(utf8_ch)
 return i
 
@@ -400,7 +401,7 @@
 break
 elif ch == '\\' or ch < '\x20':
 self.pos = i-1
-return self.space.unicode_w(self.decode_string_escaped(start))
+return self.decode_string_escaped(start)
 strhash = intmask((103 * strhash) ^ ord(ll_chars[i]))
 bits |= ord(ch)
 length = i - start - 1
diff --git a/pypy/module/_rawffi/alt/type_converter.py 
b/pypy/module/_rawffi/alt/type_converter.py
--- a/pypy/module/_rawffi/alt/type_converter.py
+++ b/pypy/module/_rawffi/alt/type_converter.py
@@ -128,7 +128,7 @@
 intval: lltype.Signed
 """
 self.error(w_ffitype, w_obj)
-
+
 def handle_unichar(self, w_ffitype, w_obj, intval):
 """
 intval: lltype.Signed
@@ -174,7 +174,7 @@
 def handle_struct_rawffi(self, w_ffitype, w_structinstance):
 """
 This method should be killed as soon as we remove support for _rawffi 
structures
-
+
 w_structinstance: W_StructureInstance
 """
 self.error(w_ffitype, w_structinstance)
@@ -228,7 +228,7 @@
 return space.newbytes(chr(ucharval))
 elif w_ffitype.is_unichar():
 wcharval = self.get_unichar(w_ffitype)
-return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1)
+return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1)
 elif w_ffitype.is_double():
 return self._float(w_ffitype)
 elif w_ffitype.is_singlefloat():
@@ -349,7 +349,7 @@
 def get_struct_rawffi(self, w_ffitype, w_structdescr):
 """
 This should be killed as soon as we kill support for _rawffi structures
-
+
 Return type: lltype.Unsigned
 (the address of the structure)
 """
diff --git a/pypy/module/_rawffi/interp_rawffi.py 
b/pypy/module/_rawffi/interp_rawffi.py
--- a/pypy/module/_rawffi/interp_rawffi.py
+++ b/pypy/module/_rawffi/interp_rawffi.py
@@ -596,9 +596,9 @@
 return space.w_None
 wcharp_addr = rffi.cast(rffi.CWCHARP, address)
 if maxlength == -1:
-s = rffi.wcharp2utf8(wcharp_addr)
+s = rffi.wcharp2unicode(wcharp_addr)
 else:
-s = rffi.wcharpsize2utf8(wcharp_addr, maxlength)
+s = rffi.wcharp2unicoden(wcharp_addr, maxlength)
 return space.newunicode(s)
 
 @unwrap_spec(address=r_uint, maxlength=int)
diff --git a/pypy/module/array/interp_array.py 
b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -1,7 +1,7 @@
 from rpython.rlib import jit, rgc, rutf8
 from rpython.rlib.buffer import RawBuffer
 from rpython.rlib.objectmodel import keepalive_until_here
-from rpython.rlib.rarithmetic import ovfcheck, widen
+from rpython.rlib.rarithmetic import ovfcheck, widen, r_uint
 from rpython.rlib.unroll import unrolling_iterable
 from rpython.rtyper.annlowlevel import llstr
 from rpython.rtyper.lltypesystem import lltype, rffi
@@ -1013,7 +1013,7 @@
 elif mytype.typecode == 'c':
 return space.newbytes(item)
 elif mytype.typecode == 'u':
-code = ord(item)
+code = r_uint(ord(item))
 return space.newutf8(rutf8.unichr_as_utf8(code), 1)
 assert 0, "unreachable"
 
diff --git a/pypy/module/pyexpat/interp_pyexpat.py 
b/pypy/module/pyexpat/interp_pyexpat.py
--- a/pypy/module/pyexpat/interp_pyexpat.py
+++ b/pypy/module/pyexpat/interp_pyexpat.py
@@ -483,7 +483,7 @@
 except rutf8.CheckError:
 from pypy.interpreter import unicodehelper
 # get the correct error msg
-unicodehelper.str_decode_utf8(s, len(s), 'string', True,
+unicodehelper.str_decode_utf8(s, 'string', True,
 unicodehelper.decode_error_handler(space))
 assert False, "always raises"

[pypy-commit] pypy unicode-utf8: Add utf8-based replacement for runicode.unicode_encode_decimal() to unicodehelper and fix PyUnicode_EncodeDecimal()

2017-12-08 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8
Changeset: r93319:ac75e33e51bb
Date: 2017-12-09 01:36 +0000
http://bitbucket.org/pypy/pypy/changeset/ac75e33e51bb/

Log:Add utf8-based replacement for runicode.unicode_encode_decimal() to
unicodehelper and fix PyUnicode_EncodeDecimal()

diff --git a/pypy/interpreter/test/test_unicodehelper.py 
b/pypy/interpreter/test/test_unicodehelper.py
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -1,3 +1,4 @@
+import pytest
 from hypothesis import given, strategies
 
 from rpython.rlib import rutf8
@@ -5,6 +6,7 @@
 from pypy.interpreter.unicodehelper import str_decode_utf8
 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
 from pypy.interpreter import unicodehelper as uh
+from pypy.module._codecs.interp_codecs import CodecState
 
 def decode_utf8(u):
 return str_decode_utf8(u, True, "strict", None)
@@ -68,3 +70,16 @@
 def test_unicode_escape(u):
 r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None)
 assert r == u.encode("unicode-escape")
+
+def test_encode_decimal(space):
+assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 '
+with pytest.raises(ValueError):
+uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None)
+state = space.fromcache(CodecState)
+handler = state.encode_error_handler
+assert uh.unicode_encode_decimal(
+u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v'
+
+result = uh.unicode_encode_decimal(
+u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
+assert result == '12'
diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -7,6 +7,7 @@
 from rpython.rlib.rstring import StringBuilder
 from rpython.rtyper.lltypesystem import rffi
 from pypy.module._codecs import interp_codecs
+from pypy.module.unicodedata import unicodedb
 
 @specialize.memo()
 def decode_error_handler(space):
@@ -35,6 +36,16 @@
  space.newtext(msg)]))
 return raise_unicode_exception_encode
 
+def default_error_encode(
+errors, encoding, msg, u, startingpos, endingpos):
+"""A default handler, for tests"""
+assert endingpos >= 0
+if errors == 'replace':
+return '?', endingpos
+if errors == 'ignore':
+return '', endingpos
+raise ValueError
+
 def convert_arg_to_w_unicode(space, w_arg, strict=None):
 return space.convert_arg_to_w_unicode(w_arg)
 
@@ -1458,3 +1469,70 @@
 pos = rutf8.next_codepoint_pos(s, pos)
 return result.build()
 
+# 
+# Decimal Encoder
+def unicode_encode_decimal(s, errors, errorhandler=None):
+"""Converts whitespace to ' ', decimal characters to their
+corresponding ASCII digit and all other Latin-1 characters except
+\0 as-is. Characters outside this range (Unicode ordinals 1-256)
+are treated as errors. This includes embedded NULL bytes.
+"""
+if errorhandler is None:
+errorhandler = default_error_encode
+result = StringBuilder(len(s))
+pos = 0
+i = 0
+it = rutf8.Utf8StringIterator(s)
+for ch in it:
+if unicodedb.isspace(ch):
+result.append(' ')
+i += 1
+continue
+try:
+decimal = unicodedb.decimal(ch)
+except KeyError:
+pass
+else:
+result.append(chr(48 + decimal))
+i += 1
+continue
+if 0 < ch < 256:
+result.append(chr(ch))
+i += 1
+continue
+# All other characters are considered unencodable
+start_index = i
+i += 1
+while not it.done():
+ch = rutf8.codepoint_at_pos(s, it.get_pos())
+try:
+if (0 < ch < 256 or unicodedb.isspace(ch) or
+unicodedb.decimal(ch) >= 0):
+break
+except KeyError:
+# not a decimal
+pass
+if it.done():
+break
+ch = next(it)
+i += 1
+end_index = i
+msg = "invalid decimal Unicode string"
+r, pos = errorhandler(
+errors, 'decimal', msg, s, start_index, end_index)
+for ch in rutf8.Utf8StringIterator(r):
+if unicodedb.isspace(ch):
+result.append(' ')
+continue
+try:
+decimal = unicodedb.decimal(ch)
+except KeyError:
+pass
+else:
+result.append(chr(48 + decimal))
+continue
+if 0 < ch < 256:
+result.append(chr(ch))
+continue
+errorhandler('strict', 'decimal', msg, s, start_index, 

[pypy-commit] pypy unicode-utf8: Fix PyUnicode_DecodeUTF16/32

2017-12-08 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8
Changeset: r93318:d53d8f486841
Date: 2017-12-08 16:53 +0000
http://bitbucket.org/pypy/pypy/changeset/d53d8f486841/

Log:Fix PyUnicode_DecodeUTF16/32

diff --git a/pypy/module/cpyext/unicodeobject.py 
b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -3,7 +3,8 @@
 from rpython.tool.sourcetools import func_renamer
 
 from pypy.interpreter.error import OperationError, oefmt
-from pypy.interpreter.unicodehelper import wcharpsize2utf8
+from pypy.interpreter.unicodehelper import (
+wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper)
 from pypy.module.unicodedata import unicodedb
 from pypy.module.cpyext.api import (
 CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -568,15 +569,11 @@
 else:
 errors = None
 
-result, length, byteorder = runicode.str_decode_utf_16_helper(
-string, size, errors,
-True, # final ? false for multiple passes?
-None, # errorhandler
-byteorder)
+result, _,  length, byteorder = str_decode_utf_16_helper(
+string, errors, final=True, errorhandler=None, byteorder=byteorder)
 if pbyteorder is not None:
 pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
-return space.newunicode(result)
+return space.newutf8(result, length)
 
 @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, rffi.INTP], PyObject)
 def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder):
@@ -624,15 +621,11 @@
 else:
 errors = None
 
-result, length, byteorder = runicode.str_decode_utf_32_helper(
-string, size, errors,
-True, # final ? false for multiple passes?
-None, # errorhandler
-byteorder)
+result, _,  length, byteorder = str_decode_utf_32_helper(
+string, errors, final=True, errorhandler=None, byteorder=byteorder)
 if pbyteorder is not None:
 pbyteorder[0] = rffi.cast(rffi.INT, byteorder)
-
-return space.newunicode(result)
+return space.newutf8(result, length)
 
 @cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, CONST_STRING],
  rffi.INT_real, error=-1)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: fixes

2017-12-08 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8
Changeset: r93317:5677dc1909e9
Date: 2017-12-08 14:45 +0000
http://bitbucket.org/pypy/pypy/changeset/5677dc1909e9/

Log:fixes

diff --git a/pypy/module/cpyext/longobject.py b/pypy/module/cpyext/longobject.py
--- a/pypy/module/cpyext/longobject.py
+++ b/pypy/module/cpyext/longobject.py
@@ -4,6 +4,7 @@
 CONST_STRING, ADDR, CANNOT_FAIL)
 from pypy.objspace.std.longobject import W_LongObject
 from pypy.interpreter.error import OperationError
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
 from pypy.module.cpyext.intobject import PyInt_AsUnsignedLongMask
 from rpython.rlib.rbigint import rbigint
 
@@ -191,7 +192,7 @@
 string, length gives the number of characters, and base is the radix
 for the conversion.  The radix must be in the range [2, 36]; if it is
 out of range, ValueError will be raised."""
-w_value = space.newunicode(rffi.wcharpsize2unicode(u, length))
+w_value = space.newutf8(wcharpsize2utf8(space, u, length), length)
 w_base = space.newint(rffi.cast(lltype.Signed, base))
 return space.call_function(space.w_long, w_value, w_base)
 
diff --git a/pypy/module/cpyext/object.py b/pypy/module/cpyext/object.py
--- a/pypy/module/cpyext/object.py
+++ b/pypy/module/cpyext/object.py
@@ -246,7 +246,7 @@
 the Python expression unicode(o).  Called by the unicode() built-in
 function."""
 if w_obj is None:
-return space.newunicode(u"")
+return space.newutf8("", 6)
 return space.call_function(space.w_unicode, w_obj)
 
 @cpython_api([PyObject, PyObject], rffi.INT_real, error=-1)
@@ -302,7 +302,7 @@
 if opid == Py_EQ:
 return 1
 if opid == Py_NE:
-return 0 
+return 0
 w_res = PyObject_RichCompare(space, w_o1, w_o2, opid_int)
 return int(space.is_true(w_res))
 
diff --git a/pypy/module/cpyext/unicodeobject.py 
b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -710,12 +710,17 @@
 """Return 1 if substr matches str[start:end] at the given tail end
 (direction == -1 means to do a prefix match, direction == 1 a
 suffix match), 0 otherwise. Return -1 if an error occurred."""
+space.utf8_w(w_str)  # type check
+space.utf8_w(w_substr)
 w_start = space.newint(start)
 w_end = space.newint(end)
 if rffi.cast(lltype.Signed, direction) <= 0:
-return space.call_method(w_str, "startswith", w_substr, w_start, w_end)
+w_result = space.call_method(
+w_str, "startswith", w_substr, w_start, w_end)
 else:
-return space.call_method(w_str, "endswith", w_substr, w_start, w_end)
+w_result = space.call_method(
+w_str, "endswith", w_substr, w_start, w_end)
+return space.int_w(w_result)
 
 @cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, 
error=-1)
 def PyUnicode_Count(space, w_str, w_substr, start, end):
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: Some unicode>utf8 conversions in cpyext/unicodeobject.py

2017-12-08 Thread rlamy
Author: Ronan Lamy 
Branch: unicode-utf8
Changeset: r93316:8cc0253e1ece
Date: 2017-12-08 13:07 +0000
http://bitbucket.org/pypy/pypy/changeset/8cc0253e1ece/

Log:Some unicode>utf8 conversions in cpyext/unicodeobject.py

diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -1,10 +1,11 @@
 import sys
 
-from pypy.interpreter.error import OperationError
+from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.objectmodel import specialize
 from rpython.rlib import rutf8
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rstring import StringBuilder
+from rpython.rtyper.lltypesystem import rffi
 from pypy.module._codecs import interp_codecs
 
 @specialize.memo()
@@ -204,7 +205,7 @@
 if c > 0x7F:
 errorhandler("strict", 'ascii',
  'ordinal not in range(128)', utf8,
- pos, pos + 1)  
+ pos, pos + 1)
 j = rutf8.next_codepoint_pos(r, j)
 pos = newpos
 res.append(r)
@@ -530,6 +531,19 @@
 
 return builder.build(), pos, outsize
 
+def wcharpsize2utf8(space, wcharp, size):
+"""Safe version of rffi.wcharpsize2utf8.
+
+Raises app-level ValueError if any wchar value is outside the valid
+codepoint range.
+"""
+try:
+return rffi.wcharpsize2utf8(wcharp, size)
+except ValueError:
+raise oefmt(space.w_ValueError,
+"character is not in range [U+0000; U+10ffff]")
+
+
 # 
 # Raw unicode escape
 
diff --git a/pypy/module/cpyext/unicodeobject.py 
b/pypy/module/cpyext/unicodeobject.py
--- a/pypy/module/cpyext/unicodeobject.py
+++ b/pypy/module/cpyext/unicodeobject.py
@@ -1,5 +1,9 @@
+from rpython.rtyper.lltypesystem import rffi, lltype
+from rpython.rlib import rstring, runicode
+from rpython.tool.sourcetools import func_renamer
+
 from pypy.interpreter.error import OperationError, oefmt
-from rpython.rtyper.lltypesystem import rffi, lltype
+from pypy.interpreter.unicodehelper import wcharpsize2utf8
 from pypy.module.unicodedata import unicodedb
 from pypy.module.cpyext.api import (
 CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api,
@@ -13,8 +17,6 @@
 from pypy.module.sys.interp_encoding import setdefaultencoding
 from pypy.module._codecs.interp_codecs import CodecState
 from pypy.objspace.std import unicodeobject
-from rpython.rlib import rstring, runicode
-from rpython.tool.sourcetools import func_renamer
 import sys
 
 ## See comment in bytesobject.py.
@@ -61,10 +63,10 @@
 def unicode_attach(space, py_obj, w_obj, w_userdata=None):
 "Fills a newly allocated PyUnicodeObject with a unicode string"
 py_unicode = rffi.cast(PyUnicodeObject, py_obj)
-s = space.unicode_w(w_obj)
-py_unicode.c_length = len(s)
+s, length = space.utf8_len_w(w_obj)
+py_unicode.c_length = length
 py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO)
-py_unicode.c_hash = space.hash_w(space.newunicode(s))
+py_unicode.c_hash = space.hash_w(space.newutf8(s, length))
 py_unicode.c_defenc = lltype.nullptr(PyObject.TO)
 
 def unicode_realize(space, py_obj):
@@ -73,11 +75,12 @@
 be modified after this call.
 """
 py_uni = rffi.cast(PyUnicodeObject, py_obj)
-s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_length)
+length = py_uni.c_length
+s = wcharpsize2utf8(space, py_uni.c_str, length)
 w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type))
 w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type)
-w_obj.__init__(s)
-py_uni.c_hash = space.hash_w(space.newunicode(s))
+w_obj.__init__(s, length)
+py_uni.c_hash = space.hash_w(space.newutf8(s, length))
 track_reference(space, py_obj, w_obj)
 return w_obj
 
@@ -214,8 +217,8 @@
 if not ref_unicode.c_str:
 # Copy unicode buffer
 w_unicode = from_ref(space, rffi.cast(PyObject, ref))
-u = space.unicode_w(w_unicode)
-ref_unicode.c_str = rffi.unicode2wcharp(u)
+u, length = space.utf8_len_w(w_unicode)
+ref_unicode.c_str = rffi.utf82wcharp(u, length)
 return ref_unicode.c_str
 
 @cpython_api([PyObject], rffi.CWCHARP)
@@ -335,8 +338,8 @@
 Therefore, modification of the resulting Unicode object is only allowed 
when u
 is NULL."""
 if wchar_p:
-s = rffi.wcharpsize2unicode(wchar_p, length)
-return make_ref(space, space.newunicode(s))
+s = wcharpsize2utf8(space, wchar_p, length)
+return make_ref(space, space.newutf8(s, length))
 else:
 return rffi.cast(PyObject, new_empty_unicode(space, length))
 
@@ -506,7 +509,8 @@
 """Encode the Py_UNICODE buffer of the given size and return a
 Python string object.  Return 

[pypy-commit] pypy unicode-utf8-re: Fix test_search

2017-12-08 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-re
Changeset: r93314:80ff594175dc
Date: 2017-12-08 12:57 +0100
http://bitbucket.org/pypy/pypy/changeset/80ff594175dc/

Log:Fix test_search

diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -68,23 +68,41 @@
 return   # end of string is fine
 assert not (0x80 <= self._utf8[position] < 0xC0)   # continuation byte
 
+def maximum_distance(self, position_low, position_high):
+# may overestimate if there are non-ascii chars
+return position_high - position_low
+
+
+def make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags):
+if bytestart < 0: bytestart = 0
+elif bytestart > len(utf8string): bytestart = len(utf8string)
+if byteend < 0: byteend = 0
+elif byteend > len(utf8string): byteend = len(utf8string)
+ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags)
+ctx.debug_check_pos(bytestart)
+ctx.debug_check_pos(byteend)
+return ctx
 
 def utf8search(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0):
 # bytestart and byteend must be valid byte positions inside the
 # utf8string.
 from rpython.rlib.rsre.rsre_core import search_context
 
-assert 0 <= bytestart <= len(utf8string)
-assert 0 <= byteend
-if byteend > len(utf8string):
-byteend = len(utf8string)
-ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags)
-ctx.debug_check_pos(bytestart)
-ctx.debug_check_pos(byteend)
+ctx = make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags)
 if search_context(ctx):
 return ctx
 else:
 return None
 
-def utf8match(*args, **kwds):
-NOT_IMPLEMENTED
+def utf8match(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0,
+  fullmatch=False):
+# bytestart and byteend must be valid byte positions inside the
+# utf8string.
+from rpython.rlib.rsre.rsre_core import match_context
+
+ctx = make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags)
+ctx.fullmatch_only = fullmatch
+if match_context(ctx):
+return ctx
+else:
+return None
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-re: Fix test_match

2017-12-08 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-re
Changeset: r93315:e2017b23843a
Date: 2017-12-08 13:03 +0100
http://bitbucket.org/pypy/pypy/changeset/e2017b23843a/

Log:Fix test_match

diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -207,13 +207,6 @@
 return (-1, -1)
 return (fmarks[groupnum], fmarks[groupnum+1])
 
-def group(self, groupnum=0):
-frm, to = self.span(groupnum)
-if 0 <= frm <= to:
-return self._string[frm:to]
-else:
-return None
-
 def fresh_copy(self, start):
 raise NotImplementedError
 
diff --git a/rpython/rlib/rsre/test/support.py 
b/rpython/rlib/rsre/test/support.py
--- a/rpython/rlib/rsre/test/support.py
+++ b/rpython/rlib/rsre/test/support.py
@@ -54,12 +54,19 @@
 raise EndOfString
 return Position(r)
 
-def slowly_convert_byte_pos_to_index(self, position):
+def _real_pos(self, position):
 if type(position) is int and position == -1:
 return -1
 assert isinstance(position, Position)
 return position._p
 
+def group(self, groupnum=0):
+frm, to = self.span(groupnum)
+if self.ZERO <= frm <= to:
+return self._string[self._real_pos(frm):self._real_pos(to)]
+else:
+return None
+
 def str(self, position):
 assert isinstance(position, Position)
 return ord(self._string[position._p])
diff --git a/rpython/rlib/rsre/test/test_match.py 
b/rpython/rlib/rsre/test/test_match.py
--- a/rpython/rlib/rsre/test/test_match.py
+++ b/rpython/rlib/rsre/test/test_match.py
@@ -1,7 +1,7 @@
 import re, random, py
 from rpython.rlib.rsre import rsre_char
 from rpython.rlib.rsre.rpy import get_code, VERSION
-from rpython.rlib.rsre.test.support import match, fullmatch, Position
+from rpython.rlib.rsre.test.support import match, fullmatch, Position as P
 
 
 def get_code_and_re(regexp):
@@ -51,20 +51,20 @@
 def test_assert(self):
 r = get_code(r"abc(?=def)(.)")
 res = match(r, "abcdefghi")
-assert res is not None and res.get_mark(1) == 4
+assert res is not None and res.get_mark(1) == P(4)
 assert not match(r, "abcdeFghi")
 
 def test_assert_not(self):
 r = get_code(r"abc(?!def)(.)")
 res = match(r, "abcdeFghi")
-assert res is not None and res.get_mark(1) == 4
+assert res is not None and res.get_mark(1) == P(4)
 assert not match(r, "abcdefghi")
 
 def test_lookbehind(self):
 r = get_code(r"([a-z]*)(?<=de)")
 assert match(r, "ade")
 res = match(r, "adefg")
-assert res is not None and res.get_mark(1) == 3
+assert res is not None and res.get_mark(1) == P(3)
 assert not match(r, "abc")
 assert not match(r, "X")
 assert not match(r, "eX")
@@ -75,13 +75,13 @@
 assert res is not None
 return res.get_mark(1)
 r = get_code(r"([a-z]*)(?https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-re: duh

2017-12-08 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-re
Changeset: r93313:68c926785f51
Date: 2017-12-08 12:52 +0100
http://bitbucket.org/pypy/pypy/changeset/68c926785f51/

Log:duh

diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -56,7 +56,7 @@
 for i in range(n):
 if upos <= r_uint(start_position):
 raise EndOfString
-upos = rutf8.next_codepoint_pos(self._utf8, upos)
+upos = rutf8.prev_codepoint_pos(self._utf8, upos)
 position = intmask(upos)
 assert position >= 0
 return position
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-re: Remove slowly_convert_byte_pos_to_index

2017-12-08 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-re
Changeset: r93312:b58a53172e21
Date: 2017-12-08 12:44 +0100
http://bitbucket.org/pypy/pypy/changeset/b58a53172e21/

Log:Remove slowly_convert_byte_pos_to_index

diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -159,9 +159,6 @@
 def prev_n(self, position, n, start_position):
 raise NotImplementedError
 @not_rpython
-def slowly_convert_byte_pos_to_index(self, position):
-raise NotImplementedError
-@not_rpython
 def debug_check_pos(self, position):
 raise NotImplementedError
 @not_rpython
@@ -178,15 +175,13 @@
 raise NotImplementedError
 
 def get_mark(self, gid):
-mark = find_mark(self.match_marks, gid)
-return self.slowly_convert_byte_pos_to_index(mark)
+return find_mark(self.match_marks, gid)
 
 def flatten_marks(self):
 # for testing
 if self.match_marks_flat is None:
 self._compute_flattened_marks()
-return [self.slowly_convert_byte_pos_to_index(i)
-for i in self.match_marks_flat]
+return self.match_marks_flat
 
 def _compute_flattened_marks(self):
 self.match_marks_flat = [self.match_start, self.match_end]
@@ -249,9 +244,6 @@
 raise EndOfString
 return position
 
-def slowly_convert_byte_pos_to_index(self, position):
-return position
-
 def debug_check_pos(self, position):
 pass
 
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
--- a/rpython/rlib/rsre/rsre_utf8.py
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -3,16 +3,19 @@
 from rpython.rlib.rarithmetic import r_uint, intmask
 from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString
 from rpython.rlib.rsre import rsre_char
+from rpython.rlib.objectmodel import we_are_translated
 from rpython.rlib import rutf8
 
 
 class Utf8MatchContext(AbstractMatchContext):
+"""A context that matches unicode, but encoded in a utf8 string.
+Be careful because most positions taken by, handled in, and returned
+by this class are expressed in *bytes*, not in characters.
+"""
 
-def __init__(self, pattern, utf8string, index_storage,
- match_start, end, flags):
+def __init__(self, pattern, utf8string, match_start, end, flags):
 AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
 self._utf8 = utf8string
-self._index_storage = index_storage
 
 def str(self, index):
 check_nonneg(index)
@@ -58,16 +61,15 @@
 assert position >= 0
 return position
 
-def slowly_convert_byte_pos_to_index(self, position):
-return rutf8.codepoint_index_at_byte_position(
-self._utf8, self._index_storage, position)
-
 def debug_check_pos(self, position):
+if we_are_translated():
+return
+if position == len(self._utf8):
+return   # end of string is fine
 assert not (0x80 <= self._utf8[position] < 0xC0)   # continuation byte
 
 
-def utf8search(pattern, utf8string, index_storage=None, bytestart=0,
-   byteend=sys.maxint, flags=0):
+def utf8search(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0):
 # bytestart and byteend must be valid byte positions inside the
 # utf8string.
 from rpython.rlib.rsre.rsre_core import search_context
@@ -76,11 +78,9 @@
 assert 0 <= byteend
 if byteend > len(utf8string):
 byteend = len(utf8string)
-if index_storage is None: # should be restricted to tests only
-length = rutf8.check_utf8(utf8string, allow_surrogates=True)
-index_storage = rutf8.create_utf8_index_storage(utf8string, length)
-ctx = Utf8MatchContext(pattern, utf8string, index_storage,
-   bytestart, byteend, flags)
+ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags)
+ctx.debug_check_pos(bytestart)
+ctx.debug_check_pos(byteend)
 if search_context(ctx):
 return ctx
 else:
diff --git a/rpython/rlib/rsre/test/test_search.py 
b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -12,19 +12,22 @@
 assert res is None
 res = self.search(r_code1, "fooahcdixxx")
 assert res is not None
-assert res.span() == (5, 8)
+P = self.P
+assert res.span() == (P(5), P(8))
 
 def test_code2(self):
 r_code2 = get_code(r'<item>\s*<title>(.*?)</title>')
 res = self.search(r_code2, "foo bar <item>  <title>abc</title>def")
 assert res is not None
-assert res.span() == (8, 34)
+P = self.P
+assert res.span() == (P(8), P(34))
 
 def test_pure_literal(self):
 r_code3 = get_code(r'foobar')
 res = self.search(r_code3, "foo bar foobar baz")
 

[pypy-commit] pypy unicode-utf8-re: in-progress

2017-12-08 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-re
Changeset: r93311:336fb075d139
Date: 2017-12-08 12:22 +0100
http://bitbucket.org/pypy/pypy/changeset/336fb075d139/

Log:in-progress

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -13,7 +13,7 @@
 #
 # Constants and exposed functions
 
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_utf8
 from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, getlower, 
set_unicode_db
 
 
@@ -40,7 +40,8 @@
 end-start))
 if isinstance(ctx, rsre_core.StrMatchContext):
 return space.newbytes(ctx._string[start:end])
-elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+XXX
 s = ctx._unicodestr[start:end]
 lgt = rutf8.check_utf8(s, True)
 return space.newutf8(s, lgt)
@@ -103,7 +104,7 @@
 raise oefmt(space.w_TypeError, "cannot copy this pattern object")
 
 def make_ctx(self, w_string, pos=0, endpos=sys.maxint):
-"""Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for
+"""Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
 searching in the given w_string object."""
 space = self.space
 if pos < 0:
@@ -111,17 +112,26 @@
 if endpos < pos:
 endpos = pos
 if space.isinstance_w(w_string, space.w_unicode):
-utf8str, length = space.utf8_len_w(w_string)
-if pos >= length:
+# xxx fish for the _index_storage
+w_string = space.convert_arg_to_w_unicode(w_string)
+utf8str = w_string._utf8
+length = w_string._len()
+index_storage = w_string._get_index_storage()
+#
+if pos <= 0:
+bytepos = 0
+elif pos >= length:
 bytepos = len(utf8str)
 else:
-bytepos = rutf8.codepoint_at_index(..)
-
-pos = length
+bytepos = rutf8.codepoint_at_index(utf8str, index_storage, pos)
 if endpos >= length:
-endpos = length
-return rsre_core.UnicodeMatchContext(self.code, unicodestr,
- pos, endpos, self.flags)
+endbytepos = len(utf8str)
+else:
+endbytepos = rutf8.codepoint_at_index(utf8str, index_storage,
+  endpos)
+return rsre_utf8.Utf8MatchContext(
+self.code, unicodestr, index_storage,
+bytepos, endbytepos, self.flags)
 elif space.isinstance_w(w_string, space.w_bytes):
 str = space.bytes_w(w_string)
 if pos > len(str):
@@ -372,7 +382,8 @@
 if isinstance(ctx, rsre_core.StrMatchContext):
 assert strbuilder is not None
 return strbuilder.append_slice(ctx._string, start, end)
-elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+XXX
 assert unicodebuilder is not None
 return unicodebuilder.append_slice(ctx._unicodestr, start, end)
 assert 0, "unreachable"
@@ -578,7 +589,8 @@
 return space.newbytes(ctx._buffer.as_str())
 elif isinstance(ctx, rsre_core.StrMatchContext):
 return space.newbytes(ctx._string)
-elif isinstance(ctx, rsre_core.UnicodeMatchContext):
+elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
+
 lgt = rutf8.check_utf8(ctx._unicodestr, True)
 return space.newutf8(ctx._unicodestr, lgt)
 else:
diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -55,6 +55,8 @@
 specific subclass, calling 'func' is a direct call; if 'ctx' is only known
 to be of class AbstractMatchContext, calling 'func' is an indirect call.
 """
+from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext
+
 assert func.func_code.co_varnames[0] == 'ctx'
 specname = '_spec_' + func.func_name
 while specname in _seen_specname:
@@ -65,7 +67,8 @@
 specialized_methods = []
 for prefix, concreteclass in [('buf', BufMatchContext),
   ('str', StrMatchContext),
-  ('uni', UnicodeMatchContext)]:
+  ('uni', UnicodeMatchContext),
+  ('utf8', Utf8MatchContext)]:
 newfunc = func_with_new_name(func, prefix + specname)
 assert not hasattr(concreteclass, specname)
 setattr(concreteclass, specname, newfunc)

[pypy-commit] pypy unicode-utf8: whack at _io module

2017-12-08 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93308:7ffcfc6493e6
Date: 2017-12-08 10:38 +0200
http://bitbucket.org/pypy/pypy/changeset/7ffcfc6493e6/

Log:whack at _io module

diff --git a/pypy/module/_io/interp_stringio.py 
b/pypy/module/_io/interp_stringio.py
--- a/pypy/module/_io/interp_stringio.py
+++ b/pypy/module/_io/interp_stringio.py
@@ -1,3 +1,5 @@
+from rpython.rlib.rutf8 import get_utf8_length
+
 from pypy.interpreter.error import OperationError, oefmt
 from pypy.interpreter.typedef import (
 TypeDef, generic_new_descr, GetSetProperty)
@@ -152,7 +154,7 @@
 if self.readnl is None:
 w_readnl = space.w_None
 else:
-w_readnl = space.str(space.new_from_utf8(self.readnl))  # YYY
+w_readnl = space.str(space.newutf8(self.readnl, 
get_utf8_length(self.readnl)))  # YYY
 return space.newtuple([
 w_initialval, w_readnl, space.newint(self.buf.pos), w_dict
 ])
@@ -215,7 +217,8 @@
 if self.writenl:
 w_decoded = space.call_method(
 w_decoded, "replace",
-space.newtext("\n"), space.new_from_utf8(self.writenl))
+space.newtext("\n"), space.newutf8(self.writenl,
+get_utf8_length(self.writenl)))
 string = space.utf8_w(w_decoded)
 if string:
 self.buf.write(string)
@@ -225,7 +228,9 @@
 def read_w(self, space, w_size=None):
 self._check_closed(space)
 size = convert_size(space, w_size)
-return space.new_from_utf8(self.buf.read(size))
+v = self.buf.read(size)
+lgt = get_utf8_length(v)
+return space.newutf8(v, lgt)
 
 def readline_w(self, space, w_limit=None):
 self._check_closed(space)
@@ -239,7 +244,8 @@
 else:
 newline = self.readnl
 result = self.buf.readline(newline, limit)
-return space.new_from_utf8(result)
+resultlen = get_utf8_length(result)
+return space.newutf8(result, resultlen)
 
 
 @unwrap_spec(pos=int, mode=int)
@@ -276,7 +282,9 @@
 
 def getvalue_w(self, space):
 self._check_closed(space)
-return space.new_from_utf8(self.buf.getvalue())
+v = self.buf.getvalue()
+lgt = get_utf8_length(v)
+return space.newutf8(v, lgt)
 
 def readable_w(self, space):
 self._check_closed(space)
diff --git a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py
--- a/pypy/module/_io/interp_textio.py
+++ b/pypy/module/_io/interp_textio.py
@@ -12,7 +12,8 @@
 from rpython.rlib.rbigint import rbigint
 from rpython.rlib.rstring import StringBuilder
 from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos,
-codepoints_in_utf8)
+codepoints_in_utf8, get_utf8_length,
+Utf8StringBuilder)
 
 
 STATE_ZERO, STATE_OK, STATE_DETACHED = range(3)
@@ -684,13 +685,15 @@
 w_bytes = space.call_method(self.w_buffer, "read")
 w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, 
space.w_True)
 check_decoded(space, w_decoded)
-w_result = space.new_from_utf8(self.decoded.get_chars(-1))
+chars = self.decoded.get_chars(-1)
+lgt = get_utf8_length(chars)
+w_result = space.newutf8(chars, lgt)
 w_final = space.add(w_result, w_decoded)
 self.snapshot = None
 return w_final
 
 remaining = size
-builder = StringBuilder(size)
+builder = Utf8StringBuilder(size)
 
 # Keep reading chunks until we have n characters to return
 while remaining > 0:
@@ -700,7 +703,7 @@
 builder.append(data)
 remaining -= len(data)
 
-return space.new_from_utf8(builder.build())
+return space.newutf8(builder.build(), builder.get_length())
 
 def _scan_line_ending(self, limit):
 if self.readuniversal:
@@ -725,6 +728,7 @@
 limit = convert_size(space, w_limit)
 remnant = None
 builder = StringBuilder()
+# XXX maybe use Utf8StringBuilder instead?
 while True:
 # First, get some data if necessary
 has_data = self._ensure_data(space)
@@ -771,7 +775,8 @@
 self.decoded.reset()
 
 result = builder.build()
-return space.new_from_utf8(result)
+lgt = get_utf8_length(result)
+return space.newutf8(result, lgt)
 
 # _
 # write methods
@@ -794,8 +799,8 @@
 if text.find('\n') >= 0:
 haslf = True
 if haslf and self.writetranslate and self.writenl:
-w_text = space.call_method(w_text, "replace", 
space.new_from_utf8('\n'),
-   space.new_from_utf8(self.writenl))
+w_text = space.call_method(w_text, "replace", 

[pypy-commit] pypy unicode-utf8: kill dead code

2017-12-08 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93306:eb61e553bfd4
Date: 2017-12-07 18:07 +0200
http://bitbucket.org/pypy/pypy/changeset/eb61e553bfd4/

Log:kill dead code

diff --git a/pypy/module/_codecs/interp_codecs.py 
b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -466,18 +466,6 @@
 if rutf8.has_surrogates(utf8):
 utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
 return space.newtuple([space.newbytes(utf8), space.newint(lgt)])
-#@unwrap_spec(uni=unicode, errors='text_or_none')
-#def utf_8_encode(space, uni, errors="strict"):
-#if errors is None:
-#errors = 'strict'
-#state = space.fromcache(CodecState)
-## NB. can't call unicode_encode_utf_8() directly because that's
-## an @elidable function nowadays.  Instead, we need the _impl().
-## (The problem is the errorhandler, which calls arbitrary Python.)
-#result = runicode.unicode_encode_utf_8_impl(
-#uni, len(uni), errors, state.encode_error_handler,
-#allow_surrogates=True)
-#return space.newtuple([space.newbytes(result), space.newint(len(uni))])
 
 @unwrap_spec(string='bufferstr', errors='text_or_none',
  w_final = WrappedDefault(False))
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: more fixes

2017-12-08 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93310:e4ed73204961
Date: 2017-12-08 10:50 +0200
http://bitbucket.org/pypy/pypy/changeset/e4ed73204961/

Log:more fixes

diff --git a/pypy/module/array/interp_array.py 
b/pypy/module/array/interp_array.py
--- a/pypy/module/array/interp_array.py
+++ b/pypy/module/array/interp_array.py
@@ -451,7 +451,7 @@
 """
 if self.typecode == 'u':
 buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned())
-return space.newutf8(rffi.wcharpsize2unicode(buf, self.len))
+return space.newutf8(rffi.wcharpsize2utf8(buf, self.len), self.len)
 else:
 raise oefmt(space.w_ValueError,
 "tounicode() may only be called on type 'u' arrays")
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: fix _codecs

2017-12-08 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93307:bf4ecad403eb
Date: 2017-12-08 10:19 +0200
http://bitbucket.org/pypy/pypy/changeset/bf4ecad403eb/

Log:fix _codecs

diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -575,8 +575,8 @@
 digits = 4 if s[pos] == 'u' else 8
 message = "truncated \\u"
 pos += 1
-pos, _, _ = hexescape(result, s, pos, digits,
-"rawunicodeescape", errorhandler, message, errors)
+pos, _ = hexescape(result, s, pos, digits,
+   "rawunicodeescape", errorhandler, message, errors)
 
 r = result.build()
 lgt = rutf8.check_utf8(r, True)
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: whack the slowpath too

2017-12-08 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93305:a50930e1db6b
Date: 2017-12-07 18:07 +0200
http://bitbucket.org/pypy/pypy/changeset/a50930e1db6b/

Log:whack the slowpath too

diff --git a/pypy/module/_codecs/interp_codecs.py 
b/pypy/module/_codecs/interp_codecs.py
--- a/pypy/module/_codecs/interp_codecs.py
+++ b/pypy/module/_codecs/interp_codecs.py
@@ -460,10 +460,12 @@
 
 # utf-8 functions are not regular, because we have to pass
 # "allow_surrogates=True"
-@unwrap_spec(utf8='utf8', errors='text_or_none')
-def utf_8_encode(space, utf8, errors="strict"):
-length, _ = rutf8.check_utf8(utf8, allow_surrogates=True)
-return space.newtuple([space.newbytes(utf8), space.newint(length)])
+@unwrap_spec(errors='text_or_none')
+def utf_8_encode(space, w_obj, errors="strict"):
+utf8, lgt = space.utf8_len_w(w_obj)
+if rutf8.has_surrogates(utf8):
+utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
+return space.newtuple([space.newbytes(utf8), space.newint(lgt)])
 #@unwrap_spec(uni=unicode, errors='text_or_none')
 #def utf_8_encode(space, uni, errors="strict"):
 #if errors is None:
diff --git a/pypy/objspace/std/test/test_unicodeobject.py 
b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -741,6 +741,8 @@
 assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac'
 assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82'
 assert u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96'
+assert u'\ud800\udc02'.encode('uTf-8') == '\xf0\x90\x80\x82'
+assert u'\ud84d\udc56'.encode('Utf8') == '\xf0\xa3\x91\x96'
 assert u'\ud800'.encode('utf-8') == '\xed\xa0\x80'
 assert u'\udc00'.encode('utf-8') == '\xed\xb0\x80'
 assert (u'\ud800\udc02'*1000).encode('utf-8') == 
'\xf0\x90\x80\x82'*1000
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8: fix _multibytecodec

2017-12-08 Thread fijal
Author: fijal
Branch: unicode-utf8
Changeset: r93309:affb72fc7cf7
Date: 2017-12-08 10:40 +0200
http://bitbucket.org/pypy/pypy/changeset/affb72fc7cf7/

Log:fix _multibytecodec

diff --git a/pypy/module/_multibytecodec/c_codecs.py 
b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -157,7 +157,7 @@
 replace, end = errorcb(errors, namecb, reason,
stringdata, start, end)
 # 'replace' is RPython unicode here
-lgt, _ = rutf8.check_utf8(replace, True)
+lgt = rutf8.get_utf8_length(replace)
 inbuf = rffi.utf82wcharp(replace, lgt)
 try:
 r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end)
@@ -268,7 +268,7 @@
 rets, end = errorcb(errors, namecb, reason,
 unicodedata, start, end)
 codec = pypy_cjk_enc_getcodec(encodebuf)
-lgt, _ = rutf8.get_utf8_length_flag(rets)
+lgt = rutf8.get_utf8_length(rets)
 replace = encode(codec, rets, lgt, "strict", errorcb, namecb)
 with rffi.scoped_nonmovingbuffer(replace) as inbuf:
 r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end)
diff --git a/pypy/module/_multibytecodec/interp_incremental.py 
b/pypy/module/_multibytecodec/interp_incremental.py
--- a/pypy/module/_multibytecodec/interp_incremental.py
+++ b/pypy/module/_multibytecodec/interp_incremental.py
@@ -66,7 +66,7 @@
 pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf)
 assert 0 <= pos <= len(object)
 self.pending = object[pos:]
-lgt = rutf8.get_utf8_length_flag(output)
+lgt = rutf8.get_utf8_length(output)
 return space.newutf8(output, lgt)
 
 
diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py 
b/pypy/module/_multibytecodec/interp_multibytecodec.py
--- a/pypy/module/_multibytecodec/interp_multibytecodec.py
+++ b/pypy/module/_multibytecodec/interp_multibytecodec.py
@@ -27,8 +27,8 @@
 raise wrap_unicodedecodeerror(space, e, input, self.name)
 except RuntimeError:
 raise wrap_runtimeerror(space)
-lgt, flag = rutf8.check_utf8(utf8_output, True)
-return space.newtuple([space.newutf8(utf8_output, lgt, flag),
+lgt = rutf8.get_utf8_length(utf8_output)
+return space.newtuple([space.newutf8(utf8_output, lgt),
space.newint(len(input))])
 
 @unwrap_spec(errors="text_or_none")
diff --git a/pypy/module/_multibytecodec/test/test_translation.py 
b/pypy/module/_multibytecodec/test/test_translation.py
--- a/pypy/module/_multibytecodec/test/test_translation.py
+++ b/pypy/module/_multibytecodec/test/test_translation.py
@@ -14,7 +14,7 @@
 codecname, string = argv[1], argv[2]
 c = c_codecs.getcodec(codecname)
 u = c_codecs.decode(c, string)
-lgt, _ = rutf8.get_utf8_length_flag(u)
+lgt = rutf8.get_utf8_length(u)
 r = c_codecs.encode(c, u, lgt)
 print r
 return 0
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-re: in-progress

2017-12-08 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-re
Changeset: r93303:0fd38947b59e
Date: 2017-12-08 11:45 +0100
http://bitbucket.org/pypy/pypy/changeset/0fd38947b59e/

Log:in-progress

diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py
--- a/pypy/module/_sre/interp_sre.py
+++ b/pypy/module/_sre/interp_sre.py
@@ -6,9 +6,8 @@
 from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
 from pypy.interpreter.error import OperationError, oefmt
 from rpython.rlib.rarithmetic import intmask
-from rpython.rlib import jit
+from rpython.rlib import jit, rutf8
 from rpython.rlib.rstring import StringBuilder
-from rpython.rlib.rutf8 import Utf8StringBuilder
 
 # 
 #
@@ -110,11 +109,15 @@
 if endpos < pos:
 endpos = pos
 if space.isinstance_w(w_string, space.w_unicode):
-unicodestr = space.unicode_w(w_string)
-if pos > len(unicodestr):
-pos = len(unicodestr)
-if endpos > len(unicodestr):
-endpos = len(unicodestr)
+utf8str, length = space.utf8_len_w(w_string)
+if pos >= length:
+bytepos = len(utf8str)
+else:
+bytepos = rutf8.codepoint_at_index(..)
+
+pos = length
+if endpos >= length:
+endpos = length
 return rsre_core.UnicodeMatchContext(self.code, unicodestr,
  pos, endpos, self.flags)
 elif space.isinstance_w(w_string, space.w_bytes):
diff --git a/pypy/module/_sre/test/test_app_sre.py 
b/pypy/module/_sre/test/test_app_sre.py
--- a/pypy/module/_sre/test/test_app_sre.py
+++ b/pypy/module/_sre/test/test_app_sre.py
@@ -87,6 +87,13 @@
 assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
 assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
 
+def test_findall_unicode(self):
+import re
+assert [u"\u1234"] == re.findall(u"\u1234", u"\u1000\u1234\u2000")
+assert ["a", "u"] == re.findall("b(.)", "abalbus")
+assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus")
+assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs")
+
 def test_finditer(self):
 import re
 it = re.finditer("b(.)", "brabbel")
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit


[pypy-commit] pypy unicode-utf8-re: hg merge unicode-utf8

2017-12-08 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-re
Changeset: r93304:be4b4c164598
Date: 2017-12-08 11:46 +0100
http://bitbucket.org/pypy/pypy/changeset/be4b4c164598/

Log:hg merge unicode-utf8

diff too long, truncating to 2000 out of 3797 lines

diff --git a/TODO b/TODO
--- a/TODO
+++ b/TODO
@@ -9,5 +9,6 @@
 * remove assertions from W_UnicodeObject.__init__ if all the builders pass
 * what to do with error handlers that go backwards. There were tests
   in test_codecs that would check for that
+* improve performance of splitlines
 
 * fix _pypyjson to not use a wrapped dict when decoding an object
diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py
--- a/extra_tests/test_textio.py
+++ b/extra_tests/test_textio.py
@@ -1,28 +1,48 @@
 from hypothesis import given, strategies as st
 
 from io import BytesIO, TextIOWrapper
+import os
 
-LINESEP = ['', '\r', '\n', '\r\n']
+def translate_newlines(text):
+text = text.replace('\r\n', '\n')
+text = text.replace('\r', '\n')
+return text.replace('\n', os.linesep)
 
 @st.composite
-def text_with_newlines(draw):
-sep = draw(st.sampled_from(LINESEP))
-lines = draw(st.lists(st.text(max_size=10), max_size=10))
-return sep.join(lines)
+def st_readline_universal(
+draw, st_nlines=st.integers(min_value=0, max_value=10)):
+n_lines = draw(st_nlines)
+lines = draw(st.lists(
+st.text(st.characters(blacklist_characters='\r\n')),
+min_size=n_lines, max_size=n_lines))
+limits = []
+for line in lines:
+limit = draw(st.integers(min_value=0, max_value=len(line) + 5))
+limits.append(limit)
+limits.append(-1)
+endings = draw(st.lists(
+st.sampled_from(['\n', '\r', '\r\n']),
+min_size=n_lines, max_size=n_lines))
+return (
+''.join(line + ending for line, ending in zip(lines, endings)),
+limits)
 
-@given(txt=text_with_newlines(),
-   mode=st.sampled_from(['\r', '\n', '\r\n', '']),
-   limit=st.integers(min_value=-1))
-def test_readline(txt, mode, limit):
+@given(data=st_readline_universal(),
+   mode=st.sampled_from(['\r', '\n', '\r\n', '', None]))
+def test_readline(data, mode):
+txt, limits = data
 textio = TextIOWrapper(
-BytesIO(txt.encode('utf-8')), encoding='utf-8', newline=mode)
+BytesIO(txt.encode('utf-8', 'surrogatepass')),
+encoding='utf-8', errors='surrogatepass', newline=mode)
 lines = []
-while True:
+for limit in limits:
 line = textio.readline(limit)
-if limit > 0:
-assert len(line) < limit
+if limit >= 0:
+assert len(line) <= limit
 if line:
 lines.append(line)
-else:
+elif limit:
 break
-assert u''.join(lines) == txt
+if mode is None:
+txt = translate_newlines(txt)
+assert txt.startswith(u''.join(lines))
diff --git a/lib_pypy/resource.py b/lib_pypy/resource.py
--- a/lib_pypy/resource.py
+++ b/lib_pypy/resource.py
@@ -20,6 +20,7 @@
 or via the attributes ru_utime, ru_stime, ru_maxrss, and so on."""
 
 __metaclass__ = _structseq.structseqtype
+name = "resource.struct_rusage"
 
 ru_utime = _structseq.structseqfield(0,"user time used")
 ru_stime = _structseq.structseqfield(1,"system time used")
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -26,3 +26,6 @@
 
 .. branch: fix-vmprof-stacklet-switch
 Fix a vmprof+continulets (i.e. greenelts, eventlet, gevent, ...)
+
+.. branch: win32-vcvars
+
diff --git a/pypy/doc/windows.rst b/pypy/doc/windows.rst
--- a/pypy/doc/windows.rst
+++ b/pypy/doc/windows.rst
@@ -25,8 +25,10 @@
 
 This compiler, while the standard one for Python 2.7, is deprecated. Microsoft 
has
 made it available as the `Microsoft Visual C++ Compiler for Python 2.7`_ (the 
link
-was checked in Nov 2016). Note that the compiler suite will be installed in
-``C:\Users\<user>\AppData\Local\Programs\Common\Microsoft\Visual C++ for 
Python``.
+was checked in Nov 2016). Note that the compiler suite may be installed in
+``C:\Users\<user>\AppData\Local\Programs\Common\Microsoft\Visual C++ for 
Python``
+or in
+``C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python``.
 A current version of ``setuptools`` will be able to find it there. For
 Windows 10, you must right-click the download, and under ``Properties`` ->
 ``Compatibility`` mark it as ``Run run this program in comatibility mode for``
@@ -41,7 +43,6 @@
 ---
 
 We routinely test translation using v9, also known as Visual Studio 2008.
-Our buildbot is still using the Express Edition, not the compiler noted above.
 Other configurations may work as well.
 
 The translation scripts will set up the appropriate environment variables
@@ -81,6 +82,30 @@
 
 .. _build instructions: http://pypy.org/download.html#building-from-source
 
+Setting Up Visual Studio 

[pypy-commit] pypy unicode-utf8-re: in-progress

2017-12-08 Thread arigo
Author: Armin Rigo 
Branch: unicode-utf8-re
Changeset: r93302:cb5b89596a2f
Date: 2017-12-08 11:44 +0100
http://bitbucket.org/pypy/pypy/changeset/cb5b89596a2f/

Log:in-progress

diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py
--- a/rpython/rlib/rsre/rsre_core.py
+++ b/rpython/rlib/rsre/rsre_core.py
@@ -142,6 +142,7 @@
 # Utf8MatchContext.  The non-utf8 implementation is provided
 # by the FixedMatchContext abstract subclass, in order to use
 # the same @not_rpython safety trick as above.
+ZERO = 0
 @not_rpython
 def next(self, position):
 raise NotImplementedError
@@ -221,9 +222,8 @@
 
 class FixedMatchContext(AbstractMatchContext):
 """Abstract subclass to introduce the default implementation for
-these position methods.  The Utf8 subclass doesn't inherit from here."""
-
-ZERO = 0
+these position methods.  The Utf8MatchContext subclass doesn't
+inherit from here."""
 
 def next(self, position):
 return position + 1
diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py
new file mode 100644
--- /dev/null
+++ b/rpython/rlib/rsre/rsre_utf8.py
@@ -0,0 +1,59 @@
+from rpython.rlib.debug import check_nonneg
+from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString
+from rpython.rlib.rsre import rsre_char
+from rpython.rlib import rutf8
+
+
+class Utf8MatchContext(AbstractMatchContext):
+
+def __init__(self, pattern, utf8string, match_start, end, flags):
+AbstractMatchContext.__init__(self, pattern, match_start, end, flags)
+self._utf8 = utf8string
+
+def str(self, index):
+check_nonneg(index)
+return rutf8.codepoint_at_pos(self._utf8, index)
+
+def lowstr(self, index):
+c = self.str(index)
+return rsre_char.getlower(c, self.flags)
+
+def get_single_byte(self, base_position, index):
+return self.str(base_position + index)
+
+def fresh_copy(self, start):
+return Utf8MatchContext(self.pattern, self._utf8, start,
+self.end, self.flags)
+
+def next(self, position):
+return rutf8.next_codepoint_pos(self._utf8, position)
+
+def prev(self, position):
+if position <= 0:
+raise EndOfString
+upos = r_uint(position)
+upos = rutf8.prev_codepoint_pos(self._utf8, upos)
+position = intmask(upos)
+assert position >= 0
+return position
+
+def next_n(self, position, n, end_position):
+for i in range(n):
+if position >= end_position:
+raise EndOfString
+position = rutf8.next_codepoint_pos(self._utf8, position)
+return position
+
+    def prev_n(self, position, n, start_position):  # move back n codepoints
+        upos = r_uint(position)
+        for i in range(n):  # one codepoint backwards per iteration
+            if upos <= r_uint(start_position):
+                raise EndOfString  # cannot go before start_position
+            upos = rutf8.prev_codepoint_pos(self._utf8, upos)
+        position = intmask(upos)
+        assert position >= 0
+        return position
+
+def slowly_convert_byte_pos_to_index(self, position):
+
diff --git a/rpython/rlib/rsre/test/test_search.py 
b/rpython/rlib/rsre/test/test_search.py
--- a/rpython/rlib/rsre/test/test_search.py
+++ b/rpython/rlib/rsre/test/test_search.py
@@ -1,7 +1,7 @@
 import re, py
 from rpython.rlib.rsre.test.test_match import get_code, get_code_and_re
 from rpython.rlib.rsre.test import support
-from rpython.rlib.rsre import rsre_core
+from rpython.rlib.rsre import rsre_core, rsre_utf8
 
 
 class BaseTestSearch:
@@ -222,3 +222,8 @@
 search = staticmethod(rsre_core.search)
 match = staticmethod(rsre_core.match)
 Position = staticmethod(lambda n: n)
+
+class TestSearchUtf8(BaseTestSearch):
+search = staticmethod(rsre_utf8.utf8search)
+match = staticmethod(rsre_utf8.utf8match)
+Position = staticmethod(lambda n: n)   # NB. only for plain ascii
___
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit