[pypy-commit] pypy unicode-utf8-test: fix space.newunicode
Author: Ronan LamyBranch: unicode-utf8-test Changeset: r93323:9fe5f582087d Date: 2017-12-08 13:37 + http://bitbucket.org/pypy/pypy/changeset/9fe5f582087d/ Log:fix space.newunicode diff --git a/pypy/objspace/std/objspace.py b/pypy/objspace/std/objspace.py --- a/pypy/objspace/std/objspace.py +++ b/pypy/objspace/std/objspace.py @@ -375,8 +375,8 @@ # XXX: kill me! assert isinstance(unistr, unicode) utf8s = unistr.encode("utf-8") -length, flag = rutf8.check_utf8(utf8s, True) -return self.newutf8(utf8s, length, flag) +length = rutf8.check_utf8(utf8s, True) +return self.newutf8(utf8s, length) def type(self, w_obj): jit.promote(w_obj.__class__) ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy py3.6: hg merge py3.5 (+ fixes)
Author: Manuel JacobBranch: py3.6 Changeset: r93320:f04d4604c7e3 Date: 2017-12-09 03:14 +0100 http://bitbucket.org/pypy/pypy/changeset/f04d4604c7e3/ Log:hg merge py3.5 (+ fixes) I'm not 100% sure about the merge in test_dis.py, but most of the tests are failing anyway. diff too long, truncating to 2000 out of 12565 lines diff --git a/.hgignore b/.hgignore --- a/.hgignore +++ b/.hgignore @@ -59,6 +59,7 @@ ^rpython/rlib/rvmprof/src/shared/libbacktrace/config.h$ ^rpython/rlib/rvmprof/src/shared/libbacktrace/config.log$ ^rpython/rlib/rvmprof/src/shared/libbacktrace/config.status$ +^pypy/tool/dest$ ^pypy/goal/pypy-translation-snapshot$ ^pypy/goal/pypy-c ^pypy/goal/pypy3-c diff --git a/_pytest/terminal.py b/_pytest/terminal.py --- a/_pytest/terminal.py +++ b/_pytest/terminal.py @@ -366,11 +366,11 @@ EXIT_OK, EXIT_TESTSFAILED, EXIT_INTERRUPTED, EXIT_USAGEERROR, EXIT_NOTESTSCOLLECTED) if exitstatus in summary_exit_codes: -self.config.hook.pytest_terminal_summary(terminalreporter=self) self.summary_errors() self.summary_failures() self.summary_warnings() self.summary_passes() +self.config.hook.pytest_terminal_summary(terminalreporter=self) if exitstatus == EXIT_INTERRUPTED: self._report_keyboardinterrupt() del self._keyboardinterrupt_memo diff --git a/extra_tests/requirements.txt b/extra_tests/requirements.txt new file mode 100644 --- /dev/null +++ b/extra_tests/requirements.txt @@ -0,0 +1,2 @@ +pytest +hypothesis diff --git a/extra_tests/test_bytes.py b/extra_tests/test_bytes.py new file mode 100644 --- /dev/null +++ b/extra_tests/test_bytes.py @@ -0,0 +1,84 @@ +from hypothesis import strategies as st +from hypothesis import given, example + +st_bytestring = st.binary() | st.binary().map(bytearray) + +@given(st_bytestring, st_bytestring, st_bytestring) +def test_find(u, prefix, suffix): +s = prefix + u + suffix +assert 0 <= s.find(u) <= len(prefix) +assert s.find(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st_bytestring, st_bytestring, st_bytestring) 
+def test_index(u, prefix, suffix): +s = prefix + u + suffix +assert 0 <= s.index(u) <= len(prefix) +assert s.index(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st_bytestring, st_bytestring, st_bytestring) +def test_rfind(u, prefix, suffix): +s = prefix + u + suffix +assert s.rfind(u) >= len(prefix) +assert s.rfind(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +@given(st_bytestring, st_bytestring, st_bytestring) +def test_rindex(u, prefix, suffix): +s = prefix + u + suffix +assert s.rindex(u) >= len(prefix) +assert s.rindex(u, len(prefix), len(s) - len(suffix)) == len(prefix) + +def adjust_indices(u, start, end): +if end < 0: +end = max(end + len(u), 0) +else: +end = min(end, len(u)) +if start < 0: +start = max(start + len(u), 0) +return start, end + +@given(st_bytestring, st_bytestring) +def test_startswith_basic(u, v): +assert u.startswith(v) is (u[:len(v)] == v) + +@example(b'x', b'', 1) +@example(b'x', b'', 2) +@given(st_bytestring, st_bytestring, st.integers()) +def test_startswith_start(u, v, start): +expected = u[start:].startswith(v) if v else (start <= len(u)) +assert u.startswith(v, start) is expected + +@example(b'x', b'', 1, 0) +@example(b'xx', b'', -1, 0) +@given(st_bytestring, st_bytestring, st.integers(), st.integers()) +def test_startswith_3(u, v, start, end): +if v: +expected = u[start:end].startswith(v) +else: # CPython leaks implementation details in this case +start0, end0 = adjust_indices(u, start, end) +expected = start0 <= len(u) and start0 <= end0 +assert u.startswith(v, start, end) is expected + +@given(st_bytestring, st_bytestring) +def test_endswith_basic(u, v): +if len(v) > len(u): +assert u.endswith(v) is False +else: +assert u.endswith(v) is (u[len(u) - len(v):] == v) + +@example(b'x', b'', 1) +@example(b'x', b'', 2) +@given(st_bytestring, st_bytestring, st.integers()) +def test_endswith_2(u, v, start): +expected = u[start:].endswith(v) if v else (start <= len(u)) +assert u.endswith(v, start) is expected + 
+@example(b'x', b'', 1, 0) +@example(b'xx', b'', -1, 0) +@given(st_bytestring, st_bytestring, st.integers(), st.integers()) +def test_endswith_3(u, v, start, end): +if v: +expected = u[start:end].endswith(v) +else: # CPython leaks implementation details in this case +start0, end0 = adjust_indices(u, start, end) +expected = start0 <= len(u) and start0 <= end0 +assert u.endswith(v, start, end) is expected diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py new file mode 100644 --- /dev/null +++ b/extra_tests/test_textio.py @@ -0,0 +1,48 @@ +from hypothesis import given,
[pypy-commit] pypy unicode-utf8-test: hg merge unicode-utf8
Author: Ronan LamyBranch: unicode-utf8-test Changeset: r93322:33d09fc56c08 Date: 2017-12-08 13:28 + http://bitbucket.org/pypy/pypy/changeset/33d09fc56c08/ Log:hg merge unicode-utf8 diff too long, truncating to 2000 out of 3186 lines diff --git a/TODO b/TODO --- a/TODO +++ b/TODO @@ -9,5 +9,6 @@ * remove assertions from W_UnicodeObject.__init__ if all the builders pass * what to do with error handlers that go backwards. There were tests in test_codecs that would check for that +* improve performance of splitlines * fix _pypyjson to not use a wrapped dict when decoding an object diff --git a/pypy/interpreter/baseobjspace.py b/pypy/interpreter/baseobjspace.py --- a/pypy/interpreter/baseobjspace.py +++ b/pypy/interpreter/baseobjspace.py @@ -1087,8 +1087,11 @@ def newlist_utf8(self, list_u, is_ascii): l_w = [None] * len(list_u) for i, item in enumerate(list_u): -length, flag = rutf8.check_utf8(item, True) -l_w[i] = self.newutf8(item, length, flag) +if not is_ascii: +length = rutf8.check_utf8(item, True) +else: +length = len(item) +l_w[i] = self.newutf8(item, length) return self.newlist(l_w) def newlist_int(self, list_i): diff --git a/pypy/interpreter/pyparser/parsestring.py b/pypy/interpreter/pyparser/parsestring.py --- a/pypy/interpreter/pyparser/parsestring.py +++ b/pypy/interpreter/pyparser/parsestring.py @@ -64,8 +64,8 @@ r = unicodehelper.decode_raw_unicode_escape(space, substr) else: r = unicodehelper.decode_unicode_escape(space, substr) -v, length, flag = r -return space.newutf8(v, length, flag) +v, length = r +return space.newutf8(v, length) need_encoding = (encoding is not None and encoding != "utf-8" and encoding != "utf8" and @@ -74,8 +74,8 @@ substr = s[ps : q] if rawmode or '\\' not in s[ps:]: if need_encoding: -lgt, flag = unicodehelper.check_utf8_or_raise(space, substr) -w_u = space.newutf8(substr, lgt, flag) +lgt = unicodehelper.check_utf8_or_raise(space, substr) +w_u = space.newutf8(substr, lgt) w_v = unicodehelper.encode(space, w_u, encoding) return 
w_v else: @@ -234,8 +234,8 @@ p = ps while p < end and ord(s[p]) & 0x80: p += 1 -lgt, flag = unicodehelper.check_utf8_or_raise(space, s, ps, p) -w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt, flag), +lgt = unicodehelper.check_utf8_or_raise(space, s, ps, p) +w_v = unicodehelper.encode(space, space.newutf8(s[ps:p], lgt), recode_encoding) v = space.bytes_w(w_v) return v, p diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -10,13 +10,13 @@ return str_decode_utf8(u, True, "strict", None) def test_decode_utf8(): -assert decode_utf8("abc") == ("abc", 3, 3, rutf8.FLAG_ASCII) -assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1, rutf8.FLAG_REGULAR) -assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES) -assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1, rutf8.FLAG_HAS_SURROGATES) +assert decode_utf8("abc") == ("abc", 3, 3) +assert decode_utf8("\xe1\x88\xb4") == ("\xe1\x88\xb4", 3, 1) +assert decode_utf8("\xed\xa0\x80") == ("\xed\xa0\x80", 3, 1) +assert decode_utf8("\xed\xb0\x80") == ("\xed\xb0\x80", 3, 1) assert decode_utf8("\xed\xa0\x80\xed\xb0\x80") == ( -"\xed\xa0\x80\xed\xb0\x80", 6, 2, rutf8.FLAG_HAS_SURROGATES) -assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1, rutf8.FLAG_REGULAR) +"\xed\xa0\x80\xed\xb0\x80", 6, 2) +assert decode_utf8("\xf0\x90\x80\x80") == ("\xf0\x90\x80\x80", 4, 1) def test_utf8_encode_ascii(): assert utf8_encode_ascii("abc", "??", "??") == "abc" @@ -41,19 +41,19 @@ assert utf8_encode_ascii(u.encode("utf8"), "replace", eh) == u.encode("ascii", "replace") def test_str_decode_ascii(): -assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3, rutf8.FLAG_ASCII) +assert str_decode_ascii("abc", "??", True, "??") == ("abc", 3, 3) def eh(errors, encoding, reason, p, start, end): lst.append((errors, encoding, p, 
start, end)) return u"\u1234\u5678".encode("utf8"), end lst = [] input = "\xe8" exp = u"\u1234\u5678".encode("utf8") -assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2, rutf8.FLAG_REGULAR) +assert str_decode_ascii(input, "??", True, eh) == (exp, 1, 2) assert lst == [("??", "ascii", input, 0, 1)] lst = [] input = "\xe8\xe9abc\xea\xeb" assert str_decode_ascii(input, "??",
[pypy-commit] pypy unicode-utf8-test: hg merge unicode-utf8
Author: Ronan LamyBranch: unicode-utf8-test Changeset: r93324:e6db8eec731a Date: 2017-12-09 02:46 + http://bitbucket.org/pypy/pypy/changeset/e6db8eec731a/ Log:hg merge unicode-utf8 diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -1,3 +1,4 @@ +import pytest from hypothesis import given, strategies from rpython.rlib import rutf8 @@ -5,6 +6,7 @@ from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii from pypy.interpreter import unicodehelper as uh +from pypy.module._codecs.interp_codecs import CodecState def decode_utf8(u): return str_decode_utf8(u, True, "strict", None) @@ -68,3 +70,16 @@ def test_unicode_escape(u): r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None) assert r == u.encode("unicode-escape") + +def test_encode_decimal(space): +assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 ' +with pytest.raises(ValueError): +uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None) +state = space.fromcache(CodecState) +handler = state.encode_error_handler +assert uh.unicode_encode_decimal( +u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v' + +result = uh.unicode_encode_decimal( +u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler) +assert result == '12' diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -7,6 +7,7 @@ from rpython.rlib.rstring import StringBuilder from rpython.rtyper.lltypesystem import rffi from pypy.module._codecs import interp_codecs +from pypy.module.unicodedata import unicodedb @specialize.memo() def decode_error_handler(space): @@ -35,6 +36,16 @@ space.newtext(msg)])) return raise_unicode_exception_encode +def default_error_encode( +errors, 
encoding, msg, u, startingpos, endingpos): +"""A default handler, for tests""" +assert endingpos >= 0 +if errors == 'replace': +return '?', endingpos +if errors == 'ignore': +return '', endingpos +raise ValueError + def convert_arg_to_w_unicode(space, w_arg, strict=None): return space.convert_arg_to_w_unicode(w_arg) @@ -1458,3 +1469,70 @@ pos = rutf8.next_codepoint_pos(s, pos) return result.build() +# +# Decimal Encoder +def unicode_encode_decimal(s, errors, errorhandler=None): +"""Converts whitespace to ' ', decimal characters to their +corresponding ASCII digit and all other Latin-1 characters except +\0 as-is. Characters outside this range (Unicode ordinals 1-256) +are treated as errors. This includes embedded NULL bytes. +""" +if errorhandler is None: +errorhandler = default_error_encode +result = StringBuilder(len(s)) +pos = 0 +i = 0 +it = rutf8.Utf8StringIterator(s) +for ch in it: +if unicodedb.isspace(ch): +result.append(' ') +i += 1 +continue +try: +decimal = unicodedb.decimal(ch) +except KeyError: +pass +else: +result.append(chr(48 + decimal)) +i += 1 +continue +if 0 < ch < 256: +result.append(chr(ch)) +i += 1 +continue +# All other characters are considered unencodable +start_index = i +i += 1 +while not it.done(): +ch = rutf8.codepoint_at_pos(s, it.get_pos()) +try: +if (0 < ch < 256 or unicodedb.isspace(ch) or +unicodedb.decimal(ch) >= 0): +break +except KeyError: +# not a decimal +pass +if it.done(): +break +ch = next(it) +i += 1 +end_index = i +msg = "invalid decimal Unicode string" +r, pos = errorhandler( +errors, 'decimal', msg, s, start_index, end_index) +for ch in rutf8.Utf8StringIterator(r): +if unicodedb.isspace(ch): +result.append(' ') +continue +try: +decimal = unicodedb.decimal(ch) +except KeyError: +pass +else: +result.append(chr(48 + decimal)) +continue +if 0 < ch < 256: +result.append(chr(ch)) +continue +errorhandler('strict', 'decimal', msg, s, start_index, end_index) +return result.build() diff --git 
a/pypy/module/_pypyjson/interp_decoder.py
[pypy-commit] pypy unicode-utf8: translation fixes
Author: Ronan LamyBranch: unicode-utf8 Changeset: r93321:598f10607a50 Date: 2017-12-09 02:44 + http://bitbucket.org/pypy/pypy/changeset/598f10607a50/ Log:translation fixes diff --git a/pypy/module/_pypyjson/interp_decoder.py b/pypy/module/_pypyjson/interp_decoder.py --- a/pypy/module/_pypyjson/interp_decoder.py +++ b/pypy/module/_pypyjson/interp_decoder.py @@ -3,6 +3,7 @@ from rpython.rlib.objectmodel import specialize, always_inline, r_dict from rpython.rlib import rfloat, runicode, rutf8 from rpython.rtyper.lltypesystem import lltype, rffi +from rpython.rlib.rarithmetic import r_uint from pypy.interpreter.error import oefmt from pypy.interpreter import unicodehelper @@ -366,7 +367,7 @@ return # help the annotator to know that we'll never go beyond # this point # -utf8_ch = rutf8.unichr_as_utf8(val, allow_surrogates=True) +utf8_ch = rutf8.unichr_as_utf8(r_uint(val), allow_surrogates=True) builder.append(utf8_ch) return i @@ -400,7 +401,7 @@ break elif ch == '\\' or ch < '\x20': self.pos = i-1 -return self.space.unicode_w(self.decode_string_escaped(start)) +return self.decode_string_escaped(start) strhash = intmask((103 * strhash) ^ ord(ll_chars[i])) bits |= ord(ch) length = i - start - 1 diff --git a/pypy/module/_rawffi/alt/type_converter.py b/pypy/module/_rawffi/alt/type_converter.py --- a/pypy/module/_rawffi/alt/type_converter.py +++ b/pypy/module/_rawffi/alt/type_converter.py @@ -128,7 +128,7 @@ intval: lltype.Signed """ self.error(w_ffitype, w_obj) - + def handle_unichar(self, w_ffitype, w_obj, intval): """ intval: lltype.Signed @@ -174,7 +174,7 @@ def handle_struct_rawffi(self, w_ffitype, w_structinstance): """ This method should be killed as soon as we remove support for _rawffi structures - + w_structinstance: W_StructureInstance """ self.error(w_ffitype, w_structinstance) @@ -228,7 +228,7 @@ return space.newbytes(chr(ucharval)) elif w_ffitype.is_unichar(): wcharval = self.get_unichar(w_ffitype) -return space.newutf8(rutf8.unichr_as_utf8(wcharval), 1) 
+return space.newutf8(rutf8.unichr_as_utf8(r_uint(wcharval)), 1) elif w_ffitype.is_double(): return self._float(w_ffitype) elif w_ffitype.is_singlefloat(): @@ -349,7 +349,7 @@ def get_struct_rawffi(self, w_ffitype, w_structdescr): """ This should be killed as soon as we kill support for _rawffi structures - + Return type: lltype.Unsigned (the address of the structure) """ diff --git a/pypy/module/_rawffi/interp_rawffi.py b/pypy/module/_rawffi/interp_rawffi.py --- a/pypy/module/_rawffi/interp_rawffi.py +++ b/pypy/module/_rawffi/interp_rawffi.py @@ -596,9 +596,9 @@ return space.w_None wcharp_addr = rffi.cast(rffi.CWCHARP, address) if maxlength == -1: -s = rffi.wcharp2utf8(wcharp_addr) +s = rffi.wcharp2unicode(wcharp_addr) else: -s = rffi.wcharpsize2utf8(wcharp_addr, maxlength) +s = rffi.wcharp2unicoden(wcharp_addr, maxlength) return space.newunicode(s) @unwrap_spec(address=r_uint, maxlength=int) diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py --- a/pypy/module/array/interp_array.py +++ b/pypy/module/array/interp_array.py @@ -1,7 +1,7 @@ from rpython.rlib import jit, rgc, rutf8 from rpython.rlib.buffer import RawBuffer from rpython.rlib.objectmodel import keepalive_until_here -from rpython.rlib.rarithmetic import ovfcheck, widen +from rpython.rlib.rarithmetic import ovfcheck, widen, r_uint from rpython.rlib.unroll import unrolling_iterable from rpython.rtyper.annlowlevel import llstr from rpython.rtyper.lltypesystem import lltype, rffi @@ -1013,7 +1013,7 @@ elif mytype.typecode == 'c': return space.newbytes(item) elif mytype.typecode == 'u': -code = ord(item) +code = r_uint(ord(item)) return space.newutf8(rutf8.unichr_as_utf8(code), 1) assert 0, "unreachable" diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py --- a/pypy/module/pyexpat/interp_pyexpat.py +++ b/pypy/module/pyexpat/interp_pyexpat.py @@ -483,7 +483,7 @@ except rutf8.CheckError: from pypy.interpreter import unicodehelper # get the 
correct error msg -unicodehelper.str_decode_utf8(s, len(s), 'string', True, +unicodehelper.str_decode_utf8(s, 'string', True, unicodehelper.decode_error_handler(space)) assert False, "always raises"
[pypy-commit] pypy unicode-utf8: Add utf8-based replacement for runicode.unicode_encode_decimal() to unicodehelper and fix PyUnicode_EncodeDecimal()
Author: Ronan LamyBranch: unicode-utf8 Changeset: r93319:ac75e33e51bb Date: 2017-12-09 01:36 + http://bitbucket.org/pypy/pypy/changeset/ac75e33e51bb/ Log:Add utf8-based replacement for runicode.unicode_encode_decimal() to unicodehelper and fix PyUnicode_EncodeDecimal() diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py --- a/pypy/interpreter/test/test_unicodehelper.py +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -1,3 +1,4 @@ +import pytest from hypothesis import given, strategies from rpython.rlib import rutf8 @@ -5,6 +6,7 @@ from pypy.interpreter.unicodehelper import str_decode_utf8 from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii from pypy.interpreter import unicodehelper as uh +from pypy.module._codecs.interp_codecs import CodecState def decode_utf8(u): return str_decode_utf8(u, True, "strict", None) @@ -68,3 +70,16 @@ def test_unicode_escape(u): r = uh.utf8_encode_unicode_escape(u.encode("utf8"), "strict", None) assert r == u.encode("unicode-escape") + +def test_encode_decimal(space): +assert uh.unicode_encode_decimal(u' 12, 34 ', None) == ' 12, 34 ' +with pytest.raises(ValueError): +uh.unicode_encode_decimal(u' 12, \u1234 '.encode('utf8'), None) +state = space.fromcache(CodecState) +handler = state.encode_error_handler +assert uh.unicode_encode_decimal( +u'u\u1234\u1235v'.encode('utf8'), 'replace', handler) == 'u??v' + +result = uh.unicode_encode_decimal( +u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler) +assert result == '12' diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -7,6 +7,7 @@ from rpython.rlib.rstring import StringBuilder from rpython.rtyper.lltypesystem import rffi from pypy.module._codecs import interp_codecs +from pypy.module.unicodedata import unicodedb @specialize.memo() def decode_error_handler(space): @@ -35,6 +36,16 @@ 
space.newtext(msg)])) return raise_unicode_exception_encode +def default_error_encode( +errors, encoding, msg, u, startingpos, endingpos): +"""A default handler, for tests""" +assert endingpos >= 0 +if errors == 'replace': +return '?', endingpos +if errors == 'ignore': +return '', endingpos +raise ValueError + def convert_arg_to_w_unicode(space, w_arg, strict=None): return space.convert_arg_to_w_unicode(w_arg) @@ -1458,3 +1469,70 @@ pos = rutf8.next_codepoint_pos(s, pos) return result.build() +# +# Decimal Encoder +def unicode_encode_decimal(s, errors, errorhandler=None): +"""Converts whitespace to ' ', decimal characters to their +corresponding ASCII digit and all other Latin-1 characters except +\0 as-is. Characters outside this range (Unicode ordinals 1-256) +are treated as errors. This includes embedded NULL bytes. +""" +if errorhandler is None: +errorhandler = default_error_encode +result = StringBuilder(len(s)) +pos = 0 +i = 0 +it = rutf8.Utf8StringIterator(s) +for ch in it: +if unicodedb.isspace(ch): +result.append(' ') +i += 1 +continue +try: +decimal = unicodedb.decimal(ch) +except KeyError: +pass +else: +result.append(chr(48 + decimal)) +i += 1 +continue +if 0 < ch < 256: +result.append(chr(ch)) +i += 1 +continue +# All other characters are considered unencodable +start_index = i +i += 1 +while not it.done(): +ch = rutf8.codepoint_at_pos(s, it.get_pos()) +try: +if (0 < ch < 256 or unicodedb.isspace(ch) or +unicodedb.decimal(ch) >= 0): +break +except KeyError: +# not a decimal +pass +if it.done(): +break +ch = next(it) +i += 1 +end_index = i +msg = "invalid decimal Unicode string" +r, pos = errorhandler( +errors, 'decimal', msg, s, start_index, end_index) +for ch in rutf8.Utf8StringIterator(r): +if unicodedb.isspace(ch): +result.append(' ') +continue +try: +decimal = unicodedb.decimal(ch) +except KeyError: +pass +else: +result.append(chr(48 + decimal)) +continue +if 0 < ch < 256: +result.append(chr(ch)) +continue +errorhandler('strict', 'decimal', msg, s, 
start_index,
[pypy-commit] pypy unicode-utf8: Fix PyUnicode_DecodeUTF16/32
Author: Ronan LamyBranch: unicode-utf8 Changeset: r93318:d53d8f486841 Date: 2017-12-08 16:53 + http://bitbucket.org/pypy/pypy/changeset/d53d8f486841/ Log:Fix PyUnicode_DecodeUTF16/32 diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py --- a/pypy/module/cpyext/unicodeobject.py +++ b/pypy/module/cpyext/unicodeobject.py @@ -3,7 +3,8 @@ from rpython.tool.sourcetools import func_renamer from pypy.interpreter.error import OperationError, oefmt -from pypy.interpreter.unicodehelper import wcharpsize2utf8 +from pypy.interpreter.unicodehelper import ( +wcharpsize2utf8, str_decode_utf_16_helper, str_decode_utf_32_helper) from pypy.module.unicodedata import unicodedb from pypy.module.cpyext.api import ( CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api, @@ -568,15 +569,11 @@ else: errors = None -result, length, byteorder = runicode.str_decode_utf_16_helper( -string, size, errors, -True, # final ? false for multiple passes? -None, # errorhandler -byteorder) +result, _, length, byteorder = str_decode_utf_16_helper( +string, errors, final=True, errorhandler=None, byteorder=byteorder) if pbyteorder is not None: pbyteorder[0] = rffi.cast(rffi.INT, byteorder) - -return space.newunicode(result) +return space.newutf8(result, length) @cpython_api([CONST_STRING, Py_ssize_t, CONST_STRING, rffi.INTP], PyObject) def PyUnicode_DecodeUTF32(space, s, size, llerrors, pbyteorder): @@ -624,15 +621,11 @@ else: errors = None -result, length, byteorder = runicode.str_decode_utf_32_helper( -string, size, errors, -True, # final ? false for multiple passes? 
-None, # errorhandler -byteorder) +result, _, length, byteorder = str_decode_utf_32_helper( +string, errors, final=True, errorhandler=None, byteorder=byteorder) if pbyteorder is not None: pbyteorder[0] = rffi.cast(rffi.INT, byteorder) - -return space.newunicode(result) +return space.newutf8(result, length) @cpython_api([rffi.CWCHARP, Py_ssize_t, rffi.CCHARP, CONST_STRING], rffi.INT_real, error=-1) ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8: fixes
Author: Ronan LamyBranch: unicode-utf8 Changeset: r93317:5677dc1909e9 Date: 2017-12-08 14:45 + http://bitbucket.org/pypy/pypy/changeset/5677dc1909e9/ Log:fixes diff --git a/pypy/module/cpyext/longobject.py b/pypy/module/cpyext/longobject.py --- a/pypy/module/cpyext/longobject.py +++ b/pypy/module/cpyext/longobject.py @@ -4,6 +4,7 @@ CONST_STRING, ADDR, CANNOT_FAIL) from pypy.objspace.std.longobject import W_LongObject from pypy.interpreter.error import OperationError +from pypy.interpreter.unicodehelper import wcharpsize2utf8 from pypy.module.cpyext.intobject import PyInt_AsUnsignedLongMask from rpython.rlib.rbigint import rbigint @@ -191,7 +192,7 @@ string, length gives the number of characters, and base is the radix for the conversion. The radix must be in the range [2, 36]; if it is out of range, ValueError will be raised.""" -w_value = space.newunicode(rffi.wcharpsize2unicode(u, length)) +w_value = space.newutf8(wcharpsize2utf8(space, u, length), length) w_base = space.newint(rffi.cast(lltype.Signed, base)) return space.call_function(space.w_long, w_value, w_base) diff --git a/pypy/module/cpyext/object.py b/pypy/module/cpyext/object.py --- a/pypy/module/cpyext/object.py +++ b/pypy/module/cpyext/object.py @@ -246,7 +246,7 @@ the Python expression unicode(o). 
Called by the unicode() built-in function.""" if w_obj is None: -return space.newunicode(u"") +return space.newutf8("", 6) return space.call_function(space.w_unicode, w_obj) @cpython_api([PyObject, PyObject], rffi.INT_real, error=-1) @@ -302,7 +302,7 @@ if opid == Py_EQ: return 1 if opid == Py_NE: -return 0 +return 0 w_res = PyObject_RichCompare(space, w_o1, w_o2, opid_int) return int(space.is_true(w_res)) diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py --- a/pypy/module/cpyext/unicodeobject.py +++ b/pypy/module/cpyext/unicodeobject.py @@ -710,12 +710,17 @@ """Return 1 if substr matches str[start:end] at the given tail end (direction == -1 means to do a prefix match, direction == 1 a suffix match), 0 otherwise. Return -1 if an error occurred.""" +space.utf8_w(w_str) # type check +space.utf8_w(w_substr) w_start = space.newint(start) w_end = space.newint(end) if rffi.cast(lltype.Signed, direction) <= 0: -return space.call_method(w_str, "startswith", w_substr, w_start, w_end) +w_result = space.call_method( +w_str, "startswith", w_substr, w_start, w_end) else: -return space.call_method(w_str, "endswith", w_substr, w_start, w_end) +w_result = space.call_method( +w_str, "endswith", w_substr, w_start, w_end) +return space.int_w(w_result) @cpython_api([PyObject, PyObject, Py_ssize_t, Py_ssize_t], Py_ssize_t, error=-1) def PyUnicode_Count(space, w_str, w_substr, start, end): ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8: Some unicode>utf8 conversions in cpyext/unicodeobject.py
Author: Ronan LamyBranch: unicode-utf8 Changeset: r93316:8cc0253e1ece Date: 2017-12-08 13:07 + http://bitbucket.org/pypy/pypy/changeset/8cc0253e1ece/ Log:Some unicode>utf8 conversions in cpyext/unicodeobject.py diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -1,10 +1,11 @@ import sys -from pypy.interpreter.error import OperationError +from pypy.interpreter.error import OperationError, oefmt from rpython.rlib.objectmodel import specialize from rpython.rlib import rutf8 from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.rstring import StringBuilder +from rpython.rtyper.lltypesystem import rffi from pypy.module._codecs import interp_codecs @specialize.memo() @@ -204,7 +205,7 @@ if c > 0x7F: errorhandler("strict", 'ascii', 'ordinal not in range(128)', utf8, - pos, pos + 1) + pos, pos + 1) j = rutf8.next_codepoint_pos(r, j) pos = newpos res.append(r) @@ -530,6 +531,19 @@ return builder.build(), pos, outsize +def wcharpsize2utf8(space, wcharp, size): +"""Safe version of rffi.wcharpsize2utf8. + +Raises app-level ValueError if any wchar value is outside the valid +codepoint range. 
+""" +try: +return rffi.wcharpsize2utf8(wcharp, size) +except ValueError: +raise oefmt(space.w_ValueError, +"character is not in range [U+; U+10]") + + # # Raw unicode escape diff --git a/pypy/module/cpyext/unicodeobject.py b/pypy/module/cpyext/unicodeobject.py --- a/pypy/module/cpyext/unicodeobject.py +++ b/pypy/module/cpyext/unicodeobject.py @@ -1,5 +1,9 @@ +from rpython.rtyper.lltypesystem import rffi, lltype +from rpython.rlib import rstring, runicode +from rpython.tool.sourcetools import func_renamer + from pypy.interpreter.error import OperationError, oefmt -from rpython.rtyper.lltypesystem import rffi, lltype +from pypy.interpreter.unicodehelper import wcharpsize2utf8 from pypy.module.unicodedata import unicodedb from pypy.module.cpyext.api import ( CANNOT_FAIL, Py_ssize_t, build_type_checkers_flags, cpython_api, @@ -13,8 +17,6 @@ from pypy.module.sys.interp_encoding import setdefaultencoding from pypy.module._codecs.interp_codecs import CodecState from pypy.objspace.std import unicodeobject -from rpython.rlib import rstring, runicode -from rpython.tool.sourcetools import func_renamer import sys ## See comment in bytesobject.py. @@ -61,10 +63,10 @@ def unicode_attach(space, py_obj, w_obj, w_userdata=None): "Fills a newly allocated PyUnicodeObject with a unicode string" py_unicode = rffi.cast(PyUnicodeObject, py_obj) -s = space.unicode_w(w_obj) -py_unicode.c_length = len(s) +s, length = space.utf8_len_w(w_obj) +py_unicode.c_length = length py_unicode.c_str = lltype.nullptr(rffi.CWCHARP.TO) -py_unicode.c_hash = space.hash_w(space.newunicode(s)) +py_unicode.c_hash = space.hash_w(space.newutf8(s, length)) py_unicode.c_defenc = lltype.nullptr(PyObject.TO) def unicode_realize(space, py_obj): @@ -73,11 +75,12 @@ be modified after this call. 
""" py_uni = rffi.cast(PyUnicodeObject, py_obj) -s = rffi.wcharpsize2unicode(py_uni.c_str, py_uni.c_length) +length = py_uni.c_length +s = wcharpsize2utf8(space, py_uni.c_str, length) w_type = from_ref(space, rffi.cast(PyObject, py_obj.c_ob_type)) w_obj = space.allocate_instance(unicodeobject.W_UnicodeObject, w_type) -w_obj.__init__(s) -py_uni.c_hash = space.hash_w(space.newunicode(s)) +w_obj.__init__(s, length) +py_uni.c_hash = space.hash_w(space.newutf8(s, length)) track_reference(space, py_obj, w_obj) return w_obj @@ -214,8 +217,8 @@ if not ref_unicode.c_str: # Copy unicode buffer w_unicode = from_ref(space, rffi.cast(PyObject, ref)) -u = space.unicode_w(w_unicode) -ref_unicode.c_str = rffi.unicode2wcharp(u) +u, length = space.utf8_len_w(w_unicode) +ref_unicode.c_str = rffi.utf82wcharp(u, length) return ref_unicode.c_str @cpython_api([PyObject], rffi.CWCHARP) @@ -335,8 +338,8 @@ Therefore, modification of the resulting Unicode object is only allowed when u is NULL.""" if wchar_p: -s = rffi.wcharpsize2unicode(wchar_p, length) -return make_ref(space, space.newunicode(s)) +s = wcharpsize2utf8(space, wchar_p, length) +return make_ref(space, space.newutf8(s, length)) else: return rffi.cast(PyObject, new_empty_unicode(space, length)) @@ -506,7 +509,8 @@ """Encode the Py_UNICODE buffer of the given size and return a Python string object. Return
[pypy-commit] pypy unicode-utf8-re: Fix test_search
Author: Armin RigoBranch: unicode-utf8-re Changeset: r93314:80ff594175dc Date: 2017-12-08 12:57 +0100 http://bitbucket.org/pypy/pypy/changeset/80ff594175dc/ Log:Fix test_search diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py --- a/rpython/rlib/rsre/rsre_utf8.py +++ b/rpython/rlib/rsre/rsre_utf8.py @@ -68,23 +68,41 @@ return # end of string is fine assert not (0x80 <= self._utf8[position] < 0xC0) # continuation byte +def maximum_distance(self, position_low, position_high): +# may overestimate if there are non-ascii chars +return position_high - position_low + + +def make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags): +if bytestart < 0: bytestart = 0 +elif bytestart > len(utf8string): bytestart = len(utf8string) +if byteend < 0: byteend = 0 +elif byteend > len(utf8string): byteend = len(utf8string) +ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags) +ctx.debug_check_pos(bytestart) +ctx.debug_check_pos(byteend) +return ctx def utf8search(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0): # bytestart and byteend must be valid byte positions inside the # utf8string. from rpython.rlib.rsre.rsre_core import search_context -assert 0 <= bytestart <= len(utf8string) -assert 0 <= byteend -if byteend > len(utf8string): -byteend = len(utf8string) -ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags) -ctx.debug_check_pos(bytestart) -ctx.debug_check_pos(byteend) +ctx = make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags) if search_context(ctx): return ctx else: return None -def utf8match(*args, **kwds): -NOT_IMPLEMENTED +def utf8match(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0, + fullmatch=False): +# bytestart and byteend must be valid byte positions inside the +# utf8string. 
+from rpython.rlib.rsre.rsre_core import match_context + +ctx = make_utf8_ctx(pattern, utf8string, bytestart, byteend, flags) +ctx.fullmatch_only = fullmatch +if match_context(ctx): +return ctx +else: +return None ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-re: Fix test_match
Author: Armin RigoBranch: unicode-utf8-re Changeset: r93315:e2017b23843a Date: 2017-12-08 13:03 +0100 http://bitbucket.org/pypy/pypy/changeset/e2017b23843a/ Log:Fix test_match diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -207,13 +207,6 @@ return (-1, -1) return (fmarks[groupnum], fmarks[groupnum+1]) -def group(self, groupnum=0): -frm, to = self.span(groupnum) -if 0 <= frm <= to: -return self._string[frm:to] -else: -return None - def fresh_copy(self, start): raise NotImplementedError diff --git a/rpython/rlib/rsre/test/support.py b/rpython/rlib/rsre/test/support.py --- a/rpython/rlib/rsre/test/support.py +++ b/rpython/rlib/rsre/test/support.py @@ -54,12 +54,19 @@ raise EndOfString return Position(r) -def slowly_convert_byte_pos_to_index(self, position): +def _real_pos(self, position): if type(position) is int and position == -1: return -1 assert isinstance(position, Position) return position._p +def group(self, groupnum=0): +frm, to = self.span(groupnum) +if self.ZERO <= frm <= to: +return self._string[self._real_pos(frm):self._real_pos(to)] +else: +return None + def str(self, position): assert isinstance(position, Position) return ord(self._string[position._p]) diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py --- a/rpython/rlib/rsre/test/test_match.py +++ b/rpython/rlib/rsre/test/test_match.py @@ -1,7 +1,7 @@ import re, random, py from rpython.rlib.rsre import rsre_char from rpython.rlib.rsre.rpy import get_code, VERSION -from rpython.rlib.rsre.test.support import match, fullmatch, Position +from rpython.rlib.rsre.test.support import match, fullmatch, Position as P def get_code_and_re(regexp): @@ -51,20 +51,20 @@ def test_assert(self): r = get_code(r"abc(?=def)(.)") res = match(r, "abcdefghi") -assert res is not None and res.get_mark(1) == 4 +assert res is not None and res.get_mark(1) == P(4) assert not match(r, 
"abcdeFghi") def test_assert_not(self): r = get_code(r"abc(?!def)(.)") res = match(r, "abcdeFghi") -assert res is not None and res.get_mark(1) == 4 +assert res is not None and res.get_mark(1) == P(4) assert not match(r, "abcdefghi") def test_lookbehind(self): r = get_code(r"([a-z]*)(?<=de)") assert match(r, "ade") res = match(r, "adefg") -assert res is not None and res.get_mark(1) == 3 +assert res is not None and res.get_mark(1) == P(3) assert not match(r, "abc") assert not match(r, "X") assert not match(r, "eX") @@ -75,13 +75,13 @@ assert res is not None return res.get_mark(1) r = get_code(r"([a-z]*)(?https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-re: duh
Author: Armin Rigo Branch: unicode-utf8-re Changeset: r93313:68c926785f51 Date: 2017-12-08 12:52 +0100 http://bitbucket.org/pypy/pypy/changeset/68c926785f51/ Log: duh diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py --- a/rpython/rlib/rsre/rsre_utf8.py +++ b/rpython/rlib/rsre/rsre_utf8.py @@ -56,7 +56,7 @@ for i in range(n): if upos <= r_uint(start_position): raise EndOfString -upos = rutf8.next_codepoint_pos(self._utf8, upos) +upos = rutf8.prev_codepoint_pos(self._utf8, upos) position = intmask(upos) assert position >= 0 return position ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-re: Remove slowly_convert_byte_pos_to_index
Author: Armin RigoBranch: unicode-utf8-re Changeset: r93312:b58a53172e21 Date: 2017-12-08 12:44 +0100 http://bitbucket.org/pypy/pypy/changeset/b58a53172e21/ Log:Remove slowly_convert_byte_pos_to_index diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -159,9 +159,6 @@ def prev_n(self, position, n, start_position): raise NotImplementedError @not_rpython -def slowly_convert_byte_pos_to_index(self, position): -raise NotImplementedError -@not_rpython def debug_check_pos(self, position): raise NotImplementedError @not_rpython @@ -178,15 +175,13 @@ raise NotImplementedError def get_mark(self, gid): -mark = find_mark(self.match_marks, gid) -return self.slowly_convert_byte_pos_to_index(mark) +return find_mark(self.match_marks, gid) def flatten_marks(self): # for testing if self.match_marks_flat is None: self._compute_flattened_marks() -return [self.slowly_convert_byte_pos_to_index(i) -for i in self.match_marks_flat] +return self.match_marks_flat def _compute_flattened_marks(self): self.match_marks_flat = [self.match_start, self.match_end] @@ -249,9 +244,6 @@ raise EndOfString return position -def slowly_convert_byte_pos_to_index(self, position): -return position - def debug_check_pos(self, position): pass diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py --- a/rpython/rlib/rsre/rsre_utf8.py +++ b/rpython/rlib/rsre/rsre_utf8.py @@ -3,16 +3,19 @@ from rpython.rlib.rarithmetic import r_uint, intmask from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString from rpython.rlib.rsre import rsre_char +from rpython.rlib.objectmodel import we_are_translated from rpython.rlib import rutf8 class Utf8MatchContext(AbstractMatchContext): +"""A context that matches unicode, but encoded in a utf8 string. +Be careful because most positions taken by, handled in, and returned +by this class are expressed in *bytes*, not in characters. 
+""" -def __init__(self, pattern, utf8string, index_storage, - match_start, end, flags): +def __init__(self, pattern, utf8string, match_start, end, flags): AbstractMatchContext.__init__(self, pattern, match_start, end, flags) self._utf8 = utf8string -self._index_storage = index_storage def str(self, index): check_nonneg(index) @@ -58,16 +61,15 @@ assert position >= 0 return position -def slowly_convert_byte_pos_to_index(self, position): -return rutf8.codepoint_index_at_byte_position( -self._utf8, self._index_storage, position) - def debug_check_pos(self, position): +if we_are_translated(): +return +if position == len(self._utf8): +return # end of string is fine assert not (0x80 <= self._utf8[position] < 0xC0) # continuation byte -def utf8search(pattern, utf8string, index_storage=None, bytestart=0, - byteend=sys.maxint, flags=0): +def utf8search(pattern, utf8string, bytestart=0, byteend=sys.maxint, flags=0): # bytestart and byteend must be valid byte positions inside the # utf8string. from rpython.rlib.rsre.rsre_core import search_context @@ -76,11 +78,9 @@ assert 0 <= byteend if byteend > len(utf8string): byteend = len(utf8string) -if index_storage is None: # should be restricted to tests only -length = rutf8.check_utf8(utf8string, allow_surrogates=True) -index_storage = rutf8.create_utf8_index_storage(utf8string, length) -ctx = Utf8MatchContext(pattern, utf8string, index_storage, - bytestart, byteend, flags) +ctx = Utf8MatchContext(pattern, utf8string, bytestart, byteend, flags) +ctx.debug_check_pos(bytestart) +ctx.debug_check_pos(byteend) if search_context(ctx): return ctx else: diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py --- a/rpython/rlib/rsre/test/test_search.py +++ b/rpython/rlib/rsre/test/test_search.py @@ -12,19 +12,22 @@ assert res is None res = self.search(r_code1, "fooahcdixxx") assert res is not None -assert res.span() == (5, 8) +P = self.P +assert res.span() == (P(5), P(8)) def test_code2(self): r_code2 = 
get_code(r'\s*(.*?)') res = self.search(r_code2, "foo bar abcdef") assert res is not None -assert res.span() == (8, 34) +P = self.P +assert res.span() == (P(8), P(34)) def test_pure_literal(self): r_code3 = get_code(r'foobar') res = self.search(r_code3, "foo bar foobar baz")
[pypy-commit] pypy unicode-utf8-re: in-progress
Author: Armin RigoBranch: unicode-utf8-re Changeset: r93311:336fb075d139 Date: 2017-12-08 12:22 +0100 http://bitbucket.org/pypy/pypy/changeset/336fb075d139/ Log:in-progress diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -13,7 +13,7 @@ # # Constants and exposed functions -from rpython.rlib.rsre import rsre_core +from rpython.rlib.rsre import rsre_core, rsre_utf8 from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, getlower, set_unicode_db @@ -40,7 +40,8 @@ end-start)) if isinstance(ctx, rsre_core.StrMatchContext): return space.newbytes(ctx._string[start:end]) -elif isinstance(ctx, rsre_core.UnicodeMatchContext): +elif isinstance(ctx, rsre_utf8.Utf8MatchContext): +XXX s = ctx._unicodestr[start:end] lgt = rutf8.check_utf8(s, True) return space.newutf8(s, lgt) @@ -103,7 +104,7 @@ raise oefmt(space.w_TypeError, "cannot copy this pattern object") def make_ctx(self, w_string, pos=0, endpos=sys.maxint): -"""Make a StrMatchContext, BufMatchContext or a UnicodeMatchContext for +"""Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for searching in the given w_string object.""" space = self.space if pos < 0: @@ -111,17 +112,26 @@ if endpos < pos: endpos = pos if space.isinstance_w(w_string, space.w_unicode): -utf8str, length = space.utf8_len_w(w_string) -if pos >= length: +# xxx fish for the _index_storage +w_string = space.convert_arg_to_w_unicode(w_string) +utf8str = w_string._utf8 +length = w_string._len() +index_storage = w_string._get_index_storage() +# +if pos <= 0: +bytepos = 0 +elif pos >= length: bytepos = len(utf8str) else: -bytepos = rutf8.codepoint_at_index(..) 
- -pos = length +bytepos = rutf8.codepoint_at_index(utf8str, index_storage, pos) if endpos >= length: -endpos = length -return rsre_core.UnicodeMatchContext(self.code, unicodestr, - pos, endpos, self.flags) +endbytepos = len(utf8str) +else: +endbytepos = rutf8.codepoint_at_index(utf8str, index_storage, + endpos) +return rsre_utf8.Utf8MatchContext( +self.code, unicodestr, index_storage, +bytepos, endbytepos, self.flags) elif space.isinstance_w(w_string, space.w_bytes): str = space.bytes_w(w_string) if pos > len(str): @@ -372,7 +382,8 @@ if isinstance(ctx, rsre_core.StrMatchContext): assert strbuilder is not None return strbuilder.append_slice(ctx._string, start, end) -elif isinstance(ctx, rsre_core.UnicodeMatchContext): +elif isinstance(ctx, rsre_utf8.Utf8MatchContext): +XXX assert unicodebuilder is not None return unicodebuilder.append_slice(ctx._unicodestr, start, end) assert 0, "unreachable" @@ -578,7 +589,8 @@ return space.newbytes(ctx._buffer.as_str()) elif isinstance(ctx, rsre_core.StrMatchContext): return space.newbytes(ctx._string) -elif isinstance(ctx, rsre_core.UnicodeMatchContext): +elif isinstance(ctx, rsre_utf8.Utf8MatchContext): + lgt = rutf8.check_utf8(ctx._unicodestr, True) return space.newutf8(ctx._unicodestr, lgt) else: diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -55,6 +55,8 @@ specific subclass, calling 'func' is a direct call; if 'ctx' is only known to be of class AbstractMatchContext, calling 'func' is an indirect call. 
""" +from rpython.rlib.rsre.rsre_utf8 import Utf8MatchContext + assert func.func_code.co_varnames[0] == 'ctx' specname = '_spec_' + func.func_name while specname in _seen_specname: @@ -65,7 +67,8 @@ specialized_methods = [] for prefix, concreteclass in [('buf', BufMatchContext), ('str', StrMatchContext), - ('uni', UnicodeMatchContext)]: + ('uni', UnicodeMatchContext), + ('utf8', Utf8MatchContext)]: newfunc = func_with_new_name(func, prefix + specname) assert not hasattr(concreteclass, specname) setattr(concreteclass, specname, newfunc)
[pypy-commit] pypy unicode-utf8: whack at _io module
Author: fijal Branch: unicode-utf8 Changeset: r93308:7ffcfc6493e6 Date: 2017-12-08 10:38 +0200 http://bitbucket.org/pypy/pypy/changeset/7ffcfc6493e6/ Log:whack at _io module diff --git a/pypy/module/_io/interp_stringio.py b/pypy/module/_io/interp_stringio.py --- a/pypy/module/_io/interp_stringio.py +++ b/pypy/module/_io/interp_stringio.py @@ -1,3 +1,5 @@ +from rpython.rlib.rutf8 import get_utf8_length + from pypy.interpreter.error import OperationError, oefmt from pypy.interpreter.typedef import ( TypeDef, generic_new_descr, GetSetProperty) @@ -152,7 +154,7 @@ if self.readnl is None: w_readnl = space.w_None else: -w_readnl = space.str(space.new_from_utf8(self.readnl)) # YYY +w_readnl = space.str(space.newutf8(self.readnl, get_utf8_length(self.readnl))) # YYY return space.newtuple([ w_initialval, w_readnl, space.newint(self.buf.pos), w_dict ]) @@ -215,7 +217,8 @@ if self.writenl: w_decoded = space.call_method( w_decoded, "replace", -space.newtext("\n"), space.new_from_utf8(self.writenl)) +space.newtext("\n"), space.newutf8(self.writenl, +get_utf8_length(self.writenl))) string = space.utf8_w(w_decoded) if string: self.buf.write(string) @@ -225,7 +228,9 @@ def read_w(self, space, w_size=None): self._check_closed(space) size = convert_size(space, w_size) -return space.new_from_utf8(self.buf.read(size)) +v = self.buf.read(size) +lgt = get_utf8_length(v) +return space.newutf8(v, lgt) def readline_w(self, space, w_limit=None): self._check_closed(space) @@ -239,7 +244,8 @@ else: newline = self.readnl result = self.buf.readline(newline, limit) -return space.new_from_utf8(result) +resultlen = get_utf8_length(result) +return space.newutf8(result, resultlen) @unwrap_spec(pos=int, mode=int) @@ -276,7 +282,9 @@ def getvalue_w(self, space): self._check_closed(space) -return space.new_from_utf8(self.buf.getvalue()) +v = self.buf.getvalue() +lgt = get_utf8_length(v) +return space.newutf8(v, lgt) def readable_w(self, space): self._check_closed(space) diff --git 
a/pypy/module/_io/interp_textio.py b/pypy/module/_io/interp_textio.py --- a/pypy/module/_io/interp_textio.py +++ b/pypy/module/_io/interp_textio.py @@ -12,7 +12,8 @@ from rpython.rlib.rbigint import rbigint from rpython.rlib.rstring import StringBuilder from rpython.rlib.rutf8 import (check_utf8, next_codepoint_pos, -codepoints_in_utf8) +codepoints_in_utf8, get_utf8_length, +Utf8StringBuilder) STATE_ZERO, STATE_OK, STATE_DETACHED = range(3) @@ -684,13 +685,15 @@ w_bytes = space.call_method(self.w_buffer, "read") w_decoded = space.call_method(self.w_decoder, "decode", w_bytes, space.w_True) check_decoded(space, w_decoded) -w_result = space.new_from_utf8(self.decoded.get_chars(-1)) +chars = self.decoded.get_chars(-1) +lgt = get_utf8_length(chars) +w_result = space.newutf8(chars, lgt) w_final = space.add(w_result, w_decoded) self.snapshot = None return w_final remaining = size -builder = StringBuilder(size) +builder = Utf8StringBuilder(size) # Keep reading chunks until we have n characters to return while remaining > 0: @@ -700,7 +703,7 @@ builder.append(data) remaining -= len(data) -return space.new_from_utf8(builder.build()) +return space.newutf8(builder.build(), builder.get_length()) def _scan_line_ending(self, limit): if self.readuniversal: @@ -725,6 +728,7 @@ limit = convert_size(space, w_limit) remnant = None builder = StringBuilder() +# XXX maybe use Utf8StringBuilder instead? while True: # First, get some data if necessary has_data = self._ensure_data(space) @@ -771,7 +775,8 @@ self.decoded.reset() result = builder.build() -return space.new_from_utf8(result) +lgt = get_utf8_length(result) +return space.newutf8(result, lgt) # _ # write methods @@ -794,8 +799,8 @@ if text.find('\n') >= 0: haslf = True if haslf and self.writetranslate and self.writenl: -w_text = space.call_method(w_text, "replace", space.new_from_utf8('\n'), - space.new_from_utf8(self.writenl)) +w_text = space.call_method(w_text, "replace",
[pypy-commit] pypy unicode-utf8: kill dead code
Author: fijal Branch: unicode-utf8 Changeset: r93306:eb61e553bfd4 Date: 2017-12-07 18:07 +0200 http://bitbucket.org/pypy/pypy/changeset/eb61e553bfd4/ Log:kill dead code diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -466,18 +466,6 @@ if rutf8.has_surrogates(utf8): utf8 = rutf8.reencode_utf8_with_surrogates(utf8) return space.newtuple([space.newbytes(utf8), space.newint(lgt)]) -#@unwrap_spec(uni=unicode, errors='text_or_none') -#def utf_8_encode(space, uni, errors="strict"): -#if errors is None: -#errors = 'strict' -#state = space.fromcache(CodecState) -## NB. can't call unicode_encode_utf_8() directly because that's -## an @elidable function nowadays. Instead, we need the _impl(). -## (The problem is the errorhandler, which calls arbitrary Python.) -#result = runicode.unicode_encode_utf_8_impl( -#uni, len(uni), errors, state.encode_error_handler, -#allow_surrogates=True) -#return space.newtuple([space.newbytes(result), space.newint(len(uni))]) @unwrap_spec(string='bufferstr', errors='text_or_none', w_final = WrappedDefault(False)) ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8: more fixes
Author: fijal Branch: unicode-utf8 Changeset: r93310:e4ed73204961 Date: 2017-12-08 10:50 +0200 http://bitbucket.org/pypy/pypy/changeset/e4ed73204961/ Log: more fixes diff --git a/pypy/module/array/interp_array.py b/pypy/module/array/interp_array.py --- a/pypy/module/array/interp_array.py +++ b/pypy/module/array/interp_array.py @@ -451,7 +451,7 @@ """ if self.typecode == 'u': buf = rffi.cast(UNICODE_ARRAY, self._buffer_as_unsigned()) -return space.newutf8(rffi.wcharpsize2unicode(buf, self.len)) +return space.newutf8(rffi.wcharpsize2utf8(buf, self.len), self.len) else: raise oefmt(space.w_ValueError, "tounicode() may only be called on type 'u' arrays") ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8: fix _codecs
Author: fijal Branch: unicode-utf8 Changeset: r93307:bf4ecad403eb Date: 2017-12-08 10:19 +0200 http://bitbucket.org/pypy/pypy/changeset/bf4ecad403eb/ Log: fix _codecs diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -575,8 +575,8 @@ digits = 4 if s[pos] == 'u' else 8 message = "truncated \\u" pos += 1 -pos, _, _ = hexescape(result, s, pos, digits, -"rawunicodeescape", errorhandler, message, errors) +pos, _ = hexescape(result, s, pos, digits, + "rawunicodeescape", errorhandler, message, errors) r = result.build() lgt = rutf8.check_utf8(r, True) ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8: whack the slowpath too
Author: fijal Branch: unicode-utf8 Changeset: r93305:a50930e1db6b Date: 2017-12-07 18:07 +0200 http://bitbucket.org/pypy/pypy/changeset/a50930e1db6b/ Log:whack the slowpath too diff --git a/pypy/module/_codecs/interp_codecs.py b/pypy/module/_codecs/interp_codecs.py --- a/pypy/module/_codecs/interp_codecs.py +++ b/pypy/module/_codecs/interp_codecs.py @@ -460,10 +460,12 @@ # utf-8 functions are not regular, because we have to pass # "allow_surrogates=True" -@unwrap_spec(utf8='utf8', errors='text_or_none') -def utf_8_encode(space, utf8, errors="strict"): -length, _ = rutf8.check_utf8(utf8, allow_surrogates=True) -return space.newtuple([space.newbytes(utf8), space.newint(length)]) +@unwrap_spec(errors='text_or_none') +def utf_8_encode(space, w_obj, errors="strict"): +utf8, lgt = space.utf8_len_w(w_obj) +if rutf8.has_surrogates(utf8): +utf8 = rutf8.reencode_utf8_with_surrogates(utf8) +return space.newtuple([space.newbytes(utf8), space.newint(lgt)]) #@unwrap_spec(uni=unicode, errors='text_or_none') #def utf_8_encode(space, uni, errors="strict"): #if errors is None: diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py --- a/pypy/objspace/std/test/test_unicodeobject.py +++ b/pypy/objspace/std/test/test_unicodeobject.py @@ -741,6 +741,8 @@ assert u'\u20ac'.encode('utf-8') == '\xe2\x82\xac' assert u'\ud800\udc02'.encode('utf-8') == '\xf0\x90\x80\x82' assert u'\ud84d\udc56'.encode('utf-8') == '\xf0\xa3\x91\x96' +assert u'\ud800\udc02'.encode('uTf-8') == '\xf0\x90\x80\x82' +assert u'\ud84d\udc56'.encode('Utf8') == '\xf0\xa3\x91\x96' assert u'\ud800'.encode('utf-8') == '\xed\xa0\x80' assert u'\udc00'.encode('utf-8') == '\xed\xb0\x80' assert (u'\ud800\udc02'*1000).encode('utf-8') == '\xf0\x90\x80\x82'*1000 ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8: fix _multibytecodec
Author: fijal Branch: unicode-utf8 Changeset: r93309:affb72fc7cf7 Date: 2017-12-08 10:40 +0200 http://bitbucket.org/pypy/pypy/changeset/affb72fc7cf7/ Log:fix _multibytecodec diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py --- a/pypy/module/_multibytecodec/c_codecs.py +++ b/pypy/module/_multibytecodec/c_codecs.py @@ -157,7 +157,7 @@ replace, end = errorcb(errors, namecb, reason, stringdata, start, end) # 'replace' is RPython unicode here -lgt, _ = rutf8.check_utf8(replace, True) +lgt = rutf8.get_utf8_length(replace) inbuf = rffi.utf82wcharp(replace, lgt) try: r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, lgt, end) @@ -268,7 +268,7 @@ rets, end = errorcb(errors, namecb, reason, unicodedata, start, end) codec = pypy_cjk_enc_getcodec(encodebuf) -lgt, _ = rutf8.get_utf8_length_flag(rets) +lgt = rutf8.get_utf8_length(rets) replace = encode(codec, rets, lgt, "strict", errorcb, namecb) with rffi.scoped_nonmovingbuffer(replace) as inbuf: r = pypy_cjk_enc_replace_on_error(encodebuf, inbuf, len(replace), end) diff --git a/pypy/module/_multibytecodec/interp_incremental.py b/pypy/module/_multibytecodec/interp_incremental.py --- a/pypy/module/_multibytecodec/interp_incremental.py +++ b/pypy/module/_multibytecodec/interp_incremental.py @@ -66,7 +66,7 @@ pos = c_codecs.pypy_cjk_dec_inbuf_consumed(self.decodebuf) assert 0 <= pos <= len(object) self.pending = object[pos:] -lgt = rutf8.get_utf8_length_flag(output) +lgt = rutf8.get_utf8_length(output) return space.newutf8(output, lgt) diff --git a/pypy/module/_multibytecodec/interp_multibytecodec.py b/pypy/module/_multibytecodec/interp_multibytecodec.py --- a/pypy/module/_multibytecodec/interp_multibytecodec.py +++ b/pypy/module/_multibytecodec/interp_multibytecodec.py @@ -27,8 +27,8 @@ raise wrap_unicodedecodeerror(space, e, input, self.name) except RuntimeError: raise wrap_runtimeerror(space) -lgt, flag = rutf8.check_utf8(utf8_output, True) -return 
space.newtuple([space.newutf8(utf8_output, lgt, flag), +lgt = rutf8.get_utf8_length(utf8_output) +return space.newtuple([space.newutf8(utf8_output, lgt), space.newint(len(input))]) @unwrap_spec(errors="text_or_none") diff --git a/pypy/module/_multibytecodec/test/test_translation.py b/pypy/module/_multibytecodec/test/test_translation.py --- a/pypy/module/_multibytecodec/test/test_translation.py +++ b/pypy/module/_multibytecodec/test/test_translation.py @@ -14,7 +14,7 @@ codecname, string = argv[1], argv[2] c = c_codecs.getcodec(codecname) u = c_codecs.decode(c, string) -lgt, _ = rutf8.get_utf8_length_flag(u) +lgt = rutf8.get_utf8_length(u) r = c_codecs.encode(c, u, lgt) print r return 0 ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-re: in-progress
Author: Armin RigoBranch: unicode-utf8-re Changeset: r93303:0fd38947b59e Date: 2017-12-08 11:45 +0100 http://bitbucket.org/pypy/pypy/changeset/0fd38947b59e/ Log:in-progress diff --git a/pypy/module/_sre/interp_sre.py b/pypy/module/_sre/interp_sre.py --- a/pypy/module/_sre/interp_sre.py +++ b/pypy/module/_sre/interp_sre.py @@ -6,9 +6,8 @@ from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault from pypy.interpreter.error import OperationError, oefmt from rpython.rlib.rarithmetic import intmask -from rpython.rlib import jit +from rpython.rlib import jit, rutf8 from rpython.rlib.rstring import StringBuilder -from rpython.rlib.rutf8 import Utf8StringBuilder # # @@ -110,11 +109,15 @@ if endpos < pos: endpos = pos if space.isinstance_w(w_string, space.w_unicode): -unicodestr = space.unicode_w(w_string) -if pos > len(unicodestr): -pos = len(unicodestr) -if endpos > len(unicodestr): -endpos = len(unicodestr) +utf8str, length = space.utf8_len_w(w_string) +if pos >= length: +bytepos = len(utf8str) +else: +bytepos = rutf8.codepoint_at_index(..) 
+ +pos = length +if endpos >= length: +endpos = length return rsre_core.UnicodeMatchContext(self.code, unicodestr, pos, endpos, self.flags) elif space.isinstance_w(w_string, space.w_bytes): diff --git a/pypy/module/_sre/test/test_app_sre.py b/pypy/module/_sre/test/test_app_sre.py --- a/pypy/module/_sre/test/test_app_sre.py +++ b/pypy/module/_sre/test/test_app_sre.py @@ -87,6 +87,13 @@ assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus") assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs") +def test_findall_unicode(self): +import re +assert [u"\u1234"] == re.findall(u"\u1234", u"\u1000\u1234\u2000") +assert ["a", "u"] == re.findall("b(.)", "abalbus") +assert [("a", "l"), ("u", "s")] == re.findall("b(.)(.)", "abalbus") +assert [("a", ""), ("s", "s")] == re.findall("b(a|(s))", "babs") + def test_finditer(self): import re it = re.finditer("b(.)", "brabbel") ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit
[pypy-commit] pypy unicode-utf8-re: hg merge unicode-utf8
Author: Armin RigoBranch: unicode-utf8-re Changeset: r93304:be4b4c164598 Date: 2017-12-08 11:46 +0100 http://bitbucket.org/pypy/pypy/changeset/be4b4c164598/ Log:hg merge unicode-utf8 diff too long, truncating to 2000 out of 3797 lines diff --git a/TODO b/TODO --- a/TODO +++ b/TODO @@ -9,5 +9,6 @@ * remove assertions from W_UnicodeObject.__init__ if all the builders pass * what to do with error handlers that go backwards. There were tests in test_codecs that would check for that +* improve performance of splitlines * fix _pypyjson to not use a wrapped dict when decoding an object diff --git a/extra_tests/test_textio.py b/extra_tests/test_textio.py --- a/extra_tests/test_textio.py +++ b/extra_tests/test_textio.py @@ -1,28 +1,48 @@ from hypothesis import given, strategies as st from io import BytesIO, TextIOWrapper +import os -LINESEP = ['', '\r', '\n', '\r\n'] +def translate_newlines(text): +text = text.replace('\r\n', '\n') +text = text.replace('\r', '\n') +return text.replace('\n', os.linesep) @st.composite -def text_with_newlines(draw): -sep = draw(st.sampled_from(LINESEP)) -lines = draw(st.lists(st.text(max_size=10), max_size=10)) -return sep.join(lines) +def st_readline_universal( +draw, st_nlines=st.integers(min_value=0, max_value=10)): +n_lines = draw(st_nlines) +lines = draw(st.lists( +st.text(st.characters(blacklist_characters='\r\n')), +min_size=n_lines, max_size=n_lines)) +limits = [] +for line in lines: +limit = draw(st.integers(min_value=0, max_value=len(line) + 5)) +limits.append(limit) +limits.append(-1) +endings = draw(st.lists( +st.sampled_from(['\n', '\r', '\r\n']), +min_size=n_lines, max_size=n_lines)) +return ( +''.join(line + ending for line, ending in zip(lines, endings)), +limits) -@given(txt=text_with_newlines(), - mode=st.sampled_from(['\r', '\n', '\r\n', '']), - limit=st.integers(min_value=-1)) -def test_readline(txt, mode, limit): +@given(data=st_readline_universal(), + mode=st.sampled_from(['\r', '\n', '\r\n', '', None])) +def 
test_readline(data, mode): +txt, limits = data textio = TextIOWrapper( -BytesIO(txt.encode('utf-8')), encoding='utf-8', newline=mode) +BytesIO(txt.encode('utf-8', 'surrogatepass')), +encoding='utf-8', errors='surrogatepass', newline=mode) lines = [] -while True: +for limit in limits: line = textio.readline(limit) -if limit > 0: -assert len(line) < limit +if limit >= 0: +assert len(line) <= limit if line: lines.append(line) -else: +elif limit: break -assert u''.join(lines) == txt +if mode is None: +txt = translate_newlines(txt) +assert txt.startswith(u''.join(lines)) diff --git a/lib_pypy/resource.py b/lib_pypy/resource.py --- a/lib_pypy/resource.py +++ b/lib_pypy/resource.py @@ -20,6 +20,7 @@ or via the attributes ru_utime, ru_stime, ru_maxrss, and so on.""" __metaclass__ = _structseq.structseqtype +name = "resource.struct_rusage" ru_utime = _structseq.structseqfield(0,"user time used") ru_stime = _structseq.structseqfield(1,"system time used") diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst --- a/pypy/doc/whatsnew-head.rst +++ b/pypy/doc/whatsnew-head.rst @@ -26,3 +26,6 @@ .. branch: fix-vmprof-stacklet-switch Fix a vmprof+continulets (i.e. greenelts, eventlet, gevent, ...) + +.. branch: win32-vcvars + diff --git a/pypy/doc/windows.rst b/pypy/doc/windows.rst --- a/pypy/doc/windows.rst +++ b/pypy/doc/windows.rst @@ -25,8 +25,10 @@ This compiler, while the standard one for Python 2.7, is deprecated. Microsoft has made it available as the `Microsoft Visual C++ Compiler for Python 2.7`_ (the link -was checked in Nov 2016). Note that the compiler suite will be installed in -``C:\Users\\AppData\Local\Programs\Common\Microsoft\Visual C++ for Python``. +was checked in Nov 2016). Note that the compiler suite may be installed in +``C:\Users\\AppData\Local\Programs\Common\Microsoft\Visual C++ for Python`` +or in +``C:\Program Files (x86)\Common Files\Microsoft\Visual C++ for Python``. A current version of ``setuptools`` will be able to find it there. 
For Windows 10, you must right-click the download, and under ``Properties`` -> ``Compatibility`` mark it as ``Run run this program in comatibility mode for`` @@ -41,7 +43,6 @@ --- We routinely test translation using v9, also known as Visual Studio 2008. -Our buildbot is still using the Express Edition, not the compiler noted above. Other configurations may work as well. The translation scripts will set up the appropriate environment variables @@ -81,6 +82,30 @@ .. _build instructions: http://pypy.org/download.html#building-from-source +Setting Up Visual Studio
[pypy-commit] pypy unicode-utf8-re: in-progress
Author: Armin RigoBranch: unicode-utf8-re Changeset: r93302:cb5b89596a2f Date: 2017-12-08 11:44 +0100 http://bitbucket.org/pypy/pypy/changeset/cb5b89596a2f/ Log:in-progress diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -142,6 +142,7 @@ # Utf8MatchContext. The non-utf8 implementation is provided # by the FixedMatchContext abstract subclass, in order to use # the same @not_rpython safety trick as above. +ZERO = 0 @not_rpython def next(self, position): raise NotImplementedError @@ -221,9 +222,8 @@ class FixedMatchContext(AbstractMatchContext): """Abstract subclass to introduce the default implementation for -these position methods. The Utf8 subclass doesn't inherit from here.""" - -ZERO = 0 +these position methods. The Utf8MatchContext subclass doesn't +inherit from here.""" def next(self, position): return position + 1 diff --git a/rpython/rlib/rsre/rsre_utf8.py b/rpython/rlib/rsre/rsre_utf8.py new file mode 100644 --- /dev/null +++ b/rpython/rlib/rsre/rsre_utf8.py @@ -0,0 +1,59 @@ +from rpython.rlib.debug import check_nonneg +from rpython.rlib.rarithmetic import r_uint, intmask +from rpython.rlib.rsre.rsre_core import AbstractMatchContext, EndOfString +from rpython.rlib.rsre import rsre_char +from rpython.rlib import rutf8 + + +class Utf8MatchContext(AbstractMatchContext): + +def __init__(self, pattern, utf8string, match_start, end, flags): +AbstractMatchContext.__init__(self, pattern, match_start, end, flags) +self._utf8 = utf8string + +def str(self, index): +check_nonneg(index) +return rutf8.codepoint_at_pos(self._utf8, index) + +def lowstr(self, index): +c = self.str(index) +return rsre_char.getlower(c, self.flags) + +def get_single_byte(self, base_position, index): +return self.str(base_position + index) + +def fresh_copy(self, start): +return Utf8MatchContext(self.pattern, self._utf8, start, +self.end, self.flags) + +def next(self, position): +return 
rutf8.next_codepoint_pos(self._utf8, position) + +def prev(self, position): +if position <= 0: +raise EndOfString +upos = r_uint(position) +upos = rutf8.prev_codepoint_pos(self._utf8, upos) +position = intmask(upos) +assert position >= 0 +return position + +def next_n(self, position, n, end_position): +for i in range(n): +if position >= end_position: +raise EndOfString +position = rutf8.next_codepoint_pos(self._utf8, position) +return position + +def prev_n(self, position, n, start_position): +upos = r_uint(position) +for i in range(n): +if upos <= r_uint(start_position): +raise EndOfString +upos = rutf8.next_codepoint_pos(self._utf8, upos) +position = intmask(upos) +assert position >= 0 +return position + +def slowly_convert_byte_pos_to_index(self, position): + diff --git a/rpython/rlib/rsre/test/test_search.py b/rpython/rlib/rsre/test/test_search.py --- a/rpython/rlib/rsre/test/test_search.py +++ b/rpython/rlib/rsre/test/test_search.py @@ -1,7 +1,7 @@ import re, py from rpython.rlib.rsre.test.test_match import get_code, get_code_and_re from rpython.rlib.rsre.test import support -from rpython.rlib.rsre import rsre_core +from rpython.rlib.rsre import rsre_core, rsre_utf8 class BaseTestSearch: @@ -222,3 +222,8 @@ search = staticmethod(rsre_core.search) match = staticmethod(rsre_core.match) Position = staticmethod(lambda n: n) + +class TestSearchUtf8(BaseTestSearch): +search = staticmethod(rsre_utf8.utf8search) +match = staticmethod(rsre_utf8.utf8match) +Position = staticmethod(lambda n: n) # NB. only for plain ascii ___ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit