Author: Richard Plangger <planri...@gmail.com> Branch: strbuf-as-buffer Changeset: r89038:b6a80f1a44e0 Date: 2016-12-13 11:45 +0100 http://bitbucket.org/pypy/pypy/changeset/b6a80f1a44e0/
Log: merged default diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py new file mode 100644 --- /dev/null +++ b/pypy/interpreter/test/test_unicodehelper.py @@ -0,0 +1,26 @@ +from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8 + +class FakeSpace: + pass + +def test_encode_utf8(): + space = FakeSpace() + assert encode_utf8(space, u"abc") == "abc" + assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4" + assert encode_utf8(space, u"\ud800") == "\xed\xa0\x80" + assert encode_utf8(space, u"\udc00") == "\xed\xb0\x80" + # for the following test, go to lengths to avoid CPython's optimizer + # and .pyc file storage, which collapse the two surrogates into one + c = u"\udc00" + assert encode_utf8(space, u"\ud800" + c) == "\xf0\x90\x80\x80" + +def test_decode_utf8(): + space = FakeSpace() + assert decode_utf8(space, "abc") == u"abc" + assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234" + assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800" + assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00" + got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80") + assert map(ord, got) == [0xd800, 0xdc00] + got = decode_utf8(space, "\xf0\x90\x80\x80") + assert map(ord, got) == [0x10000] diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py --- a/pypy/interpreter/unicodehelper.py +++ b/pypy/interpreter/unicodehelper.py @@ -51,6 +51,10 @@ return result def decode_utf8(space, string): + # Surrogates are accepted and not treated specially at all. + # If there happen to be two 3-bytes encoding a pair of surrogates, + # you still get two surrogate unicode characters in the result. + # These are the Python2 rules; Python3 differs. result, consumed = runicode.str_decode_utf_8( string, len(string), "strict", final=True, errorhandler=decode_error_handler(space), @@ -59,8 +63,9 @@ def encode_utf8(space, uni): # Note that this function never raises UnicodeEncodeError, - # since surrogate pairs are allowed. - # This is not the case with Python3. + # since surrogates are allowed, either paired or lone. + # A paired surrogate is considered like the non-BMP character + # it stands for. These are the Python2 rules; Python3 differs. return runicode.unicode_encode_utf_8( uni, len(uni), "strict", errorhandler=raise_unicode_exception_encode, diff --git a/rpython/jit/backend/ppc/regalloc.py b/rpython/jit/backend/ppc/regalloc.py --- a/rpython/jit/backend/ppc/regalloc.py +++ b/rpython/jit/backend/ppc/regalloc.py @@ -1066,7 +1066,6 @@ prepare_cond_call_value_r = prepare_cond_call_value_i - def notimplemented(self, op): msg = '[PPC/regalloc] %s not implemented\n' % op.getopname() if we_are_translated(): diff --git a/rpython/jit/backend/zarch/opassembler.py b/rpython/jit/backend/zarch/opassembler.py --- a/rpython/jit/backend/zarch/opassembler.py +++ b/rpython/jit/backend/zarch/opassembler.py @@ -374,10 +374,11 @@ _COND_CALL_SAVE_REGS = [r.r11, r.r2, r.r3, r.r4, r.r5] def emit_cond_call(self, op, arglocs, regalloc): + resloc = arglocs[0] + arglocs = arglocs[1:] fcond = self.guard_success_cc self.guard_success_cc = c.cond_none assert fcond.value != c.cond_none.value - fcond = c.negate(fcond) jmp_adr = self.mc.get_relative_pos() self.mc.reserve_cond_jump() # patched later to a relative branch @@ -411,6 +412,8 @@ self.mc.BASR(r.r14, r.r14) # restoring the registers saved above, and doing pop_gcmap(), is left # to the cond_call_slowpath helper. We never have any result value. + if resloc is not None: + self.mc.LGR(resloc, r.RES) relative_target = self.mc.currpos() - jmp_adr pmc = OverwritingBuilder(self.mc, jmp_adr, 1) pmc.BRCL(fcond, l.imm(relative_target)) @@ -419,6 +422,9 @@ # guard_no_exception too self.previous_cond_call_jcond = jmp_adr, fcond + emit_cond_call_value_i = emit_cond_call + emit_cond_call_value_r = emit_cond_call + class AllocOpAssembler(object): _mixin_ = True diff --git a/rpython/jit/backend/zarch/regalloc.py b/rpython/jit/backend/zarch/regalloc.py --- a/rpython/jit/backend/zarch/regalloc.py +++ b/rpython/jit/backend/zarch/regalloc.py @@ -1107,7 +1107,7 @@ def prepare_cond_call(self, op): self.load_condition_into_cc(op.getarg(0)) - locs = [] + locs = [None] # support between 0 and 4 integer arguments assert 2 <= op.numargs() <= 2 + 4 for i in range(1, op.numargs()): @@ -1116,6 +1116,22 @@ locs.append(loc) return locs + def prepare_cond_call_value_i(self, op): + x = self.ensure_reg(op.getarg(0)) + self.load_condition_into_cc(op.getarg(0)) + self.rm.force_allocate_reg(op, selected_reg=x) # spilled if survives + # ^^^ if arg0!=0, we jump over the next block of code (the call) + locs = [x] + # support between 0 and 4 integer arguments + assert 2 <= op.numargs() <= 2 + 4 + for i in range(1, op.numargs()): + loc = self.loc(op.getarg(i)) + assert loc.type != FLOAT + locs.append(loc) + return locs # [res, function, args...] + + prepare_cond_call_value_r = prepare_cond_call_value_i + def prepare_cond_call_gc_wb(self, op): arglocs = [self.ensure_reg(op.getarg(0))] return arglocs diff --git a/rpython/jit/codewriter/support.py b/rpython/jit/codewriter/support.py --- a/rpython/jit/codewriter/support.py +++ b/rpython/jit/codewriter/support.py @@ -142,10 +142,14 @@ assert len(lst) == len(args_v), ( "not supported so far: 'greens' variables contain Void") # a crash here means that you have to reorder the variable named in - # the JitDriver. Indeed, greens and reds must both be sorted: first - # all INTs, followed by all REFs, followed by all FLOATs. + # the JitDriver. lst2 = sort_vars(lst) - assert lst == lst2 + assert lst == lst2, ("You have to reorder the variables named in " + "the JitDriver (both the 'greens' and 'reds' independently). " + "They must be sorted like this: first all the integer-like, " + "then all the pointer-like, and finally the floats.\n" + "Got: %r\n" + "Expected: %r" % (lst, lst2)) return lst # return (_sort(greens_v, True), _sort(reds_v, False)) diff --git a/rpython/rlib/rposix.py b/rpython/rlib/rposix.py --- a/rpython/rlib/rposix.py +++ b/rpython/rlib/rposix.py @@ -1778,22 +1778,23 @@ finally: lltype.free(l_utsbuf, flavor='raw') -# These are actually macros on some/most systems -c_makedev = external('makedev', [rffi.INT, rffi.INT], rffi.INT, macro=True) -c_major = external('major', [rffi.INT], rffi.INT, macro=True) -c_minor = external('minor', [rffi.INT], rffi.INT, macro=True) +if sys.platform != 'win32': + # These are actually macros on some/most systems + c_makedev = external('makedev', [rffi.INT, rffi.INT], rffi.INT, macro=True) + c_major = external('major', [rffi.INT], rffi.INT, macro=True) + c_minor = external('minor', [rffi.INT], rffi.INT, macro=True) -@replace_os_function('makedev') -def makedev(maj, min): - return c_makedev(maj, min) + @replace_os_function('makedev') + def makedev(maj, min): + return c_makedev(maj, min) -@replace_os_function('major') -def major(dev): - return c_major(dev) + @replace_os_function('major') + def major(dev): + return c_major(dev) -@replace_os_function('minor') -def minor(dev): - return c_minor(dev) + @replace_os_function('minor') + def minor(dev): + return c_minor(dev) #___________________________________________________________________ diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -327,6 +327,16 @@ def unicode_encode_utf_8(s, size, errors, errorhandler=None, allow_surrogates=allow_surrogate_by_default): + # In this function, allow_surrogates can be: + # + # * True: surrogates are always allowed. A valid surrogate pair + # is replaced with the non-BMP unicode char it stands for, + # which is then encoded as 4 bytes. + # + # * False: surrogates are always forbidden. + # + # See also unicode_encode_utf8sp(). + # if errorhandler is None: errorhandler = default_unicode_error_encode return unicode_encode_utf_8_impl(s, size, errors, errorhandler, @@ -391,6 +401,33 @@ _encodeUCS4(result, ch) return result.build() +def unicode_encode_utf8sp(s, size): + # Surrogate-preserving utf-8 encoding. Any surrogate character + # turns into its 3-bytes encoding, whether it is paired or not. + # This should always be reversible, and the reverse is the regular + # str_decode_utf_8() with allow_surrogates=True. + assert(size >= 0) + result = StringBuilder(size) + pos = 0 + while pos < size: + ch = ord(s[pos]) + pos += 1 + if ch < 0x80: + # Encode ASCII + result.append(chr(ch)) + elif ch < 0x0800: + # Encode Latin-1 + result.append(chr((0xc0 | (ch >> 6)))) + result.append(chr((0x80 | (ch & 0x3f)))) + elif ch < 0x10000: + # Encode UCS2 Unicode ordinals, and surrogates + result.append((chr((0xe0 | (ch >> 12))))) + result.append((chr((0x80 | ((ch >> 6) & 0x3f))))) + result.append((chr((0x80 | (ch & 0x3f))))) + else: + _encodeUCS4(result, ch) + return result.build() + # ____________________________________________________________ # utf-16 diff --git a/rpython/rlib/test/test_rposix.py b/rpython/rlib/test/test_rposix.py --- a/rpython/rlib/test/test_rposix.py +++ b/rpython/rlib/test/test_rposix.py @@ -281,6 +281,7 @@ def test_isatty(self): assert rposix.isatty(-1) is False + @py.test.mark.skipif("not hasattr(rposix, 'makedev')") def test_makedev(self): dev = rposix.makedev(24, 7) assert rposix.major(dev) == 24 diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py --- a/rpython/rlib/test/test_runicode.py +++ b/rpython/rlib/test/test_runicode.py @@ -812,6 +812,21 @@ py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None) assert encoder(u'u\u1234', 2, 'replace') == 'u?' + def test_encode_utf8sp(self): + # for the following test, go to lengths to avoid CPython's optimizer + # and .pyc file storage, which collapse the two surrogates into one + c = u"\udc00" + for input, expected in [ + (u"", ""), + (u"abc", "abc"), + (u"\u1234", "\xe1\x88\xb4"), + (u"\ud800", "\xed\xa0\x80"), + (u"\udc00", "\xed\xb0\x80"), + (u"\ud800" + c, "\xed\xa0\x80\xed\xb0\x80"), + ]: + got = runicode.unicode_encode_utf8sp(input, len(input)) + assert got == expected + class TestTranslation(object): def setup_class(cls): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit