[pypy-commit] pypy strbuf-as-buffer: merged default

plan_rich Tue, 13 Dec 2016 03:30:07 -0800

Author: Richard Plangger <[email protected]>
Branch: strbuf-as-buffer
Changeset: r89038:b6a80f1a44e0
Date: 2016-12-13 11:45 +0100
http://bitbucket.org/pypy/pypy/changeset/b6a80f1a44e0/


Log:    merged default

diff --git a/pypy/interpreter/test/test_unicodehelper.py 
b/pypy/interpreter/test/test_unicodehelper.py
new file mode 100644
--- /dev/null
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -0,0 +1,26 @@
+from pypy.interpreter.unicodehelper import encode_utf8, decode_utf8
+
+class FakeSpace:
+    pass
+
+def test_encode_utf8():
+    space = FakeSpace()
+    assert encode_utf8(space, u"abc") == "abc"
+    assert encode_utf8(space, u"\u1234") == "\xe1\x88\xb4"
+    assert encode_utf8(space, u"\ud800") == "\xed\xa0\x80"
+    assert encode_utf8(space, u"\udc00") == "\xed\xb0\x80"
+    # for the following test, go to lengths to avoid CPython's optimizer
+    # and .pyc file storage, which collapse the two surrogates into one
+    c = u"\udc00"
+    assert encode_utf8(space, u"\ud800" + c) == "\xf0\x90\x80\x80"
+
+def test_decode_utf8():
+    space = FakeSpace()
+    assert decode_utf8(space, "abc") == u"abc"
+    assert decode_utf8(space, "\xe1\x88\xb4") == u"\u1234"
+    assert decode_utf8(space, "\xed\xa0\x80") == u"\ud800"
+    assert decode_utf8(space, "\xed\xb0\x80") == u"\udc00"
+    got = decode_utf8(space, "\xed\xa0\x80\xed\xb0\x80")
+    assert map(ord, got) == [0xd800, 0xdc00]
+    got = decode_utf8(space, "\xf0\x90\x80\x80")
+    assert map(ord, got) == [0x10000]
diff --git a/pypy/interpreter/unicodehelper.py 
b/pypy/interpreter/unicodehelper.py
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -51,6 +51,10 @@
     return result
 
 def decode_utf8(space, string):
+    # Surrogates are accepted and not treated specially at all.
+    # If there happen to be two 3-bytes encoding a pair of surrogates,
+    # you still get two surrogate unicode characters in the result.
+    # These are the Python2 rules; Python3 differs.
     result, consumed = runicode.str_decode_utf_8(
         string, len(string), "strict",
         final=True, errorhandler=decode_error_handler(space),
@@ -59,8 +63,9 @@
 
 def encode_utf8(space, uni):
     # Note that this function never raises UnicodeEncodeError,
-    # since surrogate pairs are allowed.
-    # This is not the case with Python3.
+    # since surrogates are allowed, either paired or lone.
+    # A paired surrogate is considered like the non-BMP character
+    # it stands for.  These are the Python2 rules; Python3 differs.
     return runicode.unicode_encode_utf_8(
         uni, len(uni), "strict",
         errorhandler=raise_unicode_exception_encode,
diff --git a/rpython/jit/backend/ppc/regalloc.py 
b/rpython/jit/backend/ppc/regalloc.py
--- a/rpython/jit/backend/ppc/regalloc.py
+++ b/rpython/jit/backend/ppc/regalloc.py
@@ -1066,7 +1066,6 @@
 
     prepare_cond_call_value_r = prepare_cond_call_value_i
 
-
 def notimplemented(self, op):
     msg = '[PPC/regalloc] %s not implemented\n' % op.getopname()
     if we_are_translated():
diff --git a/rpython/jit/backend/zarch/opassembler.py 
b/rpython/jit/backend/zarch/opassembler.py
--- a/rpython/jit/backend/zarch/opassembler.py
+++ b/rpython/jit/backend/zarch/opassembler.py
@@ -374,10 +374,11 @@
     _COND_CALL_SAVE_REGS = [r.r11, r.r2, r.r3, r.r4, r.r5]
 
     def emit_cond_call(self, op, arglocs, regalloc):
+        resloc = arglocs[0]
+        arglocs = arglocs[1:]
         fcond = self.guard_success_cc
         self.guard_success_cc = c.cond_none
         assert fcond.value != c.cond_none.value
-        fcond = c.negate(fcond)
 
         jmp_adr = self.mc.get_relative_pos()
         self.mc.reserve_cond_jump() # patched later to a relative branch
@@ -411,6 +412,8 @@
         self.mc.BASR(r.r14, r.r14)
         # restoring the registers saved above, and doing pop_gcmap(), is left
         # to the cond_call_slowpath helper.  We never have any result value.
+        if resloc is not None:
+            self.mc.LGR(resloc, r.RES)
         relative_target = self.mc.currpos() - jmp_adr
         pmc = OverwritingBuilder(self.mc, jmp_adr, 1)
         pmc.BRCL(fcond, l.imm(relative_target))
@@ -419,6 +422,9 @@
         # guard_no_exception too
         self.previous_cond_call_jcond = jmp_adr, fcond
 
+    emit_cond_call_value_i = emit_cond_call
+    emit_cond_call_value_r = emit_cond_call
+
 class AllocOpAssembler(object):
     _mixin_ = True
 
diff --git a/rpython/jit/backend/zarch/regalloc.py 
b/rpython/jit/backend/zarch/regalloc.py
--- a/rpython/jit/backend/zarch/regalloc.py
+++ b/rpython/jit/backend/zarch/regalloc.py
@@ -1107,7 +1107,7 @@
 
     def prepare_cond_call(self, op):
         self.load_condition_into_cc(op.getarg(0))
-        locs = []
+        locs = [None]
         # support between 0 and 4 integer arguments
         assert 2 <= op.numargs() <= 2 + 4
         for i in range(1, op.numargs()):
@@ -1116,6 +1116,22 @@
             locs.append(loc)
         return locs
 
+    def prepare_cond_call_value_i(self, op):
+        x = self.ensure_reg(op.getarg(0))
+        self.load_condition_into_cc(op.getarg(0))
+        self.rm.force_allocate_reg(op, selected_reg=x)   # spilled if survives
+        # ^^^ if arg0!=0, we jump over the next block of code (the call)
+        locs = [x]
+        # support between 0 and 4 integer arguments
+        assert 2 <= op.numargs() <= 2 + 4
+        for i in range(1, op.numargs()):
+            loc = self.loc(op.getarg(i))
+            assert loc.type != FLOAT
+            locs.append(loc)
+        return locs     # [res, function, args...]
+
+    prepare_cond_call_value_r = prepare_cond_call_value_i
+
     def prepare_cond_call_gc_wb(self, op):
         arglocs = [self.ensure_reg(op.getarg(0))]
         return arglocs
diff --git a/rpython/jit/codewriter/support.py 
b/rpython/jit/codewriter/support.py
--- a/rpython/jit/codewriter/support.py
+++ b/rpython/jit/codewriter/support.py
@@ -142,10 +142,14 @@
             assert len(lst) == len(args_v), (
                 "not supported so far: 'greens' variables contain Void")
         # a crash here means that you have to reorder the variable named in
-        # the JitDriver.  Indeed, greens and reds must both be sorted: first
-        # all INTs, followed by all REFs, followed by all FLOATs.
+        # the JitDriver.  
         lst2 = sort_vars(lst)
-        assert lst == lst2
+        assert lst == lst2, ("You have to reorder the variables named in "
+            "the JitDriver (both the 'greens' and 'reds' independently). "
+            "They must be sorted like this: first all the integer-like, "
+            "then all the pointer-like, and finally the floats.\n"
+            "Got: %r\n"
+            "Expected: %r" % (lst, lst2))
         return lst
     #
     return (_sort(greens_v, True), _sort(reds_v, False))
diff --git a/rpython/rlib/rposix.py b/rpython/rlib/rposix.py
--- a/rpython/rlib/rposix.py
+++ b/rpython/rlib/rposix.py
@@ -1778,22 +1778,23 @@
     finally:
         lltype.free(l_utsbuf, flavor='raw')
 
-# These are actually macros on some/most systems
-c_makedev = external('makedev', [rffi.INT, rffi.INT], rffi.INT, macro=True)
-c_major = external('major', [rffi.INT], rffi.INT, macro=True)
-c_minor = external('minor', [rffi.INT], rffi.INT, macro=True)
+if sys.platform != 'win32':
+    # These are actually macros on some/most systems
+    c_makedev = external('makedev', [rffi.INT, rffi.INT], rffi.INT, macro=True)
+    c_major = external('major', [rffi.INT], rffi.INT, macro=True)
+    c_minor = external('minor', [rffi.INT], rffi.INT, macro=True)
 
-@replace_os_function('makedev')
-def makedev(maj, min):
-    return c_makedev(maj, min)
+    @replace_os_function('makedev')
+    def makedev(maj, min):
+        return c_makedev(maj, min)
 
-@replace_os_function('major')
-def major(dev):
-    return c_major(dev)
+    @replace_os_function('major')
+    def major(dev):
+        return c_major(dev)
 
-@replace_os_function('minor')
-def minor(dev):
-    return c_minor(dev)
+    @replace_os_function('minor')
+    def minor(dev):
+        return c_minor(dev)
 
 #___________________________________________________________________
 
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -327,6 +327,16 @@
 
 def unicode_encode_utf_8(s, size, errors, errorhandler=None,
                          allow_surrogates=allow_surrogate_by_default):
+    # In this function, allow_surrogates can be:
+    #
+    #  * True:  surrogates are always allowed.  A valid surrogate pair
+    #           is replaced with the non-BMP unicode char it stands for,
+    #           which is then encoded as 4 bytes.
+    #
+    #  * False: surrogates are always forbidden.
+    #
+    # See also unicode_encode_utf8sp().
+    #
     if errorhandler is None:
         errorhandler = default_unicode_error_encode
     return unicode_encode_utf_8_impl(s, size, errors, errorhandler,
@@ -391,6 +401,33 @@
                 _encodeUCS4(result, ch)
     return result.build()
 
+def unicode_encode_utf8sp(s, size):
+    # Surrogate-preserving utf-8 encoding.  Any surrogate character
+    # turns into its 3-bytes encoding, whether it is paired or not.
+    # This should always be reversible, and the reverse is the regular
+    # str_decode_utf_8() with allow_surrogates=True.
+    assert(size >= 0)
+    result = StringBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = ord(s[pos])
+        pos += 1
+        if ch < 0x80:
+            # Encode ASCII
+            result.append(chr(ch))
+        elif ch < 0x0800:
+            # Encode Latin-1
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        elif ch < 0x10000:
+            # Encode UCS2 Unicode ordinals, and surrogates
+            result.append((chr((0xe0 | (ch >> 12)))))
+            result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+            result.append((chr((0x80 | (ch & 0x3f)))))
+        else:
+            _encodeUCS4(result, ch)
+    return result.build()
+
 # ____________________________________________________________
 # utf-16
 
diff --git a/rpython/rlib/test/test_rposix.py b/rpython/rlib/test/test_rposix.py
--- a/rpython/rlib/test/test_rposix.py
+++ b/rpython/rlib/test/test_rposix.py
@@ -281,6 +281,7 @@
     def test_isatty(self):
         assert rposix.isatty(-1) is False
 
+    @py.test.mark.skipif("not hasattr(rposix, 'makedev')")
     def test_makedev(self):
         dev = rposix.makedev(24, 7)
         assert rposix.major(dev) == 24
diff --git a/rpython/rlib/test/test_runicode.py 
b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -812,6 +812,21 @@
         py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None)
         assert encoder(u'u\u1234', 2, 'replace') == 'u?'
 
+    def test_encode_utf8sp(self):
+        # for the following test, go to lengths to avoid CPython's optimizer
+        # and .pyc file storage, which collapse the two surrogates into one
+        c = u"\udc00"
+        for input, expected in [
+                (u"", ""),
+                (u"abc", "abc"),
+                (u"\u1234", "\xe1\x88\xb4"),
+                (u"\ud800", "\xed\xa0\x80"),
+                (u"\udc00", "\xed\xb0\x80"),
+                (u"\ud800" + c, "\xed\xa0\x80\xed\xb0\x80"),
+            ]:
+            got = runicode.unicode_encode_utf8sp(input, len(input))
+            assert got == expected
+
 
 class TestTranslation(object):
     def setup_class(cls):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy strbuf-as-buffer: merged default

Reply via email to