Author: Armin Rigo <[email protected]>
Branch: arm64
Changeset: r96934:ecd3b6d0775b
Date: 2019-07-03 10:02 +0000
http://bitbucket.org/pypy/pypy/changeset/ecd3b6d0775b/

Log:    Optimize call_release_gil (and fix a wrong detail I think)

diff --git a/rpython/jit/backend/aarch64/TODO b/rpython/jit/backend/aarch64/TODO
--- a/rpython/jit/backend/aarch64/TODO
+++ b/rpython/jit/backend/aarch64/TODO
@@ -45,3 +45,15 @@
  the future to be a bit shorter.  Rewrite these two places to use the
   proper way instead of a magic "40" (or at least assert that it was
   really 40).
+
+
+* use "CBNZ register, offset" (compare-and-branch-if-not-zero)
+  instead of a CMP+BNE pair.  Same with CBZ instead of CMP+BEQ
+
+
+* when we need to save things on the stack, we typically push two words
+  and pop them later.  It would be cheaper if we reserved two locations
+  in the stack from _call_header, then we could just write there.
+  *OR*
+  maybe it's enough if we use the form "str x0, [sp, #offset]!" which
+  combines in a single instruction the "str" with the change of sp
diff --git a/rpython/jit/backend/aarch64/callbuilder.py b/rpython/jit/backend/aarch64/callbuilder.py
--- a/rpython/jit/backend/aarch64/callbuilder.py
+++ b/rpython/jit/backend/aarch64/callbuilder.py
@@ -138,19 +138,19 @@
     def call_releasegil_addr_and_move_real_arguments(self, fastgil):
         assert self.is_call_release_gil
         assert not self.asm._is_asmgcc()
+        RFASTGILPTR = r.x19    # constant &rpy_fastgil
+        RSHADOWOLD = r.x20     # old value of the shadowstack pointer,
+                               #    which we save here for later comparison
 
-        # Save this thread's shadowstack pointer into r7, for later comparison
         gcrootmap = self.asm.cpu.gc_ll_descr.gcrootmap
         if gcrootmap:
             rst = gcrootmap.get_root_stack_top_addr()
-            self.mc.gen_load_int(r.x19.value, rst)
-            self.mc.LDR_ri(r.x20.value, r.x19.value, 0)
+            self.mc.gen_load_int(r.ip1.value, rst)
+            self.mc.LDR_ri(RSHADOWOLD.value, r.ip1.value, 0)
 
         # change 'rpy_fastgil' to 0 (it should be non-zero right now)
-        self.mc.DMB()
-        self.mc.gen_load_int(r.ip1.value, fastgil)
-        self.mc.MOVZ_r_u16(r.ip0.value, 0, 0)
-        self.mc.STR_ri(r.ip0.value, r.ip1.value, 0)
+        self.mc.gen_load_int(RFASTGILPTR.value, fastgil)
+        self.mc.STLR(r.xzr.value, RFASTGILPTR.value)
 
         if not we_are_translated():                     # for testing: we should not access
             self.mc.ADD_ri(r.fp.value, r.fp.value, 1)   # fp any more
@@ -199,66 +199,82 @@
             self.mc.STR_ri(r.ip0.value, r.x3.value, rpy_errno)
 
     def move_real_result_and_call_reacqgil_addr(self, fastgil):
-        # try to reacquire the lock.
-        #     x19 == &root_stack_top
-        #     x20 == previous value of root_stack_top
-        self.mc.gen_load_int(r.ip1.value, fastgil)
-        self.mc.LDAXR(r.x1.value, r.ip1.value)    # load the lock value
-        self.mc.CMP_ri(r.x1.value, 0)            # is the lock free?
+        # try to reacquire the lock.  The following two values are saved
+        # across the call and are still alive now:
+        RFASTGILPTR = r.x19    # constant &rpy_fastgil
+        RSHADOWOLD = r.x20     # old value of the shadowstack pointer
+
+        # this comes from gcc compiling this code:
+        #    while (__atomic_test_and_set(&lock, __ATOMIC_ACQUIRE))
+        #        ;
+        self.mc.gen_load_int(r.x2.value, 1)
+        self.mc.LDXR(r.x1.value, RFASTGILPTR.value)
+        self.mc.STXR(r.x3.value, r.x2.value, RFASTGILPTR.value)
+        self.mc.CBNZ_w(r.x3.value, -8)
+        # now x1 is the old value of the lock, and the lock contains 1
 
         b1_location = self.mc.currpos()
-        self.mc.BRK() # B.ne to the call
+        self.mc.BRK()        # boehm: patched with a CBZ (jump if x1 == 0)
+                             # shadowstack: patched with CBNZ instead
 
-        # jump over the next few instructions directly to the call
-        self.mc.STLXR(r.ip0.value, r.ip1.value, r.x1.value)
-                                                 # try to claim the lock
-        self.mc.CMP_wi(r.x1.value, 0) # did this succeed?
-        self.mc.DMB() #
-
-        b2_location = self.mc.currpos()
-        self.mc.BRK() # B.ne to the call
-
-        #
-        if self.asm.cpu.gc_ll_descr.gcrootmap:
+        gcrootmap = self.asm.cpu.gc_ll_descr.gcrootmap
+        if gcrootmap:
             # When doing a call_release_gil with shadowstack, there
             # is the risk that the 'rpy_fastgil' was free but the
             # current shadowstack can be the one of a different
             # thread.  So here we check if the shadowstack pointer
             # is still the same as before we released the GIL (saved
             # in 'x20'), and if not, we fall back to 'reacqgil_addr'.
-            self.mc.LDR_ri(r.ip0.value, r.x19.value, 0)
-            self.mc.CMP_rr(r.ip0.value, r.x20.value)
+            rst = gcrootmap.get_root_stack_top_addr()
+            self.mc.gen_load_int(r.ip1.value, rst)
+            self.mc.LDR_ri(r.ip0.value, r.ip1.value, 0)   # new shadowstack
+            self.mc.CMP_rr(r.ip0.value, RSHADOWOLD.value)
             b3_location = self.mc.currpos()
-            self.mc.BRK() # B.ne to the call
+            self.mc.BRK() # B.eq forward
+
+            # revert the rpy_fastgil acquired above, so that the
+            # general 'reacqgil_addr' below can acquire it again...
+            self.mc.STR_ri(r.xzr.value, RFASTGILPTR.value, 0)
+
+            # patch the b1_location above, with "CBNZ here"
+            pmc = OverwritingBuilder(self.mc, b1_location, WORD)
+            pmc.CBNZ(r.x1.value, self.mc.currpos() - b1_location)
+
+            open_location = b3_location
         else:
-            b3_location = 0
-        #
+            open_location = b1_location
 
-        jmp_pos = self.mc.currpos()
-        self.mc.BRK()
-        # <- this is where we jump to
-        jmp_ofs = self.mc.currpos()
+        # Yes, we need to call the reacqgil() function.
+        # save the result we just got
+        RSAVEDRES = RFASTGILPTR     # can reuse this reg here to save things
+        reg = self.resloc
+        if reg is not None:
+            if reg.is_core_reg():
+                self.mc.MOV_rr(RSAVEDRES.value, reg.value)
+            elif reg.is_vfp_reg():
+                self.mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
+                self.mc.STR_di(reg.value, r.sp.value, 0)
 
-        pmc = OverwritingBuilder(self.mc, b1_location, WORD)
-        pmc.B_ofs_cond(jmp_ofs - b1_location, c.NE)
-        pmc = OverwritingBuilder(self.mc, b2_location, WORD)
-        pmc.B_ofs_cond(jmp_ofs - b2_location, c.NE)
-        if self.asm.cpu.gc_ll_descr.gcrootmap:
-            pmc = OverwritingBuilder(self.mc, b3_location, WORD)
-            pmc.B_ofs_cond(jmp_ofs - b3_location, c.NE)
+        # call the function
+        self.mc.BL(self.asm.reacqgil_addr)
 
-        # save the result we just got
-        # call reacquire_gil
-        self.mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
-        self.mc.STR_di(r.d0.value, r.sp.value, 0)
-        self.mc.STR_ri(r.x0.value, r.sp.value, WORD)
-        self.mc.BL(self.asm.reacqgil_addr)
-        self.mc.LDR_di(r.d0.value, r.sp.value, 0)
-        self.mc.LDR_ri(r.x0.value, r.sp.value, WORD)
-        self.mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
+        # restore the saved register
+        if reg is not None:
+            if reg.is_core_reg():
+                self.mc.MOV_rr(reg.value, RSAVEDRES.value)
+            elif reg.is_vfp_reg():
+                self.mc.LDR_di(reg.value, r.sp.value, 0)
+                self.mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
 
-        pmc = OverwritingBuilder(self.mc, jmp_pos, WORD)
-        pmc.B_ofs(self.mc.currpos() - jmp_pos)
+        # now patch the still-open jump above:
+        #     boehm: patch b1_location with a CBZ(x1)
+        #     shadowstack: patch b3_location with BEQ
+        pmc = OverwritingBuilder(self.mc, open_location, WORD)
+        offset = self.mc.currpos() - open_location
+        if gcrootmap:
+            pmc.B_ofs_cond(offset, c.EQ)
+        else:
+            pmc.CBZ(r.x1.value, offset)
 
         if not we_are_translated():                    # for testing: now we can accesss
             self.mc.SUB_ri(r.fp.value, r.fp.value, 1)  # fp again
diff --git a/rpython/jit/backend/aarch64/codebuilder.py b/rpython/jit/backend/aarch64/codebuilder.py
--- a/rpython/jit/backend/aarch64/codebuilder.py
+++ b/rpython/jit/backend/aarch64/codebuilder.py
@@ -423,15 +423,15 @@
         base = 0b11001000000
         self.write32((base << 21) | (rs << 16) | (0b011111 << 10) | (rn << 5) | rt)
 
-    def LDAXR(self, rt, rn):
-        # XXX DON'T USE
-        base = 0b1100100001011111111111
-        self.write32((base << 10) | (rn << 5) | rt)
+    #def LDAXR(self, rt, rn):
+    #    don't use any more
+    #    base = 0b1100100001011111111111
+    #    self.write32((base << 10) | (rn << 5) | rt)
 
-    def STLXR(self, rt, rn, rs):
-        # XXX DON'T USE
-        base = 0b11001000000
-        self.write32((base << 21) | (rs << 16) | (0b111111 << 10) | (rn << 5) | rt)
+    #def STLXR(self, rt, rn, rs):
+    #    don't use any more
+    #    base = 0b11001000000
+    #    self.write32((base << 21) | (rs << 16) | (0b111111 << 10) | (rn << 5) | rt)
 
     def NOP(self):
         self.write32(0b11010101000000110010000000011111)
@@ -439,7 +439,7 @@
     def B_ofs(self, ofs):
         base = 0b000101
         assert ofs & 0x3 == 0
-        assert -(1 << (26 + 2)) < ofs < 1<<(26 + 2)
+        assert -(1 << (26 + 2)) <= ofs < 1<<(26 + 2)
         if ofs < 0:
             ofs = (1 << 26) - (-ofs >> 2)
         else:
@@ -449,11 +449,33 @@
     def B_ofs_cond(self, ofs, cond):
         base = 0b01010100
         assert ofs & 0x3 == 0
-        assert -1 << 21 < ofs < 1 << 21
+        assert -1 << 21 <= ofs < 1 << 21
         imm = ofs >> 2
         assert imm > 0 # we seem not to need the - jump
         self.write32((base << 24) | (imm << 5) | cond)
 
+    def CBNZ(self, rt, ofs):
+        base = 0b10110101
+        assert -1 << 21 <= ofs < 1 << 21
+        imm = ofs >> 2
+        imm &= (1 << 19) - 1
+        self.write32((base << 24) | (imm << 5) | rt)
+
+    def CBNZ_w(self, rt, ofs):
+        # checks the 'w' part of the register (32 bits)
+        base = 0b00110101
+        assert -1 << 21 <= ofs < 1 << 21
+        imm = ofs >> 2
+        imm &= (1 << 19) - 1
+        self.write32((base << 24) | (imm << 5) | rt)
+
+    def CBZ(self, rt, ofs):
+        base = 0b10110100
+        assert -1 << 21 <= ofs < 1 << 21
+        imm = ofs >> 2
+        imm &= (1 << 19) - 1
+        self.write32((base << 24) | (imm << 5) | rt)
+
     def B(self, target):
         target = rffi.cast(lltype.Signed, target)
         self.gen_load_int_full(r.ip0.value, target)
@@ -476,9 +498,9 @@
     def BRK(self):
         self.write32(0b11010100001 << 21)
 
-    def DMB(self):
-        # XXX DON'T USE
-        self.write32(0b11010101000000110011111110111111)
+    #def DMB(self):
+    #    don't use any more
+    #    self.write32(0b11010101000000110011111110111111)
 
     def gen_load_int_full(self, r, value):
         self.MOVZ_r_u16(r, value & 0xFFFF, 0)
diff --git a/rpython/jit/backend/aarch64/test/test_instr_builder.py b/rpython/jit/backend/aarch64/test/test_instr_builder.py
--- a/rpython/jit/backend/aarch64/test/test_instr_builder.py
+++ b/rpython/jit/backend/aarch64/test/test_instr_builder.py
@@ -172,3 +172,18 @@
         cb = CodeBuilder()
         cb.STXR(r.x6.value, r.x11.value, r.x22.value)
         assert cb.hexdump() == assemble("STXR w6, x11, [x22]")
+
+    def test_CBNZ(self):
+        cb = CodeBuilder()
+        cb.CBNZ(r.x6.value, -8)
+        assert cb.hexdump() == assemble("CBNZ x6, -8")
+
+    def test_CBNZ_w(self):
+        cb = CodeBuilder()
+        cb.CBNZ_w(r.x6.value, -8)
+        assert cb.hexdump() == assemble("CBNZ w6, -8")
+
+    def test_CBZ(self):
+        cb = CodeBuilder()
+        cb.CBZ(r.x25.value, -888)
+        assert cb.hexdump() == assemble("CBZ x25, -888")
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to