Author: Armin Rigo <[email protected]>
Branch: arm64
Changeset: r96934:ecd3b6d0775b
Date: 2019-07-03 10:02 +0000
http://bitbucket.org/pypy/pypy/changeset/ecd3b6d0775b/
Log: Optimize call_release_gil (and fix a wrong detail I think)
diff --git a/rpython/jit/backend/aarch64/TODO b/rpython/jit/backend/aarch64/TODO
--- a/rpython/jit/backend/aarch64/TODO
+++ b/rpython/jit/backend/aarch64/TODO
@@ -45,3 +45,15 @@
 the future to be a bit shorter. Rewrite these two places to use the
 proper way instead of a magic "40" (or at least assert that it was
 really 40).
+
+
+* use "CBNZ register, offset" (compare-and-branch-if-not-zero)
+ instead of a CMP+BNE pair. Same with CBZ instead of CMP+BEQ
+
+
+* when we need to save things on the stack, we typically push two words
+ and pop them later. It would be cheaper if we reserved two locations
+ in the stack from _call_header, then we could just write there.
+ *OR*
+ maybe it's enough if we use the form "str x0, [sp, #offset]!" which
+ combines in a single instruction the "str" with the change of sp
diff --git a/rpython/jit/backend/aarch64/callbuilder.py
b/rpython/jit/backend/aarch64/callbuilder.py
--- a/rpython/jit/backend/aarch64/callbuilder.py
+++ b/rpython/jit/backend/aarch64/callbuilder.py
@@ -138,19 +138,19 @@
def call_releasegil_addr_and_move_real_arguments(self, fastgil):
assert self.is_call_release_gil
assert not self.asm._is_asmgcc()
+ RFASTGILPTR = r.x19 # constant &rpy_fastgil
+ RSHADOWOLD = r.x20 # old value of the shadowstack pointer,
+ # which we save here for later comparison
- # Save this thread's shadowstack pointer into r7, for later comparison
gcrootmap = self.asm.cpu.gc_ll_descr.gcrootmap
if gcrootmap:
rst = gcrootmap.get_root_stack_top_addr()
- self.mc.gen_load_int(r.x19.value, rst)
- self.mc.LDR_ri(r.x20.value, r.x19.value, 0)
+ self.mc.gen_load_int(r.ip1.value, rst)
+ self.mc.LDR_ri(RSHADOWOLD.value, r.ip1.value, 0)
# change 'rpy_fastgil' to 0 (it should be non-zero right now)
- self.mc.DMB()
- self.mc.gen_load_int(r.ip1.value, fastgil)
- self.mc.MOVZ_r_u16(r.ip0.value, 0, 0)
- self.mc.STR_ri(r.ip0.value, r.ip1.value, 0)
+ self.mc.gen_load_int(RFASTGILPTR.value, fastgil)
+ self.mc.STLR(r.xzr.value, RFASTGILPTR.value)
if not we_are_translated(): # for testing: we
should not access
self.mc.ADD_ri(r.fp.value, r.fp.value, 1) # fp any more
@@ -199,66 +199,82 @@
self.mc.STR_ri(r.ip0.value, r.x3.value, rpy_errno)
def move_real_result_and_call_reacqgil_addr(self, fastgil):
- # try to reacquire the lock.
- # x19 == &root_stack_top
- # x20 == previous value of root_stack_top
- self.mc.gen_load_int(r.ip1.value, fastgil)
- self.mc.LDAXR(r.x1.value, r.ip1.value) # load the lock value
- self.mc.CMP_ri(r.x1.value, 0) # is the lock free?
+ # try to reacquire the lock. The following two values are saved
+ # across the call and are still alive now:
+ RFASTGILPTR = r.x19 # constant &rpy_fastgil
+ RSHADOWOLD = r.x20 # old value of the shadowstack pointer
+
+ # this comes from gcc compiling this code:
+ # while (__atomic_test_and_set(&lock, __ATOMIC_ACQUIRE))
+ # ;
+ self.mc.gen_load_int(r.x2.value, 1)
+ self.mc.LDXR(r.x1.value, RFASTGILPTR.value)
+ self.mc.STXR(r.x3.value, r.x2.value, RFASTGILPTR.value)
+ self.mc.CBNZ_w(r.x3.value, -8)
+ # now x1 is the old value of the lock, and the lock contains 1
b1_location = self.mc.currpos()
- self.mc.BRK() # B.ne to the call
+ self.mc.BRK() # boehm: patched with a CBZ (jump if x1 == 0)
+ # shadowstack: patched with CBNZ instead
- # jump over the next few instructions directly to the call
- self.mc.STLXR(r.ip0.value, r.ip1.value, r.x1.value)
- # try to claim the lock
- self.mc.CMP_wi(r.x1.value, 0) # did this succeed?
- self.mc.DMB() #
-
- b2_location = self.mc.currpos()
- self.mc.BRK() # B.ne to the call
-
- #
- if self.asm.cpu.gc_ll_descr.gcrootmap:
+ gcrootmap = self.asm.cpu.gc_ll_descr.gcrootmap
+ if gcrootmap:
# When doing a call_release_gil with shadowstack, there
# is the risk that the 'rpy_fastgil' was free but the
# current shadowstack can be the one of a different
# thread. So here we check if the shadowstack pointer
# is still the same as before we released the GIL (saved
# in 'x20'), and if not, we fall back to 'reacqgil_addr'.
- self.mc.LDR_ri(r.ip0.value, r.x19.value, 0)
- self.mc.CMP_rr(r.ip0.value, r.x20.value)
+ rst = gcrootmap.get_root_stack_top_addr()
+ self.mc.gen_load_int(r.ip1.value, rst)
+ self.mc.LDR_ri(r.ip0.value, r.ip1.value, 0) # new shadowstack
+ self.mc.CMP_rr(r.ip0.value, RSHADOWOLD.value)
b3_location = self.mc.currpos()
- self.mc.BRK() # B.ne to the call
+ self.mc.BRK() # B.eq forward
+
+ # revert the rpy_fastgil acquired above, so that the
+ # general 'reacqgil_addr' below can acquire it again...
+ self.mc.STR_ri(r.xzr.value, RFASTGILPTR.value, 0)
+
+ # patch the b1_location above, with "CBNZ here"
+ pmc = OverwritingBuilder(self.mc, b1_location, WORD)
+ pmc.CBNZ(r.x1.value, self.mc.currpos() - b1_location)
+
+ open_location = b3_location
else:
- b3_location = 0
- #
+ open_location = b1_location
- jmp_pos = self.mc.currpos()
- self.mc.BRK()
- # <- this is where we jump to
- jmp_ofs = self.mc.currpos()
+ # Yes, we need to call the reacqgil() function.
+ # save the result we just got
+ RSAVEDRES = RFASTGILPTR # can reuse this reg here to save things
+ reg = self.resloc
+ if reg is not None:
+ if reg.is_core_reg():
+ self.mc.MOV_rr(RSAVEDRES.value, reg.value)
+ elif reg.is_vfp_reg():
+ self.mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
+ self.mc.STR_di(reg.value, r.sp.value, 0)
- pmc = OverwritingBuilder(self.mc, b1_location, WORD)
- pmc.B_ofs_cond(jmp_ofs - b1_location, c.NE)
- pmc = OverwritingBuilder(self.mc, b2_location, WORD)
- pmc.B_ofs_cond(jmp_ofs - b2_location, c.NE)
- if self.asm.cpu.gc_ll_descr.gcrootmap:
- pmc = OverwritingBuilder(self.mc, b3_location, WORD)
- pmc.B_ofs_cond(jmp_ofs - b3_location, c.NE)
+ # call the function
+ self.mc.BL(self.asm.reacqgil_addr)
- # save the result we just got
- # call reacquire_gil
- self.mc.SUB_ri(r.sp.value, r.sp.value, 2 * WORD)
- self.mc.STR_di(r.d0.value, r.sp.value, 0)
- self.mc.STR_ri(r.x0.value, r.sp.value, WORD)
- self.mc.BL(self.asm.reacqgil_addr)
- self.mc.LDR_di(r.d0.value, r.sp.value, 0)
- self.mc.LDR_ri(r.x0.value, r.sp.value, WORD)
- self.mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
+ # restore the saved register
+ if reg is not None:
+ if reg.is_core_reg():
+ self.mc.MOV_rr(reg.value, RSAVEDRES.value)
+ elif reg.is_vfp_reg():
+ self.mc.LDR_di(reg.value, r.sp.value, 0)
+ self.mc.ADD_ri(r.sp.value, r.sp.value, 2 * WORD)
- pmc = OverwritingBuilder(self.mc, jmp_pos, WORD)
- pmc.B_ofs(self.mc.currpos() - jmp_pos)
+ # now patch the still-open jump above:
+ # boehm: patch b1_location with a CBZ(x1)
+ # shadowstack: patch b3_location with BEQ
+ pmc = OverwritingBuilder(self.mc, open_location, WORD)
+ offset = self.mc.currpos() - open_location
+ if gcrootmap:
+ pmc.B_ofs_cond(offset, c.EQ)
+ else:
+ pmc.CBZ(r.x1.value, offset)
if not we_are_translated(): # for testing: now we
can accesss
self.mc.SUB_ri(r.fp.value, r.fp.value, 1) # fp again
diff --git a/rpython/jit/backend/aarch64/codebuilder.py
b/rpython/jit/backend/aarch64/codebuilder.py
--- a/rpython/jit/backend/aarch64/codebuilder.py
+++ b/rpython/jit/backend/aarch64/codebuilder.py
@@ -423,15 +423,15 @@
base = 0b11001000000
self.write32((base << 21) | (rs << 16) | (0b011111 << 10) | (rn << 5)
| rt)
- def LDAXR(self, rt, rn):
- # XXX DON'T USE
- base = 0b1100100001011111111111
- self.write32((base << 10) | (rn << 5) | rt)
+ #def LDAXR(self, rt, rn):
+ # don't use any more
+ # base = 0b1100100001011111111111
+ # self.write32((base << 10) | (rn << 5) | rt)
- def STLXR(self, rt, rn, rs):
- # XXX DON'T USE
- base = 0b11001000000
- self.write32((base << 21) | (rs << 16) | (0b111111 << 10) | (rn << 5)
| rt)
+ #def STLXR(self, rt, rn, rs):
+ # don't use any more
+ # base = 0b11001000000
+ # self.write32((base << 21) | (rs << 16) | (0b111111 << 10) | (rn << 5)
| rt)
def NOP(self):
self.write32(0b11010101000000110010000000011111)
@@ -439,7 +439,7 @@
def B_ofs(self, ofs):
base = 0b000101
assert ofs & 0x3 == 0
- assert -(1 << (26 + 2)) < ofs < 1<<(26 + 2)
+ assert -(1 << (26 + 2)) <= ofs < 1<<(26 + 2)
if ofs < 0:
ofs = (1 << 26) - (-ofs >> 2)
else:
@@ -449,11 +449,33 @@
def B_ofs_cond(self, ofs, cond):
base = 0b01010100
assert ofs & 0x3 == 0
- assert -1 << 21 < ofs < 1 << 21
+ assert -1 << 21 <= ofs < 1 << 21
imm = ofs >> 2
assert imm > 0 # we seem not to need the - jump
self.write32((base << 24) | (imm << 5) | cond)
+ def CBNZ(self, rt, ofs):
+ base = 0b10110101
+ assert -1 << 21 <= ofs < 1 << 21
+ imm = ofs >> 2
+ imm &= (1 << 19) - 1
+ self.write32((base << 24) | (imm << 5) | rt)
+
+ def CBNZ_w(self, rt, ofs):
+ # checks the 'w' part of the register (32 bits)
+ base = 0b00110101
+ assert -1 << 21 <= ofs < 1 << 21
+ imm = ofs >> 2
+ imm &= (1 << 19) - 1
+ self.write32((base << 24) | (imm << 5) | rt)
+
+ def CBZ(self, rt, ofs):
+ base = 0b10110100
+ assert -1 << 21 <= ofs < 1 << 21
+ imm = ofs >> 2
+ imm &= (1 << 19) - 1
+ self.write32((base << 24) | (imm << 5) | rt)
+
def B(self, target):
target = rffi.cast(lltype.Signed, target)
self.gen_load_int_full(r.ip0.value, target)
@@ -476,9 +498,9 @@
def BRK(self):
self.write32(0b11010100001 << 21)
- def DMB(self):
- # XXX DON'T USE
- self.write32(0b11010101000000110011111110111111)
+ #def DMB(self):
+ # don't use any more
+ # self.write32(0b11010101000000110011111110111111)
def gen_load_int_full(self, r, value):
self.MOVZ_r_u16(r, value & 0xFFFF, 0)
diff --git a/rpython/jit/backend/aarch64/test/test_instr_builder.py
b/rpython/jit/backend/aarch64/test/test_instr_builder.py
--- a/rpython/jit/backend/aarch64/test/test_instr_builder.py
+++ b/rpython/jit/backend/aarch64/test/test_instr_builder.py
@@ -172,3 +172,18 @@
cb = CodeBuilder()
cb.STXR(r.x6.value, r.x11.value, r.x22.value)
assert cb.hexdump() == assemble("STXR w6, x11, [x22]")
+
+ def test_CBNZ(self):
+ cb = CodeBuilder()
+ cb.CBNZ(r.x6.value, -8)
+ assert cb.hexdump() == assemble("CBNZ x6, -8")
+
+ def test_CBNZ_w(self):
+ cb = CodeBuilder()
+ cb.CBNZ_w(r.x6.value, -8)
+ assert cb.hexdump() == assemble("CBNZ w6, -8")
+
+ def test_CBZ(self):
+ cb = CodeBuilder()
+ cb.CBZ(r.x25.value, -888)
+ assert cb.hexdump() == assemble("CBZ x25, -888")
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit