Author: Armin Rigo <[email protected]>
Branch: stmgc-c7
Changeset: r70239:218296eca9e2
Date: 2014-03-24 10:44 +0100
http://bitbucket.org/pypy/pypy/changeset/218296eca9e2/
Log: in-progress
diff --git a/rpython/jit/backend/llsupport/regalloc.py
b/rpython/jit/backend/llsupport/regalloc.py
--- a/rpython/jit/backend/llsupport/regalloc.py
+++ b/rpython/jit/backend/llsupport/regalloc.py
@@ -654,7 +654,8 @@
op.getopnum() == rop.STM_TRANSACTION_BREAK):
assert operations[i + 1].getopnum() == rop.GUARD_NOT_FORCED
return True
- if not op.is_comparison():
+ if (not op.is_comparison() and
+ op.getopnum() != rop.STM_SHOULD_BREAK_TRANSACTION):
if op.is_ovf():
if (operations[i + 1].getopnum() != rop.GUARD_NO_OVERFLOW and
operations[i + 1].getopnum() != rop.GUARD_OVERFLOW):
diff --git a/rpython/jit/backend/x86/assembler.py
b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -1918,51 +1918,56 @@
def setup_failure_recovery(self):
self.failure_recovery_code = [0, 0, 0, 0]
- def _push_all_regs_to_frame(self, mc, ignored_regs, withfloats,
- callee_only=False):
- # Push all general purpose registers
+ def _push_pop_regs_from_frame(self, push, mc, grp_regs, xmm_regs):
+ # Push the general purpose registers
base_ofs = self.cpu.get_baseofs_of_frame_field()
+ for gpr in grp_regs:
+ v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
+ addr = (self.SEGMENT_FRAME, v * WORD + base_ofs)
+ if push:
+ mc.MOV_br(addr, gpr.value)
+ else:
+ mc.MOV_rb(gpr.value, addr)
+ # Push the XMM regs
+ if IS_X86_64:
+ coeff = 1
+ else:
+ coeff = 2
+ ofs = len(gpr_reg_mgr_cls.all_regs)
+ for xmm in xmm_regs:
+ addr = (self.SEGMENT_FRAME,
+ (ofs + xmm.value * coeff) * WORD + base_ofs)
+ if push:
+ mc.MOVSD_bx(addr, xmm.value)
+ else:
+ mc.MOVSD_xb(xmm.value, addr)
+
+ def _do_with_registers(self, push, mc,
+ ignored_regs, withfloats, callee_only):
if callee_only:
regs = gpr_reg_mgr_cls.save_around_call_regs
else:
regs = gpr_reg_mgr_cls.all_regs
- for gpr in regs:
- if gpr not in ignored_regs:
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- mc.MOV_br((self.SEGMENT_FRAME, v * WORD + base_ofs), gpr.value)
+ regs = [gpr for gpr in regs if gpr not in ignored_regs]
if withfloats:
- if IS_X86_64:
- coeff = 1
- else:
- coeff = 2
- # Push all XMM regs
- ofs = len(gpr_reg_mgr_cls.all_regs)
- for i in range(len(xmm_reg_mgr_cls.all_regs)):
- mc.MOVSD_bx((self.SEGMENT_FRAME,
- (ofs + i * coeff) * WORD + base_ofs), i)
+ xmm_regs = xmm_reg_mgr_cls.all_regs
+ else:
+ xmm_regs = []
+ self._push_pop_regs_from_frame(push, mc, regs, xmm_regs)
+
+ def _push_all_regs_to_frame(self, mc, ignored_regs, withfloats,
+ callee_only=False):
+ # Push all general purpose registers (or only the ones that a
+ # callee might clobber); and if withfloats, push all XMM regs
+ self._do_with_registers(True, mc,
+ ignored_regs, withfloats, callee_only)
def _pop_all_regs_from_frame(self, mc, ignored_regs, withfloats,
callee_only=False):
- # Pop all general purpose registers
- base_ofs = self.cpu.get_baseofs_of_frame_field()
- if callee_only:
- regs = gpr_reg_mgr_cls.save_around_call_regs
- else:
- regs = gpr_reg_mgr_cls.all_regs
- for gpr in regs:
- if gpr not in ignored_regs:
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- mc.MOV_rb(gpr.value, (self.SEGMENT_FRAME, v * WORD + base_ofs))
- if withfloats:
- # Pop all XMM regs
- if IS_X86_64:
- coeff = 1
- else:
- coeff = 2
- ofs = len(gpr_reg_mgr_cls.all_regs)
- for i in range(len(xmm_reg_mgr_cls.all_regs)):
- mc.MOVSD_xb(i, (self.SEGMENT_FRAME,
- (ofs + i * coeff) * WORD + base_ofs))
+ # Pop all general purpose registers (or only the ones that a
+ # callee might clobber); and if withfloats, pop all XMM regs
+ self._do_with_registers(False, mc,
+ ignored_regs, withfloats, callee_only)
def _build_failure_recovery(self, exc, withfloats=False):
mc = codebuf.MachineCodeBlockWrapper()
@@ -2510,6 +2515,29 @@
assert isinstance(reg, RegLoc)
self.mc.MOV_rr(reg.value, ebp.value)
+ def _generate_cmp_break_transaction(self):
+ # emits the check with a CMP instruction:
+ # pypy_stm_nursery_low_fill_mark < STM_SEGMENT->nursery_current
+ # so if it is followed with a JB, it will follow the jump if
+ # we should break the transaction now.
+ #
+ psnlfm_adr = rstm.adr_pypy_stm_nursery_low_fill_mark
+ self.mc.MOV(X86_64_SCRATCH_REG, self.heap_tl(psnlfm_adr))
+ nf_adr = rstm.nursery_free_adr
+ assert rx86.fits_in_32bits(nf_adr) # because it is in the 2nd page
+ self.mc.CMP_rj(X86_64_SCRATCH_REG.value, (self.SEGMENT_GC, nf_adr))
+
+ def genop_guard_stm_should_break_transaction(self, op, guard_op,
+ guard_token, arglocs,
+ result_loc):
+ if not IS_X86_64:
+ todo() # "needed for X86_64_SCRATCH_REG"
+ self._generate_cmp_break_transaction()
+ if guard_op.getopnum() == rop.GUARD_FALSE:
+ self.implement_guard(guard_token, 'B') # JB goes to "yes, break"
+ else:
+ self.implement_guard(guard_token, 'AE') # JAE goes to "no, don't"
+
def genop_guard_stm_transaction_break(self, op, guard_op, guard_token,
arglocs, result_loc):
assert self.cpu.gc_ll_descr.stm
@@ -2520,62 +2548,61 @@
self._store_force_index(guard_op)
mc = self.mc
- # if stm_should_break_transaction()
+ self._generate_cmp_break_transaction()
+ # use JAE to jump over the following piece of code if we don't need
+ # to break the transaction now
+ mc.J_il(rx86.Conditions['AE'], 0xfffff) # patched later
+ jae_location = mc.get_relative_pos()
+ # This is the case in which we have to do the same as the logic
+ # in pypy_stm_perform_transaction(). We know that we're not in
+ # an atomic transaction (otherwise the jump above always triggers).
+ # So we only have to do the following three operations:
+ # stm_commit_transaction();
+ # __builtin_setjmp(jmpbuf);
+ # pypy_stm_start_transaction(&jmpbuf);
- # XXX UD2
- #fn = stmtlocal.stm_should_break_transaction_fn
- #mc.CALL(imm(self.cpu.cast_ptr_to_int(fn)))
- mc.MOV(eax, imm(0))
-
-
- mc.TEST8(eax.lowest8bits(), eax.lowest8bits())
- mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
- jz_location2 = mc.get_relative_pos()
+ # save all registers and the gcmap
+ self.push_gcmap(mc, gcmap, mov=True)
+ grp_regs = self._regalloc.rm.reg_bindings.values()
+ xmm_regs = self._regalloc.xrm.reg_bindings.values()
+ self._push_pop_regs_from_frame(True, mc, grp_regs, xmm_regs)
#
- # call stm_transaction_break() with the address of the
- # STM_RESUME_BUF and the custom longjmp function
- self.push_gcmap(mc, gcmap, mov=True)
+ # call stm_commit_transaction()
+ mc.CALL(imm(rstm.adr_stm_commit_transaction))
#
- # save all registers
- base_ofs = self.cpu.get_baseofs_of_frame_field()
- for gpr in self._regalloc.rm.reg_bindings.values():
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- mc.MOV_br((self.SEGMENT_FRAME, v * WORD + base_ofs), gpr.value)
- if IS_X86_64:
- coeff = 1
- else:
- coeff = 2
- ofs = len(gpr_reg_mgr_cls.all_regs)
- for xr in self._regalloc.xrm.reg_bindings.values():
- mc.MOVSD_bx((self.SEGMENT_FRAME,
- (ofs + xr.value * coeff) * WORD + base_ofs), xr.value)
+ # update the two words in the STM_RESUME_BUF, as described
+ # in arch.py. The "learip" pseudo-instruction turns into
+ # what is, in gnu as syntax: lea 0(%rip), %rax (the 0 is
+ # one byte, patched just below)
+ mc.LEARIP_rl8(eax, 0)
+ learip_location = mc.get_relative_pos()
+ mc.MOV_sr(STM_JMPBUF_OFS_RIP, eax)
+ mc.MOV_sr(STM_JMPBUF_OFS_RSP, esp)
#
- # CALL break function
- fn = self.stm_transaction_break_path
- mc.CALL(imm(fn))
+ offset = mc.get_relative_pos() - learip_location
+ assert 0 < offset <= 127
+ mc.overwrite(learip_location - 1, chr(offset))
# ** HERE ** is the place an aborted transaction retries
- # ebp/frame reloaded by longjmp callback
+ # (when resuming, ebp is garbage, but the STM_RESUME_BUF is
+ # still correct in case of repeated aborting)
+ #
+ # call pypy_stm_start_transaction(&jmpbuf)
+ mc.LEA_rs(edi, STM_JMPBUF_OFS)
+ mc.CALL(imm(rstm.adr_pypy_stm_start_transaction))
+ #
+ # reload ebp (the frame) now
+ self._reload_frame_if_necessary(self.mc)
#
# restore regs
- base_ofs = self.cpu.get_baseofs_of_frame_field()
- for gpr in self._regalloc.rm.reg_bindings.values():
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- mc.MOV_rb(gpr.value, (self.SEGMENT_FRAME, v * WORD + base_ofs))
- if IS_X86_64:
- coeff = 1
- else:
- coeff = 2
- ofs = len(gpr_reg_mgr_cls.all_regs)
- for xr in self._regalloc.xrm.reg_bindings.values():
- mc.MOVSD_xb(xr.value, (self.SEGMENT_FRAME,
- (ofs + xr.value * coeff) * WORD + base_ofs))
+ self._push_pop_regs_from_frame(False, mc, grp_regs, xmm_regs)
#
- # patch the JZ above
- offset = mc.get_relative_pos() - jz_location2
- mc.overwrite32(jz_location2-4, offset)
+ self._emit_guard_not_forced(guard_token)
- self._emit_guard_not_forced(guard_token)
+ # patch the JAE above (note that we also skip the guard_not_forced
+ # in the common situation where we jump over the code above)
+ offset = mc.get_relative_pos() - jae_location
+ mc.overwrite32(jae_location-4, offset)
def genop_discard_stm_read(self, op, arglocs):
if not IS_X86_64:
diff --git a/rpython/jit/backend/x86/regalloc.py
b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1235,14 +1235,13 @@
self.fm.hint_frame_pos[box] = self.fm.get_loc_index(loc)
+ def consider_stm_should_break_transaction(self, op, guard_op):
+ if guard_op is None:
+ self.not_implemented_op(op)
+ self.perform_with_guard(op, guard_op, [], None)
+
def consider_stm_transaction_break(self, op, guard_op):
- #
- # only save regs for the should_break_transaction call
- self.xrm.before_call()
- self.rm.before_call()
- #
self.perform_with_guard(op, guard_op, [], None)
-
def consider_jump(self, op):
assembler = self.assembler
@@ -1393,7 +1392,8 @@
or num == rop.CALL_MAY_FORCE
or num == rop.CALL_ASSEMBLER
or num == rop.CALL_RELEASE_GIL
- or num == rop.STM_TRANSACTION_BREAK):
+ or num == rop.STM_TRANSACTION_BREAK
+ or num == rop.STM_SHOULD_BREAK_TRANSACTION):
oplist_with_guard[num] = value
oplist[num] = add_none_argument(value)
else:
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -696,6 +696,8 @@
self.writechar(chr((imm >> 56) & 0xFF))
CQO = insn(rex_w, '\x99')
+ LEARIP_rl8 = insn(rex_w, '\x8D', register(1, 8), chr(0x05),
+ immediate(2, 'b'))
# Three different encodings... following what gcc does. From the
# shortest encoding to the longest one.
diff --git a/rpython/rlib/rstm.py b/rpython/rlib/rstm.py
--- a/rpython/rlib/rstm.py
+++ b/rpython/rlib/rstm.py
@@ -11,6 +11,8 @@
stm_nb_segments = CDefinedIntSymbolic('STM_NB_SEGMENTS')
adr_nursery_free = CDefinedIntSymbolic('((long)&STM_SEGMENT->nursery_current)')
adr_nursery_top = CDefinedIntSymbolic('((long)&STM_SEGMENT->nursery_end)')
+adr_pypy_stm_nursery_low_fill_mark = (
+ CDefinedIntSymbolic('((long)&pypy_stm_nursery_low_fill_mark)'))
adr_transaction_read_version = (
CDefinedIntSymbolic('((long)&STM_SEGMENT->transaction_read_version)'))
adr_jmpbuf_ptr = (
@@ -21,6 +23,10 @@
CDefinedIntSymbolic('((long)"return from JITted function")'))
adr__stm_become_inevitable = (
CDefinedIntSymbolic('((long)&_stm_become_inevitable)'))
+adr_stm_commit_transaction = (
+ CDefinedIntSymbolic('((long)&stm_commit_transaction)'))
+adr_pypy_stm_start_transaction = (
+ CDefinedIntSymbolic('((long)&pypy_stm_start_transaction)'))
def jit_stm_transaction_break_point():
diff --git a/rpython/translator/stm/src_stm/stmgcintf.c
b/rpython/translator/stm/src_stm/stmgcintf.c
--- a/rpython/translator/stm/src_stm/stmgcintf.c
+++ b/rpython/translator/stm/src_stm/stmgcintf.c
@@ -90,12 +90,38 @@
}
}
+void pypy_stm_start_transaction(stm_jmpbuf_t *jmpbuf_ptr,
+ volatile long *v_counter)
+{
+ _stm_start_transaction(&stm_thread_local, jmpbuf_ptr);
+
+ /* If v_counter==0, initialize 'pypy_stm_nursery_low_fill_mark'
+ from the configured length limit. If v_counter>0, we did an
+ abort, and we now configure 'pypy_stm_nursery_low_fill_mark'
+ to a value slightly smaller than the value at last abort.
+ */
+ long counter, limit;
+ counter = *v_counter;
+ *v_counter = counter + 1;
+
+ if (counter == 0) {
+ limit = pypy_transaction_length;
+ }
+ else {
+ limit = stm_thread_local.last_abort__bytes_in_nursery;
+ limit -= (limit >> 4);
+ }
+ pypy_stm_nursery_low_fill_mark = _stm_nursery_start + limit;
+ pypy_stm_ready_atomic = 1; /* reset after abort */
+}
+
void pypy_stm_perform_transaction(object_t *arg, int callback(object_t *, int))
{ /* must save roots around this call */
stm_jmpbuf_t jmpbuf;
long volatile v_counter = 0;
+ int (*volatile v_callback)(object_t *, int) = callback;
#ifndef NDEBUG
- struct stm_shadowentry_s *volatile old_shadowstack =
+ struct stm_shadowentry_s *volatile v_old_shadowstack =
stm_thread_local.shadowstack;
#endif
@@ -105,42 +131,28 @@
while (1) {
if (pypy_stm_ready_atomic == 1) {
+ /* Not in an atomic transaction
+ */
stm_commit_transaction();
- STM_START_TRANSACTION(&stm_thread_local, jmpbuf);
- pypy_stm_ready_atomic = 1; /* reset after abort */
- }
- /* After setjmp(), the local variables v_* are preserved because they
- * are volatile. The other variables are only declared here. */
- long counter, result;
- counter = v_counter;
- v_counter = counter + 1;
-
- /* If counter==0, initialize 'pypy_stm_nursery_low_fill_mark'
- from the configured length limit. If counter>0, we did an
- abort, and we can now configure 'pypy_stm_nursery_low_fill_mark'
- to a value slightly smaller than the value at last abort.
- */
- if (stm_is_inevitable()) {
- pypy_stm_nursery_low_fill_mark = 0;
+ /* After setjmp(), the local variables v_* are preserved because
+ they are volatile. The other local variables should be
+ declared only below this point.
+ */
+ while (__builtin_setjmp(jmpbuf) == 1) { /*redo setjmp*/ }
+ pypy_stm_start_transaction(&jmpbuf, &v_counter);
}
else {
- long limit;
- if (counter == 0) {
- limit = pypy_transaction_length;
- }
- else {
- limit = stm_thread_local.last_abort__bytes_in_nursery;
- limit -= (limit >> 4);
- }
- pypy_stm_nursery_low_fill_mark = _stm_nursery_start + limit;
+ /* In an atomic transaction */
+ assert(pypy_stm_nursery_low_fill_mark == (uintptr_t) -1);
}
/* invoke the callback in the new transaction */
STM_POP_ROOT(stm_thread_local, arg);
- assert(old_shadowstack == stm_thread_local.shadowstack);
+ assert(v_old_shadowstack == stm_thread_local.shadowstack);
STM_PUSH_ROOT(stm_thread_local, arg);
- result = callback(arg, counter);
+
+ long result = v_callback(arg, counter);
if (result <= 0)
break;
v_counter = 0;
@@ -157,11 +169,12 @@
}
else {
_stm_become_inevitable("perform_transaction left with atomic");
+ assert(pypy_stm_nursery_low_fill_mark == (uintptr_t) -1);
}
}
//gcptr x = stm_pop_root(); /* pop the END_MARKER */
//assert(x == END_MARKER_OFF || x == END_MARKER_ON);
STM_POP_ROOT_RET(stm_thread_local); /* pop the 'arg' */
- assert(old_shadowstack == stm_thread_local.shadowstack);
+ assert(v_old_shadowstack == stm_thread_local.shadowstack);
}
diff --git a/rpython/translator/stm/src_stm/stmgcintf.h
b/rpython/translator/stm/src_stm/stmgcintf.h
--- a/rpython/translator/stm/src_stm/stmgcintf.h
+++ b/rpython/translator/stm/src_stm/stmgcintf.h
@@ -11,6 +11,7 @@
extern __thread struct stm_thread_local_s stm_thread_local;
extern __thread long pypy_stm_ready_atomic;
extern __thread uintptr_t pypy_stm_nursery_low_fill_mark;
+extern __thread uintptr_t pypy_stm_nursery_low_fill_mark_saved;
void pypy_stm_setup(void);
void pypy_stm_setup_prebuilt(void); /* generated into stm_prebuilt.c */
@@ -35,11 +36,26 @@
}
}
static inline void pypy_stm_increment_atomic(void) {
- pypy_stm_ready_atomic++;
+ switch (++pypy_stm_ready_atomic) {
+ case 2:
+ pypy_stm_nursery_low_fill_mark_saved = pypy_stm_nursery_low_fill_mark;
+ pypy_stm_nursery_low_fill_mark = (uintptr_t) -1;
+ break;
+ default:
+ break;
+ }
}
static inline void pypy_stm_decrement_atomic(void) {
- if (--pypy_stm_ready_atomic == 0)
+ switch (--pypy_stm_ready_atomic) {
+ case 1:
+ pypy_stm_nursery_low_fill_mark = pypy_stm_nursery_low_fill_mark_saved;
+ break;
+ case 0:
pypy_stm_ready_atomic = 1;
+ break;
+ default:
+ break;
+ }
}
static inline long pypy_stm_get_atomic(void) {
return pypy_stm_ready_atomic - 1;
@@ -48,15 +64,19 @@
void pypy_stm_leave_callback_call(long);
void pypy_stm_set_transaction_length(double);
void pypy_stm_perform_transaction(object_t *, int(object_t *, int));
+void pypy_stm_start_transaction(stm_jmpbuf_t *, volatile long *);
static inline int pypy_stm_should_break_transaction(void)
{
/* we should break the current transaction if we have used more than
some initial portion of the nursery, or if we are running inevitable
- (in which case pypy_stm_nursery_low_fill_mark is set to 0)
+ (in which case pypy_stm_nursery_low_fill_mark is set to 0).
+ If the transaction is atomic, pypy_stm_nursery_low_fill_mark is
+ instead set to (uintptr_t) -1, and the following check is never true.
*/
uintptr_t current = (uintptr_t)STM_SEGMENT->nursery_current;
return current > pypy_stm_nursery_low_fill_mark;
+ /* NB. this logic is hard-coded in jit/backend/x86/assembler.py too */
}
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit