Author: Armin Rigo <[email protected]>
Branch: stmgc-c7
Changeset: r70239:218296eca9e2
Date: 2014-03-24 10:44 +0100
http://bitbucket.org/pypy/pypy/changeset/218296eca9e2/
Log: in-progress
diff --git a/rpython/jit/backend/llsupport/regalloc.py
b/rpython/jit/backend/llsupport/regalloc.py
--- a/rpython/jit/backend/llsupport/regalloc.py
+++ b/rpython/jit/backend/llsupport/regalloc.py
@@ -654,7 +654,8 @@
op.getopnum() == rop.STM_TRANSACTION_BREAK):
assert operations[i + 1].getopnum() == rop.GUARD_NOT_FORCED
return True
- if not op.is_comparison():
+ if (not op.is_comparison() and
+ op.getopnum() != rop.STM_SHOULD_BREAK_TRANSACTION):
if op.is_ovf():
if (operations[i + 1].getopnum() != rop.GUARD_NO_OVERFLOW and
operations[i + 1].getopnum() != rop.GUARD_OVERFLOW):
diff --git a/rpython/jit/backend/x86/assembler.py
b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -1918,51 +1918,56 @@
def setup_failure_recovery(self):
self.failure_recovery_code = [0, 0, 0, 0]
- def _push_all_regs_to_frame(self, mc, ignored_regs, withfloats,
- callee_only=False):
- # Push all general purpose registers
+ def _push_pop_regs_from_frame(self, push, mc, grp_regs, xmm_regs):
+ # Push the general purpose registers
base_ofs = self.cpu.get_baseofs_of_frame_field()
+ for gpr in grp_regs:
+ v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
+ addr = (self.SEGMENT_FRAME, v * WORD + base_ofs)
+ if push:
+ mc.MOV_br(addr, gpr.value)
+ else:
+ mc.MOV_rb(gpr.value, addr)
+ # Push the XMM regs
+ if IS_X86_64:
+ coeff = 1
+ else:
+ coeff = 2
+ ofs = len(gpr_reg_mgr_cls.all_regs)
+ for xmm in xmm_regs:
+ addr = (self.SEGMENT_FRAME,
+ (ofs + xmm.value * coeff) * WORD + base_ofs)
+ if push:
+ mc.MOVSD_bx(addr, xmm.value)
+ else:
+ mc.MOVSD_xb(xmm.value, addr)
+
+ def _do_with_registers(self, push, mc,
+ ignored_regs, withfloats, callee_only):
if callee_only:
regs = gpr_reg_mgr_cls.save_around_call_regs
else:
regs = gpr_reg_mgr_cls.all_regs
- for gpr in regs:
- if gpr not in ignored_regs:
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- mc.MOV_br((self.SEGMENT_FRAME, v * WORD + base_ofs), gpr.value)
+ regs = [gpr for gpr in regs if gpr not in ignored_regs]
if withfloats:
- if IS_X86_64:
- coeff = 1
- else:
- coeff = 2
- # Push all XMM regs
- ofs = len(gpr_reg_mgr_cls.all_regs)
- for i in range(len(xmm_reg_mgr_cls.all_regs)):
- mc.MOVSD_bx((self.SEGMENT_FRAME,
- (ofs + i * coeff) * WORD + base_ofs), i)
+ xmm_regs = xmm_reg_mgr_cls.all_regs
+ else:
+ xmm_regs = []
+ self._push_pop_regs_from_frame(push, mc, regs, xmm_regs)
+
+ def _push_all_regs_to_frame(self, mc, ignored_regs, withfloats,
+ callee_only=False):
+ # Push all general purpose registers (or only the ones that a
+ # callee might clobber); and if withfloats, push all XMM regs
+ self._do_with_registers(True, mc,
+ ignored_regs, withfloats, callee_only)
def _pop_all_regs_from_frame(self, mc, ignored_regs, withfloats,
callee_only=False):
- # Pop all general purpose registers
- base_ofs = self.cpu.get_baseofs_of_frame_field()
- if callee_only:
- regs = gpr_reg_mgr_cls.save_around_call_regs
- else:
- regs = gpr_reg_mgr_cls.all_regs
- for gpr in regs:
- if gpr not in ignored_regs:
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- mc.MOV_rb(gpr.value, (self.SEGMENT_FRAME, v * WORD + base_ofs))
- if withfloats:
- # Pop all XMM regs
- if IS_X86_64:
- coeff = 1
- else:
- coeff = 2
- ofs = len(gpr_reg_mgr_cls.all_regs)
- for i in range(len(xmm_reg_mgr_cls.all_regs)):
- mc.MOVSD_xb(i, (self.SEGMENT_FRAME,
- (ofs + i * coeff) * WORD + base_ofs))
+ # Pop all general purpose registers (or only the ones that a
+ # callee might clobber); and if withfloats, pop all XMM regs
+ self._do_with_registers(False, mc,
+ ignored_regs, withfloats, callee_only)
def _build_failure_recovery(self, exc, withfloats=False):
mc = codebuf.MachineCodeBlockWrapper()
@@ -2510,6 +2515,29 @@
assert isinstance(reg, RegLoc)
self.mc.MOV_rr(reg.value, ebp.value)
+ def _generate_cmp_break_transaction(self):
+ # emits the check with a CMP instruction:
+ # pypy_stm_nursery_low_fill_mark < STM_SEGMENT->nursery_current
+ # so if it is followed with a JB, it will follow the jump if
+ # we should break the transaction now.
+ #
+ psnlfm_adr = rstm.adr_pypy_stm_nursery_low_fill_mark
+ self.mc.MOV(X86_64_SCRATCH_REG, self.heap_tl(psnlfm_adr))
+ nf_adr = rstm.nursery_free_adr
+ assert rx86.fits_in_32bits(nf_adr) # because it is in the 2nd page
+ self.mc.CMP_rj(X86_64_SCRATCH_REG.value, (self.SEGMENT_GC, nf_adr))
+
+ def genop_guard_stm_should_break_transaction(self, op, guard_op,
+ guard_token, arglocs,
+ result_loc):
+ if not IS_X86_64:
+ todo() # "needed for X86_64_SCRATCH_REG"
+ self._generate_cmp_break_transaction()
+ if guard_op.getopnum() == rop.GUARD_FALSE:
+ self.implement_guard(guard_token, 'B') # JB goes to "yes, break"
+ else:
+ self.implement_guard(guard_token, 'AE') # JAE goes to "no, don't"
+
def genop_guard_stm_transaction_break(self, op, guard_op, guard_token,
arglocs, result_loc):
assert self.cpu.gc_ll_descr.stm
@@ -2520,62 +2548,61 @@
self._store_force_index(guard_op)
mc = self.mc
- # if stm_should_break_transaction()
+ self._generate_cmp_break_transaction()
+ # use JAE to jump over the following piece of code if we don't need
+ # to break the transaction now
+ mc.J_il(rx86.Conditions['AE'], 0xfffff) # patched later
+ jae_location = mc.get_relative_pos()
+ # This is the case in which we have to do the same as the logic
+ # in pypy_stm_perform_transaction(). We know that we're not in
+ # an atomic transaction (otherwise the jump above always triggers).
+ # So we only have to do the following three operations:
+ # stm_commit_transaction();
+ # __builtin_setjmp(jmpbuf);
+ # pypy_stm_start_transaction(&jmpbuf);
- # XXX UD2
- #fn = stmtlocal.stm_should_break_transaction_fn
- #mc.CALL(imm(self.cpu.cast_ptr_to_int(fn)))
- mc.MOV(eax, imm(0))
-
-
- mc.TEST8(eax.lowest8bits(), eax.lowest8bits())
- mc.J_il(rx86.Conditions['Z'], 0xfffff) # patched later
- jz_location2 = mc.get_relative_pos()
+ # save all registers and the gcmap
+ self.push_gcmap(mc, gcmap, mov=True)
+ grp_regs = self._regalloc.rm.reg_bindings.values()
+ xmm_regs = self._regalloc.xrm.reg_bindings.values()
+ self._push_pop_regs_from_frame(True, mc, grp_regs, xmm_regs)
#
- # call stm_transaction_break() with the address of the
- # STM_RESUME_BUF and the custom longjmp function
- self.push_gcmap(mc, gcmap, mov=True)
+ # call stm_commit_transaction()
+ mc.CALL(imm(rstm.adr_stm_commit_transaction))
#
- # save all registers
- base_ofs = self.cpu.get_baseofs_of_frame_field()
- for gpr in self._regalloc.rm.reg_bindings.values():
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- mc.MOV_br((self.SEGMENT_FRAME, v * WORD + base_ofs), gpr.value)
- if IS_X86_64:
- coeff = 1
- else:
- coeff = 2
- ofs = len(gpr_reg_mgr_cls.all_regs)
- for xr in self._regalloc.xrm.reg_bindings.values():
- mc.MOVSD_bx((self.SEGMENT_FRAME,
- (ofs + xr.value * coeff) * WORD + base_ofs), xr.value)
+ # update the two words in the STM_RESUME_BUF, as described
+ # in arch.py. The "learip" pseudo-instruction turns into
+ # what is, in gnu as syntax: lea 0(%rip), %rax (the 0 is
+ # one byte, patched just below)
+ mc.LEARIP_rl8(eax, 0)
+ learip_location = mc.get_relative_pos()
+ mc.MOV_sr(STM_JMPBUF_OFS_RIP, eax)
+ mc.MOV_sr(STM_JMPBUF_OFS_RSP, esp)
#
- # CALL break function
- fn = self.stm_transaction_break_path
- mc.CALL(imm(fn))
+ offset = mc.get_relative_pos() - learip_location
+ assert 0 < offset <= 127
+ mc.overwrite(learip_location - 1, chr(offset))
# ** HERE ** is the place an aborted transaction retries
- # ebp/frame reloaded by longjmp callback
+ # (when resuming, ebp is garbage, but the STM_RESUME_BUF is
+ # still correct in case of repeated aborting)
+ #
+ # call pypy_stm_start_transaction(&jmpbuf)
+ mc.LEA_rs(edi, STM_JMPBUF_OFS)
+ mc.CALL(imm(rstm.adr_pypy_stm_start_transaction))
+ #
+ # reload ebp (the frame) now
+ self._reload_frame_if_necessary(self.mc)
#
# restore regs
- base_ofs = self.cpu.get_baseofs_of_frame_field()
- for gpr in self._regalloc.rm.reg_bindings.values():
- v = gpr_reg_mgr_cls.all_reg_indexes[gpr.value]
- mc.MOV_rb(gpr.value, (self.SEGMENT_FRAME, v * WORD + base_ofs))
- if IS_X86_64:
- coeff = 1
- else:
- coeff = 2
- ofs = len(gpr_reg_mgr_cls.all_regs)
- for xr in self._regalloc.xrm.reg_bindings.values():
- mc.MOVSD_xb(xr.value, (self.SEGMENT_FRAME,
- (ofs + xr.value * coeff) * WORD + base_ofs))
+ self._push_pop_regs_from_frame(False, mc, grp_regs, xmm_regs)
#
- # patch the JZ above
- offset = mc.get_relative_pos() - jz_location2
- mc.overwrite32(jz_location2-4, offset)
+ self._emit_guard_not_forced(guard_token)
- self._emit_guard_not_forced(guard_token)
+ # patch the JAE above (note that we also skip the guard_not_forced
+ # in the common situation where we jump over the code above)
+ offset = mc.get_relative_pos() - jae_location
+ mc.overwrite32(jae_location-4, offset)
def genop_discard_stm_read(self, op, arglocs):
if not IS_X86_64:
diff --git a/rpython/jit/backend/x86/regalloc.py
b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1235,14 +1235,13 @@
self.fm.hint_frame_pos[box] = self.fm.get_loc_index(loc)
+ def consider_stm_should_break_transaction(self, op, guard_op):
+ if guard_op is None:
+ self.not_implemented_op(op)
+ self.perform_with_guard(op, guard_op, [], None)
+
def consider_stm_transaction_break(self, op, guard_op):
- #
- # only save regs for the should_break_transaction call
- self.xrm.before_call()
- self.rm.before_call()
- #
self.perform_with_guard(op, guard_op, [], None)
-
def consider_jump(self, op):
assembler = self.assembler
@@ -1393,7 +1392,8 @@
or num == rop.CALL_MAY_FORCE
or num == rop.CALL_ASSEMBLER
or num == rop.CALL_RELEASE_GIL
- or num == rop.STM_TRANSACTION_BREAK):
+ or num == rop.STM_TRANSACTION_BREAK
+ or num == rop.STM_SHOULD_BREAK_TRANSACTION):
oplist_with_guard[num] = value
oplist[num] = add_none_argument(value)
else:
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -696,6 +696,8 @@
self.writechar(chr((imm >> 56) & 0xFF))
CQO = insn(rex_w, '\x99')
+ LEARIP_rl8 = insn(rex_w, '\x8D', register(1, 8), chr(0x05),
+ immediate(2, 'b'))
# Three different encodings... following what gcc does. From the
# shortest encoding to the longest one.
diff --git a/rpython/rlib/rstm.py b/rpython/rlib/rstm.py
--- a/rpython/rlib/rstm.py
+++ b/rpython/rlib/rstm.py
@@ -11,6 +11,8 @@
stm_nb_segments = CDefinedIntSymbolic('STM_NB_SEGMENTS')
adr_nursery_free = CDefinedIntSymbolic('((long)&STM_SEGMENT->nursery_current)')
adr_nursery_top = CDefinedIntSymbolic('((long)&STM_SEGMENT->nursery_end)')
+adr_pypy_stm_nursery_low_fill_mark = (
+ CDefinedIntSymbolic('((long)&pypy_stm_nursery_low_fill_mark)'))
adr_transaction_read_version = (
CDefinedIntSymbolic('((long)&STM_SEGMENT->transaction_read_version)'))
adr_jmpbuf_ptr = (
@@ -21,6 +23,10 @@
CDefinedIntSymbolic('((long)"return from JITted function")'))
adr__stm_become_inevitable = (
CDefinedIntSymbolic('((long)&_stm_become_inevitable)'))
+adr_stm_commit_transaction = (
+ CDefinedIntSymbolic('((long)&stm_commit_transaction)'))
+adr_pypy_stm_start_transaction = (
+ CDefinedIntSymbolic('((long)&pypy_stm_start_transaction)'))
def jit_stm_transaction_break_point():
diff --git a/rpython/translator/stm/src_stm/stmgcintf.c
b/rpython/translator/stm/src_stm/stmgcintf.c
--- a/rpython/translator/stm/src_stm/stmgcintf.c
+++ b/rpython/translator/stm/src_stm/stmgcintf.c
@@ -90,12 +90,38 @@
}
}
+void pypy_stm_start_transaction(stm_jmpbuf_t *jmpbuf_ptr,
+ volatile long *v_counter)
+{
+ _stm_start_transaction(&stm_thread_local, jmpbuf_ptr);
+
+ /* If v_counter==0, initialize 'pypy_stm_nursery_low_fill_mark'
+ from the configured length limit. If v_counter>0, we did an
+ abort, and we now configure 'pypy_stm_nursery_low_fill_mark'
+ to a value slightly smaller than the value at last abort.
+ */
+ long counter, limit;
+ counter = *v_counter;
+ *v_counter = counter + 1;
+
+ if (counter == 0) {
+ limit = pypy_transaction_length;
+ }
+ else {
+ limit = stm_thread_local.last_abort__bytes_in_nursery;
+ limit -= (limit >> 4);
+ }
+ pypy_stm_nursery_low_fill_mark = _stm_nursery_start + limit;
+ pypy_stm_ready_atomic = 1; /* reset after abort */
+}
+
void pypy_stm_perform_transaction(object_t *arg, int callback(object_t *, int))
{ /* must save roots around this call */
stm_jmpbuf_t jmpbuf;
long volatile v_counter = 0;
+ int (*volatile v_callback)(object_t *, int) = callback;
#ifndef NDEBUG
- struct stm_shadowentry_s *volatile old_shadowstack =
+ struct stm_shadowentry_s *volatile v_old_shadowstack =
stm_thread_local.shadowstack;
#endif
@@ -105,42 +131,28 @@
while (1) {
if (pypy_stm_ready_atomic == 1) {
+ /* Not in an atomic transaction
+ */
stm_commit_transaction();
- STM_START_TRANSACTION(&stm_thread_local, jmpbuf);
- pypy_stm_ready_atomic = 1; /* reset after abort */
- }
- /* After setjmp(), the local variables v_* are preserved because they
- * are volatile. The other variables are only declared here. */
- long counter, result;
- counter = v_counter;
- v_counter = counter + 1;
-
- /* If counter==0, initialize 'pypy_stm_nursery_low_fill_mark'
- from the configured length limit. If counter>0, we did an
- abort, and we can now configure 'pypy_stm_nursery_low_fill_mark'
- to a value slightly smaller than the value at last abort.
- */
- if (stm_is_inevitable()) {
- pypy_stm_nursery_low_fill_mark = 0;
+ /* After setjmp(), the local variables v_* are preserved because
+ they are volatile. The other local variables should be
+ declared only below this point.
+ */
+ while (__builtin_setjmp(jmpbuf) == 1) { /*redo setjmp*/ }
+ pypy_stm_start_transaction(&jmpbuf, &v_counter);
}
else {
- long limit;
- if (counter == 0) {
- limit = pypy_transaction_length;
- }
- else {
- limit = stm_thread_local.last_abort__bytes_in_nursery;
- limit -= (limit >> 4);
- }
- pypy_stm_nursery_low_fill_mark = _stm_nursery_start + limit;
+ /* In an atomic transaction */
+ assert(pypy_stm_nursery_low_fill_mark == (uintptr_t) -1);
}
/* invoke the callback in the new transaction */
STM_POP_ROOT(stm_thread_local, arg);
- assert(old_shadowstack == stm_thread_local.shadowstack);
+ assert(v_old_shadowstack == stm_thread_local.shadowstack);
STM_PUSH_ROOT(stm_thread_local, arg);
- result = callback(arg, counter);
+
+ long result = v_callback(arg, counter);
if (result <= 0)
break;
v_counter = 0;
@@ -157,11 +169,12 @@
}
else {
_stm_become_inevitable("perform_transaction left with atomic");
+ assert(pypy_stm_nursery_low_fill_mark == (uintptr_t) -1);
}
}
//gcptr x = stm_pop_root(); /* pop the END_MARKER */
//assert(x == END_MARKER_OFF || x == END_MARKER_ON);
STM_POP_ROOT_RET(stm_thread_local); /* pop the 'arg' */
- assert(old_shadowstack == stm_thread_local.shadowstack);
+ assert(v_old_shadowstack == stm_thread_local.shadowstack);
}
diff --git a/rpython/translator/stm/src_stm/stmgcintf.h
b/rpython/translator/stm/src_stm/stmgcintf.h
--- a/rpython/translator/stm/src_stm/stmgcintf.h
+++ b/rpython/translator/stm/src_stm/stmgcintf.h
@@ -11,6 +11,7 @@
extern __thread struct stm_thread_local_s stm_thread_local;
extern __thread long pypy_stm_ready_atomic;
extern __thread uintptr_t pypy_stm_nursery_low_fill_mark;
+extern __thread uintptr_t pypy_stm_nursery_low_fill_mark_saved;
void pypy_stm_setup(void);
void pypy_stm_setup_prebuilt(void); /* generated into stm_prebuilt.c */
@@ -35,11 +36,26 @@
}
}
static inline void pypy_stm_increment_atomic(void) {
- pypy_stm_ready_atomic++;
+ switch (++pypy_stm_ready_atomic) {
+ case 2:
+ pypy_stm_nursery_low_fill_mark_saved = pypy_stm_nursery_low_fill_mark;
+ pypy_stm_nursery_low_fill_mark = (uintptr_t) -1;
+ break;
+ default:
+ break;
+ }
}
static inline void pypy_stm_decrement_atomic(void) {
- if (--pypy_stm_ready_atomic == 0)
+ switch (--pypy_stm_ready_atomic) {
+ case 1:
+ pypy_stm_nursery_low_fill_mark = pypy_stm_nursery_low_fill_mark_saved;
+ break;
+ case 0:
pypy_stm_ready_atomic = 1;
+ break;
+ default:
+ break;
+ }
}
static inline long pypy_stm_get_atomic(void) {
return pypy_stm_ready_atomic - 1;
@@ -48,15 +64,19 @@
void pypy_stm_leave_callback_call(long);
void pypy_stm_set_transaction_length(double);
void pypy_stm_perform_transaction(object_t *, int(object_t *, int));
+void pypy_stm_start_transaction(stm_jmpbuf_t *, volatile long *);
static inline int pypy_stm_should_break_transaction(void)
{
/* we should break the current transaction if we have used more than
some initial portion of the nursery, or if we are running inevitable
- (in which case pypy_stm_nursery_low_fill_mark is set to 0)
+ (in which case pypy_stm_nursery_low_fill_mark is set to 0).
+ If the transaction is atomic, pypy_stm_nursery_low_fill_mark is
+ instead set to (uintptr_t) -1, and the following check is never true.
*/
uintptr_t current = (uintptr_t)STM_SEGMENT->nursery_current;
return current > pypy_stm_nursery_low_fill_mark;
+ /* NB. this logic is hard-coded in jit/backend/x86/assembler.py too */
}
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit