Author: Richard Plangger <planri...@gmail.com> Branch: ppc-vsx-support Changeset: r87318:07a0b3bdccbe Date: 2016-09-22 14:38 +0200 http://bitbucket.org/pypy/pypy/changeset/07a0b3bdccbe/
Log: try to align the loop (should improve ppc and s390x load/store performance) diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py --- a/rpython/jit/backend/llgraph/runner.py +++ b/rpython/jit/backend/llgraph/runner.py @@ -879,6 +879,9 @@ def bh_vec_int_xor(self, vx, vy, count): return [int(x) ^ int(y) for x,y in zip(vx,vy)] + def bh_vec_float_xor(self, vx, vy, count): + return [0.0 for x,y in zip(vx,vy)] # just used for clearing the vector register + def bh_vec_cast_float_to_singlefloat(self, vx, count): from rpython.rlib.rarithmetic import r_singlefloat return [longlong.singlefloat2int(r_singlefloat(longlong.getrealfloat(v))) diff --git a/rpython/jit/backend/llsupport/vector_ext.py b/rpython/jit/backend/llsupport/vector_ext.py --- a/rpython/jit/backend/llsupport/vector_ext.py +++ b/rpython/jit/backend/llsupport/vector_ext.py @@ -218,6 +218,8 @@ class VectorExt(object): + should_align_unroll = True + def __init__(self): self._enabled = False self.register_size = 0 # in bytes diff --git a/rpython/jit/backend/x86/vector_ext.py b/rpython/jit/backend/x86/vector_ext.py --- a/rpython/jit/backend/x86/vector_ext.py +++ b/rpython/jit/backend/x86/vector_ext.py @@ -42,6 +42,9 @@ return "<TempVector At %s>" % (id(self),) class X86VectorExt(VectorExt): + + should_align_unroll = True + def setup_once(self, asm): if detect_feature.detect_sse4_1(): self.enable(16, accum=True) diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py --- a/rpython/jit/metainterp/optimizeopt/dependency.py +++ b/rpython/jit/metainterp/optimizeopt/dependency.py @@ -943,10 +943,6 @@ """ exec py.code.Source(multiplicative_func_source .format(name='INT_MUL', op='*', tgt='mul', cop='*')).compile() - #exec py.code.Source(multiplicative_func_source - # .format(name='INT_PY_DIV', op='*', tgt='div', cop='/')).compile() - #exec py.code.Source(multiplicative_func_source - # .format(name='UINT_FLOORDIV', op='*', 
tgt='div', cop='/')).compile() del multiplicative_func_source array_access_source = """ diff --git a/rpython/jit/metainterp/optimizeopt/vector.py b/rpython/jit/metainterp/optimizeopt/vector.py --- a/rpython/jit/metainterp/optimizeopt/vector.py +++ b/rpython/jit/metainterp/optimizeopt/vector.py @@ -234,7 +234,10 @@ # unroll self.unroll_count = self.get_unroll_count(vsize) - self.unroll_loop_iterations(loop, self.unroll_count) + align_unroll = self.unroll_count==1 and \ + self.vector_ext.should_align_unroll + self.unroll_loop_iterations(info, loop, self.unroll_count, + align_unroll_once=align_unroll) # vectorize graph = DependencyGraph(loop) @@ -256,23 +259,32 @@ return loop.finaloplist(jitcell_token=jitcell_token, reset_label_token=False) - def unroll_loop_iterations(self, loop, unroll_count): - """ Unroll the loop X times. unroll_count + 1 = unroll_factor """ + def unroll_loop_iterations(self, info, loop, unroll_count, align_unroll_once=False): + """ Unroll the loop `unroll_count` times. There can be an additional unroll step + if alignment might benefit """ numops = len(loop.operations) renamer = Renamer() operations = loop.operations + orig_jump_args = loop.jump.getarglist()[:] + prohibit_opnums = (rop.GUARD_FUTURE_CONDITION, + rop.GUARD_NOT_INVALIDATED, + rop.DEBUG_MERGE_POINT) unrolled = [] - prohibit_opnums = (rop.GUARD_FUTURE_CONDITION, - rop.GUARD_NOT_INVALIDATED) - orig_jump_args = loop.jump.getarglist()[:] + + if align_unroll_once: + unroll_count += 1 + # it is assumed that #label_args == #jump_args label_arg_count = len(orig_jump_args) + label = loop.label + jump = loop.jump + new_label = loop.label for u in range(unroll_count): # fill the map with the renaming boxes. keys are boxes from the label for i in range(label_arg_count): - la = loop.label.getarg(i) - ja = loop.jump.getarg(i) + la = label.getarg(i) + ja = jump.getarg(i) ja = renamer.rename_box(ja) if la != ja: renamer.start_renaming(la, ja) @@ -294,9 +306,16 @@ # to be adjusted. 
rd_snapshot stores the live variables # that are needed to resume. if copied_op.is_guard(): - self.copy_guard_descr(renamer, copied_op) + self.copy_guard_descr(renamer, copied_op, align_unroll_once and u == 0) # unrolled.append(copied_op) + # + if align_unroll_once and u == 0: + descr = label.getdescr() + args = label.getarglist()[:] + new_label = ResOperation(rop.LABEL, args, descr) + renamer.rename(new_label) + # # the jump arguments have been changed # if label(iX) ... jump(i(X+1)) is called, at the next unrolled loop @@ -306,7 +325,25 @@ value = renamer.rename_box(arg) loop.jump.setarg(i, value) # - loop.operations = operations + unrolled + loop.label = new_label + if align_unroll_once: + for op in operations: + descr = op.getdescr() + if descr: + # the first step of the optimization will overwrite the descr + # with a compile loop version descr + # in the operations to align the loop load/store ops we want the original + # descr saved on the forwarded info + vinfo = copied_op.get_forwarded() + if vinfo: + assert isinstance(vinfo, VectorizationInfo) + descr = vinfo.get_old_descr() + assert descr is not None + op.setdescr(descr) + info.extra_same_as += operations + loop.operations = unrolled + else: + loop.operations = operations + unrolled def copy_guard_descr(self, renamer, copied_op): descr = copied_op.getdescr() @@ -316,6 +353,8 @@ failargs = renamer.rename_failargs(copied_op, clone=True) if not we_are_translated(): for arg in failargs: + if arg is None: + continue assert not arg.is_constant() copied_op.setfailargs(failargs) @@ -551,8 +590,12 @@ assert isinstance(op, GuardResOp) if op.getopnum() in (rop.GUARD_TRUE, rop.GUARD_FALSE): descr = CompileLoopVersionDescr() + olddescr = op.getdescr() + vinfo = op.get_forwarded() + assert isinstance(vinfo, VectorizationInfo) + vinfo.set_old_descr(olddescr) if op.getdescr(): - descr.copy_all_attributes_from(op.getdescr()) + descr.copy_all_attributes_from(olddescr) op.setdescr(descr) arglistcopy = 
loop.label.getarglist_copy() if not we_are_translated(): diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py --- a/rpython/jit/metainterp/resoperation.py +++ b/rpython/jit/metainterp/resoperation.py @@ -159,6 +159,7 @@ count = -1 def __init__(self, op): + self.olddescr = None if op is None: return from rpython.jit.metainterp.history import Const @@ -227,6 +228,12 @@ self.bytesize = bytesize self.signed = signed + def set_old_descr(self, descr): + self.olddescr = descr + + def get_old_descr(self): + return self.olddescr + class AbstractResOpOrInputArg(AbstractValue): _attrs_ = ('_forwarded',) diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py --- a/rpython/jit/metainterp/test/test_vector.py +++ b/rpython/jit/metainterp/test/test_vector.py @@ -117,7 +117,7 @@ rawstorage = RawStorage() va = rawstorage.new(la, type) vc = rawstorage.new(None, type, size=l) - self.meta_interp(f, [l*size, va, vc]) + self.meta_interp(f, [l*size, va, vc], vec=True) for i in range(l): c = raw_storage_getitem(type,vc,i*size) @@ -165,7 +165,7 @@ va = rawstorage.new(la, type) vb = rawstorage.new(lb, type) vc = rawstorage.new(None, type, size=l) - self.meta_interp(f, [l*size, va, vb, vc]) + self.meta_interp(f, [l*size, va, vb, vc], vec=True) for i in range(l): c = raw_storage_getitem(type,vc,i*size) @@ -223,7 +223,7 @@ va = rawstorage.new(la, type) vb = rawstorage.new(lb, type) vc = rawstorage.new(None, type, size=l) - self.meta_interp(f, [l*size, va, vb, vc]) + self.meta_interp(f, [l*size, va, vb, vc], vec=True) for i in range(l): c = raw_storage_getitem(type,vc,i*size) @@ -310,7 +310,7 @@ lltype.free(va, flavor='raw') lltype.free(vb, flavor='raw') return res - res = self.meta_interp(f, [i]) + res = self.meta_interp(f, [i], vec=True) assert res == f(i) == 3 def test_vec_max(self): @@ -334,7 +334,7 @@ i += 1 lltype.free(va, flavor='raw') return m - res = self.meta_interp(f, [30]) + res = 
self.meta_interp(f, [30], vec=True) assert res == f(30) == 128 @py.test.mark.parametrize('type,func,init,insert,at,count,breaks', @@ -383,7 +383,7 @@ nobreak = True lltype.free(va, flavor='raw') return not nobreak - res = self.meta_interp(f, [count]) + res = self.meta_interp(f, [count], vec=True) assert res == f(count) == breaks def _vec_reduce(self, strat, func, type, data): @@ -408,7 +408,7 @@ accum = data.draw(strat) rawstorage = RawStorage() va = rawstorage.new(la, type) - res = self.meta_interp(f, [accum, l*size, va]) + res = self.meta_interp(f, [accum, l*size, va], vec=True) assert isclose(rffi.cast(type, res), f(accum, l*size, va)) @@ -439,7 +439,7 @@ val = va[0] lltype.free(va, flavor='raw') return val - res = self.meta_interp(f, [60]) + res = self.meta_interp(f, [60], vec=True) assert res == f(60) == 34.5 def test_constant_expand_vec_all(self): @@ -457,7 +457,7 @@ val = va[0] lltype.free(va, flavor='raw') return val - res = self.meta_interp(f, [60], vec_all=True) + res = self.meta_interp(f, [60], vec=True, vec_all=True) assert res == f(60) == 34.5 @py.test.mark.parametrize('type,value', [(rffi.DOUBLE, 58.4547), @@ -476,7 +476,7 @@ val = va[d//2] lltype.free(va, flavor='raw') return val - res = self.meta_interp(f, [60,value]) + res = self.meta_interp(f, [60,value], vec=True) assert res == f(60,value) == value @py.test.mark.parametrize('vec,vec_all',[(False,True),(True,False),(True,True),(False,False)]) @@ -540,7 +540,7 @@ lltype.free(va, flavor='raw') lltype.free(vb, flavor='raw') return 0 - res = self.meta_interp(f, [i]) + res = self.meta_interp(f, [i], vec=True) assert res == f(i) @py.test.mark.parametrize('i,v1,v2',[(25,2.5,0.3),(25,2.5,0.3)]) @@ -566,7 +566,7 @@ for i in range(d): s += a[i] return s - res = self.meta_interp(f, [i,v1,v2], vec_all=True) + res = self.meta_interp(f, [i,v1,v2], vec=True, vec_all=True) # sum helps to generate the rounding error of floating points # return 69.999 ... 
instead of 70, (v1+v2)*i == 70.0 assert res == f(i,v1,v2) == sum([v1+v2]*i) @@ -593,7 +593,7 @@ free(vector_a) free(vector_b) return 0 - res = self.meta_interp(f, [size], vec_all=True) + res = self.meta_interp(f, [size], vec=True, vec_all=True) assert res == f(size) def test_max_byte(self): @@ -616,7 +616,7 @@ i += 1 free(vector_a) return max - res = self.meta_interp(f, [128], vec_all=True) + res = self.meta_interp(f, [128], vec=True, vec_all=True) assert res == f(128) @@ -673,7 +673,7 @@ c += {type_c_loadcast}(vector_c[i]) lltype.free(vector_c, flavor='raw') return c - res = self.meta_interp(f, [{size}], vec_all=True) + res = self.meta_interp(f, [{size}], vec=True, vec_all=True) assert res == f({size}) """ env = { @@ -717,7 +717,7 @@ # c += vector_c[i] lltype.free(vector_c, flavor='raw') return 0 - res = self.meta_interp(f, [22], vec_all=True) + res = self.meta_interp(f, [22], vec=True, vec_all=True) assert res == f(22) def test_guard_test_location_assert(self): @@ -739,7 +739,7 @@ i += 1 lltype.free(vector_a, flavor='raw') return breaks - res = self.meta_interp(f, [22], vec_all=True) + res = self.meta_interp(f, [22], vec=True, vec_all=True) assert res == f(22) def run_unpack(self, unpack, vector_type, assignments, float=True): _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit