Author: Richard Plangger <[email protected]>
Branch: ppc-vsx-support
Changeset: r87318:07a0b3bdccbe
Date: 2016-09-22 14:38 +0200
http://bitbucket.org/pypy/pypy/changeset/07a0b3bdccbe/
Log: try to align the loop (should improve ppc and s390x load/store
performance)
diff --git a/rpython/jit/backend/llgraph/runner.py
b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -879,6 +879,9 @@
def bh_vec_int_xor(self, vx, vy, count):
return [int(x) ^ int(y) for x,y in zip(vx,vy)]
+ def bh_vec_float_xor(self, vx, vy, count):
+ return [0.0 for x,y in zip(vx,vy)] # just used for clearing the vector
register
+
def bh_vec_cast_float_to_singlefloat(self, vx, count):
from rpython.rlib.rarithmetic import r_singlefloat
return
[longlong.singlefloat2int(r_singlefloat(longlong.getrealfloat(v)))
diff --git a/rpython/jit/backend/llsupport/vector_ext.py
b/rpython/jit/backend/llsupport/vector_ext.py
--- a/rpython/jit/backend/llsupport/vector_ext.py
+++ b/rpython/jit/backend/llsupport/vector_ext.py
@@ -218,6 +218,8 @@
class VectorExt(object):
+ should_align_unroll = True
+
def __init__(self):
self._enabled = False
self.register_size = 0 # in bytes
diff --git a/rpython/jit/backend/x86/vector_ext.py
b/rpython/jit/backend/x86/vector_ext.py
--- a/rpython/jit/backend/x86/vector_ext.py
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -42,6 +42,9 @@
return "<TempVector At %s>" % (id(self),)
class X86VectorExt(VectorExt):
+
+ should_align_unroll = True
+
def setup_once(self, asm):
if detect_feature.detect_sse4_1():
self.enable(16, accum=True)
diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py
b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -943,10 +943,6 @@
"""
exec py.code.Source(multiplicative_func_source
.format(name='INT_MUL', op='*', tgt='mul', cop='*')).compile()
- #exec py.code.Source(multiplicative_func_source
- # .format(name='INT_PY_DIV', op='*', tgt='div', cop='/')).compile()
- #exec py.code.Source(multiplicative_func_source
- # .format(name='UINT_FLOORDIV', op='*', tgt='div',
cop='/')).compile()
del multiplicative_func_source
array_access_source = """
diff --git a/rpython/jit/metainterp/optimizeopt/vector.py
b/rpython/jit/metainterp/optimizeopt/vector.py
--- a/rpython/jit/metainterp/optimizeopt/vector.py
+++ b/rpython/jit/metainterp/optimizeopt/vector.py
@@ -234,7 +234,10 @@
# unroll
self.unroll_count = self.get_unroll_count(vsize)
- self.unroll_loop_iterations(loop, self.unroll_count)
+ align_unroll = self.unroll_count==1 and \
+ self.vector_ext.should_align_unroll
+ self.unroll_loop_iterations(info, loop, self.unroll_count,
+ align_unroll_once=align_unroll)
# vectorize
graph = DependencyGraph(loop)
@@ -256,23 +259,32 @@
return loop.finaloplist(jitcell_token=jitcell_token,
reset_label_token=False)
- def unroll_loop_iterations(self, loop, unroll_count):
- """ Unroll the loop X times. unroll_count + 1 = unroll_factor """
+ def unroll_loop_iterations(self, info, loop, unroll_count,
align_unroll_once=False):
+ """ Unroll the loop `unroll_count` times. There can be an additional
unroll step
+ if alignment might benefit """
numops = len(loop.operations)
renamer = Renamer()
operations = loop.operations
+ orig_jump_args = loop.jump.getarglist()[:]
+ prohibit_opnums = (rop.GUARD_FUTURE_CONDITION,
+ rop.GUARD_NOT_INVALIDATED,
+ rop.DEBUG_MERGE_POINT)
unrolled = []
- prohibit_opnums = (rop.GUARD_FUTURE_CONDITION,
- rop.GUARD_NOT_INVALIDATED)
- orig_jump_args = loop.jump.getarglist()[:]
+
+ if align_unroll_once:
+ unroll_count += 1
+
# it is assumed that #label_args == #jump_args
label_arg_count = len(orig_jump_args)
+ label = loop.label
+ jump = loop.jump
+ new_label = loop.label
for u in range(unroll_count):
# fill the map with the renaming boxes. keys are boxes from the
label
for i in range(label_arg_count):
- la = loop.label.getarg(i)
- ja = loop.jump.getarg(i)
+ la = label.getarg(i)
+ ja = jump.getarg(i)
ja = renamer.rename_box(ja)
if la != ja:
renamer.start_renaming(la, ja)
@@ -294,9 +306,16 @@
# to be adjusted. rd_snapshot stores the live variables
# that are needed to resume.
if copied_op.is_guard():
- self.copy_guard_descr(renamer, copied_op)
+ self.copy_guard_descr(renamer, copied_op,
align_unroll_once and u == 0)
#
unrolled.append(copied_op)
+ #
+ if align_unroll_once and u == 0:
+ descr = label.getdescr()
+ args = label.getarglist()[:]
+ new_label = ResOperation(rop.LABEL, args, descr)
+ renamer.rename(new_label)
+ #
# the jump arguments have been changed
# if label(iX) ... jump(i(X+1)) is called, at the next unrolled loop
@@ -306,7 +325,25 @@
value = renamer.rename_box(arg)
loop.jump.setarg(i, value)
#
- loop.operations = operations + unrolled
+ loop.label = new_label
+ if align_unroll_once:
+ for op in operations:
+ descr = op.getdescr()
+ if descr:
+ # the first step of the optimization will overwrite the
descr
+ # with a compile loop version descr
+ # in the operations to align the loop load/store ops we
want the original
+ # descr saved on the forwarded info
+ vinfo = copied_op.get_forwarded()
+ if vinfo:
+ assert isinstance(vinfo, VectorizationInfo)
+ descr = vinfo.get_old_descr()
+ assert descr is not None
+ op.setdescr(descr)
+ info.extra_same_as += operations
+ loop.operations = unrolled
+ else:
+ loop.operations = operations + unrolled
def copy_guard_descr(self, renamer, copied_op):
descr = copied_op.getdescr()
@@ -316,6 +353,8 @@
failargs = renamer.rename_failargs(copied_op, clone=True)
if not we_are_translated():
for arg in failargs:
+ if arg is None:
+ continue
assert not arg.is_constant()
copied_op.setfailargs(failargs)
@@ -551,8 +590,12 @@
assert isinstance(op, GuardResOp)
if op.getopnum() in (rop.GUARD_TRUE, rop.GUARD_FALSE):
descr = CompileLoopVersionDescr()
+ olddescr = op.getdescr()
+ vinfo = op.get_forwarded()
+ assert isinstance(vinfo, VectorizationInfo)
+ vinfo.set_old_descr(olddescr)
if op.getdescr():
- descr.copy_all_attributes_from(op.getdescr())
+ descr.copy_all_attributes_from(olddescr)
op.setdescr(descr)
arglistcopy = loop.label.getarglist_copy()
if not we_are_translated():
diff --git a/rpython/jit/metainterp/resoperation.py
b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -159,6 +159,7 @@
count = -1
def __init__(self, op):
+ self.olddescr = None
if op is None:
return
from rpython.jit.metainterp.history import Const
@@ -227,6 +228,12 @@
self.bytesize = bytesize
self.signed = signed
+ def set_old_descr(self, descr):
+ self.olddescr = descr
+
+ def get_old_descr(self):
+ return self.olddescr
+
class AbstractResOpOrInputArg(AbstractValue):
_attrs_ = ('_forwarded',)
diff --git a/rpython/jit/metainterp/test/test_vector.py
b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -117,7 +117,7 @@
rawstorage = RawStorage()
va = rawstorage.new(la, type)
vc = rawstorage.new(None, type, size=l)
- self.meta_interp(f, [l*size, va, vc])
+ self.meta_interp(f, [l*size, va, vc], vec=True)
for i in range(l):
c = raw_storage_getitem(type,vc,i*size)
@@ -165,7 +165,7 @@
va = rawstorage.new(la, type)
vb = rawstorage.new(lb, type)
vc = rawstorage.new(None, type, size=l)
- self.meta_interp(f, [l*size, va, vb, vc])
+ self.meta_interp(f, [l*size, va, vb, vc], vec=True)
for i in range(l):
c = raw_storage_getitem(type,vc,i*size)
@@ -223,7 +223,7 @@
va = rawstorage.new(la, type)
vb = rawstorage.new(lb, type)
vc = rawstorage.new(None, type, size=l)
- self.meta_interp(f, [l*size, va, vb, vc])
+ self.meta_interp(f, [l*size, va, vb, vc], vec=True)
for i in range(l):
c = raw_storage_getitem(type,vc,i*size)
@@ -310,7 +310,7 @@
lltype.free(va, flavor='raw')
lltype.free(vb, flavor='raw')
return res
- res = self.meta_interp(f, [i])
+ res = self.meta_interp(f, [i], vec=True)
assert res == f(i) == 3
def test_vec_max(self):
@@ -334,7 +334,7 @@
i += 1
lltype.free(va, flavor='raw')
return m
- res = self.meta_interp(f, [30])
+ res = self.meta_interp(f, [30], vec=True)
assert res == f(30) == 128
@py.test.mark.parametrize('type,func,init,insert,at,count,breaks',
@@ -383,7 +383,7 @@
nobreak = True
lltype.free(va, flavor='raw')
return not nobreak
- res = self.meta_interp(f, [count])
+ res = self.meta_interp(f, [count], vec=True)
assert res == f(count) == breaks
def _vec_reduce(self, strat, func, type, data):
@@ -408,7 +408,7 @@
accum = data.draw(strat)
rawstorage = RawStorage()
va = rawstorage.new(la, type)
- res = self.meta_interp(f, [accum, l*size, va])
+ res = self.meta_interp(f, [accum, l*size, va], vec=True)
assert isclose(rffi.cast(type, res), f(accum, l*size, va))
@@ -439,7 +439,7 @@
val = va[0]
lltype.free(va, flavor='raw')
return val
- res = self.meta_interp(f, [60])
+ res = self.meta_interp(f, [60], vec=True)
assert res == f(60) == 34.5
def test_constant_expand_vec_all(self):
@@ -457,7 +457,7 @@
val = va[0]
lltype.free(va, flavor='raw')
return val
- res = self.meta_interp(f, [60], vec_all=True)
+ res = self.meta_interp(f, [60], vec=True, vec_all=True)
assert res == f(60) == 34.5
@py.test.mark.parametrize('type,value', [(rffi.DOUBLE, 58.4547),
@@ -476,7 +476,7 @@
val = va[d//2]
lltype.free(va, flavor='raw')
return val
- res = self.meta_interp(f, [60,value])
+ res = self.meta_interp(f, [60,value], vec=True)
assert res == f(60,value) == value
@py.test.mark.parametrize('vec,vec_all',[(False,True),(True,False),(True,True),(False,False)])
@@ -540,7 +540,7 @@
lltype.free(va, flavor='raw')
lltype.free(vb, flavor='raw')
return 0
- res = self.meta_interp(f, [i])
+ res = self.meta_interp(f, [i], vec=True)
assert res == f(i)
@py.test.mark.parametrize('i,v1,v2',[(25,2.5,0.3),(25,2.5,0.3)])
@@ -566,7 +566,7 @@
for i in range(d):
s += a[i]
return s
- res = self.meta_interp(f, [i,v1,v2], vec_all=True)
+ res = self.meta_interp(f, [i,v1,v2], vec=True, vec_all=True)
# sum helps to generate the rounding error of floating points
# return 69.999 ... instead of 70, (v1+v2)*i == 70.0
assert res == f(i,v1,v2) == sum([v1+v2]*i)
@@ -593,7 +593,7 @@
free(vector_a)
free(vector_b)
return 0
- res = self.meta_interp(f, [size], vec_all=True)
+ res = self.meta_interp(f, [size], vec=True, vec_all=True)
assert res == f(size)
def test_max_byte(self):
@@ -616,7 +616,7 @@
i += 1
free(vector_a)
return max
- res = self.meta_interp(f, [128], vec_all=True)
+ res = self.meta_interp(f, [128], vec=True, vec_all=True)
assert res == f(128)
@@ -673,7 +673,7 @@
c += {type_c_loadcast}(vector_c[i])
lltype.free(vector_c, flavor='raw')
return c
- res = self.meta_interp(f, [{size}], vec_all=True)
+ res = self.meta_interp(f, [{size}], vec=True, vec_all=True)
assert res == f({size})
"""
env = {
@@ -717,7 +717,7 @@
# c += vector_c[i]
lltype.free(vector_c, flavor='raw')
return 0
- res = self.meta_interp(f, [22], vec_all=True)
+ res = self.meta_interp(f, [22], vec=True, vec_all=True)
assert res == f(22)
def test_guard_test_location_assert(self):
@@ -739,7 +739,7 @@
i += 1
lltype.free(vector_a, flavor='raw')
return breaks
- res = self.meta_interp(f, [22], vec_all=True)
+ res = self.meta_interp(f, [22], vec=True, vec_all=True)
assert res == f(22)
def run_unpack(self, unpack, vector_type, assignments, float=True):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit