Author: Richard Plangger <[email protected]>
Branch: ppc-vsx-support
Changeset: r85247:21826e946450
Date: 2016-06-20 17:43 +0200
http://bitbucket.org/pypy/pypy/changeset/21826e946450/
Log: implement unaligned vector load in software; there is no way around
that for integer vector loads. The store path still has a flaw, but it
seems to work for integer add.
diff --git a/rpython/jit/backend/ppc/codebuilder.py
b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -63,7 +63,11 @@
XLL = Form("LL", "XO1")
XX1 = Form("fvrT", "rA", "rB", "XO1")
XX3 = Form("fvrT", "fvrA", "fvrB", "XO9")
-VX = Form("lvrT", "lvrA", "lvrB", "XO8")
+XV = Form("ivrT", "rA", "rB", "XO1")
+VX = Form("ivrT", "ivrA", "ivrB", "XO8")
+VXI = Form("ivrT", "SIM", "XO8")
+VA = Form("ivrT", "ivrA", "ivrB", "ivrC", "XO10")
+
MI = Form("rA", "rS", "SH", "MB", "ME", "Rc")
MB = Form("rA", "rS", "rB", "MB", "ME", "Rc")
@@ -609,8 +613,27 @@
# INTEGER
# -------
+
+ # load & store
+ lvx = XV(31, XO1=103)
+ stvx = XV(31, XO1=231)
+
+ # arith & logic
vaddudm = VX(4, XO8=192)
+ # shift, perm and select
+ lvsl = XV(31, XO1=6)
+ lvsr = XV(31, XO1=38)
+ vperm = VA(4, XO10=43)
+ vsel = VA(4, XO10=42)
+ vspltisb = VXI(4, XO8=780)
+
+
+
+
+
+
+
class PPCAssembler(BasicPPCAssembler, PPCVSXAssembler):
BA = BasicPPCAssembler
diff --git a/rpython/jit/backend/ppc/ppc_field.py
b/rpython/jit/backend/ppc/ppc_field.py
--- a/rpython/jit/backend/ppc/ppc_field.py
+++ b/rpython/jit/backend/ppc/ppc_field.py
@@ -48,11 +48,12 @@
"fvrB": (16, 31, 'unsigned', regname._V, 'overlap'),
# low vector register T (low in a sense:
# can only address 32 vector registers)
- "lvrT": (6, 10, 'unsigned', regname._V),
+ "ivrT": (6, 10, 'unsigned', regname._V),
# low vector register A
- "lvrA": (11, 15, 'unsigned', regname._V),
+ "ivrA": (11, 15, 'unsigned', regname._V),
# low vector register B
- "lvrB": (16, 20, 'unsigned', regname._V),
+ "ivrB": (16, 20, 'unsigned', regname._V),
+ "ivrC": (21, 25, 'unsigned', regname._V),
"XO1": (21, 30),
"XO2": (22, 30),
"XO3": (26, 30),
@@ -62,7 +63,9 @@
"XO7": (27, 30),
"XO8": (21, 31),
"XO9": (21, 28),
+ "XO10": (26, 31),
"LL": ( 9, 10),
+ "SIM": (11, 15),
}
diff --git a/rpython/jit/backend/ppc/regalloc.py
b/rpython/jit/backend/ppc/regalloc.py
--- a/rpython/jit/backend/ppc/regalloc.py
+++ b/rpython/jit/backend/ppc/regalloc.py
@@ -10,7 +10,7 @@
from rpython.jit.backend.ppc.helper.regalloc import _check_imm_arg,
check_imm_box
from rpython.jit.backend.ppc.helper import regalloc as helper
from rpython.jit.metainterp.history import (Const, ConstInt, ConstFloat,
ConstPtr,
- INT, REF, FLOAT, VOID)
+ INT, REF, FLOAT, VOID, VECTOR)
from rpython.jit.metainterp.history import JitCellToken, TargetToken
from rpython.jit.metainterp.resoperation import rop
from rpython.jit.backend.ppc import locations
@@ -50,6 +50,11 @@
def __repr__(self):
return "<TempFloat at %s>" % (id(self),)
+class TempVector(TempVar):
+ type = VECTOR
+
+ def __repr__(self):
+ return "<TempVector at %s>" % (id(self),)
class FPRegisterManager(RegisterManager):
all_regs = r.MANAGED_FP_REGS
@@ -136,9 +141,9 @@
self.temp_boxes.append(box)
return reg
-class VectorRegisterManager(RegisterManager):
- all_regs = r.MANAGED_VECTOR_REGS
- box_types = [INT, FLOAT]
+class IntegerVectorRegisterManager(RegisterManager):
+ all_regs = r.MANAGED_INTEGER_VECTOR_REGS
+ box_types = [INT]
save_around_call_regs = [] # ??? lookup the ABI
assert set(save_around_call_regs).issubset(all_regs)
@@ -148,6 +153,30 @@
def ensure_reg(self, box):
raise NotImplementedError
+ def get_scratch_reg(self):
+ box = TempInt()
+ reg = self.force_allocate_reg(box, forbidden_vars=self.temp_boxes)
+ self.temp_boxes.append(box)
+ return reg
+
+class FloatVectorRegisterManager(IntegerVectorRegisterManager):
+ all_regs = r.MANAGED_FLOAT_VECTOR_REGS
+ box_types = [FLOAT]
+ save_around_call_regs = [] # ??? lookup the ABI
+ assert set(save_around_call_regs).issubset(all_regs)
+
+ def __init__(self, longevity, frame_manager=None, assembler=None):
+ RegisterManager.__init__(self, longevity, frame_manager, assembler)
+
+ def ensure_reg(self, box):
+ raise NotImplementedError
+
+ def get_scratch_reg(self):
+ box = TempFloat()
+ reg = self.force_allocate_reg(box, forbidden_vars=self.temp_boxes)
+ self.temp_boxes.append(box)
+ return reg
+
class PPCFrameManager(FrameManager):
def __init__(self, base_ofs):
FrameManager.__init__(self)
@@ -191,7 +220,9 @@
assembler = self.assembler)
self.fprm = FPRegisterManager(self.longevity, frame_manager = self.fm,
assembler = self.assembler)
- self.vrm = VectorRegisterManager(self.longevity, frame_manager =
self.fm,
+ self.vrm = FloatVectorRegisterManager(self.longevity, frame_manager =
self.fm,
+ assembler = self.assembler)
+ self.ivrm = IntegerVectorRegisterManager(self.longevity, frame_manager
= self.fm,
assembler = self.assembler)
return operations
@@ -255,11 +286,15 @@
def possibly_free_var(self, var):
if var is not None:
if var.type == FLOAT:
- self.fprm.possibly_free_var(var)
- elif var.is_vector() and var.type != VOID:
- self.vrm.possibly_free_var(var)
- else:
- self.rm.possibly_free_var(var)
+ if var.is_vector():
+ self.vrm.possibly_free_var(var)
+ else:
+ self.fprm.possibly_free_var(var)
+ elif var.type == INT:
+ if var.is_vector():
+ self.ivrm.possibly_free_var(var)
+ else:
+ self.rm.possibly_free_var(var)
def possibly_free_vars(self, vars):
for var in vars:
@@ -303,6 +338,7 @@
self.rm.position = i
self.fprm.position = i
self.vrm.position = i
+ self.ivrm.position = i
opnum = op.opnum
if rop.has_no_side_effect(opnum) and op not in self.longevity:
i += 1
@@ -311,12 +347,16 @@
#
for j in range(op.numargs()):
box = op.getarg(j)
- if box.is_vector():
- self.vrm.temp_boxes.append(box)
- elif box.type != FLOAT:
- self.rm.temp_boxes.append(box)
+ if box.type != FLOAT:
+ if box.is_vector():
+ self.ivrm.temp_boxes.append(box)
+ else:
+ self.rm.temp_boxes.append(box)
else:
- self.fprm.temp_boxes.append(box)
+ if box.is_vector():
+ self.vrm.temp_boxes.append(box)
+ else:
+ self.fprm.temp_boxes.append(box)
#
if not we_are_translated() and opnum == rop.FORCE_SPILL:
self._consider_force_spill(op)
@@ -328,6 +368,7 @@
self.rm._check_invariants()
self.fprm._check_invariants()
self.vrm._check_invariants()
+ self.ivrm._check_invariants()
if self.assembler.mc.get_relative_pos() > self.limit_loop_break:
self.assembler.break_long_loop()
self.limit_loop_break = (self.assembler.mc.get_relative_pos() +
@@ -439,6 +480,7 @@
self.rm.free_temp_vars()
self.fprm.free_temp_vars()
self.vrm.free_temp_vars()
+ self.ivrm.free_temp_vars()
# ******************************************************
# * P R E P A R E O P E R A T I O N S *
diff --git a/rpython/jit/backend/ppc/register.py
b/rpython/jit/backend/ppc/register.py
--- a/rpython/jit/backend/ppc/register.py
+++ b/rpython/jit/backend/ppc/register.py
@@ -3,7 +3,8 @@
ALL_REGS = [RegisterLocation(i) for i in range(32)]
ALL_FLOAT_REGS = [FPRegisterLocation(i) for i in range(32)]
-ALL_VECTOR_REGS = [VectorRegisterLocation(i) for i in range(64)]
+ALL_INTEGER_VECTOR_REGS = [VectorRegisterLocation(i) for i in range(32)]
+ALL_FLOAT_VECTOR_REGS = [VectorRegisterLocation(i) for i in range(64)]
r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, r16,\
r17, r18, r19, r20, r21, r22, r23, r24, r25, r26, r27, r28, r29, r30, r31\
@@ -13,12 +14,18 @@
f17, f18, f19, f20, f21, f22, f23, f24, f25, f26, f27, f28, f29, f30, f31\
= ALL_FLOAT_REGS
+ivr0, ivr1, ivr2, ivr3, ivr4, ivr5, ivr6, ivr7, ivr8, ivr9, ivr10, ivr11,
ivr12,\
+ ivr13, ivr14, ivr15, ivr16, ivr17, ivr18, ivr19, ivr20, ivr21, ivr22,
ivr23,\
+ ivr24, ivr25, ivr26, ivr27, ivr28, ivr29, ivr30, ivr31\
+ = ALL_FLOAT_REGS
+
vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13, \
vr14, vr15, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, vr24, vr25, \
vr26, vr27, vr28, vr29, vr30, vr31, vr32, vr33, vr34, vr35, vr36, vr37, \
vr38, vr39, vr40, vr41, vr42, vr43, vr44, vr45, vr46, vr47, vr48, \
vr49, vr50, vr51, vr52, vr53, vr54, vr55, vr56, vr57, vr58, vr59, vr60, \
- vr61, vr62, vr63 = ALL_VECTOR_REGS
+ vr61, vr62, vr63 = ALL_FLOAT_VECTOR_REGS
+
NONVOLATILES = [r14, r15, r16, r17, r18, r19, r20, r21, r22, r23,
r24, r25, r26, r27, r28, r29, r30, r31]
@@ -51,7 +58,8 @@
MANAGED_FP_REGS = VOLATILES_FLOAT #+ NONVOLATILES_FLOAT
-MANAGED_VECTOR_REGS = ALL_VECTOR_REGS
+MANAGED_FLOAT_VECTOR_REGS = ALL_FLOAT_VECTOR_REGS
+MANAGED_INTEGER_VECTOR_REGS = ALL_INTEGER_VECTOR_REGS
assert RCS1 in MANAGED_REGS and RCS1 in NONVOLATILES
assert RCS2 in MANAGED_REGS and RCS2 in NONVOLATILES
diff --git a/rpython/jit/backend/ppc/vector_ext.py
b/rpython/jit/backend/ppc/vector_ext.py
--- a/rpython/jit/backend/ppc/vector_ext.py
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -36,24 +36,30 @@
emit_vec_getarrayitem_gc_i = _emit_getitem
emit_vec_getarrayitem_gc_f = _emit_getitem
- def _emit_load(self, op, arglocs, regalloc):
- resloc, base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc =
arglocs
+ def emit_vec_raw_load_f(self, op, arglocs, regalloc):
+ resloc, baseloc, indexloc, size_loc, ofs, integer_loc, aligned_loc =
arglocs
+ #src_addr = addr_add(baseloc, ofs_loc, ofs.value, 0)
+ assert ofs.value == 0
+ itemsize = size_loc.value
+ if itemsize == 4:
+ self.mc.lxvw4x(resloc.value, indexloc.value, baseloc.value)
+ elif itemsize == 8:
+ self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
+
+ def emit_vec_raw_load_i(self, op, arglocs, regalloc):
+ resloc, baseloc, indexloc, size_loc, ofs, \
+ Vhiloc, Vloloc, Vploc, tloc = arglocs
#src_addr = addr_add(base_loc, ofs_loc, ofs.value, 0)
assert ofs.value == 0
- self._vec_load(resloc, base_loc, ofs_loc, integer_loc.value,
- size_loc.value, aligned_loc.value)
-
- emit_vec_raw_load_i = _emit_load
- emit_vec_raw_load_f = _emit_load
-
- def _vec_load(self, resloc, baseloc, indexloc, integer, itemsize, aligned):
- if integer:
- raise NotImplementedError
- else:
- if itemsize == 4:
- self.mc.lxvw4x(resloc.value, indexloc.value, baseloc.value)
- elif itemsize == 8:
- self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
+ Vlo = Vloloc.value
+ Vhi = Vhiloc.value
+ self.mc.lvx(Vhi, indexloc.value, baseloc.value)
+ Vp = Vploc.value
+ t = tloc.value
+ self.mc.lvsl(Vp, indexloc.value, baseloc.value)
+ self.mc.addi(t, baseloc.value, 16)
+ self.mc.lvx(Vlo, indexloc.value, t)
+ self.mc.vperm(resloc.value, Vhi, Vlo, Vp)
def _emit_vec_setitem(self, op, arglocs, regalloc):
# prepares item scale (raw_store does not)
@@ -72,11 +78,41 @@
#dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, 0)
assert baseofs.value == 0
self._vec_store(baseloc, ofsloc, valueloc, integer_loc.value,
- size_loc.value, aligned_loc.value)
+ size_loc.value, regalloc)
- def _vec_store(self, baseloc, indexloc, valueloc, integer, itemsize,
aligned):
+ def _vec_store(self, baseloc, indexloc, valueloc, integer, itemsize,
regalloc):
if integer:
- raise NotImplementedError
+ Vloloc = regalloc.ivrm.get_scratch_reg()
+ Vhiloc = regalloc.ivrm.get_scratch_reg()
+ Vploc = regalloc.ivrm.get_scratch_reg()
+ tloc = regalloc.rm.get_scratch_reg()
+ V1sloc = regalloc.ivrm.get_scratch_reg()
+ V1s = V1sloc.value
+ V0sloc = regalloc.ivrm.get_scratch_reg()
+ V0s = V0sloc.value
+ Vmaskloc = regalloc.ivrm.get_scratch_reg()
+ Vmask = Vmaskloc.value
+ Vlo = Vhiloc.value
+ Vhi = Vloloc.value
+ Vp = Vploc.value
+ t = tloc.value
+ Vs = valueloc.value
+ # UFF, that is a lot of code for storing unaligned!
+ # probably a lot of room for improvement (not locally,
+ # but in general for the algorithm)
+ self.mc.lvx(Vhi, indexloc.value, baseloc.value)
+ self.mc.lvsr(Vp, indexloc.value, baseloc.value)
+ self.mc.addi(t, baseloc.value, 16)
+ self.mc.lvx(Vlo, indexloc.value, t)
+ self.mc.vspltisb(V1s, -1)
+ self.mc.vspltisb(V0s, 0)
+ self.mc.vperm(Vmask, V0s, V1s, Vp)
+ self.mc.vperm(Vs, Vs, Vs, Vp)
+ self.mc.vsel(Vlo, Vs, Vlo, Vmask)
+ self.mc.vsel(Vhi, Vhi, Vs, Vmask)
+ self.mc.stvx(Vlo, indexloc.value, baseloc.value)
+ self.mc.addi(t, baseloc.value, -16)
+ self.mc.stvx(Vhi, indexloc.value, t)
else:
if itemsize == 4:
self.mc.stxvw4x(valueloc.value, indexloc.value, baseloc.value)
@@ -94,7 +130,6 @@
elif size == 4:
raise NotImplementedError
elif size == 8:
- raise NotImplementedError # need value in another register!
self.mc.vaddudm(resloc.value, loc0.value, loc1.value)
def emit_vec_float_add(self, op, arglocs, resloc):
@@ -531,12 +566,18 @@
def force_allocate_vector_reg(self, op):
forbidden_vars = self.vrm.temp_boxes
- return self.vrm.force_allocate_reg(op, forbidden_vars)
+ if op.type == FLOAT:
+ return self.vrm.force_allocate_reg(op, forbidden_vars)
+ else:
+ return self.ivrm.force_allocate_reg(op, forbidden_vars)
def ensure_vector_reg(self, box):
- loc = self.vrm.make_sure_var_in_reg(box,
- forbidden_vars=self.vrm.temp_boxes)
- return loc
+ if box.type == FLOAT:
+ return self.vrm.make_sure_var_in_reg(box,
+ forbidden_vars=self.vrm.temp_boxes)
+ else:
+ return self.ivrm.make_sure_var_in_reg(box,
+ forbidden_vars=self.ivrm.temp_boxes)
def _prepare_load(self, op):
descr = op.getdescr()
@@ -555,11 +596,30 @@
return [result_loc, base_loc, ofs_loc, imm(itemsize), imm(ofs),
imm(integer), imm(aligned)]
- prepare_vec_getarrayitem_raw_i = _prepare_load
+ def _prepare_load_i(self, op):
+ descr = op.getdescr()
+ assert isinstance(descr, ArrayDescr)
+ assert not descr.is_array_of_pointers() and \
+ not descr.is_array_of_structs()
+ itemsize, ofs, _ = unpack_arraydescr(descr)
+ args = op.getarglist()
+ a0 = op.getarg(0)
+ a1 = op.getarg(1)
+ base_loc = self.ensure_reg(a0)
+ ofs_loc = self.ensure_reg(a1)
+ result_loc = self.force_allocate_vector_reg(op)
+ tloc = self.rm.get_scratch_reg()
+ Vhiloc = self.ivrm.get_scratch_reg()
+ Vloloc = self.ivrm.get_scratch_reg()
+ Vploc = self.ivrm.get_scratch_reg()
+ return [result_loc, base_loc, ofs_loc, imm(itemsize), imm(ofs),
+ Vhiloc, Vloloc, Vploc, tloc]
+
+ prepare_vec_getarrayitem_raw_i = _prepare_load_i
prepare_vec_getarrayitem_raw_f = _prepare_load
- prepare_vec_getarrayitem_gc_i = _prepare_load
+ prepare_vec_getarrayitem_gc_i = _prepare_load_i
prepare_vec_getarrayitem_gc_f = _prepare_load
- prepare_vec_raw_load_i = _prepare_load
+ prepare_vec_raw_load_i = _prepare_load_i
prepare_vec_raw_load_f = _prepare_load
def prepare_vec_arith(self, op):
diff --git a/rpython/jit/metainterp/test/test_vector.py
b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -61,6 +61,15 @@
integers_64bit = st.integers(min_value=-2**63, max_value=2**63-1)
floats = st.floats()
+def rdiv(v1,v2):
+ # TODO unused, interpeting this on top of llgraph does not work correctly
+ try:
+ return v1 / v2
+ except ZeroDivisionError:
+ if v1 == v2 == 0.0:
+ return rfloat.NAN
+ return rfloat.copysign(rfloat.INFINITY, v1 * v2)
+
class VectorizeTests:
enable_opts =
'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll'
@@ -75,19 +84,9 @@
type_system=self.type_system,
vec=vec, vec_all=vec_all)
- @staticmethod
- def rdiv(v1,v2):
- try:
- return v1 / v2
- except ZeroDivisionError:
- if v1 == v2 == 0.0:
- return rfloat.NAN
- return rfloat.copysign(rfloat.INFINITY, v1 * v2)
-
@given(data=st.data())
@pytest.mark.parametrize('func', [lambda a,b: a+b,
- lambda a,b: a*b, lambda a,b: a-b,
- lambda a,b: VectorizeTests.rdiv(a,b)])
+ lambda a,b: a*b, lambda a,b: a-b])
def test_vector_simple_float(self, func, data):
func = always_inline(func)
@@ -144,7 +143,7 @@
rawstorage = RawStorage()
#la = data.draw(st.lists(integers_64bit, min_size=10, max_size=150))
- la = [0] * 10
+ la = [1] * 10
l = len(la)
#lb = data.draw(st.lists(integers_64bit, min_size=l, max_size=l))
lb = [0] * 10
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit