Author: Richard Plangger <planri...@gmail.com>
Branch: zarch-simd-support
Changeset: r87122:8022d6fb623c
Date: 2016-09-15 12:09 +0200
http://bitbucket.org/pypy/pypy/changeset/8022d6fb623c/

Log: more vec op backend impl. missing, pack/unpack for i/f

diff --git a/rpython/jit/backend/zarch/assembler.py b/rpython/jit/backend/zarch/assembler.py
--- a/rpython/jit/backend/zarch/assembler.py
+++ b/rpython/jit/backend/zarch/assembler.py
@@ -1542,8 +1542,8 @@
         pmc.overwrite()
 
 def notimplemented_op(asm, op, arglocs, regalloc):
+    msg = "[zarch/asm] %s not implemented\n" % op.getopname()
     if we_are_translated():
-        msg = "[ZARCH/asm] %s not implemented\n" % op.getopname()
         llop.debug_print(lltype.Void, msg)
     raise NotImplementedError(msg)
 
diff --git a/rpython/jit/backend/zarch/instruction_builder.py b/rpython/jit/backend/zarch/instruction_builder.py
--- a/rpython/jit/backend/zarch/instruction_builder.py
+++ b/rpython/jit/backend/zarch/instruction_builder.py
@@ -486,6 +486,21 @@
         self.writechar(opcode2)
     return encode_vrr_a
 
+def build_vrr_b(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m'):
+    @builder.arguments(argtypes)
+    def encode_vrr_b(self, v1, v2, v3, mask4, mask5):
+        self.writechar(opcode1)
+        rbx = (v1 >= 16) << 3
+        rbx |= (v2 >= 16) << 2
+        rbx |= (v3 >= 16) << 1
+        byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)
+        self.writechar(chr(byte))
+        self.writechar(chr((v3 & BIT_MASK_4) << 4))
+        self.writechar(chr((mask5 & BIT_MASK_4) << 4))
+        self.writechar(chr((mask4 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
+        self.writechar(opcode2)
+    return encode_vrr_b
+
 def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m,m'):
     @builder.arguments(argtypes)
     def encode_vrr_c(self, v1, v2, v3, mask4=0, mask5=0, mask6=0):
diff --git a/rpython/jit/backend/zarch/instructions.py b/rpython/jit/backend/zarch/instructions.py
--- a/rpython/jit/backend/zarch/instructions.py
+++ b/rpython/jit/backend/zarch/instructions.py
@@ -295,6 +295,7 @@
 
 vector_mnemonic_codes = {
     'VL': ('vrx', ['\xE7','\x06'], 'v,bid'),
+    'VLREP': ('vrx', ['\xE7','\x05']),
     'VLR': ('vrr_a', ['\xE7','\x56'], 'v,v'),
     'VST': ('vrx', ['\xE7','\x0E'], 'v,bid'),
 
@@ -318,6 +319,7 @@
 
     # conversion
     'VCDG': ('vrr_a', ['\xE7','\xC3']),
+    'VCGD': ('vrr_a', ['\xE7','\xC2']),
 
     # compare, sign, ...
     'VFPSO': ('vrr_a', ['\xE7','\xCC']),
@@ -326,6 +328,12 @@
 
     'VPERM': ('vrr_e', ['\xE7','\x8C'], 'v,v,v,v'),
     'VREPI': ('vri_a', ['\xE7','\x45']),
+    'VCEQ': ('vrr_b', ['\xE7','\xF8']),
+
+    # pack, merge, shift, ...
+    'VMRL': ('vrr_c', ['\xE7','\x60'], 'v,v,v,m'),
+    'VMRH': ('vrr_c', ['\xE7','\x61'], 'v,v,v,m'),
+
     # '': ('', ['','']),
 }
diff --git a/rpython/jit/backend/zarch/vector_ext.py b/rpython/jit/backend/zarch/vector_ext.py
--- a/rpython/jit/backend/zarch/vector_ext.py
+++ b/rpython/jit/backend/zarch/vector_ext.py
@@ -220,85 +220,44 @@
         flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, resloc)
 
     def emit_vec_float_xor(self, op, arglocs, regalloc):
-        resloc, l0, l1, sizeloc = arglocs
-        res = resloc.value
-        r0 = l0.value
-        r1 = l1.value
-        self.mc.xxlxor(res, r0, r1)
+        resloc, loc0, loc1, sizeloc = arglocs
+        self.mc.VX(resloc, loc0, loc1)
 
     def emit_vec_float_ne(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        resloc, loc1, loc2, sizeloc = arglocs
+        resloc, loc0, loc1, sizeloc = arglocs
         size = sizeloc.value
-        tmp = regalloc.vrm.get_scratch_reg().value
-        offloc = regalloc.rm.get_scratch_reg()
-        off = offloc.value
-        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        if size == 4:
-            self.mc.xvcmpeqspx(tmp, loc1.value, loc2.value)
-            self.mc.stxvw4x(tmp, off, r.SP.value)
-        elif size == 8:
-            self.mc.xvcmpeqdpx(tmp, loc1.value, loc2.value)
-            self.mc.stxvd2x(tmp, off, r.SP.value)
+        if size == 8:
+            # bit 3 in last argument sets the condition code
+            self.mc.VFCE(resloc, loc0, loc1, 3, 0, 1)
+            self.mc.VNO(resloc, resloc, resloc)
         else:
-            not_implemented("float == for size %d" % size)
-        res = resloc.value
-        self.mc.lvx(res, off, r.SP.value)
-        self.mc.vnor(res, res, res) # complement
+            not_implemented("[zarch/assembler] float != for size %d" % size)
         flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, resloc)
 
     def emit_vec_cast_int_to_float(self, op, arglocs, regalloc):
         resloc, loc0 = arglocs
-        offloc = regalloc.rm.get_scratch_reg()
-        off = offloc.value
-        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        # bit 1 on mask4 -> supresses inexact exception
        self.mc.VCDG(resloc, loc0, 3, 4, m.RND_TOZERO.value)
-        #self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        #self.mc.stvx(l0.value, off, r.SP.value)
-        #self.mc.lxvd2x(res.value, off, r.SP.value)
-        #self.mc.xvcvsxddp(res.value, res.value)
 
     def emit_vec_int_eq(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        res, l0, l1, sizeloc = arglocs
+        resloc, loc0, loc1, sizeloc = arglocs
         size = sizeloc.value
-        if size == 1:
-            self.mc.vcmpequbx(res.value, l0.value, l1.value)
-        elif size == 2:
-            self.mc.vcmpequhx(res.value, l0.value, l1.value)
-        elif size == 4:
-            self.mc.vcmpequwx(res.value, l0.value, l1.value)
-        elif size == 8:
-            self.mc.vcmpequdx(res.value, l0.value, l1.value)
-        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
+        self.mc.VCEQ(resloc, loc0, loc1, l.itemsize_to_mask(size), 1)
+        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, resloc)
 
     def emit_vec_int_ne(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        res, l0, l1, sizeloc = arglocs
+        resloc, loc0, loc1, sizeloc = arglocs
         size = sizeloc.value
-        tmp = regalloc.vrm.get_scratch_reg(type=INT).value
-        self.mc.vxor(tmp, tmp, tmp)
-        if size == 1:
-            self.mc.vcmpequbx(res.value, res.value, tmp)
-        elif size == 2:
-            self.mc.vcmpequhx(res.value, res.value, tmp)
-        elif size == 4:
-            self.mc.vcmpequwx(res.value, res.value, tmp)
-        elif size == 8:
-            self.mc.vcmpequdx(res.value, res.value, tmp)
-        self.mc.vnor(res.value, res.value, res.value)
-        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
+        self.mc.VCEQ(resloc, loc0, loc1, l.itemsize_to_mask(size), 1)
+        self.mc.VNO(resloc, resloc, resloc)
+        flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, res)
 
     def emit_vec_cast_float_to_int(self, op, arglocs, regalloc):
-        res, l0 = arglocs
-        offloc = regalloc.rm.get_scratch_reg()
-        v0 = regalloc.vrm.get_scratch_reg(type=INT)
-        off = offloc.value
-        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        self.mc.xvcvdpsxds(res.value, l0.value)
+        resloc, loc0 = arglocs
+        # 4 => bit 1 from the MSB: XxC
+        self.mc.VCGD(resloc, loc0, 3, 4, mask.RND_TOZERO.value)
 
     def emit_vec_expand_f(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
@@ -320,30 +279,37 @@
 
     def emit_vec_expand_i(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        res, l0, off = arglocs
+        resloc, loc0 = arglocs
         size = op.bytesize
+        self.mc.VLREP(resloc, loc0, l.itemsize_to_mask(size))
-        self.mc.load_imm(r.SCRATCH2, off.value)
-        self.mc.lvx(res.value, r.SCRATCH2.value, r.SP.value)
-        if size == 1:
-            if IS_BIG_ENDIAN:
-                self.mc.vspltb(res.value, res.value, 0b0000)
-            else:
-                self.mc.vspltb(res.value, res.value, 0b1111)
-        elif size == 2:
-            if IS_BIG_ENDIAN:
-                self.mc.vsplth(res.value, res.value, 0b000)
-            else:
-                self.mc.vsplth(res.value, res.value, 0b111)
-        elif size == 4:
-            if IS_BIG_ENDIAN:
-                self.mc.vspltw(res.value, res.value, 0b00)
-            else:
-                self.mc.vspltw(res.value, res.value, 0b11)
-        elif size == 8:
-            pass
+    emit_vec_expand_f = emit_vec_expand_i
+
+    def _accum_reduce(self, op, arg, accumloc, targetloc):
+        # Currently the accumulator can ONLY be 64 bit float/int
+        if arg.type == FLOAT:
+            # r = (r[0]+r[1],r[0]+r[1])
+            self.mc.VMRL(targetloc, accumloc, accumloc)
+            if op == '+':
+                self.mc.VFA(targetloc, targetloc, accumloc)
+                return
+            elif op == '*':
+                self.mc.VFM(targetloc, targetloc, accumloc)
+                return
         else:
-            not_implemented("expand int size not impl")
+            assert arg.type == INT
+            # store the vector onto the stack, just below the stack pointer
+            self.mc.VST(accumloc, l.addr(0, r.SP))
+            self.mc.LG(r.SCRATCH, l.addr(0, r.SP))
+            self.mc.LG(targetloc, l.addr(8, r.SP))
+            if op == '+':
+                self.mc.AGR(targetloc, r.SCRATCH)
+                return
+            elif op == '*':
+                self.mc.MSGR(targetloc, r.SCRATCH)
+                return
+        not_implemented("reduce sum for %s not impl." % arg)
+
     def emit_vec_pack_i(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
@@ -467,6 +433,7 @@
             # r = (v[0], s[1])
             self.mc.xxpermdi(res, vec, src, permi(0,1))
 
+
     def emit_vec_unpack_f(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
         resloc, srcloc, srcidxloc, countloc = arglocs
@@ -486,41 +453,6 @@
             return
         not_implemented("unpack for combination src %d -> res %d" % (srcidx, residx))
 
-    def _accum_reduce(self, op, arg, accumloc, targetloc):
-        # Currently the accumulator can ONLY be the biggest
-        # 64 bit float/int
-        # TODO
-        tgt = targetloc.value
-        acc = accumloc.value
-        if arg.type == FLOAT:
-            # r = (r[0]+r[1],r[0]+r[1])
-            if IS_BIG_ENDIAN:
-                self.mc.xxpermdi(tgt, acc, acc, 0b00)
-            else:
-                self.mc.xxpermdi(tgt, acc, acc, 0b10)
-            if op == '+':
-                self.mc.xsadddp(tgt, tgt, acc)
-            elif op == '*':
-                self.mc.xsmuldp(tgt, tgt, acc)
-            else:
-                not_implemented("sum not implemented")
-            return
-        else:
-            assert arg.type == INT
-            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
-            self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value)
-            self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET)
-            self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
-            if op == '+':
-                self.mc.add(tgt, tgt, acc)
-            elif op == '*':
-                self.mc.mulld(tgt, tgt, acc)
-            else:
-                not_implemented("sum not implemented")
-            return
-
-        not_implemented("reduce sum for %s not impl." % arg)
-
     def emit_vec_f(self, op, arglocs, regalloc):
         pass
     emit_vec_i = emit_vec_f
@@ -729,18 +661,10 @@
         assert isinstance(op, VectorOp)
         arg = op.getarg(0)
         mc = self.assembler.mc
-        if arg.is_constant():
-            assert isinstance(arg, ConstInt)
-            l0 = self.rm.get_scratch_reg()
-            mc.load_imm(l0, arg.value)
-        else:
-            l0 = self.ensure_reg(arg)
-        mc.store(l0.value, r.SP.value, PARAM_SAVE_AREA_OFFSET)
+        l0 = self.ensure_reg_or_pool(arg)
         size = op.bytesize
-        if size == 8:
-            mc.store(l0.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
         res = self.force_allocate_vector_reg(op)
-        return [res, l0, imm(PARAM_SAVE_AREA_OFFSET)]
+        return [res, l0]
 
     def prepare_vec_int_is_true(self, op):
         assert isinstance(op, VectorOp)
diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -273,9 +273,9 @@
     test_vec_xor_short = \
         vec_int_arith(lambda a,b: intmask(a)^intmask(b), rffi.SHORT)
 
-    test_vec_int_eq = \
+    test_vec_int_cmp_eq = \
        vec_int_arith(lambda a,b: a == b, rffi.SIGNED)
-    test_vec_int_ne = \
+    test_vec_int_cmp_ne = \
        vec_int_arith(lambda a,b: a == b, rffi.SIGNED)
 
     @py.test.mark.parametrize('i',[1,2,3,4,9])
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit
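
For readers new to the z/Architecture vector encodings touched above: the build_vrr_b helper added in instruction_builder.py emits a 6-byte VRR-b instruction, packing the low four bits of each vector register number into nibbles and collecting the high bit of registers 16-31 in the RXB field next to the M4 mask. The standalone sketch below mirrors that layout outside of RPython; the function name, the example operands, and the printed byte string are illustrative assumptions only and are not part of the patch.

    # A minimal sketch of the VRR-b byte layout, mirroring build_vrr_b above.
    # All names and operand values here are hypothetical, for illustration only.
    def encode_vrr_b_sketch(opcode1, opcode2, v1, v2, v3, mask4, mask5=0):
        BIT_MASK_4 = 0xF
        # registers 16..31 do not fit in 4 bits; their high bit goes into RXB
        rxb = ((v1 >= 16) << 3) | ((v2 >= 16) << 2) | ((v3 >= 16) << 1)
        return [
            opcode1,                                       # byte 0: opcode, 0xE7 for vector ops
            ((v1 & BIT_MASK_4) << 4) | (v2 & BIT_MASK_4),  # byte 1: low nibbles of V1 and V2
            (v3 & BIT_MASK_4) << 4,                        # byte 2: low nibble of V3, rest reserved
            (mask5 & BIT_MASK_4) << 4,                     # byte 3: M5 (e.g. set-condition-code bit)
            ((mask4 & BIT_MASK_4) << 4) | rxb,             # byte 4: M4 (element size) and RXB
            opcode2,                                       # byte 5: second opcode byte, 0xF8 for VCEQ
        ]

    # Example: VCEQ v18, v1, v20 on 8-byte elements (M4=3), condition code requested (M5=1)
    print(' '.join('%02x' % b for b in
                   encode_vrr_b_sketch(0xE7, 0xF8, 18, 1, 20, mask4=3, mask5=1)))
    # -> e7 21 40 10 3a f8

The same nibble/RXB convention explains why the emitters above can pass plain register locations and a small mask value (e.g. l.itemsize_to_mask(size) for VCEQ) and leave the byte packing entirely to the instruction builder.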