Author: Richard Plangger <planri...@gmail.com>
Branch: zarch-simd-support
Changeset: r87122:8022d6fb623c
Date: 2016-09-15 12:09 +0200
http://bitbucket.org/pypy/pypy/changeset/8022d6fb623c/

Log: more vec op backend impl. missing, pack/unpack for i/f

diff --git a/rpython/jit/backend/zarch/assembler.py b/rpython/jit/backend/zarch/assembler.py
--- a/rpython/jit/backend/zarch/assembler.py
+++ b/rpython/jit/backend/zarch/assembler.py
@@ -1542,8 +1542,8 @@
         pmc.overwrite()
 
 def notimplemented_op(asm, op, arglocs, regalloc):
+    msg = "[zarch/asm] %s not implemented\n" % op.getopname()
     if we_are_translated():
-        msg = "[ZARCH/asm] %s not implemented\n" % op.getopname()
         llop.debug_print(lltype.Void, msg)
     raise NotImplementedError(msg)
 
diff --git a/rpython/jit/backend/zarch/instruction_builder.py b/rpython/jit/backend/zarch/instruction_builder.py
--- a/rpython/jit/backend/zarch/instruction_builder.py
+++ b/rpython/jit/backend/zarch/instruction_builder.py
@@ -486,6 +486,21 @@
         self.writechar(opcode2)
     return encode_vrr_a
 
+def build_vrr_b(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m'):
+    @builder.arguments(argtypes)
+    def encode_vrr_b(self, v1, v2, v3, mask4, mask5):
+        self.writechar(opcode1)
+        rbx = (v1 >= 16) << 3
+        rbx |= (v2 >= 16) << 2
+        rbx |= (v3 >= 16) << 1
+        byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)
+        self.writechar(chr(byte))
+        self.writechar(chr((v3 & BIT_MASK_4) << 4))
+        self.writechar(chr((mask5 & BIT_MASK_4) << 4))
+        self.writechar(chr((mask4 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
+        self.writechar(opcode2)
+    return encode_vrr_b
+
 def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m,m'):
     @builder.arguments(argtypes)
     def encode_vrr_c(self, v1, v2, v3, mask4=0, mask5=0, mask6=0):
diff --git a/rpython/jit/backend/zarch/instructions.py b/rpython/jit/backend/zarch/instructions.py
--- a/rpython/jit/backend/zarch/instructions.py
+++ b/rpython/jit/backend/zarch/instructions.py
@@ -295,6 +295,7 @@
 
 vector_mnemonic_codes = {
     'VL': ('vrx', ['\xE7','\x06'], 'v,bid'),
+    'VLREP': ('vrx', ['\xE7','\x05']),
     'VLR': ('vrr_a', ['\xE7','\x56'], 'v,v'),
     'VST': ('vrx', ['\xE7','\x0E'], 'v,bid'),
 
@@ -318,6 +319,7 @@
 
     # conversion
     'VCDG': ('vrr_a', ['\xE7','\xC3']),
+    'VCGD': ('vrr_a', ['\xE7','\xC2']),
 
     # compare, sign, ...
     'VFPSO': ('vrr_a', ['\xE7','\xCC']),
@@ -326,6 +328,12 @@
 
     'VPERM': ('vrr_e', ['\xE7','\x8C'], 'v,v,v,v'),
     'VREPI': ('vri_a', ['\xE7','\x45']),
+    'VCEQ': ('vrr_b', ['\xE7','\xF8']),
+
+    # pack, merge, shift, ...
+    'VMRL': ('vrr_c', ['\xE7','\x60'], 'v,v,v,m'),
+    'VMRH': ('vrr_c', ['\xE7','\x61'], 'v,v,v,m'),
+
     # '': ('', ['','']),
 }
diff --git a/rpython/jit/backend/zarch/vector_ext.py b/rpython/jit/backend/zarch/vector_ext.py
--- a/rpython/jit/backend/zarch/vector_ext.py
+++ b/rpython/jit/backend/zarch/vector_ext.py
@@ -220,85 +220,44 @@
         flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, resloc)
 
     def emit_vec_float_xor(self, op, arglocs, regalloc):
-        resloc, l0, l1, sizeloc = arglocs
-        res = resloc.value
-        r0 = l0.value
-        r1 = l1.value
-        self.mc.xxlxor(res, r0, r1)
+        resloc, loc0, loc1, sizeloc = arglocs
+        self.mc.VX(resloc, loc0, loc1)
 
     def emit_vec_float_ne(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        resloc, loc1, loc2, sizeloc = arglocs
+        resloc, loc0, loc1, sizeloc = arglocs
         size = sizeloc.value
-        tmp = regalloc.vrm.get_scratch_reg().value
-        offloc = regalloc.rm.get_scratch_reg()
-        off = offloc.value
-        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        if size == 4:
-            self.mc.xvcmpeqspx(tmp, loc1.value, loc2.value)
-            self.mc.stxvw4x(tmp, off, r.SP.value)
-        elif size == 8:
-            self.mc.xvcmpeqdpx(tmp, loc1.value, loc2.value)
-            self.mc.stxvd2x(tmp, off, r.SP.value)
+        if size == 8:
+            # bit 3 in last argument sets the condition code
+            self.mc.VFCE(resloc, loc0, loc1, 3, 0, 1)
+            self.mc.VNO(resloc, resloc, resloc)
         else:
-            not_implemented("float == for size %d" % size)
-        res = resloc.value
-        self.mc.lvx(res, off, r.SP.value)
-        self.mc.vnor(res, res, res) # complement
+            not_implemented("[zarch/assembler] float != for size %d" % size)
         flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, resloc)
 
     def emit_vec_cast_int_to_float(self, op, arglocs, regalloc):
         resloc, loc0 = arglocs
-        offloc = regalloc.rm.get_scratch_reg()
-        off = offloc.value
-        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        # bit 1 on mask4 -> supresses inexact exception
        self.mc.VCDG(resloc, loc0, 3, 4, m.RND_TOZERO.value)
-        #self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        #self.mc.stvx(l0.value, off, r.SP.value)
-        #self.mc.lxvd2x(res.value, off, r.SP.value)
-        #self.mc.xvcvsxddp(res.value, res.value)
 
     def emit_vec_int_eq(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        res, l0, l1, sizeloc = arglocs
+        resloc, loc0, loc1, sizeloc = arglocs
         size = sizeloc.value
-        if size == 1:
-            self.mc.vcmpequbx(res.value, l0.value, l1.value)
-        elif size == 2:
-            self.mc.vcmpequhx(res.value, l0.value, l1.value)
-        elif size == 4:
-            self.mc.vcmpequwx(res.value, l0.value, l1.value)
-        elif size == 8:
-            self.mc.vcmpequdx(res.value, l0.value, l1.value)
-        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
+        self.mc.VCEQ(resloc, loc0, loc1, l.itemsize_to_mask(size), 1)
+        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, resloc)
 
     def emit_vec_int_ne(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        res, l0, l1, sizeloc = arglocs
+        resloc, loc0, loc1, sizeloc = arglocs
         size = sizeloc.value
-        tmp = regalloc.vrm.get_scratch_reg(type=INT).value
-        self.mc.vxor(tmp, tmp, tmp)
-        if size == 1:
-            self.mc.vcmpequbx(res.value, res.value, tmp)
-        elif size == 2:
-            self.mc.vcmpequhx(res.value, res.value, tmp)
-        elif size == 4:
-            self.mc.vcmpequwx(res.value, res.value, tmp)
-        elif size == 8:
-            self.mc.vcmpequdx(res.value, res.value, tmp)
-        self.mc.vnor(res.value, res.value, res.value)
-        flush_vec_cc(self, regalloc, c.VEQI, op.bytesize, res)
+        self.mc.VCEQ(resloc, loc0, loc1, l.itemsize_to_mask(size), 1)
+        self.mc.VNO(resloc, resloc, resloc)
+        flush_vec_cc(self, regalloc, c.VNEI, op.bytesize, res)
 
     def emit_vec_cast_float_to_int(self, op, arglocs, regalloc):
-        res, l0 = arglocs
-        offloc = regalloc.rm.get_scratch_reg()
-        v0 = regalloc.vrm.get_scratch_reg(type=INT)
-        off = offloc.value
-        # SP is always 16 byte aligned, and PARAM_SAVE_AREA_OFFSET % 16 == 0
-        self.mc.load_imm(offloc, PARAM_SAVE_AREA_OFFSET)
-        self.mc.xvcvdpsxds(res.value, l0.value)
+        resloc, loc0 = arglocs
+        # 4 => bit 1 from the MSB: XxC
+        self.mc.VCGD(resloc, loc0, 3, 4, mask.RND_TOZERO.value)
 
     def emit_vec_expand_f(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
@@ -320,30 +279,37 @@
 
     def emit_vec_expand_i(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
-        res, l0, off = arglocs
+        resloc, loc0 = arglocs
         size = op.bytesize
+        self.mc.VLREP(resloc, loc0, l.itemsize_to_mask(size))
-        self.mc.load_imm(r.SCRATCH2, off.value)
-        self.mc.lvx(res.value, r.SCRATCH2.value, r.SP.value)
-        if size == 1:
-            if IS_BIG_ENDIAN:
-                self.mc.vspltb(res.value, res.value, 0b0000)
-            else:
-                self.mc.vspltb(res.value, res.value, 0b1111)
-        elif size == 2:
-            if IS_BIG_ENDIAN:
-                self.mc.vsplth(res.value, res.value, 0b000)
-            else:
-                self.mc.vsplth(res.value, res.value, 0b111)
-        elif size == 4:
-            if IS_BIG_ENDIAN:
-                self.mc.vspltw(res.value, res.value, 0b00)
-            else:
-                self.mc.vspltw(res.value, res.value, 0b11)
-        elif size == 8:
-            pass
+    emit_vec_expand_f = emit_vec_expand_i
+
+    def _accum_reduce(self, op, arg, accumloc, targetloc):
+        # Currently the accumulator can ONLY be 64 bit float/int
+        if arg.type == FLOAT:
+            # r = (r[0]+r[1],r[0]+r[1])
+            self.mc.VMRL(targetloc, accumloc, accumloc)
+            if op == '+':
+                self.mc.VFA(targetloc, targetloc, accumloc)
+                return
+            elif op == '*':
+                self.mc.VFM(targetloc, targetloc, accumloc)
+                return
         else:
-            not_implemented("expand int size not impl")
+            assert arg.type == INT
+            # store the vector onto the stack, just below the stack pointer
+            self.mc.VST(accumloc, l.addr(0, r.SP))
+            self.mc.LG(r.SCRATCH, l.addr(0, r.SP))
+            self.mc.LG(targetloc, l.addr(8, r.SP))
+            if op == '+':
+                self.mc.AGR(targetloc, r.SCRATCH)
+                return
+            elif op == '*':
+                self.mc.MSGR(targetloc, r.SCRATCH)
+                return
+        not_implemented("reduce sum for %s not impl." % arg)
+
     def emit_vec_pack_i(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
@@ -467,6 +433,7 @@
             # r = (v[0], s[1])
             self.mc.xxpermdi(res, vec, src, permi(0,1))
 
+
     def emit_vec_unpack_f(self, op, arglocs, regalloc):
         assert isinstance(op, VectorOp)
         resloc, srcloc, srcidxloc, countloc = arglocs
@@ -486,41 +453,6 @@
             return
         not_implemented("unpack for combination src %d -> res %d" % (srcidx, residx))
 
-    def _accum_reduce(self, op, arg, accumloc, targetloc):
-        # Currently the accumulator can ONLY be the biggest
-        # 64 bit float/int
-        # TODO
-        tgt = targetloc.value
-        acc = accumloc.value
-        if arg.type == FLOAT:
-            # r = (r[0]+r[1],r[0]+r[1])
-            if IS_BIG_ENDIAN:
-                self.mc.xxpermdi(tgt, acc, acc, 0b00)
-            else:
-                self.mc.xxpermdi(tgt, acc, acc, 0b10)
-            if op == '+':
-                self.mc.xsadddp(tgt, tgt, acc)
-            elif op == '*':
-                self.mc.xsmuldp(tgt, tgt, acc)
-            else:
-                not_implemented("sum not implemented")
-            return
-        else:
-            assert arg.type == INT
-            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
-            self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value)
-            self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET)
-            self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
-            if op == '+':
-                self.mc.add(tgt, tgt, acc)
-            elif op == '*':
-                self.mc.mulld(tgt, tgt, acc)
-            else:
-                not_implemented("sum not implemented")
-            return
-
-        not_implemented("reduce sum for %s not impl." % arg)
-
     def emit_vec_f(self, op, arglocs, regalloc):
         pass
     emit_vec_i = emit_vec_f
@@ -729,18 +661,10 @@
         assert isinstance(op, VectorOp)
         arg = op.getarg(0)
         mc = self.assembler.mc
-        if arg.is_constant():
-            assert isinstance(arg, ConstInt)
-            l0 = self.rm.get_scratch_reg()
-            mc.load_imm(l0, arg.value)
-        else:
-            l0 = self.ensure_reg(arg)
-        mc.store(l0.value, r.SP.value, PARAM_SAVE_AREA_OFFSET)
+        l0 = self.ensure_reg_or_pool(arg)
         size = op.bytesize
-        if size == 8:
-            mc.store(l0.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
         res = self.force_allocate_vector_reg(op)
-        return [res, l0, imm(PARAM_SAVE_AREA_OFFSET)]
+        return [res, l0]
 
     def prepare_vec_int_is_true(self, op):
         assert isinstance(op, VectorOp)
diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -273,9 +273,9 @@
     test_vec_xor_short = \
         vec_int_arith(lambda a,b: intmask(a)^intmask(b), rffi.SHORT)
 
-    test_vec_int_eq = \
+    test_vec_int_cmp_eq = \
        vec_int_arith(lambda a,b: a == b, rffi.SIGNED)
-    test_vec_int_ne = \
+    test_vec_int_cmp_ne = \
        vec_int_arith(lambda a,b: a == b, rffi.SIGNED)
 
     @py.test.mark.parametrize('i',[1,2,3,4,9])
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit
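
For readers new to the z/Architecture vector encodings touched above: the build_vrr_b helper added in instruction_builder.py emits a 6-byte VRR-b instruction, packing the low four bits of each vector register number into nibbles and collecting the high bit of registers 16-31 in the RXB field next to the M4 mask. The standalone sketch below mirrors that layout outside of RPython; the function name, the example operands, and the printed byte string are illustrative assumptions only and are not part of the patch.

    # A minimal sketch of the VRR-b byte layout, mirroring build_vrr_b above.
    # All names and operand values here are hypothetical, for illustration only.
    def encode_vrr_b_sketch(opcode1, opcode2, v1, v2, v3, mask4, mask5=0):
        BIT_MASK_4 = 0xF
        # registers 16..31 do not fit in 4 bits; their high bit goes into RXB
        rxb = ((v1 >= 16) << 3) | ((v2 >= 16) << 2) | ((v3 >= 16) << 1)
        return [
            opcode1,                                       # byte 0: opcode, 0xE7 for vector ops
            ((v1 & BIT_MASK_4) << 4) | (v2 & BIT_MASK_4),  # byte 1: low nibbles of V1 and V2
            (v3 & BIT_MASK_4) << 4,                        # byte 2: low nibble of V3, rest reserved
            (mask5 & BIT_MASK_4) << 4,                     # byte 3: M5 (e.g. set-condition-code bit)
            ((mask4 & BIT_MASK_4) << 4) | rxb,             # byte 4: M4 (element size) and RXB
            opcode2,                                       # byte 5: second opcode byte, 0xF8 for VCEQ
        ]

    # Example: VCEQ v18, v1, v20 on 8-byte elements (M4=3), condition code requested (M5=1)
    print(' '.join('%02x' % b for b in
                   encode_vrr_b_sketch(0xE7, 0xF8, 18, 1, 20, mask4=3, mask5=1)))
    # -> e7 21 40 10 3a f8

The same nibble/RXB convention explains why the emitters above can pass plain register locations and a small mask value (e.g. l.itemsize_to_mask(size) for VCEQ) and leave the byte packing entirely to the instruction builder.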