[pypy-commit] pypy ppc-vsx-support: finished accum reduce function for f64/i64 + and * (ppc)

plan_rich Fri, 01 Jul 2016 06:39:01 -0700

Author: Richard Plangger <planri...@gmail.com>
Branch: ppc-vsx-support
Changeset: r85489:89ec178c8b17
Date: 2016-07-01 15:37 +0200
http://bitbucket.org/pypy/pypy/changeset/89ec178c8b17/


Log:    finished accum reduce function for f64/i64 + and * (ppc)

diff --git a/rpython/jit/backend/ppc/codebuilder.py b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -612,7 +612,7 @@
     # mul
     xvmuldp = XX3(60, XO9=112)
     xvmulsp = XX3(60, XO9=80)
-    xsmuldp = XX3(60, XO9=46)
+    xsmuldp = XX3(60, XO9=48)
     # div
     xvdivdp = XX3(60, XO9=102)
     xvdivsp = XX3(60, XO9=88)
diff --git a/rpython/jit/backend/ppc/vector_ext.py b/rpython/jit/backend/ppc/vector_ext.py
--- a/rpython/jit/backend/ppc/vector_ext.py
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -333,38 +333,40 @@
             if not scalar_loc.is_reg():
                 scalar_loc = regalloc.force_allocate_reg(scalar_arg)
             assert scalar_arg is not None
-            if accum_info.accum_operation == '+':
-                self._accum_reduce_sum(scalar_arg, vector_loc, scalar_loc)
-            elif accum_info.accum_operation == '*':
-                self._accum_reduce_mul(scalar_arg, vector_loc, scalar_loc)
-            else:
-                not_implemented("accum operator %s not implemented" %
-                                            (accum_info.accum_operation)) 
+            op = accum_info.accum_operation
+            self._accum_reduce(op, scalar_arg, vector_loc, scalar_loc)
             accum_info = accum_info.next()
 
-    def _accum_reduce_mul(self, arg, accumloc, targetloc):
-        notimplemented("[ppc reduce mul]")
-        #scratchloc = X86_64_XMM_SCRATCH_REG
-        #self.mov(accumloc, scratchloc)
-        ## swap the two elements
-        #self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01)
-        #self.mc.MULSD(accumloc, scratchloc)
-        #if accumloc is not targetloc:
-        #    self.mov(accumloc, targetloc)
-
-    def _accum_reduce_sum(self, arg, accumloc, targetloc):
+    def _accum_reduce(self, op, arg, accumloc, targetloc):
         # Currently the accumulator can ONLY be the biggest
         # 64 bit float/int
         tgt = targetloc.value
         acc = accumloc.value
         if arg.type == FLOAT:
             # r = (r[0]+r[1],r[0]+r[1])
-            self.mc.xvmr(tgt, acc, acc)
             if IS_BIG_ENDIAN:
                 self.mc.xxspltd(tgt, acc, acc, 0b00)
             else:
-                self.mc.xxspltd(tgt, acc, acc, 0b01)
-            self.mc.xsadddp(tgt, tgt, acc)
+                self.mc.xxspltd(tgt, acc, acc, 0b10)
+            if op == '+':
+                self.mc.xsadddp(tgt, tgt, acc)
+            elif op == '*':
+                self.mc.xsmuldp(tgt, tgt, acc)
+            else:
+                not_implemented("sum not implemented")
+            return
+        else:
+            assert arg.type == INT
+            self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET)
+            self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value)
+            self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET)
+            self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8)
+            if op == '+':
+                self.mc.add(tgt, tgt, acc)
+            elif op == '*':
+                self.mc.mul(tgt, tgt, acc)
+            else:
+                not_implemented("sum not implemented")
             return
 
         not_implemented("reduce sum for %s not impl." % arg)
@@ -514,59 +516,37 @@
         else:
             notimplemented("[expand int size not impl]")
 
-    #def genop_vec_pack_i(self, op, arglocs, regalloc):
-    #    resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = 
arglocs
-    #    assert isinstance(resultloc, RegLoc)
-    #    assert isinstance(sourceloc, RegLoc)
-    #    size = sizeloc.value
-    #    srcidx = srcidxloc.value
-    #    residx = residxloc.value
-    #    count = countloc.value
-    #    # for small data type conversion this can be quite costy
-    #    # NOTE there might be some combinations that can be handled
-    #    # more efficiently! e.g.
-    #    # v2 = pack(v0,v1,4,4)
-    #    si = srcidx
-    #    ri = residx
-    #    k = count
-    #    while k > 0:
-    #        if size == 8:
-    #            if resultloc.is_xmm and sourceloc.is_xmm: # both xmm
-    #                self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
-    #                self.mc.PINSRQ_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
-    #            elif resultloc.is_xmm: # xmm <- reg
-    #                self.mc.PINSRQ_xri(resultloc.value, sourceloc.value, ri)
-    #            else: # reg <- xmm
-    #                self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si)
-    #        elif size == 4:
-    #            if resultloc.is_xmm and sourceloc.is_xmm:
-    #                self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
-    #                self.mc.PINSRD_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
-    #            elif resultloc.is_xmm:
-    #                self.mc.PINSRD_xri(resultloc.value, sourceloc.value, ri)
-    #            else:
-    #                self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si)
-    #        elif size == 2:
-    #            if resultloc.is_xmm and sourceloc.is_xmm:
-    #                self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
-    #                self.mc.PINSRW_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
-    #            elif resultloc.is_xmm:
-    #                self.mc.PINSRW_xri(resultloc.value, sourceloc.value, ri)
-    #            else:
-    #                self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si)
-    #        elif size == 1:
-    #            if resultloc.is_xmm and sourceloc.is_xmm:
-    #                self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
-    #                self.mc.PINSRB_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
-    #            elif resultloc.is_xmm:
-    #                self.mc.PINSRB_xri(resultloc.value, sourceloc.value, ri)
-    #            else:
-    #                self.mc.PEXTRB_rxi(resultloc.value, sourceloc.value, si)
-    #        si += 1
-    #        ri += 1
-    #        k -= 1
+    def emit_vec_pack_i(self, op, arglocs, regalloc):
+        resultloc, vloc, sourceloc, residxloc, srcidxloc, countloc = arglocs
+        srcidx = srcidxloc.value
+        residx = residxloc.value
+        count = countloc.value
+        # for small data type conversion this can be quite costy
+        # NOTE there might be some combinations that can be handled
+        # more efficiently! e.g.
+        # v2 = pack(v0,v1,4,4)
+        res = resultloc.value
+        vector = vloc.value
+        src = sourceloc.value
+        size = op.bytesize
+        if size == 8:
+            if resultloc.is_vector_reg() and sourceloc.is_vector_reg(): # both vector
+                notimplemented("[ppc/vec_pack_i]")
+            elif resultloc.is_vector_reg(): # vector <- reg
+                self.mc.load_imm(r.SCRATCH, PARAM_SAVE_AREA_OFFSET)
+                self.mc.stvx(vector, r.SCRATCH2.value, r.SP.value)
+                self.mc.store(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+8*residx)
+                self.mc.lvx(res, r.SCRATCH2.value, r.SP.value)
+            else:
+                notimplemented("[ppc/vec_pack_i]")
+        elif size == 4:
+            notimplemented("[ppc/vec_pack_i]")
+        elif size == 2:
+            notimplemented("[ppc/vec_pack_i]")
+        elif size == 1:
+            notimplemented("[ppc/vec_pack_i]")
 
-    #genop_vec_unpack_i = genop_vec_pack_i
+    # TODO emit_vec_unpack_i = emit_vec_pack_i
 
     def emit_vec_pack_f(self, op, arglocs, resultloc):
         resloc, vloc, srcloc, residxloc, srcidxloc, countloc = arglocs
@@ -755,11 +735,12 @@
         count = op.getarg(3)
         assert isinstance(index, ConstInt)
         assert isinstance(count, ConstInt)
-        srcloc = self.ensure_vector_reg(arg)
+        vloc = self.ensure_vector_reg(op.getarg(0))
+        srcloc = self.ensure_reg(arg)
         resloc = self.force_allocate_vector_reg(op)
         residx = index.value # where to put it in result?
         srcidx = 0
-        return [resloc, srcloc, imm(residx), imm(srcidx), imm(count.value)]
+        return [resloc, vloc, srcloc, imm(residx), imm(srcidx), imm(count.value)]
 
     def prepare_vec_pack_f(self, op):
         # new_res = vec_pack_i(res, src, index, count)
@@ -769,7 +750,7 @@
         count = op.getarg(3)
         assert isinstance(index, ConstInt)
         assert isinstance(count, ConstInt)
-        assert not arg.is_vector()
+        assert not arg.is_vector_reg()
         srcloc = self.ensure_reg(arg)
         vloc = self.ensure_vector_reg(op.getarg(0))
         resloc = self.force_allocate_vector_reg(op)
diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -379,23 +379,30 @@
         res = self.meta_interp(f, [count])
         assert res == f(count) == breaks
 
-    def test_sum(self):
+    @py.test.mark.parametrize('type,func,cast',
+            [(rffi.DOUBLE, lambda a,b: a+b, float),
+             (rffi.DOUBLE, lambda a,b: a*b, float),
+             (lltype.Signed, lambda a,b: a+b, int),
+             (lltype.Signed, lambda a,b: a*b, int),
+            ])
+    def test_reduce(self, type, func, cast):
+        func = always_inline(func)
         myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True)
-        T = lltype.Array(rffi.DOUBLE, hints={'nolength': True})
+        T = lltype.Array(type, hints={'nolength': True})
         def f(d):
             va = lltype.malloc(T, d, flavor='raw', zero=True)
             for j in range(d):
-                va[j] = float(j)
+                va[j] = cast(j+1)
             i = 0
             accum = 0
             while i < d:
                 myjitdriver.jit_merge_point()
-                accum += va[i]
+                accum = func(accum,va[i])
                 i += 1
             lltype.free(va, flavor='raw')
             return accum
         res = self.meta_interp(f, [60])
-        assert res == f(60) == sum(range(60))
+        assert isclose(res, f(60))
 
     def test_constant_expand(self):
         myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True)
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy ppc-vsx-support: finished accum reduce function for f64/i64 + and * (ppc)

Reply via email to