Author: Richard Plangger <planri...@gmail.com> Branch: ppc-vsx-support Changeset: r85489:89ec178c8b17 Date: 2016-07-01 15:37 +0200 http://bitbucket.org/pypy/pypy/changeset/89ec178c8b17/
Log: finished accum reduce function for f64/i64 + and * (ppc) diff --git a/rpython/jit/backend/ppc/codebuilder.py b/rpython/jit/backend/ppc/codebuilder.py --- a/rpython/jit/backend/ppc/codebuilder.py +++ b/rpython/jit/backend/ppc/codebuilder.py @@ -612,7 +612,7 @@ # mul xvmuldp = XX3(60, XO9=112) xvmulsp = XX3(60, XO9=80) - xsmuldp = XX3(60, XO9=46) + xsmuldp = XX3(60, XO9=48) # div xvdivdp = XX3(60, XO9=102) xvdivsp = XX3(60, XO9=88) diff --git a/rpython/jit/backend/ppc/vector_ext.py b/rpython/jit/backend/ppc/vector_ext.py --- a/rpython/jit/backend/ppc/vector_ext.py +++ b/rpython/jit/backend/ppc/vector_ext.py @@ -333,38 +333,40 @@ if not scalar_loc.is_reg(): scalar_loc = regalloc.force_allocate_reg(scalar_arg) assert scalar_arg is not None - if accum_info.accum_operation == '+': - self._accum_reduce_sum(scalar_arg, vector_loc, scalar_loc) - elif accum_info.accum_operation == '*': - self._accum_reduce_mul(scalar_arg, vector_loc, scalar_loc) - else: - not_implemented("accum operator %s not implemented" % - (accum_info.accum_operation)) + op = accum_info.accum_operation + self._accum_reduce(op, scalar_arg, vector_loc, scalar_loc) accum_info = accum_info.next() - def _accum_reduce_mul(self, arg, accumloc, targetloc): - notimplemented("[ppc reduce mul]") - #scratchloc = X86_64_XMM_SCRATCH_REG - #self.mov(accumloc, scratchloc) - ## swap the two elements - #self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01) - #self.mc.MULSD(accumloc, scratchloc) - #if accumloc is not targetloc: - # self.mov(accumloc, targetloc) - - def _accum_reduce_sum(self, arg, accumloc, targetloc): + def _accum_reduce(self, op, arg, accumloc, targetloc): # Currently the accumulator can ONLY be the biggest # 64 bit float/int tgt = targetloc.value acc = accumloc.value if arg.type == FLOAT: # r = (r[0]+r[1],r[0]+r[1]) - self.mc.xvmr(tgt, acc, acc) if IS_BIG_ENDIAN: self.mc.xxspltd(tgt, acc, acc, 0b00) else: - self.mc.xxspltd(tgt, acc, acc, 0b01) - self.mc.xsadddp(tgt, tgt, acc) + self.mc.xxspltd(tgt, acc, acc, 0b10) + if op == '+': + self.mc.xsadddp(tgt, tgt, acc) + elif op == '*': + self.mc.xsmuldp(tgt, tgt, acc) + else: + not_implemented("sum not implemented") + return + else: + assert arg.type == INT + self.mc.load_imm(r.SCRATCH2, PARAM_SAVE_AREA_OFFSET) + self.mc.stvx(acc, r.SCRATCH2.value, r.SP.value) + self.mc.load(tgt, r.SP.value, PARAM_SAVE_AREA_OFFSET) + self.mc.load(r.SCRATCH.value, r.SP.value, PARAM_SAVE_AREA_OFFSET+8) + if op == '+': + self.mc.add(tgt, tgt, acc) + elif op == '*': + self.mc.mul(tgt, tgt, acc) + else: + not_implemented("sum not implemented") return not_implemented("reduce sum for %s not impl." % arg) @@ -514,59 +516,37 @@ else: notimplemented("[expand int size not impl]") - #def genop_vec_pack_i(self, op, arglocs, regalloc): - # resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs - # assert isinstance(resultloc, RegLoc) - # assert isinstance(sourceloc, RegLoc) - # size = sizeloc.value - # srcidx = srcidxloc.value - # residx = residxloc.value - # count = countloc.value - # # for small data type conversion this can be quite costy - # # NOTE there might be some combinations that can be handled - # # more efficiently! e.g. - # # v2 = pack(v0,v1,4,4) - # si = srcidx - # ri = residx - # k = count - # while k > 0: - # if size == 8: - # if resultloc.is_xmm and sourceloc.is_xmm: # both xmm - # self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si) - # self.mc.PINSRQ_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri) - # elif resultloc.is_xmm: # xmm <- reg - # self.mc.PINSRQ_xri(resultloc.value, sourceloc.value, ri) - # else: # reg <- xmm - # self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si) - # elif size == 4: - # if resultloc.is_xmm and sourceloc.is_xmm: - # self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si) - # self.mc.PINSRD_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri) - # elif resultloc.is_xmm: - # self.mc.PINSRD_xri(resultloc.value, sourceloc.value, ri) - # else: - # self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si) - # elif size == 2: - # if resultloc.is_xmm and sourceloc.is_xmm: - # self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si) - # self.mc.PINSRW_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri) - # elif resultloc.is_xmm: - # self.mc.PINSRW_xri(resultloc.value, sourceloc.value, ri) - # else: - # self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si) - # elif size == 1: - # if resultloc.is_xmm and sourceloc.is_xmm: - # self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si) - # self.mc.PINSRB_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri) - # elif resultloc.is_xmm: - # self.mc.PINSRB_xri(resultloc.value, sourceloc.value, ri) - # else: - # self.mc.PEXTRB_rxi(resultloc.value, sourceloc.value, si) - # si += 1 - # ri += 1 - # k -= 1 + def emit_vec_pack_i(self, op, arglocs, regalloc): + resultloc, vloc, sourceloc, residxloc, srcidxloc, countloc = arglocs + srcidx = srcidxloc.value + residx = residxloc.value + count = countloc.value + # for small data type conversion this can be quite costy + # NOTE there might be some combinations that can be handled + # more efficiently! e.g. + # v2 = pack(v0,v1,4,4) + res = resultloc.value + vector = vloc.value + src = sourceloc.value + size = op.bytesize + if size == 8: + if resultloc.is_vector_reg() and sourceloc.is_vector_reg(): # both vector + notimplemented("[ppc/vec_pack_i]") + elif resultloc.is_vector_reg(): # vector <- reg + self.mc.load_imm(r.SCRATCH, PARAM_SAVE_AREA_OFFSET) + self.mc.stvx(vector, r.SCRATCH2.value, r.SP.value) + self.mc.store(src, r.SP.value, PARAM_SAVE_AREA_OFFSET+8*residx) + self.mc.lvx(res, r.SCRATCH2.value, r.SP.value) + else: + notimplemented("[ppc/vec_pack_i]") + elif size == 4: + notimplemented("[ppc/vec_pack_i]") + elif size == 2: + notimplemented("[ppc/vec_pack_i]") + elif size == 1: + notimplemented("[ppc/vec_pack_i]") - #genop_vec_unpack_i = genop_vec_pack_i + # TODO emit_vec_unpack_i = emit_vec_pack_i def emit_vec_pack_f(self, op, arglocs, resultloc): resloc, vloc, srcloc, residxloc, srcidxloc, countloc = arglocs @@ -755,11 +735,12 @@ count = op.getarg(3) assert isinstance(index, ConstInt) assert isinstance(count, ConstInt) - srcloc = self.ensure_vector_reg(arg) + vloc = self.ensure_vector_reg(op.getarg(0)) + srcloc = self.ensure_reg(arg) resloc = self.force_allocate_vector_reg(op) residx = index.value # where to put it in result? srcidx = 0 - return [resloc, srcloc, imm(residx), imm(srcidx), imm(count.value)] + return [resloc, vloc, srcloc, imm(residx), imm(srcidx), imm(count.value)] def prepare_vec_pack_f(self, op): # new_res = vec_pack_i(res, src, index, count) @@ -769,7 +750,7 @@ count = op.getarg(3) assert isinstance(index, ConstInt) assert isinstance(count, ConstInt) - assert not arg.is_vector() + assert not arg.is_vector_reg() srcloc = self.ensure_reg(arg) vloc = self.ensure_vector_reg(op.getarg(0)) resloc = self.force_allocate_vector_reg(op) diff --git a/rpython/jit/metainterp/test/test_vector.py b/rpython/jit/metainterp/test/test_vector.py --- a/rpython/jit/metainterp/test/test_vector.py +++ b/rpython/jit/metainterp/test/test_vector.py @@ -379,23 +379,30 @@ res = self.meta_interp(f, [count]) assert res == f(count) == breaks - def test_sum(self): + @py.test.mark.parametrize('type,func,cast', + [(rffi.DOUBLE, lambda a,b: a+b, float), + (rffi.DOUBLE, lambda a,b: a*b, float), + (lltype.Signed, lambda a,b: a+b, int), + (lltype.Signed, lambda a,b: a*b, int), + ]) + def test_reduce(self, type, func, cast): + func = always_inline(func) myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True) - T = lltype.Array(rffi.DOUBLE, hints={'nolength': True}) + T = lltype.Array(type, hints={'nolength': True}) def f(d): va = lltype.malloc(T, d, flavor='raw', zero=True) for j in range(d): - va[j] = float(j) + va[j] = cast(j+1) i = 0 accum = 0 while i < d: myjitdriver.jit_merge_point() - accum += va[i] + accum = func(accum,va[i]) i += 1 lltype.free(va, flavor='raw') return accum res = self.meta_interp(f, [60]) - assert res == f(60) == sum(range(60)) + assert isclose(res, f(60)) def test_constant_expand(self): myjitdriver = JitDriver(greens = [], reds = 'auto', vectorize=True) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit