Author: Richard Plangger <r...@pasra.at> Branch: vecopt Changeset: r78376:ac77811327eb Date: 2015-07-01 12:35 +0200 http://bitbucket.org/pypy/pypy/changeset/ac77811327eb/
Log: correctly emit reduction operation in a guard exit that compiles a bridge (was missing before) added prod(...) as accumulator diff --git a/pypy/module/micronumpy/compile.py b/pypy/module/micronumpy/compile.py --- a/pypy/module/micronumpy/compile.py +++ b/pypy/module/micronumpy/compile.py @@ -37,19 +37,10 @@ class BadToken(Exception): pass -class FakeArguments(W_Root): - def __init__(self, args_w, kw_w): - self.args_w = args_w - self.kw_w = kw_w - - def unpack(self): - return self.args_w, self.kw_w - - SINGLE_ARG_FUNCTIONS = ["sum", "prod", "max", "min", "all", "any", "unegative", "flat", "tostring", "count_nonzero", "argsort", "cumsum", "logical_xor_reduce"] -TWO_ARG_FUNCTIONS = ["dot", 'multiply', 'take', 'searchsorted'] +TWO_ARG_FUNCTIONS = ["dot", 'take', 'searchsorted', 'multiply'] TWO_ARG_FUNCTIONS_OR_NONE = ['view', 'astype', 'reshape'] THREE_ARG_FUNCTIONS = ['where'] @@ -787,7 +778,7 @@ raise ArgumentNotAnArray if self.name == "dot": w_res = arr.descr_dot(interp.space, arg) - if self.name == "multiply": + elif self.name == 'multiply': w_res = arr.descr_mul(interp.space, arg) elif self.name == 'take': w_res = arr.descr_take(interp.space, arg) @@ -808,7 +799,7 @@ if self.name == "where": w_res = where(interp.space, arr, arg1, arg2) else: - assert False + assert False # unreachable code elif self.name in TWO_ARG_FUNCTIONS_OR_NONE: if len(self.args) != 2: raise ArgumentMismatch @@ -822,7 +813,7 @@ assert isinstance(w_arg, ArrayConstant) w_res = arr.reshape(interp.space, w_arg.wrap(interp.space)) else: - assert False, "missing two arg impl for: %s" % (self.name,) + assert False else: raise WrongFunctionName if isinstance(w_res, W_NDimArray): diff --git a/pypy/module/micronumpy/loop.py b/pypy/module/micronumpy/loop.py --- a/pypy/module/micronumpy/loop.py +++ b/pypy/module/micronumpy/loop.py @@ -421,8 +421,8 @@ lval = left_impl.getitem(i1).convert_to(space, dtype) rval = right_impl.getitem(i2).convert_to(space, dtype) oval = dtype.itemtype.add(oval, dtype.itemtype.mul(lval, rval)) - i1 += s1 - i2 += s2 + i1 += jit.promote(s1) + i2 += jit.promote(s2) outi.setitem(outs, oval) outs = outi.next(outs) rights = righti.next(rights) diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py --- a/pypy/module/micronumpy/test/test_zjit.py +++ b/pypy/module/micronumpy/test/test_zjit.py @@ -118,15 +118,18 @@ retval = self.interp.eval_graph(self.graph, [i]) return retval - def define_matrix_dot(): + def define_dot_matrix(): return """ mat = |16| m = reshape(mat, [4,4]) + vec = [0,1,2,3] + a = dot(m, vec) + a -> 3 """ - def test_matrix_dot(self): - result = self.run("matrix_dot") - assert int(result) == 45 + def test_dot_matrix(self): + result = self.run("dot_matrix") + assert int(result) == 86 self.check_vectorized(1, 1) def define_float32_copy(): @@ -523,6 +526,7 @@ expected *= i * 2 assert result == expected self.check_trace_count(1) + self.check_vectorized(1, 1) def define_max(): return """ @@ -534,7 +538,7 @@ def test_max(self): result = self.run("max") assert result == 128 - self.check_vectorized(1, 0) # TODO reduce + self.check_vectorized(1, 0) def define_min(): return """ @@ -546,7 +550,7 @@ def test_min(self): result = self.run("min") assert result == -128 - self.check_vectorized(1, 0) # TODO reduce + self.check_vectorized(1, 0) def define_any(): return """ @@ -820,8 +824,8 @@ def test_dot(self): result = self.run("dot") assert result == 184 - self.check_trace_count(3) - self.check_vectorized(3,0) + self.check_trace_count(5) + self.check_vectorized(3,1) def define_argsort(): return """ diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py --- a/rpython/jit/backend/llgraph/runner.py +++ b/rpython/jit/backend/llgraph/runner.py @@ -880,9 +880,6 @@ if isinstance(box, BoxVectorAccum): if box.operator == '+': value = sum(value) - elif box.operator == '-': - def sub(acc, x): return acc - x - value = reduce(sub, value, 0) elif box.operator == '*': def prod(acc, x): return acc * x value = reduce(prod, value, 1) diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py --- a/rpython/jit/backend/x86/assembler.py +++ b/rpython/jit/backend/x86/assembler.py @@ -559,6 +559,8 @@ self.current_clt.allgcrefs, self.current_clt.frame_info) self._check_frame_depth(self.mc, regalloc.get_gcmap()) + #import pdb; pdb.set_trace() + self._accum_update_at_exit(arglocs, inputargs, faildescr, regalloc) frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, operations) codeendpos = self.mc.get_relative_pos() self.write_pending_failure_recoveries(regalloc) @@ -1865,7 +1867,7 @@ startpos = self.mc.get_relative_pos() # self._accum_update_at_exit(guardtok.fail_locs, guardtok.failargs, - regalloc) + guardtok.faildescr, regalloc) # fail_descr, target = self.store_info_on_descr(startpos, guardtok) self.mc.PUSH(imm(fail_descr)) @@ -2529,67 +2531,60 @@ # vector operations # ________________________________________ - def _accum_update_at_exit(self, fail_locs, fail_args, regalloc): + def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc): """ If accumulation is done in this loop, at the guard exit some vector registers must be adjusted to yield the correct value""" assert regalloc is not None - for i,arg in enumerate(fail_args): - if arg is None: - continue + accum_info = faildescr.rd_accum_list + while accum_info: + pos = accum_info.position + loc = fail_locs[pos] + assert isinstance(loc, RegLoc) + arg = fail_args[pos] if isinstance(arg, BoxVectorAccum): - assert arg.scalar_var is not None - loc = fail_locs[i] - assert isinstance(loc, RegLoc) - assert loc.is_xmm - tgtloc = regalloc.force_allocate_reg(arg.scalar_var, fail_args) - assert tgtloc is not None - if arg.operator == '+': - # reduction using plus - self._accum_reduce_sum(arg, loc, tgtloc) - fail_locs[i] = tgtloc - regalloc.possibly_free_var(arg) - fail_args[i] = arg.scalar_var - else: - raise NotImplementedError("accum operator %s not implemented" % - (arg.operator)) + arg = arg.scalar_var + assert arg is not None + tgtloc = regalloc.force_allocate_reg(arg, fail_args) + if accum_info.operation == '+': + # reduction using plus + self._accum_reduce_sum(arg, loc, tgtloc) + elif accum_info.operation == '*': + self._accum_reduce_mul(arg, loc, tgtloc) + else: + import pdb; pdb.set_trace() + not_implemented("accum operator %s not implemented" % + (accum_info.operation)) + fail_locs[pos] = tgtloc + regalloc.possibly_free_var(arg) + accum_info = accum_info.prev - def _accum_reduce_sum(self, vector_var, accumloc, targetloc): - assert isinstance(vector_var, BoxVectorAccum) - # - type = vector_var.gettype() - size = vector_var.getsize() - if type == FLOAT: - if size == 8: - # r = (r[0]+r[1],r[0]+r[1]) - self.mc.HADDPD(accumloc, accumloc) - # upper bits (> 64) are dirty (but does not matter) - if accumloc is not targetloc: - self.mov(targetloc, accumloc) - return - if size == 4: - # r = (r[0]+r[1],r[2]+r[3],r[0]+r[1],r[2]+r[3]) - self.mc.HADDPS(accumloc, accumloc) - self.mc.HADDPS(accumloc, accumloc) - # invoking it a second time will gather the whole sum - # at the first element position - # the upper bits (>32) are dirty (but does not matter) - if accumloc is not targetloc: - self.mov(targetloc, accumloc) - return - elif type == INT: + def _accum_reduce_mul(self, arg, accumloc, targetloc): + scratchloc = X86_64_SCRATCH_REG + self.mc.mov(scratchloc, accumloc) + # swap the two elements + self.mc.SHUFPS_xxi(scratchloc.value, scratchloc.value, 0x01) + self.mc.MULPD(accumloc, scratchloc) + if accumloc is not targetloc: + self.mc.mov(targetloc, accumloc) + + def _accum_reduce_sum(self, arg, accumloc, targetloc): + # Currently the accumulator can ONLY be the biggest + # size for X86 -> 64 bit float/int + if arg.type == FLOAT: + # r = (r[0]+r[1],r[0]+r[1]) + self.mc.HADDPD(accumloc, accumloc) + # upper bits (> 64) are dirty (but does not matter) + if accumloc is not targetloc: + self.mov(targetloc, accumloc) + return + elif arg.type == INT: scratchloc = X86_64_SCRATCH_REG - if size == 8: - self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0) - self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1) - self.mc.ADD(targetloc, scratchloc) - return - if size == 4: - self.mc.PHADDD(accumloc, accumloc) - self.mc.PHADDD(accumloc, accumloc) - self.mc.PEXTRD_rxi(targetloc.value, accumloc.value, 0) - return + self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0) + self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1) + self.mc.ADD(targetloc, scratchloc) + return - raise NotImplementedError("reduce sum for %s not impl." % vector_var) + not_implemented("reduce sum for %s not impl." % arg) def genop_vec_getarrayitem_raw(self, op, arglocs, resloc): # considers item scale (raw_load does not) @@ -2655,7 +2650,7 @@ # There is no 64x64 bit packed mul and I did not find one # for 8 bit either. It is questionable if it gives any benefit # for 8 bit. - raise NotImplementedError("") + not_implemented("int8/64 mul") def genop_vec_int_add(self, op, arglocs, resloc): loc0, loc1, size_loc = arglocs @@ -2757,7 +2752,7 @@ # the speedup might only be modest... # the optimization does not emit such code! msg = "vec int signext (%d->%d)" % (size, tosize) - raise NotImplementedError(msg) + not_implemented(msg) def genop_vec_float_expand(self, op, arglocs, resloc): srcloc, sizeloc = arglocs diff --git a/rpython/jit/metainterp/compile.py b/rpython/jit/metainterp/compile.py --- a/rpython/jit/metainterp/compile.py +++ b/rpython/jit/metainterp/compile.py @@ -488,7 +488,8 @@ class ResumeGuardDescr(ResumeDescr): _attrs_ = ('rd_numb', 'rd_count', 'rd_consts', 'rd_virtuals', - 'rd_frame_info_list', 'rd_pendingfields', 'status') + 'rd_frame_info_list', 'rd_pendingfields', 'rd_accum_list', + 'status') rd_numb = lltype.nullptr(NUMBERING) rd_count = 0 @@ -496,6 +497,7 @@ rd_virtuals = None rd_frame_info_list = None rd_pendingfields = lltype.nullptr(PENDINGFIELDSP.TO) + rd_accum_list = None status = r_uint(0) @@ -507,6 +509,7 @@ self.rd_pendingfields = other.rd_pendingfields self.rd_virtuals = other.rd_virtuals self.rd_numb = other.rd_numb + self.rd_accum_list = other.rd_accum_list # we don't copy status ST_BUSY_FLAG = 0x01 # if set, busy tracing from the guard diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py --- a/rpython/jit/metainterp/optimizeopt/schedule.py +++ b/rpython/jit/metainterp/optimizeopt/schedule.py @@ -416,6 +416,7 @@ if vbox.gettype() == INT: return self.extend_int(vbox, newtype) else: + import pdb; pdb.set_trace() raise NotImplementedError("cannot yet extend float") def extend_int(self, vbox, newtype): @@ -856,8 +857,9 @@ class Accum(object): PLUS = '+' + MULTIPLY = '*' - def __init__(self, var=None, pos=-1, operator=PLUS): + def __init__(self, var, pos, operator): self.var = var self.pos = pos self.operator = operator diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -1369,6 +1369,22 @@ i32 = int_ge(i30, i25) guard_false(i32, descr=<Guard0x7f9f03ab17d0>) [p0, i29, i30, i31, p19, None, None, None] jump(p0, p19, i30, i31, i29, i8, i25, descr=TargetToken(140320937897104)) + + """ + trace =""" + [i0, i1, i16, i17, i18, i5, p6, p7, p8, f19, p10, p11, p12, p13, p14, p15, i20, i21] + guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr object at 0x7f249eb2e510>) [i5, i18, i17, i16, i1, i0, p15, p14, p13, p12, p11, p10, p8, p7, p6, f19] + f22 = raw_load(i20, i18, descr=floatarraydescr) + guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated object at 0x7f249eb2ec90>) [i5, i18, i17, i16, i1, i0, p15, p14, p13, p12, p11, p10, p8, p7, p6, f22, f19] + f23 = raw_load(i21, i17, descr=floatarraydescr) + f24 = float_mul(f22, f23) + f25 = float_add(f19, f24) + i26 = int_add(i18, 8) + i27 = int_add(i17, 8) + i28 = int_lt(i16, i5) + guard_true(i28, descr=<rpython.jit.metainterp.compile.ResumeGuardTrueDescr object at 0x7f249eb99290>) [i5, i26, i27, i16, i1, i0, p15, p14, p13, p12, p11, p10, p8, p7, p6, f25, None] + i31 = int_add(i16, 1) + jump(i0, i1, i31, i27, i26, i5, p6, p7, p8, f25, p10, p11, p12, p13, p14, p15, i20, i21) """ # schedule 885 -> ptype is non for raw_load? opt = self.vectorize(self.parse_loop(trace)) diff --git a/rpython/jit/metainterp/optimizeopt/util.py b/rpython/jit/metainterp/optimizeopt/util.py --- a/rpython/jit/metainterp/optimizeopt/util.py +++ b/rpython/jit/metainterp/optimizeopt/util.py @@ -8,7 +8,7 @@ from rpython.rlib.objectmodel import we_are_translated from rpython.jit.metainterp import resoperation from rpython.jit.metainterp.resoperation import rop -from rpython.jit.metainterp.resume import Snapshot +from rpython.jit.metainterp.resume import Snapshot, AccumInfo # ____________________________________________________________ # Misc. utilities @@ -213,6 +213,8 @@ return True def rename_failargs(self, guard, clone=False): + from rpython.jit.metainterp.history import BoxVectorAccum + from rpython.jit.metainterp.compile import ResumeGuardDescr if guard.getfailargs() is not None: if clone: args = guard.getfailargs()[:] @@ -220,6 +222,11 @@ args = guard.getfailargs() for i,arg in enumerate(args): value = self.rename_map.get(arg,arg) + if value is not arg and isinstance(value, BoxVectorAccum): + descr = guard.getdescr() + assert isinstance(descr,ResumeGuardDescr) + ai = AccumInfo(descr.rd_accum_list, i, value.operator) + descr.rd_accum_list = ai args[i] = value return args return None diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py --- a/rpython/jit/metainterp/optimizeopt/vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/vectorize.py @@ -767,7 +767,7 @@ lop = lnode.getoperation() opnum = lop.getopnum() - if opnum in (rop.FLOAT_ADD, rop.INT_ADD): + if opnum in (rop.FLOAT_ADD, rop.INT_ADD, rop.FLOAT_MUL): roper = rnode.getoperation() assert lop.numargs() == 2 and lop.result is not None accum_var, accum_pos = self.getaccumulator_variable(lop, roper, origin_pack) @@ -802,7 +802,10 @@ # of leading/preceding signext/floatcast instructions needs to be # considered. => tree pattern matching problem. return None - accum = Accum(accum_var, accum_pos, Accum.PLUS) + operator = Accum.PLUS + if opnum == rop.FLOAT_ADD: + operator = Accum.MULTIPLY + accum = Accum(accum_var, accum_pos, operator) return AccumPair(lnode, rnode, ptype, ptype, accum) return None @@ -824,14 +827,22 @@ # create a new vector box for the parameters box = pack.input_type.new_vector_box() size = vec_reg_size // pack.input_type.getsize() - op = ResOperation(rop.VEC_BOX, [ConstInt(size)], box) - sched_data.invariant_oplist.append(op) - result = box.clonebox() - # clear the box to zero TODO might not be zero for every reduction? - op = ResOperation(rop.VEC_INT_XOR, [box, box], result) - sched_data.invariant_oplist.append(op) - box = result - result = BoxVectorAccum(box, accum.var, '+') + # reset the box to zeros or ones + if accum.operator == Accum.PLUS: + op = ResOperation(rop.VEC_BOX, [ConstInt(size)], box) + sched_data.invariant_oplist.append(op) + result = box.clonebox() + op = ResOperation(rop.VEC_INT_XOR, [box, box], result) + sched_data.invariant_oplist.append(op) + box = result + elif accum.operator == Accum.MULTIPLY: + # multiply is only supported by floats + op = ResOperation(rop.VEC_FLOAT_EXPAND, [ConstInt(1)], box) + sched_data.invariant_oplist.append(op) + else: + import pdb; pdb.set_trace() + raise NotImplementedError + result = BoxVectorAccum(box, accum.var, accum.operator) # pack the scalar value op = ResOperation(getpackopnum(box.gettype()), [box, accum.var, ConstInt(0), ConstInt(1)], result) diff --git a/rpython/jit/metainterp/resume.py b/rpython/jit/metainterp/resume.py --- a/rpython/jit/metainterp/resume.py +++ b/rpython/jit/metainterp/resume.py @@ -34,6 +34,13 @@ self.jitcode = jitcode self.pc = pc +class AccumInfo(object): + __slots__ = ('prev', 'position', 'operation') + def __init__(self, prev, position, operation): + self.prev = prev + self.operation = operation + self.position = position + def _ensure_parent_resumedata(framestack, n): target = framestack[n] if n == 0: _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit