[pypy-commit] pypy vecopt: correctly emit reduction operation in a guard exit that compiles a bridge (was missing before)

plan_rich Wed, 01 Jul 2015 03:37:10 -0700

Author: Richard Plangger <[email protected]>
Branch: vecopt
Changeset: r78376:ac77811327eb
Date: 2015-07-01 12:35 +0200
http://bitbucket.org/pypy/pypy/changeset/ac77811327eb/


Log:    Correctly emit the reduction operation in a guard exit that compiles a
        bridge (was missing before). Added prod(...) as accumulator.

diff --git a/pypy/module/micronumpy/compile.py b/pypy/module/micronumpy/compile.py
--- a/pypy/module/micronumpy/compile.py
+++ b/pypy/module/micronumpy/compile.py
@@ -37,19 +37,10 @@
 class BadToken(Exception):
     pass
 
-class FakeArguments(W_Root):
-    def __init__(self, args_w, kw_w):
-        self.args_w = args_w
-        self.kw_w = kw_w
-
-    def unpack(self):
-        return self.args_w, self.kw_w
-
-
 SINGLE_ARG_FUNCTIONS = ["sum", "prod", "max", "min", "all", "any",
                         "unegative", "flat", "tostring", "count_nonzero",
                         "argsort", "cumsum", "logical_xor_reduce"]
-TWO_ARG_FUNCTIONS = ["dot", 'multiply', 'take', 'searchsorted']
+TWO_ARG_FUNCTIONS = ["dot", 'take', 'searchsorted', 'multiply']
 TWO_ARG_FUNCTIONS_OR_NONE = ['view', 'astype', 'reshape']
 THREE_ARG_FUNCTIONS = ['where']
 
@@ -787,7 +778,7 @@
                 raise ArgumentNotAnArray
             if self.name == "dot":
                 w_res = arr.descr_dot(interp.space, arg)
-            if self.name == "multiply":
+            elif self.name == 'multiply':
                 w_res = arr.descr_mul(interp.space, arg)
             elif self.name == 'take':
                 w_res = arr.descr_take(interp.space, arg)
@@ -808,7 +799,7 @@
             if self.name == "where":
                 w_res = where(interp.space, arr, arg1, arg2)
             else:
-                assert False
+                assert False # unreachable code
         elif self.name in TWO_ARG_FUNCTIONS_OR_NONE:
             if len(self.args) != 2:
                 raise ArgumentMismatch
@@ -822,7 +813,7 @@
                 assert isinstance(w_arg, ArrayConstant)
                 w_res = arr.reshape(interp.space, w_arg.wrap(interp.space))
             else:
-                assert False, "missing two arg impl for: %s" % (self.name,)
+                assert False
         else:
             raise WrongFunctionName
         if isinstance(w_res, W_NDimArray):
diff --git a/pypy/module/micronumpy/loop.py b/pypy/module/micronumpy/loop.py
--- a/pypy/module/micronumpy/loop.py
+++ b/pypy/module/micronumpy/loop.py
@@ -421,8 +421,8 @@
                 lval = left_impl.getitem(i1).convert_to(space, dtype)
                 rval = right_impl.getitem(i2).convert_to(space, dtype)
                 oval = dtype.itemtype.add(oval, dtype.itemtype.mul(lval, rval))
-                i1 += s1
-                i2 += s2
+                i1 += jit.promote(s1)
+                i2 += jit.promote(s2)
             outi.setitem(outs, oval)
             outs = outi.next(outs)
             rights = righti.next(rights)
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -118,15 +118,18 @@
         retval = self.interp.eval_graph(self.graph, [i])
         return retval
 
-    def define_matrix_dot():
+    def define_dot_matrix():
         return """
         mat = |16|
         m = reshape(mat, [4,4])
+        vec = [0,1,2,3]
+        a = dot(m, vec)
+        a -> 3
         """
 
-    def test_matrix_dot(self):
-        result = self.run("matrix_dot")
-        assert int(result) == 45
+    def test_dot_matrix(self):
+        result = self.run("dot_matrix")
+        assert int(result) == 86
         self.check_vectorized(1, 1)
 
     def define_float32_copy():
@@ -523,6 +526,7 @@
             expected *= i * 2
         assert result == expected
         self.check_trace_count(1)
+        self.check_vectorized(1, 1)
 
     def define_max():
         return """
@@ -534,7 +538,7 @@
     def test_max(self):
         result = self.run("max")
         assert result == 128
-        self.check_vectorized(1, 0) # TODO reduce
+        self.check_vectorized(1, 0)
 
     def define_min():
         return """
@@ -546,7 +550,7 @@
     def test_min(self):
         result = self.run("min")
         assert result == -128
-        self.check_vectorized(1, 0) # TODO reduce
+        self.check_vectorized(1, 0)
 
     def define_any():
         return """
@@ -820,8 +824,8 @@
     def test_dot(self):
         result = self.run("dot")
         assert result == 184
-        self.check_trace_count(3)
-        self.check_vectorized(3,0)
+        self.check_trace_count(5)
+        self.check_vectorized(3,1)
 
     def define_argsort():
         return """
diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -880,9 +880,6 @@
             if isinstance(box, BoxVectorAccum):
                 if box.operator == '+':
                     value = sum(value)
-                elif box.operator == '-':
-                    def sub(acc, x): return acc - x
-                    value = reduce(sub, value, 0)
                 elif box.operator == '*':
                     def prod(acc, x): return acc * x
                     value = reduce(prod, value, 1)
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -559,6 +559,8 @@
                                              self.current_clt.allgcrefs,
                                              self.current_clt.frame_info)
         self._check_frame_depth(self.mc, regalloc.get_gcmap())
+        #import pdb; pdb.set_trace()
+        self._accum_update_at_exit(arglocs, inputargs, faildescr, regalloc)
         frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, operations)
         codeendpos = self.mc.get_relative_pos()
         self.write_pending_failure_recoveries(regalloc)
@@ -1865,7 +1867,7 @@
         startpos = self.mc.get_relative_pos()
         #
         self._accum_update_at_exit(guardtok.fail_locs, guardtok.failargs,
-                                   regalloc)
+                                   guardtok.faildescr, regalloc)
         #
         fail_descr, target = self.store_info_on_descr(startpos, guardtok)
         self.mc.PUSH(imm(fail_descr))
@@ -2529,67 +2531,60 @@
     # vector operations
     # ________________________________________
 
-    def _accum_update_at_exit(self, fail_locs, fail_args, regalloc):
+    def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
         """ If accumulation is done in this loop, at the guard exit
         some vector registers must be adjusted to yield the correct value"""
         assert regalloc is not None
-        for i,arg in enumerate(fail_args):
-            if arg is None:
-                continue
+        accum_info = faildescr.rd_accum_list
+        while accum_info:
+            pos = accum_info.position
+            loc = fail_locs[pos]
+            assert isinstance(loc, RegLoc)
+            arg = fail_args[pos]
             if isinstance(arg, BoxVectorAccum):
-                assert arg.scalar_var is not None
-                loc = fail_locs[i]
-                assert isinstance(loc, RegLoc)
-                assert loc.is_xmm
-                tgtloc = regalloc.force_allocate_reg(arg.scalar_var, fail_args)
-                assert tgtloc is not None
-                if arg.operator == '+':
-                    # reduction using plus
-                    self._accum_reduce_sum(arg, loc, tgtloc)
-                    fail_locs[i] = tgtloc
-                    regalloc.possibly_free_var(arg)
-                    fail_args[i] = arg.scalar_var
-                else:
-                    raise NotImplementedError("accum operator %s not implemented" %
-                                                (arg.operator)) 
+                arg = arg.scalar_var
+            assert arg is not None
+            tgtloc = regalloc.force_allocate_reg(arg, fail_args)
+            if accum_info.operation == '+':
+                # reduction using plus
+                self._accum_reduce_sum(arg, loc, tgtloc)
+            elif accum_info.operation == '*':
+                self._accum_reduce_mul(arg, loc, tgtloc)
+            else:
+                import pdb; pdb.set_trace()
+                not_implemented("accum operator %s not implemented" %
+                                            (accum_info.operation)) 
+            fail_locs[pos] = tgtloc
+            regalloc.possibly_free_var(arg)
+            accum_info = accum_info.prev
 
-    def _accum_reduce_sum(self, vector_var, accumloc, targetloc):
-        assert isinstance(vector_var, BoxVectorAccum)
-        #
-        type = vector_var.gettype()
-        size = vector_var.getsize()
-        if type == FLOAT:
-            if size == 8:
-                # r = (r[0]+r[1],r[0]+r[1])
-                self.mc.HADDPD(accumloc, accumloc)
-                # upper bits (> 64) are dirty (but does not matter)
-                if accumloc is not targetloc:
-                    self.mov(targetloc, accumloc)
-                return
-            if size == 4:
-                # r = (r[0]+r[1],r[2]+r[3],r[0]+r[1],r[2]+r[3])
-                self.mc.HADDPS(accumloc, accumloc)
-                self.mc.HADDPS(accumloc, accumloc)
-                # invoking it a second time will gather the whole sum
-                # at the first element position
-                # the upper bits (>32) are dirty (but does not matter)
-                if accumloc is not targetloc:
-                    self.mov(targetloc, accumloc)
-                return
-        elif type == INT:
+    def _accum_reduce_mul(self, arg, accumloc, targetloc):
+        scratchloc = X86_64_SCRATCH_REG
+        self.mc.mov(scratchloc, accumloc)
+        # swap the two elements
+        self.mc.SHUFPS_xxi(scratchloc.value, scratchloc.value, 0x01)
+        self.mc.MULPD(accumloc, scratchloc)
+        if accumloc is not targetloc:
+            self.mc.mov(targetloc, accumloc)
+
+    def _accum_reduce_sum(self, arg, accumloc, targetloc):
+        # Currently the accumulator can ONLY be the biggest
+        # size for X86 -> 64 bit float/int
+        if arg.type == FLOAT:
+            # r = (r[0]+r[1],r[0]+r[1])
+            self.mc.HADDPD(accumloc, accumloc)
+            # upper bits (> 64) are dirty (but does not matter)
+            if accumloc is not targetloc:
+                self.mov(targetloc, accumloc)
+            return
+        elif arg.type == INT:
             scratchloc = X86_64_SCRATCH_REG
-            if size == 8:
-                self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
-                self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
-                self.mc.ADD(targetloc, scratchloc)
-                return
-            if size == 4:
-                self.mc.PHADDD(accumloc, accumloc)
-                self.mc.PHADDD(accumloc, accumloc)
-                self.mc.PEXTRD_rxi(targetloc.value, accumloc.value, 0)
-                return
+            self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
+            self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
+            self.mc.ADD(targetloc, scratchloc)
+            return
 
-        raise NotImplementedError("reduce sum for %s not impl." % vector_var)
+        not_implemented("reduce sum for %s not impl." % arg)
 
     def genop_vec_getarrayitem_raw(self, op, arglocs, resloc):
         # considers item scale (raw_load does not)
@@ -2655,7 +2650,7 @@
             # There is no 64x64 bit packed mul and I did not find one
             # for 8 bit either. It is questionable if it gives any benefit
             # for 8 bit.
-            raise NotImplementedError("")
+            not_implemented("int8/64 mul")
 
     def genop_vec_int_add(self, op, arglocs, resloc):
         loc0, loc1, size_loc = arglocs
@@ -2757,7 +2752,7 @@
             # the speedup might only be modest...
             # the optimization does not emit such code!
             msg = "vec int signext (%d->%d)" % (size, tosize)
-            raise NotImplementedError(msg)
+            not_implemented(msg)
 
     def genop_vec_float_expand(self, op, arglocs, resloc):
         srcloc, sizeloc = arglocs
diff --git a/rpython/jit/metainterp/compile.py b/rpython/jit/metainterp/compile.py
--- a/rpython/jit/metainterp/compile.py
+++ b/rpython/jit/metainterp/compile.py
@@ -488,7 +488,8 @@
 
 class ResumeGuardDescr(ResumeDescr):
     _attrs_ = ('rd_numb', 'rd_count', 'rd_consts', 'rd_virtuals',
-               'rd_frame_info_list', 'rd_pendingfields', 'status')
+               'rd_frame_info_list', 'rd_pendingfields', 'rd_accum_list',
+               'status')
     
     rd_numb = lltype.nullptr(NUMBERING)
     rd_count = 0
@@ -496,6 +497,7 @@
     rd_virtuals = None
     rd_frame_info_list = None
     rd_pendingfields = lltype.nullptr(PENDINGFIELDSP.TO)
+    rd_accum_list = None
 
     status = r_uint(0)
 
@@ -507,6 +509,7 @@
         self.rd_pendingfields = other.rd_pendingfields
         self.rd_virtuals = other.rd_virtuals
         self.rd_numb = other.rd_numb
+        self.rd_accum_list = other.rd_accum_list
         # we don't copy status
 
     ST_BUSY_FLAG    = 0x01     # if set, busy tracing from the guard
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -416,6 +416,7 @@
         if vbox.gettype() == INT:
             return self.extend_int(vbox, newtype)
         else:
+            import pdb; pdb.set_trace()
             raise NotImplementedError("cannot yet extend float")
 
     def extend_int(self, vbox, newtype):
@@ -856,8 +857,9 @@
 
 class Accum(object):
     PLUS = '+'
+    MULTIPLY = '*'
 
-    def __init__(self, var=None, pos=-1, operator=PLUS):
+    def __init__(self, var, pos, operator):
         self.var = var
         self.pos = pos
         self.operator = operator
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1369,6 +1369,22 @@
         i32 = int_ge(i30, i25)
         guard_false(i32, descr=<Guard0x7f9f03ab17d0>) [p0, i29, i30, i31, p19, None, None, None]
         jump(p0, p19, i30, i31, i29, i8, i25, descr=TargetToken(140320937897104))
+
+        """
+        trace ="""
+        [i0, i1, i16, i17, i18, i5, p6, p7, p8, f19, p10, p11, p12, p13, p14, p15, i20, i21]
+        guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr object at 0x7f249eb2e510>) [i5, i18, i17, i16, i1, i0, p15, p14, p13, p12, p11, p10, p8, p7, p6, f19]
+        f22 = raw_load(i20, i18, descr=floatarraydescr)
+        guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated object at 0x7f249eb2ec90>) [i5, i18, i17, i16, i1, i0, p15, p14, p13, p12, p11, p10, p8, p7, p6, f22, f19]
+        f23 = raw_load(i21, i17, descr=floatarraydescr)
+        f24 = float_mul(f22, f23)
+        f25 = float_add(f19, f24)
+        i26 = int_add(i18, 8)
+        i27 = int_add(i17, 8)
+        i28 = int_lt(i16, i5)
+        guard_true(i28, descr=<rpython.jit.metainterp.compile.ResumeGuardTrueDescr object at 0x7f249eb99290>) [i5, i26, i27, i16, i1, i0, p15, p14, p13, p12, p11, p10, p8, p7, p6, f25, None]
+        i31 = int_add(i16, 1)
+        jump(i0, i1, i31, i27, i26, i5, p6, p7, p8, f25, p10, p11, p12, p13, p14, p15, i20, i21)
         """
         # schedule 885 -> ptype is non for raw_load?
         opt = self.vectorize(self.parse_loop(trace))
diff --git a/rpython/jit/metainterp/optimizeopt/util.py b/rpython/jit/metainterp/optimizeopt/util.py
--- a/rpython/jit/metainterp/optimizeopt/util.py
+++ b/rpython/jit/metainterp/optimizeopt/util.py
@@ -8,7 +8,7 @@
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.jit.metainterp import resoperation
 from rpython.jit.metainterp.resoperation import rop
-from rpython.jit.metainterp.resume import Snapshot
+from rpython.jit.metainterp.resume import Snapshot, AccumInfo
 
 # ____________________________________________________________
 # Misc. utilities
@@ -213,6 +213,8 @@
         return True
 
     def rename_failargs(self, guard, clone=False):
+        from rpython.jit.metainterp.history import BoxVectorAccum
+        from rpython.jit.metainterp.compile import ResumeGuardDescr
         if guard.getfailargs() is not None:
             if clone:
                 args = guard.getfailargs()[:]
@@ -220,6 +222,11 @@
                 args = guard.getfailargs()
             for i,arg in enumerate(args):
                 value = self.rename_map.get(arg,arg)
+                if value is not arg and isinstance(value, BoxVectorAccum):
+                    descr = guard.getdescr()
+                    assert isinstance(descr,ResumeGuardDescr)
+                    ai = AccumInfo(descr.rd_accum_list, i, value.operator)
+                    descr.rd_accum_list = ai
                 args[i] = value
             return args
         return None
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -767,7 +767,7 @@
         lop = lnode.getoperation()
         opnum = lop.getopnum()
 
-        if opnum in (rop.FLOAT_ADD, rop.INT_ADD):
+        if opnum in (rop.FLOAT_ADD, rop.INT_ADD, rop.FLOAT_MUL):
             roper = rnode.getoperation()
             assert lop.numargs() == 2 and lop.result is not None
             accum_var, accum_pos = self.getaccumulator_variable(lop, roper, origin_pack)
@@ -802,7 +802,10 @@
                 # of leading/preceding signext/floatcast instructions needs to be
                 # considered. => tree pattern matching problem.
                 return None
-            accum = Accum(accum_var, accum_pos, Accum.PLUS)
+            operator = Accum.PLUS
+            if opnum == rop.FLOAT_ADD:
+                operator = Accum.MULTIPLY
+            accum = Accum(accum_var, accum_pos, operator)
             return AccumPair(lnode, rnode, ptype, ptype, accum)
 
         return None
@@ -824,14 +827,22 @@
             # create a new vector box for the parameters
             box = pack.input_type.new_vector_box()
             size = vec_reg_size // pack.input_type.getsize()
-            op = ResOperation(rop.VEC_BOX, [ConstInt(size)], box)
-            sched_data.invariant_oplist.append(op)
-            result = box.clonebox()
-            # clear the box to zero TODO might not be zero for every reduction?
-            op = ResOperation(rop.VEC_INT_XOR, [box, box], result)
-            sched_data.invariant_oplist.append(op)
-            box = result
-            result = BoxVectorAccum(box, accum.var, '+')
+            # reset the box to zeros or ones
+            if accum.operator == Accum.PLUS:
+                op = ResOperation(rop.VEC_BOX, [ConstInt(size)], box)
+                sched_data.invariant_oplist.append(op)
+                result = box.clonebox()
+                op = ResOperation(rop.VEC_INT_XOR, [box, box], result)
+                sched_data.invariant_oplist.append(op)
+                box = result
+            elif accum.operator == Accum.MULTIPLY:
+                # multiply is only supported by floats
+                op = ResOperation(rop.VEC_FLOAT_EXPAND, [ConstInt(1)], box)
+                sched_data.invariant_oplist.append(op)
+            else:
+                import pdb; pdb.set_trace()
+                raise NotImplementedError
+            result = BoxVectorAccum(box, accum.var, accum.operator)
             # pack the scalar value
             op = ResOperation(getpackopnum(box.gettype()),
                               [box, accum.var, ConstInt(0), ConstInt(1)], result)
diff --git a/rpython/jit/metainterp/resume.py b/rpython/jit/metainterp/resume.py
--- a/rpython/jit/metainterp/resume.py
+++ b/rpython/jit/metainterp/resume.py
@@ -34,6 +34,13 @@
         self.jitcode = jitcode
         self.pc = pc
 
+class AccumInfo(object):
+    __slots__ = ('prev', 'position', 'operation')
+    def __init__(self, prev, position, operation):
+        self.prev = prev
+        self.operation = operation
+        self.position = position
+
 def _ensure_parent_resumedata(framestack, n):
     target = framestack[n]
     if n == 0:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy vecopt: correctly emit reduction operation in a guard exit that compiles a bridge (was missing before)

Reply via email to