Author: Richard Plangger <[email protected]>
Branch: vecopt-merge
Changeset: r79319:75f6354522df
Date: 2015-08-31 14:33 +0200
http://bitbucket.org/pypy/pypy/changeset/75f6354522df/
Log: resolving some issues introduced by the simpler combination and
separate splitting phase, needs to remove packs that are not full
diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py
b/rpython/jit/metainterp/optimizeopt/schedule.py
--- a/rpython/jit/metainterp/optimizeopt/schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/schedule.py
@@ -240,6 +240,11 @@
def getcount(self):
return self.count
+ def pack_byte_size(self, pack):
+ if len(pack.operations) == 0:
+ return 0
+ return self.getsize() * pack.opcount()
+
PT_GENERIC = PackType(PackType.UNKNOWN_TYPE, -1, False)
PT_FLOAT_2 = PackType(FLOAT, 4, False, 2)
@@ -873,11 +878,43 @@
assert not isinstance(box, BoxVector)
self.box_to_vbox[box] = (off, vector)
+def opcount_filling_vector_register(pack, vec_reg_size):
+ """ how many operations of that kind can one execute
+ with a machine instruction of register size X?
+ """
+ pack_type = pack.input_type
+ if pack_type is None:
+ pack_type = pack.output_type # load operations
+
+ op = pack.leftmost()
+ if op.casts_box():
+ count = pack_type.getcount()
+ return count
+
+ count = vec_reg_size // pack_type.getsize()
+ return count
+
+def maximum_byte_size(pack, vec_reg_size):
+ """ The maximum size in bytes the operation is able to
+ process with the hardware register and the operation
+ semantics.
+ """
+ op = pack.leftmost()
+ if op.casts_box():
+ # casting is special, often only takes a half full vector
+ pack_type = pack.input_type
+ if pack_type is None:
+ pack_type = pack.output_type # load operations
+ return pack_type.byte_size()
+ return vec_reg_size
+
class Pack(object):
""" A pack is a set of n statements that are:
* isomorphic
* independent
"""
+ FULL = 0
+
def __init__(self, ops, input_type, output_type):
self.operations = ops
self.accum = None
@@ -899,30 +936,43 @@
ptype = self.output_type
return ptype
- def pack_byte_size(self):
- return self.pack_type().getsize() * self.opcount()
+ def input_byte_size(self):
+ """ The amount of bytes the operations need with the current
+ entries in self.operations. E.g. cast_singlefloat_to_float
+ takes only #2 operations.
+ """
+ return self._byte_size(self.input_type)
+
+ def output_byte_size(self):
+ """ The amount of bytes the operations need with the current
+ entries in self.operations. E.g. vec_load(..., descr=short)
+ with 10 operations returns 20
+ """
+ return self._byte_size(self.output_type)
+
+ def pack_load(self, vec_reg_size):
+ """ Returns the load of the pack. A value
+ smaller than 0 indicates that it is empty
+ or nearly empty, zero indicates that all slots
+ are used and > 0 indicates that too many operations
+ are in this pack instance.
+ """
+ if len(self.operations) == 0:
+ return -1
+ size = maximum_byte_size(self, vec_reg_size)
+ if self.input_type is None:
+ # e.g. load operations
+ return self.output_type.pack_byte_size(self) - size
+ # default only consider the input type
+ # e.g. store operations, int_add, ...
+ return self.input_type.pack_byte_size(self) - size
+
def is_full(self, vec_reg_size):
""" If one input element times the opcount is equal
to the vector register size, we are full!
"""
- ptype = self.pack_type()
- op = self.leftmost()
- if op.casts_box():
- cur_bytes = ptype.getsize() * self.opcount()
- max_bytes = self.input_type.byte_size()
- assert cur_bytes <= max_bytes
- return cur_bytes == max_bytes
-
- bytes = self.pack_byte_size()
- assert bytes <= vec_reg_size
- if bytes == vec_reg_size:
- return True
- if ptype.getcount() != -1:
- size = ptype.getcount() * ptype.getsize()
- assert bytes <= size
- return bytes == size
- return False
+ return self.pack_load(vec_reg_size) == Pack.FULL
def opnum(self):
assert len(self.operations) > 0
@@ -930,9 +980,8 @@
def clear(self):
for node in self.operations:
- if node.pack is not self:
- node.pack = None
- node.pack_position = -1
+ node.pack = None
+ node.pack_position = -1
def update_pack_of_nodes(self):
for i,node in enumerate(self.operations):
@@ -945,19 +994,28 @@
vector register.
"""
pack = self
- pack_type = self.pack_type()
- max_count = vec_reg_size // pack_type.getsize()
- assert max_count * pack_type.getsize() == vec_reg_size
- while pack.pack_byte_size() > vec_reg_size:
- assert max_count > 0
- newpack = pack.clone()
- oplist = pack.operations[:max_count]
- newpack.operations = pack.operations[max_count:]
+ while pack.pack_load(vec_reg_size) > Pack.FULL:
+ pack.clear()
+ oplist, newoplist = pack.slice_operations(vec_reg_size)
pack.operations = oplist
pack.update_pack_of_nodes()
- newpack.update_pack_of_nodes()
- pack = newpack
- packlist.append(newpack)
+ assert pack.is_full(vec_reg_size)
+ #
+ newpack = pack.clone(newoplist)
+ load = newpack.pack_load(vec_reg_size)
+ if load >= Pack.FULL:
+ pack = newpack
+ packlist.append(newpack)
+ else:
+ newpack.clear()
+
+ def slice_operations(self, vec_reg_size):
+ count = opcount_filling_vector_register(self, vec_reg_size)
+ newoplist = self.operations[count:]
+ oplist = self.operations[:count]
+ assert len(newoplist) + len(oplist) == len(self.operations)
+ assert len(newoplist) != 0
+ return oplist, newoplist
def rightmost_match_leftmost(self, other):
""" Check if pack A can be combined with pack B """
@@ -974,14 +1032,16 @@
return rightmost is leftmost and accum
def __repr__(self):
+ if len(self.operations) == 0:
+ return "Pack(-, [])"
opname = self.operations[0].getoperation().getopname()
return "Pack(%s,%r)" % (opname, self.operations)
def is_accumulating(self):
return self.accum is not None
- def clone(self):
- cloned = Pack(self.operations, self.input_type, self.output_type)
+ def clone(self, oplist):
+ cloned = Pack(oplist, self.input_type, self.output_type)
cloned.accum = self.accum
return cloned
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
@@ -452,3 +452,19 @@
v9[i64|2] = vec_int_and(v4[i64|2], v8[i64|2])
""", False)
self.assert_equal(loop2, loop3)
+
+ def test_split_cast(self):
+ trace = self.parse("""
+ f10 = cast_int_to_float(i1)
+ f11 = cast_int_to_float(i2)
+ f12 = cast_int_to_float(i3)
+ f13 = cast_int_to_float(i4)
+ """)
+ pack = self.pack(trace, 0, 4, I64, F32)
+ packs = []
+ pack.split(packs, 16)
+ packs.append(pack)
+ assert len(packs) == 2
+
+
+
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -1427,5 +1427,51 @@
opt = self.schedule(self.parse_loop(trace))
self.debug_print_operations(opt.loop)
+ def test_arraylen(self):
+ trace = """
+ [i45, i33, p40]
+ # while i < len(l):
+ # LOAD_FAST i
+ # LOAD_GLOBAL len
+ guard_not_invalidated(descr=<Guard0x7f82c00c3518>) [i33,p40]
+ # LOAD_FAST l
+ # CALL_FUNCTION 1
+ # COMPARE_OP <
+ i50 = int_lt(i45, i33)
+ guard_true(i50) [i50,i33,p40]
+ # POP_JUMP_IF_FALSE 70
+ # l[i] = l[i] + 1
+ # LOAD_FAST l
+ # LOAD_FAST i
+ # BINARY_SUBSCR
+ i51 = uint_ge(i45, i33)
+ guard_false(i51) [i50, i45]
+ i52 = getarrayitem_gc(p40, i45, descr=intarraydescr)
+ # LOAD_CONST 1
+ # BINARY_ADD
+ i53 = int_add(i52, 1)
+ #guard_no_overflow(descr=<Guard0x7f82c00c33b8>) []
+ # LOAD_FAST l
+ # LOAD_FAST i
+ # STORE_SUBSCR
+ setarrayitem_gc(p40, i45, i53, descr=intarraydescr)
+ # i += 1
+ # LOAD_FAST i
+ # LOAD_CONST 1
+ # INPLACE_ADD
+ i54 = int_add(i45,1)
+ # STORE_FAST i
+ # JUMP_ABSOLUTE 21
+ #getfield_raw_i(140199654614400, descr=<FieldS
pypysig_long_struct.c_value 0>)
+ #None = i55 < 0
+ #guard(i56 is false)
+ # LOAD_FAST i
+ #i34 = arraylen_gc(p40, descr=<ArrayS 8>)
+ jump(i54, i33, p40)
+ """
+ opt = self.vectorize(self.parse_loop(trace))
+ self.debug_print_operations(opt.loop)
+
+
class TestLLtype(BaseTestVectorize, LLtypeMixin):
pass
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py
b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -133,10 +133,6 @@
return False
-def cmp_pack_lt(a,b):
- return a.left.getindex() < b.left.getindex()
-packsort = listsort.make_timsort_class(lt=cmp_pack_lt)
-
class VectorizingOptimizer(Optimizer):
""" Try to unroll the loop and find instructions to group """
@@ -401,13 +397,6 @@
"""
if len(self.packset.packs) == 0:
raise NotAVectorizeableLoop()
- #packsort(self.packset.packs).sort()
- #if not we_are_translated():
- # # ensure we are really sorted!
- # x = 0
- # for i,pack in enumerate(self.packset.packs):
- # assert x <= pack.left.getindex()
- # x = pack.left.getindex()
i = 0
j = 0
end_ij = len(self.packset.packs)
@@ -423,33 +412,6 @@
continue
pack1 = self.packset.packs[i]
pack2 = self.packset.packs[j]
- # remove intermediate
- left = pack1.operations[0]
- #if left in orphan:
- # # a pack was filled, thus the rhs was put
- # # into the orphan map.
- # if orphan[left] is False:
- # # this pack might be redundant if pack1.right
- # # is the at the left position in another pack
- # assert pack1.opcount() == 2
- # right = pack1.operations[1]
- # orphan[right] = True
- # pack1.clear()
- # del self.packset.packs[i]
- # end_ij -= 1
- # continue
- # else:
- # # left is not an orphan, this pack proves that
- # # there might be more packs
- # del orphan[left]
- # check if the pack is already full
- #if pack1.is_full(self.cpu.vector_register_size):
- # right = pack1.operations[-1]
- # # False indicates that the next pair might not
- # # be needed, because left is already computed
- # # in another set
- # orphan[right] = False
- # break
if pack1.rightmost_match_leftmost(pack2):
end_ij = self.packset.combine(i,j)
else:
@@ -460,11 +422,14 @@
j = 0
if len_before == len(self.packset.packs):
break
+ newpacks = []
+ vec_reg_size = self.cpu.vector_register_size
for pack in self.packset.packs:
- if pack.pack_byte_size() > self.cpu.vector_register_size:
- pack.split(self.packset.packs, self.cpu.vector_register_size)
- else:
- pack.update_pack_of_nodes()
+ if pack.pack_load(vec_reg_size) > Pack.FULL:
+ pack.split(newpacks, vec_reg_size)
+ continue
+ pack.update_pack_of_nodes()
+ self.packset.packs.extend(newpacks)
if not we_are_translated():
# some test cases check the accumulation variables
@@ -483,9 +448,10 @@
if accum:
self.packset.accum_vars[accum.var] = accum.pos
- print " %dx %s (accum? %d) " % (len(pack.operations),
-
pack.operations[0].op.getopname(),
- accum is not None)
+ print " %dx %s " % (len(pack.operations),
+ pack.operations[0].op.getopname())
+ if accum:
+ print " accumulates!"
if fail:
assert False
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit