Author: Richard Plangger <r...@pasra.at> Branch: vecopt Changeset: r78318:8afb499c0584 Date: 2015-06-26 09:56 +0200 http://bitbucket.org/pypy/pypy/changeset/8afb499c0584/
Log: don't follow wrong dependency chains, excluded fail args; only store is not allowed to compute operations if the vector is not fully packed diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py --- a/rpython/jit/backend/x86/assembler.py +++ b/rpython/jit/backend/x86/assembler.py @@ -2722,7 +2722,9 @@ def genop_vec_int_expand(self, op, arglocs, resloc): srcloc, sizeloc = arglocs - assert isinstance(srcloc, RegLoc) + if not isinstance(srcloc, RegLoc): + self.mov(X86_64_SCRATCH_REG, srcloc) + srcloc = X86_64_SCRATCH_REG assert not srcloc.is_xmm size = sizeloc.value if size == 1: diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py --- a/rpython/jit/metainterp/optimizeopt/schedule.py +++ b/rpython/jit/metainterp/optimizeopt/schedule.py @@ -269,8 +269,10 @@ left = len(pack.operations) assert stride > 0 while off < len(pack.operations): - if left < stride: - self.preamble_ops.append(pack.operations[off].getoperation()) + print left, "<", stride + if stride == 1: + op = pack.operations[off].getoperation() + self.preamble_ops.append(op) off += 1 continue ops = pack.operations[off:off+stride] @@ -294,9 +296,6 @@ if bytes > vec_reg_size: # too many bytes. 
does not fit into the vector register return vec_reg_size // self.getscalarsize() - if bytes < vec_reg_size: - # not enough to fill the vector register - return 1 return pack.opcount() def getscalarsize(self): @@ -316,12 +315,16 @@ if isinstance(arg, BoxVector): continue if self.is_vector_arg(i): - args[i] = self.transform_argument(args[i], i, off) + args[i] = self.transform_argument(args[i], i, off, stride) # result = op.result result = self.transform_result(result, off) # vop = ResOperation(op.vector, args, result, op.getdescr()) + if op.is_guard(): + assert isinstance(op, GuardResOp) + vop.setfailargs(op.getfailargs()) + vop.rd_snapshot = op.rd_snapshot self.preamble_ops.append(vop) def transform_result(self, result, off): @@ -342,7 +345,7 @@ signed = self.output_type.signed return BoxVector(type, count, size, signed) - def transform_argument(self, arg, argidx, off): + def transform_argument(self, arg, argidx, off, stride): ops = self.pack.operations box_pos, vbox = self.sched_data.getvector_of_box(arg) if not vbox: @@ -359,7 +362,8 @@ packed = vbox.item_count assert packed >= 0 assert packable >= 0 - if packed < packable: + vboxes = self.vector_boxes_for_args(argidx) + if len(vboxes) > 1: # packed < packable and packed < stride: # the argument is scattered along different vector boxes args = [op.getoperation().getarg(argidx) for op in ops] vbox = self._pack(vbox, packed, args, packable) @@ -379,8 +383,20 @@ vbox = self.unpack(vbox, args, off, len(ops), self.input_type) self.update_input_output(self.pack) # + assert vbox is not None return vbox + def vector_boxes_for_args(self, index): + args = [op.getoperation().getarg(index) for op in self.pack.operations] + vboxes = [] + last_vbox = None + for arg in args: + pos, vbox = self.sched_data.getvector_of_box(arg) + if vbox != last_vbox and vbox is not None: + vboxes.append(vbox) + return vboxes + + def extend(self, vbox, newtype): assert vbox.gettype() == newtype.gettype() if vbox.gettype() == INT: @@ -443,6 +459,7 
@@ self.sched_data.setvector_of_box(arg, j, new_box) tgt_box = new_box _, vbox = self.sched_data.getvector_of_box(args[0]) + assert vbox is not None return vbox def _check_vec_pack(self, op): @@ -589,6 +606,11 @@ return BoxVector(type, count, size, signed) class StoreToVectorStore(OpToVectorOp): + """ + Storing operations are special because they are not allowed + to store to memory if the vector is not fully filled. + Thus a modified split_pack function + """ def __init__(self): OpToVectorOp.__init__(self, (None, None, PT_GENERIC), None) self.has_descr = True @@ -599,6 +621,20 @@ def determine_output_type(self, op): return None + def split_pack(self, pack, vec_reg_size): + """ Returns how many items of the pack should be + emitted as vector operation. """ + bytes = pack.opcount() * self.getscalarsize() + if bytes > vec_reg_size: + # too many bytes. does not fit into the vector register + return vec_reg_size // self.getscalarsize() + if bytes < vec_reg_size: + # special case for store, even though load is allowed + # to load more, store is not! + # not enough to fill the vector register + return 1 + return pack.opcount() + class PassThroughOp(OpToVectorOp): """ This pass through is only applicable if the target operation is capable of handling vector operations. 
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py --- a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py @@ -79,6 +79,16 @@ for op in vsd.as_vector_operation(pack, renamer): ops.append(op) loop.operations = ops + metainterp_sd = FakeMetaInterpStaticData(self.cpu) + jitdriver_sd = FakeJitDriverStaticData() + opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, 0) + opt.clear_newoperations() + for op in ops: + opt.unpack_from_vector(op, vsd, renamer) + opt.emit_operation(op) + ops = opt._newoperations + loop.operations = ops + if prepend_invariant: loop.operations = vsd.invariant_oplist + ops return loop @@ -100,8 +110,7 @@ loop2 = self.schedule(loop1, [pack1]) loop3 = self.parse(""" v10[i32|4] = vec_raw_load(p0, i0, 4, descr=float) - i14 = raw_load(p0, i4, descr=float) - i15 = raw_load(p0, i5, descr=float) + v11[i32|2] = vec_raw_load(p0, i4, 2, descr=float) """, False) self.assert_equal(loop2, loop3) @@ -109,12 +118,15 @@ loop1 = self.parse(""" i10 = raw_load(p0, i0, descr=long) i11 = raw_load(p0, i1, descr=long) - f10 = cast_int_to_float(i10) - f11 = cast_int_to_float(i11) + i12 = int_signext(i10, 4) + i13 = int_signext(i11, 4) + f10 = cast_int_to_float(i12) + f11 = cast_int_to_float(i13) """) pack1 = self.pack(loop1, 0, 2) pack2 = self.pack(loop1, 2, 4) - loop2 = self.schedule(loop1, [pack1, pack2]) + pack3 = self.pack(loop1, 4, 6) + loop2 = self.schedule(loop1, [pack1, pack2, pack3]) loop3 = self.parse(""" v10[i64|2] = vec_raw_load(p0, i0, 2, descr=long) v20[i32|2] = vec_int_signext(v10[i64|2], 4) @@ -321,3 +333,54 @@ guard_true(v11[i64|2]) [] """, False) self.assert_equal(loop2, loop3) + + + def test_split_load_store(self): + loop1 = self.parse(""" + i10 = raw_load(p0, i1, descr=float) + i11 = raw_load(p0, i2, descr=float) + raw_store(p0, i3, i10, descr=float) + raw_store(p0, i4, i11, descr=float) + """) + 
pack1 = self.pack(loop1, 0, 2) + pack2 = self.pack(loop1, 2, 4) + loop2 = self.schedule(loop1, [pack1,pack2], prepend_invariant=True) + loop3 = self.parse(""" + v1[ui32|2] = vec_raw_load(p0, i1, 2, descr=float) + i10 = vec_int_unpack(v1[ui32|2], 0, 1) + raw_store(p0, i3, i10, descr=float) + i11 = vec_int_unpack(v1[ui32|2], 1, 1) + raw_store(p0, i4, i11, descr=float) + """, False) + # unfortunate ui32 is the type for float32... the unsigned u is for + # the tests + self.assert_equal(loop2, loop3) + + def test_split_arith(self): + loop1 = self.parse(""" + i10 = int_and(255, i1) + i11 = int_and(255, i1) + """) + pack1 = self.pack(loop1, 0, 2) + loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) + loop3 = self.parse(""" + v1[i64|2] = vec_int_expand(255) + v2[i64|2] = vec_int_expand(i1) + v3[i64|2] = vec_int_and(v1[i64|2], v2[i64|2]) + """, False) + self.assert_equal(loop2, loop3) + + def test_split_arith(self): + loop1 = self.parse(""" + i10 = int_and(255, i1) + i11 = int_and(255, i1) + """) + pack1 = self.pack(loop1, 0, 2) + loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) + loop3 = self.parse(""" + v1[i64|2] = vec_int_expand(255) + v2[i64|2] = vec_int_expand(i1) + v3[i64|2] = vec_int_and(v1[i64|2], v2[i64|2]) + """, False) + self.assert_equal(loop2, loop3) + diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py --- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py @@ -1397,21 +1397,18 @@ jump(p0, p1, p5, p6, p7, p17, p19, i68, i39, i44, i49, i51) """ trace=""" - [p3, i4, p1, i5, i6, i7] - guard_early_exit(descr=<ResumeAtLoopHeaderDescr object at 0x7f3afe4fb830>) [p1, i5, i4, p3] - i8 = raw_load(i6, i5, descr=intarraydescr) - guard_not_invalidated(descr=<ResumeGuardNotInvalidated object at 0x7f3afe4fb888>) [p1, i8, i5, i4, p3] - i10 = int_and(i8, 255) - guard_false(i10, 
descr=<ResumeGuardFalseDescr object at 0x7f3afe4fb8e0>) [p1, i5, i4, p3] - i13 = getarrayitem_raw(139891327308826, 2, descr=chararraydescr) - guard_value(i13, 1, descr=<ResumeGuardValueDescr object at 0x7f3afe4fb938>) [i13, p1, i5, i4, p3] - i17 = getarrayitem_raw(139891327308824, 1, descr=chararraydescr) - i19 = int_add(i4, 1) - i21 = int_add(i5, 8) - i22 = int_ge(i19, i7) - guard_false(i22, descr=<ResumeGuardFalseDescr object at 0x7f3afe4fb990>) [i17, p1, i21, i19, None, None, p3] - guard_value(i17, 2, descr=<ResumeGuardValueDescr object at 0x7f3afe4fb9e8>) [i17, p1, i21, i19, None, None, p3] - jump(p3, i19, p1, i21, i6, i7) + [p0, p3, i4, i5, i6, i7] + guard_early_exit(descr=<rpython.jit.metainterp.compile.ResumeAtLoopHeaderDescr object at 0x7f492da84250>) [p0, p3, i4, i5] + f8 = raw_load(i6, i5, descr=floatarraydescr) + guard_not_invalidated(descr=<rpython.jit.metainterp.compile.ResumeGuardNotInvalidated object at 0x7f492da846d0>) [p0, f8, p3, i4, i5] + i9 = cast_float_to_int(f8) + i11 = int_and(i9, 255) + guard_true(i11, descr=<rpython.jit.metainterp.compile.ResumeGuardTrueDescr object at 0x7f492da8b790>) [p0, p3, i4, i5] + i13 = int_add(i4, 1) + i15 = int_add(i5, 8) + i16 = int_ge(i13, i7) + guard_false(i16, descr=<rpython.jit.metainterp.compile.ResumeGuardFalseDescr object at 0x7f492da93610>) [p0, i13, i15, p3, None, None] + jump(p0, p3, i13, i15, i6, i7) """ opt = self.vectorize(self.parse_loop(trace)) self.debug_print_operations(opt.loop) diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py --- a/rpython/jit/metainterp/optimizeopt/vectorize.py +++ b/rpython/jit/metainterp/optimizeopt/vectorize.py @@ -339,6 +339,11 @@ for rdep in pack.right.depends(): lnode = ldep.to rnode = rdep.to + # only valid if the result of the left is in args of pack left + result = lnode.getoperation().result + args = pack.left.getoperation().getarglist() + if result is None or result not in args: + continue isomorph = 
isomorphic(lnode.getoperation(), rnode.getoperation()) if isomorph and lnode.is_before(rnode): pair = self.packset.can_be_packed(lnode, rnode, pack, False) @@ -351,6 +356,10 @@ for rdep in pack.right.provides(): lnode = ldep.to rnode = rdep.to + result = pack.left.getoperation().result + args = lnode.getoperation().getarglist() + if result is None or result not in args: + continue isomorph = isomorphic(lnode.getoperation(), rnode.getoperation()) if isomorph and lnode.is_before(rnode): pair = self.packset.can_be_packed(lnode, rnode, pack, True) diff --git a/rpython/jit/metainterp/warmstate.py b/rpython/jit/metainterp/warmstate.py --- a/rpython/jit/metainterp/warmstate.py +++ b/rpython/jit/metainterp/warmstate.py @@ -379,7 +379,7 @@ # so that it will keep it alive for a longer time warmrunnerdesc.memory_manager.keep_loop_alive(loop_token) # XXX debug purpose only - jitdriver_sd.xxxbench.xxx_clock_stop() + jitdriver_sd.xxxbench.xxx_clock_stop(fail=True) # XXX debug purpose only end # # Handle the failure _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit