Author: Richard Plangger <planri...@gmail.com> Branch: vecopt-merge Changeset: r79637:86dbbde6b191 Date: 2015-09-14 19:21 +0200 http://bitbucket.org/pypy/pypy/changeset/86dbbde6b191/
Log: further poking the scheduler. resoperations are now fully typed. this makes all the transformation logic much easier and less code, first simple tests pass already diff --git a/rpython/jit/metainterp/optimizeopt/schedule.py b/rpython/jit/metainterp/optimizeopt/schedule.py --- a/rpython/jit/metainterp/optimizeopt/schedule.py +++ b/rpython/jit/metainterp/optimizeopt/schedule.py @@ -97,7 +97,6 @@ """ Emit all the operations into the oplist parameter. Initiates the scheduling. """ assert isinstance(state, SchedulerState) - import pdb; pdb.set_trace() while state.has_more(): node = self.next(state) if node: @@ -273,6 +272,37 @@ # return self.count + +class TypeOutput(object): + def __init__(self, type, count): + self.type = type + self.count = count + + + def bytecount(self): + return self.count * self.type.bytecount() + +class DataTyper(object): + + def infer_type(self, op): + # default action, pass through: find the first arg + # the output is the same as the first argument! + if op.returns_void() or op.argcount() == 0: + return + arg0 = op.getarg(0) + op.setdatatype(arg0.datatype, arg0.bytesize, arg0.signed) + +class PassFirstArg(TypeOutput): + def __init__(self): + pass + +def update_arg_in_vector_pos(state, argidx, box): + arguments = [op.getoperation().getarg(argidx) for op in self.getoperations()] + for i,arg in enumerate(arguments): + #if i >= box.count: + # break + state.setvector_of_box(arg, i, box) + class TypeRestrict(object): ANY_TYPE = -1 ANY_SIZE = -1 @@ -296,389 +326,433 @@ return True -class TypeOutput(object): - def __init__(self, type, count): - self.type = type - self.count = count +class trans(object): + #DT_PASS = DataTyper() + TR_ANY = TypeRestrict() + TR_ANY_FLOAT = TypeRestrict(FLOAT) + TR_ANY_INTEGER = TypeRestrict(INT) + TR_FLOAT_2 = TypeRestrict(FLOAT, 4, 2) + TR_DOUBLE_2 = TypeRestrict(FLOAT, 8, 2) + TR_LONG = TypeRestrict(INT, 8, 2) + TR_INT_2 = TypeRestrict(INT, 4, 2) - def bytecount(self): - return self.count * self.type.bytecount() + #INT = OpToVectorOp((TR_ANY_INTEGER, TR_ANY_INTEGER), DT_PASS) + #FLOAT = OpToVectorOp((TR_ANY_FLOAT, TR_ANY_FLOAT), DT_PASS) + #FLOAT_UNARY = OpToVectorOp((TR_ANY_FLOAT,), DT_PASS) + #LOAD = LoadToVectorLoad() + #STORE = StoreToVectorStore() + #GUARD = PassThroughOp((TR_ANY_INTEGER,)) -class DataTyper(object): + # note that the following definition is x86 arch specific + MAPPING = { + rop.VEC_INT_ADD: [TR_ANY_INTEGER, TR_ANY_INTEGER], + rop.VEC_INT_SUB: [TR_ANY_INTEGER, TR_ANY_INTEGER], + rop.VEC_INT_MUL: [TR_ANY_INTEGER, TR_ANY_INTEGER], + rop.VEC_INT_AND: [TR_ANY_INTEGER, TR_ANY_INTEGER], + rop.VEC_INT_OR: [TR_ANY_INTEGER, TR_ANY_INTEGER], + rop.VEC_INT_XOR: [TR_ANY_INTEGER, TR_ANY_INTEGER], + rop.VEC_INT_EQ: [TR_ANY_INTEGER, TR_ANY_INTEGER], + rop.VEC_INT_NE: [TR_ANY_INTEGER, TR_ANY_INTEGER], - def infer_type(self, op): - # default action, pass through: find the first arg - # the output is the same as the first argument! - if op.returns_void() or op.argcount() == 0: - return - arg0 = op.getarg(0) - op.setdatatype(arg0.datatype, arg0.bytesize, arg0.signed) + rop.VEC_FLOAT_ADD: [TR_ANY_FLOAT, TR_ANY_FLOAT], + rop.VEC_FLOAT_SUB: [TR_ANY_FLOAT, TR_ANY_FLOAT], + rop.VEC_FLOAT_MUL: [TR_ANY_FLOAT, TR_ANY_FLOAT], + rop.VEC_FLOAT_TRUEDIV: [TR_ANY_FLOAT, TR_ANY_FLOAT], + rop.VEC_FLOAT_ABS: [TR_ANY_FLOAT], + rop.VEC_FLOAT_NEG: [TR_ANY_FLOAT], -class PassFirstArg(TypeOutput): - def __init__(self): - pass + rop.VEC_RAW_LOAD_I: [None, None, TR_ANY], + rop.VEC_RAW_LOAD_F: [None, None, TR_ANY], + rop.VEC_GETARRAYITEM_RAW_I: [None, None, TR_ANY], + rop.VEC_GETARRAYITEM_RAW_F: [None, None, TR_ANY], + rop.VEC_GETARRAYITEM_GC_I: [None, None, TR_ANY], + rop.VEC_GETARRAYITEM_GC_F: [None, None, TR_ANY], + + rop.VEC_RAW_STORE: [None, None, None, TR_ANY], + rop.VEC_SETARRAYITEM_RAW: [None, None, None, TR_ANY], + rop.VEC_SETARRAYITEM_GC: [None, None, None, TR_ANY], + + rop.GUARD_TRUE: [TR_ANY_INTEGER], + rop.GUARD_FALSE: [TR_ANY_INTEGER], + + ## irregular + rop.VEC_INT_SIGNEXT: [TR_ANY_INTEGER], + + rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: [TR_DOUBLE_2], + rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: [TR_FLOAT_2], + rop.VEC_CAST_FLOAT_TO_INT: [TR_DOUBLE_2], + rop.VEC_CAST_INT_TO_FLOAT: [TR_INT_2], + + rop.VEC_FLOAT_EQ: [TR_ANY_FLOAT,TR_ANY_FLOAT], + rop.VEC_FLOAT_NE: [TR_ANY_FLOAT,TR_ANY_FLOAT], + rop.VEC_INT_IS_TRUE: [TR_ANY_INTEGER,TR_ANY_INTEGER], + } + + # TODO? + UNSIGNED_OPS = (rop.UINT_FLOORDIV, rop.UINT_RSHIFT, + rop.UINT_LT, rop.UINT_LE, + rop.UINT_GT, rop.UINT_GE) + +def turn_to_vector(state, pack): + """ Turn a pack into a vector instruction """ + # + # TODO self.check_if_pack_supported(pack) + op = pack.leftmost() + args = op.getarglist() + prepare_arguments(state, pack, args) + vop = VecOperation(op.vector, args, op, pack.numops(), op.getdescr()) + for i,node in enumerate(pack.operations): + op = node.getoperation() + state.setvector_of_box(op,i,vop) + # + if op.is_guard(): + assert isinstance(op, GuardResOp) + assert isinstance(vop, GuardResOp) + vop.setfailargs(op.getfailargs()) + vop.rd_snapshot = op.rd_snapshot + state.costmodel.record_pack_savings(pack, pack.numops()) + # + if pack.is_accumulating(): + box = oplist[position].result + assert box is not None + for node in pack.operations: + op = node.getoperation() + assert not op.returns_void() + state.renamer.start_renaming(op, box) + # + state.oplist.append(vop) + + +def prepare_arguments(state, pack, args): + # Transforming one argument to a vector box argument + # The following cases can occur: + # 1) argument is present in the box_to_vbox map. + # a) vector can be reused immediatly (simple case) + # b) an operation forces the unpacking of a vector + # 2) argument is not known to reside in a vector + # a) expand vars/consts before the label and add as argument + # b) expand vars created in the loop body + # + restrictions = trans.MAPPING[pack.leftmost().vector] + for i,arg in enumerate(args): + if i >= len(restrictions) or restrictions[i] is None: + # ignore this argument + continue + print "trans", i, "arg", arg + if arg.returns_vector(): + continue + pos, vecop = state.getvector_of_box(arg) + if not vecop: + # 2) constant/variable expand this box + # TODO just as one function call + vecop = self.expand(arg, i) + state.setvector_of_box(arg, 0, vecop) + pos = 0 + continue + args[i] = vecop + assemble_scattered_values(state, pack, args, i) + position_values(state, pack, args, i, arg, pos) + +def assemble_scattered_values(state, pack, args, index): + vectors = pack.argument_vectors(state, pack, index) + if len(vectors) > 1: + # the argument is scattered along different vector boxes + value = gather(vectors, packable) + update_arg_in_vector_pos(state, i, value) + args[i] = value + #if packed < packable and len(vboxes) > 1: + # # the argument is scattered along different vector boxes + # args[i] = self.gather(vboxes, packable) + # self.update_arg_in_vector_pos(i, args[i]) + # continue + +def gather(self, vboxes, target_count): # packed < packable and packed < stride: + (_, box) = vboxes[0] + i = 1 + while i < len(vboxes): + (box2_pos, box2) = vboxes[i] + if box.getcount() + box2.getcount() <= target_count: + box = self.package(box, box.getcount(), + box2, box2_pos, box2.getcount()) + i += 1 + return box + +def position_values(state, pack, args, index, arg, pos): + pass + #if pos != 0: + # # The vector box is at a position != 0 but it + # # is required to be at position 0. Unpack it! + # args[i] = self.unpack(vecop, pos, packed - pos, self.input_type) + # self.update_arg_in_vector_pos(i, args[i]) + # continue + + # convert size i64 -> i32, i32 -> i64, ... + # TODO if self.bytesize > 0: + # determine_trans( + # self.input_type.getsize() != vecop.getsize(): + # vecop = self.extend(vecop, self.input_type) + + # use the input as an indicator for the pack type + #packable = vecop.maximum_numops() + #packed = vecop.count + #assert packed >= 0 + #assert packable >= 0 + #if packed > packable: + # # the argument has more items than the operation is able to process! + # # pos == 0 then it is already at the right place + # if pos != 0: + # args[i] = self.unpack(vecop, pos, packed - pos, self.input_type) + # self.update_arg_in_vector_pos(i, args[i]) + # #self.update_input_output(self.pack) + # continue + # else: + # assert vecop is not None + # args[i] = vecop + # continue + #vboxes = self.vector_boxes_for_args(i) + #if packed < packable and len(vboxes) > 1: + # # the argument is scattered along different vector boxes + # args[i] = self.gather(vboxes, packable) + # self.update_arg_in_vector_pos(i, args[i]) + # continue + #if pos != 0: + # # The vector box is at a position != 0 but it + # # is required to be at position 0. Unpack it! + # args[i] = self.unpack(vecop, pos, packed - pos, self.input_type) + # self.update_arg_in_vector_pos(i, args[i]) + # continue + ## + #assert vecop is not None + #args[i] = vecop + +def before_argument_transform(self, args): + pass + +def check_if_pack_supported(self, pack): + op0 = pack.operations[0].getoperation() + if self.input_type is None: + # must be a load/guard op + return + insize = self.input_type.getsize() + if op0.is_typecast(): + # prohibit the packing of signext calls that + # cast to int16/int8. + _, outsize = op0.cast_to() + self.sched_data._prevent_signext(outsize, insize) + if op0.getopnum() == rop.INT_MUL: + if insize == 8 or insize == 1: + # see assembler for comment why + raise NotAProfitableLoop + +#def transform_result(self, result): +# if result is None: +# return None +# vbox = self.new_result_vector_box() +# # +# # mark the position and the vbox in the hash +# for i, node in enumerate(self.getoperations()): +# if i >= vbox.getcount(): +# break +# op = node.getoperation() +# self.sched_data.setvector_of_box(op, i, vbox) +# return vbox + +#def new_result_vector_box(self): +# type = self.output_type.gettype() +# size = self.output_type.getsize() +# count = min(self.output_type.getcount(), len(self.pack.operations)) +# signed = self.output_type.signed +# return BoxVector(type, count, size, signed) + +#def getoperations(self): +# return self.pack.operations + +#def transform_arguments(self, args): +# """ Transforming one argument to a vector box argument +# The following cases can occur: +# 1) argument is present in the box_to_vbox map. +# a) vector can be reused immediatly (simple case) +# b) vector is to big +# c) vector is to small +# 2) argument is not known to reside in a vector +# a) expand vars/consts before the label and add as argument +# b) expand vars created in the loop body +# """ +# for i,arg in enumerate(args): +# if arg.returns_vector(): +# continue +# if not self.is_vector_arg(i): +# continue +# box_pos, vbox = self.sched_data.getvector_of_box(arg) +# if not vbox: +# # constant/variable expand this box +# vbox = self.expand(arg, i) +# self.sched_data.setvector_of_box(arg, 0, vbox) +# box_pos = 0 +# # convert size i64 -> i32, i32 -> i64, ... +# if self.input_type.getsize() > 0 and \ +# self.input_type.getsize() != vbox.getsize(): +# vbox = self.extend(vbox, self.input_type) + +# # use the input as an indicator for the pack type +# packable = self.input_type.getcount() +# packed = vbox.getcount() +# assert packed >= 0 +# assert packable >= 0 +# if packed > packable: +# # the argument has more items than the operation is able to process! +# # box_pos == 0 then it is already at the right place +# if box_pos != 0: +# args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type) +# self.update_arg_in_vector_pos(i, args[i]) +# #self.update_input_output(self.pack) +# continue +# else: +# assert vbox is not None +# args[i] = vbox +# continue +# vboxes = self.vector_boxes_for_args(i) +# if packed < packable and len(vboxes) > 1: +# # the argument is scattered along different vector boxes +# args[i] = self.gather(vboxes, packable) +# self.update_arg_in_vector_pos(i, args[i]) +# continue +# if box_pos != 0: +# # The vector box is at a position != 0 but it +# # is required to be at position 0. Unpack it! +# args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type) +# self.update_arg_in_vector_pos(i, args[i]) +# continue +# #self.update_input_output(self.pack) +# # +# assert vbox is not None +# args[i] = vbox + +def extend(self, vbox, newtype): + assert vbox.gettype() == newtype.gettype() + if vbox.gettype() == INT: + return self.extend_int(vbox, newtype) + else: + raise NotImplementedError("cannot yet extend float") + +def extend_int(self, vbox, newtype): + vbox_cloned = newtype.new_vector_box(vbox.getcount()) + self.sched_data._prevent_signext(newtype.getsize(), vbox.getsize()) + newsize = newtype.getsize() + assert newsize > 0 + op = ResOperation(rop.VEC_INT_SIGNEXT, + [vbox, ConstInt(newsize)], + vbox_cloned) + self.costmodel.record_cast_int(vbox.getsize(), newtype.getsize(), vbox.getcount()) + self.vecops.append(op) + return vbox_cloned + +def unpack(self, vbox, index, count, arg_ptype): + """ Extract parts of the vector box into another vector box """ + assert index < vbox.getcount() + assert index + count <= vbox.getcount() + assert count > 0 + vbox_cloned = vectorbox_clone_set(vbox, count=count) + opnum = getunpackopnum(vbox.gettype()) + op = ResOperation(opnum, [vbox, ConstInt(index), ConstInt(count)], vbox_cloned) + self.costmodel.record_vector_unpack(vbox, index, count) + self.vecops.append(op) + # + return vbox_cloned + +def package(self, tgt, tidx, src, sidx, scount): + """ tgt = [1,2,3,4,_,_,_,_] + src = [5,6,_,_] + new_box = [1,2,3,4,5,6,_,_] after the operation, tidx=4, scount=2 + """ + assert sidx == 0 # restriction + count = tgt.getcount() + src.getcount() + new_box = vectorbox_clone_set(tgt, count=count) + opnum = getpackopnum(tgt.gettype()) + op = ResOperation(opnum, [tgt, src, ConstInt(tidx), ConstInt(scount)], new_box) + self.vecops.append(op) + self.costmodel.record_vector_pack(src, sidx, scount) + if not we_are_translated(): + self._check_vec_pack(op) + return new_box + +def _check_vec_pack(self, op): + result = op + arg0 = op.getarg(0) + arg1 = op.getarg(1) + index = op.getarg(2) + count = op.getarg(3) + assert isinstance(result, BoxVector) + assert isinstance(arg0, BoxVector) + assert isinstance(index, ConstInt) + assert isinstance(count, ConstInt) + assert arg0.getsize() == result.getsize() + if isinstance(arg1, BoxVector): + assert arg1.getsize() == result.getsize() + else: + assert count.value == 1 + assert index.value < result.getcount() + assert index.value + count.value <= result.getcount() + assert result.getcount() > arg0.getcount() + +def expand(self, arg, argidx): + """ Expand a value into a vector box. useful for arith metic + of one vector with a scalar (either constant/varialbe) + """ + elem_count = self.input_type.getcount() + vbox = self.input_type.new_vector_box(elem_count) + box_type = arg.type + expanded_map = self.sched_data.expanded_map + # note that heterogenous nodes are not yet tracked + already_expanded = expanded_map.get(arg, None) + if already_expanded: + return already_expanded + + ops = self.sched_data.invariant_oplist + variables = self.sched_data.invariant_vector_vars + if isinstance(arg,Box) and arg not in self.sched_data.inputargs: + ops = self.vecops + variables = None + if isinstance(arg, BoxVector): + box_type = arg.gettype() + + for i, node in enumerate(self.getoperations()): + op = node.getoperation() + if not arg.same_box(op.getarg(argidx)): + break + i += 1 + else: + expand_opnum = getexpandopnum(box_type) + op = ResOperation(expand_opnum, [arg, ConstInt(vbox.item_count)], vbox) + ops.append(op) + if variables is not None: + variables.append(vbox) + expanded_map[arg] = vbox + return vbox + + op = ResOperation(rop.VEC_BOX, [ConstInt(elem_count)], vbox) + ops.append(op) + opnum = getpackopnum(arg.type) + for i,node in enumerate(self.getoperations()): + op = node.getoperation() + arg = op.getarg(argidx) + new_box = vbox.clonebox() + ci = ConstInt(i) + c1 = ConstInt(1) + op = ResOperation(opnum, [vbox,arg,ci,c1], new_box) + vbox = new_box + ops.append(op) + + if variables is not None: + variables.append(vbox) + return vbox class OpToVectorOp(object): - def __init__(self, restrictargs, typeoutput): - self.args = list(restrictargs) # do not use a tuple. rpython cannot union - self.out = typeoutput - - def as_vector_operation(self, state, pack): - # - # TODO self.check_if_pack_supported(pack) - op = pack.leftmost() - args = op.getarglist() - self.prepare_arguments(state, op.getarglist()) - vop = VecOperation(op.vector, args, op, pack.numops(), op.getdescr()) - # - if op.is_guard(): - assert isinstance(op, GuardResOp) - assert isinstance(vop, GuardResOp) - vop.setfailargs(op.getfailargs()) - vop.rd_snapshot = op.rd_snapshot - state.costmodel.record_pack_savings(pack, pack.numops()) - # - if pack.is_accumulating(): - box = oplist[position].result - assert box is not None - for node in pack.operations: - op = node.getoperation() - assert not op.returns_void() - scheduler.renamer.start_renaming(op, box) - # - state.oplist.append(vop) - - def prepare_arguments(self, state, args): - self.before_argument_transform(args) - # Transforming one argument to a vector box argument - # The following cases can occur: - # 1) argument is present in the box_to_vbox map. - # a) vector can be reused immediatly (simple case) - # b) vector is to big - # c) vector is to small - # 2) argument is not known to reside in a vector - # a) expand vars/consts before the label and add as argument - # b) expand vars created in the loop body - # - for i,arg in enumerate(args): - if arg.returns_vector(): - continue - if not self.transform_arg_at(i): - continue - box_pos, vbox = state.getvector_of_box(arg) - if not vbox: - # 2) constant/variable expand this box - vbox = self.expand(arg, i) - self.sched_data.setvector_of_box(arg, 0, vbox) - box_pos = 0 - # convert size i64 -> i32, i32 -> i64, ... - if self.input_type.getsize() > 0 and \ - self.input_type.getsize() != vbox.getsize(): - vbox = self.extend(vbox, self.input_type) - - # use the input as an indicator for the pack type - packable = self.input_type.getcount() - packed = vbox.getcount() - assert packed >= 0 - assert packable >= 0 - if packed > packable: - # the argument has more items than the operation is able to process! - # box_pos == 0 then it is already at the right place - if box_pos != 0: - args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type) - self.update_arg_in_vector_pos(i, args[i]) - #self.update_input_output(self.pack) - continue - else: - assert vbox is not None - args[i] = vbox - continue - vboxes = self.vector_boxes_for_args(i) - if packed < packable and len(vboxes) > 1: - # the argument is scattered along different vector boxes - args[i] = self.gather(vboxes, packable) - self.update_arg_in_vector_pos(i, args[i]) - continue - if box_pos != 0: - # The vector box is at a position != 0 but it - # is required to be at position 0. Unpack it! - args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type) - self.update_arg_in_vector_pos(i, args[i]) - continue - #self.update_input_output(self.pack) - # - assert vbox is not None - args[i] = vbox - - def before_argument_transform(self, args): + def __init__(self): #, restrictargs, typeoutput): pass - - def check_if_pack_supported(self, pack): - op0 = pack.operations[0].getoperation() - if self.input_type is None: - # must be a load/guard op - return - insize = self.input_type.getsize() - if op0.casts_box(): - # prohibit the packing of signext calls that - # cast to int16/int8. - _, outsize = op0.cast_to() - self.sched_data._prevent_signext(outsize, insize) - if op0.getopnum() == rop.INT_MUL: - if insize == 8 or insize == 1: - # see assembler for comment why - raise NotAProfitableLoop - - def transform_result(self, result): - if result is None: - return None - vbox = self.new_result_vector_box() - # - # mark the position and the vbox in the hash - for i, node in enumerate(self.getoperations()): - if i >= vbox.getcount(): - break - op = node.getoperation() - self.sched_data.setvector_of_box(op, i, vbox) - return vbox - - def new_result_vector_box(self): - type = self.output_type.gettype() - size = self.output_type.getsize() - count = min(self.output_type.getcount(), len(self.pack.operations)) - signed = self.output_type.signed - return BoxVector(type, count, size, signed) - - def getoperations(self): - return self.pack.operations - - def transform_arguments(self, args): - """ Transforming one argument to a vector box argument - The following cases can occur: - 1) argument is present in the box_to_vbox map. - a) vector can be reused immediatly (simple case) - b) vector is to big - c) vector is to small - 2) argument is not known to reside in a vector - a) expand vars/consts before the label and add as argument - b) expand vars created in the loop body - """ - for i,arg in enumerate(args): - if arg.returns_vector(): - continue - if not self.is_vector_arg(i): - continue - box_pos, vbox = self.sched_data.getvector_of_box(arg) - if not vbox: - # constant/variable expand this box - vbox = self.expand(arg, i) - self.sched_data.setvector_of_box(arg, 0, vbox) - box_pos = 0 - # convert size i64 -> i32, i32 -> i64, ... - if self.input_type.getsize() > 0 and \ - self.input_type.getsize() != vbox.getsize(): - vbox = self.extend(vbox, self.input_type) - - # use the input as an indicator for the pack type - packable = self.input_type.getcount() - packed = vbox.getcount() - assert packed >= 0 - assert packable >= 0 - if packed > packable: - # the argument has more items than the operation is able to process! - # box_pos == 0 then it is already at the right place - if box_pos != 0: - args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type) - self.update_arg_in_vector_pos(i, args[i]) - #self.update_input_output(self.pack) - continue - else: - assert vbox is not None - args[i] = vbox - continue - vboxes = self.vector_boxes_for_args(i) - if packed < packable and len(vboxes) > 1: - # the argument is scattered along different vector boxes - args[i] = self.gather(vboxes, packable) - self.update_arg_in_vector_pos(i, args[i]) - continue - if box_pos != 0: - # The vector box is at a position != 0 but it - # is required to be at position 0. Unpack it! - args[i] = self.unpack(vbox, box_pos, packed - box_pos, self.input_type) - self.update_arg_in_vector_pos(i, args[i]) - continue - #self.update_input_output(self.pack) - # - assert vbox is not None - args[i] = vbox - - def gather(self, vboxes, target_count): # packed < packable and packed < stride: - (_, box) = vboxes[0] - i = 1 - while i < len(vboxes): - (box2_pos, box2) = vboxes[i] - if box.getcount() + box2.getcount() <= target_count: - box = self.package(box, box.getcount(), - box2, box2_pos, box2.getcount()) - i += 1 - return box - - def update_arg_in_vector_pos(self, argidx, box): - arguments = [op.getoperation().getarg(argidx) for op in self.getoperations()] - for i,arg in enumerate(arguments): - if i >= box.getcount(): - break - self.sched_data.setvector_of_box(arg, i, box) - - def vector_boxes_for_args(self, index): - args = [op.getoperation().getarg(index) for op in self.getoperations()] - vboxes = [] - last_vbox = None - for arg in args: - pos, vbox = self.sched_data.getvector_of_box(arg) - if vbox is not last_vbox and vbox is not None: - vboxes.append((pos, vbox)) - last_vbox = vbox - return vboxes - - - def extend(self, vbox, newtype): - assert vbox.gettype() == newtype.gettype() - if vbox.gettype() == INT: - return self.extend_int(vbox, newtype) - else: - raise NotImplementedError("cannot yet extend float") - - def extend_int(self, vbox, newtype): - vbox_cloned = newtype.new_vector_box(vbox.getcount()) - self.sched_data._prevent_signext(newtype.getsize(), vbox.getsize()) - newsize = newtype.getsize() - assert newsize > 0 - op = ResOperation(rop.VEC_INT_SIGNEXT, - [vbox, ConstInt(newsize)], - vbox_cloned) - self.costmodel.record_cast_int(vbox.getsize(), newtype.getsize(), vbox.getcount()) - self.vecops.append(op) - return vbox_cloned - - def unpack(self, vbox, index, count, arg_ptype): - """ Extract parts of the vector box into another vector box """ - assert index < vbox.getcount() - assert index + count <= vbox.getcount() - assert count > 0 - vbox_cloned = vectorbox_clone_set(vbox, count=count) - opnum = getunpackopnum(vbox.gettype()) - op = ResOperation(opnum, [vbox, ConstInt(index), ConstInt(count)], vbox_cloned) - self.costmodel.record_vector_unpack(vbox, index, count) - self.vecops.append(op) - # - return vbox_cloned - - def package(self, tgt, tidx, src, sidx, scount): - """ tgt = [1,2,3,4,_,_,_,_] - src = [5,6,_,_] - new_box = [1,2,3,4,5,6,_,_] after the operation, tidx=4, scount=2 - """ - assert sidx == 0 # restriction - count = tgt.getcount() + src.getcount() - new_box = vectorbox_clone_set(tgt, count=count) - opnum = getpackopnum(tgt.gettype()) - op = ResOperation(opnum, [tgt, src, ConstInt(tidx), ConstInt(scount)], new_box) - self.vecops.append(op) - self.costmodel.record_vector_pack(src, sidx, scount) - if not we_are_translated(): - self._check_vec_pack(op) - return new_box - - def _check_vec_pack(self, op): - result = op - arg0 = op.getarg(0) - arg1 = op.getarg(1) - index = op.getarg(2) - count = op.getarg(3) - assert isinstance(result, BoxVector) - assert isinstance(arg0, BoxVector) - assert isinstance(index, ConstInt) - assert isinstance(count, ConstInt) - assert arg0.getsize() == result.getsize() - if isinstance(arg1, BoxVector): - assert arg1.getsize() == result.getsize() - else: - assert count.value == 1 - assert index.value < result.getcount() - assert index.value + count.value <= result.getcount() - assert result.getcount() > arg0.getcount() - - def expand(self, arg, argidx): - """ Expand a value into a vector box. useful for arith metic - of one vector with a scalar (either constant/varialbe) - """ - elem_count = self.input_type.getcount() - vbox = self.input_type.new_vector_box(elem_count) - box_type = arg.type - expanded_map = self.sched_data.expanded_map - # note that heterogenous nodes are not yet tracked - already_expanded = expanded_map.get(arg, None) - if already_expanded: - return already_expanded - - ops = self.sched_data.invariant_oplist - variables = self.sched_data.invariant_vector_vars - if isinstance(arg,Box) and arg not in self.sched_data.inputargs: - ops = self.vecops - variables = None - if isinstance(arg, BoxVector): - box_type = arg.gettype() - - for i, node in enumerate(self.getoperations()): - op = node.getoperation() - if not arg.same_box(op.getarg(argidx)): - break - i += 1 - else: - expand_opnum = getexpandopnum(box_type) - op = ResOperation(expand_opnum, [arg, ConstInt(vbox.item_count)], vbox) - ops.append(op) - if variables is not None: - variables.append(vbox) - expanded_map[arg] = vbox - return vbox - - op = ResOperation(rop.VEC_BOX, [ConstInt(elem_count)], vbox) - ops.append(op) - opnum = getpackopnum(arg.type) - for i,node in enumerate(self.getoperations()): - op = node.getoperation() - arg = op.getarg(argidx) - new_box = vbox.clonebox() - ci = ConstInt(i) - c1 = ConstInt(1) - op = ResOperation(opnum, [vbox,arg,ci,c1], new_box) - vbox = new_box - ops.append(op) - - if variables is not None: - variables.append(vbox) - return vbox - - def transform_arg_at(self, i): - if i < 0 or i >= len(self.args): - return False - return self.args[i] is not None - - def get_output_type_given(self, input_type, op): - return input_type - - def get_input_type_given(self, output_type, op): - return output_type - - def force_input(self, ptype): - """ Some operations require a specific count/size, - they can force the input type here! - """ - return ptype + #self.args = list(restrictargs) # do not use a tuple. rpython cannot union + #self.out = typeoutput class OpToVectorOpConv(OpToVectorOp): def __init__(self, intype, outtype): @@ -790,97 +864,31 @@ raise AssertionError("cannot infer input type from output type") -class trans(object): - DT_PASS = DataTyper() - TR_ANY_FLOAT = TypeRestrict(FLOAT) - TR_ANY_INTEGER = TypeRestrict(INT) - TR_FLOAT_2 = TypeRestrict(FLOAT, 4, 2) - TR_DOUBLE_2 = TypeRestrict(FLOAT, 8, 2) - TR_LONG = TypeRestrict(INT, 8, 2) - TR_INT_2 = TypeRestrict(INT, 4, 2) - - INT = OpToVectorOp((TR_ANY_INTEGER, TR_ANY_INTEGER), DT_PASS) - FLOAT = OpToVectorOp((TR_ANY_FLOAT, TR_ANY_FLOAT), DT_PASS) - FLOAT_UNARY = OpToVectorOp((TR_ANY_FLOAT,), DT_PASS) - LOAD = LoadToVectorLoad() - STORE = StoreToVectorStore() - GUARD = PassThroughOp((TR_ANY_INTEGER,)) - - # note that the following definition is x86 arch specific - MAPPING = { - rop.VEC_INT_ADD: INT, - rop.VEC_INT_SUB: INT, - rop.VEC_INT_MUL: INT, - rop.VEC_INT_AND: INT, - rop.VEC_INT_OR: INT, - rop.VEC_INT_XOR: INT, - rop.VEC_INT_EQ: INT, - rop.VEC_INT_NE: INT, - - rop.VEC_FLOAT_ADD: FLOAT, - rop.VEC_FLOAT_SUB: FLOAT, - rop.VEC_FLOAT_MUL: FLOAT, - rop.VEC_FLOAT_TRUEDIV: FLOAT, - rop.VEC_FLOAT_ABS: FLOAT_UNARY, - rop.VEC_FLOAT_NEG: FLOAT_UNARY, - - rop.VEC_RAW_LOAD_I: LOAD, - rop.VEC_RAW_LOAD_F: LOAD, - rop.VEC_GETARRAYITEM_RAW_I: LOAD, - rop.VEC_GETARRAYITEM_RAW_F: LOAD, - rop.VEC_GETARRAYITEM_GC_I: LOAD, - rop.VEC_GETARRAYITEM_GC_F: LOAD, - - rop.VEC_RAW_STORE: STORE, - rop.VEC_SETARRAYITEM_RAW: STORE, - rop.VEC_SETARRAYITEM_GC: STORE, - - rop.GUARD_TRUE: GUARD, - rop.GUARD_FALSE: GUARD, - - # irregular - rop.VEC_INT_SIGNEXT: SignExtToVectorOp((TR_ANY_INTEGER,), None), - - rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: OpToVectorOpConv(TR_DOUBLE_2, None), #RESTRICT_2_FLOAT), - rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: OpToVectorOpConv(TR_FLOAT_2, None), #RESTRICT_2_DOUBLE), - rop.VEC_CAST_FLOAT_TO_INT: OpToVectorOpConv(TR_DOUBLE_2, None), #RESTRICT_2_INT), - rop.VEC_CAST_INT_TO_FLOAT: OpToVectorOpConv(TR_INT_2, None), #RESTRICT_2_DOUBLE), - - rop.VEC_FLOAT_EQ: OpToVectorOp((TR_ANY_FLOAT,TR_ANY_FLOAT), None), - rop.VEC_FLOAT_NE: OpToVectorOp((TR_ANY_FLOAT,TR_ANY_FLOAT), None), - rop.VEC_INT_IS_TRUE: OpToVectorOp((TR_ANY_INTEGER,TR_ANY_INTEGER), None), # TR_ANY_INTEGER), - } - - # TODO? - UNSIGNED_OPS = (rop.UINT_FLOORDIV, rop.UINT_RSHIFT, - rop.UINT_LT, rop.UINT_LE, - rop.UINT_GT, rop.UINT_GE) - -def determine_input_output_types(pack, node, forward): - """ This function is two fold. If moving forward, it - gets an input type from the packs output type and returns - the transformed packtype. - - Moving backward, the origins pack input type is the output - type and the transformation of the packtype (in reverse direction) - is the input - """ - op = node.getoperation() - op2vecop = determine_trans(op) - if forward: - input_type = op2vecop.force_input(pack.output_type) - output_type = op2vecop.get_output_type_given(input_type, op) - if output_type: - output_type = output_type.clone() - else: - # going backwards, things are not that easy anymore - output_type = pack.input_type - input_type = op2vecop.get_input_type_given(output_type, op) - if input_type: - input_type = input_type.clone() - - return input_type, output_type +#def determine_input_output_types(pack, node, forward): +# """ This function is two fold. If moving forward, it +# gets an input type from the packs output type and returns +# the transformed packtype. +# +# Moving backward, the origins pack input type is the output +# type and the transformation of the packtype (in reverse direction) +# is the input +# """ +# op = node.getoperation() +# op2vecop = determine_trans(op) +# if forward: +# input_type = op2vecop.force_input(pack.output_type) +# output_type = op2vecop.get_output_type_given(input_type, op) +# if output_type: +# output_type = output_type.clone() +# else: +# # going backwards, things are not that easy anymore +# output_type = pack.input_type +# input_type = op2vecop.get_input_type_given(output_type, op) +# if input_type: +# input_type = input_type.clone() +# +# return input_type, output_type def determine_trans(op): op2vecop = trans.MAPPING.get(op.vector, None) @@ -951,8 +959,8 @@ assert node.pack.numops() > 1 for node in node.pack.operations: scheduler.mark_emitted(node, self) - op2vecop = determine_trans(node.pack.leftmost()) - op2vecop.as_vector_operation(self, node.pack) + # TODO op2vecop = determine_trans(node.pack.leftmost()) + turn_to_vector(self, node.pack) return True return False @@ -1021,39 +1029,22 @@ def getvector_of_box(self, arg): return self.box_to_vbox.get(arg, (-1, None)) - def setvector_of_box(self, box, off, vector): - assert off < vector.getcount() - assert box.type != 'V' - self.box_to_vbox[box] = (off, vector) + def setvector_of_box(self, var, off, vector): + assert off < vector.count + assert not var.is_vector() + self.box_to_vbox[var] = (off, vector) def opcount_filling_vector_register(pack, vec_reg_size): """ how many operations of that kind can one execute with a machine instruction of register size X? """ - pack_type = pack.input_type - if pack_type is None: - pack_type = pack.output_type # load operations - op = pack.leftmost() - if op.casts_box(): - count = pack_type.getcount() - return count - count = vec_reg_size // pack_type.getsize() - return count - -def maximum_byte_size(pack, vec_reg_size): - """ The maxmum size in bytes the operation is able to - process with the hardware register and the operation - semantics. - """ - op = pack.leftmost() - if op.casts_box(): - # casting is special, often only takes a half full vector - pack_type = pack.input_type - if pack_type is None: - pack_type = pack.output_type # load operations - return pack_type.byte_size() - return vec_reg_size + if op.is_typecast(): + if op.casts_down(): + return vec_reg_size // op.cast_from_bytesize() + else: + return vec_reg_size // op.cast_to_bytesize() + return vec_reg_size // op.bytesize class Pack(object): """ A pack is a set of n statements that are: @@ -1080,6 +1071,9 @@ def leftmost(self): return self.operations[0].getoperation() + def rightmost(self): + return self.operations[-1].getoperation() + def pack_type(self): ptype = self.input_type if self.input_type is None: @@ -1113,14 +1107,15 @@ return 0 if self.numops() == 0: return -1 - size = maximum_byte_size(self, vec_reg_size) - return left.bytesize * self.numops() - size - #if self.input_type is None: - # e.g. load operations - # return self.output_type.bytecount(self) - size - # default only consider the input type - # e.g. store operations, int_add, ... - #return self.input_type.bytecount(self) - size + if left.is_typecast(): + # casting is special, often only takes a half full vector + if left.casts_down(): + # size is reduced + return left.cast_from_bytesize() * self.numops() - vec_reg_size + else: + # size is increased + return left.cast_to_bytesize() * self.numops() - vec_reg_size + return left.bytesize * self.numops() - vec_reg_size def is_full(self, vec_reg_size): """ If one input element times the opcount is equal @@ -1190,6 +1185,17 @@ accum = False return rightmost is leftmost and accum + def argument_vectors(self, state, pack, index): + args = [node.getoperation().getarg(index) for node in pack.operations] + vectors = [] + last = None + for arg in args: + pos, vecop = state.getvector_of_box(arg) + if vecop is not last and vecop is not None: + vectors.append((pos, vecop)) + last = vecop + return vectors + def __repr__(self): if len(self.operations) == 0: return "Pack(empty)" diff --git a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py --- a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py @@ -50,7 +50,7 @@ else: label = loop.operations[0] label.setdescr(TargetToken(token)) - loop = VectorLoop(label, loop.operations[1:-1], loop.operations[-1]) + loop = VectorLoop(label, loop.operations[0:-1], loop.operations[-1]) loop.jump.setdescr(token) for op in loop.operations: if op.getopnum() == rop.GUARD_EARLY_EXIT and op.getdescr() is None: diff --git a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py --- a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py +++ b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py @@ -67,8 +67,8 @@ loop.graph = FakeDependencyGraph(loop) return loop - def pack(self, loop, l, r, input_type, output_type): - return Pack(loop.graph.nodes[1+l:1+r]) + def pack(self, loop, l, r, input_type=None, output_type=None): + return Pack(loop.graph.nodes[l:r]) def schedule(self, loop, packs, vec_reg_size=16, prepend_invariant=False, overwrite_funcs=None): @@ -115,6 +115,21 @@ assert node.count == 1 # must return here, then the test passed + def test_split_pack(self): + loop1 = self.parse_trace(""" + f10 = raw_load_f(p0, i0, descr=double) + f11 = raw_load_f(p0, i1, descr=double) + f12 = raw_load_f(p0, i2, descr=double) + """) + ps = PackSet(16) + ps.packs = [self.pack(loop1, 0, 3)] + op1 = ps.packs[0].operations[0] + op2 = ps.packs[0].operations[1] + ps.split_overloaded_packs() + assert len(ps.packs) == 1 + assert ps.packs[0].leftmost() is op1.getoperation() + assert ps.packs[0].rightmost() is op2.getoperation() + def test_schedule_split_load(self): loop1 = self.parse_trace(""" f10 = raw_load_f(p0, i0, descr=float) @@ -124,10 +139,10 @@ f14 = raw_load_f(p0, i4, descr=float) f15 = raw_load_f(p0, i5, descr=float) """) - pack1 = self.pack(loop1, 0, 6, None, F32) + pack1 = self.pack(loop1, 0, 6) loop2 = self.schedule(loop1, [pack1]) loop3 = self.parse_trace(""" - v10[4xi32] = vec_raw_load_i(p0, i0, descr=float) + v10[4xi32] = vec_raw_load_f(p0, i0, descr=float) f10 = raw_load_f(p0, i4, descr=float) f11 = raw_load_f(p0, i5, descr=float) """, False) @@ -135,21 +150,21 @@ def test_int_to_float(self): loop1 = self.parse_trace(""" - i10 = raw_load(p0, i0, descr=long) - i11 = raw_load(p0, i1, descr=long) + i10 = raw_load_i(p0, i0, descr=long) + i11 = raw_load_i(p0, i1, descr=long) i12 = int_signext(i10, 4) i13 = int_signext(i11, 4) f10 = cast_int_to_float(i12) f11 = cast_int_to_float(i13) """) - pack1 = self.pack(loop1, 0, 2, None, I64) - pack2 = self.pack(loop1, 2, 4, I64, I32_2) - pack3 = self.pack(loop1, 4, 6, I32_2, F32_2) + pack1 = self.pack(loop1, 0, 2) + pack2 = self.pack(loop1, 2, 4) + pack3 = self.pack(loop1, 4, 6) loop2 = self.schedule(loop1, [pack1, pack2, pack3]) loop3 = self.parse_trace(""" - v10[i64|2] = vec_raw_load_i(p0, i0, descr=long) - v20[i32|2] = vec_int_signext(v10[i64|2], 4) - v30[f64|2] = vec_cast_int_to_float(v20[i32|2]) + v10[2xi64] = vec_raw_load_i(p0, i0, descr=long) + v20[2xi32] = vec_int_signext(v10[2xi64], 4) + v30[2xf64] = vec_cast_int_to_float(v20[2xi32]) """, False) self.assert_equal(loop2, loop3) @@ -161,12 +176,12 @@ pack1 = self.pack(loop1, 0, 2, I64, I64) loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) loop3 = self.parse_trace(""" - v10[i64|2] = vec_box(2) - v20[i64|2] = vec_int_pack(v10[i64|2], i0, 0, 1) - v30[i64|2] = vec_int_pack(v20[i64|2], i1, 1, 1) - v40[i64|2] = vec_int_expand(73,2) + v10[2xi64] = vec_box_i() + v20[2xi64] = vec_int_pack(v10[2xi64], i0, 0, 1) + v30[2xi64] = vec_int_pack(v20[2xi64], i1, 1, 1) + v40[2xi64] = vec_int_expand(73,2) # - v50[i64|2] = vec_int_add(v30[i64|2], v40[i64|2]) + v50[2xi64] = vec_int_add(v30[2xi64], v40[2xi64]) """, False) self.assert_equal(loop2, loop3) @@ -177,12 +192,12 @@ pack1 = self.pack(loop1, 0, 2, F64, F64) loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) loop3 = self.parse_trace(""" - v10[f64|2] = vec_box(2) - v20[f64|2] = vec_float_pack(v10[f64|2], f0, 0, 1) - v30[f64|2] = vec_float_pack(v20[f64|2], f1, 1, 1) - v40[f64|2] = vec_float_expand(73.0,2) + v10[2xf64] = vec_box_f() + v20[2xf64] = vec_float_pack(v10[2xf64], f0, 0, 1) + v30[2xf64] = vec_float_pack(v20[2xf64], f1, 1, 1) + v40[2xf64] = vec_float_expand(73.0,2) # - v50[f64|2] = vec_float_add(v30[f64|2], v40[f64|2]) + v50[2xf64] = vec_float_add(v30[2xf64], v40[2xf64]) """, False) self.assert_equal(loop2, loop3) @@ -197,13 +212,13 @@ pack2 = self.pack(loop1, 2, 4, F64, F64) loop2 = self.schedule(loop1, [pack1, pack2], prepend_invariant=True) loop3 = self.parse_trace(""" - v10[f64|2] = vec_box(2) - v20[f64|2] = vec_float_pack(v10[f64|2], f0, 0, 1) - v30[f64|2] = vec_float_pack(v20[f64|2], f1, 1, 1) - v40[f64|2] = vec_float_expand(f5,2) # only expaned once + v10[2xf64] = vec_box_f() + v20[2xf64] = vec_float_pack(v10[2xf64], f0, 0, 1) + v30[2xf64] = vec_float_pack(v20[2xf64], f1, 1, 1) + v40[2xf64] = vec_float_expand(f5,2) # only expaned once # - v50[f64|2] = vec_float_add(v30[f64|2], v40[f64|2]) - v60[f64|2] = vec_float_add(v50[f64|2], v40[f64|2]) + v50[2xf64] = vec_float_add(v30[2xf64], v40[2xf64]) + v60[2xf64] = vec_float_add(v50[2xf64], v40[2xf64]) """, False) self.assert_equal(loop2, loop3) @@ -217,7 +232,7 @@ loop1 = self.parse_trace(""" i10 = int_signext(i1, 4) i11 = int_signext(i1, 4) - """, additional_args=['v10[i64|2]']) + """, additional_args=['v10[2xi64]']) pack1 = self.pack(loop1, 0, 2, I64, I32_2) var = self.find_input_arg('v10', loop1) def i1inv103204(v): @@ -227,20 +242,20 @@ 'getvector_of_box': i1inv103204, }) loop3 = self.parse_trace(""" - v11[i32|2] = vec_int_signext(v10[i64|2], 4) - """, False, additional_args=['v10[i64|2]']) + v11[2xi32] = vec_int_signext(v10[2xi64], 4) + """, False, additional_args=['v10[2xi64]']) self.assert_equal(loop2, loop3) def test_cast_float_to_int(self): loop1 = self.parse_trace(""" - f10 = raw_load(p0, i1, descr=double) - f11 = raw_load(p0, i2, descr=double) - f12 = raw_load(p0, i3, descr=double) - f13 = raw_load(p0, i4, descr=double) - f14 = raw_load(p0, i5, descr=double) - f15 = raw_load(p0, i6, descr=double) - f16 = raw_load(p0, i7, descr=double) - f17 = raw_load(p0, i8, descr=double) + f10 = raw_load_f(p0, i1, descr=double) + f11 = raw_load_f(p0, i2, descr=double) + f12 = raw_load_f(p0, i3, descr=double) + f13 = raw_load_f(p0, i4, descr=double) + f14 = raw_load_f(p0, i5, descr=double) + f15 = raw_load_f(p0, i6, descr=double) + f16 = raw_load_f(p0, i7, descr=double) + f17 = raw_load_f(p0, i8, descr=double) # i10 = cast_float_to_int(f10) i11 = cast_float_to_int(f11) @@ -281,31 +296,31 @@ '_prevent_signext': void }) loop3 = self.parse_trace(""" - v10[f64|2] = vec_raw_load_f(p0, i1, descr=double) - v11[f64|2] = vec_raw_load_f(p0, i3, descr=double) - v12[f64|2] = vec_raw_load_f(p0, i5, descr=double) - v13[f64|2] = vec_raw_load_f(p0, i7, descr=double) - v14[i32|2] = vec_cast_float_to_int(v10[f64|2]) - v15[i32|2] = vec_cast_float_to_int(v11[f64|2]) - v16[i32|2] = vec_cast_float_to_int(v12[f64|2]) - v17[i32|2] = vec_cast_float_to_int(v13[f64|2]) - v18[i16|2] = vec_int_signext(v14[i32|2],2) - v19[i16|2] = vec_int_signext(v15[i32|2],2) - v20[i16|2] = vec_int_signext(v16[i32|2],2) - v21[i16|2] = vec_int_signext(v17[i32|2],2) - v22[i16|4] = vec_int_pack(v18[i16|2], v19[i16|2], 2, 2) - v23[i16|6] = vec_int_pack(v22[i16|4], v20[i16|2], 4, 2) - v24[i16|8] = vec_int_pack(v23[i16|6], v21[i16|2], 6, 2) - vec_raw_store(p1, i1, v24[i16|8], descr=short) + v10[2xf64] = vec_raw_load_f(p0, i1, descr=double) + v11[2xf64] = vec_raw_load_f(p0, i3, descr=double) + v12[2xf64] = vec_raw_load_f(p0, i5, descr=double) + v13[2xf64] = vec_raw_load_f(p0, i7, descr=double) + v14[2xi32] = vec_cast_float_to_int(v10[2xf64]) + v15[2xi32] = vec_cast_float_to_int(v11[2xf64]) + v16[2xi32] = vec_cast_float_to_int(v12[2xf64]) + v17[2xi32] = vec_cast_float_to_int(v13[2xf64]) + v18[2xi16] = vec_int_signext(v14[2xi32],2) + v19[2xi16] = vec_int_signext(v15[2xi32],2) + v20[2xi16] = vec_int_signext(v16[2xi32],2) + v21[2xi16] = vec_int_signext(v17[2xi32],2) + v22[4xi16] = vec_int_pack(v18[2xi16], v19[2xi16], 2, 2) + v23[6xi16] = vec_int_pack(v22[4xi16], v20[2xi16], 4, 2) + v24[8xi16] = vec_int_pack(v23[6xi16], v21[2xi16], 6, 2) + vec_raw_store(p1, i1, v24[8xi16], descr=short) """, False) self.assert_equal(loop2, loop3) def test_cast_float_to_single_float(self): loop1 = self.parse_trace(""" - f10 = raw_load(p0, i1, descr=double) - f11 = raw_load(p0, i2, descr=double) - f12 = raw_load(p0, i3, descr=double) - f13 = raw_load(p0, i4, descr=double) + f10 = raw_load_f(p0, i1, descr=double) + f11 = raw_load_f(p0, i2, descr=double) + f12 = raw_load_f(p0, i3, descr=double) + f13 = raw_load_f(p0, i4, descr=double) # i10 = cast_float_to_singlefloat(f10) i11 = cast_float_to_singlefloat(f11) @@ -322,19 +337,19 @@ pack3 = self.pack(loop1, 8, 12, I32, None) loop2 = self.schedule(loop1, [pack1,pack2,pack3]) loop3 = self.parse_trace(""" - v44[f64|2] = vec_raw_load_f(p0, i1, descr=double) - v45[f64|2] = vec_raw_load_f(p0, i3, descr=double) - v46[i32|2] = vec_cast_float_to_singlefloat(v44[f64|2]) - v47[i32|2] = vec_cast_float_to_singlefloat(v45[f64|2]) - v41[i32|4] = vec_int_pack(v46[i32|2], v47[i32|2], 2, 2) - vec_raw_store(p1, i1, v41[i32|4], descr=float) + v44[2xf64] = vec_raw_load_f(p0, i1, descr=double) + v45[2xf64] = vec_raw_load_f(p0, i3, descr=double) + v46[2xi32] = vec_cast_float_to_singlefloat(v44[2xf64]) + v47[2xi32] = vec_cast_float_to_singlefloat(v45[2xf64]) + v41[4xi32] = vec_int_pack(v46[2xi32], v47[2xi32], 2, 2) + vec_raw_store(p1, i1, v41[4xi32], descr=float) """, False) self.assert_equal(loop2, loop3) def test_all(self): loop1 = self.parse_trace(""" - i10 = raw_load(p0, i1, descr=long) - i11 = raw_load(p0, i2, descr=long) + i10 = raw_load_i(p0, i1, descr=long) + i11 = raw_load_i(p0, i2, descr=long) # i12 = int_and(i10, 255) i13 = int_and(i11, 255) @@ -347,20 +362,20 @@ pack3 = self.pack(loop1, 4, 6, I64, None) loop2 = self.schedule(loop1, [pack1,pack2,pack3], prepend_invariant=True) loop3 = self.parse_trace(""" - v9[i64|2] = vec_int_expand(255,2) - v10[i64|2] = vec_raw_load_i(p0, i1, descr=long) - v11[i64|2] = vec_int_and(v10[i64|2], v9[i64|2]) - guard_true(v11[i64|2]) [] + v9[2xi64] = vec_int_expand(255,2) + v10[2xi64] = vec_raw_load_i(p0, i1, descr=long) + v11[2xi64] = vec_int_and(v10[2xi64], v9[2xi64]) + guard_true(v11[2xi64]) [] """, False) self.assert_equal(loop2, loop3) def test_split_load_store(self): loop1 = self.parse_trace(""" - i10 = raw_load(p0, i1, descr=float) - i11 = raw_load(p0, i2, descr=float) - i12 = raw_load(p0, i3, descr=float) - i13 = raw_load(p0, i4, descr=float) + i10 = raw_load_f(p0, i1, descr=float) + i11 = raw_load_f(p0, i2, descr=float) + i12 = raw_load_f(p0, i3, descr=float) + i13 = raw_load_f(p0, i4, descr=float) raw_store(p0, i3, i10, descr=float) raw_store(p0, i4, i11, descr=float) """) @@ -368,10 +383,10 @@ pack2 = self.pack(loop1, 4, 6, I32_2, None) loop2 = self.schedule(loop1, [pack1,pack2], prepend_invariant=True) loop3 = self.parse_trace(""" - v1[i32|4] = vec_raw_load_i(p0, i1, descr=float) - i10 = vec_int_unpack(v1[i32|4], 0, 1) + v1[4xi32] = vec_raw_load_i(p0, i1, descr=float) + i10 = vec_int_unpack(v1[4xi32], 0, 1) raw_store(p0, i3, i10, descr=float) - i11 = vec_int_unpack(v1[i32|4], 1, 1) + i11 = vec_int_unpack(v1[4xi32], 1, 1) raw_store(p0, i4, i11, descr=float) """, False) # unfortunate ui32 is the type for float32... the unsigned u is for @@ -386,9 +401,9 @@ pack1 = self.pack(loop1, 0, 2, I64, I64) loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) loop3 = self.parse_trace(""" - v1[i64|2] = vec_int_expand(255,2) - v2[i64|2] = vec_int_expand(i1,2) - v3[i64|2] = vec_int_and(v1[i64|2], v2[i64|2]) + v1[2xi64] = vec_int_expand(255,2) + v2[2xi64] = vec_int_expand(i1,2) + v3[2xi64] = vec_int_and(v1[2xi64], v2[2xi64]) """, False) self.assert_equal(loop2, loop3) @@ -400,9 +415,9 @@ pack1 = self.pack(loop1, 0, 2, I64, I64) loop2 = self.schedule(loop1, [pack1], prepend_invariant=True) loop3 = self.parse_trace(""" - v1[i64|2] = vec_int_expand(255, 2) - v2[i64|2] = vec_int_expand(i1, 2) - v3[i64|2] = vec_int_and(v1[i64|2], v2[i64|2]) + v1[2xi64] = vec_int_expand(255, 2) + v2[2xi64] = vec_int_expand(i1, 2) + v3[2xi64] = vec_int_and(v1[2xi64], v2[2xi64]) """, False) self.assert_equal(loop2, loop3) @@ -419,19 +434,19 @@ pack4 = self.pack(loop1, 4, 6, I64, I64) loop2 = self.schedule(loop1, [pack1,pack4], prepend_invariant=True) loop3 = self.parse_trace(""" - v1[i64|2] = vec_int_expand(255,2) - v2[i64|2] = vec_box(2) - v3[i64|2] = vec_int_pack(v2[i64|2], i1, 0, 1) - v4[i64|2] = vec_int_pack(v3[i64|2], i2, 1, 1) - v5[i64|2] = vec_int_and(v1[i64|2], v4[i64|2]) - i10 = vec_int_unpack(v5[i64|2], 0, 1) + v1[2xi64] = vec_int_expand(255,2) + v2[2xi64] = vec_box_i() + v3[2xi64] = vec_int_pack(v2[2xi64], i1, 0, 1) + v4[2xi64] = vec_int_pack(v3[2xi64], i2, 1, 1) + v5[2xi64] = vec_int_and(v1[2xi64], v4[2xi64]) + i10 = vec_int_unpack(v5[2xi64], 0, 1) i12 = uint_floordiv(i10,1) - i11 = vec_int_unpack(v5[i64|2], 1, 1) + i11 = vec_int_unpack(v5[2xi64], 1, 1) i13 = uint_floordiv(i11,1) - v6[i64|2] = vec_box(2) - v7[i64|2] = vec_int_pack(v6[i64|2], i12, 0, 1) - v8[i64|2] = vec_int_pack(v7[i64|2], i13, 1, 1) - v9[i64|2] = vec_int_and(v4[i64|2], v8[i64|2]) + v6[2xi64] = vec_box_i() + v7[2xi64] = vec_int_pack(v6[2xi64], i12, 0, 1) + v8[2xi64] = vec_int_pack(v7[2xi64], i13, 1, 1) + v9[2xi64] = vec_int_and(v4[2xi64], v8[i64]) """, False) self.assert_equal(loop2, loop3) diff --git a/rpython/jit/metainterp/optimizeopt/vector.py b/rpython/jit/metainterp/optimizeopt/vector.py --- a/rpython/jit/metainterp/optimizeopt/vector.py +++ b/rpython/jit/metainterp/optimizeopt/vector.py @@ -614,13 +614,9 @@ self.savings += benefit_factor * times - cost def cb_signext(self, pack): - op0 = pack.operations[0].getoperation() - size = op0.getarg(1).getint() - if pack.output_type is None: - return 1,0 - orig_size = pack.output_type.getsize() - if size == orig_size: - return 0,0 + left = pack.leftmost() + if left.cast_to_bytesize() == left.cast_from_bytesize(): + return 0, 0 # no benefit for this operation! needs many x86 instrs return 1,0 @@ -836,6 +832,8 @@ pack.split(newpacks, self.vec_reg_size) continue if load < Pack.FULL: + for op in pack.operations: + op.priority = -100 pack.clear() self.packs[i] = None continue diff --git a/rpython/jit/metainterp/resoperation.py b/rpython/jit/metainterp/resoperation.py --- a/rpython/jit/metainterp/resoperation.py +++ b/rpython/jit/metainterp/resoperation.py @@ -99,7 +99,7 @@ _attrs_ = ('datatype', 'bytesize', 'signed') datatype = '\x00' - bytesize = -1 + bytesize = -1 # -1 means the biggest size known to the machine signed = True def inittype(self): @@ -112,10 +112,17 @@ descr = self.getdescr() type = self.type if descr.is_array_of_floats() or descr.concrete_type == 'f': - type = FLOAT + type = 'f' self.bytesize = descr.get_item_size_in_bytes() self.sign = descr.is_item_signed() self.datatype = type + elif self.opnum == rop.INT_SIGNEXT: + arg0 = self.getarg(0) + arg1 = self.getarg(1) + self.setdatatype('i', arg1.value, arg0.signed) + elif self.is_typecast(): + ft,tt = self.cast_types() + self.setdatatype(tt, self.cast_to_bytesize(), tt == 'i') else: # pass through the type of the first input argument if self.numargs() == 0: @@ -123,7 +130,7 @@ arg0 = self.getarg(0) self.setdatatype(arg0.datatype, arg0.bytesize, arg0.signed) assert self.datatype != '\x00' - assert self.bytesize > 0 + #assert self.bytesize > 0 def setdatatype(self, data_type, bytesize, signed): self.datatype = data_type @@ -134,7 +141,7 @@ sign = '-' if not self.signed: sign = '+' - return 'Type(%s%s, %d)' % (sign, self.type, self.size) + return 'Type(%s%s, %d)' % (sign, self.type, self.bytesize) class AbstractResOpOrInputArg(AbstractValue, Typed): _attrs_ = ('_forwarded',) @@ -159,6 +166,7 @@ boolinverse = -1 vector = -1 # -1 means, no vector equivalent, -2 it is a vector statement casts = ('\x00', -1, '\x00', -1) + count = -1 def getopnum(self): return self.opnum @@ -409,15 +417,6 @@ def forget_value(self): pass - def casts_box(self): - return False - - def cast_to(self): - return ('\x00',-1) - - def cast_from(self): - return ('\x00',-1) - def is_label(self): return self.getopnum() == rop.LABEL @@ -430,6 +429,26 @@ def returns_vector(self): return self.type != 'v' and self.vector == -2 + def is_typecast(self): + return False + + def cast_types(self): + return self.casts[0], self.casts[2] + + def cast_to_bytesize(self): + return self.casts[1] + + def cast_from_bytesize(self): + return self.casts[3] + + def casts_up(self): + return self.cast_to_bytesize() > self.cast_from_bytesize() + + def casts_down(self): + # includes the cast as noop + return self.cast_to_bytesize() <= self.cast_from_bytesize() + + # =================== # Top of the hierachy # =================== @@ -598,7 +617,7 @@ class CastOp(object): _mixin_ = True - def casts_box(self): + def is_typecast(self): return True def cast_to(self): @@ -614,15 +633,40 @@ return (to_type,size) def cast_from(self): - return ('\x00',-1) + type, size, a, b = self.casts + if size == -1: + return self.bytesize + return (type, size) + +class SignExtOp(object): + _mixin_ = True + + def is_typecast(self): + return True + + def cast_types(self): + return self.casts[0], self.casts[2] + + def cast_to_bytesize(self): + from rpython.jit.metainterp.history import ConstInt + arg = self.getarg(1) + assert isinstance(arg, ConstInt) + return arg.value + + def cast_from_bytesize(self): + arg = self.getarg(0) + return arg.bytesize class VectorOp(object): _mixin_ = True - _attrs_ = ('count',) def repr_rpython(self): return repr_rpython(self, 'bv') + def vector_bytesize(self): + assert self.count > 0 + return self.byte_size * self.count + def same_shape(self, other): """ NOT_RPYTHON """ if not other.is_vector(): @@ -675,10 +719,12 @@ class InputArgInt(IntOp, AbstractInputArg): def __init__(self, intval=0): self.setint(intval) + self.datatype = 'i' class InputArgFloat(FloatOp, AbstractInputArg): def __init__(self, f=longlong.ZEROF): self.setfloatstorage(f) + self.datatype = 'f' @staticmethod def fromfloat(x): @@ -687,13 +733,14 @@ class InputArgRef(RefOp, AbstractInputArg): def __init__(self, r=lltype.nullptr(llmemory.GCREF.TO)): self.setref_base(r) + self.datatype = 'r' def reset_value(self): self.setref_base(lltype.nullptr(llmemory.GCREF.TO)) class InputArgVector(VectorOp, AbstractInputArg): - def __init__(self): - pass + def __init__(self, datatype): + self.datatype = datatype def returns_vector(self): return True @@ -947,11 +994,10 @@ 'VEC_CAST_INT_TO_FLOAT/1/f', '_VEC_CAST_LAST', - 'VEC_INT_BOX/1/i', + 'VEC_BOX/0/if', 'VEC_INT_UNPACK/3/i', # iX|fX = VEC_INT_UNPACK(vX, index, item_count) 'VEC_INT_PACK/4/i', # VEC_INT_PACK(vX, var/const, index, item_count) 'VEC_INT_EXPAND/2/i', # vX = VEC_INT_EXPAND(var/const, item_count) - 'VEC_FLOAT_BOX/1/f', 'VEC_FLOAT_UNPACK/3/f', # iX|fX = VEC_FLOAT_UNPACK(vX, index, item_count) 'VEC_FLOAT_PACK/4/f', # VEC_FLOAT_PACK(vX, var/const, index, item_count) 'VEC_FLOAT_EXPAND/2/f', # vX = VEC_FLOAT_EXPAND(var/const, item_count) @@ -1090,13 +1136,13 @@ ] _cast_ops = { - 'INT_SIGNEXT': ('i', 0, 'i', 0), 'CAST_FLOAT_TO_INT': ('f', 8, 'i', 4), 'CAST_INT_TO_FLOAT': ('i', 4, 'f', 8), 'CAST_FLOAT_TO_SINGLEFLOAT': ('f', 8, 'f', 4), 'CAST_SINGLEFLOAT_TO_FLOAT': ('f', 4, 'f', 8), - 'CAST_PTR_TO_INT': ('r', 0, 'i', 4), - 'CAST_INT_TO_PTR': ('i', 4, 'r', 0), + 'INT_SIGNEXT': ('i', 0, 'i', 0), + #'CAST_PTR_TO_INT': ('r', 0, 'i', 4), + #'CAST_INT_TO_PTR': ('i', 4, 'r', 0), } # ____________________________________________________________ @@ -1187,6 +1233,8 @@ else: baseclass = PlainResOp mixins = [arity2mixin.get(arity, N_aryOp)] + if name.startswith('VEC'): + mixins.append(VectorOp) if result_type == 'i': mixins.append(IntOp) elif result_type == 'f': @@ -1196,9 +1244,9 @@ else: assert result_type == 'n' if name in _cast_ops: + if name == "INT_SIGNEXT": + mixins.append(SignExtOp) mixins.append(CastOp) - if name.startswith('VEC'): - mixins.insert(1,VectorOp) cls_name = '%s_OP' % name bases = (get_base_class(tuple(mixins), baseclass),) diff --git a/rpython/jit/tool/oparser.py b/rpython/jit/tool/oparser.py --- a/rpython/jit/tool/oparser.py +++ b/rpython/jit/tool/oparser.py @@ -344,9 +344,19 @@ if res in self.vars: raise ParseError("Double assign to var %s in line: %s" % (res, line)) resop = self.create_op(opnum, args, res, descr, fail_args) + self.update_vector_count(resop, res) self.vars[res] = resop return resop + def update_vector_count(self, resop, var): + pattern = re.compile('.*\[(\d+)x(u?)(i|f)(\d+)\]') + match = pattern.match(var) + if match: + resop.count = int(match.group(1)) + resop.signed = not (match.group(2) == 'u') + resop.datatype = match.group(3) + resop.bytesize = int(match.group(4)) // 8 + def parse_op_no_result(self, line): opnum, args, descr, fail_args = self.parse_op(line) res = self.create_op(opnum, args, None, descr, fail_args) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit