Author: Richard Plangger <[email protected]>
Branch: vecopt
Changeset: r77173:2afc017c4737
Date: 2015-05-07 11:41 +0200
http://bitbucket.org/pypy/pypy/changeset/2afc017c4737/
Log: rewrote parts of the scheduling (a priority can now be assigned to
nodes); added the box_pack/unpack operations, used when an argument that
resides in a vector box must be extracted
diff --git a/pypy/module/micronumpy/iterators.py
b/pypy/module/micronumpy/iterators.py
--- a/pypy/module/micronumpy/iterators.py
+++ b/pypy/module/micronumpy/iterators.py
@@ -141,7 +141,9 @@
indices = state._indices
offset = state.offset
if self.contiguous:
- offset += self.array.dtype.elsize
+ elsize = self.array.dtype.elsize
+ jit.promote(elsize)
+ offset += elsize
elif self.ndim_m1 == 0:
offset += self.strides[0]
else:
diff --git a/pypy/module/micronumpy/loop.py b/pypy/module/micronumpy/loop.py
--- a/pypy/module/micronumpy/loop.py
+++ b/pypy/module/micronumpy/loop.py
@@ -16,7 +16,7 @@
call2_driver = jit.JitDriver(
name='numpy_call2',
greens=['shapelen', 'func', 'calc_dtype', 'res_dtype'],
- reds='auto', vectorize=True)
+ reds='auto')
def call2(space, shape, func, calc_dtype, res_dtype, w_lhs, w_rhs, out):
# handle array_priority
@@ -81,7 +81,7 @@
call1_driver = jit.JitDriver(
name='numpy_call1',
greens=['shapelen', 'func', 'calc_dtype', 'res_dtype'],
- reds='auto', vectorize=True)
+ reds='auto')
def call1(space, shape, func, calc_dtype, res_dtype, w_obj, out):
obj_iter, obj_state = w_obj.create_iter(shape)
diff --git a/rpython/jit/metainterp/executor.py
b/rpython/jit/metainterp/executor.py
--- a/rpython/jit/metainterp/executor.py
+++ b/rpython/jit/metainterp/executor.py
@@ -342,6 +342,8 @@
rop.LABEL,
rop.VEC_RAW_LOAD,
rop.VEC_RAW_STORE,
+ rop.VEC_BOX_PACK,
+ rop.VEC_BOX_UNPACK,
rop.VEC_GETARRAYITEM_RAW,
rop.VEC_SETARRAYITEM_RAW,
): # list of opcodes never executed by pyjitpl
diff --git a/rpython/jit/metainterp/history.py
b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -517,10 +517,10 @@
_attrs_ = ('item_type','byte_count','item_count','signed')
_extended_display = False
- def __init__(self, item_type=INT, byte_count=4, item_count=4, signed=True):
+ def __init__(self, item_type, item_count, bytecount, signed):
self.item_type = item_type
- self.byte_count = byte_count
self.item_count = item_count
+ self.byte_count = bytecount
self.signed = signed
def forget_value(self):
diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py
b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -61,6 +61,10 @@
i += 1
return True
+ def set_schedule_priority(self, p):
+ for node in self.path:
+ node.priority = p
+
def walk(self, node):
self.path.append(node)
@@ -80,6 +84,7 @@
self.pack = None
self.emitted = False
self.schedule_position = -1
+ self.priority = 0
def getoperation(self):
return self.op
@@ -115,7 +120,7 @@
# clone this operation object. if the vectorizer is
# not able to relax guards, it won't leave behind a modified operation
tgt_op = self.getoperation().clone()
- op = tgt_op
+ self.op = tgt_op
op = guard.getoperation()
assert isinstance(tgt_op, GuardResOp)
@@ -441,6 +446,8 @@
def __init__(self, loop):
self.loop = loop
self.nodes = [ Node(op,i) for i,op in enumerate(loop.operations) ]
+ self.invariant_vars = {}
+ self.update_invariant_vars()
self.memory_refs = {}
self.schedulable_nodes = []
self.index_vars = {}
@@ -451,6 +458,19 @@
def getnode(self, i):
return self.nodes[i]
+ def update_invariant_vars(self):
+ label_op = self.nodes[0].getoperation()
+ jump_op = self.nodes[-1].getoperation()
+ assert label_op.numargs() == jump_op.numargs()
+ for i in range(label_op.numargs()):
+ label_box = label_op.getarg(i)
+ jump_box = jump_op.getarg(i)
+ if label_box == jump_box:
+ self.invariant_vars[label_box] = None
+
+ def box_is_invariant(self, box):
+ return box in self.invariant_vars
+
def build_dependencies(self):
""" This is basically building the definition-use chain and saving this
information in a graph structure. This is the same as calculating
@@ -463,13 +483,19 @@
#
label_pos = 0
jump_pos = len(self.nodes)-1
- intformod = IntegralForwardModification(self.memory_refs,
self.index_vars, self.comparison_vars)
+ intformod = IntegralForwardModification(self.memory_refs,
self.index_vars,
+ self.comparison_vars,
self.invariant_vars)
# pass 1
for i,node in enumerate(self.nodes):
op = node.op
+ if op.is_always_pure():
+ node.priority = 1
+ if op.is_guard():
+ node.priority = 2
# the label operation defines all operations at the
# beginning of the loop
if op.getopnum() == rop.LABEL and i != jump_pos:
+ node.priority = 100
label_pos = i
for arg in op.getarglist():
tracker.define(arg, node)
@@ -504,7 +530,7 @@
for node in self.nodes:
if node != jump_node:
if node.depends_count() == 0:
- self.schedulable_nodes.append(node)
+ self.schedulable_nodes.insert(0, node)
# every leaf instruction points to the jump_op. in theory
every instruction
# points to jump_op. this forces the jump/finish op to be the
last operation
if node.provides_count() == 0:
@@ -665,52 +691,74 @@
def has_more(self):
return len(self.schedulable_nodes) > 0
- def next(self):
- return self.schedulable_nodes[0]
+ def next(self, position):
+ i = self._next(self.schedulable_nodes)
+ if i >= 0:
+ candidate = self.schedulable_nodes[i]
+ del self.schedulable_nodes[i]
+ return self.schedule(candidate, position)
- def schedulable(self, indices):
- for index in indices:
- if index not in self.schedulable_nodes:
- break
+ raise RuntimeError("schedule failed cannot continue")
+
+ def _next(self, candidate_list):
+ i = len(candidate_list)-1
+ while i >= 0:
+ candidate = candidate_list[i]
+ if candidate.emitted:
+ del candidate_list[i]
+ i -= 1
+ continue
+ if self.schedulable(candidate):
+ return i
+ i -= 1
+ return -1
+
+ def schedulable(self, candidate):
+ if candidate.pack:
+ for node in candidate.pack.operations:
+ if node.depends_count() > 0:
+ return False
+ return candidate.depends_count() == 0
+
+ def schedule(self, candidate, position):
+ if candidate.pack:
+ pack = candidate.pack
+ vops = self.sched_data.as_vector_operation(pack)
+ for node in pack.operations:
+ self.scheduled(node, position)
+ return vops
else:
- return True
- return False
+ self.scheduled(candidate, position)
+ return [candidate.getoperation()]
- def schedule_later(self, index):
- node = self.schedulable_nodes[index]
- del self.schedulable_nodes[index]
- self.schedulable_nodes.append(node)
-
- def schedule_all(self, opindices, position):
- while len(opindices) > 0:
- opidx = opindices.pop()
- for i,node in enumerate(self.schedulable_nodes):
- if node == opidx:
- self.schedule(i, position)
- break
-
- def schedule(self, index, position):
- node = self.schedulable_nodes[index]
- node.schedule_position = position
- del self.schedulable_nodes[index]
- to_del = []
+ def scheduled(self, node, position):
+ node.position = position
for dep in node.provides()[:]: # COPY
to = dep.to
node.remove_edge_to(to)
if not to.emitted and to.depends_count() == 0:
- if to.pack:
- self.schedulable_nodes.append(to)
+ # sorts them by priority
+ nodes = self.schedulable_nodes
+ i = len(nodes)-1
+ while i >= 0:
+ itnode = nodes[i]
+ if itnode.priority < to.priority:
+ nodes.insert(i+1, to)
+ break
+ i -= 1
else:
- self.schedulable_nodes.insert(0, to)
+ nodes.insert(0, to)
node.clear_dependencies()
node.emitted = True
+
class IntegralForwardModification(object):
""" Calculates integral modifications on an integer box. """
- def __init__(self, memory_refs, index_vars, comparison_vars):
+ def __init__(self, memory_refs, index_vars, comparison_vars,
invariant_vars):
self.index_vars = index_vars
self.comparison_vars = comparison_vars
self.memory_refs = memory_refs
+ self.invariant_vars = invariant_vars
def is_const_integral(self, box):
if isinstance(box, ConstInt):
@@ -727,12 +775,8 @@
def operation_{name}(self, op, node):
box_a0 = op.getarg(0)
box_a1 = op.getarg(1)
- left = None
- right = None
- if not self.is_const_integral(box_a0):
- left = self.get_or_create(box_a0)
- if not self.is_const_integral(box_a1):
- right = self.get_or_create(box_a1)
+ left = self.index_vars.get(box_a0, None)
+ right = self.index_vars.get(box_a1, None)
box_r = op.result
self.comparison_vars[box_r] = CompareOperation(op.getopnum(), left,
right)
"""
@@ -770,6 +814,34 @@
.format(name='INT_SUB', op='-')).compile()
del additive_func_source
+ #def operation_INT_ADD(self, op, node):
+ # box_r = op.result
+ # if not box_r:
+ # return
+ # box_a0 = op.getarg(0)
+ # box_a1 = op.getarg(1)
+ # if self.is_const_integral(box_a0) and self.is_const_integral(box_a1):
+ # idx_ref = IndexVar(box_r)
+ # idx_ref.constant = box_a0.getint() + box_a1.getint()
+ # self.index_vars[box_r] = idx_ref
+ # elif self.is_const_integral(box_a0):
+ # idx_ref = self.get_or_create(box_a1)
+ # idx_ref = idx_ref.clone()
+ # idx_ref.constant {op}= box_a0.getint()
+ # self.index_vars[box_r] = idx_ref
+ # elif self.is_const_integral(box_a1):
+ # idx_ref = self.get_or_create(box_a0)
+ # idx_ref = idx_ref.clone()
+ # idx_ref.add_const(box_a1.getint())
+ # self.index_vars[box_r] = idx_ref
+ # else:
+ # # both variables are boxes
+ # if box_a1 in self.invariant_vars:
+ # idx_var = self.get_or_create(box_a0)
+ # idx_var = idx_var.clone()
+ # idx_var.set_next_nonconst_mod(BoxedIndexVar(box_a1,
op.getopnum(), box_a0))
+ # self.index_vars[box_r] = idx_var
+
multiplicative_func_source = """
def operation_{name}(self, op, node):
box_r = op.result
@@ -847,10 +919,41 @@
self.coefficient_mul = 1
self.coefficient_div = 1
self.constant = 0
+ # saves the next modification that uses a variable
+ self.next_nonconst = None
+ self.current_end = None
+ self.opnum = 0
+
+ def stride_const(self):
+ return self.next_nonconst is None
+
+ def add_const(self, number):
+ if self.current_end is None:
+ self.constant += number
+ else:
+ self.current_end.constant += number
+
+ def set_next_nonconst_mod(self, idxvar):
+ if self.current_end is None:
+ self.next_nonconst = idxvar
+ else:
+ self.current_end.next_nonconst = idxvar
+ self.current_end = idxvar
+
+ def is_adjacent_with_runtime_check(self, other, graph):
+ return self.next_nonconst is not None and \
+ self.next_nonconst is self.current_end and \
+ self.next_nonconst.opnum == rop.INT_ADD and \
+ self.next_nonconst.is_identity()
def getvariable(self):
return self.var
+ def is_identity(self):
+ return self.coefficient_mul == 1 and \
+ self.coefficient_div == 1 and \
+ self.constant == 0
+
def __eq__(self, other):
if self.same_variable(other):
return self.diff(other) == 0
@@ -883,8 +986,12 @@
return mycoeff + self.constant - (othercoeff + other.constant)
def __repr__(self):
- return 'IndexVar(%s*(%s/%s)+%s)' % (self.var, self.coefficient_mul,
- self.coefficient_div,
self.constant)
+ if self.is_identity():
+ return 'IndexVar(%s+%s)' % (self.var, repr(self.next_nonconst))
+
+ return 'IndexVar((%s*(%s/%s)+%s) + %s)' % (self.var,
self.coefficient_mul,
+ self.coefficient_div,
self.constant,
+ repr(self.next_nonconst))
def adapt_operation(self, op):
# TODO
@@ -923,6 +1030,15 @@
return abs(self.index_var.diff(other.index_var)) - stride == 0
return False
+ def is_adjacent_with_runtime_check(self, other, graph):
+ """there are many cases where the stride is variable
+ it is a priori not known if two unrolled memory accesses are
+ tightly packed"""
+ assert isinstance(other, MemoryRef)
+ if self.array == other.array and self.descr == other.descr:
+ return
self.index_var.is_adjacent_with_runtime_check(other.index_var, graph)
+ return False
+
def match(self, other):
assert isinstance(other, MemoryRef)
if self.array == other.array and self.descr == other.descr:
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_dependency.py
@@ -273,7 +273,7 @@
i4 = call(i5, i3, descr=nonwritedescr) # 2: 3,4,5?
guard_no_exception() [i2] # 3: 4,5?
p2 = getarrayitem_gc(p1,i3,descr=chararraydescr) # 4: 5
- jump(p2, p1, i3) # 5:
+ jump(p2, p1, i3, i5) # 5:
"""
self.assert_dependencies(ops, full_check=True)
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -61,8 +61,10 @@
if opt.dependency_graph is not None:
self._write_dot_and_convert_to_svg(opt.dependency_graph, "ee" +
self.test_name)
opt.schedule()
+ self.debug_print_operations(opt.loop)
opt.unroll_loop_iterations(loop, unroll_factor)
opt.loop.operations = opt.get_newoperations()
+ self.debug_print_operations(opt.loop)
opt.clear_newoperations()
opt.build_dependency_graph()
self.last_graph = opt.dependency_graph
@@ -1031,6 +1033,7 @@
pass
def test_constant_expansion(self):
+ py.test.skip()
ops = """
[p0,i0]
guard_early_exit() [p0,i0]
@@ -1053,6 +1056,74 @@
vopt = self.vectorize(self.parse_loop(ops),3)
self.assert_equal(vopt.loop, self.parse_loop(opt))
+ def test_guard_invalidate(self):
+ py.test.skip()
+ ops = """
+ [p52, i51, p4, i56, p1, f15, p19, i17, p8, i23, p34, i32, i38, f30,
p7, i14, i22, i29, i37, i45, i55, i57]
+ debug_merge_point(0, 0, '(numpy_call2: no get_printable_location)')
+ guard_early_exit() [p8, p7, p4, p1, i51, p34, f15, i56, i32, p52, i23,
i38, p19, i17, f30]
+ f59 = raw_load(i14, i23, descr=floatarraydescr)
+ guard_not_invalidated() [p8, p7, p4, p1, f59, i51, p34, None, i56,
i32, p52, i23, i38, p19, i17, f30]
+ i60 = int_add(i23, i22)
+ f61 = raw_load(i29, i38, descr=floatarraydescr)
+ i62 = int_add(i38, i37)
+ f63 = float_add(f59, f61)
+ raw_store(i45, i56, f63, descr=floatarraydescr)
+ i64 = int_add(i51, 1)
+ i65 = int_add(i56, i55)
+ i66 = int_ge(i64, i57)
+ guard_false(i66) [p8, p7, p4, p1, f59, i62, i60, i65, i64, f61, None,
p34, None, None, i32, p52, None, None, p19, i17, None]
+ debug_merge_point(0, 0, '(numpy_call2: no get_printable_location)')
+ jump(p52, i64, p4, i65, p1, f59, p19, i17, p8, i60, p34, i32, i62,
f61, p7, i14, i22, i29, i37, i45, i55, i57)
+ """
+ vopt = self.vectorize(self.parse_loop(ops))
+ self.debug_print_operations(vopt.loop)
+
+ def test_element_f45_in_guard_failargs(self):
+ ops = """
+ [p36, i28, p9, i37, p14, f34, p12, p38, f35, p39, i40, i41, p42, i43,
i44, i21, i4, i0, i18]
+ guard_early_exit() [p38, p12, p9, p14, p39, i37, i44, f35, i40, p42,
i43, f34, i28, p36, i41]
+ f45 = raw_load(i21, i44, descr=floatarraydescr)
+ guard_not_invalidated() [p38, p12, p9, p14, f45, p39, i37, i44, f35,
i40, p42, i43, None, i28, p36, i41]
+ i46 = int_add(i44, 8)
+ f47 = raw_load(i4, i41, descr=floatarraydescr)
+ i48 = int_add(i41, 8)
+ f49 = float_add(f45, f47)
+ raw_store(i0, i37, f49, descr=floatarraydescr)
+ i50 = int_add(i28, 1)
+ i51 = int_add(i37, 8)
+ i52 = int_ge(i50, i18)
+ guard_false(i52) [p38, p12, p9, p14, i48, i46, f47, i51, i50, f45,
p39, None, None, None, i40, p42, i43, None, None, p36, None]
+ jump(p36, i50, p9, i51, p14, f45, p12, p38, f47, p39, i40, i48, p42,
i43, i46, i21, i4, i0, i18)
+ """
+ opt = """
+ [p36, i28, p9, i37, p14, f34, p12, p38, f35, p39, i40, i41, p42, i43,
i44, i21, i4, i0, i18]
+ guard_not_invalidated() [p38, p12, p9, p14, p39, i37, i44, f35, i40,
p42, i43, f34, i28, p36, i41]
+ guard_early_exit() [p38, p12, p9, p14, p39, i37, i44, f35, i40, p42,
i43, f34, i28, p36, i41]
+ i50 = int_add(i28, 1)
+ i48 = int_add(i41, 8)
+ i46 = int_add(i44, 8)
+ i51 = int_add(i37, 8)
+ i52 = int_ge(i50, i18)
+ i54 = int_add(i41, 16)
+ i55 = int_add(i44, 16)
+ i56 = int_add(i37, 16)
+ i53 = int_add(i28, 2)
+ i57 = int_ge(i53, i18)
+ guard_false(i57) [p38, p12, p9, p14, p39, i37, i44, f35, i40, p42,
i43, f34, i28, p36, i41]
+ v61 = vec_raw_load(i21, i44, 2, descr=floatarraydescr)
+ v62 = vec_raw_load(i4, i41, 2, descr=floatarraydescr)
+ v63 = vec_float_add(v61, v62, 2)
+ vec_raw_store(i0, i37, v63, 2, descr=floatarraydescr)
+ f100 = vec_box_unpack(v61, 1)
+ f101 = vec_box_unpack(v62, 1)
+ jump(p36, i53, p9, i56, p14, f100, p12, p38, f101, p39, i40, i54, p42,
i43, i55, i21, i4, i0, i18)
+ """
+ vopt = self.vectorize(self.parse_loop(ops))
+ self.debug_print_operations(vopt.loop)
+ self.assert_equal(vopt.loop, self.parse_loop(opt))
+
+
class TestLLtype(BaseTestVectorize, LLtypeMixin):
pass
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py
b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -48,8 +48,10 @@
inline_short_preamble, start_state, False)
orig_ops = loop.operations
try:
+ debug_print_operations(loop)
opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop,
optimizations)
opt.propagate_all_forward()
+ debug_print_operations(loop)
except NotAVectorizeableLoop:
loop.operations = orig_ops
# vectorization is not possible, propagate only normal optimizations
@@ -65,6 +67,8 @@
self.unroll_count = 0
self.smallest_type_bytes = 0
self.early_exit_idx = -1
+ self.sched_data = None
+ self.tried_to_pack = False
def propagate_all_forward(self, clear=True):
self.clear_newoperations()
@@ -88,7 +92,6 @@
if self.dependency_graph is not None:
self.schedule() # reorder the trace
-
# unroll
self.unroll_count = self.get_unroll_count(vsize)
self.unroll_loop_iterations(self.loop, self.unroll_count)
@@ -105,8 +108,7 @@
self.collapse_index_guards()
def emit_operation(self, op):
- if op.getopnum() == rop.GUARD_EARLY_EXIT or \
- op.getopnum() == rop.DEBUG_MERGE_POINT:
+ if op.getopnum() == rop.DEBUG_MERGE_POINT:
return
self._last_emitted_op = op
self._newoperations.append(op)
@@ -138,19 +140,24 @@
assert jump_op.is_final()
self.emit_unrolled_operation(label_op)
- #guard_ee_op = ResOperation(rop.GUARD_EARLY_EXIT, [], None,
ResumeAtLoopHeaderDescr())
- #guard_ee_op.rd_snapshot = Snapshot(None, loop.inputargs[:])
- #self.emit_unrolled_operation(guard_ee_op)
+ oi = 0
+ pure = True
operations = []
- start_index = 1
+ ee_pos = -1
+ ee_guard = None
for i in range(1,op_count-1):
op = loop.operations[i].clone()
- if loop.operations[i].getopnum() == rop.GUARD_EARLY_EXIT:
- continue
+ opnum = op.getopnum()
+ if opnum == rop.GUARD_EARLY_EXIT:
+ ee_pos = i
+ ee_guard = op
operations.append(op)
self.emit_unrolled_operation(op)
+ prohibit_opnums = (rop.GUARD_FUTURE_CONDITION, rop.GUARD_EARLY_EXIT,
+ rop.GUARD_NOT_INVALIDATED)
+
orig_jump_args = jump_op.getarglist()[:]
# it is assumed that #label_args == #jump_args
label_arg_count = len(orig_jump_args)
@@ -165,13 +172,9 @@
if la != ja:
rename_map[la] = ja
#
- emitted_ee = False
- for op in operations:
- if op.getopnum() == rop.GUARD_FUTURE_CONDITION:
+ for oi, op in enumerate(operations):
+ if op.getopnum() in prohibit_opnums:
continue # do not unroll this operation twice
- if op.getopnum() == rop.GUARD_EARLY_EXIT:
- emitted_ee = True
- pass # do not unroll this operation twice
copied_op = op.clone()
if copied_op.result is not None:
# every result assigns a new box, thus creates an entry
@@ -190,21 +193,14 @@
# not only the arguments, but also the fail args need
# to be adjusted. rd_snapshot stores the live variables
# that are needed to resume.
- if copied_op.is_guard() and emitted_ee:
+ if copied_op.is_guard():
assert isinstance(copied_op, GuardResOp)
- snapshot = self.clone_snapshot(copied_op.rd_snapshot,
rename_map)
- copied_op.rd_snapshot = snapshot
- if not we_are_translated():
- # ensure that in a test case the renaming is correct
- if copied_op.getfailargs():
- args = copied_op.getfailargs()[:]
- for i,arg in enumerate(args):
- try:
- value = rename_map[arg]
- args[i] = value
- except KeyError:
- pass
- copied_op.setfailargs(args)
+ target_guard = copied_op
+ if oi < ee_pos:
+ #self.clone_failargs(copied_op, ee_guard, rename_map)
+ pass
+ else:
+ self.clone_failargs(copied_op, copied_op, rename_map)
#
self.emit_unrolled_operation(copied_op)
@@ -221,6 +217,19 @@
self.emit_unrolled_operation(jump_op)
+ def clone_failargs(self, guard, target_guard, rename_map):
+ snapshot = self.clone_snapshot(target_guard.rd_snapshot, rename_map)
+ guard.rd_snapshot = snapshot
+ if guard.getfailargs():
+ args = target_guard.getfailargs()[:]
+ for i,arg in enumerate(args):
+ try:
+ value = rename_map[arg]
+ args[i] = value
+ except KeyError:
+ pass
+ guard.setfailargs(args)
+
def clone_snapshot(self, snapshot, rename_map):
# snapshots are nested like the MIFrames
if snapshot is None:
@@ -273,13 +282,18 @@
loop = self.loop
operations = loop.operations
+ self.tried_to_pack = True
+
self.packset = PackSet(self.dependency_graph, operations,
self.unroll_count,
self.smallest_type_bytes)
- memory_refs = self.dependency_graph.memory_refs.items()
+ graph = self.dependency_graph
+ memory_refs = graph.memory_refs.items()
# initialize the pack set
for node_a,memref_a in memory_refs:
for node_b,memref_b in memory_refs:
+ if memref_a is memref_b:
+ continue
# instead of compare every possible combination and
# exclue a_opidx == b_opidx only consider the ones
# that point forward:
@@ -287,6 +301,10 @@
if memref_a.is_adjacent_to(memref_b):
if self.packset.can_be_packed(node_a, node_b):
self.packset.add_pair(node_a, node_b)
+ #if memref_a.is_adjacent_with_runtime_check(memref_b,
graph):
+ # if self.packset.can_be_packed(node_a, node_b):
+ # self.check_adjacent_at_runtime(memref_a, memref_b)
+ # self.packset.add_pair(node_a, node_b)
def extend_packset(self):
pack_count = self.packset.pack_count()
@@ -359,22 +377,15 @@
def schedule(self):
self.guard_early_exit = -1
self.clear_newoperations()
- scheduler = Scheduler(self.dependency_graph, VecScheduleData())
+ sched_data = VecScheduleData()
+ scheduler = Scheduler(self.dependency_graph, sched_data)
while scheduler.has_more():
- candidate = scheduler.next()
- if candidate.pack:
- pack = candidate.pack
- if scheduler.schedulable(pack.operations):
- vop = scheduler.sched_data.as_vector_operation(pack)
- position = len(self._newoperations)
- self.emit_operation(vop)
- scheduler.schedule_all(pack.operations, position)
- else:
- scheduler.schedule_later(0)
- else:
- position = len(self._newoperations)
- self.emit_operation(candidate.getoperation())
- scheduler.schedule(0, position)
+ position = len(self._newoperations)
+ ops = scheduler.next(position)
+ for op in ops:
+ if self.tried_to_pack:
+ self.unpack_from_vector(op, sched_data)
+ self.emit_operation(op)
if not we_are_translated():
for node in self.dependency_graph.nodes:
@@ -382,6 +393,14 @@
self.loop.operations = self._newoperations[:]
self.clear_newoperations()
+ def unpack_from_vector(self, op, sched_data):
+ box_to_vbox = sched_data.box_to_vbox
+ for i, arg in enumerate(op.getarglist()):
+ (i, vbox) = box_to_vbox.get(arg, (-1, None))
+ if vbox:
+ unpack_op = ResOperation(rop.VEC_BOX_UNPACK, [vbox,
ConstInt(i)], arg)
+ self.emit_operation(unpack_op)
+
def analyse_index_calculations(self):
if len(self.loop.operations) <= 1 or self.early_exit_idx == -1:
return
@@ -407,6 +426,7 @@
else:
if path.has_no_side_effects(exclude_first=True,
exclude_last=True):
#index_guards[guard.getindex()] = IndexGuard(guard,
path.path[:])
+ path.set_schedule_priority(10)
pullup.append(path.last_but_one())
last_prev_node = prev_node
for a,b in del_deps:
@@ -468,6 +488,15 @@
self.loop.operations = self._newoperations[:]
+ def check_adjacent_at_runtime(self, mem_a, mem_b):
+ ivar_a = mem_a.index_var
+ ivar_b = mem_b.index_var
+ if ivar_a.mods:
+ print "guard(", ivar_a.mods[1], " is adjacent)"
+ if ivar_b.mods:
+ print "guard(", ivar_b.mods[1], " is adjacent)"
+ pass
+
def must_unpack_result_to_exec(op, target_op):
# TODO either move to resop or util
if op.getoperation().vector != -1:
@@ -516,7 +545,7 @@
args.append(ConstInt(op_count))
vop = ResOperation(op0.vector, args, op0.result, op0.getdescr())
self._inspect_operation(vop)
- return vop
+ return [vop]
def get_vbox_for(self, arg):
try:
@@ -527,17 +556,17 @@
# be emitted
assert False, "vector box MUST be defined before"
- def vector_result(self, vop):
+ def vector_result(self, vop, bytecount, signed):
ops = self.pack.operations
op0 = ops[0].getoperation()
result = op0.result
- vbox = BoxVector(result.type, 4, 0, True)
+ vboxcount = len(ops)
+ vbox = BoxVector(result.type, vboxcount, bytecount, signed)
vop.result = vbox
i = 0
- vboxcount = vbox.item_count = len(ops)
while i < vboxcount:
op = ops[i].getoperation()
- self.box_to_vbox[result] = (i, vbox)
+ self.box_to_vbox[op.result] = (i, vbox)
i += 1
def vector_arg(self, vop, argidx):
@@ -545,12 +574,13 @@
op0 = ops[0].getoperation()
vbox = self.get_vbox_for(op0.getarg(argidx))
vop.setarg(argidx, vbox)
+ return vbox
bin_arith_trans = """
def _vectorize_{name}(self, vop):
- self.vector_arg(vop, 0)
+ vbox = self.vector_arg(vop, 0)
self.vector_arg(vop, 1)
- self.vector_result(vop)
+ self.vector_result(vop, vbox.byte_count, vbox.signed)
"""
exec py.code.Source(bin_arith_trans.format(name='VEC_INT_ADD')).compile()
exec py.code.Source(bin_arith_trans.format(name='VEC_INT_MUL')).compile()
@@ -561,14 +591,20 @@
del bin_arith_trans
def _vectorize_VEC_INT_SIGNEXT(self, vop):
- self.vector_arg(vop, 0)
+ vbox = self.vector_arg(vop, 0)
# arg 1 is a constant
- self.vector_result(vop)
+ self.vector_result(vop, vbox.byte_count, vbox.signed)
def _vectorize_VEC_RAW_LOAD(self, vop):
- self.vector_result(vop)
+ descr = vop.getdescr()
+ byte_count = descr.get_item_size_in_bytes()
+ signed = descr.is_item_signed()
+ self.vector_result(vop, byte_count, signed)
def _vectorize_VEC_GETARRAYITEM_RAW(self, vop):
- self.vector_result(vop)
+ descr = vop.getdescr()
+ byte_count = descr.get_item_size_in_bytes()
+ signed = descr.is_item_signed()
+ self.vector_result(vop, byte_count, signed)
def _vectorize_VEC_RAW_STORE(self, vop):
self.vector_arg(vop, 2)
diff --git a/rpython/jit/metainterp/resoperation.py
b/rpython/jit/metainterp/resoperation.py
--- a/rpython/jit/metainterp/resoperation.py
+++ b/rpython/jit/metainterp/resoperation.py
@@ -460,6 +460,8 @@
'VEC_FLOAT_MUL/3',
'VEC_INT_SIGNEXT/3',
'_VEC_ARITHMETIC_LAST',
+ 'VEC_BOX_UNPACK/2',
+ 'VEC_BOX_PACK/3',
#
'INT_LT/2b',
'INT_LE/2b',
diff --git a/rpython/jit/tool/oparser.py b/rpython/jit/tool/oparser.py
--- a/rpython/jit/tool/oparser.py
+++ b/rpython/jit/tool/oparser.py
@@ -121,7 +121,7 @@
box = ts.BoxRef()
_box_counter_more_than(self.model, elem[1:])
elif elem.startswith('v'):
- box = self.model.BoxVector()
+ box = self.model.BoxVector('f', 8, 2, True)
_box_counter_more_than(self.model, elem[1:])
else:
for prefix, boxclass in self.boxkinds.iteritems():
diff --git a/rpython/rlib/jit.py b/rpython/rlib/jit.py
--- a/rpython/rlib/jit.py
+++ b/rpython/rlib/jit.py
@@ -590,7 +590,7 @@
get_jitcell_at=None, set_jitcell_at=None,
get_printable_location=None, confirm_enter_jit=None,
can_never_inline=None, should_unroll_one_iteration=None,
- name='jitdriver', check_untranslated=True, vectorize=False,
+ name='jitdriver', check_untranslated=True, vectorize=True,
get_unique_id=None):
if greens is not None:
self.greens = greens
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit