Author: Richard Plangger <[email protected]>
Branch: vecopt2
Changeset: r77126:c7cbb61784d6
Date: 2015-04-29 15:11 +0200
http://bitbucket.org/pypy/pypy/changeset/c7cbb61784d6/
Log: vectorization now uses the preamble of the unrolling optimization;
this is a step towards a unified unrolling algorithm (and keeps most
of the variables in registers). Also includes some test changes that
were needed after the small trace_optimize refactoring.
diff --git a/rpython/jit/backend/llgraph/runner.py
b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -242,6 +242,8 @@
translate_support_code = False
is_llgraph = True
+ vector_register_size = 16
+
def __init__(self, rtyper, stats=None, *ignored_args, **kwds):
model.AbstractCPU.__init__(self)
self.rtyper = rtyper
diff --git a/rpython/jit/backend/x86/detect_sse2.py
b/rpython/jit/backend/x86/detect_sse2.py
--- a/rpython/jit/backend/x86/detect_sse2.py
+++ b/rpython/jit/backend/x86/detect_sse2.py
@@ -2,35 +2,42 @@
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.rlib.rmmap import alloc, free
-
-def detect_sse2():
+def cpu_info(instr):
data = alloc(4096)
pos = 0
- for c in ("\xB8\x01\x00\x00\x00" # MOV EAX, 1
- "\x53" # PUSH EBX
- "\x0F\xA2" # CPUID
- "\x5B" # POP EBX
- "\x92" # XCHG EAX, EDX
- "\xC3"): # RET
+ for c in instr:
data[pos] = c
pos += 1
fnptr = rffi.cast(lltype.Ptr(lltype.FuncType([], lltype.Signed)), data)
code = fnptr()
free(data, 4096)
+ return code
+
+def detect_sse2():
+ code = cpu_info("\xB8\x01\x00\x00\x00" # MOV EAX, 1
+ "\x53" # PUSH EBX
+ "\x0F\xA2" # CPUID
+ "\x5B" # POP EBX
+ "\x92" # XCHG EAX, EDX
+ "\xC3" # RET
+ )
return bool(code & (1<<25)) and bool(code & (1<<26))
+def byte_size_for_vector_registers(sse2, avx, avxbw):
+ if avx:
+ if avxbw:
+ return 64
+ return 32
+ if sse2:
+ return 16
+ assert False, "No vector extention supported"
+
def detect_x32_mode():
- data = alloc(4096)
- pos = 0 # 32-bit 64-bit / x32
- for c in ("\x48" # DEC EAX
- "\xB8\xC8\x00\x00\x00"# MOV EAX, 200 MOV RAX,
0x40404040000000C8
- "\x40\x40\x40\x40" # 4x INC EAX
- "\xC3"): # RET RET
- data[pos] = c
- pos += 1
- fnptr = rffi.cast(lltype.Ptr(lltype.FuncType([], lltype.Signed)), data)
- code = fnptr()
- free(data, 4096)
+ # 32-bit 64-bit / x32
+ code = cpu_info("\x48" # DEC EAX
+ "\xB8\xC8\x00\x00\x00"# MOV EAX, 200 MOV RAX, 0x40404040000000C8
+ "\x40\x40\x40\x40" # 4x INC EAX
+ "\xC3") # RET RET
assert code in (200, 204, 0x40404040000000C8)
return code == 200
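A note on the feature test above: the CPUID stub swaps EAX and EDX before
returning, so the returned code is the EDX feature word of CPUID leaf 1,
where bit 25 is SSE and bit 26 is SSE2. A minimal pure-Python sketch of
the same decision logic, with the helper name made up for illustration:

    # Sketch only: decode the feature bits exactly as detect_sse2() does.
    def has_sse2(edx_leaf1):
        # EDX bit 25 = SSE, bit 26 = SSE2
        return bool(edx_leaf1 & (1 << 25)) and bool(edx_leaf1 & (1 << 26))

    # byte_size_for_vector_registers() then picks the widest register
    # file available: 16 bytes (XMM) for SSE2, 32 (YMM) for AVX, 64
    # (ZMM) when the avxbw flag (presumably AVX-512BW) is set.
    assert has_sse2((1 << 25) | (1 << 26))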
diff --git a/rpython/jit/backend/x86/regalloc.py
b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -1301,6 +1301,7 @@
else:
src_locations2.append(src_loc)
dst_locations2.append(dst_loc)
+
# Do we have a temp var?
if IS_X86_64:
tmpreg = X86_64_SCRATCH_REG
@@ -1466,7 +1467,7 @@
not descr.is_array_of_structs()
itemsize, ofs, _ = unpack_arraydescr(descr)
integer = not descr.is_array_of_floats()
- aligned = True
+ aligned = False
args = op.getarglist()
base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
@@ -1487,7 +1488,7 @@
ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
integer = not descr.is_array_of_floats()
- aligned = True
+ aligned = False
self.perform_discard(op, [base_loc, ofs_loc, value_loc,
imm(itemsize), imm(ofs), imm(integer),
imm(aligned)])
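The two aligned = False switches above are deliberate conservatism: on x86
the aligned 16-byte SSE move (MOVAPS) faults when the effective address is
not 16-byte aligned, while the unaligned form (MOVUPS) accepts any address.
Since the allocator cannot prove alignment of these raw array accesses,
claiming alignment would risk a runtime crash. A small sketch of the
underlying rule (helper name hypothetical):

    # Sketch: an address is safe for the aligned vector move only if it
    # is a multiple of the vector register size.
    def safe_for_aligned_move(addr, vec_reg_bytes=16):
        return addr % vec_reg_bytes == 0

    assert safe_for_aligned_move(0x1000)        # 16-byte aligned
    assert not safe_for_aligned_move(0x1004)    # would need MOVUPS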
diff --git a/rpython/jit/backend/x86/runner.py
b/rpython/jit/backend/x86/runner.py
--- a/rpython/jit/backend/x86/runner.py
+++ b/rpython/jit/backend/x86/runner.py
@@ -24,6 +24,8 @@
with_threads = False
frame_reg = regloc.ebp
+ vector_register_size = 0 # in bytes
+
from rpython.jit.backend.x86.arch import JITFRAME_FIXED_SIZE
all_reg_indexes = gpr_reg_mgr_cls.all_reg_indexes
gen_regs = gpr_reg_mgr_cls.all_regs
@@ -148,6 +150,8 @@
IS_64_BIT = False
+ vector_register_size = 16
+
def __init__(self, *args, **kwargs):
assert sys.maxint == (2**31 - 1)
super(CPU386, self).__init__(*args, **kwargs)
@@ -163,4 +167,6 @@
IS_64_BIT = True
+ vector_register_size = 16
+
CPU = CPU386
diff --git a/rpython/jit/backend/x86/test/test_vectorize.py
b/rpython/jit/backend/x86/test/test_vectorize.py
--- a/rpython/jit/backend/x86/test/test_vectorize.py
+++ b/rpython/jit/backend/x86/test/test_vectorize.py
@@ -11,9 +11,11 @@
from rpython.rtyper.lltypesystem import lltype
-class TestBasic(test_basic.Jit386Mixin, test_vectorize.VectorizeLLtypeTests):
+class TestBasic(test_vectorize.VectorizeLLtypeTests, test_basic.Jit386Mixin):
# for the individual tests see
# ====> ../../../metainterp/test/test_basic.py
+ enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll'
+
pass
diff --git a/rpython/jit/metainterp/optimizeopt/__init__.py
b/rpython/jit/metainterp/optimizeopt/__init__.py
--- a/rpython/jit/metainterp/optimizeopt/__init__.py
+++ b/rpython/jit/metainterp/optimizeopt/__init__.py
@@ -67,13 +67,13 @@
loop.logops = metainterp_sd.logger_noopt.log_loop(loop.inputargs,
loop.operations)
optimizations, unroll = build_opt_chain(metainterp_sd, enable_opts)
-
- if jitdriver_sd.vectorize:
- optimize_vector(metainterp_sd, jitdriver_sd, loop, optimizations)
- elif unroll:
- return optimize_unroll(metainterp_sd, jitdriver_sd, loop,
- optimizations, inline_short_preamble,
- start_state, export_state)
+ if unroll:
+ if not export_state and warmstate.vectorize and jitdriver_sd.vectorize:
+ optimize_vector(metainterp_sd, jitdriver_sd, loop, optimizations)
+ else:
+ return optimize_unroll(metainterp_sd, jitdriver_sd, loop,
+ optimizations, inline_short_preamble,
+ start_state, export_state)
else:
optimizer = Optimizer(metainterp_sd, jitdriver_sd, loop,
optimizations)
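The effect of this hunk: vectorization is now a sub-case of unrolling
rather than a separate branch. The unrolling machinery calls the optimizer
twice, and gating on export_state being false means the vectorizer only
takes over on the second call, after the preamble exists, which is what
the log message refers to. A condensed sketch of the new control flow
(the two-pass protocol is inferred from the log message, not verbatim
source):

    # pass 1: export_state=True  -> optimize_unroll builds the preamble
    # pass 2: export_state=False -> the vectorizer takes the peeled loop,
    #         but only if both the global and per-driver switches agree
    if unroll:
        if not export_state and warmstate.vectorize and jitdriver_sd.vectorize:
            optimize_vector(metainterp_sd, jitdriver_sd, loop, optimizations)
        else:
            return optimize_unroll(metainterp_sd, jitdriver_sd, loop,
                                   optimizations, inline_short_preamble,
                                   start_state, export_state)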
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_vectorize.py
@@ -12,7 +12,7 @@
from rpython.jit.metainterp.optimizeopt.dependency import DependencyGraph
from rpython.jit.metainterp.optimizeopt.unroll import Inliner
from rpython.jit.metainterp.optimizeopt.vectorize import (VectorizingOptimizer, MemoryRef,
- isomorphic, Pair, NotAVectorizeableLoop)
+ isomorphic, Pair, NotAVectorizeableLoop)
from rpython.jit.metainterp.optimize import InvalidLoop
from rpython.jit.metainterp.history import ConstInt, BoxInt, get_const_ptr_for_string
from rpython.jit.metainterp import executor, compile, resume
@@ -22,6 +22,8 @@
class FakeJitDriverStaticData(object):
vectorize=True
+ARCH_VEC_REG_SIZE = 16
+
class VecTestHelper(DependencyBaseTest):
enable_opts = "intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unfold"
@@ -54,7 +56,7 @@
if unroll_factor == -1 and opt.smallest_type_bytes == 0:
raise NotAVectorizeableLoop()
if unroll_factor == -1:
- unroll_factor = opt.get_unroll_count()
+ unroll_factor = opt.get_unroll_count(ARCH_VEC_REG_SIZE)
opt.unroll_loop_iterations(loop, unroll_factor)
opt.loop.operations = opt.get_newoperations()
opt.clear_newoperations()
@@ -164,6 +166,18 @@
"""
self.assert_unroll_loop_equals(self.parse_loop(ops),
self.parse_loop(ops), 2)
+ def test_vectorize_empty_with_early_exit(self):
+ ops = """
+ []
+ guard_early_exit() []
+ jump()
+ """
+ try:
+ self.schedule(self.parse_loop(ops), 1)
+ py.test.fail("empty loop with no memory references is not vectorizable")
+ except NotAVectorizeableLoop:
+ pass
+
def test_unroll_empty_stays_empty_parameter(self):
""" same as test_unroll_empty_stays_empty but with a parameter """
ops = """
@@ -238,7 +252,7 @@
"""
vopt = self.vectoroptimizer(self.parse_loop(ops))
assert 0 == vopt.smallest_type_bytes
- assert 0 == vopt.get_unroll_count()
+ assert 0 == vopt.get_unroll_count(ARCH_VEC_REG_SIZE)
def test_array_operation_indices_not_unrolled(self):
ops = """
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py
b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -71,20 +71,22 @@
self.clear_newoperations()
label = self.loop.operations[0]
jump = self.loop.operations[-1]
- if jump.getopnum() != rop.LABEL:
+ if jump.getopnum() not in (rop.LABEL, rop.JUMP):
# compile_loop appends an additional label to all loops
# we cannot optimize normal traces
+ assert False
raise NotAVectorizeableLoop()
self.linear_find_smallest_type(self.loop)
byte_count = self.smallest_type_bytes
- if byte_count == 0 or label.getopnum() != rop.LABEL:
+ vsize = self.metainterp_sd.cpu.vector_register_size
+ if vsize == 0 or byte_count == 0 or label.getopnum() != rop.LABEL:
# stop, there is no chance to vectorize this trace
# we cannot optimize normal traces (if there is no label)
raise NotAVectorizeableLoop()
# unroll
- self.unroll_count = self.get_unroll_count()
+ self.unroll_count = self.get_unroll_count(vsize)
self.unroll_loop_iterations(self.loop, self.unroll_count)
self.loop.operations = self.get_newoperations()
self.clear_newoperations()
@@ -97,6 +99,8 @@
self.schedule()
def emit_operation(self, op):
+ if op.getopnum() == rop.GUARD_EARLY_EXIT:
+ return
self._last_emitted_op = op
self._newoperations.append(op)
@@ -111,10 +115,15 @@
op_count = len(loop.operations)
label_op = loop.operations[0].clone()
- jump_op = loop.operations[op_count-1].clone()
+ assert label_op.getopnum() == rop.LABEL
+ jump_op = loop.operations[op_count-1]
# use the target token of the label
- jump_op = ResOperation(rop.JUMP, jump_op.getarglist(), None, label_op.getdescr())
- assert label_op.getopnum() == rop.LABEL
+ assert jump_op.getopnum() in (rop.LABEL, rop.JUMP)
+ if jump_op.getopnum() == rop.LABEL:
+ jump_op = ResOperation(rop.JUMP, jump_op.getarglist(), None, label_op.getdescr())
+ else:
+ jump_op = jump_op.clone()
+ jump_op.setdescr(label_op.getdescr())
assert jump_op.is_final()
self.emit_unrolled_operation(label_op)
@@ -228,13 +237,12 @@
or byte_count < self.smallest_type_bytes:
self.smallest_type_bytes = byte_count
- def get_unroll_count(self):
+ def get_unroll_count(self, simd_vec_reg_bytes):
""" This is an estimated number of further unrolls """
# this optimization is not opaque, and needs info about the CPU
byte_count = self.smallest_type_bytes
if byte_count == 0:
return 0
- simd_vec_reg_bytes = 16 # TODO get from cpu
unroll_count = simd_vec_reg_bytes // byte_count
return unroll_count-1 # it is already unrolled once
@@ -357,7 +365,9 @@
if not we_are_translated():
for node in self.dependency_graph.nodes:
assert node.emitted
- self.loop.operations = self.collapse_index_guards()
+ self.loop.operations = self._newoperations[:]
+ #self.collapse_index_guards()
+ #self.clear_newoperations()
def relax_index_guards(self):
label_idx = 0
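The get_unroll_count() change above is the core of the CPU hookup: the
unroll factor is now derived from the backend's advertised register width
instead of the hard-coded 16 bytes. Worked through, with the same
arithmetic as the patched method:

    # Sketch of the unroll estimate: lanes per vector register, minus
    # the iteration the trace already contains.
    def unroll_count(simd_vec_reg_bytes, smallest_type_bytes):
        if smallest_type_bytes == 0:
            return 0
        return simd_vec_reg_bytes // smallest_type_bytes - 1

    assert unroll_count(16, 8) == 1    # two float64 lanes per XMM register
    assert unroll_count(16, 1) == 15   # sixteen int8 lanes
    assert unroll_count(32, 8) == 3    # a hypothetical 32-byte AVX backend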
diff --git a/rpython/jit/metainterp/pyjitpl.py
b/rpython/jit/metainterp/pyjitpl.py
--- a/rpython/jit/metainterp/pyjitpl.py
+++ b/rpython/jit/metainterp/pyjitpl.py
@@ -2135,8 +2135,10 @@
self.seen_loop_header_for_jdindex = -1
# can only emit early exit if liveness is present
# TODO think of a better way later
- if self.framestack[-1].jitcode.liveness.get(0, None):
+ if self.framestack[-1].jitcode.liveness.get(0, None) \
+ and self.jitdriver_sd.vectorize:
self.generate_guard(rop.GUARD_EARLY_EXIT)
+ #self.history.record(rop.GUARD_EARLY_EXIT, [], None)
try:
self.interpret()
except SwitchToBlackhole, stb:
diff --git a/rpython/jit/metainterp/test/support.py
b/rpython/jit/metainterp/test/support.py
--- a/rpython/jit/metainterp/test/support.py
+++ b/rpython/jit/metainterp/test/support.py
@@ -48,6 +48,7 @@
trace_limit = sys.maxint
enable_opts = ALL_OPTS_DICT
+ vectorize = True
if kwds.pop('disable_optimizations', False):
FakeWarmRunnerState.enable_opts = {}
diff --git a/rpython/jit/metainterp/test/test_ajit.py
b/rpython/jit/metainterp/test/test_ajit.py
--- a/rpython/jit/metainterp/test/test_ajit.py
+++ b/rpython/jit/metainterp/test/test_ajit.py
@@ -2764,9 +2764,13 @@
return i
#
seen = []
- def my_optimize_trace(metainterp_sd, jitdriver_sd, loop, enable_opts,
+ def my_optimize_trace(metainterp_sd, jitdriver_sd, loop, warmstate,
*args, **kwds):
- seen.append('unroll' in enable_opts)
+ if 'try_disabling_unroll' in kwds and \
+ kwds['try_disabling_unroll']:
+ seen.append(False)
+ else:
+ seen.append('unroll' in warmstate.enable_opts)
raise InvalidLoop
old_optimize_trace = optimizeopt.optimize_trace
optimizeopt.optimize_trace = my_optimize_trace
diff --git a/rpython/jit/metainterp/test/test_vectorize.py
b/rpython/jit/metainterp/test/test_vectorize.py
--- a/rpython/jit/metainterp/test/test_vectorize.py
+++ b/rpython/jit/metainterp/test/test_vectorize.py
@@ -13,13 +13,14 @@
free_raw_storage, raw_storage_getitem)
class VectorizeTests:
- enable_opts = 'all'
+ enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll'
def meta_interp(self, f, args, policy=None):
return ll_meta_interp(f, args, enable_opts=self.enable_opts,
policy=policy,
CPUClass=self.CPUClass,
- type_system=self.type_system)
+ type_system=self.type_system,
+ vectorize=1)
@py.test.mark.parametrize('i',[3,4,5,6,7,8,9,50])
def test_vectorize_simple_load_arith_store_int_add_index(self,i):