Author: Richard Plangger <[email protected]>
Branch:
Changeset: r88097:9f66930d64e8
Date: 2016-11-03 09:36 +0100
http://bitbucket.org/pypy/pypy/changeset/9f66930d64e8/
Log: merge ppc-vsx-support and s390x-z-simd-support
diff too long, truncating to 2000 out of 7995 lines
diff --git a/pypy/doc/whatsnew-head.rst b/pypy/doc/whatsnew-head.rst
--- a/pypy/doc/whatsnew-head.rst
+++ b/pypy/doc/whatsnew-head.rst
@@ -81,6 +81,7 @@
Improve support for new buffer interface in cpyext, bf_getbuffer on built-in
types still missing
+
.. branch: fix-struct-unpack-Q
Improve compatibility with CPython in the ``struct`` module. In particular,
@@ -88,12 +89,10 @@
while previously it always returned a ``long`` for certain format codes such
as ``Q`` (and also ``I``, ``L`` and ``q`` on 32 bit)
-.. branch: newinitwarn
+.. branch: zarch-simd-support
-Issue warnings for stricter handling of ``__new/init__`` args (that
-become TypeErrors in python 3)
+s390x implementation for vector operations used in VecOpt
-.. branch: openssl-1.1
+.. branch: ppc-vsx-support
-PyPy can now be translated on a machine where the newer OpenSSL 1.1 is
-installed. Thanks tumbleweed!
+PowerPC implementation for vector operations used in VecOpt
diff --git a/pypy/module/micronumpy/test/test_zjit.py b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -16,7 +16,7 @@
from rpython.jit.backend.detect_cpu import getcpuclass
CPU = getcpuclass()
-if not CPU.vector_extension:
+if not CPU.vector_ext:
py.test.skip("this cpu %s has no implemented vector backend" % CPU)
def get_profiler():
@@ -29,7 +29,7 @@
interp = None
def setup_method(self, method):
- if not self.CPUClass.vector_extension:
+ if not self.CPUClass.vector_ext:
py.test.skip("needs vector extension to run (for now)")
def assert_float_equal(self, f1, f2, delta=0.0001):
diff --git a/pypy/module/pypyjit/test_pypy_c/test_micronumpy.py b/pypy/module/pypyjit/test_pypy_c/test_micronumpy.py
--- a/pypy/module/pypyjit/test_pypy_c/test_micronumpy.py
+++ b/pypy/module/pypyjit/test_pypy_c/test_micronumpy.py
@@ -3,23 +3,35 @@
from pypy.module.pypyjit.test_pypy_c.test_00_model import BaseTestPyPyC
from rpython.rlib.rawstorage import misaligned_is_fine
+def no_vector_backend():
+ import platform
+ if platform.machine().startswith('x86'):
+ from rpython.jit.backend.x86.detect_feature import detect_sse4_2
+ return not detect_sse4_2()
+ if platform.machine().startswith('ppc'):
+ from rpython.jit.backend.ppc.detect_feature import detect_vsx
+ return not detect_vsx()
+ if platform.machine() == "s390x":
+ from rpython.jit.backend.zarch.detect_feature import detect_simd_z
+ return not detect_simd_z()
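+    # any other architecture: assume no vector backend is available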
+ return True
class TestMicroNumPy(BaseTestPyPyC):
arith_comb = [('+','float','float', 4*3427, 3427, 1.0,3.0),
- ('+','float','int', 9*7843, 7843, 4.0,5.0),
- ('+','int','float', 8*2571, 2571, 9.0,-1.0),
- ('+','float','int', -18*2653, 2653, 4.0,-22.0),
- ('+','int','int', -1*1499, 1499, 24.0,-25.0),
- ('-','float','float', -2*5523, 5523, 1.0,3.0),
- ('*','float','float', 3*2999, 2999, 1.0,3.0),
- ('/','float','float', 3*7632, 7632, 3.0,1.0),
- ('/','float','float', 1.5*7632, 7632, 3.0,2.0),
- ('&','int','int', 0, 1500, 1,0),
- ('&','int','int', 1500, 1500, 1,1),
- ('|','int','int', 1500, 1500, 0,1),
- ('|','int','int', 0, 1500, 0,0),
- ]
+ ('+','float','int', 9*7843, 7843, 4.0,5.0),
+ ('+','int','float', 8*2571, 2571, 9.0,-1.0),
+ ('+','float','int', -18*2653, 2653, 4.0,-22.0),
+ ('+','int','int', -1*1499, 1499, 24.0,-25.0),
+ ('-','float','float', -2*5523, 5523, 1.0,3.0),
+ ('*','float','float', 3*2999, 2999, 1.0,3.0),
+ ('/','float','float', 3*7632, 7632, 3.0,1.0),
+ ('/','float','float', 1.5*7632, 7632, 3.0,2.0),
+ ('&','int','int', 0, 1500, 1,0),
+ ('&','int','int', 1500, 1500, 1,1),
+ ('|','int','int', 1500, 1500, 0,1),
+ ('|','int','int', 0, 1500, 0,0),
+ ]
type_permuated = []
types = { 'int': ['int32','int64','int8','int16'],
'float': ['float32', 'float64']
@@ -36,6 +48,7 @@
type_permuated.append(t)
@py.test.mark.parametrize("op,adtype,bdtype,result,count,a,b",
type_permuated)
+ @py.test.mark.skipif('no_vector_backend()')
def test_vector_call2(self, op, adtype, bdtype, result, count, a, b):
source = """
def main():
@@ -51,6 +64,13 @@
log = self.run(main, [], vec=0)
assert log.result == vlog.result
assert log.result == result
+ assert log.jit_summary.vecopt_tried == 0
+ assert log.jit_summary.vecopt_success == 0
+ assert vlog.jit_summary.vecopt_tried > 0
+ if adtype in ('int64','float64') and bdtype in ('int64','float64'):
+ assert vlog.jit_summary.vecopt_success > 0
+ else:
+ assert vlog.jit_summary.vecopt_success >= 0
arith_comb = [
@@ -58,7 +78,7 @@
('sum','float', 2581, 2581, 1),
('prod','float', 1, 3178, 1),
('prod','int', 1, 3178, 1),
- ('any','int', 1, 1239, 1),
+ ('any','int', 1, 2239, 1),
('any','int', 0, 4912, 0),
('all','int', 0, 3420, 0),
('all','int', 1, 6757, 1),
@@ -76,6 +96,7 @@
type_permuated.append(t)
@py.test.mark.parametrize("op,dtype,result,count,a", type_permuated)
+ @py.test.mark.skipif('no_vector_backend()')
def test_reduce_generic(self,op,dtype,result,count,a):
source = """
def main():
@@ -84,10 +105,19 @@
return a.{method}()
""".format(method=op, dtype=dtype, count=count, a=a)
exec py.code.Source(source).compile()
+ log = self.run(main, [], vec=0)
vlog = self.run(main, [], vec=1)
- log = self.run(main, [], vec=0)
assert log.result == vlog.result
assert log.result == result
+ if not log.jit_summary:
+ return
+ assert log.jit_summary.vecopt_tried == 0
+ assert log.jit_summary.vecopt_success == 0
+ assert vlog.jit_summary.vecopt_tried > 0
+    if dtype in ('int64','float64') and (dtype != 'int64' and op != 'prod'):
+ assert vlog.jit_summary.vecopt_success > 0
+ else:
+ assert vlog.jit_summary.vecopt_success >= 0
def test_reduce_logical_xor(self):
def main():
@@ -158,7 +188,7 @@
assert log.result is True
assert len(log.loops) == 1
loop = log._filter(log.loops[0])
- assert loop.match("""
+ loop.match("""
f31 = raw_load_f(i9, i29, descr=<ArrayF 8>)
guard_not_invalidated(descr=...)
i32 = float_ne(f31, 0.000000)
@@ -168,7 +198,20 @@
i38 = int_ge(i36, i30)
guard_false(i38, descr=...)
jump(..., descr=...)
- """)
+ """)
+ # vector version
+ #assert loop.match("""
+ # guard_not_invalidated(descr=...)
+ # i38 = int_add(i25, 2)
+ # i39 = int_ge(i38, i33)
+ # guard_false(i39, descr=...)
+ # v42 = vec_load_f(i9, i32, 1, 0, descr=<ArrayF 8>)
+ # v43 = vec_float_ne(v42, v36)
+ # f46 = vec_unpack_f(v42, 0, 1)
+ # vec_guard_true(v43, descr=...)
+ # i48 = int_add(i32, 16)
+ # i50 = int_add(i25, 2)
+ # jump(..., descr=...)""")
def test_array_getitem_basic(self):
def main():
diff --git a/rpython/doc/jit/vectorization.rst b/rpython/doc/jit/vectorization.rst
--- a/rpython/doc/jit/vectorization.rst
+++ b/rpython/doc/jit/vectorization.rst
@@ -13,11 +13,6 @@
(e.g. those in the NumPyPy module).
* --jit vec_all=1: turns on the vectorization for any jit driver. See parameters for
  the filtering heuristics of traces.
-* --jit vec_ratio=2: A number from 0 to 10 that represents a real number (vec_ratio / 10).
-  This filters traces if vec_all is enabled. N is the trace count then the number of
-  vector transformable operations (add_int -> vec_add_int) M, the following must hold:
-  M / N >= (vec_ratio / 10)
-* --jit vec_length=60: The maximum number of trace instructions the vectorizer filters for.
Features
--------
diff --git a/rpython/jit/backend/detect_cpu.py b/rpython/jit/backend/detect_cpu.py
--- a/rpython/jit/backend/detect_cpu.py
+++ b/rpython/jit/backend/detect_cpu.py
@@ -13,7 +13,6 @@
MODEL_X86 = 'x86'
MODEL_X86_NO_SSE2 = 'x86-without-sse2'
MODEL_X86_64 = 'x86-64'
-MODEL_X86_64_SSE4 = 'x86-64-sse4'
MODEL_ARM = 'arm'
MODEL_PPC_64 = 'ppc-64'
MODEL_S390_64 = 's390x'
@@ -81,9 +80,6 @@
from rpython.jit.backend.x86 import detect_feature as feature
if sys.maxint == 2**63-1:
result = MODEL_X86_64
- # has sse 2 at least
- if feature.detect_sse4_1():
- result = MODEL_X86_64_SSE4
else:
assert sys.maxint == 2**31-1
if feature.detect_sse2():
@@ -120,8 +116,6 @@
return "rpython.jit.backend.x86.runner", "CPU386_NO_SSE2"
elif backend_name == MODEL_X86_64:
return "rpython.jit.backend.x86.runner", "CPU_X86_64"
- elif backend_name == MODEL_X86_64_SSE4:
- return "rpython.jit.backend.x86.runner", "CPU_X86_64_SSE4"
elif backend_name == MODEL_ARM:
return "rpython.jit.backend.arm.runner", "CPU_ARM"
elif backend_name == MODEL_PPC_64:
@@ -145,7 +139,6 @@
MODEL_X86: ['floats', 'singlefloats', 'longlong'],
MODEL_X86_NO_SSE2: ['longlong'],
MODEL_X86_64: ['floats', 'singlefloats'],
- MODEL_X86_64_SSE4: ['floats', 'singlefloats'],
MODEL_ARM: ['floats', 'singlefloats', 'longlong'],
MODEL_PPC_64: ['floats'],
MODEL_S390_64: ['floats'],
diff --git a/rpython/jit/backend/llgraph/runner.py b/rpython/jit/backend/llgraph/runner.py
--- a/rpython/jit/backend/llgraph/runner.py
+++ b/rpython/jit/backend/llgraph/runner.py
@@ -2,6 +2,7 @@
from rpython.jit.backend import model
from rpython.jit.backend.llgraph import support
from rpython.jit.backend.llsupport import symbolic
+from rpython.jit.backend.llsupport.vector_ext import VectorExt
from rpython.jit.metainterp.history import AbstractDescr
from rpython.jit.metainterp.history import Const, getkind
from rpython.jit.metainterp.history import INT, REF, FLOAT, VOID
@@ -328,10 +329,11 @@
supports_cond_call_value = True
translate_support_code = False
is_llgraph = True
- vector_extension = True
- vector_register_size = 16 # in bytes
- vector_horizontal_operations = True
- vector_pack_slots = True
+ vector_ext = VectorExt()
+ vector_ext.enable(16, accum=True)
+ vector_ext.setup_once = lambda asm: asm
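+    # llgraph runs untranslated and needs no machine-specific setup,
+    # hence the identity lambda above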
+ load_supported_factors = (1,2,4,8)
+ assembler = None
def __init__(self, rtyper, stats=None, *ignored_args, **kwds):
model.AbstractCPU.__init__(self)
@@ -877,6 +879,9 @@
def bh_vec_int_xor(self, vx, vy, count):
return [int(x) ^ int(y) for x,y in zip(vx,vy)]
+ def bh_vec_float_xor(self, vx, vy, count):
+        return [0.0 for x,y in zip(vx,vy)] # just used for clearing the vector register
+
def bh_vec_cast_float_to_singlefloat(self, vx, count):
from rpython.rlib.rarithmetic import r_singlefloat
return [longlong.singlefloat2int(r_singlefloat(longlong.getrealfloat(v)))
@@ -928,50 +933,32 @@
def bh_vec_int_signext(self, vx, ext, count):
return [heaptracker.int_signext(_vx, ext) for _vx in vx]
- def build_getarrayitem(func):
- def method(self, struct, offset, descr, _count):
+ def build_load(func):
+ def method(self, struct, offset, scale, disp, descr, _count):
values = []
- count = self.vector_register_size // descr.get_item_size_in_bytes()
+            count = self.vector_ext.vec_size() // descr.get_item_size_in_bytes()
assert _count == count
assert count > 0
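+            # effective address: base + index * scale + displacement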
+ adr = struct + (offset * scale + disp)
+ a = support.cast_arg(lltype.Ptr(descr.A), adr)
+ array = a._obj
for i in range(count):
- val = func(self, struct, offset + i, descr)
+ val = support.cast_result(descr.A.OF, array.getitem(i))
values.append(val)
return values
return method
- bh_vec_getarrayitem_gc_i = build_getarrayitem(bh_getarrayitem_gc)
- bh_vec_getarrayitem_gc_f = build_getarrayitem(bh_getarrayitem_gc)
- bh_vec_getarrayitem_raw_i = build_getarrayitem(bh_getarrayitem_raw)
- bh_vec_getarrayitem_raw_f = build_getarrayitem(bh_getarrayitem_raw)
- del build_getarrayitem
+ bh_vec_load_i = build_load(bh_getarrayitem_raw)
+ bh_vec_load_f = build_load(bh_getarrayitem_raw)
+ del build_load
- def _bh_vec_raw_load(self, struct, offset, descr, _count):
- values = []
+    def bh_vec_store(self, struct, offset, newvalues, scale, disp, descr, count):
stride = descr.get_item_size_in_bytes()
- count = self.vector_register_size // descr.get_item_size_in_bytes()
- assert _count == count
- assert count > 0
- for i in range(count):
- val = self.bh_raw_load(struct, offset + i*stride, descr)
- values.append(val)
- return values
-
- bh_vec_raw_load_i = _bh_vec_raw_load
- bh_vec_raw_load_f = _bh_vec_raw_load
-
- def bh_vec_raw_store(self, struct, offset, newvalues, descr, count):
- stride = descr.get_item_size_in_bytes()
+ adr = struct + (offset * scale + disp)
+ a = support.cast_arg(lltype.Ptr(descr.A), adr)
+ array = a._obj
for i,n in enumerate(newvalues):
- self.bh_raw_store(struct, offset + i*stride, n, descr)
-
- def bh_vec_setarrayitem_raw(self, struct, offset, newvalues, descr, count):
- for i,n in enumerate(newvalues):
- self.bh_setarrayitem_raw(struct, offset + i, n, descr)
-
- def bh_vec_setarrayitem_gc(self, struct, offset, newvalues, descr, count):
- for i,n in enumerate(newvalues):
- self.bh_setarrayitem_gc(struct, offset + i, n, descr)
+ array.setitem(i, support.cast_arg(descr.A.OF, n))
def store_fail_descr(self, deadframe, descr):
pass # I *think*
diff --git a/rpython/jit/backend/llsupport/descr.py b/rpython/jit/backend/llsupport/descr.py
--- a/rpython/jit/backend/llsupport/descr.py
+++ b/rpython/jit/backend/llsupport/descr.py
@@ -681,7 +681,6 @@
sign = arraydescr.is_item_signed()
return size, ofs, sign
-
def unpack_fielddescr(fielddescr):
assert isinstance(fielddescr, FieldDescr)
ofs = fielddescr.offset
diff --git a/rpython/jit/backend/llsupport/llmodel.py b/rpython/jit/backend/llsupport/llmodel.py
--- a/rpython/jit/backend/llsupport/llmodel.py
+++ b/rpython/jit/backend/llsupport/llmodel.py
@@ -35,10 +35,7 @@
# can an ISA instruction handle a factor to the offset?
load_supported_factors = (1,)
- vector_extension = False
- vector_register_size = 0 # in bytes
- vector_horizontal_operations = False
- vector_pack_slots = False
+ vector_ext = None
def __init__(self, rtyper, stats, opts, translate_support_code=False,
gcdescr=None):
diff --git a/rpython/jit/backend/llsupport/regalloc.py b/rpython/jit/backend/llsupport/regalloc.py
--- a/rpython/jit/backend/llsupport/regalloc.py
+++ b/rpython/jit/backend/llsupport/regalloc.py
@@ -349,6 +349,8 @@
assert len(self.temp_boxes) == 0
if self.longevity:
for v in self.reg_bindings:
+ if v not in self.longevity:
+                    llop.debug_print(lltype.Void, "variable %s not in longevity\n" % v.repr({}))
assert self.longevity[v][1] > self.position
def try_allocate_reg(self, v, selected_reg=None, need_lower_byte=False):
diff --git a/rpython/jit/backend/llsupport/rewrite.py b/rpython/jit/backend/llsupport/rewrite.py
--- a/rpython/jit/backend/llsupport/rewrite.py
+++ b/rpython/jit/backend/llsupport/rewrite.py
@@ -1,5 +1,5 @@
from rpython.rlib import rgc
-from rpython.rlib.objectmodel import we_are_translated, r_dict
+from rpython.rlib.objectmodel import we_are_translated, r_dict, always_inline
from rpython.rlib.rarithmetic import ovfcheck, highest_bit
from rpython.rtyper.lltypesystem import llmemory, lltype, rstr
from rpython.rtyper.annlowlevel import cast_instance_to_gcref
@@ -15,6 +15,7 @@
from rpython.jit.metainterp.history import JitCellToken
from rpython.jit.backend.llsupport.descr import (unpack_arraydescr,
unpack_fielddescr, unpack_interiorfielddescr)
+from rpython.rtyper.lltypesystem.lloperation import llop
FLAG_ARRAY = 0
FLAG_STR = 1
@@ -157,32 +158,12 @@
index_box = op.getarg(1)
self.emit_gc_load_or_indexed(op, ptr_box, index_box, itemsize,
itemsize, ofs, sign)
- def handle_rawload(self, op):
- itemsize, ofs, sign = unpack_arraydescr(op.getdescr())
- ptr_box = op.getarg(0)
- index_box = op.getarg(1)
-        self.emit_gc_load_or_indexed(op, ptr_box, index_box, itemsize, 1, ofs, sign)
-
def _emit_mul_if_factor_offset_not_supported(self, index_box,
factor, offset):
- # Returns (factor, offset, index_box) where index_box is either
- # a non-constant BoxInt or None.
- if isinstance(index_box, ConstInt):
- return 1, index_box.value * factor + offset, None
- else:
- if factor != 1 and factor not in self.cpu.load_supported_factors:
- # the factor is supported by the cpu
- # x & (x - 1) == 0 is a quick test for power of 2
- assert factor > 0
- if (factor & (factor - 1)) == 0:
- index_box = ResOperation(rop.INT_LSHIFT,
- [index_box, ConstInt(highest_bit(factor))])
- else:
- index_box = ResOperation(rop.INT_MUL,
- [index_box, ConstInt(factor)])
- self.emit_op(index_box)
- factor = 1
- return factor, offset, index_box
+        factor, offset, new_index_box, emit = cpu_simplify_scale(self.cpu, index_box, factor, offset)
+ if emit:
+ self.emit_op(new_index_box)
+ return factor, offset, new_index_box
def emit_gc_load_or_indexed(self, op, ptr_box, index_box, itemsize,
factor, offset, sign, type='i'):
@@ -214,14 +195,6 @@
NOT_SIGNED = 0
CINT_ZERO = ConstInt(0)
opnum = op.getopnum()
- #if opnum == rop.CALL_MALLOC_NURSERY_VARSIZE:
- # v_length = op.getarg(2)
- # scale = op.getarg(1).getint()
- # if scale not in self.cpu.load_supported_factors:
- # scale, offset, v_length = \
-        #                self._emit_mul_if_factor_offset_not_supported(v_length, scale, 0)
- # op.setarg(1, ConstInt(scale))
- # op.setarg(2, v_length)
if rop.is_getarrayitem(opnum) or \
opnum in (rop.GETARRAYITEM_RAW_I,
rop.GETARRAYITEM_RAW_F):
@@ -330,7 +303,11 @@
self._changed_op = None
for i in range(len(operations)):
op = operations[i]
- assert op.get_forwarded() is None
+ if op.get_forwarded():
+            msg = '[rewrite] operation at %d has forwarded info %s\n' % (i, op.repr({}))
+ if we_are_translated():
+ llop.debug_print(lltype.Void, msg)
+ raise NotImplementedError(msg)
if op.getopnum() == rop.DEBUG_MERGE_POINT:
continue
if op is self._changed_op:
@@ -833,10 +810,6 @@
arraydescr.lendescr.offset !=
gc_descr.standard_array_length_ofs)):
return False
self.emitting_an_operation_that_can_collect()
- #scale = itemsize
- #if scale not in self.cpu.load_supported_factors:
- # scale, offset, v_length = \
-        #        self._emit_mul_if_factor_offset_not_supported(v_length, scale, 0)
op = ResOperation(rop.CALL_MALLOC_NURSERY_VARSIZE,
[ConstInt(kind), ConstInt(itemsize), v_length],
descr=arraydescr)
@@ -1015,3 +988,24 @@
self._newops.append(load_op)
self.gcrefs_recently_loaded[index] = load_op
return load_op
+
+@always_inline
+def cpu_simplify_scale(cpu, index_box, factor, offset):
+    # Returns (factor, offset, index_box, emit) where index_box is either
+    # a non-constant BoxInt or None; emit tells the caller whether the
+    # new index_box operation still needs to be emitted.
+ if isinstance(index_box, ConstInt):
+ return 1, index_box.value * factor + offset, None, False
+ else:
+ if factor != 1 and factor not in cpu.load_supported_factors:
+            # the factor is not directly supported by the cpu
+ # x & (x - 1) == 0 is a quick test for power of 2
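+            # e.g. factor 8 is rewritten to INT_LSHIFT(index, 3) instead
+            # of INT_MUL(index, 8)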
+ assert factor > 0
+ if (factor & (factor - 1)) == 0:
+ index_box = ResOperation(rop.INT_LSHIFT,
+ [index_box, ConstInt(highest_bit(factor))])
+ else:
+ index_box = ResOperation(rop.INT_MUL,
+ [index_box, ConstInt(factor)])
+ return 1, offset, index_box, True
+ return factor, offset, index_box, False
+
diff --git a/rpython/jit/backend/llsupport/vector_ext.py b/rpython/jit/backend/llsupport/vector_ext.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/llsupport/vector_ext.py
@@ -0,0 +1,294 @@
+from rpython.jit.backend.llsupport.rewrite import cpu_simplify_scale
+from rpython.jit.backend.llsupport.descr import (unpack_arraydescr,
+ unpack_fielddescr, unpack_interiorfielddescr, ArrayDescr)
+from rpython.rlib.objectmodel import specialize, always_inline
+from rpython.jit.metainterp.history import (VECTOR, FLOAT, INT, ConstInt)
+from rpython.jit.metainterp.resoperation import rop
+from rpython.jit.metainterp.optimizeopt.schedule import (forwarded_vecinfo,
+ failnbail_transformation)
+from rpython.jit.metainterp.jitexc import NotAVectorizeableLoop
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.rtyper.lltypesystem.lloperation import llop
+from rpython.rtyper.lltypesystem import lltype
+from rpython.rlib.debug import debug_print
+
+class TypeRestrict(object):
+ ANY_TYPE = '\x00'
+ ANY_SIZE = -1
+ ANY_SIGN = -1
+ ANY_COUNT = -1
+ SIGNED = 1
+ UNSIGNED = 0
+
+ def __init__(self,
+ type=ANY_TYPE,
+ bytesize=ANY_SIZE,
+                 count=ANY_COUNT,
+                 sign=ANY_SIGN):
+ self.type = type
+ self.bytesize = bytesize
+ self.sign = sign
+ self.count = count
+
+ @always_inline
+ def any_size(self):
+ return self.bytesize == TypeRestrict.ANY_SIZE
+
+ @always_inline
+ def any_count(self):
+ return self.count == TypeRestrict.ANY_COUNT
+
+ def check(self, value):
+ vecinfo = forwarded_vecinfo(value)
+ assert vecinfo.datatype != '\x00'
+ if self.type != TypeRestrict.ANY_TYPE:
+ if self.type != vecinfo.datatype:
+ msg = "type mismatch %s != %s" % \
+ (self.type, vecinfo.datatype)
+ failnbail_transformation(msg)
+ assert vecinfo.bytesize > 0
+ if not self.any_size():
+ if self.bytesize != vecinfo.bytesize:
+ msg = "bytesize mismatch %s != %s" % \
+ (self.bytesize, vecinfo.bytesize)
+ failnbail_transformation(msg)
+ assert vecinfo.count > 0
+ if self.count != TypeRestrict.ANY_COUNT:
+ if vecinfo.count < self.count:
+ msg = "count mismatch %s < %s" % \
+ (self.count, vecinfo.count)
+ failnbail_transformation(msg)
+ if self.sign != TypeRestrict.ANY_SIGN:
+            if bool(self.sign) != vecinfo.sign:
+                msg = "sign mismatch %s != %s" % \
+                        (self.sign, vecinfo.sign)
+ failnbail_transformation(msg)
+
+ def max_input_count(self, count):
+ """ How many """
+ if self.count != TypeRestrict.ANY_COUNT:
+ return self.count
+ return count
+
+class OpRestrict(object):
+ def __init__(self, argument_restris):
+ self.argument_restrictions = argument_restris
+
+ def check_operation(self, state, pack, op):
+ return None
+
+ def crop_vector(self, op, newsize, size):
+ return newsize, size
+
+ def must_crop_vector(self, op, index):
+ restrict = self.argument_restrictions[index]
+ vecinfo = forwarded_vecinfo(op.getarg(index))
+ size = vecinfo.bytesize
+ newsize = self.crop_to_size(op, index)
+ return not restrict.any_size() and newsize != size
+
+ @always_inline
+ def crop_to_size(self, op, index):
+ restrict = self.argument_restrictions[index]
+ return restrict.bytesize
+
+ def opcount_filling_vector_register(self, op, vec_reg_size):
+ """ How many operations of that kind can one execute
+ with a machine instruction of register size X?
+ """
+ if op.is_typecast():
+ if op.casts_down():
+ size = op.cast_input_bytesize(vec_reg_size)
+ return size // op.cast_from_bytesize()
+ else:
+ return vec_reg_size // op.cast_to_bytesize()
+ vecinfo = forwarded_vecinfo(op)
+ return vec_reg_size // vecinfo.bytesize
+
+class GuardRestrict(OpRestrict):
+ def opcount_filling_vector_register(self, op, vec_reg_size):
+ arg = op.getarg(0)
+ vecinfo = forwarded_vecinfo(arg)
+ return vec_reg_size // vecinfo.bytesize
+
+class LoadRestrict(OpRestrict):
+ def check_operation(self, state, pack, op):
+ opnum = op.getopnum()
+ descr = op.getdescr()
+ if not we_are_translated() and not isinstance(descr, ArrayDescr):
+ itemsize = descr.get_item_size_in_bytes()
+ ofs = 0
+ else:
+ itemsize, ofs, _ = unpack_arraydescr(op.getdescr())
+ args = [op.getarg(0), op.getarg(1), ConstInt(1), ConstInt(ofs)]
+ if rop.is_getarrayitem(opnum) or \
+ opnum in (rop.GETARRAYITEM_RAW_I, rop.GETARRAYITEM_RAW_F):
+ index_box = op.getarg(1)
+            scale, offset, changed, emit = cpu_simplify_scale(state.cpu, index_box, itemsize, ofs)
+ args[2] = ConstInt(scale)
+ args[3] = ConstInt(offset)
+ if emit:
+ state.oplist.append(changed)
+ args[1] = changed
+
+ return args
+
+
+ def opcount_filling_vector_register(self, op, vec_reg_size):
+ assert rop.is_primitive_load(op.opnum)
+ descr = op.getdescr()
+ return vec_reg_size // descr.get_item_size_in_bytes()
+
+class StoreRestrict(OpRestrict):
+ def __init__(self, argument_restris):
+ self.argument_restrictions = argument_restris
+
+ def check_operation(self, state, pack, op):
+ opnum = op.getopnum()
+ descr = op.getdescr()
+ if not we_are_translated() and not isinstance(descr, ArrayDescr):
+ itemsize = descr.get_item_size_in_bytes()
+ ofs = 0
+ else:
+ itemsize, ofs, _ = unpack_arraydescr(op.getdescr())
+        args = [op.getarg(0), op.getarg(1), op.getarg(2), ConstInt(1), ConstInt(ofs)]
+ if opnum in (rop.SETARRAYITEM_GC, rop.SETARRAYITEM_RAW):
+ index_box = op.getarg(1)
+            scale, offset, changed, emit = cpu_simplify_scale(state.cpu, index_box, itemsize, ofs)
+ args[3] = ConstInt(scale)
+ args[4] = ConstInt(offset)
+ if emit:
+ state.oplist.append(changed)
+ args[1] = changed
+ return args
+
+ def must_crop_vector(self, op, index):
+ vecinfo = forwarded_vecinfo(op.getarg(index))
+ bytesize = vecinfo.bytesize
+ return self.crop_to_size(op, index) != bytesize
+
+ @always_inline
+ def crop_to_size(self, op, index):
+ # there is only one parameter that needs to be transformed!
+ descr = op.getdescr()
+ return descr.get_item_size_in_bytes()
+
+ def opcount_filling_vector_register(self, op, vec_reg_size):
+ assert rop.is_primitive_store(op.opnum)
+ descr = op.getdescr()
+ return vec_reg_size // descr.get_item_size_in_bytes()
+
+class OpMatchSizeTypeFirst(OpRestrict):
+ def check_operation(self, state, pack, op):
+ i = 0
+ infos = [forwarded_vecinfo(o) for o in op.getarglist()]
+ arg0 = op.getarg(i)
+ while arg0.is_constant() and i < op.numargs():
+ i += 1
+ arg0 = op.getarg(i)
+ vecinfo = forwarded_vecinfo(arg0)
+ bytesize = vecinfo.bytesize
+ datatype = vecinfo.datatype
+
+ for arg in op.getarglist():
+ if arg.is_constant():
+ continue
+ curvecinfo = forwarded_vecinfo(arg)
+ if curvecinfo.bytesize != bytesize:
+ debug_print("op match size first type failed")
+ raise NotAVectorizeableLoop
+ if curvecinfo.datatype != datatype:
+ debug_print("op match size first type failed (datatype)")
+ raise NotAVectorizeableLoop
+ return None
+
+TR_ANY = TypeRestrict()
+TR_ANY_FLOAT = TypeRestrict(FLOAT)
+TR_ANY_INTEGER = TypeRestrict(INT)
+TR_FLOAT_2 = TypeRestrict(FLOAT, 4, 2)
+TR_DOUBLE_2 = TypeRestrict(FLOAT, 8, 2)
+TR_INT32_2 = TypeRestrict(INT, 4, 2)
+
+OR_MSTF_I = OpMatchSizeTypeFirst([TR_ANY_INTEGER, TR_ANY_INTEGER])
+OR_MSTF_F = OpMatchSizeTypeFirst([TR_ANY_FLOAT, TR_ANY_FLOAT])
+STORE_RESTRICT = StoreRestrict([None, None, TR_ANY])
+LOAD_RESTRICT = LoadRestrict([])
+GUARD_RESTRICT = GuardRestrict([TR_ANY_INTEGER])
+
+
+class VectorExt(object):
+
+ should_align_unroll = True
+
+ def __init__(self):
+ self._enabled = False
+ self.register_size = 0 # in bytes
+ self.accum = False
+ self._setup = False
+
+ def is_setup(self):
+ return self._setup
+
+ def setup_once(self):
+ raise NotImplementedError
+
+ def enable(self, vec_size, accum=False):
+ self._enabled = vec_size != 0
+ self.register_size = vec_size
+ self.accum = accum
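+        # note: enable(0) leaves the extension disabled; accum=True
+        # additionally advertises support for accumulation (reduction) ops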
+
+ def is_enabled(self):
+ return self._enabled
+
+ def vec_size(self):
+ return self.register_size
+
+ def supports_accumulation(self):
+ return self.accum
+
+ # note that the following definition is x86 arch specific
+ TR_MAPPING = {
+ rop.VEC_INT_ADD: OR_MSTF_I,
+ rop.VEC_INT_SUB: OR_MSTF_I,
+ rop.VEC_INT_MUL: OR_MSTF_I,
+ rop.VEC_INT_AND: OR_MSTF_I,
+ rop.VEC_INT_OR: OR_MSTF_I,
+ rop.VEC_INT_XOR: OR_MSTF_I,
+ rop.VEC_INT_EQ: OR_MSTF_I,
+ rop.VEC_INT_NE: OR_MSTF_I,
+
+ rop.VEC_FLOAT_ADD: OR_MSTF_F,
+ rop.VEC_FLOAT_SUB: OR_MSTF_F,
+ rop.VEC_FLOAT_MUL: OR_MSTF_F,
+ rop.VEC_FLOAT_TRUEDIV: OR_MSTF_F,
+ rop.VEC_FLOAT_ABS: OpRestrict([TR_ANY_FLOAT]),
+ rop.VEC_FLOAT_NEG: OpRestrict([TR_ANY_FLOAT]),
+
+ rop.VEC_STORE: STORE_RESTRICT,
+
+ rop.VEC_LOAD_I: LOAD_RESTRICT,
+ rop.VEC_LOAD_F: LOAD_RESTRICT,
+
+ rop.VEC_GUARD_TRUE: GUARD_RESTRICT,
+ rop.VEC_GUARD_FALSE: GUARD_RESTRICT,
+
+ ## irregular
+ rop.VEC_INT_SIGNEXT: OpRestrict([TR_ANY_INTEGER]),
+
+ rop.VEC_CAST_FLOAT_TO_SINGLEFLOAT: OpRestrict([TR_DOUBLE_2]),
+ # weird but the trace will store single floats in int boxes
+ rop.VEC_CAST_SINGLEFLOAT_TO_FLOAT: OpRestrict([TR_INT32_2]),
+ rop.VEC_CAST_FLOAT_TO_INT: OpRestrict([TR_DOUBLE_2]),
+ rop.VEC_CAST_INT_TO_FLOAT: OpRestrict([TR_INT32_2]),
+
+ rop.VEC_FLOAT_EQ: OpRestrict([TR_ANY_FLOAT,TR_ANY_FLOAT]),
+ rop.VEC_FLOAT_NE: OpRestrict([TR_ANY_FLOAT,TR_ANY_FLOAT]),
+        rop.VEC_INT_IS_TRUE:    OpRestrict([TR_ANY_INTEGER,TR_ANY_INTEGER]),
+ }
+
+ def get_operation_restriction(self, op):
+ res = self.TR_MAPPING.get(op.vector, None)
+ if not res:
+ failnbail_transformation("could not get OpRestrict for " + str(op))
+ return res
+
diff --git a/rpython/jit/backend/ppc/codebuilder.py b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -1,4 +1,5 @@
import os
+
from rpython.jit.backend.ppc.ppc_form import PPCForm as Form
from rpython.jit.backend.ppc.locations import RegisterLocation
from rpython.jit.backend.ppc.ppc_field import ppc_fields
@@ -60,6 +61,17 @@
XFL = Form("FM", "frB", "XO1", "Rc")
XFX = Form("CRM", "rS", "XO1")
XLL = Form("LL", "XO1")
+XX1 = Form("fvrT", "rA", "rB", "XO1")
+XX2 = Form("fvrT", "fvrB", "XO6")
+XX3 = Form("fvrT", "fvrA", "fvrB", "XO9")
+XX3_2 = Form("fvrT", "fvrA", "fvrB", "OE", "XO11")
+XX3_splat = Form("fvrT", "fvrA", "fvrB", "DM", "XO13", "OE")
+XV = Form("ivrT", "rA", "rB", "XO1")
+VX = Form("ivrT", "ivrA", "ivrB", "XO8")
+VC = Form("ivrT", "ivrA", "ivrB", "XO12", "OE")
+VXI = Form("ivrT", "SIM", "XO8")
+VA = Form("ivrT", "ivrA", "ivrB", "ivrC", "XO10")
+
MI = Form("rA", "rS", "SH", "MB", "ME", "Rc")
MB = Form("rA", "rS", "rB", "MB", "ME", "Rc")
@@ -568,6 +580,145 @@
xor = XS(31, XO1=316, Rc=0)
xorx = XS(31, XO1=316, Rc=1)
+ # Vector Ext
+
+    # floating point operations (ppc has its own vector
+    # unit for double/single precision floating point)
+
+ # FLOAT
+ # -----
+
+ # load
+ lxvdsx = XX1(31, XO1=332) # splat first element
+ lxvd2x = XX1(31, XO1=844)
+ lxvw4x = XX1(31, XO1=780)
+
+ # store
+ stxvd2x = XX1(31, XO1=972)
+ stxvw4x = XX1(31, XO1=908)
+
+ # arith
+
+ # add
+ xvadddp = XX3(60, XO9=96)
+ xvaddsp = XX3(60, XO9=64)
+ xsadddp = XX3(60, XO9=32)
+ # sub
+ xvsubdp = XX3(60, XO9=104)
+ xvsubsp = XX3(60, XO9=72)
+ # mul
+ xvmuldp = XX3(60, XO9=112)
+ xvmulsp = XX3(60, XO9=80)
+ xsmuldp = XX3(60, XO9=48)
+ # div
+ xvdivdp = XX3(60, XO9=102)
+ xvdivsp = XX3(60, XO9=88)
+ # cmp
+ xvcmpeqdp = XX3_2(60, XO11=99, OE=0)
+ xvcmpeqdpx = XX3_2(60, XO11=99, OE=1)
+ xvcmpeqsp = XX3_2(60, XO11=67, OE=0)
+ xvcmpeqspx = XX3_2(60, XO11=67, OE=1)
+
+    # logical AND with complement (andc)
+ xxlandc = XX3(60, XO9=138)
+
+ # neg
+ xvnegdp = XX2(60, XO6=505)
+ xvnegsp = XX2(60, XO6=441)
+
+ # abs
+ xvabsdp = XX2(60, XO6=473)
+ xvabssp = XX2(60, XO6=409)
+
+ # conversion from/to
+ xvcvsxddp = XX2(60, XO6=504)
+ xvcvdpsxds = XX2(60, XO6=472)
+
+ # compare greater than unsigned int
+ vcmpgtubx = VC(4, XO12=518, OE=1)
+ vcmpgtub = VC(4, XO12=518, OE=0)
+ vcmpgtuhx = VC(4, XO12=584, OE=1)
+ vcmpgtuh = VC(4, XO12=584, OE=0)
+ vcmpgtuwx = VC(4, XO12=646, OE=1)
+ vcmpgtuw = VC(4, XO12=646, OE=0)
+ vcmpgtudx = VC(4, XO12=711, OE=1)
+ vcmpgtud = VC(4, XO12=711, OE=0)
+
+ # compare equal to unsigned int
+ vcmpequbx = VC(4, XO12=6, OE=1)
+ vcmpequb = VC(4, XO12=6, OE=0)
+ vcmpequhx = VC(4, XO12=70, OE=1)
+ vcmpequh = VC(4, XO12=70, OE=0)
+ vcmpequwx = VC(4, XO12=134, OE=1)
+ vcmpequw = VC(4, XO12=134, OE=0)
+ vcmpequdx = VC(4, XO12=199, OE=1)
+ vcmpequd = VC(4, XO12=199, OE=0)
+
+ # permute/splat
+ # splat low of A, and low of B
+ xxspltdl = XX3_splat(60, XO13=10, OE=0, DM=0b00)
+ # splat high of A, and high of B
+ xxspltdh = XX3_splat(60, XO13=10, OE=0, DM=0b11)
+ # generic splat
+ xxpermdi = XX3_splat(60, XO13=10, OE=0)
+
+ xxlxor = XX3(60, XO9=154)
+ xxlor = XX3(60, XO9=146)
+
+    # vector move register is an alias of vector or
+ xvmr = xxlor
+
+ # INTEGER
+ # -------
+
+ # load
+ lvx = XV(31, XO1=103)
+ lvewx = XV(31, XO1=71)
+ lvehx = XV(31, XO1=39)
+ lvebx = XV(31, XO1=7)
+ # store
+ stvx = XV(31, XO1=231)
+ stvewx = XV(31, XO1=199)
+ stvehx = XV(31, XO1=167)
+ stvebx = XV(31, XO1=135)
+
+ # arith
+ vaddudm = VX(4, XO8=192)
+ vadduwm = VX(4, XO8=128)
+ vadduhm = VX(4, XO8=64)
+ vaddubm = VX(4, XO8=0)
+
+ vsubudm = VX(4, XO8=1216)
+ vsubuwm = VX(4, XO8=1152)
+ vsubuhm = VX(4, XO8=1088)
+ vsububm = VX(4, XO8=1024)
+
+ # logic
+ vand = VX(4, XO8=1028)
+ vor = VX(4, XO8=1156)
+ veqv = VX(4, XO8=1668)
+ vxor = VX(4, XO8=1220)
+ vnor = VX(4, XO8=1284)
+
+    # vector move register is an alias of vector or
+ vmr = vor
+ # complement is equivalent to vnor
+ vnot = vnor
+
+ # shift, perm and select
+ lvsl = XV(31, XO1=6)
+ lvsr = XV(31, XO1=38)
+ vperm = VA(4, XO10=43)
+ vsel = VA(4, XO10=42)
+    vspltisb = VXI(4, XO8=780)
+    vspltish = VXI(4, XO8=844)
+    vspltisw = VXI(4, XO8=908)
+
+ VX_splat = Form("ivrT", "ivrB", "ivrA", "XO8")
+ vspltb = VX_splat(4, XO8=524)
+ vsplth = VX_splat(4, XO8=588)
+ vspltw = VX_splat(4, XO8=652)
+
class PPCAssembler(BasicPPCAssembler):
BA = BasicPPCAssembler
diff --git a/rpython/jit/backend/ppc/condition.py b/rpython/jit/backend/ppc/condition.py
--- a/rpython/jit/backend/ppc/condition.py
+++ b/rpython/jit/backend/ppc/condition.py
@@ -6,6 +6,10 @@
GE = 5
SO = 6
NS = 7
+VEQ = 8
+VEQI = 9
+VNE = 10
+VNEI = 11
cond_none = -1 # invalid
def negate(cond):
@@ -19,6 +23,8 @@
assert negate(GE) == LT
assert negate(SO) == NS
assert negate(NS) == SO
+assert negate(VEQ) == VEQI
+assert negate(VNE) == VNEI
encoding = [
(2, 12), # EQ
@@ -29,4 +35,8 @@
(0, 4), # GE
(3, 12), # SO
(3, 4), # NS
+ (24, 12), # VEQ
+ (24, 4), # VEQI
+ (26, 12), # VNE
+ (26, 4), # VNEI
]
diff --git a/rpython/jit/backend/ppc/detect_feature.py b/rpython/jit/backend/ppc/detect_feature.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/ppc/detect_feature.py
@@ -0,0 +1,46 @@
+import os
+import sys
+import struct
+import platform
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.rtyper.tool import rffi_platform
+from rpython.rlib.rmmap import alloc, free
+from rpython.rlib.rstruct.runpack import runpack
+
+AT_HWCAP = rffi_platform.getconstantinteger('AT_HWCAP', '#include "linux/auxvec.h"')
+AT_NULL = rffi_platform.getconstantinteger('AT_NULL', '#include "linux/auxvec.h"')
+PPC_FEATURE_HAS_ALTIVEC = rffi_platform.getconstantinteger('PPC_FEATURE_HAS_ALTIVEC',
+    '#include "asm/cputable.h"')
+SYSTEM = platform.system()
+
+def detect_vsx_linux():
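+    # /proc/self/auxv is a sequence of (key, value) pairs of machine
+    # words; scan it for AT_HWCAP and test the AltiVec capability bit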
+ try:
+ fd = os.open("/proc/self/auxv", os.O_RDONLY, 0644)
+ try:
+ while True:
+ buf = os.read(fd, 8)
+ buf2 = os.read(fd, 8)
+ if not buf or not buf2:
+ break
+ key = runpack("L", buf)
+ value = runpack("L", buf2)
+ if key == AT_HWCAP:
+ if value & PPC_FEATURE_HAS_ALTIVEC:
+ return True
+ if key == AT_NULL:
+ return False
+ finally:
+ os.close(fd)
+ except OSError:
+ pass
+ return False
+
+def detect_vsx():
+ if SYSTEM == 'Linux':
+ return detect_vsx_linux()
+ return False
+
+if __name__ == '__main__':
+ print 'The following extensions are supported:'
+ if detect_vsx():
+ print ' - AltiVec'
diff --git a/rpython/jit/backend/ppc/locations.py b/rpython/jit/backend/ppc/locations.py
--- a/rpython/jit/backend/ppc/locations.py
+++ b/rpython/jit/backend/ppc/locations.py
@@ -1,4 +1,4 @@
-from rpython.jit.metainterp.history import INT, FLOAT
+from rpython.jit.metainterp.history import INT, FLOAT, VECTOR
import sys
# cannot import from arch.py, currently we have a circular import
@@ -30,6 +30,9 @@
def is_fp_reg(self):
return False
+ def is_vector_reg(self):
+ return False
+
def is_imm_float(self):
return False
@@ -75,6 +78,27 @@
def as_key(self):
return self.value + 100
+class VectorRegisterLocation(AssemblerLocation):
+ _immutable_ = True
+ width = WORD * 2
+ type = VECTOR
+
+ def __init__(self, value):
+ self.value = value
+
+ def __repr__(self):
+ return 'vr%d' % self.value
+
+ def is_reg(self):
+ return True
+
+ def as_key(self):
+ return self.value + 132
+
+ def is_vector_reg(self):
+ return True
+
+
class ImmLocation(AssemblerLocation):
_immutable_ = True
width = WORD
@@ -129,9 +153,6 @@
def __repr__(self):
return 'FP(%s)+%d' % (self.type, self.value)
- def location_code(self):
- return 'b'
-
def get_position(self):
return self.position
diff --git a/rpython/jit/backend/ppc/opassembler.py b/rpython/jit/backend/ppc/opassembler.py
--- a/rpython/jit/backend/ppc/opassembler.py
+++ b/rpython/jit/backend/ppc/opassembler.py
@@ -29,7 +29,7 @@
from rpython.jit.codewriter.effectinfo import EffectInfo
from rpython.jit.backend.ppc import callbuilder
from rpython.rlib.rarithmetic import r_uint
-from rpython.rlib.rjitlog import rjitlog as jl
+from rpython.jit.backend.ppc.vector_ext import VectorAssembler
class IntOpAssembler(object):
@@ -1329,7 +1329,8 @@
MiscOpAssembler, FieldOpAssembler,
StrOpAssembler, CallOpAssembler,
UnicodeOpAssembler, ForceOpAssembler,
- AllocOpAssembler, FloatOpAssembler):
+ AllocOpAssembler, FloatOpAssembler,
+ VectorAssembler):
_mixin_ = True
def nop(self):
diff --git a/rpython/jit/backend/ppc/ppc_assembler.py b/rpython/jit/backend/ppc/ppc_assembler.py
--- a/rpython/jit/backend/ppc/ppc_assembler.py
+++ b/rpython/jit/backend/ppc/ppc_assembler.py
@@ -14,6 +14,7 @@
from rpython.jit.backend.ppc.helper.regalloc import _check_imm_arg
import rpython.jit.backend.ppc.register as r
import rpython.jit.backend.ppc.condition as c
+from rpython.jit.metainterp.compile import ResumeGuardDescr
from rpython.jit.backend.ppc.register import JITFRAME_FIXED_SIZE
from rpython.jit.metainterp.history import AbstractFailDescr
from rpython.jit.backend.llsupport import jitframe, rewrite
@@ -37,6 +38,7 @@
from rpython.rlib.objectmodel import compute_unique_id
from rpython.rlib.rarithmetic import r_uint
from rpython.rlib.rjitlog import rjitlog as jl
+from rpython.jit.backend.ppc.jump import remap_frame_layout_mixed
memcpy_fn = rffi.llexternal('memcpy', [llmemory.Address, llmemory.Address,
rffi.SIZE_T], lltype.Void,
@@ -769,7 +771,7 @@
self.update_frame_depth(frame_depth_no_fixed_size +
JITFRAME_FIXED_SIZE)
#
size_excluding_failure_stuff = self.mc.get_relative_pos()
- self.write_pending_failure_recoveries()
+ self.write_pending_failure_recoveries(regalloc)
full_size = self.mc.get_relative_pos()
#
self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE)
@@ -813,8 +815,10 @@
# name = "Loop # %s: %s" % (looptoken.number, loopname)
# self.cpu.profile_agent.native_code_written(name,
# rawstart, full_size)
return AsmInfo(ops_offset, rawstart + looppos,
- size_excluding_failure_stuff - looppos)
+                       size_excluding_failure_stuff - looppos, rawstart + looppos)
def _assemble(self, regalloc, inputargs, operations):
self._regalloc = regalloc
@@ -855,10 +859,12 @@
self.reserve_gcref_table(allgcrefs)
startpos = self.mc.get_relative_pos()
+ self._update_at_exit(arglocs, inputargs, faildescr, regalloc)
+
self._check_frame_depth(self.mc, regalloc.get_gcmap())
frame_depth_no_fixed_size = self._assemble(regalloc, inputargs,
operations)
codeendpos = self.mc.get_relative_pos()
- self.write_pending_failure_recoveries()
+ self.write_pending_failure_recoveries(regalloc)
fullsize = self.mc.get_relative_pos()
#
self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE)
@@ -886,7 +892,8 @@
self.fixup_target_tokens(rawstart)
self.update_frame_depth(frame_depth)
self.teardown()
- return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos)
+ return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos,
+ startpos + rawstart)
def reserve_gcref_table(self, allgcrefs):
# allocate the gc table right now. We write absolute loads in
@@ -940,7 +947,7 @@
ofs = self.cpu.get_ofs_of_frame_field('jf_gcmap')
mc.store(r.SCRATCH.value, r.SPP.value, ofs)
- def break_long_loop(self):
+ def break_long_loop(self, regalloc):
# If the loop is too long, the guards in it will jump forward
# more than 32 KB. We use an approximate hack to know if we
# should break the loop here with an unconditional "b" that
@@ -948,15 +955,20 @@
jmp_pos = self.mc.currpos()
self.mc.trap()
- self.write_pending_failure_recoveries()
+ self.write_pending_failure_recoveries(regalloc)
currpos = self.mc.currpos()
pmc = OverwritingBuilder(self.mc, jmp_pos, 1)
pmc.b(currpos - jmp_pos)
pmc.overwrite()
- def generate_quick_failure(self, guardtok):
+ def generate_quick_failure(self, guardtok, regalloc):
startpos = self.mc.currpos()
+ # accum vecopt
+ self._update_at_exit(guardtok.fail_locs, guardtok.failargs,
+ guardtok.faildescr, regalloc)
+ pos = self.mc.currpos()
+ guardtok.rel_recovery_prefix = pos - startpos
faildescrindex, target = self.store_info_on_descr(startpos, guardtok)
assert target != 0
self.mc.load_imm(r.r2, target)
@@ -969,13 +981,13 @@
self.mc.trap()
return startpos
- def write_pending_failure_recoveries(self):
+ def write_pending_failure_recoveries(self, regalloc):
# for each pending guard, generate the code of the recovery stub
# at the end of self.mc.
for i in range(self.pending_guard_tokens_recovered,
len(self.pending_guard_tokens)):
tok = self.pending_guard_tokens[i]
- tok.pos_recovery_stub = self.generate_quick_failure(tok)
+ tok.pos_recovery_stub = self.generate_quick_failure(tok, regalloc)
self.pending_guard_tokens_recovered = len(self.pending_guard_tokens)
def patch_pending_failure_recoveries(self, rawstart):
@@ -986,7 +998,7 @@
addr = rawstart + tok.pos_jump_offset
#
# XXX see patch_jump_for_descr()
- tok.faildescr.adr_jump_offset = rawstart + tok.pos_recovery_stub
+        tok.faildescr.adr_jump_offset = rawstart + tok.pos_recovery_stub + tok.rel_recovery_prefix
#
relative_target = tok.pos_recovery_stub - tok.pos_jump_offset
#
@@ -1058,6 +1070,10 @@
self.mc.lfd(reg, r.SPP.value, offset)
return
assert 0, "not supported location"
+ elif prev_loc.is_vector_reg():
+ assert loc.is_vector_reg()
+ self.mc.vmr(loc.value, prev_loc.value, prev_loc.value)
+ return
elif prev_loc.is_reg():
reg = prev_loc.value
# move to another register
@@ -1363,6 +1379,62 @@
self.mc.load_imm(r.SCRATCH, fail_index)
self.mc.store(r.SCRATCH.value, r.SPP.value, FORCE_INDEX_OFS)
+ def stitch_bridge(self, faildescr, target):
+ """ Stitching means that one can enter a bridge with a complete
different register
+ allocation. This needs remapping which is done here for both
normal registers
+ and accumulation registers.
+ """
+ asminfo, bridge_faildescr, version, looptoken = target
+ assert isinstance(bridge_faildescr, ResumeGuardDescr)
+ assert isinstance(faildescr, ResumeGuardDescr)
+ assert asminfo.rawstart != 0
+ self.mc = PPCBuilder()
+ allblocks = self.get_asmmemmgr_blocks(looptoken)
+ self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
+ allblocks)
+ frame_info = self.datablockwrapper.malloc_aligned(
+ jitframe.JITFRAMEINFO_SIZE, alignment=WORD)
+
+ # if accumulation is saved at the guard, we need to update it here!
+        guard_locs = self.rebuild_faillocs_from_descr(faildescr, version.inputargs)
+        bridge_locs = self.rebuild_faillocs_from_descr(bridge_faildescr, version.inputargs)
+ guard_accum_info = faildescr.rd_vector_info
+ # O(n**2), but usually you only have at most 1 fail argument
+ while guard_accum_info:
+ bridge_accum_info = bridge_faildescr.rd_vector_info
+ while bridge_accum_info:
+            if bridge_accum_info.failargs_pos == guard_accum_info.failargs_pos:
+                # the mapping might be wrong!
+                if bridge_accum_info.location is not guard_accum_info.location:
+                    self.regalloc_mov(guard_accum_info.location, bridge_accum_info.location)
+ bridge_accum_info = bridge_accum_info.next()
+ guard_accum_info = guard_accum_info.next()
+
+ # register mapping is most likely NOT valid, thus remap it
+ src_locations1 = []
+ dst_locations1 = []
+ src_locations2 = []
+ dst_locations2 = []
+
+ # Build the four lists
+ assert len(guard_locs) == len(bridge_locs)
+ for i,src_loc in enumerate(guard_locs):
+ dst_loc = bridge_locs[i]
+ if not src_loc.is_fp_reg():
+ src_locations1.append(src_loc)
+ dst_locations1.append(dst_loc)
+ else:
+ src_locations2.append(src_loc)
+ dst_locations2.append(dst_loc)
+        remap_frame_layout_mixed(self, src_locations1, dst_locations1, r.SCRATCH,
+                                 src_locations2, dst_locations2, r.FP_SCRATCH)
+
+ offset = self.mc.get_relative_pos()
+ self.mc.b_abs(asminfo.rawstart)
+
+ rawstart = self.materialize_loop(looptoken)
+ # update the guard to jump right to this custom piece of assembler
+ self.patch_jump_for_descr(faildescr, rawstart)
def notimplemented_op(self, op, arglocs, regalloc):
msg = '[PPC/asm] %s not implemented\n' % op.getopname()
diff --git a/rpython/jit/backend/ppc/ppc_field.py b/rpython/jit/backend/ppc/ppc_field.py
--- a/rpython/jit/backend/ppc/ppc_field.py
+++ b/rpython/jit/backend/ppc/ppc_field.py
@@ -43,6 +43,17 @@
"spr": (11, 20),
"TO": ( 6, 10),
"UIMM": (16, 31),
+ "fvrT": (6, 31, 'unsigned', regname._V, 'overlap'),
+ "fvrA": (11, 29, 'unsigned', regname._V, 'overlap'),
+ "fvrB": (16, 30, 'unsigned', regname._V, 'overlap'),
+    # low vector register T (low in the sense that it
+    # can only address 32 vector registers)
+ "ivrT": (6, 10, 'unsigned', regname._V),
+ # low vector register A
+ "ivrA": (11, 15, 'unsigned', regname._V),
+ # low vector register B
+ "ivrB": (16, 20, 'unsigned', regname._V),
+ "ivrC": (21, 25, 'unsigned', regname._V),
"XO1": (21, 30),
"XO2": (22, 30),
"XO3": (26, 30),
@@ -50,7 +61,15 @@
"XO5": (27, 29),
"XO6": (21, 29),
"XO7": (27, 30),
+ "XO8": (21, 31),
+ "XO9": (21, 28),
+ "XO10": (26, 31),
+ "XO11": (22, 28),
+ "XO12": (22, 31),
+ "XO13": (24, 28),
+ "DM": (22, 23),
"LL": ( 9, 10),
+ "SIM": (11, 15),
}
@@ -100,7 +119,6 @@
def decode(self, inst):
value = super(sh, self).decode(inst)
return (value & 32) << 5 | (value >> 10 & 31)
-# other special fields?
ppc_fields = {
"LI": IField("LI", *fields["LI"]),
diff --git a/rpython/jit/backend/ppc/rassemblermaker.py b/rpython/jit/backend/ppc/rassemblermaker.py
--- a/rpython/jit/backend/ppc/rassemblermaker.py
+++ b/rpython/jit/backend/ppc/rassemblermaker.py
@@ -46,6 +46,15 @@
elif field.name == 'sh':
            body.append('sh1 = (%s & 31) << 10 | (%s & 32) >> 5' % (value, value))
            value = 'sh1'
+        elif field.name == 'fvrT':
+            body.append('vrT1 = (%s & 31) << 21 | (%s & 32) >> 5' % (value, value))
+            value = 'vrT1'
+        elif field.name == 'fvrA':
+            body.append('fvrA1 = (%s & 31) << 14 | (%s & 32) >> 5' % (value, value))
+            value = 'fvrA1'
+        elif field.name == 'fvrB':
+            body.append('fvrB1 = (%s & 31) << 10 | (%s & 32) >> 5' % (value, value))
+            value = 'fvrB1'
        if isinstance(field, IField):
            body.append('v |= ((%3s >> 2) & r_uint(%#05x)) << 2' % (value, field.mask))
        else:
else:
diff --git a/rpython/jit/backend/ppc/regalloc.py b/rpython/jit/backend/ppc/regalloc.py
--- a/rpython/jit/backend/ppc/regalloc.py
+++ b/rpython/jit/backend/ppc/regalloc.py
@@ -10,7 +10,8 @@
from rpython.jit.backend.ppc.helper.regalloc import _check_imm_arg, check_imm_box
from rpython.jit.backend.ppc.helper import regalloc as helper
from rpython.jit.metainterp.history import (Const, ConstInt, ConstFloat, ConstPtr,
- INT, REF, FLOAT, VOID)
+ INT, REF, FLOAT, VOID, VECTOR,
+ AbstractFailDescr)
from rpython.jit.metainterp.history import JitCellToken, TargetToken
from rpython.jit.metainterp.resoperation import rop
from rpython.jit.backend.ppc import locations
@@ -27,6 +28,7 @@
from rpython.jit.codewriter.effectinfo import EffectInfo
from rpython.rlib import rgc
from rpython.rlib.rarithmetic import r_uint
+from rpython.jit.backend.ppc.vector_ext import VectorRegalloc
LIMIT_LOOP_BREAK = 15000 # should be much smaller than 32 KB
@@ -49,6 +51,11 @@
def __repr__(self):
return "<TempFloat at %s>" % (id(self),)
+class TempVector(TempVar):
+ type = VECTOR
+
+ def __repr__(self):
+ return "<TempVector at %s>" % (id(self),)
class FPRegisterManager(RegisterManager):
all_regs = r.MANAGED_FP_REGS
@@ -135,6 +142,26 @@
self.temp_boxes.append(box)
return reg
+class VectorRegisterManager(RegisterManager):
+ all_regs = r.MANAGED_FLOAT_VECTOR_REGS
+ box_types = [FLOAT, INT]
+ save_around_call_regs = [] # ??? lookup the ABI
+ assert set(save_around_call_regs).issubset(all_regs)
+
+ def __init__(self, longevity, frame_manager=None, assembler=None):
+ RegisterManager.__init__(self, longevity, frame_manager, assembler)
+
+ def ensure_reg(self, box):
+ raise NotImplementedError
+
+ def get_scratch_reg(self, type=INT):
+ if type == FLOAT:
+ box = TempFloat()
+ else:
+ box = TempInt()
+ reg = self.force_allocate_reg(box, forbidden_vars=self.temp_boxes)
+ self.temp_boxes.append(box)
+ return reg
class PPCFrameManager(FrameManager):
def __init__(self, base_ofs):
@@ -155,8 +182,7 @@
assert isinstance(loc, locations.StackLocation)
return loc.position
-
-class Regalloc(BaseRegalloc):
+class Regalloc(BaseRegalloc, VectorRegalloc):
def __init__(self, assembler=None):
self.cpu = assembler.cpu
@@ -180,6 +206,8 @@
assembler = self.assembler)
self.fprm = FPRegisterManager(self.longevity, frame_manager = self.fm,
assembler = self.assembler)
+        self.vrm = VectorRegisterManager(self.longevity, frame_manager = self.fm,
+                                         assembler = self.assembler)
return operations
def prepare_loop(self, inputargs, operations, looptoken, allgcrefs):
@@ -241,7 +269,10 @@
def possibly_free_var(self, var):
if var is not None:
- if var.type == FLOAT:
+ if var.is_vector():
+ if var.type != VOID:
+ self.vrm.possibly_free_var(var)
+ elif var.type == FLOAT:
self.fprm.possibly_free_var(var)
else:
self.rm.possibly_free_var(var)
@@ -287,6 +318,7 @@
self.assembler.mc.mark_op(op)
self.rm.position = i
self.fprm.position = i
+ self.vrm.position = i
opnum = op.opnum
if rop.has_no_side_effect(opnum) and op not in self.longevity:
i += 1
@@ -295,7 +327,10 @@
#
for j in range(op.numargs()):
box = op.getarg(j)
- if box.type != FLOAT:
+ if box.is_vector():
+ if box.type != VOID:
+ self.vrm.temp_boxes.append(box)
+ elif box.type != FLOAT:
self.rm.temp_boxes.append(box)
else:
self.fprm.temp_boxes.append(box)
@@ -309,8 +344,9 @@
self.possibly_free_var(op)
self.rm._check_invariants()
self.fprm._check_invariants()
+ self.vrm._check_invariants()
if self.assembler.mc.get_relative_pos() > self.limit_loop_break:
- self.assembler.break_long_loop()
+ self.assembler.break_long_loop(self)
self.limit_loop_break = (self.assembler.mc.get_relative_pos() + LIMIT_LOOP_BREAK)
i += 1
@@ -351,10 +387,13 @@
return gcmap
def loc(self, var):
- if var.type == FLOAT:
- return self.fprm.loc(var)
+ if var.is_vector():
+ return self.vrm.loc(var)
else:
- return self.rm.loc(var)
+ if var.type == FLOAT:
+ return self.fprm.loc(var)
+ else:
+ return self.rm.loc(var)
def next_instruction(self):
self.rm.next_instruction()
@@ -419,6 +458,7 @@
# temporary boxes and all the current operation's arguments
self.rm.free_temp_vars()
self.fprm.free_temp_vars()
+ self.vrm.free_temp_vars()
# ******************************************************
# * P R E P A R E O P E R A T I O N S *
@@ -550,7 +590,19 @@
#
# generate_quick_failure() produces up to 14 instructions per guard
self.limit_loop_break -= 14 * 4
- #
+ # specifically for vecopt
+ descr = op.getdescr()
+ if not descr:
+ return args
+ assert isinstance(descr, AbstractFailDescr)
+ if descr.rd_vector_info:
+ accuminfo = descr.rd_vector_info
+ while accuminfo:
+ i = accuminfo.getpos_in_failargs()+1
+ accuminfo.location = args[i]
+ loc = self.loc(accuminfo.getoriginal())
+ args[i] = loc
+ accuminfo = accuminfo.next()
return args
def load_condition_into_cc(self, box):
diff --git a/rpython/jit/backend/ppc/register.py b/rpython/jit/backend/ppc/register.py
--- a/rpython/jit/backend/ppc/register.py
+++ b/rpython/jit/backend/ppc/register.py
@@ -1,8 +1,10 @@
from rpython.jit.backend.ppc.locations import (RegisterLocation,
- FPRegisterLocation)
+ FPRegisterLocation, VectorRegisterLocation)
ALL_REGS = [RegisterLocation(i) for i in range(32)]
ALL_FLOAT_REGS = [FPRegisterLocation(i) for i in range(32)]
+ALL_INTEGER_VECTOR_REGS = [VectorRegisterLocation(i) for i in range(32)]
+ALL_FLOAT_VECTOR_REGS = [VectorRegisterLocation(i) for i in range(32,64)]
r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15, r16,\
r17, r18, r19, r20, r21, r22, r23, r24, r25, r26, r27, r28, r29, r30, r31\
@@ -12,6 +14,19 @@
f17, f18, f19, f20, f21, f22, f23, f24, f25, f26, f27, f28, f29, f30, f31\
= ALL_FLOAT_REGS
+ivr0, ivr1, ivr2, ivr3, ivr4, ivr5, ivr6, ivr7, ivr8, ivr9, ivr10, ivr11, ivr12,\
+    ivr13, ivr14, ivr15, ivr16, ivr17, ivr18, ivr19, ivr20, ivr21, ivr22, ivr23,\
+    ivr24, ivr25, ivr26, ivr27, ivr28, ivr29, ivr30, ivr31\
+    = ALL_INTEGER_VECTOR_REGS
+
+# the first 32 vector registers are partly shared with the normal floating point
+# registers; since there are so many registers, we just take the upper 32 ones
+vr32, vr33, vr34, vr35, vr36, vr37, \
+ vr38, vr39, vr40, vr41, vr42, vr43, vr44, vr45, vr46, vr47, vr48, \
+ vr49, vr50, vr51, vr52, vr53, vr54, vr55, vr56, vr57, vr58, vr59, vr60, \
+ vr61, vr62, vr63 = ALL_FLOAT_VECTOR_REGS
+
+
NONVOLATILES = [r14, r15, r16, r17, r18, r19, r20, r21, r22, r23,
r24, r25, r26, r27, r28, r29, r30, r31]
VOLATILES = [r3, r4, r5, r6, r7, r8, r9, r10, r11, r12]
@@ -43,6 +58,9 @@
MANAGED_FP_REGS = VOLATILES_FLOAT #+ NONVOLATILES_FLOAT
+MANAGED_FLOAT_VECTOR_REGS = ALL_FLOAT_VECTOR_REGS
+MANAGED_INTEGER_VECTOR_REGS = ALL_INTEGER_VECTOR_REGS
+
assert RCS1 in MANAGED_REGS and RCS1 in NONVOLATILES
assert RCS2 in MANAGED_REGS and RCS2 in NONVOLATILES
assert RCS3 in MANAGED_REGS and RCS3 in NONVOLATILES
diff --git a/rpython/jit/backend/ppc/regname.py b/rpython/jit/backend/ppc/regname.py
--- a/rpython/jit/backend/ppc/regname.py
+++ b/rpython/jit/backend/ppc/regname.py
@@ -6,6 +6,10 @@
def __repr__(self):
return "fr%s"%(super(_F, self).__repr__(),)
__str__ = __repr__
+class _V(int):
+ def __repr__(self):
+ return "vr%s"%(super(_V, self).__repr__(),)
+ __str__ = __repr__
r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, \
r13, r14, r15, r16, r17, r18, r19, r20, r21, r22, \
@@ -15,4 +19,11 @@
fr13, fr14, fr15, fr16, fr17, fr18, fr19, fr20, fr21, fr22, \
fr23, fr24, fr25, fr26, fr27, fr28, fr29, fr30, fr31 = map(_F, range(32))
+vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11, vr12, vr13, \
+ vr14, vr15, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, vr24, vr25, \
+ vr26, vr27, vr28, vr29, vr30, vr31, vr32, vr33, vr34, vr35, vr36, vr37, \
+ vr38, vr39, vr40, vr41, vr42, vr43, vr44, vr45, vr46, vr47, vr48, \
+ vr49, vr50, vr51, vr52, vr53, vr54, vr55, vr56, vr57, vr58, vr59, vr60, \
+ vr61, vr62, vr63 = map(_V, range(64))
+
crf0, crf1, crf2, crf3, crf4, crf5, crf6, crf7 = range(8)
diff --git a/rpython/jit/backend/ppc/runner.py b/rpython/jit/backend/ppc/runner.py
--- a/rpython/jit/backend/ppc/runner.py
+++ b/rpython/jit/backend/ppc/runner.py
@@ -3,13 +3,15 @@
from rpython.rlib import rgc
from rpython.rlib.jit_hooks import LOOP_RUN_CONTAINER
from rpython.jit.backend.llsupport.llmodel import AbstractLLCPU
+from rpython.jit.backend.ppc.vector_ext import AltiVectorExt
from rpython.jit.backend.ppc.ppc_assembler import AssemblerPPC
from rpython.jit.backend.ppc.arch import WORD
from rpython.jit.backend.ppc.codebuilder import PPCBuilder
from rpython.jit.backend.ppc import register as r
+class PPC_CPU(AbstractLLCPU):
-class PPC_CPU(AbstractLLCPU):
+ vector_ext = AltiVectorExt()
supports_floats = True
# missing: supports_singlefloats
diff --git a/rpython/jit/backend/ppc/test/test_ppcvector.py b/rpython/jit/backend/ppc/test/test_ppcvector.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/ppc/test/test_ppcvector.py
@@ -0,0 +1,26 @@
+import py
+from rpython.jit.backend.ppc.test import test_basic
+from rpython.jit.metainterp.test import test_vector
+from rpython.jit.backend.ppc.detect_feature import detect_vsx
+
+
+class TestBasic(test_basic.JitPPCMixin, test_vector.VectorizeTests):
+ # for the individual tests see
+ # ====> ../../../metainterp/test/test_basic.py
+ def setup_method(self, method):
+ clazz = self.CPUClass
+ def init(*args, **kwargs):
+ cpu = clazz(*args, **kwargs)
+ # > 95% can be executed, thus let's cheat here a little
+ cpu.supports_guard_gc_type = True
+ return cpu
+ self.CPUClass = init
+
+ def supports_vector_ext(self):
+ return detect_vsx()
+
+ def test_list_vectorize(self):
+ pass # needs support_guard_gc_type, disable for now
+
+    enable_opts = 'intbounds:rewrite:virtualize:string:earlyforce:pure:heap:unroll'
+
diff --git a/rpython/jit/backend/ppc/test/test_vector_instr.py b/rpython/jit/backend/ppc/test/test_vector_instr.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/ppc/test/test_vector_instr.py
@@ -0,0 +1,95 @@
+import py
+from rpython.rtyper.lltypesystem import lltype, rffi
+
+from rpython.jit.backend.ppc.codebuilder import BasicPPCAssembler, PPCBuilder
+from rpython.jit.backend.ppc.regname import *
+from rpython.jit.backend.ppc.register import *
+from rpython.jit.backend.ppc import form
+from rpython.jit.backend import detect_cpu
+from rpython.jit.backend.ppc.arch import IS_PPC_32, IS_PPC_64, IS_BIG_ENDIAN
+from rpython.jit.backend.ppc.arch import WORD
+
+cpu = detect_cpu.autodetect()
+
+signed = lltype.Signed
+unsigned = lltype.Unsigned
+char = lltype.Char
+
+def vec_asmtest(memory=[]):
+ def testmaker(test):
+ def newtest(self):
+ memory_ptrs = []
+ a = PPCBuilder()
+ for (bytes, type, values) in memory:
+ # alloc
+ adr = lltype.malloc(rffi.CArray(char), bytes, flavor="raw")
+ memory_ptrs.append(adr)
+ address = adr
+ for i,value in enumerate(values):
+                    rffi.cast(rffi.CArrayPtr(type), adr)[i] = rffi.cast(type, value)
+
+            expected = test(self, a, *[rffi.cast(lltype.Signed, m) for m in memory_ptrs])
+ f = a.get_assembler_function()
+ f()
+ for expect, type, ptr in expected:
+ value = rffi.cast(rffi.CArrayPtr(type), ptr)[0]
+ assert value == expect
+
+ while memory_ptrs:
+ ptr = memory_ptrs.pop()
+ lltype.free(ptr, flavor="raw")
+ return newtest
+ return testmaker
+
+
+class TestAssemble(object):
+    """
+    See how tests are built in test_ppc.py.
+
+    Instead of asmtest, use vec_asmtest. It takes a 'memory' parameter:
+    a list of (bytes, type, values) chunks that are allocated as raw
+    memory, whose addresses are passed to the test as extra arguments:
+
+    @vec_asmtest(memory=[(16, signed, [1, 2]), (32, signed, [1, 2, 3, 4])])
+    def test(self, builder, m, m2):
+        # ...
+        return [ (value, type, ptr), ... ]
+    """
+ def setup_class(cls):
+ if cpu not in ["ppc", "ppc64", "ppc-64"]:
+ py.test.skip("can't test all of ppcgen on non-PPC!")
+
+ @vec_asmtest(memory=[(16, signed, [0,0])])
+ def test_unaligned_load(self, a, mem):
+ a.load_imm(r15, mem)
+ a.lxvd2x(0, 0, r15.value)
+ a.blr()
+ return [ (0, signed, mem), (0, signed, mem+8) ]
+
+ @vec_asmtest(memory=[(16, signed, [1,2]), (16, signed, [0,0])])
+ def test_unaligned_load_and_store(self, a, mem_l, mem_t):
+ a.load_imm(r15, mem_l)
+ a.load_imm(r14, mem_t)
+ a.lxvd2x(0, 0, r15.value)
+ a.stxvd2x(0, 0, r14.value)
+ a.blr()
+ return [ (1, signed, mem_t), (2, signed, mem_t+8) ]
+
+ def test_xx3_instr(self):
+ a = PPCBuilder()
+ def assign_to_self(v):
+ self.last_value = v
+ a.emit = assign_to_self
+
+        a.xxspltdl(32, 32, 32)
+        #                  tttttaaaaabbbbb          abt
+        assert hex(int(self.last_value)) == hex(0b11110000000000000000000001010111)
+        a.xxspltdl(32, 2, 2)
+        #                  tttttaaaaabbbbb          abt
+        assert hex(int(self.last_value)) == hex(0b11110000000000100001000001010001)
+        a.xxspltdl(0, 63, 0)
+        #                  tttttaaaaabbbbb          abt
+        assert hex(int(self.last_value)) == hex(0b11110000000111110000000001010100)
+        a.xxspltdl(0, 0, 63)
+        #                  tttttaaaaabbbbb          abt
+        assert hex(int(self.last_value)) == hex(0b11110000000000001111100001010010)
+
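
The four assertions above pin down the exact machine words xxspltdl must
emit. They are consistent with an XX3-form VSX encoding: primary opcode 60,
a zero DM field, extended opcode 0b01010 (as in xxpermdi), with the high
bit of each 6-bit VSX register number split off into the trailing AX/BX/TX
bits. A sketch that reproduces the expected words (editor's illustration;
xx3_encode is a hypothetical helper, not a pypy API):

    def xx3_encode(T, A, B, DM=0, XO=0b01010, opcode=60):
        # XX3-form layout: opcode(6) T(5) A(5) B(5) 0 DM(2) XO(5) AX BX TX
        # the sixth (high) bit of each register number moves to AX/BX/TX
        return ((opcode << 26) | ((T & 31) << 21) | ((A & 31) << 16)
                | ((B & 31) << 11) | ((DM & 3) << 8) | (XO << 3)
                | ((A >> 5) << 2) | ((B >> 5) << 1) | (T >> 5))

    assert hex(xx3_encode(32, 32, 32)) == hex(0b11110000000000000000000001010111)
    assert hex(xx3_encode(32, 2, 2))   == hex(0b11110000000000100001000001010001)
    assert hex(xx3_encode(0, 63, 0))   == hex(0b11110000000111110000000001010100)
    assert hex(xx3_encode(0, 0, 63))   == hex(0b11110000000000001111100001010010)
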
diff --git a/rpython/jit/backend/ppc/vector_ext.py b/rpython/jit/backend/ppc/vector_ext.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -0,0 +1,871 @@
+import py
+from rpython.jit.metainterp.compile import ResumeGuardDescr
+from rpython.jit.metainterp.history import (ConstInt, INT, REF,
+ FLOAT, VECTOR, TargetToken)
+from rpython.jit.backend.llsupport.descr import (ArrayDescr, CallDescr,
+ unpack_arraydescr, unpack_fielddescr, unpack_interiorfielddescr)
+from rpython.jit.backend.llsupport.regalloc import get_scale
+from rpython.jit.metainterp.resoperation import (rop, ResOperation,
+ VectorOp, VectorGuardOp)
+from rpython.rlib.objectmodel import we_are_translated
+from rpython.rtyper.lltypesystem.lloperation import llop
+from rpython.rtyper.lltypesystem import lltype
+from rpython.jit.backend.ppc.locations import imm, RegisterLocation
+from rpython.jit.backend.ppc.arch import IS_BIG_ENDIAN
+from rpython.jit.backend.llsupport.vector_ext import VectorExt
+from rpython.jit.backend.ppc.arch import PARAM_SAVE_AREA_OFFSET, WORD
+import rpython.jit.backend.ppc.register as r
+import rpython.jit.backend.ppc.condition as c
+import rpython.jit.backend.ppc.locations as l
+from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
+from rpython.rtyper.lltypesystem import lltype, rffi
+from rpython.jit.codewriter import longlong
+from rpython.jit.backend.ppc.detect_feature import detect_vsx
+from rpython.rlib.objectmodel import always_inline
+
+def not_implemented(msg):
+ msg = '[ppc/vector_ext] %s\n' % msg
+ if we_are_translated():
+ llop.debug_print(lltype.Void, msg)
+ raise NotImplementedError(msg)
+
+@always_inline
+def permi(v1, v2):
+ """ permute immediate for big and little endian """
+ # if v1 == 0 unpacks index 0 of param 1
+ # if v1 == 1 unpacks index 1 of param 1
+ # if v2 == 0 unpacks index 0 of param 2
+ # if v2 == 1 unpacks index 1 of param 2
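+    # e.g. on little endian: permi(0, 0) == 0b11, permi(0, 1) == 0b01,
+    # permi(1, 0) == 0b10, permi(1, 1) == 0b00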
+ mask = 0
+ if IS_BIG_ENDIAN:
+ not_implemented("no big endian support (yet)")
+ else:
+ if v1 == 0: mask |= 0b01
+ if v1 == 1: mask |= 0b00
+ if v2 == 0: mask |= 0b10
+ if v2 == 1: mask |= 0b00
+ return mask
+
+
+def flush_vec_cc(asm, regalloc, condition, size, result_loc):
+ # After emitting an instruction that leaves a boolean result in
+ # a condition code (cc), call this. In the common case, result_loc
+ # will be set to SPP by the regalloc, which in this case means
+ # "propagate it between this operation and the next guard by keeping
+ # it in the cc". In the uncommon case, result_loc is another
+ # register, and we emit a load from the cc into this register.
+
+ # Possibly invert the bit in the CR
+ bit, invert = c.encoding[condition]
+ assert 24 <= bit <= 27
+ if invert == 12:
+ pass
+ elif invert == 4:
+ asm.mc.crnor(bit, bit, bit)
+ else:
+ assert 0
+ assert asm.guard_success_cc == c.cond_none
+ #
+ if result_loc is r.SPP:
+ asm.guard_success_cc = condition
+ else:
+ resval = result_loc.value
+ # either doubleword integer 1 (2x) or word integer 1 (4x)
+ ones = regalloc.vrm.get_scratch_reg(type=INT).value
+ zeros = regalloc.vrm.get_scratch_reg(type=INT).value
+ asm.mc.vxor(zeros, zeros, zeros)
+ if size == 4:
+ asm.mc.vspltisw(ones, 1)
+ else:
+ assert size == 8
+ tloc = regalloc.rm.get_scratch_reg()
+ asm.mc.load_imm(tloc, asm.VEC_DOUBLE_WORD_ONES)
+ asm.mc.lvx(ones, 0, tloc.value)
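+        # vsel(res, zeros, ones, mask): each result bit is taken from
+        # 'ones' where the mask bit is 1 and from 'zeros' where it is 0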
+ asm.mc.vsel(resval, zeros, ones, resval)
+
+class AltiVectorExt(VectorExt):
+ def setup_once(self, asm):
+ if detect_vsx():
+ self.enable(16, accum=True)
+ asm.setup_once_vector()
+ self._setup = True
+
+class VectorAssembler(object):
+ _mixin_ = True
+
+ VEC_DOUBLE_WORD_ONES = 0
+
+ def setup_once_vector(self):
+ if IS_BIG_ENDIAN:
+ # 2x 64 bit signed integer(1) BE
+ data = (b'\x00' * 7 + b'\x01') * 2
+ else:
+ # 2x 64 bit signed integer(1) LE
+ data = (b'\x01' + b'\x00' * 7) * 2
+ datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
+ mem = datablockwrapper.malloc_aligned(len(data), alignment=16)
+ datablockwrapper.done()
+ addr = rffi.cast(rffi.CArrayPtr(lltype.Char), mem)
+ for i in range(len(data)):
+ addr[i] = data[i]
+ self.VEC_DOUBLE_WORD_ONES = mem
+
+ def emit_vec_load_f(self, op, arglocs, regalloc):
+ resloc, baseloc, indexloc, size_loc, ofs, integer_loc = arglocs
+ indexloc = self._apply_offset(indexloc, ofs)
+ itemsize = size_loc.value
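+        # lxvw4x loads 4 x 32-bit words, lxvd2x loads 2 x 64-bit
+        # doublewords; integer vectors always go through lxvd2x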
+ if integer_loc.value:
+ self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
+ elif itemsize == 4:
+ self.mc.lxvw4x(resloc.value, indexloc.value, baseloc.value)
+ elif itemsize == 8:
+ self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
+ else:
+ not_implemented("vec_load_f itemsize %d" % itemsize)
+
+ emit_vec_load_i = emit_vec_load_f
+
+ def emit_vec_store(self, op, arglocs, regalloc):
+ baseloc, indexloc, valueloc, sizeloc, baseofs, \
+ integer_loc = arglocs
+ indexloc = self._apply_offset(indexloc, baseofs)
+ assert baseofs.value == 0
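+        # stxvw4x / stxvd2x mirror the loads: 4 x 32-bit word or
+        # 2 x 64-bit doubleword element stores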
+ if integer_loc.value:
+ self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
+ else:
+ itemsize = sizeloc.value
+ if itemsize == 4:
+ self.mc.stxvw4x(valueloc.value, indexloc.value, baseloc.value)
+ elif itemsize == 8:
+ self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
+ else:
+ not_implemented("vec_store itemsize %d" % itemsize)
+
+ def emit_vec_int_add(self, op, arglocs, regalloc):
+ resloc, loc0, loc1, size_loc = arglocs
+ size = size_loc.value
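+        # vaddu{b,h,w,d}m: modulo add on 16 x 8-bit, 8 x 16-bit,
+        # 4 x 32-bit or 2 x 64-bit integer lanes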
+ if size == 1:
+ self.mc.vaddubm(resloc.value, loc0.value, loc1.value)
+ elif size == 2:
+ self.mc.vadduhm(resloc.value, loc0.value, loc1.value)
+ elif size == 4:
+ self.mc.vadduwm(resloc.value, loc0.value, loc1.value)
+ elif size == 8:
+ self.mc.vaddudm(resloc.value, loc0.value, loc1.value)
+
+ def emit_vec_int_sub(self, op, arglocs, regalloc):
+ resloc, loc0, loc1, size_loc = arglocs
+ size = size_loc.value
+ if size == 1:
+ # TODO verify if unsigned subtract is the wanted feature
+ self.mc.vsububm(resloc.value, loc0.value, loc1.value)
+ elif size == 2:
+ # TODO verify if unsigned subtract is the wanted feature
+ self.mc.vsubuhm(resloc.value, loc0.value, loc1.value)
+ elif size == 4:
+ # TODO verify if unsigned subtract is the wanted feature
+ self.mc.vsubuwm(resloc.value, loc0.value, loc1.value)
+ elif size == 8:
+ self.mc.vsubudm(resloc.value, loc0.value, loc1.value)
+
+ def emit_vec_float_add(self, op, arglocs, regalloc):
+ resloc, loc0, loc1, itemsize_loc = arglocs
+ itemsize = itemsize_loc.value
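+        # the 'sp' form works on 4 x float32 lanes, the 'dp' form on
+        # 2 x float64 lanes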
+ if itemsize == 4:
+ self.mc.xvaddsp(resloc.value, loc0.value, loc1.value)
+ elif itemsize == 8:
+ self.mc.xvadddp(resloc.value, loc0.value, loc1.value)
+
+ def emit_vec_float_sub(self, op, arglocs, regalloc):
+ resloc, loc0, loc1, itemsize_loc = arglocs
+ itemsize = itemsize_loc.value
+ if itemsize == 4:
+ self.mc.xvsubsp(resloc.value, loc0.value, loc1.value)
+ elif itemsize == 8:
+ self.mc.xvsubdp(resloc.value, loc0.value, loc1.value)
+
+ def emit_vec_float_mul(self, op, arglocs, regalloc):
+ resloc, loc0, loc1, itemsize_loc = arglocs
+ itemsize = itemsize_loc.value
+ if itemsize == 4:
+ self.mc.xvmulsp(resloc.value, loc0.value, loc1.value)
+ elif itemsize == 8:
+ self.mc.xvmuldp(resloc.value, loc0.value, loc1.value)
+
+ def emit_vec_float_truediv(self, op, arglocs, regalloc):
+ resloc, loc0, loc1, itemsize_loc = arglocs
+ itemsize = itemsize_loc.value
+ if itemsize == 4:
+ self.mc.xvdivsp(resloc.value, loc0.value, loc1.value)
+ elif itemsize == 8:
+ self.mc.xvdivdp(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_int_mul(self, op, arglocs, regalloc):
+        # TODO: no integer vector multiply on ppc yet
+        raise NotImplementedError
+
+ def emit_vec_int_and(self, op, arglocs, regalloc):