Author: Richard Plangger <[email protected]>
Branch: ppc-vsx-support
Changeset: r85328:7799b9b5dc53
Date: 2016-06-22 16:42 +0200
http://bitbucket.org/pypy/pypy/changeset/7799b9b5dc53/
Log: (ppc) added several new ppc op codes, fixed endian issues for vec
load/store (int only). added more tests for integer add
diff --git a/rpython/jit/backend/ppc/codebuilder.py
b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -614,11 +614,18 @@
# INTEGER
# -------
- # load & store
+ # load
lvx = XV(31, XO1=103)
+ lvewx = XV(31, XO1=71)
+ lvehx = XV(31, XO1=39)
+ lvebx = XV(31, XO1=7)
+ # store
stvx = XV(31, XO1=231)
+ stvewx = XV(31, XO1=199)
+ stvehx = XV(31, XO1=167)
+ stvebx = XV(31, XO1=135)
- # arith & logic
+ # arith
vaddudm = VX(4, XO8=192)
vadduwm = VX(4, XO8=128)
vadduhm = VX(4, XO8=64)
@@ -629,6 +636,14 @@
vsubuhm = VX(4, XO8=1088)
vsububm = VX(4, XO8=1024)
+ # logic
+ vand = VX(4, XO8=1028)
+ vor = VX(4, XO8=1156)
+ vxor = VX(4, XO8=1220)
+ veqv = VX(4, XO8=1668)
+
+ # vector move register is alias to vector or
+ vmr = vor
+
# shift, perm and select
diff --git a/rpython/jit/backend/ppc/locations.py
b/rpython/jit/backend/ppc/locations.py
--- a/rpython/jit/backend/ppc/locations.py
+++ b/rpython/jit/backend/ppc/locations.py
@@ -30,6 +30,9 @@
def is_fp_reg(self):
return False
+ def is_vector_reg(self):
+ return False
+
def is_imm_float(self):
return False
@@ -77,7 +80,7 @@
class VectorRegisterLocation(AssemblerLocation):
_immutable_ = True
- width = WORD
+ width = WORD * 2
type = VECTOR
def __init__(self, value):
@@ -92,6 +95,9 @@
def as_key(self):
return self.value + 132
+ def is_vector_reg(self):
+ return True
+
class ImmLocation(AssemblerLocation):
_immutable_ = True
diff --git a/rpython/jit/backend/ppc/ppc_assembler.py
b/rpython/jit/backend/ppc/ppc_assembler.py
--- a/rpython/jit/backend/ppc/ppc_assembler.py
+++ b/rpython/jit/backend/ppc/ppc_assembler.py
@@ -808,6 +808,8 @@
# name = "Loop # %s: %s" % (looptoken.number, loopname)
# self.cpu.profile_agent.native_code_written(name,
# rawstart, full_size)
+ #print(hex(rawstart))
+ #import pdb; pdb.set_trace()
return AsmInfo(ops_offset, rawstart + looppos,
size_excluding_failure_stuff - looppos)
@@ -1044,6 +1046,10 @@
self.mc.lfd(reg, r.SPP.value, offset)
return
assert 0, "not supported location"
+ elif prev_loc.is_vector_reg():
+ assert loc.is_vector_reg()
+ self.mc.vmr(loc.value, prev_loc.value, prev_loc.value)
+ return
elif prev_loc.is_reg():
reg = prev_loc.value
# move to another register
diff --git a/rpython/jit/backend/ppc/vector_ext.py
b/rpython/jit/backend/ppc/vector_ext.py
--- a/rpython/jit/backend/ppc/vector_ext.py
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -11,6 +11,7 @@
from rpython.rtyper.lltypesystem.lloperation import llop
from rpython.rtyper.lltypesystem import lltype
from rpython.jit.backend.ppc.locations import imm
+from rpython.jit.backend.ppc.arch import IS_BIG_ENDIAN
def not_implemented(msg):
msg = '[ppc/vector_ext] %s\n' % msg
@@ -46,6 +47,34 @@
elif itemsize == 8:
self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
+ def dispatch_vector_load(self, size, Vt, index, addr):
+ self.mc.lvx(Vt, index, addr)
+ return
+ if size == 8:
+ self.mc.lvx(Vt, index, addr)
+ elif size == 4:
+ self.mc.lvewx(Vt, index, addr)
+ elif size == 2:
+ self.mc.lvehx(Vt, index, addr)
+ elif size == 1:
+ self.mc.lvebx(Vt, index, addr)
+ else:
+ not_implemented("[ppc/assembler] dispatch vector load of size %d" %
size)
+
+ def dispatch_vector_store(self, size, Vt, index, addr):
+ self.mc.stvx(Vt, index, addr)
+ return
+ if size == 8:
+ self.mc.stvx(Vt, index, addr)
+ elif size == 4:
+ self.mc.stvewx(Vt, index, addr)
+ elif size == 2:
+ self.mc.stvehx(Vt, index, addr)
+ elif size == 1:
+ self.mc.stvebx(Vt, index, addr)
+ else:
+ not_implemented("[ppc/assembler] dispatch vector store of size %d" %
size)
+
def emit_vec_raw_load_i(self, op, arglocs, regalloc):
resloc, baseloc, indexloc, size_loc, ofs, \
Vhiloc, Vloloc, Vploc, tloc = arglocs
@@ -56,10 +85,17 @@
self.mc.lvx(Vhi, indexloc.value, baseloc.value)
Vp = Vploc.value
t = tloc.value
- self.mc.lvsl(Vp, indexloc.value, baseloc.value)
+ if IS_BIG_ENDIAN:
+ self.mc.lvsl(Vp, indexloc.value, baseloc.value)
+ else:
+ self.mc.lvsr(Vp, indexloc.value, baseloc.value)
self.mc.addi(t, baseloc.value, 16)
self.mc.lvx(Vlo, indexloc.value, t)
- self.mc.vperm(resloc.value, Vhi, Vlo, Vp)
+ if IS_BIG_ENDIAN:
+ self.mc.vperm(resloc.value, Vhi, Vlo, Vp)
+ else:
+ self.mc.vperm(resloc.value, Vlo, Vhi, Vp)
+ #self.mc.trap()
def _emit_vec_setitem(self, op, arglocs, regalloc):
# prepares item scale (raw_store does not)
@@ -101,12 +137,19 @@
# probably a lot of room for improvement (not locally,
# but in general for the algorithm)
self.mc.lvx(Vhi, indexloc.value, baseloc.value)
- self.mc.lvsr(Vp, indexloc.value, baseloc.value)
+ #self.mc.lvsr(Vp, indexloc.value, baseloc.value)
+ if IS_BIG_ENDIAN:
+ self.mc.lvsr(Vp, indexloc.value, baseloc.value)
+ else:
+ self.mc.lvsl(Vp, indexloc.value, baseloc.value)
self.mc.addi(t, baseloc.value, 16)
self.mc.lvx(Vlo, indexloc.value, t)
self.mc.vspltisb(V1s, -1)
self.mc.vspltisb(V0s, 0)
- self.mc.vperm(Vmask, V0s, V1s, Vp)
+ if IS_BIG_ENDIAN:
+ self.mc.vperm(Vmask, V0s, V1s, Vp)
+ else:
+ self.mc.vperm(Vmask, V1s, V0s, Vp)
self.mc.vperm(Vs, Vs, Vs, Vp)
self.mc.vsel(Vlo, Vs, Vlo, Vmask)
self.mc.vsel(Vhi, Vhi, Vs, Vmask)
@@ -179,27 +222,24 @@
self.mc.xvdivdp(resloc.value, loc0.value, loc1.value)
def emit_vec_int_mul(self, op, arglocs, resloc):
- loc0, loc1, itemsize_loc = arglocs
- itemsize = itemsize_loc.value
- if itemsize == 1:
- self.mc.PMULLW(loc0, loc1)
- elif itemsize == 2:
- self.mc.PMULLW(loc0, loc1)
- elif itemsize == 4:
- self.mc.PMULLD(loc0, loc1)
- else:
- # NOTE see
http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
- # There is no 64x64 bit packed mul. For 8 bit either. It is
questionable if it gives any benefit?
- not_implemented("int8/64 mul")
+ pass # TODO
- def emit_vec_int_and(self, op, arglocs, resloc):
- self.mc.PAND(resloc, arglocs[0])
+ def emit_vec_int_and(self, op, arglocs, regalloc):
+ resloc, loc0, loc1 = arglocs
+ self.mc.vand(resloc.value, loc0.value, loc1.value)
- def emit_vec_int_or(self, op, arglocs, resloc):
- self.mc.POR(resloc, arglocs[0])
+ def emit_vec_int_or(self, op, arglocs, regalloc):
+ resloc, loc0, loc1 = arglocs
+ self.mc.vor(resloc.value, loc0.value, loc1.value)
- def emit_vec_int_xor(self, op, arglocs, resloc):
- self.mc.PXOR(resloc, arglocs[0])
+ def emit_vec_int_xor(self, op, arglocs, regalloc):
+ resloc, loc0, loc1 = arglocs
+ self.mc.vxor(resloc.value, loc0.value, loc1.value)
+
+ def emit_vec_int_signext(self, op, arglocs, regalloc):
+ resloc, loc0 = arglocs
+ # TODO
+ self.regalloc_mov(loc0, resloc)
#def genop_guard_vec_guard_true(self, guard_op, guard_token, locs, resloc):
# self.implement_guard(guard_token)
@@ -367,34 +407,6 @@
# # ----------- pxor
# # 00 11 00 00
- #def genop_vec_int_signext(self, op, arglocs, resloc):
- # srcloc, sizeloc, tosizeloc = arglocs
- # size = sizeloc.value
- # tosize = tosizeloc.value
- # if size == tosize:
- # return # already the right size
- # if size == 4 and tosize == 8:
- # scratch = X86_64_SCRATCH_REG.value
- # self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
- # self.mc.PINSRQ_xri(resloc.value, scratch, 1)
- # self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
- # self.mc.PINSRQ_xri(resloc.value, scratch, 0)
- # elif size == 8 and tosize == 4:
- # # is there a better sequence to move them?
- # scratch = X86_64_SCRATCH_REG.value
- # self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
- # self.mc.PINSRD_xri(resloc.value, scratch, 0)
- # self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
- # self.mc.PINSRD_xri(resloc.value, scratch, 1)
- # else:
- # # note that all other conversions are not implemented
- # # on purpose. it needs many x86 op codes to implement
- # # the missing combinations. even if they are implemented
- # # the speedup might only be modest...
- # # the optimization does not emit such code!
- # msg = "vec int signext (%d->%d)" % (size, tosize)
- # not_implemented(msg)
-
#def genop_vec_expand_f(self, op, arglocs, resloc):
# srcloc, sizeloc = arglocs
# size = sizeloc.value
@@ -666,6 +678,14 @@
prepare_vec_raw_store = _prepare_vec_store
del _prepare_vec_store
+ def prepare_vec_int_signext(self, op):
+ assert isinstance(op, VectorOp)
+ a0 = op.getarg(0)
+ loc0 = self.ensure_vector_reg(a0)
+ resloc = self.force_allocate_vector_reg(op)
+ return [resloc, loc0]
+
+
#def prepare_vec_arith_unary(self, op):
# lhs = op.getarg(0)
@@ -758,16 +778,6 @@
# resloc = self.xrm.force_allocate_reg(op, args)
# self.perform(op, [srcloc, imm(op.bytesize)], resloc)
- #def prepare_vec_int_signext(self, op):
- # assert isinstance(op, VectorOp)
- # args = op.getarglist()
- # resloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
- # arg = op.getarg(0)
- # assert isinstance(arg, VectorOp)
- # size = arg.bytesize
- # assert size > 0
- # self.perform(op, [resloc, imm(size), imm(op.bytesize)], resloc)
-
#def prepare_vec_int_is_true(self, op):
# args = op.getarglist()
# arg = op.getarg(0)
diff --git a/rpython/jit/metainterp/test/test_vector.py
b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -1,6 +1,7 @@
import py
import pytest
import math
+import functools
from hypothesis import given, note, strategies as st
from rpython.jit.metainterp.warmspot import ll_meta_interp, get_stats
from rpython.jit.metainterp.test.support import LLJitMixin
@@ -11,7 +12,7 @@
from rpython.rlib.objectmodel import compute_hash
from rpython.rlib import rfloat
from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.rarithmetic import r_uint, intmask, r_int
from rpython.rlib.rawstorage import (alloc_raw_storage, raw_storage_setitem,
free_raw_storage, raw_storage_getitem)
from rpython.rlib.objectmodel import (specialize, is_annotation_constant,
@@ -59,8 +60,6 @@
request.cls.a
return rs
-integers_64bit = st.integers(min_value=-2**63, max_value=2**63-1)
-floats = st.floats()
def rdiv(v1,v2):
# TODO unused, interpeting this on top of llgraph does not work correctly
@@ -104,11 +103,11 @@
raw_storage_setitem(vc, i, rffi.cast(type,c))
i += size
- la = data.draw(st.lists(floats, min_size=10, max_size=150))
+ la = data.draw(st.lists(st.floats(), min_size=10, max_size=150))
#la = [0.0,0.0,0.0,0.0,0.0,0.0,0.0]
#lb = [0.0,0.0,0.0,0.0,1.7976931348623157e+308,0.0,0.0]
l = len(la)
- lb = data.draw(st.lists(floats, min_size=l, max_size=l))
+ lb = data.draw(st.lists(st.floats(), min_size=l, max_size=l))
rawstorage = RawStorage()
va = rawstorage.new(la, type)
@@ -126,18 +125,7 @@
rawstorage.clear()
- #@given(st.data())
- @pytest.mark.parametrize('func,type', [
- (lambda a,b: intmask(a+b), rffi.SIGNED),
- (lambda a,b: intmask(a+b), rffi.UNSIGNED),
- (lambda a,b: intmask(a+b), rffi.INT),
- (lambda a,b: intmask(a+b), rffi.UINT),
- (lambda a,b: intmask(a+b), rffi.SHORT),
- (lambda a,b: intmask(a+b), rffi.USHORT),
- (lambda a,b: intmask(a+b), rffi.CHAR),
- (lambda a,b: intmask(a+b), rffi.UCHAR),
- ])
- def test_vector_simple_int(self, func, type):
+ def _vector_simple_int(self, func, type, data):
func = always_inline(func)
size = rffi.sizeof(type)
@@ -152,11 +140,13 @@
raw_storage_setitem(vc, i, rffi.cast(type,c))
i += size
- #la = data.draw(st.lists(integers_64bit, min_size=10, max_size=150))
- la = [1] * 10
+ bits = size*8
+ integers = st.integers(min_value=-2**(bits-1), max_value=2**(bits-1)-1)
+ la = data.draw(st.lists(integers, min_size=10, max_size=150))
+ #la = [1,2,3,4,5,6,7,8,9,10,11,12,13]
l = len(la)
- #lb = data.draw(st.lists(integers_64bit, min_size=l, max_size=l))
- lb = [0] * 10
+ #lb = [1,2,3,4,5,6,7,8,9,10,11,12,13]
+ lb = data.draw(st.lists(integers, min_size=l, max_size=l))
rawstorage = RawStorage()
va = rawstorage.new(la, type)
@@ -166,10 +156,31 @@
for i in range(l):
c = raw_storage_getitem(type,vc,i*size)
- assert func(la[i], lb[i]) == c
+ assert rffi.cast(type, func(la[i], lb[i])) == c
rawstorage.clear()
+ def vec_int_arith(test_func, arith_func, type):
+ return pytest.mark.parametrize('func,type', [
+ (arith_func, type)
+ ])(given(data=st.data())(test_func))
+
+ vec_int_arith = functools.partial(vec_int_arith, _vector_simple_int)
+
+ test_vector_signed_add = \
+ vec_int_arith(lambda a,b: intmask(a+b), rffi.SIGNED)
+ test_vector_int_add = \
+ vec_int_arith(lambda a,b: r_int(a)+r_int(b), rffi.INT)
+ test_vector_short_add = \
+ vec_int_arith(lambda a,b: r_int(a)+r_int(b), rffi.SHORT)
+
+ test_vector_signed_sub = \
+ vec_int_arith(lambda a,b: r_int(a)-r_int(b), rffi.SIGNED)
+ test_vector_int_sub = \
+ vec_int_arith(lambda a,b: r_int(a)-r_int(b), rffi.INT)
+ test_vector_short_sub = \
+ vec_int_arith(lambda a,b: r_int(a)-r_int(b), rffi.SHORT)
+
@py.test.mark.parametrize('i',[1,2,3,8,17,128,130,131,142,143])
def test_vectorize_array_get_set(self,i):
myjitdriver = JitDriver(greens = [],
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit