Author: Richard Plangger <[email protected]>
Branch: ppc-vsx-support
Changeset: r85328:7799b9b5dc53
Date: 2016-06-22 16:42 +0200
http://bitbucket.org/pypy/pypy/changeset/7799b9b5dc53/

Log:    (ppc) Added several new PPC opcodes; fixed endianness issues for vector
        load/store (integer only). Added more tests for integer add.

diff --git a/rpython/jit/backend/ppc/codebuilder.py 
b/rpython/jit/backend/ppc/codebuilder.py
--- a/rpython/jit/backend/ppc/codebuilder.py
+++ b/rpython/jit/backend/ppc/codebuilder.py
@@ -614,11 +614,18 @@
     # INTEGER
     # -------
 
-    # load & store
+    # load
     lvx = XV(31, XO1=103)
+    lvewx = XV(31, XO1=71)
+    lvehx = XV(31, XO1=39)
+    lvebx = XV(31, XO1=7)
+    # store
     stvx = XV(31, XO1=231)
+    stvewx = XV(31, XO1=199)
+    stvehx = XV(31, XO1=167)
+    stvebx = XV(31, XO1=135)
 
-    # arith & logic
+    # arith
     vaddudm = VX(4, XO8=192)
     vadduwm = VX(4, XO8=128)
     vadduhm = VX(4, XO8=64)
@@ -629,6 +636,14 @@
     vsubuhm = VX(4, XO8=1088)
     vsububm = VX(4, XO8=1024)
 
+    # logic
+    vand = VX(4, XO8=1028)
+    vor = VX(4, XO8=1156)
+    veqv = VX(4, XO8=1668)
+
+    # vector move register is alias to vector or
+    vmr = vor
+
 
 
     # shift, perm and select
diff --git a/rpython/jit/backend/ppc/locations.py 
b/rpython/jit/backend/ppc/locations.py
--- a/rpython/jit/backend/ppc/locations.py
+++ b/rpython/jit/backend/ppc/locations.py
@@ -30,6 +30,9 @@
     def is_fp_reg(self):
         return False
 
+    def is_vector_reg(self):
+        return False
+
     def is_imm_float(self):
         return False
 
@@ -77,7 +80,7 @@
 
 class VectorRegisterLocation(AssemblerLocation):
     _immutable_ = True
-    width = WORD
+    width = WORD * 2
     type = VECTOR
 
     def __init__(self, value):
@@ -92,6 +95,9 @@
     def as_key(self):
         return self.value + 132
 
+    def is_vector_reg(self):
+        return True
+
 
 class ImmLocation(AssemblerLocation):
     _immutable_ = True
diff --git a/rpython/jit/backend/ppc/ppc_assembler.py 
b/rpython/jit/backend/ppc/ppc_assembler.py
--- a/rpython/jit/backend/ppc/ppc_assembler.py
+++ b/rpython/jit/backend/ppc/ppc_assembler.py
@@ -808,6 +808,8 @@
         #    name = "Loop # %s: %s" % (looptoken.number, loopname)
         #    self.cpu.profile_agent.native_code_written(name,
         #                                               rawstart, full_size)
+        #print(hex(rawstart))
+        #import pdb; pdb.set_trace()
         return AsmInfo(ops_offset, rawstart + looppos,
                        size_excluding_failure_stuff - looppos)
 
@@ -1044,6 +1046,10 @@
                 self.mc.lfd(reg, r.SPP.value, offset)
                 return
             assert 0, "not supported location"
+        elif prev_loc.is_vector_reg():
+            assert loc.is_vector_reg()
+            self.mc.vmr(loc.value, prev_loc.value, prev_loc.value)
+            return
         elif prev_loc.is_reg():
             reg = prev_loc.value
             # move to another register
diff --git a/rpython/jit/backend/ppc/vector_ext.py 
b/rpython/jit/backend/ppc/vector_ext.py
--- a/rpython/jit/backend/ppc/vector_ext.py
+++ b/rpython/jit/backend/ppc/vector_ext.py
@@ -11,6 +11,7 @@
 from rpython.rtyper.lltypesystem.lloperation import llop
 from rpython.rtyper.lltypesystem import lltype
 from rpython.jit.backend.ppc.locations import imm
+from rpython.jit.backend.ppc.arch import IS_BIG_ENDIAN
 
 def not_implemented(msg):
     msg = '[ppc/vector_ext] %s\n' % msg
@@ -46,6 +47,34 @@
         elif itemsize == 8:
             self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
 
+    def dispatch_vector_load(self, size, Vt, index, addr):
+        self.mc.lvx(Vt, index, addr)
+        return
+        if size == 8:
+            self.mc.lvx(Vt, index, addr)
+        elif size == 4:
+            self.mc.lvewx(Vt, index, addr)
+        elif size == 2:
+            self.mc.lvehx(Vt, index, addr)
+        elif size == 1:
+            self.mc.lvehx(Vt, index, addr)
+        else:
+            notimplemented("[ppc/assembler] dispatch vector load of size %d" % 
size)
+
+    def dispatch_vector_store(self, size, Vt, index, addr):
+        self.mc.stvx(Vt, index, addr)
+        return
+        if size == 8:
+            self.mc.stvx(Vt, index, addr)
+        elif size == 4:
+            self.mc.stvewx(Vt, index, addr)
+        elif size == 2:
+            self.mc.stvehx(Vt, index, addr)
+        elif size == 1:
+            self.mc.stvehx(Vt, index, addr)
+        else:
+            notimplemented("[ppc/assembler] dispatch vector load of size %d" % 
size)
+
     def emit_vec_raw_load_i(self, op, arglocs, regalloc):
         resloc, baseloc, indexloc, size_loc, ofs, \
             Vhiloc, Vloloc, Vploc, tloc = arglocs
@@ -56,10 +85,17 @@
         self.mc.lvx(Vhi, indexloc.value, baseloc.value)
         Vp = Vploc.value
         t = tloc.value
-        self.mc.lvsl(Vp, indexloc.value, baseloc.value)
+        if IS_BIG_ENDIAN:
+            self.mc.lvsl(Vp, indexloc.value, baseloc.value)
+        else:
+            self.mc.lvsr(Vp, indexloc.value, baseloc.value)
         self.mc.addi(t, baseloc.value, 16)
         self.mc.lvx(Vlo, indexloc.value, t)
-        self.mc.vperm(resloc.value, Vhi, Vlo, Vp)
+        if IS_BIG_ENDIAN:
+            self.mc.vperm(resloc.value, Vhi, Vlo, Vp)
+        else:
+            self.mc.vperm(resloc.value, Vlo, Vhi, Vp)
+        #self.mc.trap()
 
     def _emit_vec_setitem(self, op, arglocs, regalloc):
         # prepares item scale (raw_store does not)
@@ -101,12 +137,19 @@
             # probably a lot of room for improvement (not locally,
             # but in general for the algorithm)
             self.mc.lvx(Vhi, indexloc.value, baseloc.value)
-            self.mc.lvsr(Vp, indexloc.value, baseloc.value)
+            #self.mc.lvsr(Vp, indexloc.value, baseloc.value)
+            if IS_BIG_ENDIAN:
+                self.mc.lvsr(Vp, indexloc.value, baseloc.value)
+            else:
+                self.mc.lvsl(Vp, indexloc.value, baseloc.value)
             self.mc.addi(t, baseloc.value, 16)
             self.mc.lvx(Vlo, indexloc.value, t)
             self.mc.vspltisb(V1s, -1)
             self.mc.vspltisb(V0s, 0)
-            self.mc.vperm(Vmask, V0s, V1s, Vp)
+            if IS_BIG_ENDIAN:
+                self.mc.vperm(Vmask, V0s, V1s, Vp)
+            else:
+                self.mc.vperm(Vmask, V1s, V0s, Vp)
             self.mc.vperm(Vs, Vs, Vs, Vp)
             self.mc.vsel(Vlo, Vs, Vlo, Vmask)
             self.mc.vsel(Vhi, Vhi, Vs, Vmask)
@@ -179,27 +222,24 @@
             self.mc.xvdivdp(resloc.value, loc0.value, loc1.value)
 
     def emit_vec_int_mul(self, op, arglocs, resloc):
-        loc0, loc1, itemsize_loc = arglocs
-        itemsize = itemsize_loc.value
-        if itemsize == 1:
-            self.mc.PMULLW(loc0, loc1)
-        elif itemsize == 2:
-            self.mc.PMULLW(loc0, loc1)
-        elif itemsize == 4:
-            self.mc.PMULLD(loc0, loc1)
-        else:
-            # NOTE see 
http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
-            # There is no 64x64 bit packed mul. For 8 bit either. It is 
questionable if it gives any benefit?
-            not_implemented("int8/64 mul")
+        pass # TODO
 
-    def emit_vec_int_and(self, op, arglocs, resloc):
-        self.mc.PAND(resloc, arglocs[0])
+    def emit_vec_int_and(self, op, arglocs, regalloc):
+        resloc, loc0, loc1 = arglocs
+        self.mc.vand(resloc.value, loc0.value, loc1.value)
 
-    def emit_vec_int_or(self, op, arglocs, resloc):
-        self.mc.POR(resloc, arglocs[0])
+    def emit_vec_int_or(self, op, arglocs, regalloc):
+        resloc, loc0, loc1 = arglocs
+        self.mc.vor(resloc.value, loc0.value, loc1.value)
 
-    def emit_vec_int_xor(self, op, arglocs, resloc):
-        self.mc.PXOR(resloc, arglocs[0])
+    def emit_vec_int_xor(self, op, arglocs, regalloc):
+        resloc, loc0, loc1 = arglocs
+        self.mc.veqv(resloc.value, loc0.value, loc1.value)
+
+    def emit_vec_int_signext(self, op, arglocs, regalloc):
+        resloc, loc0 = arglocs
+        # TODO
+        self.regalloc_mov(loc0, resloc)
 
     #def genop_guard_vec_guard_true(self, guard_op, guard_token, locs, resloc):
     #    self.implement_guard(guard_token)
@@ -367,34 +407,6 @@
     #    # ----------- pxor
     #    # 00 11 00 00
 
-    #def genop_vec_int_signext(self, op, arglocs, resloc):
-    #    srcloc, sizeloc, tosizeloc = arglocs
-    #    size = sizeloc.value
-    #    tosize = tosizeloc.value
-    #    if size == tosize:
-    #        return # already the right size
-    #    if size == 4 and tosize == 8:
-    #        scratch = X86_64_SCRATCH_REG.value
-    #        self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
-    #        self.mc.PINSRQ_xri(resloc.value, scratch, 1)
-    #        self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
-    #        self.mc.PINSRQ_xri(resloc.value, scratch, 0)
-    #    elif size == 8 and tosize == 4:
-    #        # is there a better sequence to move them?
-    #        scratch = X86_64_SCRATCH_REG.value
-    #        self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
-    #        self.mc.PINSRD_xri(resloc.value, scratch, 0)
-    #        self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
-    #        self.mc.PINSRD_xri(resloc.value, scratch, 1)
-    #    else:
-    #        # note that all other conversions are not implemented
-    #        # on purpose. it needs many x86 op codes to implement
-    #        # the missing combinations. even if they are implemented
-    #        # the speedup might only be modest...
-    #        # the optimization does not emit such code!
-    #        msg = "vec int signext (%d->%d)" % (size, tosize)
-    #        not_implemented(msg)
-
     #def genop_vec_expand_f(self, op, arglocs, resloc):
     #    srcloc, sizeloc = arglocs
     #    size = sizeloc.value
@@ -666,6 +678,14 @@
     prepare_vec_raw_store = _prepare_vec_store
     del _prepare_vec_store
 
+    def prepare_vec_int_signext(self, op):
+        assert isinstance(op, VectorOp)
+        a0 = op.getarg(0)
+        loc0 = self.ensure_vector_reg(a0)
+        resloc = self.force_allocate_vector_reg(op)
+        return [resloc, loc0]
+
+
 
     #def prepare_vec_arith_unary(self, op):
     #    lhs = op.getarg(0)
@@ -758,16 +778,6 @@
     #    resloc = self.xrm.force_allocate_reg(op, args)
     #    self.perform(op, [srcloc, imm(op.bytesize)], resloc)
 
-    #def prepare_vec_int_signext(self, op):
-    #    assert isinstance(op, VectorOp)
-    #    args = op.getarglist()
-    #    resloc = self.xrm.force_result_in_reg(op, op.getarg(0), args)
-    #    arg = op.getarg(0)
-    #    assert isinstance(arg, VectorOp)
-    #    size = arg.bytesize
-    #    assert size > 0
-    #    self.perform(op, [resloc, imm(size), imm(op.bytesize)], resloc)
-
     #def prepare_vec_int_is_true(self, op):
     #    args = op.getarglist()
     #    arg = op.getarg(0)
diff --git a/rpython/jit/metainterp/test/test_vector.py 
b/rpython/jit/metainterp/test/test_vector.py
--- a/rpython/jit/metainterp/test/test_vector.py
+++ b/rpython/jit/metainterp/test/test_vector.py
@@ -1,6 +1,7 @@
 import py
 import pytest
 import math
+import functools
 from hypothesis import given, note, strategies as st
 from rpython.jit.metainterp.warmspot import ll_meta_interp, get_stats
 from rpython.jit.metainterp.test.support import LLJitMixin
@@ -11,7 +12,7 @@
 from rpython.rlib.objectmodel import compute_hash
 from rpython.rlib import rfloat
 from rpython.rtyper.lltypesystem import lltype, rffi
-from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rlib.rarithmetic import r_uint, intmask, r_int
 from rpython.rlib.rawstorage import (alloc_raw_storage, raw_storage_setitem,
                                      free_raw_storage, raw_storage_getitem)
 from rpython.rlib.objectmodel import (specialize, is_annotation_constant,
@@ -59,8 +60,6 @@
     request.cls.a
     return rs
 
-integers_64bit = st.integers(min_value=-2**63, max_value=2**63-1)
-floats = st.floats()
 
 def rdiv(v1,v2):
     # TODO unused, interpeting this on top of llgraph does not work correctly
@@ -104,11 +103,11 @@
                 raw_storage_setitem(vc, i, rffi.cast(type,c))
                 i += size
 
-        la = data.draw(st.lists(floats, min_size=10, max_size=150))
+        la = data.draw(st.lists(st.floats(), min_size=10, max_size=150))
         #la = [0.0,0.0,0.0,0.0,0.0,0.0,0.0]
         #lb = [0.0,0.0,0.0,0.0,1.7976931348623157e+308,0.0,0.0]
         l = len(la)
-        lb = data.draw(st.lists(floats, min_size=l, max_size=l))
+        lb = data.draw(st.lists(st.floats(), min_size=l, max_size=l))
 
         rawstorage = RawStorage()
         va = rawstorage.new(la, type)
@@ -126,18 +125,7 @@
 
         rawstorage.clear()
 
-    #@given(st.data())
-    @pytest.mark.parametrize('func,type', [
-        (lambda a,b: intmask(a+b), rffi.SIGNED),
-        (lambda a,b: intmask(a+b), rffi.UNSIGNED),
-        (lambda a,b: intmask(a+b), rffi.INT),
-        (lambda a,b: intmask(a+b), rffi.UINT),
-        (lambda a,b: intmask(a+b), rffi.SHORT),
-        (lambda a,b: intmask(a+b), rffi.USHORT),
-        (lambda a,b: intmask(a+b), rffi.CHAR),
-        (lambda a,b: intmask(a+b), rffi.UCHAR),
-    ])
-    def test_vector_simple_int(self, func, type):
+    def _vector_simple_int(self, func, type, data):
         func = always_inline(func)
 
         size = rffi.sizeof(type)
@@ -152,11 +140,13 @@
                 raw_storage_setitem(vc, i, rffi.cast(type,c))
                 i += size
 
-        #la = data.draw(st.lists(integers_64bit, min_size=10, max_size=150))
-        la = [1] * 10
+        bits = size*8
+        integers = st.integers(min_value=-2**(bits-1), max_value=2**(bits-1)-1)
+        la = data.draw(st.lists(integers, min_size=10, max_size=150))
+        #la = [1,2,3,4,5,6,7,8,9,10,11,12,13]
         l = len(la)
-        #lb = data.draw(st.lists(integers_64bit, min_size=l, max_size=l))
-        lb = [0] * 10
+        #lb = [1,2,3,4,5,6,7,8,9,10,11,12,13]
+        lb = data.draw(st.lists(integers, min_size=l, max_size=l))
 
         rawstorage = RawStorage()
         va = rawstorage.new(la, type)
@@ -166,10 +156,31 @@
 
         for i in range(l):
             c = raw_storage_getitem(type,vc,i*size)
-            assert func(la[i], lb[i]) == c
+            assert rffi.cast(type, func(la[i], lb[i])) == c
 
         rawstorage.clear()
 
+    def vec_int_arith(test_func, arith_func, type):
+        return pytest.mark.parametrize('func,type', [
+            (arith_func, type)
+        ])(given(data=st.data())(test_func))
+
+    vec_int_arith = functools.partial(vec_int_arith, _vector_simple_int)
+
+    test_vector_signed_add = \
+        vec_int_arith(lambda a,b: intmask(a+b), rffi.SIGNED)
+    test_vector_int_add = \
+        vec_int_arith(lambda a,b: r_int(a)+r_int(b), rffi.INT)
+    test_vector_short_add = \
+        vec_int_arith(lambda a,b: r_int(a)+r_int(b), rffi.SHORT)
+
+    test_vector_signed_sub = \
+        vec_int_arith(lambda a,b: r_int(a)-r_int(b), rffi.SIGNED)
+    test_vector_int_sub = \
+        vec_int_arith(lambda a,b: r_int(a)-r_int(b), rffi.INT)
+    test_vector_short_sub = \
+        vec_int_arith(lambda a,b: r_int(a)-r_int(b), rffi.SHORT)
+
     @py.test.mark.parametrize('i',[1,2,3,8,17,128,130,131,142,143])
     def test_vectorize_array_get_set(self,i):
         myjitdriver = JitDriver(greens = [],
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to