Author: Richard Plangger <[email protected]>
Branch: vecopt2
Changeset: r77121:76c8e4a033f7
Date: 2015-04-22 15:55 +0200
http://bitbucket.org/pypy/pypy/changeset/76c8e4a033f7/
Log: x86_64 backend is now capable of emitting sse2 instructions for the
current vector operations
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -2415,15 +2415,67 @@
i += current
# vector operations
- def genop_vec_raw_load(self, op, arglocs, resloc):
- base_loc, ofs_loc, size_loc, ofs, sign_loc = arglocs
- assert isinstance(ofs, ImmedLoc)
- assert isinstance(size_loc, ImmedLoc)
+ # ________________________________________
+
+ def genop_vec_getarrayitem_raw(self, op, arglocs, resloc):
+ # considers item scale (raw_load does not)
+ base_loc, ofs_loc, size_loc, ofs, sign, integer, aligned = arglocs
scale = get_scale(size_loc.value)
src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
- assert False
- #self.load_from_mem(resloc, src_addr, size_loc, sign_loc)
+ self._vec_load(resloc, src_addr, integer, aligned)
+    def genop_vec_raw_load(self, op, arglocs, resloc):
+        """Emit a vector load for a raw_load operation.
+
+        The source address is built with a scale of 0, i.e. the offset
+        location is used unscaled.  The actual instruction selection is
+        delegated to _vec_load.
+        """
+        (base_loc, ofs_loc, _size_loc, ofs,
+         _sign, integer, aligned) = arglocs
+        address = addr_add(base_loc, ofs_loc, ofs.value, 0)
+        self._vec_load(resloc, address, integer, aligned)
+
+    def _vec_load(self, resloc, src_addr, integer, aligned, size=8):
+        """Emit the move instruction that loads src_addr into resloc.
+
+        integer -- true selects the integer path (MOVDQU), false the
+                   floating point path (MOVSD)
+        aligned -- the aligned integer load is not implemented yet
+        size    -- item size consulted on the floating point path; only 8
+                   is handled.  Defaults to 8 because the existing callers
+                   do not pass it (presumably the size of a double --
+                   TODO confirm and pass the real item size).
+
+        Raises NotImplementedError for every combination that cannot be
+        emitted yet.
+        """
+        if integer:
+            if aligned:
+                # TODO emit self.mc.MOVDQA(resloc, src_addr) here once the
+                # aligned variant is supported
+                raise NotImplementedError
+            self.mc.MOVDQU(resloc, src_addr)
+        else:
+            # TODO is there a constant for double floating point size?
+            if size == 8:
+                self.mc.MOVSD(resloc, src_addr)
+            else:
+                raise NotImplementedError
+
+    def genop_discard_vec_setarrayitem_raw(self, op, arglocs):
+        """Emit a vector store for a setarrayitem_raw operation.
+
+        The offset location is scaled by the item size (taken from
+        size_loc) when the destination address is built; the store itself
+        is emitted by _vec_store.
+        """
+        (base_loc, ofs_loc, value_loc, size_loc,
+         baseofs, integer, aligned) = arglocs
+        item_scale = get_scale(size_loc.value)
+        target = addr_add(base_loc, ofs_loc, baseofs.value, item_scale)
+        self._vec_store(target, value_loc, integer, aligned)
+
+    def genop_discard_vec_raw_store(self, op, arglocs):
+        """Emit a vector store for a raw_store operation.
+
+        The destination address is built with a scale of 0, i.e. the offset
+        location is used unscaled; the store itself is emitted by
+        _vec_store.
+        """
+        (base_loc, ofs_loc, value_loc, _size_loc,
+         baseofs, integer, aligned) = arglocs
+        target = addr_add(base_loc, ofs_loc, baseofs.value, 0)
+        self._vec_store(target, value_loc, integer, aligned)
+
+    def _vec_store(self, dest_loc, value_loc, integer, aligned, size=8):
+        """Emit the move instruction that stores value_loc to dest_loc.
+
+        integer -- true selects the integer path (MOVDQU), false the
+                   floating point path (MOVSD)
+        aligned -- the aligned integer store is not implemented yet
+        size    -- item size consulted on the floating point path; only 8
+                   is handled.  Defaults to 8 because the existing callers
+                   do not pass it (presumably the size of a double --
+                   TODO confirm and pass the real item size).
+
+        Raises NotImplementedError for every combination that cannot be
+        emitted yet.
+        """
+        if integer:
+            if aligned:
+                raise NotImplementedError
+            self.mc.MOVDQU(dest_loc, value_loc)
+        else:
+            # TODO is there a constant for double floating point size?
+            if size == 8:
+                self.mc.MOVSD(dest_loc, value_loc)
+            else:
+                raise NotImplementedError
+
+    def genop_vec_int_add(self, op, arglocs, resloc):
+        """Emit a packed integer addition on the two operand locations.
+
+        arglocs holds (loc0, loc1, itemsize).  PADDD is emitted for an
+        itemsize of 4 and PADDQ for an itemsize of 8; every other itemsize
+        raises NotImplementedError.
+        """
+        first, second, itemsize = arglocs
+        if itemsize == 8:
+            self.mc.PADDQ(first, second)
+            return
+        if itemsize == 4:
+            self.mc.PADDD(first, second)
+            return
+        raise NotImplementedError
+
+ # ________________________________________
genop_discard_list = [Assembler386.not_implemented_op_discard] * rop._LAST
genop_list = [Assembler386.not_implemented_op] * rop._LAST
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -177,7 +177,7 @@
return self.fm.get_frame_depth()
def possibly_free_var(self, var):
- if var.type == FLOAT:
+ if var.type == FLOAT or var.type == VECTOR:
self.xrm.possibly_free_var(var)
else:
self.rm.possibly_free_var(var)
@@ -197,7 +197,7 @@
def make_sure_var_in_reg(self, var, forbidden_vars=[],
selected_reg=None, need_lower_byte=False):
- if var.type == FLOAT:
+ if var.type == FLOAT or var.type == VECTOR:
if isinstance(var, ConstFloat):
return FloatImmedLoc(var.getfloatstorage())
return self.xrm.make_sure_var_in_reg(var, forbidden_vars,
@@ -1458,22 +1458,54 @@
self.rm.possibly_free_var(dstaddr_box)
# vector operations
- def consider_vec_raw_load(self, op):
- itemsize, ofs, sign = unpack_arraydescr(op.getdescr())
+ # ________________________________________
+
+ def consider_vec_getarrayitem_raw(self, op):
+ descr = op.getdescr()
+ assert not descr.is_array_of_pointers() and \
+ not descr.is_array_of_structs()
+ itemsize, ofs, sign = unpack_arraydescr(descr)
+ integer = not descr.is_array_of_floats()
+ aligned = False
args = op.getarglist()
base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
result_loc = self.force_allocate_reg(op.result)
- if sign:
- sign_loc = imm1
- else:
- sign_loc = imm0
self.perform(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs),
- sign_loc], result_loc)
+ sign, integer, aligned], result_loc)
+
+ consider_vec_raw_load = consider_vec_getarrayitem_raw
+
+ def consider_vec_setarrayitem_raw(self, op):
+ # Allocate the locations for a vector store and hand them to
+ # perform_discard as
+ # [base, offset, value, imm(itemsize), imm(ofs), integer, aligned].
+ descr = op.getdescr()
+ # arrays of pointers or structs are not accepted here
+ assert not descr.is_array_of_pointers() and \
+ not descr.is_array_of_structs()
+ itemsize, ofs, sign = unpack_arraydescr(descr)
+ args = op.getarglist()
+ # base and offset go through self.rm; the stored value goes through
+ # self.make_sure_var_in_reg, which picks the register manager from
+ # the type of the variable
+ base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
+ value_loc = self.make_sure_var_in_reg(op.getarg(2), args)
+ ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
+
+ # integer is true for everything that is not an array of floats
+ integer = not descr.is_array_of_floats()
+ # alignment is never assumed at this point
+ aligned = False
+ self.perform_discard(op, [base_loc, ofs_loc, value_loc,
+ imm(itemsize), imm(ofs), integer, aligned])
+
+ consider_vec_raw_store = consider_vec_setarrayitem_raw
+
+ def consider_vec_int_add(self, op):
+ # Allocate xmm locations for a vector integer add and hand
+ # [loc0, loc1, itemsize] to perform; the result shares loc0.
+ count = op.getarg(2)
+ # presumably 16 is the byte width of one vector register, so this
+ # yields the size of a single item -- TODO confirm
+ itemsize = 16 // count.value
+ args = op.getarglist()
+ loc1 = self.xrm.make_sure_var_in_reg(op.getarg(1), args)
+ # the result is forced into the register holding the first argument
+ loc0 = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ self.perform(op, [loc0, loc1, itemsize], loc0)
def consider_guard_early_exit(self, op):
pass
+ # ________________________________________
+
def not_implemented_op(self, op):
not_implemented("not implemented operation: %s" % op.getopname())
diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -656,6 +656,7 @@
MOVSD = _binaryop('MOVSD')
MOVAPD = _binaryop('MOVAPD')
+ MOVDQU = _binaryop('MOVDQU')
ADDSD = _binaryop('ADDSD')
ADDPD = _binaryop('ADDPD')
SUBSD = _binaryop('SUBSD')
@@ -673,6 +674,7 @@
XORPD = _binaryop('XORPD')
PADDQ = _binaryop('PADDQ')
+ PADDD = _binaryop('PADDD')
PSUBQ = _binaryop('PSUBQ')
PAND = _binaryop('PAND')
POR = _binaryop('POR')
diff --git a/rpython/jit/backend/x86/test/test_vectorize.py b/rpython/jit/backend/x86/test/test_vectorize.py
--- a/rpython/jit/backend/x86/test/test_vectorize.py
+++ b/rpython/jit/backend/x86/test/test_vectorize.py
@@ -1,12 +1,67 @@
import py
+from rpython.jit.backend.x86.regloc import *
+from rpython.jit.backend.x86.test import test_basic
+from rpython.jit.backend.x86.test.test_assembler import \
+ (TestRegallocPushPop as BaseTestAssembler)
from rpython.jit.backend.detect_cpu import getcpuclass
+from rpython.jit.metainterp.history import ConstFloat
+from rpython.jit.metainterp.test import support, test_vectorize
from rpython.jit.metainterp.warmspot import ll_meta_interp
-from rpython.jit.metainterp.test import support, test_vectorize
-from rpython.jit.backend.x86.test import test_basic
from rpython.rlib.jit import JitDriver
+from rpython.rtyper.lltypesystem import lltype
class TestBasic(test_basic.Jit386Mixin, test_vectorize.VectorizeLLtypeTests):
# for the individual tests see
# ====> ../../../metainterp/test/test_basic.py
pass
+
+
+
+class TestAssembler(BaseTestAssembler):
+ # Tests that emit vector instructions directly through asm.mc and check
+ # the value returned by do_test.
+ def imm_4_int32(self, a, b, c, d):
+ # Write a, b, c, d as four consecutive ints into a block obtained from
+ # malloc_aligned(16, 16) and return a ConstAddressLoc for its address.
+ # NOTE(review): rffi is not among the imports visible in this diff;
+ # confirm it is in scope (e.g. through the star import from regloc).
+ adr = self.xrm.assembler.datablockwrapper.malloc_aligned(16, 16)
+ ptr = rffi.cast(rffi.CArrayPtr(rffi.INT), adr)
+ ptr[0] = rffi.r_int(a)
+ ptr[1] = rffi.r_int(b)
+ ptr[2] = rffi.r_int(c)
+ ptr[3] = rffi.r_int(d)
+ return ConstAddressLoc(adr,4)
+
+ def test_simple_4_int_load_sum_x86_64(self):
+ # Load (123, 543, 0, 0), add the vector to itself twice and sum the
+ # first two items in eax.
+ def callback(asm):
+ # skipped unless the word size is 8
+ if asm.mc.WORD != 8:
+ py.test.skip()
+ loc = self.imm_4_int32(123,543,0,0)
+ adr = loc.value
+ asm.mc.MOV_ri(r8.value,adr)
+ # load the vector, add the memory operand once more (x2), then add
+ # the register to itself (x4)
+ asm.mc.MOVDQU_xm(xmm7.value, (r8.value, 0))
+ asm.mc.PADDD_xm(xmm7.value, (r8.value, 0))
+ asm.mc.PADDD_xx(xmm7.value, xmm7.value)
+
+ # mask used to keep only the low 32 bits of what is moved out of xmm7
+ asm.mc.MOV_ri(edx.value, 0x00000000ffffffff)
+
+ # eax accumulates the sum, starting with the first item
+ asm.mc.MOV_ri(eax.value, 0)
+ asm.mc.MOVDQ_rx(ecx.value, xmm7.value)
+ asm.mc.AND_rr(ecx.value, edx.value)
+ asm.mc.ADD(eax, ecx)
+
+ # shift xmm7 by 4 (presumably bytes, i.e. one int) and add the next item
+ asm.mc.PSRLDQ_xi((xmm7.value, 4))
+ asm.mc.MOVDQ_rx(ecx.value, xmm7.value)
+ asm.mc.AND_rr(ecx.value, edx.value)
+ asm.mc.ADD(eax, ecx)
+ res = self.do_test(callback)
+ assert res == 123*4 + 543*4
+
+ def test_vector_store(self):
+ # Double (11, 12, 13, 14), store the vector back to memory, reload it
+ # and check the first item.
+ def callback(asm):
+ loc = self.imm_4_int32(11,12,13,14)
+ asm.mov(ImmedLoc(loc.value), ecx)
+ asm.mc.MOVDQU_xm(xmm6.value, (ecx.value,0))
+ asm.mc.PADDD_xm(xmm6.value, (ecx.value,0))
+ # store through the location based MOVDQU, then load it back
+ asm.mc.MOVDQU(AddressLoc(ecx,ImmedLoc(0)), xmm6)
+ asm.mc.MOVDQU(xmm6, AddressLoc(ecx,ImmedLoc(0)))
+ asm.mc.MOVDQ_rx(eax.value, xmm6.value)
+
+ # only the low 32 bits of the result are compared: 11 + 11
+ res = self.do_test(callback) & 0xffffffff
+ assert res == 22
diff --git a/rpython/jit/metainterp/test/test_vectorize.py b/rpython/jit/metainterp/test/test_vectorize.py
--- a/rpython/jit/metainterp/test/test_vectorize.py
+++ b/rpython/jit/metainterp/test/test_vectorize.py
@@ -31,6 +31,7 @@
va = alloc_raw_storage(bc, zero=True)
vb = alloc_raw_storage(bc, zero=True)
vc = alloc_raw_storage(bc, zero=True)
+ x = 1
for i in range(d):
j = i*rffi.sizeof(rffi.SIGNED)
raw_storage_setitem(va, j, rffi.cast(rffi.SIGNED,i))
@@ -57,29 +58,7 @@
if i > 3:
self.check_trace_count(1)
- def test_guard(self):
- py.test.skip('abc')
- myjitdriver = JitDriver(greens = [],
- reds = ['a','b','c'],
- vectorize=True)
- def f(a,c):
- b = 0
- while b < c:
- myjitdriver.can_enter_jit(a=a, b=b, c=c)
- myjitdriver.jit_merge_point(a=a, b=b, c=c)
-
- if a:
- a = not a
- b += 1
-
- return 42
-
- i = 32
- res = self.meta_interp(f, [True,i])
- assert res == 42
- self.check_trace_count(1)
-
- @py.test.mark.parametrize('i',[1,2,3,8,17,128,500])
+ @py.test.mark.parametrize('i',[1,2,3,8,17,128,500,501,502,1300])
def test_vectorize_array_get_set(self,i):
myjitdriver = JitDriver(greens = [],
reds = ['i','d','va','vb','vc'],
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit