Author: Richard Plangger <r...@pasra.at> Branch: vecopt2 Changeset: r77121:76c8e4a033f7 Date: 2015-04-22 15:55 +0200 http://bitbucket.org/pypy/pypy/changeset/76c8e4a033f7/
Log: x86_64 backend is now capable of emitting sse2 instructions for the current vector operations diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py --- a/rpython/jit/backend/x86/assembler.py +++ b/rpython/jit/backend/x86/assembler.py @@ -2415,15 +2415,67 @@ i += current # vector operations - def genop_vec_raw_load(self, op, arglocs, resloc): - base_loc, ofs_loc, size_loc, ofs, sign_loc = arglocs - assert isinstance(ofs, ImmedLoc) - assert isinstance(size_loc, ImmedLoc) + # ________________________________________ + + def genop_vec_getarrayitem_raw(self, op, arglocs, resloc): + # considers item scale (raw_load does not) + base_loc, ofs_loc, size_loc, ofs, sign, integer, aligned = arglocs scale = get_scale(size_loc.value) src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale) - assert False - #self.load_from_mem(resloc, src_addr, size_loc, sign_loc) + self._vec_load(resloc, src_addr, integer, aligned) + def genop_vec_raw_load(self, op, arglocs, resloc): + base_loc, ofs_loc, size_loc, ofs, sign, integer, aligned = arglocs + src_addr = addr_add(base_loc, ofs_loc, ofs.value, 0) + self._vec_load(resloc, src_addr, integer, aligned) + + def _vec_load(self, resloc, src_addr, integer, aligned): + if integer: + if aligned: + raise NotImplementedError + self.mc.MOVDQA(resloc, src_addr) + else: + self.mc.MOVDQU(resloc, src_addr) + else: + if size == 8: # TODO is there a constant for double floating point size? + self.mc.MOVSD(resloc, source_addr) + else: + raise NotImplementedError + + def genop_discard_vec_setarrayitem_raw(self, op, arglocs): + # considers item scale (raw_store does not) + base_loc, ofs_loc, value_loc, size_loc, baseofs, integer, aligned = arglocs + scale = get_scale(size_loc.value) + dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, scale) + self._vec_store(dest_loc, value_loc, integer, aligned) + + def genop_discard_vec_raw_store(self, op, arglocs): + base_loc, ofs_loc, value_loc, size_loc, baseofs, integer, aligned = arglocs + dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, 0) + self._vec_store(dest_loc, value_loc, integer, aligned) + + def _vec_store(self, dest_loc, value_loc, integer, aligned): + if integer: + if aligned: + raise NotImplementedError + else: + self.mc.MOVDQU(dest_loc, value_loc) + else: + if size == 8: # TODO is there a constant for double floating point size? + self.mc.MOVSD(dest_loc, value_loc) + else: + raise NotImplementedError + + def genop_vec_int_add(self, op, arglocs, resloc): + loc0, loc1, itemsize = arglocs + if itemsize == 4: + self.mc.PADDD(loc0, loc1) + elif itemsize == 8: + self.mc.PADDQ(loc0, loc1) + else: + raise NotImplementedError + + # ________________________________________ genop_discard_list = [Assembler386.not_implemented_op_discard] * rop._LAST genop_list = [Assembler386.not_implemented_op] * rop._LAST diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py --- a/rpython/jit/backend/x86/regalloc.py +++ b/rpython/jit/backend/x86/regalloc.py @@ -177,7 +177,7 @@ return self.fm.get_frame_depth() def possibly_free_var(self, var): - if var.type == FLOAT: + if var.type == FLOAT or var.type == VECTOR: self.xrm.possibly_free_var(var) else: self.rm.possibly_free_var(var) @@ -197,7 +197,7 @@ def make_sure_var_in_reg(self, var, forbidden_vars=[], selected_reg=None, need_lower_byte=False): - if var.type == FLOAT: + if var.type == FLOAT or var.type == VECTOR: if isinstance(var, ConstFloat): return FloatImmedLoc(var.getfloatstorage()) return self.xrm.make_sure_var_in_reg(var, forbidden_vars, @@ -1458,22 +1458,54 @@ self.rm.possibly_free_var(dstaddr_box) # vector operations - def consider_vec_raw_load(self, op): - itemsize, ofs, sign = unpack_arraydescr(op.getdescr()) + # ________________________________________ + + def consider_vec_getarrayitem_raw(self, op): + descr = op.getdescr() + assert not descr.is_array_of_pointers() and \ + not descr.is_array_of_structs() + itemsize, ofs, sign = unpack_arraydescr(descr) + integer = not descr.is_array_of_floats() + aligned = False args = op.getarglist() base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args) ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args) result_loc = self.force_allocate_reg(op.result) - if sign: - sign_loc = imm1 - else: - sign_loc = imm0 self.perform(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs), - sign_loc], result_loc) + sign, integer, aligned], result_loc) + + consider_vec_raw_load = consider_vec_getarrayitem_raw + + def consider_vec_setarrayitem_raw(self, op): + descr = op.getdescr() + assert not descr.is_array_of_pointers() and \ + not descr.is_array_of_structs() + itemsize, ofs, sign = unpack_arraydescr(descr) + args = op.getarglist() + base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args) + value_loc = self.make_sure_var_in_reg(op.getarg(2), args) + ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args) + + integer = not descr.is_array_of_floats() + aligned = False + self.perform_discard(op, [base_loc, ofs_loc, value_loc, + imm(itemsize), imm(ofs), integer, aligned]) + + consider_vec_raw_store = consider_vec_setarrayitem_raw + + def consider_vec_int_add(self, op): + count = op.getarg(2) + itemsize = 16 // count.value + args = op.getarglist() + loc1 = self.xrm.make_sure_var_in_reg(op.getarg(1), args) + loc0 = self.xrm.force_result_in_reg(op.result, op.getarg(0), args) + self.perform(op, [loc0, loc1, itemsize], loc0) def consider_guard_early_exit(self, op): pass + # ________________________________________ + def not_implemented_op(self, op): not_implemented("not implemented operation: %s" % op.getopname()) diff --git a/rpython/jit/backend/x86/regloc.py b/rpython/jit/backend/x86/regloc.py --- a/rpython/jit/backend/x86/regloc.py +++ b/rpython/jit/backend/x86/regloc.py @@ -656,6 +656,7 @@ MOVSD = _binaryop('MOVSD') MOVAPD = _binaryop('MOVAPD') + MOVDQU = _binaryop('MOVDQU') ADDSD = _binaryop('ADDSD') ADDPD = _binaryop('ADDPD') SUBSD = _binaryop('SUBSD') @@ -673,6 +674,7 @@ XORPD = _binaryop('XORPD') PADDQ = _binaryop('PADDQ') + PADDD = _binaryop('PADDD') PSUBQ = _binaryop('PSUBQ') PAND = _binaryop('PAND') POR = _binaryop('POR') diff --git a/rpython/jit/backend/x86/test/test_vectorize.py b/rpython/jit/backend/x86/test/test_vectorize.py --- a/rpython/jit/backend/x86/test/test_vectorize.py +++ b/rpython/jit/backend/x86/test/test_vectorize.py @@ -1,12 +1,67 @@ import py +from rpython.jit.backend.x86.regloc import * +from rpython.jit.backend.x86.test import test_basic +from rpython.jit.backend.x86.test.test_assembler import \ + (TestRegallocPushPop as BaseTestAssembler) from rpython.jit.backend.detect_cpu import getcpuclass +from rpython.jit.metainterp.history import ConstFloat +from rpython.jit.metainterp.test import support, test_vectorize from rpython.jit.metainterp.warmspot import ll_meta_interp -from rpython.jit.metainterp.test import support, test_vectorize -from rpython.jit.backend.x86.test import test_basic from rpython.rlib.jit import JitDriver +from rpython.rtyper.lltypesystem import lltype class TestBasic(test_basic.Jit386Mixin, test_vectorize.VectorizeLLtypeTests): # for the individual tests see # ====> ../../../metainterp/test/test_basic.py pass + + + +class TestAssembler(BaseTestAssembler): + def imm_4_int32(self, a, b, c, d): + adr = self.xrm.assembler.datablockwrapper.malloc_aligned(16, 16) + ptr = rffi.cast(rffi.CArrayPtr(rffi.INT), adr) + ptr[0] = rffi.r_int(a) + ptr[1] = rffi.r_int(b) + ptr[2] = rffi.r_int(c) + ptr[3] = rffi.r_int(d) + return ConstAddressLoc(adr,4) + + def test_simple_4_int_load_sum_x86_64(self): + def callback(asm): + if asm.mc.WORD != 8: + py.test.skip() + loc = self.imm_4_int32(123,543,0,0) + adr = loc.value + asm.mc.MOV_ri(r8.value,adr) + asm.mc.MOVDQU_xm(xmm7.value, (r8.value, 0)) + asm.mc.PADDD_xm(xmm7.value, (r8.value, 0)) + asm.mc.PADDD_xx(xmm7.value, xmm7.value) + + asm.mc.MOV_ri(edx.value, 0x00000000ffffffff) + + asm.mc.MOV_ri(eax.value, 0) + asm.mc.MOVDQ_rx(ecx.value, xmm7.value) + asm.mc.AND_rr(ecx.value, edx.value) + asm.mc.ADD(eax, ecx) + + asm.mc.PSRLDQ_xi((xmm7.value, 4)) + asm.mc.MOVDQ_rx(ecx.value, xmm7.value) + asm.mc.AND_rr(ecx.value, edx.value) + asm.mc.ADD(eax, ecx) + res = self.do_test(callback) + assert res == 123*4 + 543*4 + + def test_vector_store(self): + def callback(asm): + loc = self.imm_4_int32(11,12,13,14) + asm.mov(ImmedLoc(loc.value), ecx) + asm.mc.MOVDQU_xm(xmm6.value, (ecx.value,0)) + asm.mc.PADDD_xm(xmm6.value, (ecx.value,0)) + asm.mc.MOVDQU(AddressLoc(ecx,ImmedLoc(0)), xmm6) + asm.mc.MOVDQU(xmm6, AddressLoc(ecx,ImmedLoc(0))) + asm.mc.MOVDQ_rx(eax.value, xmm6.value) + + res = self.do_test(callback) & 0xffffffff + assert res == 22 diff --git a/rpython/jit/metainterp/test/test_vectorize.py b/rpython/jit/metainterp/test/test_vectorize.py --- a/rpython/jit/metainterp/test/test_vectorize.py +++ b/rpython/jit/metainterp/test/test_vectorize.py @@ -31,6 +31,7 @@ va = alloc_raw_storage(bc, zero=True) vb = alloc_raw_storage(bc, zero=True) vc = alloc_raw_storage(bc, zero=True) + x = 1 for i in range(d): j = i*rffi.sizeof(rffi.SIGNED) raw_storage_setitem(va, j, rffi.cast(rffi.SIGNED,i)) @@ -57,29 +58,7 @@ if i > 3: self.check_trace_count(1) - def test_guard(self): - py.test.skip('abc') - myjitdriver = JitDriver(greens = [], - reds = ['a','b','c'], - vectorize=True) - def f(a,c): - b = 0 - while b < c: - myjitdriver.can_enter_jit(a=a, b=b, c=c) - myjitdriver.jit_merge_point(a=a, b=b, c=c) - - if a: - a = not a - b += 1 - - return 42 - - i = 32 - res = self.meta_interp(f, [True,i]) - assert res == 42 - self.check_trace_count(1) - - @py.test.mark.parametrize('i',[1,2,3,8,17,128,500]) + @py.test.mark.parametrize('i',[1,2,3,8,17,128,500,501,502,1300]) def test_vectorize_array_get_set(self,i): myjitdriver = JitDriver(greens = [], reds = ['i','d','va','vb','vc'], _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit