Author: Richard Plangger <[email protected]>
Branch: vecopt
Changeset: r78453:a3e41e7fb44b
Date: 2015-07-03 14:32 +0200
http://bitbucket.org/pypy/pypy/changeset/a3e41e7fb44b/

Log:    refactored assembler, moved vector regalloc & assembler methods to a
        separate file; added implementation to automatically generate an exit
        bridge out of the vectorized trace

diff --git a/rpython/jit/backend/x86/assembler.py 
b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -10,6 +10,7 @@
 from rpython.jit.metainterp.history import (Const, Box, VOID,
     BoxVector, ConstInt, BoxVectorAccum)
 from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
+from rpython.jit.metainterp.compile import CompileLoopVersionDescr
 from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
 from rpython.rtyper.lltypesystem.lloperation import llop
 from rpython.rtyper.annlowlevel import llhelper, cast_instance_to_gcref
@@ -29,6 +30,7 @@
     imm0, imm1, FloatImmedLoc, RawEbpLoc, RawEspLoc)
 from rpython.rlib.objectmodel import we_are_translated
 from rpython.jit.backend.x86 import rx86, codebuf, callbuilder
+from rpython.jit.backend.x86.vector_ext import VectorAssemblerMixin
 from rpython.jit.backend.x86.callbuilder import follow_jump
 from rpython.jit.metainterp.resoperation import rop
 from rpython.jit.backend.x86 import support
@@ -40,7 +42,7 @@
 from rpython.rlib.objectmodel import compute_unique_id
 
 
-class Assembler386(BaseAssembler):
+class Assembler386(BaseAssembler, VectorAssemblerMixin):
     _regalloc = None
     _output_loop_log = None
     _second_tmp_reg = ecx
@@ -550,7 +552,6 @@
         if log:
             operations = self._inject_debugging_code(faildescr, operations,
                                                      'b', descr_number)
-
         arglocs = self.rebuild_faillocs_from_descr(faildescr, inputargs)
         regalloc = RegAlloc(self, self.cpu.translate_support_code)
         startpos = self.mc.get_relative_pos()
@@ -592,8 +593,12 @@
         # for each pending guard, generate the code of the recovery stub
         # at the end of self.mc.
         for tok in self.pending_guard_tokens:
-            regalloc.position = tok.position
-            tok.pos_recovery_stub = self.generate_quick_failure(tok, regalloc)
+            descr = tok.faildescr
+            if not isinstance(descr, CompileLoopVersionDescr):
+                regalloc.position = tok.position
+                tok.pos_recovery_stub = self.generate_quick_failure(tok, 
regalloc)
+            else:
+                self.store_info_on_descr(0, tok)
         if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
             self.error_trampoline_64 = self.generate_propagate_error_64()
 
@@ -606,6 +611,9 @@
         for tok in self.pending_guard_tokens:
             addr = rawstart + tok.pos_jump_offset
             tok.faildescr.adr_jump_offset = addr
+            descr = tok.faildescr
+            if isinstance(descr, CompileLoopVersionDescr):
+                continue # patch them later
             relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
             assert rx86.fits_in_32bits(relative_target)
             #
@@ -1645,30 +1653,6 @@
             self.mc.MOVD32_xr(resloc.value, eax.value)
             self.mc.PUNPCKLDQ_xx(resloc.value, loc1.value)
 
-    def _guard_vector_true(self, guard_op, loc, zero=False):
-        arg = guard_op.getarg(0)
-        assert isinstance(arg, BoxVector)
-        size = arg.item_size
-        temp = X86_64_XMM_SCRATCH_REG
-        #
-        self.mc.PXOR(temp, temp)
-        # if the vector is not fully packed blend 1s
-        if not arg.fully_packed(self.cpu.vector_register_size):
-            self.mc.PCMPEQQ(temp, temp) # fill with ones
-            select = 0
-            bits_used = (arg.item_count * arg.item_size * 8)
-            index = bits_used // 16
-            while index < 8:
-                select |= (1 << index)
-                index += 1
-            self.mc.PBLENDW_xxi(loc.value, temp.value, select)
-            # reset to zeros
-            self.mc.PXOR(temp, temp)
-
-        self.mc.PCMPEQ(size, loc, temp)
-        self.mc.PCMPEQQ(temp, temp)
-        self.mc.PTEST(loc, temp)
-
     def genop_guard_guard_true(self, ign_1, guard_op, guard_token, locs, 
ign_2):
         loc = locs[0]
         if isinstance(loc, RegLoc):
@@ -2527,393 +2511,6 @@
                 self.save_into_mem(addr, imm0, imm(current))
             i += current
 
-    # vector operations
-    # ________________________________________
-
-    def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
-        """ If accumulation is done in this loop, at the guard exit
-        some vector registers must be adjusted to yield the correct value"""
-        assert regalloc is not None
-        accum_info = faildescr.rd_accum_list
-        while accum_info:
-            pos = accum_info.position
-            loc = fail_locs[pos]
-            assert isinstance(loc, RegLoc)
-            arg = fail_args[pos]
-            if isinstance(arg, BoxVectorAccum):
-                arg = arg.scalar_var
-            assert arg is not None
-            tgtloc = regalloc.force_allocate_reg(arg, fail_args)
-            if accum_info.operation == '+':
-                # reduction using plus
-                self._accum_reduce_sum(arg, loc, tgtloc)
-            elif accum_info.operation == '*':
-                self._accum_reduce_mul(arg, loc, tgtloc)
-            else:
-                not_implemented("accum operator %s not implemented" %
-                                            (accum_info.operation)) 
-            fail_locs[pos] = tgtloc
-            regalloc.possibly_free_var(arg)
-            accum_info = accum_info.prev
-
-    def _accum_reduce_mul(self, arg, accumloc, targetloc):
-        scratchloc = X86_64_XMM_SCRATCH_REG
-        self.mov(accumloc, scratchloc)
-        # swap the two elements
-        self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01)
-        self.mc.MULSD(accumloc, scratchloc)
-        if accumloc is not targetloc:
-            self.mov(accumloc, targetloc)
-
-    def _accum_reduce_sum(self, arg, accumloc, targetloc):
-        # Currently the accumulator can ONLY be the biggest
-        # size for X86 -> 64 bit float/int
-        if arg.type == FLOAT:
-            # r = (r[0]+r[1],r[0]+r[1])
-            self.mc.HADDPD(accumloc, accumloc)
-            # upper bits (> 64) are dirty (but does not matter)
-            if accumloc is not targetloc:
-                self.mov(accumloc, targetloc)
-            return
-        elif arg.type == INT:
-            scratchloc = X86_64_SCRATCH_REG
-            self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
-            self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
-            self.mc.ADD(targetloc, scratchloc)
-            return
-
-        not_implemented("reduce sum for %s not impl." % arg)
-
-    def genop_vec_getarrayitem_raw(self, op, arglocs, resloc):
-        # considers item scale (raw_load does not)
-        base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
-        scale = get_scale(size_loc.value)
-        src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
-        self._vec_load(resloc, src_addr, integer_loc.value,
-                       size_loc.value, aligned_loc.value)
-
-    def genop_vec_raw_load(self, op, arglocs, resloc):
-        base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
-        src_addr = addr_add(base_loc, ofs_loc, ofs.value, 0)
-        self._vec_load(resloc, src_addr, integer_loc.value,
-                       size_loc.value, aligned_loc.value)
-
-    def _vec_load(self, resloc, src_addr, integer, itemsize, aligned):
-        if integer:
-            if aligned:
-                self.mc.MOVDQA(resloc, src_addr)
-            else:
-                self.mc.MOVDQU(resloc, src_addr)
-        else:
-            if itemsize == 4:
-                self.mc.MOVUPS(resloc, src_addr)
-            elif itemsize == 8:
-                self.mc.MOVUPD(resloc, src_addr)
-
-    def genop_discard_vec_setarrayitem_raw(self, op, arglocs):
-        # considers item scale (raw_store does not)
-        base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, 
aligned_loc = arglocs
-        scale = get_scale(size_loc.value)
-        dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, scale)
-        self._vec_store(dest_loc, value_loc, integer_loc.value,
-                        size_loc.value, aligned_loc.value)
-
-    def genop_discard_vec_raw_store(self, op, arglocs):
-        base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, 
aligned_loc = arglocs
-        dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, 0)
-        self._vec_store(dest_loc, value_loc, integer_loc.value,
-                        size_loc.value, aligned_loc.value)
-
-    def _vec_store(self, dest_loc, value_loc, integer, itemsize, aligned):
-        if integer:
-            if aligned:
-                self.mc.MOVDQA(dest_loc, value_loc)
-            else:
-                self.mc.MOVDQU(dest_loc, value_loc)
-        else:
-            if itemsize == 4:
-                self.mc.MOVUPS(dest_loc, value_loc)
-            elif itemsize == 8:
-                self.mc.MOVUPD(dest_loc, value_loc)
-
-    def genop_vec_int_mul(self, op, arglocs, resloc):
-        loc0, loc1, itemsize_loc = arglocs
-        itemsize = itemsize_loc.value
-        if itemsize == 2:
-            self.mc.PMULLW(loc0, loc1)
-        elif itemsize == 4:
-            self.mc.PMULLD(loc0, loc1)
-        else:
-            # NOTE see 
http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
-            # There is no 64x64 bit packed mul and I did not find one
-            # for 8 bit either. It is questionable if it gives any benefit
-            # for 8 bit.
-            not_implemented("int8/64 mul")
-
-    def genop_vec_int_add(self, op, arglocs, resloc):
-        loc0, loc1, size_loc = arglocs
-        size = size_loc.value
-        if size == 1:
-            self.mc.PADDB(loc0, loc1)
-        elif size == 2:
-            self.mc.PADDW(loc0, loc1)
-        elif size == 4:
-            self.mc.PADDD(loc0, loc1)
-        elif size == 8:
-            self.mc.PADDQ(loc0, loc1)
-
-    def genop_vec_int_sub(self, op, arglocs, resloc):
-        loc0, loc1, size_loc = arglocs
-        size = size_loc.value
-        if size == 1:
-            self.mc.PSUBB(loc0, loc1)
-        elif size == 2:
-            self.mc.PSUBW(loc0, loc1)
-        elif size == 4:
-            self.mc.PSUBD(loc0, loc1)
-        elif size == 8:
-            self.mc.PSUBQ(loc0, loc1)
-
-    def genop_vec_int_and(self, op, arglocs, resloc):
-        self.mc.PAND(resloc, arglocs[0])
-
-    def genop_vec_int_or(self, op, arglocs, resloc):
-        self.mc.POR(resloc, arglocs[0])
-
-    def genop_vec_int_xor(self, op, arglocs, resloc):
-        self.mc.PXOR(resloc, arglocs[0])
-
-    genop_vec_float_arith = """
-    def genop_vec_float_{type}(self, op, arglocs, resloc):
-        loc0, loc1, itemsize_loc = arglocs
-        itemsize = itemsize_loc.value
-        if itemsize == 4:
-            self.mc.{p_op_s}(loc0, loc1)
-        elif itemsize == 8:
-            self.mc.{p_op_d}(loc0, loc1)
-    """
-    for op in ['add','mul','sub']:
-        OP = op.upper()
-        _source = genop_vec_float_arith.format(type=op,
-                                               p_op_s=OP+'PS',
-                                               p_op_d=OP+'PD')
-        exec py.code.Source(_source).compile()
-    del genop_vec_float_arith
-
-    def genop_vec_float_truediv(self, op, arglocs, resloc):
-        loc0, loc1, sizeloc = arglocs
-        size = sizeloc.value
-        if size == 4:
-            self.mc.DIVPS(loc0, loc1)
-        elif size == 8:
-            self.mc.DIVPD(loc0, loc1)
-
-    def genop_vec_float_abs(self, op, arglocs, resloc):
-        src, sizeloc = arglocs
-        size = sizeloc.value
-        if size == 4:
-            self.mc.ANDPS(src, heap(self.single_float_const_abs_addr))
-        elif size == 8:
-            self.mc.ANDPD(src, heap(self.float_const_abs_addr))
-
-    def genop_vec_float_neg(self, op, arglocs, resloc):
-        src, sizeloc = arglocs
-        size = sizeloc.value
-        if size == 4:
-            self.mc.XORPS(src, heap(self.single_float_const_neg_addr))
-        elif size == 8:
-            self.mc.XORPD(src, heap(self.float_const_neg_addr))
-
-    def genop_vec_int_signext(self, op, arglocs, resloc):
-        srcloc, sizeloc, tosizeloc = arglocs
-        size = sizeloc.value
-        tosize = tosizeloc.value
-        if size == tosize:
-            return # already the right size
-        if size == 4 and tosize == 8:
-            scratch = X86_64_SCRATCH_REG.value
-            self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
-            self.mc.PINSRQ_xri(resloc.value, scratch, 1)
-            self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
-            self.mc.PINSRQ_xri(resloc.value, scratch, 0)
-        elif size == 8 and tosize == 4:
-            # is there a better sequence to move them?
-            scratch = X86_64_SCRATCH_REG.value
-            self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
-            self.mc.PINSRD_xri(resloc.value, scratch, 0)
-            self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
-            self.mc.PINSRD_xri(resloc.value, scratch, 1)
-        else:
-            # note that all other conversions are not implemented
-            # on purpose. it needs many x86 op codes to implement
-            # the missing combinations. even if they are implemented
-            # the speedup might only be modest...
-            # the optimization does not emit such code!
-            msg = "vec int signext (%d->%d)" % (size, tosize)
-            not_implemented(msg)
-
-    def genop_vec_float_expand(self, op, arglocs, resloc):
-        srcloc, sizeloc = arglocs
-        size = sizeloc.value
-        if isinstance(srcloc, ConstFloatLoc):
-            # they are aligned!
-            self.mc.MOVAPD(resloc, srcloc)
-        elif size == 4:
-            # the register allocator forces src to be the same as resloc
-            # r = (s[0], s[0], r[0], r[0])
-            # since resloc == srcloc: r = (r[0], r[0], r[0], r[0])
-            self.mc.SHUFPS_xxi(resloc.value, srcloc.value, 0)
-        elif size == 8:
-            self.mc.MOVDDUP(resloc, srcloc)
-        else:
-            raise AssertionError("float of size %d not supported" % (size,))
-
-    def genop_vec_int_expand(self, op, arglocs, resloc):
-        srcloc, sizeloc = arglocs
-        if not isinstance(srcloc, RegLoc):
-            self.mov(srcloc, X86_64_SCRATCH_REG)
-            srcloc = X86_64_SCRATCH_REG
-        assert not srcloc.is_xmm
-        size = sizeloc.value
-        if size == 1:
-            self.mc.PINSRB_xri(resloc.value, srcloc.value, 0)
-            self.mc.PSHUFB(resloc, heap(self.expand_byte_mask_addr))
-        elif size == 2:
-            self.mc.PINSRW_xri(resloc.value, srcloc.value, 0)
-            self.mc.PINSRW_xri(resloc.value, srcloc.value, 4)
-            self.mc.PSHUFLW_xxi(resloc.value, resloc.value, 0)
-            self.mc.PSHUFHW_xxi(resloc.value, resloc.value, 0)
-        elif size == 4:
-            self.mc.PINSRD_xri(resloc.value, srcloc.value, 0)
-            self.mc.PSHUFD_xxi(resloc.value, resloc.value, 0)
-        elif size == 8:
-            self.mc.PINSRQ_xri(resloc.value, srcloc.value, 0)
-            self.mc.PINSRQ_xri(resloc.value, srcloc.value, 1)
-        else:
-            raise AssertionError("cannot handle size %d (int expand)" % 
(size,))
-
-    def genop_vec_int_pack(self, op, arglocs, resloc):
-        resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
-        assert isinstance(resultloc, RegLoc)
-        assert isinstance(sourceloc, RegLoc)
-        size = sizeloc.value
-        srcidx = srcidxloc.value
-        residx = residxloc.value
-        count = countloc.value
-        # for small data type conversion this can be quite costy
-        # NOTE there might be some combinations that can be handled
-        # more efficiently! e.g.
-        # v2 = pack(v0,v1,4,4)
-        si = srcidx
-        ri = residx
-        k = count
-        while k > 0:
-            if size == 8:
-                if resultloc.is_xmm and sourceloc.is_xmm: # both xmm
-                    self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
-                    self.mc.PINSRQ_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
-                elif resultloc.is_xmm: # xmm <- reg
-                    self.mc.PINSRQ_xri(resultloc.value, sourceloc.value, ri)
-                else: # reg <- xmm
-                    self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si)
-            elif size == 4:
-                if resultloc.is_xmm and sourceloc.is_xmm:
-                    self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
-                    self.mc.PINSRD_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
-                elif resultloc.is_xmm:
-                    self.mc.PINSRD_xri(resultloc.value, sourceloc.value, ri)
-                else:
-                    self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si)
-            elif size == 2:
-                if resultloc.is_xmm and sourceloc.is_xmm:
-                    self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
-                    self.mc.PINSRW_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
-                elif resultloc.is_xmm:
-                    self.mc.PINSRW_xri(resultloc.value, sourceloc.value, ri)
-                else:
-                    self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si)
-            elif size == 1:
-                if resultloc.is_xmm and sourceloc.is_xmm:
-                    self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
-                    self.mc.PINSRB_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
-                elif resultloc.is_xmm:
-                    self.mc.PINSRB_xri(resultloc.value, sourceloc.value, ri)
-                else:
-                    self.mc.PEXTRB_rxi(resultloc.value, sourceloc.value, si)
-            si += 1
-            ri += 1
-            k -= 1
-
-    genop_vec_int_unpack = genop_vec_int_pack
-
-    def genop_vec_float_pack(self, op, arglocs, resultloc):
-        resloc, srcloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
-        assert isinstance(resloc, RegLoc)
-        assert isinstance(srcloc, RegLoc)
-        count = countloc.value
-        residx = residxloc.value
-        srcidx = srcidxloc.value
-        size = sizeloc.value
-        if size == 4:
-            si = srcidx
-            ri = residx
-            k = count
-            while k > 0:
-                if resloc.is_xmm:
-                    src = srcloc.value
-                    if not srcloc.is_xmm:
-                        # if source is a normal register (unpack)
-                        assert count == 1
-                        assert si == 0
-                        self.mov(srcloc, X86_64_XMM_SCRATCH_REG)
-                        src = X86_64_XMM_SCRATCH_REG.value
-                    select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
-                    self.mc.INSERTPS_xxi(resloc.value, src, select)
-                else:
-                    self.mc.PEXTRD_rxi(resloc.value, srcloc.value, si)
-                si += 1
-                ri += 1
-                k -= 1
-        elif size == 8:
-            assert resloc.is_xmm
-            if srcloc.is_xmm:
-                if srcidx == 0:
-                    if residx == 0:
-                        # r = (s[0], r[1])
-                        self.mc.MOVSD(resloc, srcloc)
-                    else:
-                        assert residx == 1
-                        # r = (r[0], s[0])
-                        self.mc.UNPCKLPD(resloc, srcloc)
-                else:
-                    assert srcidx == 1
-                    if residx == 0:
-                        # r = (s[1], r[1])
-                        if resloc != srcloc:
-                            self.mc.UNPCKHPD(resloc, srcloc)
-                        self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
-                    else:
-                        assert residx == 1
-                        # r = (r[0], s[1])
-                        if resloc != srcloc:
-                            self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
-                            self.mc.UNPCKHPD(resloc, srcloc)
-                        # if they are equal nothing is to be done
-
-    genop_vec_float_unpack = genop_vec_float_pack
-
-    def genop_vec_cast_float_to_singlefloat(self, op, arglocs, resloc):
-        self.mc.CVTPD2PS(resloc, arglocs[0])
-
-    def genop_vec_cast_float_to_int(self, op, arglocs, resloc):
-        self.mc.CVTPD2DQ(resloc, arglocs[0])
-
-    def genop_vec_cast_int_to_float(self, op, arglocs, resloc):
-        self.mc.CVTDQ2PD(resloc, arglocs[0])
-
-    def genop_vec_cast_singlefloat_to_float(self, op, arglocs, resloc):
-        self.mc.CVTPS2PD(resloc, arglocs[0])
-
     # ________________________________________
 
 genop_discard_list = [Assembler386.not_implemented_op_discard] * rop._LAST
@@ -2923,7 +2520,10 @@
 genop_tlref_list = {}
 genop_guard_list = [Assembler386.not_implemented_op_guard] * rop._LAST
 
-for name, value in Assembler386.__dict__.iteritems():
+import itertools
+iterate = itertools.chain(Assembler386.__dict__.iteritems(),
+                          VectorAssemblerMixin.__dict__.iteritems())
+for name, value in iterate:
     if name.startswith('genop_discard_'):
         opname = name[len('genop_discard_'):]
         num = getattr(rop, opname.upper())
diff --git a/rpython/jit/backend/x86/regalloc.py 
b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -19,6 +19,7 @@
     ebp, r8, r9, r10, r11, r12, r13, r14, r15, xmm0, xmm1, xmm2, xmm3, xmm4,
     xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14,
     X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG)
+from rpython.jit.backend.x86.vector_ext import VectorRegallocMixin
 from rpython.jit.codewriter import longlong
 from rpython.jit.codewriter.effectinfo import EffectInfo
 from rpython.jit.metainterp.history import (Box, Const, ConstInt, ConstPtr,
@@ -146,7 +147,7 @@
     gpr_reg_mgr_cls.all_reg_indexes[_reg.value] = _i
 
 
-class RegAlloc(BaseRegalloc):
+class RegAlloc(BaseRegalloc, VectorRegallocMixin):
 
     def __init__(self, assembler, translate_support_code=False):
         assert isinstance(translate_support_code, bool)
@@ -1503,190 +1504,10 @@
             self.rm.possibly_free_var(length_box)
             self.rm.possibly_free_var(dstaddr_box)
 
-    # vector operations
-    # ________________________________________
-
-    def consider_vec_getarrayitem_raw(self, op):
-        descr = op.getdescr()
-        assert isinstance(descr, ArrayDescr)
-        assert not descr.is_array_of_pointers() and \
-               not descr.is_array_of_structs()
-        itemsize, ofs, _ = unpack_arraydescr(descr)
-        integer = not (descr.is_array_of_floats() or descr.getconcrete_type() 
== FLOAT)
-        aligned = False
-        args = op.getarglist()
-        base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
-        ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
-        result_loc = self.force_allocate_reg(op.result)
-        self.perform(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs),
-                          imm(integer), imm(aligned)], result_loc)
-
-    consider_vec_raw_load = consider_vec_getarrayitem_raw
-
-    def consider_vec_setarrayitem_raw(self, op):
-        descr = op.getdescr()
-        assert isinstance(descr, ArrayDescr)
-        assert not descr.is_array_of_pointers() and \
-               not descr.is_array_of_structs()
-        itemsize, ofs, _ = unpack_arraydescr(descr)
-        args = op.getarglist()
-        base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
-        value_loc = self.make_sure_var_in_reg(op.getarg(2), args)
-        ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
-
-        integer = not (descr.is_array_of_floats() or descr.getconcrete_type() 
== FLOAT)
-        aligned = False
-        self.perform_discard(op, [base_loc, ofs_loc, value_loc,
-                                 imm(itemsize), imm(ofs), imm(integer), 
imm(aligned)])
-
-    consider_vec_raw_store = consider_vec_setarrayitem_raw
-
-    def consider_vec_arith(self, op):
-        lhs = op.getarg(0)
-        assert isinstance(lhs, BoxVector)
-        size = lhs.item_size
-        args = op.getarglist()
-        loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
-        loc0 = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
-        self.perform(op, [loc0, loc1, imm(size)], loc0)
-
-    consider_vec_int_add = consider_vec_arith
-    consider_vec_int_sub = consider_vec_arith
-    consider_vec_int_mul = consider_vec_arith
-    consider_vec_float_add = consider_vec_arith
-    consider_vec_float_sub = consider_vec_arith
-    consider_vec_float_mul = consider_vec_arith
-    consider_vec_float_truediv = consider_vec_arith
-    del consider_vec_arith
-
-    def consider_vec_arith_unary(self, op):
-        lhs = op.getarg(0)
-        assert isinstance(lhs, BoxVector)
-        size = lhs.item_size
-        args = op.getarglist()
-        res = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
-        self.perform(op, [res, imm(size)], res)
-
-    consider_vec_float_neg = consider_vec_arith_unary
-    consider_vec_float_abs = consider_vec_arith_unary
-    del consider_vec_arith_unary
-
-    def consider_vec_logic(self, op):
-        lhs = op.getarg(0)
-        assert isinstance(lhs, BoxVector)
-        size = lhs.item_size
-        args = op.getarglist()
-        source = self.make_sure_var_in_reg(op.getarg(1), args)
-        result = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
-        self.perform(op, [source, imm(size)], result)
-
-    consider_vec_float_eq = consider_vec_logic
-    consider_vec_int_and = consider_vec_logic
-    consider_vec_int_or = consider_vec_logic
-    consider_vec_int_xor = consider_vec_logic
-    del consider_vec_logic
-
-    def consider_vec_int_pack(self, op):
-        # new_res = vec_int_pack(res, src, index, count)
-        arg = op.getarg(1)
-        index = op.getarg(2)
-        count = op.getarg(3)
-        assert isinstance(index, ConstInt)
-        assert isinstance(count, ConstInt)
-        args = op.getarglist()
-        srcloc = self.make_sure_var_in_reg(arg, args)
-        resloc =  self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
-        residx = index.value # where to put it in result?
-        srcidx = 0
-        assert isinstance(op.result, BoxVector)
-        size = op.result.getsize()
-        arglocs = [resloc, srcloc, imm(residx), imm(srcidx), imm(count.value), 
imm(size)]
-        self.perform(op, arglocs, resloc)
-
-    consider_vec_float_pack = consider_vec_int_pack
-
-    def consider_vec_int_unpack(self, op):
-        index = op.getarg(1)
-        count = op.getarg(2)
-        assert isinstance(index, ConstInt)
-        assert isinstance(count, ConstInt)
-        args = op.getarglist()
-        srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
-        if isinstance(op.result, BoxVector):
-            resloc =  self.xrm.force_result_in_reg(op.result, op.getarg(0), 
args)
-            assert isinstance(op.result, BoxVector)
-            size = op.result.getsize()
-        else:
-            # unpack into iX box
-            resloc =  self.force_allocate_reg(op.result, args)
-            arg = op.getarg(0)
-            assert isinstance(arg, BoxVector)
-            size = arg.getsize()
-        residx = 0
-        args = op.getarglist()
-        arglocs = [resloc, srcloc, imm(residx), imm(index.value), 
imm(count.value), imm(size)]
-        self.perform(op, arglocs, resloc)
-
-    consider_vec_float_unpack = consider_vec_int_unpack
-
-    def consider_vec_float_expand(self, op):
-        result = op.result
-        assert isinstance(result, BoxVector)
-        arg = op.getarg(0)
-        args = op.getarglist()
-        if isinstance(arg, Const):
-            resloc = self.xrm.force_allocate_reg(result)
-            srcloc = self.xrm.expand_float(result.getsize(), arg)
-        else:
-            resloc = self.xrm.force_result_in_reg(op.result, arg, args)
-            srcloc = resloc
-
-        size = op.result.getsize()
-        self.perform(op, [srcloc, imm(size)], resloc)
-
-    def consider_vec_int_expand(self, op):
-        arg = op.getarg(0)
-        args = op.getarglist()
-        if isinstance(arg, Const):
-            srcloc = self.rm.convert_to_imm(arg)
-        else:
-            srcloc = self.make_sure_var_in_reg(arg, args)
-        resloc = self.xrm.force_allocate_reg(op.result, args)
-        assert isinstance(op.result, BoxVector)
-        size = op.result.getsize()
-        self.perform(op, [srcloc, imm(size)], resloc)
-
-    def consider_vec_int_signext(self, op):
-        args = op.getarglist()
-        resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
-        sizearg = op.getarg(0)
-        result = op.result
-        assert isinstance(sizearg, BoxVector)
-        assert isinstance(result, BoxVector)
-        size = sizearg.getsize()
-        tosize = result.getsize()
-        self.perform(op, [resloc, imm(size), imm(tosize)], resloc)
-
-    def consider_vec_box(self, op):
-        # pseudo instruction, needed to create a new variable
-        self.xrm.force_allocate_reg(op.result)
-
-    def consider_guard_early_exit(self, op):
-        pass
-
-    def consider_vec_cast_float_to_int(self, op):
-        args = op.getarglist()
-        srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
-        resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
-        self.perform(op, [srcloc], resloc)
-
-    consider_vec_cast_int_to_float = consider_vec_cast_float_to_int
-    consider_vec_cast_float_to_singlefloat = consider_vec_cast_float_to_int
-    consider_vec_cast_singlefloat_to_float = consider_vec_cast_float_to_int
-
     # ________________________________________
 
     def not_implemented_op(self, op):
+        import pdb; pdb.set_trace()
         not_implemented("not implemented operation: %s" % op.getopname())
 
     def not_implemented_op_with_guard(self, op, guard_op):
@@ -1699,7 +1520,10 @@
 def add_none_argument(fn):
     return lambda self, op: fn(self, op, None)
 
-for name, value in RegAlloc.__dict__.iteritems():
+import itertools
+iterate = itertools.chain(RegAlloc.__dict__.iteritems(),
+                          VectorRegallocMixin.__dict__.iteritems())
+for name, value in iterate:
     if name.startswith('consider_'):
         name = name[len('consider_'):]
         num = getattr(rop, name.upper())
diff --git a/rpython/jit/backend/x86/vector_ext.py 
b/rpython/jit/backend/x86/vector_ext.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -0,0 +1,610 @@
+import py
+from rpython.jit.metainterp.history import (Box, Const, ConstInt, ConstPtr,
+    ConstFloat, BoxInt, BoxFloat, BoxVector, BoxVectorAccum, INT, REF,
+    FLOAT, VECTOR, TargetToken)
+from rpython.jit.backend.llsupport.descr import (ArrayDescr, CallDescr,
+    unpack_arraydescr, unpack_fielddescr, unpack_interiorfielddescr)
+from rpython.jit.backend.x86.regloc import (FrameLoc, RegLoc, ConstFloatLoc,
+    FloatImmedLoc, ImmedLoc, imm, imm0, imm1, ecx, eax, edx, ebx, esi, edi,
+    ebp, r8, r9, r10, r11, r12, r13, r14, r15, xmm0, xmm1, xmm2, xmm3, xmm4,
+    xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14,
+    X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG, AddressLoc)
+
+# The helpers below are duplicated from assembler.py/regalloc.py on purpose:
+# both of those modules import this one, so importing them back here would
+# create a cyclic import.
+
+def addr_add(reg_or_imm1, reg_or_imm2, offset=0, scale=0):
+    # duplicated for easy migration, def in assembler.py as well
+    return AddressLoc(reg_or_imm1, reg_or_imm2, scale, offset)
+
+def heap(addr):
+    # duplicated for easy migration, def in assembler.py as well
+    return AddressLoc(ImmedLoc(addr), imm0, 0, 0)
+
+def get_scale(size):
+    # duplicated for easy migration, def in regalloc.py as well:
+    # translate an item size (1,2,4,8) to the x86 address scale (0,1,2,3)
+    assert size == 1 or size == 2 or size == 4 or size == 8
+    if size < 4:
+        return size - 1         # 1, 2 => 0, 1
+    else:
+        return (size >> 2) + 1  # 4, 8 => 2, 3
+
+def not_implemented(msg):
+    msg = '[x86/vector_ext] %s\n' % msg
+    raise NotImplementedError(msg)
+
+class VectorAssemblerMixin(object):
+    _mixin_ = True
+
+    def _guard_vector_true(self, guard_op, loc, zero=False):
+        arg = guard_op.getarg(0)
+        assert isinstance(arg, BoxVector)
+        size = arg.item_size
+        temp = X86_64_XMM_SCRATCH_REG
+        #
+        self.mc.PXOR(temp, temp)
+        # if the vector is not fully packed blend 1s
+        if not arg.fully_packed(self.cpu.vector_register_size):
+            self.mc.PCMPEQQ(temp, temp) # fill with ones
+            select = 0
+            bits_used = (arg.item_count * arg.item_size * 8)
+            index = bits_used // 16
+            while index < 8:
+                select |= (1 << index)
+                index += 1
+            self.mc.PBLENDW_xxi(loc.value, temp.value, select)
+            # reset to zeros
+            self.mc.PXOR(temp, temp)
+
+        self.mc.PCMPEQ(size, loc, temp)
+        self.mc.PCMPEQQ(temp, temp)
+        self.mc.PTEST(loc, temp)
+
+    # vector operations
+    # ________________________________________
+
+    def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
+        """ If accumulation is done in this loop, at the guard exit
+        some vector registers must be adjusted to yield the correct value.
+        Walks faildescr.rd_accum_list and reduces each accumulation vector
+        register down to the scalar box the guard's fail args expect."""
+        assert regalloc is not None
+        accum_info = faildescr.rd_accum_list
+        while accum_info:
+            pos = accum_info.position
+            loc = fail_locs[pos]
+            assert isinstance(loc, RegLoc)
+            arg = fail_args[pos]
+            if isinstance(arg, BoxVectorAccum):
+                # the guard refers to the scalar var behind the accumulator
+                arg = arg.scalar_var
+            assert arg is not None
+            # reduce the vector register in 'loc' into a scalar in 'tgtloc'
+            tgtloc = regalloc.force_allocate_reg(arg, fail_args)
+            if accum_info.operation == '+':
+                # reduction using plus
+                self._accum_reduce_sum(arg, loc, tgtloc)
+            elif accum_info.operation == '*':
+                self._accum_reduce_mul(arg, loc, tgtloc)
+            else:
+                # NOTE(review): not_implemented must be in scope in this
+                # module — confirm it is defined/imported here
+                not_implemented("accum operator %s not implemented" %
+                                            (accum_info.operation))
+            fail_locs[pos] = tgtloc
+            regalloc.possibly_free_var(arg)
+            accum_info = accum_info.prev
+
+    def _accum_reduce_mul(self, arg, accumloc, targetloc):
+        scratchloc = X86_64_XMM_SCRATCH_REG
+        self.mov(accumloc, scratchloc)
+        # swap the two elements
+        self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01)
+        self.mc.MULSD(accumloc, scratchloc)
+        if accumloc is not targetloc:
+            self.mov(accumloc, targetloc)
+
+    def _accum_reduce_sum(self, arg, accumloc, targetloc):
+        # Currently the accumulator can ONLY be the biggest
+        # size for X86 -> 64 bit float/int
+        if arg.type == FLOAT:
+            # r = (r[0]+r[1],r[0]+r[1])
+            self.mc.HADDPD(accumloc, accumloc)
+            # upper bits (> 64) are dirty (but does not matter)
+            if accumloc is not targetloc:
+                self.mov(accumloc, targetloc)
+            return
+        elif arg.type == INT:
+            scratchloc = X86_64_SCRATCH_REG
+            self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
+            self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
+            self.mc.ADD(targetloc, scratchloc)
+            return
+
+        not_implemented("reduce sum for %s not impl." % arg)
+
+    def genop_vec_getarrayitem_raw(self, op, arglocs, resloc):
+        # considers item scale (raw_load does not)
+        base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
+        scale = get_scale(size_loc.value)
+        src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
+        self._vec_load(resloc, src_addr, integer_loc.value,
+                       size_loc.value, aligned_loc.value)
+
+    def genop_vec_raw_load(self, op, arglocs, resloc):
+        base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
+        src_addr = addr_add(base_loc, ofs_loc, ofs.value, 0)
+        self._vec_load(resloc, src_addr, integer_loc.value,
+                       size_loc.value, aligned_loc.value)
+
+    def _vec_load(self, resloc, src_addr, integer, itemsize, aligned):
+        if integer:
+            if aligned:
+                self.mc.MOVDQA(resloc, src_addr)
+            else:
+                self.mc.MOVDQU(resloc, src_addr)
+        else:
+            if itemsize == 4:
+                self.mc.MOVUPS(resloc, src_addr)
+            elif itemsize == 8:
+                self.mc.MOVUPD(resloc, src_addr)
+
+    def genop_discard_vec_setarrayitem_raw(self, op, arglocs):
+        # considers item scale (raw_store does not)
+        base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, 
aligned_loc = arglocs
+        scale = get_scale(size_loc.value)
+        dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, scale)
+        self._vec_store(dest_loc, value_loc, integer_loc.value,
+                        size_loc.value, aligned_loc.value)
+
+    def genop_discard_vec_raw_store(self, op, arglocs):
+        base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, 
aligned_loc = arglocs
+        dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, 0)
+        self._vec_store(dest_loc, value_loc, integer_loc.value,
+                        size_loc.value, aligned_loc.value)
+
+    def _vec_store(self, dest_loc, value_loc, integer, itemsize, aligned):
+        if integer:
+            if aligned:
+                self.mc.MOVDQA(dest_loc, value_loc)
+            else:
+                self.mc.MOVDQU(dest_loc, value_loc)
+        else:
+            if itemsize == 4:
+                self.mc.MOVUPS(dest_loc, value_loc)
+            elif itemsize == 8:
+                self.mc.MOVUPD(dest_loc, value_loc)
+
+    def genop_vec_int_mul(self, op, arglocs, resloc):
+        loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 2:
+            self.mc.PMULLW(loc0, loc1)
+        elif itemsize == 4:
+            self.mc.PMULLD(loc0, loc1)
+        else:
+            # NOTE see 
http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
+            # There is no 64x64 bit packed mul and I did not find one
+            # for 8 bit either. It is questionable if it gives any benefit
+            # for 8 bit.
+            not_implemented("int8/64 mul")
+
+    def genop_vec_int_add(self, op, arglocs, resloc):
+        loc0, loc1, size_loc = arglocs
+        size = size_loc.value
+        if size == 1:
+            self.mc.PADDB(loc0, loc1)
+        elif size == 2:
+            self.mc.PADDW(loc0, loc1)
+        elif size == 4:
+            self.mc.PADDD(loc0, loc1)
+        elif size == 8:
+            self.mc.PADDQ(loc0, loc1)
+
+    def genop_vec_int_sub(self, op, arglocs, resloc):
+        loc0, loc1, size_loc = arglocs
+        size = size_loc.value
+        if size == 1:
+            self.mc.PSUBB(loc0, loc1)
+        elif size == 2:
+            self.mc.PSUBW(loc0, loc1)
+        elif size == 4:
+            self.mc.PSUBD(loc0, loc1)
+        elif size == 8:
+            self.mc.PSUBQ(loc0, loc1)
+
+    def genop_vec_int_and(self, op, arglocs, resloc):
+        self.mc.PAND(resloc, arglocs[0])
+
+    def genop_vec_int_or(self, op, arglocs, resloc):
+        self.mc.POR(resloc, arglocs[0])
+
+    def genop_vec_int_xor(self, op, arglocs, resloc):
+        self.mc.PXOR(resloc, arglocs[0])
+
+    genop_vec_float_arith = """
+    def genop_vec_float_{type}(self, op, arglocs, resloc):
+        loc0, loc1, itemsize_loc = arglocs
+        itemsize = itemsize_loc.value
+        if itemsize == 4:
+            self.mc.{p_op_s}(loc0, loc1)
+        elif itemsize == 8:
+            self.mc.{p_op_d}(loc0, loc1)
+    """
+    for op in ['add','mul','sub']:
+        OP = op.upper()
+        _source = genop_vec_float_arith.format(type=op,
+                                               p_op_s=OP+'PS',
+                                               p_op_d=OP+'PD')
+        exec py.code.Source(_source).compile()
+    del genop_vec_float_arith
+
+    def genop_vec_float_truediv(self, op, arglocs, resloc):
+        loc0, loc1, sizeloc = arglocs
+        size = sizeloc.value
+        if size == 4:
+            self.mc.DIVPS(loc0, loc1)
+        elif size == 8:
+            self.mc.DIVPD(loc0, loc1)
+
+    def genop_vec_float_abs(self, op, arglocs, resloc):
+        src, sizeloc = arglocs
+        size = sizeloc.value
+        if size == 4:
+            self.mc.ANDPS(src, heap(self.single_float_const_abs_addr))
+        elif size == 8:
+            self.mc.ANDPD(src, heap(self.float_const_abs_addr))
+
+    def genop_vec_float_neg(self, op, arglocs, resloc):
+        src, sizeloc = arglocs
+        size = sizeloc.value
+        if size == 4:
+            self.mc.XORPS(src, heap(self.single_float_const_neg_addr))
+        elif size == 8:
+            self.mc.XORPD(src, heap(self.float_const_neg_addr))
+
+    def genop_vec_int_signext(self, op, arglocs, resloc):
+        srcloc, sizeloc, tosizeloc = arglocs
+        size = sizeloc.value
+        tosize = tosizeloc.value
+        if size == tosize:
+            return # already the right size
+        if size == 4 and tosize == 8:
+            scratch = X86_64_SCRATCH_REG.value
+            self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
+            self.mc.PINSRQ_xri(resloc.value, scratch, 1)
+            self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
+            self.mc.PINSRQ_xri(resloc.value, scratch, 0)
+        elif size == 8 and tosize == 4:
+            # is there a better sequence to move them?
+            scratch = X86_64_SCRATCH_REG.value
+            self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
+            self.mc.PINSRD_xri(resloc.value, scratch, 0)
+            self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
+            self.mc.PINSRD_xri(resloc.value, scratch, 1)
+        else:
+            # note that all other conversions are not implemented
+            # on purpose. it needs many x86 op codes to implement
+            # the missing combinations. even if they are implemented
+            # the speedup might only be modest...
+            # the optimization does not emit such code!
+            msg = "vec int signext (%d->%d)" % (size, tosize)
+            not_implemented(msg)
+
+    def genop_vec_float_expand(self, op, arglocs, resloc):
+        srcloc, sizeloc = arglocs
+        size = sizeloc.value
+        if isinstance(srcloc, ConstFloatLoc):
+            # they are aligned!
+            self.mc.MOVAPD(resloc, srcloc)
+        elif size == 4:
+            # the register allocator forces src to be the same as resloc
+            # r = (s[0], s[0], r[0], r[0])
+            # since resloc == srcloc: r = (r[0], r[0], r[0], r[0])
+            self.mc.SHUFPS_xxi(resloc.value, srcloc.value, 0)
+        elif size == 8:
+            self.mc.MOVDDUP(resloc, srcloc)
+        else:
+            raise AssertionError("float of size %d not supported" % (size,))
+
+    def genop_vec_int_expand(self, op, arglocs, resloc):
+        srcloc, sizeloc = arglocs
+        if not isinstance(srcloc, RegLoc):
+            self.mov(srcloc, X86_64_SCRATCH_REG)
+            srcloc = X86_64_SCRATCH_REG
+        assert not srcloc.is_xmm
+        size = sizeloc.value
+        if size == 1:
+            self.mc.PINSRB_xri(resloc.value, srcloc.value, 0)
+            self.mc.PSHUFB(resloc, heap(self.expand_byte_mask_addr))
+        elif size == 2:
+            self.mc.PINSRW_xri(resloc.value, srcloc.value, 0)
+            self.mc.PINSRW_xri(resloc.value, srcloc.value, 4)
+            self.mc.PSHUFLW_xxi(resloc.value, resloc.value, 0)
+            self.mc.PSHUFHW_xxi(resloc.value, resloc.value, 0)
+        elif size == 4:
+            self.mc.PINSRD_xri(resloc.value, srcloc.value, 0)
+            self.mc.PSHUFD_xxi(resloc.value, resloc.value, 0)
+        elif size == 8:
+            self.mc.PINSRQ_xri(resloc.value, srcloc.value, 0)
+            self.mc.PINSRQ_xri(resloc.value, srcloc.value, 1)
+        else:
+            raise AssertionError("cannot handle size %d (int expand)" % 
(size,))
+
+    def genop_vec_int_pack(self, op, arglocs, resloc):
+        resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
+        assert isinstance(resultloc, RegLoc)
+        assert isinstance(sourceloc, RegLoc)
+        size = sizeloc.value
+        srcidx = srcidxloc.value
+        residx = residxloc.value
+        count = countloc.value
+        # for small data type conversion this can be quite costy
+        # NOTE there might be some combinations that can be handled
+        # more efficiently! e.g.
+        # v2 = pack(v0,v1,4,4)
+        si = srcidx
+        ri = residx
+        k = count
+        while k > 0:
+            if size == 8:
+                if resultloc.is_xmm and sourceloc.is_xmm: # both xmm
+                    self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
+                    self.mc.PINSRQ_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
+                elif resultloc.is_xmm: # xmm <- reg
+                    self.mc.PINSRQ_xri(resultloc.value, sourceloc.value, ri)
+                else: # reg <- xmm
+                    self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si)
+            elif size == 4:
+                if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
+                    self.mc.PINSRD_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
+                elif resultloc.is_xmm:
+                    self.mc.PINSRD_xri(resultloc.value, sourceloc.value, ri)
+                else:
+                    self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si)
+            elif size == 2:
+                if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
+                    self.mc.PINSRW_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
+                elif resultloc.is_xmm:
+                    self.mc.PINSRW_xri(resultloc.value, sourceloc.value, ri)
+                else:
+                    self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si)
+            elif size == 1:
+                if resultloc.is_xmm and sourceloc.is_xmm:
+                    self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, 
sourceloc.value, si)
+                    self.mc.PINSRB_xri(resultloc.value, 
X86_64_SCRATCH_REG.value, ri)
+                elif resultloc.is_xmm:
+                    self.mc.PINSRB_xri(resultloc.value, sourceloc.value, ri)
+                else:
+                    self.mc.PEXTRB_rxi(resultloc.value, sourceloc.value, si)
+            si += 1
+            ri += 1
+            k -= 1
+
+    genop_vec_int_unpack = genop_vec_int_pack
+
+    def genop_vec_float_pack(self, op, arglocs, resultloc):
+        resloc, srcloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
+        assert isinstance(resloc, RegLoc)
+        assert isinstance(srcloc, RegLoc)
+        count = countloc.value
+        residx = residxloc.value
+        srcidx = srcidxloc.value
+        size = sizeloc.value
+        if size == 4:
+            si = srcidx
+            ri = residx
+            k = count
+            while k > 0:
+                if resloc.is_xmm:
+                    src = srcloc.value
+                    if not srcloc.is_xmm:
+                        # if source is a normal register (unpack)
+                        assert count == 1
+                        assert si == 0
+                        self.mov(srcloc, X86_64_XMM_SCRATCH_REG)
+                        src = X86_64_XMM_SCRATCH_REG.value
+                    select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
+                    self.mc.INSERTPS_xxi(resloc.value, src, select)
+                else:
+                    self.mc.PEXTRD_rxi(resloc.value, srcloc.value, si)
+                si += 1
+                ri += 1
+                k -= 1
+        elif size == 8:
+            assert resloc.is_xmm
+            if srcloc.is_xmm:
+                if srcidx == 0:
+                    if residx == 0:
+                        # r = (s[0], r[1])
+                        self.mc.MOVSD(resloc, srcloc)
+                    else:
+                        assert residx == 1
+                        # r = (r[0], s[0])
+                        self.mc.UNPCKLPD(resloc, srcloc)
+                else:
+                    assert srcidx == 1
+                    if residx == 0:
+                        # r = (s[1], r[1])
+                        if resloc != srcloc:
+                            self.mc.UNPCKHPD(resloc, srcloc)
+                        self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
+                    else:
+                        assert residx == 1
+                        # r = (r[0], s[1])
+                        if resloc != srcloc:
+                            self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
+                            self.mc.UNPCKHPD(resloc, srcloc)
+                        # if they are equal nothing is to be done
+
+    genop_vec_float_unpack = genop_vec_float_pack
+
+    def genop_vec_cast_float_to_singlefloat(self, op, arglocs, resloc):
+        self.mc.CVTPD2PS(resloc, arglocs[0])
+
+    def genop_vec_cast_float_to_int(self, op, arglocs, resloc):
+        self.mc.CVTPD2DQ(resloc, arglocs[0])
+
+    def genop_vec_cast_int_to_float(self, op, arglocs, resloc):
+        self.mc.CVTDQ2PD(resloc, arglocs[0])
+
+    def genop_vec_cast_singlefloat_to_float(self, op, arglocs, resloc):
+        self.mc.CVTPS2PD(resloc, arglocs[0])
+
+class VectorRegallocMixin(object):
+    _mixin_ = True
+
+    def consider_vec_getarrayitem_raw(self, op):
+        descr = op.getdescr()
+        assert isinstance(descr, ArrayDescr)
+        assert not descr.is_array_of_pointers() and \
+               not descr.is_array_of_structs()
+        itemsize, ofs, _ = unpack_arraydescr(descr)
+        integer = not (descr.is_array_of_floats() or descr.getconcrete_type() 
== FLOAT)
+        aligned = False
+        args = op.getarglist()
+        base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
+        ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
+        result_loc = self.force_allocate_reg(op.result)
+        self.perform(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs),
+                          imm(integer), imm(aligned)], result_loc)
+
+    consider_vec_raw_load = consider_vec_getarrayitem_raw
+
+    def consider_vec_setarrayitem_raw(self, op):
+        descr = op.getdescr()
+        assert isinstance(descr, ArrayDescr)
+        assert not descr.is_array_of_pointers() and \
+               not descr.is_array_of_structs()
+        itemsize, ofs, _ = unpack_arraydescr(descr)
+        args = op.getarglist()
+        base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
+        value_loc = self.make_sure_var_in_reg(op.getarg(2), args)
+        ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
+
+        integer = not (descr.is_array_of_floats() or descr.getconcrete_type() 
== FLOAT)
+        aligned = False
+        self.perform_discard(op, [base_loc, ofs_loc, value_loc,
+                                 imm(itemsize), imm(ofs), imm(integer), 
imm(aligned)])
+
+    consider_vec_raw_store = consider_vec_setarrayitem_raw
+
+    def consider_vec_arith(self, op):
+        lhs = op.getarg(0)
+        assert isinstance(lhs, BoxVector)
+        size = lhs.item_size
+        args = op.getarglist()
+        loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
+        loc0 = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+        self.perform(op, [loc0, loc1, imm(size)], loc0)
+
+    consider_vec_int_add = consider_vec_arith
+    consider_vec_int_sub = consider_vec_arith
+    consider_vec_int_mul = consider_vec_arith
+    consider_vec_float_add = consider_vec_arith
+    consider_vec_float_sub = consider_vec_arith
+    consider_vec_float_mul = consider_vec_arith
+    consider_vec_float_truediv = consider_vec_arith
+    del consider_vec_arith
+
+    def consider_vec_arith_unary(self, op):
+        lhs = op.getarg(0)
+        assert isinstance(lhs, BoxVector)
+        size = lhs.item_size
+        args = op.getarglist()
+        res = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+        self.perform(op, [res, imm(size)], res)
+
+    consider_vec_float_neg = consider_vec_arith_unary
+    consider_vec_float_abs = consider_vec_arith_unary
+    del consider_vec_arith_unary
+
+    def consider_vec_logic(self, op):
+        lhs = op.getarg(0)
+        assert isinstance(lhs, BoxVector)
+        size = lhs.item_size
+        args = op.getarglist()
+        source = self.make_sure_var_in_reg(op.getarg(1), args)
+        result = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+        self.perform(op, [source, imm(size)], result)
+
+    consider_vec_float_eq = consider_vec_logic
+    consider_vec_int_and = consider_vec_logic
+    consider_vec_int_or = consider_vec_logic
+    consider_vec_int_xor = consider_vec_logic
+    del consider_vec_logic
+
+    def consider_vec_int_pack(self, op):
+        # new_res = vec_int_pack(res, src, index, count)
+        arg = op.getarg(1)
+        index = op.getarg(2)
+        count = op.getarg(3)
+        assert isinstance(index, ConstInt)
+        assert isinstance(count, ConstInt)
+        args = op.getarglist()
+        srcloc = self.make_sure_var_in_reg(arg, args)
+        resloc =  self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+        residx = index.value # where to put it in result?
+        srcidx = 0
+        assert isinstance(op.result, BoxVector)
+        size = op.result.getsize()
+        arglocs = [resloc, srcloc, imm(residx), imm(srcidx), imm(count.value), 
imm(size)]
+        self.perform(op, arglocs, resloc)
+
+    consider_vec_float_pack = consider_vec_int_pack
+
+    def consider_vec_int_unpack(self, op):
+        # vec_int_unpack(vec, index, count): extract 'count' items starting
+        # at 'index' from the vector argument into op.result
+        index = op.getarg(1)
+        count = op.getarg(2)
+        assert isinstance(index, ConstInt)
+        assert isinstance(count, ConstInt)
+        args = op.getarglist()
+        srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
+        if isinstance(op.result, BoxVector):
+            # unpack into another vector register
+            resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+            assert isinstance(op.result, BoxVector)
+            size = op.result.getsize()
+        else:
+            # unpack into a scalar iX box
+            resloc = self.force_allocate_reg(op.result, args)
+            arg = op.getarg(0)
+            assert isinstance(arg, BoxVector)
+            size = arg.getsize()
+        residx = 0
+        arglocs = [resloc, srcloc, imm(residx), imm(index.value),
+                   imm(count.value), imm(size)]
+        self.perform(op, arglocs, resloc)
+
+    consider_vec_float_unpack = consider_vec_int_unpack
+
+    def consider_vec_float_expand(self, op):
+        result = op.result
+        assert isinstance(result, BoxVector)
+        arg = op.getarg(0)
+        args = op.getarglist()
+        if isinstance(arg, Const):
+            resloc = self.xrm.force_allocate_reg(result)
+            srcloc = self.xrm.expand_float(result.getsize(), arg)
+        else:
+            resloc = self.xrm.force_result_in_reg(op.result, arg, args)
+            srcloc = resloc
+
+        size = op.result.getsize()
+        self.perform(op, [srcloc, imm(size)], resloc)
+
+    def consider_vec_int_expand(self, op):
+        arg = op.getarg(0)
+        args = op.getarglist()
+        if isinstance(arg, Const):
+            srcloc = self.rm.convert_to_imm(arg)
+        else:
+            srcloc = self.make_sure_var_in_reg(arg, args)
+        resloc = self.xrm.force_allocate_reg(op.result, args)
+        assert isinstance(op.result, BoxVector)
+        size = op.result.getsize()
+        self.perform(op, [srcloc, imm(size)], resloc)
+
+    def consider_vec_int_signext(self, op):
+        args = op.getarglist()
+        resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+        sizearg = op.getarg(0)
+        result = op.result
+        assert isinstance(sizearg, BoxVector)
+        assert isinstance(result, BoxVector)
+        size = sizearg.getsize()
+        tosize = result.getsize()
+        self.perform(op, [resloc, imm(size), imm(tosize)], resloc)
+
+    def consider_vec_box(self, op):
+        # pseudo instruction, needed to create a new variable
+        self.xrm.force_allocate_reg(op.result)
+
+    def consider_guard_early_exit(self, op):
+        pass
+
+    def consider_vec_cast_float_to_int(self, op):
+        args = op.getarglist()
+        srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
+        resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+        self.perform(op, [srcloc], resloc)
+
+    consider_vec_cast_int_to_float = consider_vec_cast_float_to_int
+    consider_vec_cast_float_to_singlefloat = consider_vec_cast_float_to_int
+    consider_vec_cast_singlefloat_to_float = consider_vec_cast_float_to_int
diff --git a/rpython/jit/metainterp/compile.py 
b/rpython/jit/metainterp/compile.py
--- a/rpython/jit/metainterp/compile.py
+++ b/rpython/jit/metainterp/compile.py
@@ -163,10 +163,17 @@
             return None
 
         loop.operations = loop.operations[:-1] + part.operations
+        loop.versions = part.versions
         if part.quasi_immutable_deps:
             loop.quasi_immutable_deps.update(part.quasi_immutable_deps)
     assert part.operations[-1].getopnum() != rop.LABEL
 
+    if loop.versions is not None:
+        # several different loop version have been generated
+        for version in loop.versions:
+            token = version.update_token(jitcell_token)
+            all_target_tokens.append(token)
+
     if not loop.quasi_immutable_deps:
         loop.quasi_immutable_deps = None
     for box in loop.inputargs:
@@ -181,8 +188,21 @@
     propagate_original_jitcell_token(loop)
     send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop, "loop")
     record_loop_or_bridge(metainterp_sd, loop)
+
+    generate_pending_loop_versions(loop, jitdriver_sd, metainterp_sd, 
jitcell_token)
+
     return all_target_tokens[0]
 
+def generate_pending_loop_versions(loop, jitdriver_sd, metainterp_sd,
+                                   jitcell_token):
+    """Compile an exit bridge for every fail descr attached to one of the
+    loop's alternative versions (e.g. the non-vectorized fallback trace)."""
+    if loop.versions is not None:
+        for version in loop.versions:
+            version.update_inputargs()
+            for faildescr in version.faildescrs:
+                send_bridge_to_backend(jitdriver_sd, metainterp_sd,
+                                       faildescr, version.inputargs,
+                                       version.operations, jitcell_token)
+
 def compile_retrace(metainterp, greenkey, start,
                     inputargs, jumpargs,
                     partial_trace, resumekey, start_state):
@@ -689,6 +709,16 @@
 class ResumeAtLoopHeaderDescr(ResumeGuardDescr):
     guard_opnum = rop.GUARD_EARLY_EXIT
 
+class CompileLoopVersionDescr(ResumeGuardDescr):
+    guard_opnum = rop.GUARD_EARLY_EXIT
+
+    operations = None
+    inputargs = None
+    faillocs = None
+
+    def handle_fail(self, deadframe, metainterp_sd, jitdriver_sd):
+        assert 0, "this guard must never fail"
+
 class AllVirtuals:
     llopaque = True
     cache = None
diff --git a/rpython/jit/metainterp/history.py 
b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -695,12 +695,49 @@
     def repr_of_descr(self):
         return 'TargetToken(%d)' % compute_unique_id(self)
 
+class LoopVersion(object):
+
+    def __init__(self, loop, aligned=False):
+        self.operations = loop.operations
+        self.aligned = aligned
+        self.faildescrs = []
+        #
+        label = self.operations[0]
+        assert label.getopnum() == rop.LABEL
+        self.enter_args = label.getarglist()
+        self.calling_args = None
+        self.inputargs = None
+
+    def adddescr(self, descr):
+        self.faildescrs.append(descr)
+
+    def update_token(self, jitcell_token):
+        label = self.operations[0]
+        jump = self.operations[-1]
+        #
+        assert label.getopnum() == rop.LABEL
+        assert jump.getopnum() == rop.JUMP
+        #
+        token = TargetToken(jitcell_token)
+        token.original_jitcell_token = jitcell_token
+        label.setdescr(token)
+        jump.setdescr(token)
+        return token
+
+    def update_inputargs(self):
+        assert len(self.enter_args) == len(self.inputargs)
+        rename = { a: b for a,b in zip(self.enter_args, self.calling_args) }
+        for i, arg in enumerate(self.inputargs):
+            self.inputargs[i] = rename[arg]
+
+
 class TreeLoop(object):
     inputargs = None
     operations = None
     call_pure_results = None
     logops = None
     quasi_immutable_deps = None
+    versions = None
 
     def _token(*args):
         raise Exception("TreeLoop.token is killed")
@@ -817,6 +854,7 @@
     def __repr__(self):
         return '<%s>' % (self.name,)
 
+
 def _list_all_operations(result, operations, omit_finish=True):
     if omit_finish and operations[-1].getopnum() == rop.FINISH:
         # xxx obscure
diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py 
b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -104,26 +104,6 @@
     def can_be_relaxed(self):
         return self.op.getopnum() in (rop.GUARD_TRUE, rop.GUARD_FALSE)
 
-    def relax_guard_to(self, guard):
-        """ Relaxes a guard operation to an earlier guard. """
-        # clone this operation object. if the vectorizer is
-        # not able to relax guards, it won't leave behind a modified operation
-        tgt_op = self.getoperation().clone()
-        self.op = tgt_op
-
-        op = guard.getoperation()
-        assert isinstance(tgt_op, GuardResOp)
-        assert isinstance(op, GuardResOp)
-        olddescr = op.getdescr()
-        descr = compile.ResumeAtLoopHeaderDescr()
-        if olddescr:
-            descr.copy_all_attributes_from(olddescr)
-        #
-        tgt_op.setdescr(descr)
-        tgt_op.rd_snapshot = op.rd_snapshot
-        #if not we_are_translated():
-        tgt_op.setfailargs(op.getfailargs())
-
     def edge_to(self, to, arg=None, failarg=False, label=None):
         if self is to:
             return
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py 
b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -11,10 +11,11 @@
 from rpython.jit.metainterp.resume import Snapshot
 from rpython.jit.metainterp.jitexc import NotAVectorizeableLoop, 
NotAProfitableLoop
 from rpython.jit.metainterp.optimizeopt.unroll import optimize_unroll
-from rpython.jit.metainterp.compile import ResumeAtLoopHeaderDescr, 
invent_fail_descr_for_op
+from rpython.jit.metainterp.compile import (ResumeAtLoopHeaderDescr,
+        CompileLoopVersionDescr, invent_fail_descr_for_op)
 from rpython.jit.metainterp.history import (ConstInt, VECTOR, FLOAT, INT,
         BoxVector, BoxFloat, BoxInt, ConstFloat, TargetToken, JitCellToken, 
Box,
-        BoxVectorAccum)
+        BoxVectorAccum, LoopVersion)
 from rpython.jit.metainterp.optimizeopt.optimizer import Optimizer, 
Optimization
 from rpython.jit.metainterp.optimizeopt.util import make_dispatcher_method, 
Renamer
 from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph,
@@ -50,6 +51,7 @@
     optimize_unroll(metainterp_sd, jitdriver_sd, loop, optimizations,
                     inline_short_preamble, start_state, False)
     orig_ops = loop.operations
+    orig_version = LoopVersion(loop)
     if len(orig_ops) >= 75:
         # if more than 75 operations are present in this loop,
         # it won't be possible to vectorize. There are too many
@@ -62,11 +64,16 @@
         metainterp_sd.logger_noopt.log_loop(loop.inputargs, loop.operations, 
-2, None, None, "pre vectorize")
         metainterp_sd.profiler.count(Counters.OPT_VECTORIZE_TRY)
         start = time.clock()
-        opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, 
cost_threshold)
+        opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, 
cost_threshold, orig_version)
         opt.propagate_all_forward()
         gso = GuardStrengthenOpt(opt.dependency_graph.index_vars)
         gso.propagate_all_forward(opt.loop)
         end = time.clock()
+
+        aligned_vector_version = LoopVersion(loop, aligned=True)
+
+        loop.versions = [orig_version] #, aligned_vector_version]
+
         metainterp_sd.profiler.count(Counters.OPT_VECTORIZED)
         metainterp_sd.logger_noopt.log_loop(loop.inputargs, loop.operations, 
-2, None, None, "post vectorize")
         debug_stop("vec-opt-loop")
@@ -107,8 +114,9 @@
 class VectorizingOptimizer(Optimizer):
     """ Try to unroll the loop and find instructions to group """
 
-    def __init__(self, metainterp_sd, jitdriver_sd, loop, cost_threshold=0):
+    def __init__(self, metainterp_sd, jitdriver_sd, loop, cost_threshold, 
orig_loop_version):
         Optimizer.__init__(self, metainterp_sd, jitdriver_sd, loop, [])
+        self.orig_loop_version = orig_loop_version
         self.dependency_graph = None
         self.packset = None
         self.unroll_count = 0
@@ -188,6 +196,8 @@
 
         self.emit_unrolled_operation(label_op)
 
+        self.orig_loop_version.calling_args = label_op.getarglist()
+
         renamer = Renamer()
         oi = 0
         pure = True
@@ -247,7 +257,7 @@
                     assert isinstance(copied_op, GuardResOp)
                     target_guard = copied_op
                     # do not overwrite resume at loop header
-                    if not isinstance(target_guard.getdescr(), 
ResumeAtLoopHeaderDescr):
+                    if target_guard.getdescr().guard_opnum != 
rop.GUARD_EARLY_EXIT:
                         descr = invent_fail_descr_for_op(copied_op.getopnum(), 
self)
                         olddescr = copied_op.getdescr()
                         if olddescr:
@@ -573,7 +583,29 @@
                         label_node.edge_to(last_but_one, label='pullup')
                 # only the last guard needs a connection
                 guard_node.edge_to(ee_guard_node, label='pullup-last-guard')
-                guard_node.relax_guard_to(ee_guard_node)
+                self.relax_guard_to(guard_node, ee_guard_node)
+
+    def relax_guard_to(self, guard_node, other_node):
+        """ Relaxes a guard operation to an earlier guard. """
+        # clone this operation object. if the vectorizer is
+        # not able to relax guards, it won't leave behind a modified operation
+        tgt_op = guard_node.getoperation().clone()
+        guard_node.op = tgt_op
+
+        op = other_node.getoperation()
+        assert isinstance(tgt_op, GuardResOp)
+        assert isinstance(op, GuardResOp)
+        olddescr = op.getdescr()
+        descr = CompileLoopVersionDescr()
+        if olddescr:
+            descr.copy_all_attributes_from(olddescr)
+        self.orig_loop_version.inputargs = op.getfailargs()
+        self.orig_loop_version.adddescr(descr)
+        #
+        tgt_op.setdescr(descr)
+        tgt_op.rd_snapshot = op.rd_snapshot
+        tgt_op.setfailargs(op.getfailargs())
+
 
 class CostModel(object):
     def __init__(self, threshold, vec_reg_size):
@@ -754,17 +786,6 @@
 
         del self.packs[j]
         return len(self.packs)
-        # OLD
-        # instead of deleting an item in the center of pack array,
-        # the last element is assigned to position j and
-        # the last slot is freed. Order of packs doesn't matter
-        #last_pos = len(self.packs) - 1
-        #if j == last_pos:
-        #    del self.packs[j]
-        #else:
-        #    self.packs[j] = self.packs[last_pos]
-        #    del self.packs[last_pos]
-        #return last_pos
 
     def accumulates_pair(self, lnode, rnode, origin_pack):
         # lnode and rnode are isomorphic and dependent
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to