Author: Richard Plangger <[email protected]>
Branch: vecopt
Changeset: r78453:a3e41e7fb44b
Date: 2015-07-03 14:32 +0200
http://bitbucket.org/pypy/pypy/changeset/a3e41e7fb44b/
Log: refactored assembler, moved vector regalloc & assembler methods to a
separate file; added an implementation that automatically generates an exit
bridge out of the vectorized trace
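
The refactoring below relies on RPython's `_mixin_` convention: the vector
methods now live in VectorAssemblerMixin / VectorRegallocMixin, and the
dispatch tables at the bottom of assembler.py and regalloc.py are built by
chaining both class __dict__s. A minimal, self-contained sketch of that
pattern (the toy rop table and the genop bodies are illustrative stand-ins,
not PyPy's real API):

import itertools

class rop(object):                     # stand-in for resoperation.rop
    INT_ADD = 0
    VEC_INT_ADD = 1
    _LAST = 2

class VectorAssemblerMixin(object):
    _mixin_ = True                     # RPython: methods are folded into the subclass
    def genop_vec_int_add(self, op, arglocs, resloc):
        print 'vector add'             # real code lives in vector_ext.py

class Assembler386(VectorAssemblerMixin, object):
    def genop_int_add(self, op, arglocs, resloc):
        print 'scalar add'

# mirror of the patched loop: mixin methods are not in Assembler386.__dict__,
# so both dictionaries are chained when filling the opnum -> method table
genop_list = [None] * rop._LAST
for name, value in itertools.chain(Assembler386.__dict__.iteritems(),
                                   VectorAssemblerMixin.__dict__.iteritems()):
    if name.startswith('genop_'):
        genop_list[getattr(rop, name[len('genop_'):].upper())] = value

assert genop_list[rop.VEC_INT_ADD] is not None
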
diff --git a/rpython/jit/backend/x86/assembler.py b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -10,6 +10,7 @@
from rpython.jit.metainterp.history import (Const, Box, VOID,
BoxVector, ConstInt, BoxVectorAccum)
from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
+from rpython.jit.metainterp.compile import CompileLoopVersionDescr
from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
from rpython.rtyper.lltypesystem.lloperation import llop
from rpython.rtyper.annlowlevel import llhelper, cast_instance_to_gcref
@@ -29,6 +30,7 @@
imm0, imm1, FloatImmedLoc, RawEbpLoc, RawEspLoc)
from rpython.rlib.objectmodel import we_are_translated
from rpython.jit.backend.x86 import rx86, codebuf, callbuilder
+from rpython.jit.backend.x86.vector_ext import VectorAssemblerMixin
from rpython.jit.backend.x86.callbuilder import follow_jump
from rpython.jit.metainterp.resoperation import rop
from rpython.jit.backend.x86 import support
@@ -40,7 +42,7 @@
from rpython.rlib.objectmodel import compute_unique_id
-class Assembler386(BaseAssembler):
+class Assembler386(BaseAssembler, VectorAssemblerMixin):
_regalloc = None
_output_loop_log = None
_second_tmp_reg = ecx
@@ -550,7 +552,6 @@
if log:
operations = self._inject_debugging_code(faildescr, operations,
'b', descr_number)
-
arglocs = self.rebuild_faillocs_from_descr(faildescr, inputargs)
regalloc = RegAlloc(self, self.cpu.translate_support_code)
startpos = self.mc.get_relative_pos()
@@ -592,8 +593,12 @@
# for each pending guard, generate the code of the recovery stub
# at the end of self.mc.
for tok in self.pending_guard_tokens:
- regalloc.position = tok.position
- tok.pos_recovery_stub = self.generate_quick_failure(tok, regalloc)
+ descr = tok.faildescr
+ if not isinstance(descr, CompileLoopVersionDescr):
+ regalloc.position = tok.position
+ tok.pos_recovery_stub = self.generate_quick_failure(tok, regalloc)
+ else:
+ self.store_info_on_descr(0, tok)
if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
self.error_trampoline_64 = self.generate_propagate_error_64()
@@ -606,6 +611,9 @@
for tok in self.pending_guard_tokens:
addr = rawstart + tok.pos_jump_offset
tok.faildescr.adr_jump_offset = addr
+ descr = tok.faildescr
+ if isinstance(descr, CompileLoopVersionDescr):
+ continue # patch them later
relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
assert rx86.fits_in_32bits(relative_target)
#
@@ -1645,30 +1653,6 @@
self.mc.MOVD32_xr(resloc.value, eax.value)
self.mc.PUNPCKLDQ_xx(resloc.value, loc1.value)
- def _guard_vector_true(self, guard_op, loc, zero=False):
- arg = guard_op.getarg(0)
- assert isinstance(arg, BoxVector)
- size = arg.item_size
- temp = X86_64_XMM_SCRATCH_REG
- #
- self.mc.PXOR(temp, temp)
- # if the vector is not fully packed blend 1s
- if not arg.fully_packed(self.cpu.vector_register_size):
- self.mc.PCMPEQQ(temp, temp) # fill with ones
- select = 0
- bits_used = (arg.item_count * arg.item_size * 8)
- index = bits_used // 16
- while index < 8:
- select |= (1 << index)
- index += 1
- self.mc.PBLENDW_xxi(loc.value, temp.value, select)
- # reset to zeros
- self.mc.PXOR(temp, temp)
-
- self.mc.PCMPEQ(size, loc, temp)
- self.mc.PCMPEQQ(temp, temp)
- self.mc.PTEST(loc, temp)
-
def genop_guard_guard_true(self, ign_1, guard_op, guard_token, locs,
ign_2):
loc = locs[0]
if isinstance(loc, RegLoc):
@@ -2527,393 +2511,6 @@
self.save_into_mem(addr, imm0, imm(current))
i += current
- # vector operations
- # ________________________________________
-
- def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
- """ If accumulation is done in this loop, at the guard exit
- some vector registers must be adjusted to yield the correct value"""
- assert regalloc is not None
- accum_info = faildescr.rd_accum_list
- while accum_info:
- pos = accum_info.position
- loc = fail_locs[pos]
- assert isinstance(loc, RegLoc)
- arg = fail_args[pos]
- if isinstance(arg, BoxVectorAccum):
- arg = arg.scalar_var
- assert arg is not None
- tgtloc = regalloc.force_allocate_reg(arg, fail_args)
- if accum_info.operation == '+':
- # reduction using plus
- self._accum_reduce_sum(arg, loc, tgtloc)
- elif accum_info.operation == '*':
- self._accum_reduce_mul(arg, loc, tgtloc)
- else:
- not_implemented("accum operator %s not implemented" %
- (accum_info.operation))
- fail_locs[pos] = tgtloc
- regalloc.possibly_free_var(arg)
- accum_info = accum_info.prev
-
- def _accum_reduce_mul(self, arg, accumloc, targetloc):
- scratchloc = X86_64_XMM_SCRATCH_REG
- self.mov(accumloc, scratchloc)
- # swap the two elements
- self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01)
- self.mc.MULSD(accumloc, scratchloc)
- if accumloc is not targetloc:
- self.mov(accumloc, targetloc)
-
- def _accum_reduce_sum(self, arg, accumloc, targetloc):
- # Currently the accumulator can ONLY be the biggest
- # size for X86 -> 64 bit float/int
- if arg.type == FLOAT:
- # r = (r[0]+r[1],r[0]+r[1])
- self.mc.HADDPD(accumloc, accumloc)
- # upper bits (> 64) are dirty (but does not matter)
- if accumloc is not targetloc:
- self.mov(accumloc, targetloc)
- return
- elif arg.type == INT:
- scratchloc = X86_64_SCRATCH_REG
- self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
- self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
- self.mc.ADD(targetloc, scratchloc)
- return
-
- not_implemented("reduce sum for %s not impl." % arg)
-
- def genop_vec_getarrayitem_raw(self, op, arglocs, resloc):
- # considers item scale (raw_load does not)
- base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
- scale = get_scale(size_loc.value)
- src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
- self._vec_load(resloc, src_addr, integer_loc.value,
- size_loc.value, aligned_loc.value)
-
- def genop_vec_raw_load(self, op, arglocs, resloc):
- base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
- src_addr = addr_add(base_loc, ofs_loc, ofs.value, 0)
- self._vec_load(resloc, src_addr, integer_loc.value,
- size_loc.value, aligned_loc.value)
-
- def _vec_load(self, resloc, src_addr, integer, itemsize, aligned):
- if integer:
- if aligned:
- self.mc.MOVDQA(resloc, src_addr)
- else:
- self.mc.MOVDQU(resloc, src_addr)
- else:
- if itemsize == 4:
- self.mc.MOVUPS(resloc, src_addr)
- elif itemsize == 8:
- self.mc.MOVUPD(resloc, src_addr)
-
- def genop_discard_vec_setarrayitem_raw(self, op, arglocs):
- # considers item scale (raw_store does not)
- base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, aligned_loc = arglocs
- scale = get_scale(size_loc.value)
- dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, scale)
- self._vec_store(dest_loc, value_loc, integer_loc.value,
- size_loc.value, aligned_loc.value)
-
- def genop_discard_vec_raw_store(self, op, arglocs):
- base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, aligned_loc = arglocs
- dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, 0)
- self._vec_store(dest_loc, value_loc, integer_loc.value,
- size_loc.value, aligned_loc.value)
-
- def _vec_store(self, dest_loc, value_loc, integer, itemsize, aligned):
- if integer:
- if aligned:
- self.mc.MOVDQA(dest_loc, value_loc)
- else:
- self.mc.MOVDQU(dest_loc, value_loc)
- else:
- if itemsize == 4:
- self.mc.MOVUPS(dest_loc, value_loc)
- elif itemsize == 8:
- self.mc.MOVUPD(dest_loc, value_loc)
-
- def genop_vec_int_mul(self, op, arglocs, resloc):
- loc0, loc1, itemsize_loc = arglocs
- itemsize = itemsize_loc.value
- if itemsize == 2:
- self.mc.PMULLW(loc0, loc1)
- elif itemsize == 4:
- self.mc.PMULLD(loc0, loc1)
- else:
- # NOTE see http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
- # There is no 64x64 bit packed mul and I did not find one
- # for 8 bit either. It is questionable if it gives any benefit
- # for 8 bit.
- not_implemented("int8/64 mul")
-
- def genop_vec_int_add(self, op, arglocs, resloc):
- loc0, loc1, size_loc = arglocs
- size = size_loc.value
- if size == 1:
- self.mc.PADDB(loc0, loc1)
- elif size == 2:
- self.mc.PADDW(loc0, loc1)
- elif size == 4:
- self.mc.PADDD(loc0, loc1)
- elif size == 8:
- self.mc.PADDQ(loc0, loc1)
-
- def genop_vec_int_sub(self, op, arglocs, resloc):
- loc0, loc1, size_loc = arglocs
- size = size_loc.value
- if size == 1:
- self.mc.PSUBB(loc0, loc1)
- elif size == 2:
- self.mc.PSUBW(loc0, loc1)
- elif size == 4:
- self.mc.PSUBD(loc0, loc1)
- elif size == 8:
- self.mc.PSUBQ(loc0, loc1)
-
- def genop_vec_int_and(self, op, arglocs, resloc):
- self.mc.PAND(resloc, arglocs[0])
-
- def genop_vec_int_or(self, op, arglocs, resloc):
- self.mc.POR(resloc, arglocs[0])
-
- def genop_vec_int_xor(self, op, arglocs, resloc):
- self.mc.PXOR(resloc, arglocs[0])
-
- genop_vec_float_arith = """
- def genop_vec_float_{type}(self, op, arglocs, resloc):
- loc0, loc1, itemsize_loc = arglocs
- itemsize = itemsize_loc.value
- if itemsize == 4:
- self.mc.{p_op_s}(loc0, loc1)
- elif itemsize == 8:
- self.mc.{p_op_d}(loc0, loc1)
- """
- for op in ['add','mul','sub']:
- OP = op.upper()
- _source = genop_vec_float_arith.format(type=op,
- p_op_s=OP+'PS',
- p_op_d=OP+'PD')
- exec py.code.Source(_source).compile()
- del genop_vec_float_arith
-
- def genop_vec_float_truediv(self, op, arglocs, resloc):
- loc0, loc1, sizeloc = arglocs
- size = sizeloc.value
- if size == 4:
- self.mc.DIVPS(loc0, loc1)
- elif size == 8:
- self.mc.DIVPD(loc0, loc1)
-
- def genop_vec_float_abs(self, op, arglocs, resloc):
- src, sizeloc = arglocs
- size = sizeloc.value
- if size == 4:
- self.mc.ANDPS(src, heap(self.single_float_const_abs_addr))
- elif size == 8:
- self.mc.ANDPD(src, heap(self.float_const_abs_addr))
-
- def genop_vec_float_neg(self, op, arglocs, resloc):
- src, sizeloc = arglocs
- size = sizeloc.value
- if size == 4:
- self.mc.XORPS(src, heap(self.single_float_const_neg_addr))
- elif size == 8:
- self.mc.XORPD(src, heap(self.float_const_neg_addr))
-
- def genop_vec_int_signext(self, op, arglocs, resloc):
- srcloc, sizeloc, tosizeloc = arglocs
- size = sizeloc.value
- tosize = tosizeloc.value
- if size == tosize:
- return # already the right size
- if size == 4 and tosize == 8:
- scratch = X86_64_SCRATCH_REG.value
- self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
- self.mc.PINSRQ_xri(resloc.value, scratch, 1)
- self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
- self.mc.PINSRQ_xri(resloc.value, scratch, 0)
- elif size == 8 and tosize == 4:
- # is there a better sequence to move them?
- scratch = X86_64_SCRATCH_REG.value
- self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
- self.mc.PINSRD_xri(resloc.value, scratch, 0)
- self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
- self.mc.PINSRD_xri(resloc.value, scratch, 1)
- else:
- # note that all other conversions are not implemented
- # on purpose. it needs many x86 op codes to implement
- # the missing combinations. even if they are implemented
- # the speedup might only be modest...
- # the optimization does not emit such code!
- msg = "vec int signext (%d->%d)" % (size, tosize)
- not_implemented(msg)
-
- def genop_vec_float_expand(self, op, arglocs, resloc):
- srcloc, sizeloc = arglocs
- size = sizeloc.value
- if isinstance(srcloc, ConstFloatLoc):
- # they are aligned!
- self.mc.MOVAPD(resloc, srcloc)
- elif size == 4:
- # the register allocator forces src to be the same as resloc
- # r = (s[0], s[0], r[0], r[0])
- # since resloc == srcloc: r = (r[0], r[0], r[0], r[0])
- self.mc.SHUFPS_xxi(resloc.value, srcloc.value, 0)
- elif size == 8:
- self.mc.MOVDDUP(resloc, srcloc)
- else:
- raise AssertionError("float of size %d not supported" % (size,))
-
- def genop_vec_int_expand(self, op, arglocs, resloc):
- srcloc, sizeloc = arglocs
- if not isinstance(srcloc, RegLoc):
- self.mov(srcloc, X86_64_SCRATCH_REG)
- srcloc = X86_64_SCRATCH_REG
- assert not srcloc.is_xmm
- size = sizeloc.value
- if size == 1:
- self.mc.PINSRB_xri(resloc.value, srcloc.value, 0)
- self.mc.PSHUFB(resloc, heap(self.expand_byte_mask_addr))
- elif size == 2:
- self.mc.PINSRW_xri(resloc.value, srcloc.value, 0)
- self.mc.PINSRW_xri(resloc.value, srcloc.value, 4)
- self.mc.PSHUFLW_xxi(resloc.value, resloc.value, 0)
- self.mc.PSHUFHW_xxi(resloc.value, resloc.value, 0)
- elif size == 4:
- self.mc.PINSRD_xri(resloc.value, srcloc.value, 0)
- self.mc.PSHUFD_xxi(resloc.value, resloc.value, 0)
- elif size == 8:
- self.mc.PINSRQ_xri(resloc.value, srcloc.value, 0)
- self.mc.PINSRQ_xri(resloc.value, srcloc.value, 1)
- else:
- raise AssertionError("cannot handle size %d (int expand)" % (size,))
-
- def genop_vec_int_pack(self, op, arglocs, resloc):
- resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
- assert isinstance(resultloc, RegLoc)
- assert isinstance(sourceloc, RegLoc)
- size = sizeloc.value
- srcidx = srcidxloc.value
- residx = residxloc.value
- count = countloc.value
- # for small data type conversion this can be quite costy
- # NOTE there might be some combinations that can be handled
- # more efficiently! e.g.
- # v2 = pack(v0,v1,4,4)
- si = srcidx
- ri = residx
- k = count
- while k > 0:
- if size == 8:
- if resultloc.is_xmm and sourceloc.is_xmm: # both xmm
- self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
- self.mc.PINSRQ_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
- elif resultloc.is_xmm: # xmm <- reg
- self.mc.PINSRQ_xri(resultloc.value, sourceloc.value, ri)
- else: # reg <- xmm
- self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si)
- elif size == 4:
- if resultloc.is_xmm and sourceloc.is_xmm:
- self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
- self.mc.PINSRD_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
- elif resultloc.is_xmm:
- self.mc.PINSRD_xri(resultloc.value, sourceloc.value, ri)
- else:
- self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si)
- elif size == 2:
- if resultloc.is_xmm and sourceloc.is_xmm:
- self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
- self.mc.PINSRW_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
- elif resultloc.is_xmm:
- self.mc.PINSRW_xri(resultloc.value, sourceloc.value, ri)
- else:
- self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si)
- elif size == 1:
- if resultloc.is_xmm and sourceloc.is_xmm:
- self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
- self.mc.PINSRB_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
- elif resultloc.is_xmm:
- self.mc.PINSRB_xri(resultloc.value, sourceloc.value, ri)
- else:
- self.mc.PEXTRB_rxi(resultloc.value, sourceloc.value, si)
- si += 1
- ri += 1
- k -= 1
-
- genop_vec_int_unpack = genop_vec_int_pack
-
- def genop_vec_float_pack(self, op, arglocs, resultloc):
- resloc, srcloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
- assert isinstance(resloc, RegLoc)
- assert isinstance(srcloc, RegLoc)
- count = countloc.value
- residx = residxloc.value
- srcidx = srcidxloc.value
- size = sizeloc.value
- if size == 4:
- si = srcidx
- ri = residx
- k = count
- while k > 0:
- if resloc.is_xmm:
- src = srcloc.value
- if not srcloc.is_xmm:
- # if source is a normal register (unpack)
- assert count == 1
- assert si == 0
- self.mov(srcloc, X86_64_XMM_SCRATCH_REG)
- src = X86_64_XMM_SCRATCH_REG.value
- select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
- self.mc.INSERTPS_xxi(resloc.value, src, select)
- else:
- self.mc.PEXTRD_rxi(resloc.value, srcloc.value, si)
- si += 1
- ri += 1
- k -= 1
- elif size == 8:
- assert resloc.is_xmm
- if srcloc.is_xmm:
- if srcidx == 0:
- if residx == 0:
- # r = (s[0], r[1])
- self.mc.MOVSD(resloc, srcloc)
- else:
- assert residx == 1
- # r = (r[0], s[0])
- self.mc.UNPCKLPD(resloc, srcloc)
- else:
- assert srcidx == 1
- if residx == 0:
- # r = (s[1], r[1])
- if resloc != srcloc:
- self.mc.UNPCKHPD(resloc, srcloc)
- self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
- else:
- assert residx == 1
- # r = (r[0], s[1])
- if resloc != srcloc:
- self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
- self.mc.UNPCKHPD(resloc, srcloc)
- # if they are equal nothing is to be done
-
- genop_vec_float_unpack = genop_vec_float_pack
-
- def genop_vec_cast_float_to_singlefloat(self, op, arglocs, resloc):
- self.mc.CVTPD2PS(resloc, arglocs[0])
-
- def genop_vec_cast_float_to_int(self, op, arglocs, resloc):
- self.mc.CVTPD2DQ(resloc, arglocs[0])
-
- def genop_vec_cast_int_to_float(self, op, arglocs, resloc):
- self.mc.CVTDQ2PD(resloc, arglocs[0])
-
- def genop_vec_cast_singlefloat_to_float(self, op, arglocs, resloc):
- self.mc.CVTPS2PD(resloc, arglocs[0])
-
# ________________________________________
genop_discard_list = [Assembler386.not_implemented_op_discard] * rop._LAST
@@ -2923,7 +2520,10 @@
genop_tlref_list = {}
genop_guard_list = [Assembler386.not_implemented_op_guard] * rop._LAST
-for name, value in Assembler386.__dict__.iteritems():
+import itertools
+iterate = itertools.chain(Assembler386.__dict__.iteritems(),
+ VectorAssemblerMixin.__dict__.iteritems())
+for name, value in iterate:
if name.startswith('genop_discard_'):
opname = name[len('genop_discard_'):]
num = getattr(rop, opname.upper())
diff --git a/rpython/jit/backend/x86/regalloc.py b/rpython/jit/backend/x86/regalloc.py
--- a/rpython/jit/backend/x86/regalloc.py
+++ b/rpython/jit/backend/x86/regalloc.py
@@ -19,6 +19,7 @@
ebp, r8, r9, r10, r11, r12, r13, r14, r15, xmm0, xmm1, xmm2, xmm3, xmm4,
xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14,
X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG)
+from rpython.jit.backend.x86.vector_ext import VectorRegallocMixin
from rpython.jit.codewriter import longlong
from rpython.jit.codewriter.effectinfo import EffectInfo
from rpython.jit.metainterp.history import (Box, Const, ConstInt, ConstPtr,
@@ -146,7 +147,7 @@
gpr_reg_mgr_cls.all_reg_indexes[_reg.value] = _i
-class RegAlloc(BaseRegalloc):
+class RegAlloc(BaseRegalloc, VectorRegallocMixin):
def __init__(self, assembler, translate_support_code=False):
assert isinstance(translate_support_code, bool)
@@ -1503,190 +1504,10 @@
self.rm.possibly_free_var(length_box)
self.rm.possibly_free_var(dstaddr_box)
- # vector operations
- # ________________________________________
-
- def consider_vec_getarrayitem_raw(self, op):
- descr = op.getdescr()
- assert isinstance(descr, ArrayDescr)
- assert not descr.is_array_of_pointers() and \
- not descr.is_array_of_structs()
- itemsize, ofs, _ = unpack_arraydescr(descr)
- integer = not (descr.is_array_of_floats() or descr.getconcrete_type() == FLOAT)
- aligned = False
- args = op.getarglist()
- base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
- ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
- result_loc = self.force_allocate_reg(op.result)
- self.perform(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs),
- imm(integer), imm(aligned)], result_loc)
-
- consider_vec_raw_load = consider_vec_getarrayitem_raw
-
- def consider_vec_setarrayitem_raw(self, op):
- descr = op.getdescr()
- assert isinstance(descr, ArrayDescr)
- assert not descr.is_array_of_pointers() and \
- not descr.is_array_of_structs()
- itemsize, ofs, _ = unpack_arraydescr(descr)
- args = op.getarglist()
- base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
- value_loc = self.make_sure_var_in_reg(op.getarg(2), args)
- ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
-
- integer = not (descr.is_array_of_floats() or descr.getconcrete_type() == FLOAT)
- aligned = False
- self.perform_discard(op, [base_loc, ofs_loc, value_loc,
- imm(itemsize), imm(ofs), imm(integer), imm(aligned)])
-
- consider_vec_raw_store = consider_vec_setarrayitem_raw
-
- def consider_vec_arith(self, op):
- lhs = op.getarg(0)
- assert isinstance(lhs, BoxVector)
- size = lhs.item_size
- args = op.getarglist()
- loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
- loc0 = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
- self.perform(op, [loc0, loc1, imm(size)], loc0)
-
- consider_vec_int_add = consider_vec_arith
- consider_vec_int_sub = consider_vec_arith
- consider_vec_int_mul = consider_vec_arith
- consider_vec_float_add = consider_vec_arith
- consider_vec_float_sub = consider_vec_arith
- consider_vec_float_mul = consider_vec_arith
- consider_vec_float_truediv = consider_vec_arith
- del consider_vec_arith
-
- def consider_vec_arith_unary(self, op):
- lhs = op.getarg(0)
- assert isinstance(lhs, BoxVector)
- size = lhs.item_size
- args = op.getarglist()
- res = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
- self.perform(op, [res, imm(size)], res)
-
- consider_vec_float_neg = consider_vec_arith_unary
- consider_vec_float_abs = consider_vec_arith_unary
- del consider_vec_arith_unary
-
- def consider_vec_logic(self, op):
- lhs = op.getarg(0)
- assert isinstance(lhs, BoxVector)
- size = lhs.item_size
- args = op.getarglist()
- source = self.make_sure_var_in_reg(op.getarg(1), args)
- result = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
- self.perform(op, [source, imm(size)], result)
-
- consider_vec_float_eq = consider_vec_logic
- consider_vec_int_and = consider_vec_logic
- consider_vec_int_or = consider_vec_logic
- consider_vec_int_xor = consider_vec_logic
- del consider_vec_logic
-
- def consider_vec_int_pack(self, op):
- # new_res = vec_int_pack(res, src, index, count)
- arg = op.getarg(1)
- index = op.getarg(2)
- count = op.getarg(3)
- assert isinstance(index, ConstInt)
- assert isinstance(count, ConstInt)
- args = op.getarglist()
- srcloc = self.make_sure_var_in_reg(arg, args)
- resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
- residx = index.value # where to put it in result?
- srcidx = 0
- assert isinstance(op.result, BoxVector)
- size = op.result.getsize()
- arglocs = [resloc, srcloc, imm(residx), imm(srcidx), imm(count.value), imm(size)]
- self.perform(op, arglocs, resloc)
-
- consider_vec_float_pack = consider_vec_int_pack
-
- def consider_vec_int_unpack(self, op):
- index = op.getarg(1)
- count = op.getarg(2)
- assert isinstance(index, ConstInt)
- assert isinstance(count, ConstInt)
- args = op.getarglist()
- srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
- if isinstance(op.result, BoxVector):
- resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
- assert isinstance(op.result, BoxVector)
- size = op.result.getsize()
- else:
- # unpack into iX box
- resloc = self.force_allocate_reg(op.result, args)
- arg = op.getarg(0)
- assert isinstance(arg, BoxVector)
- size = arg.getsize()
- residx = 0
- args = op.getarglist()
- arglocs = [resloc, srcloc, imm(residx), imm(index.value), imm(count.value), imm(size)]
- self.perform(op, arglocs, resloc)
-
- consider_vec_float_unpack = consider_vec_int_unpack
-
- def consider_vec_float_expand(self, op):
- result = op.result
- assert isinstance(result, BoxVector)
- arg = op.getarg(0)
- args = op.getarglist()
- if isinstance(arg, Const):
- resloc = self.xrm.force_allocate_reg(result)
- srcloc = self.xrm.expand_float(result.getsize(), arg)
- else:
- resloc = self.xrm.force_result_in_reg(op.result, arg, args)
- srcloc = resloc
-
- size = op.result.getsize()
- self.perform(op, [srcloc, imm(size)], resloc)
-
- def consider_vec_int_expand(self, op):
- arg = op.getarg(0)
- args = op.getarglist()
- if isinstance(arg, Const):
- srcloc = self.rm.convert_to_imm(arg)
- else:
- srcloc = self.make_sure_var_in_reg(arg, args)
- resloc = self.xrm.force_allocate_reg(op.result, args)
- assert isinstance(op.result, BoxVector)
- size = op.result.getsize()
- self.perform(op, [srcloc, imm(size)], resloc)
-
- def consider_vec_int_signext(self, op):
- args = op.getarglist()
- resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
- sizearg = op.getarg(0)
- result = op.result
- assert isinstance(sizearg, BoxVector)
- assert isinstance(result, BoxVector)
- size = sizearg.getsize()
- tosize = result.getsize()
- self.perform(op, [resloc, imm(size), imm(tosize)], resloc)
-
- def consider_vec_box(self, op):
- # pseudo instruction, needed to create a new variable
- self.xrm.force_allocate_reg(op.result)
-
- def consider_guard_early_exit(self, op):
- pass
-
- def consider_vec_cast_float_to_int(self, op):
- args = op.getarglist()
- srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
- resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
- self.perform(op, [srcloc], resloc)
-
- consider_vec_cast_int_to_float = consider_vec_cast_float_to_int
- consider_vec_cast_float_to_singlefloat = consider_vec_cast_float_to_int
- consider_vec_cast_singlefloat_to_float = consider_vec_cast_float_to_int
-
# ________________________________________
def not_implemented_op(self, op):
+ import pdb; pdb.set_trace()
not_implemented("not implemented operation: %s" % op.getopname())
def not_implemented_op_with_guard(self, op, guard_op):
@@ -1699,7 +1520,10 @@
def add_none_argument(fn):
return lambda self, op: fn(self, op, None)
-for name, value in RegAlloc.__dict__.iteritems():
+import itertools
+iterate = itertools.chain(RegAlloc.__dict__.iteritems(),
+ VectorRegallocMixin.__dict__.iteritems())
+for name, value in iterate:
if name.startswith('consider_'):
name = name[len('consider_'):]
num = getattr(rop, name.upper())
diff --git a/rpython/jit/backend/x86/vector_ext.py b/rpython/jit/backend/x86/vector_ext.py
new file mode 100644
--- /dev/null
+++ b/rpython/jit/backend/x86/vector_ext.py
@@ -0,0 +1,610 @@
+import py
+from rpython.jit.metainterp.history import (Box, Const, ConstInt, ConstPtr,
+ ConstFloat, BoxInt, BoxFloat, BoxVector, BoxVectorAccum, INT, REF,
+ FLOAT, VECTOR, TargetToken)
+from rpython.jit.backend.llsupport.descr import (ArrayDescr, CallDescr,
+ unpack_arraydescr, unpack_fielddescr, unpack_interiorfielddescr)
+from rpython.jit.backend.x86.regloc import (FrameLoc, RegLoc, ConstFloatLoc,
+ FloatImmedLoc, ImmedLoc, imm, imm0, imm1, ecx, eax, edx, ebx, esi, edi,
+ ebp, r8, r9, r10, r11, r12, r13, r14, r15, xmm0, xmm1, xmm2, xmm3, xmm4,
+ xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14,
+ X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG, AddressLoc)
+
+def addr_add(reg_or_imm1, reg_or_imm2, offset=0, scale=0):
+ # duplicated for easy migration, def in assembler.py as well
+ return AddressLoc(reg_or_imm1, reg_or_imm2, scale, offset)
+
+class VectorAssemblerMixin(object):
+ _mixin_ = True
+
+ def _guard_vector_true(self, guard_op, loc, zero=False):
+ arg = guard_op.getarg(0)
+ assert isinstance(arg, BoxVector)
+ size = arg.item_size
+ temp = X86_64_XMM_SCRATCH_REG
+ #
+ self.mc.PXOR(temp, temp)
+ # if the vector is not fully packed blend 1s
+ if not arg.fully_packed(self.cpu.vector_register_size):
+ self.mc.PCMPEQQ(temp, temp) # fill with ones
+ select = 0
+ bits_used = (arg.item_count * arg.item_size * 8)
+ index = bits_used // 16
+ while index < 8:
+ select |= (1 << index)
+ index += 1
+ self.mc.PBLENDW_xxi(loc.value, temp.value, select)
+ # reset to zeros
+ self.mc.PXOR(temp, temp)
+
+ self.mc.PCMPEQ(size, loc, temp)
+ self.mc.PCMPEQQ(temp, temp)
+ self.mc.PTEST(loc, temp)
+
+ # vector operations
+ # ________________________________________
+
+ def _accum_update_at_exit(self, fail_locs, fail_args, faildescr, regalloc):
+ """ If accumulation is done in this loop, at the guard exit
+ some vector registers must be adjusted to yield the correct value"""
+ assert regalloc is not None
+ accum_info = faildescr.rd_accum_list
+ while accum_info:
+ pos = accum_info.position
+ loc = fail_locs[pos]
+ assert isinstance(loc, RegLoc)
+ arg = fail_args[pos]
+ if isinstance(arg, BoxVectorAccum):
+ arg = arg.scalar_var
+ assert arg is not None
+ tgtloc = regalloc.force_allocate_reg(arg, fail_args)
+ if accum_info.operation == '+':
+ # reduction using plus
+ self._accum_reduce_sum(arg, loc, tgtloc)
+ elif accum_info.operation == '*':
+ self._accum_reduce_mul(arg, loc, tgtloc)
+ else:
+ not_implemented("accum operator %s not implemented" %
+ (accum_info.operation))
+ fail_locs[pos] = tgtloc
+ regalloc.possibly_free_var(arg)
+ accum_info = accum_info.prev
+
+ def _accum_reduce_mul(self, arg, accumloc, targetloc):
+ scratchloc = X86_64_XMM_SCRATCH_REG
+ self.mov(accumloc, scratchloc)
+ # swap the two elements
+ self.mc.SHUFPD_xxi(scratchloc.value, scratchloc.value, 0x01)
+ self.mc.MULSD(accumloc, scratchloc)
+ if accumloc is not targetloc:
+ self.mov(accumloc, targetloc)
+
+ def _accum_reduce_sum(self, arg, accumloc, targetloc):
+ # Currently the accumulator can ONLY be the biggest
+ # size for X86 -> 64 bit float/int
+ if arg.type == FLOAT:
+ # r = (r[0]+r[1],r[0]+r[1])
+ self.mc.HADDPD(accumloc, accumloc)
+ # upper bits (> 64) are dirty (but does not matter)
+ if accumloc is not targetloc:
+ self.mov(accumloc, targetloc)
+ return
+ elif arg.type == INT:
+ scratchloc = X86_64_SCRATCH_REG
+ self.mc.PEXTRQ_rxi(targetloc.value, accumloc.value, 0)
+ self.mc.PEXTRQ_rxi(scratchloc.value, accumloc.value, 1)
+ self.mc.ADD(targetloc, scratchloc)
+ return
+
+ not_implemented("reduce sum for %s not impl." % arg)
+
+ def genop_vec_getarrayitem_raw(self, op, arglocs, resloc):
+ # considers item scale (raw_load does not)
+ base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
+ scale = get_scale(size_loc.value)
+ src_addr = addr_add(base_loc, ofs_loc, ofs.value, scale)
+ self._vec_load(resloc, src_addr, integer_loc.value,
+ size_loc.value, aligned_loc.value)
+
+ def genop_vec_raw_load(self, op, arglocs, resloc):
+ base_loc, ofs_loc, size_loc, ofs, integer_loc, aligned_loc = arglocs
+ src_addr = addr_add(base_loc, ofs_loc, ofs.value, 0)
+ self._vec_load(resloc, src_addr, integer_loc.value,
+ size_loc.value, aligned_loc.value)
+
+ def _vec_load(self, resloc, src_addr, integer, itemsize, aligned):
+ if integer:
+ if aligned:
+ self.mc.MOVDQA(resloc, src_addr)
+ else:
+ self.mc.MOVDQU(resloc, src_addr)
+ else:
+ if itemsize == 4:
+ self.mc.MOVUPS(resloc, src_addr)
+ elif itemsize == 8:
+ self.mc.MOVUPD(resloc, src_addr)
+
+ def genop_discard_vec_setarrayitem_raw(self, op, arglocs):
+ # considers item scale (raw_store does not)
+ base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, aligned_loc = arglocs
+ scale = get_scale(size_loc.value)
+ dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, scale)
+ self._vec_store(dest_loc, value_loc, integer_loc.value,
+ size_loc.value, aligned_loc.value)
+
+ def genop_discard_vec_raw_store(self, op, arglocs):
+ base_loc, ofs_loc, value_loc, size_loc, baseofs, integer_loc, aligned_loc = arglocs
+ dest_loc = addr_add(base_loc, ofs_loc, baseofs.value, 0)
+ self._vec_store(dest_loc, value_loc, integer_loc.value,
+ size_loc.value, aligned_loc.value)
+
+ def _vec_store(self, dest_loc, value_loc, integer, itemsize, aligned):
+ if integer:
+ if aligned:
+ self.mc.MOVDQA(dest_loc, value_loc)
+ else:
+ self.mc.MOVDQU(dest_loc, value_loc)
+ else:
+ if itemsize == 4:
+ self.mc.MOVUPS(dest_loc, value_loc)
+ elif itemsize == 8:
+ self.mc.MOVUPD(dest_loc, value_loc)
+
+ def genop_vec_int_mul(self, op, arglocs, resloc):
+ loc0, loc1, itemsize_loc = arglocs
+ itemsize = itemsize_loc.value
+ if itemsize == 2:
+ self.mc.PMULLW(loc0, loc1)
+ elif itemsize == 4:
+ self.mc.PMULLD(loc0, loc1)
+ else:
+ # NOTE see http://stackoverflow.com/questions/8866973/can-long-integer-routines-benefit-from-sse/8867025#8867025
+ # There is no 64x64 bit packed mul and I did not find one
+ # for 8 bit either. It is questionable if it gives any benefit
+ # for 8 bit.
+ not_implemented("int8/64 mul")
+
+ def genop_vec_int_add(self, op, arglocs, resloc):
+ loc0, loc1, size_loc = arglocs
+ size = size_loc.value
+ if size == 1:
+ self.mc.PADDB(loc0, loc1)
+ elif size == 2:
+ self.mc.PADDW(loc0, loc1)
+ elif size == 4:
+ self.mc.PADDD(loc0, loc1)
+ elif size == 8:
+ self.mc.PADDQ(loc0, loc1)
+
+ def genop_vec_int_sub(self, op, arglocs, resloc):
+ loc0, loc1, size_loc = arglocs
+ size = size_loc.value
+ if size == 1:
+ self.mc.PSUBB(loc0, loc1)
+ elif size == 2:
+ self.mc.PSUBW(loc0, loc1)
+ elif size == 4:
+ self.mc.PSUBD(loc0, loc1)
+ elif size == 8:
+ self.mc.PSUBQ(loc0, loc1)
+
+ def genop_vec_int_and(self, op, arglocs, resloc):
+ self.mc.PAND(resloc, arglocs[0])
+
+ def genop_vec_int_or(self, op, arglocs, resloc):
+ self.mc.POR(resloc, arglocs[0])
+
+ def genop_vec_int_xor(self, op, arglocs, resloc):
+ self.mc.PXOR(resloc, arglocs[0])
+
+ genop_vec_float_arith = """
+ def genop_vec_float_{type}(self, op, arglocs, resloc):
+ loc0, loc1, itemsize_loc = arglocs
+ itemsize = itemsize_loc.value
+ if itemsize == 4:
+ self.mc.{p_op_s}(loc0, loc1)
+ elif itemsize == 8:
+ self.mc.{p_op_d}(loc0, loc1)
+ """
+ for op in ['add','mul','sub']:
+ OP = op.upper()
+ _source = genop_vec_float_arith.format(type=op,
+ p_op_s=OP+'PS',
+ p_op_d=OP+'PD')
+ exec py.code.Source(_source).compile()
+ del genop_vec_float_arith
+
+ def genop_vec_float_truediv(self, op, arglocs, resloc):
+ loc0, loc1, sizeloc = arglocs
+ size = sizeloc.value
+ if size == 4:
+ self.mc.DIVPS(loc0, loc1)
+ elif size == 8:
+ self.mc.DIVPD(loc0, loc1)
+
+ def genop_vec_float_abs(self, op, arglocs, resloc):
+ src, sizeloc = arglocs
+ size = sizeloc.value
+ if size == 4:
+ self.mc.ANDPS(src, heap(self.single_float_const_abs_addr))
+ elif size == 8:
+ self.mc.ANDPD(src, heap(self.float_const_abs_addr))
+
+ def genop_vec_float_neg(self, op, arglocs, resloc):
+ src, sizeloc = arglocs
+ size = sizeloc.value
+ if size == 4:
+ self.mc.XORPS(src, heap(self.single_float_const_neg_addr))
+ elif size == 8:
+ self.mc.XORPD(src, heap(self.float_const_neg_addr))
+
+ def genop_vec_int_signext(self, op, arglocs, resloc):
+ srcloc, sizeloc, tosizeloc = arglocs
+ size = sizeloc.value
+ tosize = tosizeloc.value
+ if size == tosize:
+ return # already the right size
+ if size == 4 and tosize == 8:
+ scratch = X86_64_SCRATCH_REG.value
+ self.mc.PEXTRD_rxi(scratch, srcloc.value, 1)
+ self.mc.PINSRQ_xri(resloc.value, scratch, 1)
+ self.mc.PEXTRD_rxi(scratch, srcloc.value, 0)
+ self.mc.PINSRQ_xri(resloc.value, scratch, 0)
+ elif size == 8 and tosize == 4:
+ # is there a better sequence to move them?
+ scratch = X86_64_SCRATCH_REG.value
+ self.mc.PEXTRQ_rxi(scratch, srcloc.value, 0)
+ self.mc.PINSRD_xri(resloc.value, scratch, 0)
+ self.mc.PEXTRQ_rxi(scratch, srcloc.value, 1)
+ self.mc.PINSRD_xri(resloc.value, scratch, 1)
+ else:
+ # note that all other conversions are not implemented
+ # on purpose. it needs many x86 op codes to implement
+ # the missing combinations. even if they are implemented
+ # the speedup might only be modest...
+ # the optimization does not emit such code!
+ msg = "vec int signext (%d->%d)" % (size, tosize)
+ not_implemented(msg)
+
+ def genop_vec_float_expand(self, op, arglocs, resloc):
+ srcloc, sizeloc = arglocs
+ size = sizeloc.value
+ if isinstance(srcloc, ConstFloatLoc):
+ # they are aligned!
+ self.mc.MOVAPD(resloc, srcloc)
+ elif size == 4:
+ # the register allocator forces src to be the same as resloc
+ # r = (s[0], s[0], r[0], r[0])
+ # since resloc == srcloc: r = (r[0], r[0], r[0], r[0])
+ self.mc.SHUFPS_xxi(resloc.value, srcloc.value, 0)
+ elif size == 8:
+ self.mc.MOVDDUP(resloc, srcloc)
+ else:
+ raise AssertionError("float of size %d not supported" % (size,))
+
+ def genop_vec_int_expand(self, op, arglocs, resloc):
+ srcloc, sizeloc = arglocs
+ if not isinstance(srcloc, RegLoc):
+ self.mov(srcloc, X86_64_SCRATCH_REG)
+ srcloc = X86_64_SCRATCH_REG
+ assert not srcloc.is_xmm
+ size = sizeloc.value
+ if size == 1:
+ self.mc.PINSRB_xri(resloc.value, srcloc.value, 0)
+ self.mc.PSHUFB(resloc, heap(self.expand_byte_mask_addr))
+ elif size == 2:
+ self.mc.PINSRW_xri(resloc.value, srcloc.value, 0)
+ self.mc.PINSRW_xri(resloc.value, srcloc.value, 4)
+ self.mc.PSHUFLW_xxi(resloc.value, resloc.value, 0)
+ self.mc.PSHUFHW_xxi(resloc.value, resloc.value, 0)
+ elif size == 4:
+ self.mc.PINSRD_xri(resloc.value, srcloc.value, 0)
+ self.mc.PSHUFD_xxi(resloc.value, resloc.value, 0)
+ elif size == 8:
+ self.mc.PINSRQ_xri(resloc.value, srcloc.value, 0)
+ self.mc.PINSRQ_xri(resloc.value, srcloc.value, 1)
+ else:
+ raise AssertionError("cannot handle size %d (int expand)" % (size,))
+
+ def genop_vec_int_pack(self, op, arglocs, resloc):
+ resultloc, sourceloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
+ assert isinstance(resultloc, RegLoc)
+ assert isinstance(sourceloc, RegLoc)
+ size = sizeloc.value
+ srcidx = srcidxloc.value
+ residx = residxloc.value
+ count = countloc.value
+ # for small data type conversion this can be quite costy
+ # NOTE there might be some combinations that can be handled
+ # more efficiently! e.g.
+ # v2 = pack(v0,v1,4,4)
+ si = srcidx
+ ri = residx
+ k = count
+ while k > 0:
+ if size == 8:
+ if resultloc.is_xmm and sourceloc.is_xmm: # both xmm
+ self.mc.PEXTRQ_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
+ self.mc.PINSRQ_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
+ elif resultloc.is_xmm: # xmm <- reg
+ self.mc.PINSRQ_xri(resultloc.value, sourceloc.value, ri)
+ else: # reg <- xmm
+ self.mc.PEXTRQ_rxi(resultloc.value, sourceloc.value, si)
+ elif size == 4:
+ if resultloc.is_xmm and sourceloc.is_xmm:
+ self.mc.PEXTRD_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
+ self.mc.PINSRD_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
+ elif resultloc.is_xmm:
+ self.mc.PINSRD_xri(resultloc.value, sourceloc.value, ri)
+ else:
+ self.mc.PEXTRD_rxi(resultloc.value, sourceloc.value, si)
+ elif size == 2:
+ if resultloc.is_xmm and sourceloc.is_xmm:
+ self.mc.PEXTRW_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
+ self.mc.PINSRW_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
+ elif resultloc.is_xmm:
+ self.mc.PINSRW_xri(resultloc.value, sourceloc.value, ri)
+ else:
+ self.mc.PEXTRW_rxi(resultloc.value, sourceloc.value, si)
+ elif size == 1:
+ if resultloc.is_xmm and sourceloc.is_xmm:
+ self.mc.PEXTRB_rxi(X86_64_SCRATCH_REG.value, sourceloc.value, si)
+ self.mc.PINSRB_xri(resultloc.value, X86_64_SCRATCH_REG.value, ri)
+ elif resultloc.is_xmm:
+ self.mc.PINSRB_xri(resultloc.value, sourceloc.value, ri)
+ else:
+ self.mc.PEXTRB_rxi(resultloc.value, sourceloc.value, si)
+ si += 1
+ ri += 1
+ k -= 1
+
+ genop_vec_int_unpack = genop_vec_int_pack
+
+ def genop_vec_float_pack(self, op, arglocs, resultloc):
+ resloc, srcloc, residxloc, srcidxloc, countloc, sizeloc = arglocs
+ assert isinstance(resloc, RegLoc)
+ assert isinstance(srcloc, RegLoc)
+ count = countloc.value
+ residx = residxloc.value
+ srcidx = srcidxloc.value
+ size = sizeloc.value
+ if size == 4:
+ si = srcidx
+ ri = residx
+ k = count
+ while k > 0:
+ if resloc.is_xmm:
+ src = srcloc.value
+ if not srcloc.is_xmm:
+ # if source is a normal register (unpack)
+ assert count == 1
+ assert si == 0
+ self.mov(srcloc, X86_64_XMM_SCRATCH_REG)
+ src = X86_64_XMM_SCRATCH_REG.value
+ select = ((si & 0x3) << 6)|((ri & 0x3) << 4)
+ self.mc.INSERTPS_xxi(resloc.value, src, select)
+ else:
+ self.mc.PEXTRD_rxi(resloc.value, srcloc.value, si)
+ si += 1
+ ri += 1
+ k -= 1
+ elif size == 8:
+ assert resloc.is_xmm
+ if srcloc.is_xmm:
+ if srcidx == 0:
+ if residx == 0:
+ # r = (s[0], r[1])
+ self.mc.MOVSD(resloc, srcloc)
+ else:
+ assert residx == 1
+ # r = (r[0], s[0])
+ self.mc.UNPCKLPD(resloc, srcloc)
+ else:
+ assert srcidx == 1
+ if residx == 0:
+ # r = (s[1], r[1])
+ if resloc != srcloc:
+ self.mc.UNPCKHPD(resloc, srcloc)
+ self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
+ else:
+ assert residx == 1
+ # r = (r[0], s[1])
+ if resloc != srcloc:
+ self.mc.SHUFPD_xxi(resloc.value, resloc.value, 1)
+ self.mc.UNPCKHPD(resloc, srcloc)
+ # if they are equal nothing is to be done
+
+ genop_vec_float_unpack = genop_vec_float_pack
+
+ def genop_vec_cast_float_to_singlefloat(self, op, arglocs, resloc):
+ self.mc.CVTPD2PS(resloc, arglocs[0])
+
+ def genop_vec_cast_float_to_int(self, op, arglocs, resloc):
+ self.mc.CVTPD2DQ(resloc, arglocs[0])
+
+ def genop_vec_cast_int_to_float(self, op, arglocs, resloc):
+ self.mc.CVTDQ2PD(resloc, arglocs[0])
+
+ def genop_vec_cast_singlefloat_to_float(self, op, arglocs, resloc):
+ self.mc.CVTPS2PD(resloc, arglocs[0])
+
+class VectorRegallocMixin(object):
+ _mixin_ = True
+
+ def consider_vec_getarrayitem_raw(self, op):
+ descr = op.getdescr()
+ assert isinstance(descr, ArrayDescr)
+ assert not descr.is_array_of_pointers() and \
+ not descr.is_array_of_structs()
+ itemsize, ofs, _ = unpack_arraydescr(descr)
+ integer = not (descr.is_array_of_floats() or descr.getconcrete_type() == FLOAT)
+ aligned = False
+ args = op.getarglist()
+ base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
+ ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
+ result_loc = self.force_allocate_reg(op.result)
+ self.perform(op, [base_loc, ofs_loc, imm(itemsize), imm(ofs),
+ imm(integer), imm(aligned)], result_loc)
+
+ consider_vec_raw_load = consider_vec_getarrayitem_raw
+
+ def consider_vec_setarrayitem_raw(self, op):
+ descr = op.getdescr()
+ assert isinstance(descr, ArrayDescr)
+ assert not descr.is_array_of_pointers() and \
+ not descr.is_array_of_structs()
+ itemsize, ofs, _ = unpack_arraydescr(descr)
+ args = op.getarglist()
+ base_loc = self.rm.make_sure_var_in_reg(op.getarg(0), args)
+ value_loc = self.make_sure_var_in_reg(op.getarg(2), args)
+ ofs_loc = self.rm.make_sure_var_in_reg(op.getarg(1), args)
+
+ integer = not (descr.is_array_of_floats() or descr.getconcrete_type() == FLOAT)
+ aligned = False
+ self.perform_discard(op, [base_loc, ofs_loc, value_loc,
+ imm(itemsize), imm(ofs), imm(integer), imm(aligned)])
+
+ consider_vec_raw_store = consider_vec_setarrayitem_raw
+
+ def consider_vec_arith(self, op):
+ lhs = op.getarg(0)
+ assert isinstance(lhs, BoxVector)
+ size = lhs.item_size
+ args = op.getarglist()
+ loc1 = self.make_sure_var_in_reg(op.getarg(1), args)
+ loc0 = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ self.perform(op, [loc0, loc1, imm(size)], loc0)
+
+ consider_vec_int_add = consider_vec_arith
+ consider_vec_int_sub = consider_vec_arith
+ consider_vec_int_mul = consider_vec_arith
+ consider_vec_float_add = consider_vec_arith
+ consider_vec_float_sub = consider_vec_arith
+ consider_vec_float_mul = consider_vec_arith
+ consider_vec_float_truediv = consider_vec_arith
+ del consider_vec_arith
+
+ def consider_vec_arith_unary(self, op):
+ lhs = op.getarg(0)
+ assert isinstance(lhs, BoxVector)
+ size = lhs.item_size
+ args = op.getarglist()
+ res = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ self.perform(op, [res, imm(size)], res)
+
+ consider_vec_float_neg = consider_vec_arith_unary
+ consider_vec_float_abs = consider_vec_arith_unary
+ del consider_vec_arith_unary
+
+ def consider_vec_logic(self, op):
+ lhs = op.getarg(0)
+ assert isinstance(lhs, BoxVector)
+ size = lhs.item_size
+ args = op.getarglist()
+ source = self.make_sure_var_in_reg(op.getarg(1), args)
+ result = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ self.perform(op, [source, imm(size)], result)
+
+ consider_vec_float_eq = consider_vec_logic
+ consider_vec_int_and = consider_vec_logic
+ consider_vec_int_or = consider_vec_logic
+ consider_vec_int_xor = consider_vec_logic
+ del consider_vec_logic
+
+ def consider_vec_int_pack(self, op):
+ # new_res = vec_int_pack(res, src, index, count)
+ arg = op.getarg(1)
+ index = op.getarg(2)
+ count = op.getarg(3)
+ assert isinstance(index, ConstInt)
+ assert isinstance(count, ConstInt)
+ args = op.getarglist()
+ srcloc = self.make_sure_var_in_reg(arg, args)
+ resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ residx = index.value # where to put it in result?
+ srcidx = 0
+ assert isinstance(op.result, BoxVector)
+ size = op.result.getsize()
+ arglocs = [resloc, srcloc, imm(residx), imm(srcidx), imm(count.value), imm(size)]
+ self.perform(op, arglocs, resloc)
+
+ consider_vec_float_pack = consider_vec_int_pack
+
+ def consider_vec_int_unpack(self, op):
+ index = op.getarg(1)
+ count = op.getarg(2)
+ assert isinstance(index, ConstInt)
+ assert isinstance(count, ConstInt)
+ args = op.getarglist()
+ srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
+ if isinstance(op.result, BoxVector):
+ resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ assert isinstance(op.result, BoxVector)
+ size = op.result.getsize()
+ else:
+ # unpack into iX box
+ resloc = self.force_allocate_reg(op.result, args)
+ arg = op.getarg(0)
+ assert isinstance(arg, BoxVector)
+ size = arg.getsize()
+ residx = 0
+ args = op.getarglist()
+ arglocs = [resloc, srcloc, imm(residx), imm(index.value), imm(count.value), imm(size)]
+ self.perform(op, arglocs, resloc)
+
+ consider_vec_float_unpack = consider_vec_int_unpack
+
+ def consider_vec_float_expand(self, op):
+ result = op.result
+ assert isinstance(result, BoxVector)
+ arg = op.getarg(0)
+ args = op.getarglist()
+ if isinstance(arg, Const):
+ resloc = self.xrm.force_allocate_reg(result)
+ srcloc = self.xrm.expand_float(result.getsize(), arg)
+ else:
+ resloc = self.xrm.force_result_in_reg(op.result, arg, args)
+ srcloc = resloc
+
+ size = op.result.getsize()
+ self.perform(op, [srcloc, imm(size)], resloc)
+
+ def consider_vec_int_expand(self, op):
+ arg = op.getarg(0)
+ args = op.getarglist()
+ if isinstance(arg, Const):
+ srcloc = self.rm.convert_to_imm(arg)
+ else:
+ srcloc = self.make_sure_var_in_reg(arg, args)
+ resloc = self.xrm.force_allocate_reg(op.result, args)
+ assert isinstance(op.result, BoxVector)
+ size = op.result.getsize()
+ self.perform(op, [srcloc, imm(size)], resloc)
+
+ def consider_vec_int_signext(self, op):
+ args = op.getarglist()
+ resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ sizearg = op.getarg(0)
+ result = op.result
+ assert isinstance(sizearg, BoxVector)
+ assert isinstance(result, BoxVector)
+ size = sizearg.getsize()
+ tosize = result.getsize()
+ self.perform(op, [resloc, imm(size), imm(tosize)], resloc)
+
+ def consider_vec_box(self, op):
+ # pseudo instruction, needed to create a new variable
+ self.xrm.force_allocate_reg(op.result)
+
+ def consider_guard_early_exit(self, op):
+ pass
+
+ def consider_vec_cast_float_to_int(self, op):
+ args = op.getarglist()
+ srcloc = self.make_sure_var_in_reg(op.getarg(0), args)
+ resloc = self.xrm.force_result_in_reg(op.result, op.getarg(0), args)
+ self.perform(op, [srcloc], resloc)
+
+ consider_vec_cast_int_to_float = consider_vec_cast_float_to_int
+ consider_vec_cast_float_to_singlefloat = consider_vec_cast_float_to_int
+ consider_vec_cast_singlefloat_to_float = consider_vec_cast_float_to_int
diff --git a/rpython/jit/metainterp/compile.py b/rpython/jit/metainterp/compile.py
--- a/rpython/jit/metainterp/compile.py
+++ b/rpython/jit/metainterp/compile.py
@@ -163,10 +163,17 @@
return None
loop.operations = loop.operations[:-1] + part.operations
+ loop.versions = part.versions
if part.quasi_immutable_deps:
loop.quasi_immutable_deps.update(part.quasi_immutable_deps)
assert part.operations[-1].getopnum() != rop.LABEL
+ if loop.versions is not None:
+ # several different loop versions have been generated
+ for version in loop.versions:
+ token = version.update_token(jitcell_token)
+ all_target_tokens.append(token)
+
if not loop.quasi_immutable_deps:
loop.quasi_immutable_deps = None
for box in loop.inputargs:
@@ -181,8 +188,21 @@
propagate_original_jitcell_token(loop)
send_loop_to_backend(greenkey, jitdriver_sd, metainterp_sd, loop, "loop")
record_loop_or_bridge(metainterp_sd, loop)
+
+ generate_pending_loop_versions(loop, jitdriver_sd, metainterp_sd, jitcell_token)
+
return all_target_tokens[0]
+def generate_pending_loop_versions(loop, jitdriver_sd, metainterp_sd, jitcell_token):
+ if loop.versions is not None:
+ token = jitcell_token
+ for version in loop.versions:
+ version.update_inputargs()
+ for faildescr in version.faildescrs:
+ send_bridge_to_backend(jitdriver_sd, metainterp_sd,
+ faildescr, version.inputargs,
+ version.operations, jitcell_token)
+
def compile_retrace(metainterp, greenkey, start,
inputargs, jumpargs,
partial_trace, resumekey, start_state):
@@ -689,6 +709,16 @@
class ResumeAtLoopHeaderDescr(ResumeGuardDescr):
guard_opnum = rop.GUARD_EARLY_EXIT
+class CompileLoopVersionDescr(ResumeGuardDescr):
+ guard_opnum = rop.GUARD_EARLY_EXIT
+
+ operations = None
+ inputargs = None
+ faillocs = None
+
+ def handle_fail(self, deadframe, metainterp_sd, jitdriver_sd):
+ assert 0, "this guard must never fail"
+
class AllVirtuals:
llopaque = True
cache = None
diff --git a/rpython/jit/metainterp/history.py b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -695,12 +695,49 @@
def repr_of_descr(self):
return 'TargetToken(%d)' % compute_unique_id(self)
+class LoopVersion(object):
+
+ def __init__(self, loop, aligned=False):
+ self.operations = loop.operations
+ self.aligned = aligned
+ self.faildescrs = []
+ #
+ label = self.operations[0]
+ assert label.getopnum() == rop.LABEL
+ self.enter_args = label.getarglist()
+ self.calling_args = None
+ self.inputargs = None
+
+ def adddescr(self, descr):
+ self.faildescrs.append(descr)
+
+ def update_token(self, jitcell_token):
+ label = self.operations[0]
+ jump = self.operations[-1]
+ #
+ assert label.getopnum() == rop.LABEL
+ assert jump.getopnum() == rop.JUMP
+ #
+ token = TargetToken(jitcell_token)
+ token.original_jitcell_token = jitcell_token
+ label.setdescr(token)
+ jump.setdescr(token)
+ return token
+
+ def update_inputargs(self):
+ assert len(self.enter_args) == len(self.inputargs)
+ rename = { a: b for a,b in zip(self.enter_args, self.calling_args) }
+ for i, arg in enumerate(self.inputargs):
+ self.inputargs[i] = rename[arg]
+
+
class TreeLoop(object):
inputargs = None
operations = None
call_pure_results = None
logops = None
quasi_immutable_deps = None
+ versions = None
def _token(*args):
raise Exception("TreeLoop.token is killed")
@@ -817,6 +854,7 @@
def __repr__(self):
return '<%s>' % (self.name,)
+
def _list_all_operations(result, operations, omit_finish=True):
if omit_finish and operations[-1].getopnum() == rop.FINISH:
# xxx obscure
diff --git a/rpython/jit/metainterp/optimizeopt/dependency.py b/rpython/jit/metainterp/optimizeopt/dependency.py
--- a/rpython/jit/metainterp/optimizeopt/dependency.py
+++ b/rpython/jit/metainterp/optimizeopt/dependency.py
@@ -104,26 +104,6 @@
def can_be_relaxed(self):
return self.op.getopnum() in (rop.GUARD_TRUE, rop.GUARD_FALSE)
- def relax_guard_to(self, guard):
- """ Relaxes a guard operation to an earlier guard. """
- # clone this operation object. if the vectorizer is
- # not able to relax guards, it won't leave behind a modified operation
- tgt_op = self.getoperation().clone()
- self.op = tgt_op
-
- op = guard.getoperation()
- assert isinstance(tgt_op, GuardResOp)
- assert isinstance(op, GuardResOp)
- olddescr = op.getdescr()
- descr = compile.ResumeAtLoopHeaderDescr()
- if olddescr:
- descr.copy_all_attributes_from(olddescr)
- #
- tgt_op.setdescr(descr)
- tgt_op.rd_snapshot = op.rd_snapshot
- #if not we_are_translated():
- tgt_op.setfailargs(op.getfailargs())
-
def edge_to(self, to, arg=None, failarg=False, label=None):
if self is to:
return
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -11,10 +11,11 @@
from rpython.jit.metainterp.resume import Snapshot
from rpython.jit.metainterp.jitexc import NotAVectorizeableLoop, NotAProfitableLoop
from rpython.jit.metainterp.optimizeopt.unroll import optimize_unroll
-from rpython.jit.metainterp.compile import ResumeAtLoopHeaderDescr, invent_fail_descr_for_op
+from rpython.jit.metainterp.compile import (ResumeAtLoopHeaderDescr,
+ CompileLoopVersionDescr, invent_fail_descr_for_op)
from rpython.jit.metainterp.history import (ConstInt, VECTOR, FLOAT, INT,
BoxVector, BoxFloat, BoxInt, ConstFloat, TargetToken, JitCellToken, Box,
- BoxVectorAccum)
+ BoxVectorAccum, LoopVersion)
from rpython.jit.metainterp.optimizeopt.optimizer import Optimizer, Optimization
from rpython.jit.metainterp.optimizeopt.util import make_dispatcher_method, Renamer
from rpython.jit.metainterp.optimizeopt.dependency import (DependencyGraph,
@@ -50,6 +51,7 @@
optimize_unroll(metainterp_sd, jitdriver_sd, loop, optimizations,
inline_short_preamble, start_state, False)
orig_ops = loop.operations
+ orig_version = LoopVersion(loop)
if len(orig_ops) >= 75:
# if more than 75 operations are present in this loop,
# it won't be possible to vectorize. There are too many
@@ -62,11 +64,16 @@
metainterp_sd.logger_noopt.log_loop(loop.inputargs, loop.operations,
-2, None, None, "pre vectorize")
metainterp_sd.profiler.count(Counters.OPT_VECTORIZE_TRY)
start = time.clock()
- opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, cost_threshold)
+ opt = VectorizingOptimizer(metainterp_sd, jitdriver_sd, loop, cost_threshold, orig_version)
opt.propagate_all_forward()
gso = GuardStrengthenOpt(opt.dependency_graph.index_vars)
gso.propagate_all_forward(opt.loop)
end = time.clock()
+
+ aligned_vector_version = LoopVersion(loop, aligned=True)
+
+ loop.versions = [orig_version] #, aligned_vector_version]
+
metainterp_sd.profiler.count(Counters.OPT_VECTORIZED)
metainterp_sd.logger_noopt.log_loop(loop.inputargs, loop.operations,
-2, None, None, "post vectorize")
debug_stop("vec-opt-loop")
@@ -107,8 +114,9 @@
class VectorizingOptimizer(Optimizer):
""" Try to unroll the loop and find instructions to group """
- def __init__(self, metainterp_sd, jitdriver_sd, loop, cost_threshold=0):
+ def __init__(self, metainterp_sd, jitdriver_sd, loop, cost_threshold, orig_loop_version):
Optimizer.__init__(self, metainterp_sd, jitdriver_sd, loop, [])
+ self.orig_loop_version = orig_loop_version
self.dependency_graph = None
self.packset = None
self.unroll_count = 0
@@ -188,6 +196,8 @@
self.emit_unrolled_operation(label_op)
+ self.orig_loop_version.calling_args = label_op.getarglist()
+
renamer = Renamer()
oi = 0
pure = True
@@ -247,7 +257,7 @@
assert isinstance(copied_op, GuardResOp)
target_guard = copied_op
# do not overwrite resume at loop header
- if not isinstance(target_guard.getdescr(), ResumeAtLoopHeaderDescr):
+ if target_guard.getdescr().guard_opnum != rop.GUARD_EARLY_EXIT:
descr = invent_fail_descr_for_op(copied_op.getopnum(),
self)
olddescr = copied_op.getdescr()
if olddescr:
@@ -573,7 +583,29 @@
label_node.edge_to(last_but_one, label='pullup')
# only the last guard needs a connection
guard_node.edge_to(ee_guard_node, label='pullup-last-guard')
- guard_node.relax_guard_to(ee_guard_node)
+ self.relax_guard_to(guard_node, ee_guard_node)
+
+ def relax_guard_to(self, guard_node, other_node):
+ """ Relaxes a guard operation to an earlier guard. """
+ # clone this operation object. if the vectorizer is
+ # not able to relax guards, it won't leave behind a modified operation
+ tgt_op = guard_node.getoperation().clone()
+ guard_node.op = tgt_op
+
+ op = other_node.getoperation()
+ assert isinstance(tgt_op, GuardResOp)
+ assert isinstance(op, GuardResOp)
+ olddescr = op.getdescr()
+ descr = CompileLoopVersionDescr()
+ if olddescr:
+ descr.copy_all_attributes_from(olddescr)
+ self.orig_loop_version.inputargs = op.getfailargs()
+ self.orig_loop_version.adddescr(descr)
+ #
+ tgt_op.setdescr(descr)
+ tgt_op.rd_snapshot = op.rd_snapshot
+ tgt_op.setfailargs(op.getfailargs())
+
class CostModel(object):
def __init__(self, threshold, vec_reg_size):
@@ -754,17 +786,6 @@
del self.packs[j]
return len(self.packs)
- # OLD
- # instead of deleting an item in the center of pack array,
- # the last element is assigned to position j and
- # the last slot is freed. Order of packs doesn't matter
- #last_pos = len(self.packs) - 1
- #if j == last_pos:
- # del self.packs[j]
- #else:
- # self.packs[j] = self.packs[last_pos]
- # del self.packs[last_pos]
- #return last_pos
def accumulates_pair(self, lnode, rnode, origin_pack):
# lnode and rnode are isomorphic and dependent
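
The second half of the changeset tags relaxed guards with a
CompileLoopVersionDescr, skips their quick-failure stubs in the backend, and
after the vectorized loop is assembled compiles the recorded original
(non-vectorized) trace as a bridge for each such descr
(generate_pending_loop_versions). A rough, self-contained toy model of that
flow; except for the names appearing in the patch, the helpers below are made
up for illustration:

class CompileLoopVersionDescr(object):
    # a guard carrying this descr gets no recovery stub; its jump target is
    # patched to point at a separately compiled loop version instead
    def __init__(self):
        self.bridge = None

class LoopVersion(object):
    def __init__(self, operations):
        self.operations = operations   # snapshot of the un-vectorized trace
        self.faildescrs = []           # guards that must fall back to it

def generate_pending_loop_versions(version, compile_bridge):
    # called after the vectorized loop has been sent to the backend
    for descr in version.faildescrs:
        descr.bridge = compile_bridge(version.operations)

# usage: the vectorizer snapshots the scalar trace and tags a relaxed guard,
# then the scalar bridge is compiled once the vector loop exists
scalar_trace = ['label', 'int_add', 'guard_true', 'jump']
version = LoopVersion(scalar_trace)
descr = CompileLoopVersionDescr()
version.faildescrs.append(descr)
generate_pending_loop_versions(version, compile_bridge=lambda ops: list(ops))
assert descr.bridge == scalar_trace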