Author: Richard Plangger <planri...@gmail.com>
Branch: zarch-simd-support
Changeset: r87105:4802a50159bf
Date: 2016-09-14 12:06 +0200
http://bitbucket.org/pypy/pypy/changeset/4802a50159bf/

Log:    load, store, add, subtract vector versions

diff --git a/rpython/jit/backend/zarch/detect_feature.py b/rpython/jit/backend/zarch/detect_feature.py
--- a/rpython/jit/backend/zarch/detect_feature.py
+++ b/rpython/jit/backend/zarch/detect_feature.py
@@ -5,18 +5,16 @@
 from rpython.rtyper.tool import rffi_platform
 from rpython.rlib.rmmap import alloc, free
 from rpython.rlib.rstruct.runpack import runpack
-from rpython.translator.platform.arch.s390x import s390x_cpu_revision
+from rpython.translator.platform.arch.s390x import s390x_detect_vx
 
 SYSTEM = platform.system()
 
 def detect_simd_z_linux():
-    return False
+    return s390x_detect_vx()
 
 def detect_simd_z():
     if SYSTEM == 'Linux':
-        machine = s390x_cpu_revision()
-        if machine == "z13":
-            return True
+        return detect_simd_z_linux()
     return False
 
 if __name__ == '__main__':
diff --git a/rpython/jit/backend/zarch/instruction_builder.py b/rpython/jit/backend/zarch/instruction_builder.py
--- a/rpython/jit/backend/zarch/instruction_builder.py
+++ b/rpython/jit/backend/zarch/instruction_builder.py
@@ -5,7 +5,7 @@
 
 def dummy_argument(arg):
     """ NOT_RPYTHON """
-    if arg in ('r', 'r/m', 'm', 'f', '-', 'eo'):
+    if arg in ('r', 'r/m', 'm', 'f', '-', 'eo', 'v'):
         return 0
     if arg.startswith('i') or arg.startswith('u'):
         return 0
@@ -23,6 +23,7 @@
     - - unused
     f - floating point register
     r - register
+    v - vector register (128 bit)
     m - mask
     eo - even odd pair (= the even register)
     r/m - register or mask
@@ -447,7 +448,6 @@
     return encode_rxe
 
 def build_ris(mnemonic, (opcode1,opcode2), argtypes='r,i8,r/m,bd'):
-    br = is_branch_relative(mnemonic)
     @builder.arguments(argtypes)
     def encode_rie_c(self, reg1, imm8, mask, basedisp):
         self.writechar(opcode1)
@@ -459,6 +459,50 @@
         self.writechar(opcode2)
     return encode_rie_c
 
+def build_vrx(mnemonic, (opcode1,opcode2), argtypes='v,bid,m'):
+    @builder.arguments(argtypes)
+    def encode_vrx(self, v1, bid, mask=0):
+        self.writechar(opcode1)
+        rbx = (v1 >= 16) << 3  # extension bit: v1 >= 16 does not fit in the 4 bit field
+        idx = bid.index
+        byte = (v1 & BIT_MASK_4) << 4 | (idx & BIT_MASK_4)
+        self.writechar(chr(byte))
+        encode_base_displace(self, bid)
+        self.writechar(chr(((mask & BIT_MASK_4) << 4) | (rbx & BIT_MASK_4)))  # mask: high nibble, rbx: low nibble
+        self.writechar(opcode2)
+    return encode_vrx
+
+def build_vrr_a(mnemonic, (opcode1,opcode2), argtypes='v,v'):
+    @builder.arguments(argtypes)
+    def encode_vrr_a(self, v1, v2):
+        self.writechar(opcode1)
+        rbx = (v1 >= 16) << 3  # one extension bit per vector operand
+        rbx |= (v2 >= 16) << 2
+        byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)
+        self.writechar(chr(byte))
+        self.writechar(chr(0))
+        self.writechar(chr(0))
+        self.writechar(chr(rbx & BIT_MASK_4))
+        self.writechar(opcode2)
+    return encode_vrr_a
+
+def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m'):
+    @builder.arguments(argtypes)
+    def encode_vrr_c(self, v1, v2, v3, mask1, mask2):
+        self.writechar(opcode1)
+        rbx = (v1 >= 16) << 3  # one extension bit per vector operand
+        rbx |= (v2 >= 16) << 2
+        rbx |= (v3 >= 16) << 1
+        byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)
+        self.writechar(chr(byte))
+        byte = (v3 & BIT_MASK_4) << 4
+        self.writechar(chr(byte))
+        self.writechar(chr(mask2 & BIT_MASK_4))
+        self.writechar(chr((mask1 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))
+        self.writechar(opcode2)
+    return encode_vrr_c
+
+
 def build_unpack_func(mnemonic, func):
     @always_inline
     def check_arg_type(arg, type):
@@ -502,7 +546,8 @@
         if argtype == '-':
             return 0
         elif argtype == 'r' or argtype == 'r/m' or \
-             argtype == 'f' or argtype == 'eo':
+             argtype == 'f' or argtype == 'eo' or \
+             argtype == 'v':
             return arg.value
         elif argtype.startswith('i') or argtype.startswith('u') or argtype.startswith('h'):
             return arg.value
diff --git a/rpython/jit/backend/zarch/instructions.py b/rpython/jit/backend/zarch/instructions.py
--- a/rpython/jit/backend/zarch/instructions.py
+++ b/rpython/jit/backend/zarch/instructions.py
@@ -292,11 +292,26 @@
     'STFLE': ('s', ['\xB2','\xB0']),
 }
 
+
+vector_mnemonic_codes = {
+    'VL': ('vrx', ['\xE7','\x06'], 'v,bid'),
+    'VLR': ('vrr_a', ['\xE7','\x56']),
+
+    'VST': ('vrx', ['\xE7','\x0E'], 'v,bid'),
+
+    # floating point
+    'VFA': ('vrr_c', ['\xE7','\xE3']),
+    'VFS': ('vrr_c', ['\xE7','\xE2']),
+
+    # '': ('', ['','']),
+}
+
 all_mnemonic_codes.update(arith_mnemonic_codes)
 all_mnemonic_codes.update(logic_mnemonic_codes)
 all_mnemonic_codes.update(memory_mnemonic_codes)
 all_mnemonic_codes.update(floatingpoint_mnemonic_codes)
 all_mnemonic_codes.update(branch_mnemonic_codes)
+all_mnemonic_codes.update(vector_mnemonic_codes)
 
 
 if __name__ == "__main__":
diff --git a/rpython/jit/backend/zarch/locations.py b/rpython/jit/backend/zarch/locations.py
--- a/rpython/jit/backend/zarch/locations.py
+++ b/rpython/jit/backend/zarch/locations.py
@@ -107,6 +107,26 @@
     def is_float(self):
         return True
 
+class VectorRegisterLocation(RegisterLocation):
+    _immutable_ = True
+    type = FLOAT
+    width = DOUBLE_WORD*2
+
+    def __repr__(self):
+        return 'v%d' % self.value
+
+    def is_core_reg(self):
+        return False
+
+    def is_fp_reg(self):
+        return True
+
+    def as_key(self): # 48 <= as_key <= 63
+        return self.value + 32
+
+    def is_float(self):
+        return True
+
 class ImmLocation(AssemblerLocation):
     _immutable_ = True
     width = WORD
@@ -176,6 +196,9 @@
         if length:
             self.length = length.value
 
+    def __repr__(self):
+        return 'addr(base=r%d,idx=r%d,len=%d)' % (self.base, self.index, self.length)
+
 class PoolLoc(AddressLocation):
     _immutable_ = True
     width = WORD
diff --git a/rpython/jit/backend/zarch/regalloc.py b/rpython/jit/backend/zarch/regalloc.py
--- a/rpython/jit/backend/zarch/regalloc.py
+++ b/rpython/jit/backend/zarch/regalloc.py
@@ -109,6 +109,46 @@
         self.temp_boxes.append(box)
         return reg
 
+class VectorRegisterManager(RegisterManager):
+    all_regs = r.MANAGED_VECTOR_REGS
+    box_types = [FLOAT, INT]
+    save_around_call_regs = [] # calling not allowed in vectorized traces!
+    assert set(save_around_call_regs).issubset(all_regs)
+    pool = None
+
+    def __init__(self, longevity, frame_manager=None, assembler=None):
+        RegisterManager.__init__(self, longevity, frame_manager, assembler)
+
+    def call_result_location(self, v):
+        return None
+
+    def convert_to_imm(self, c):
+        return l.pool(self.assembler.pool.get_offset(c), float=True)
+
+    def ensure_reg_or_pool(self, box):
+        if isinstance(box, Const):
+            offset = self.assembler.pool.get_offset(box)
+            return l.pool(offset, float=True)
+        else:
+            assert box in self.temp_boxes
+            loc = self.make_sure_var_in_reg(box,
+                    forbidden_vars=self.temp_boxes)
+            return loc
+
+    def ensure_reg(self, box):
+        assert box in self.temp_boxes
+        loc = self.make_sure_var_in_reg(box,
+                forbidden_vars=self.temp_boxes)
+        return loc
+
+    def get_scratch_reg(self, selected_reg=None):
+        # TODO
+        box = TempFloat()
+        reg = self.force_allocate_reg(box, forbidden_vars=self.temp_boxes, selected_reg=selected_reg)
+        self.temp_boxes.append(box)
+        return reg
+
+
 class ZARCHRegisterManager(RegisterManager):
     all_regs = r.MANAGED_REGS
 
@@ -389,8 +429,9 @@
         assert isinstance(loc, l.StackLocation)
         return loc.position
 
+from rpython.jit.backend.zarch import vector_ext
 
-class Regalloc(BaseRegalloc):
+class Regalloc(BaseRegalloc, vector_ext.VectorRegalloc):
 
     def __init__(self, assembler=None):
         self.cpu = assembler.cpu
@@ -415,6 +456,9 @@
         self.fprm = FPRegisterManager(self.longevity, frame_manager = self.fm,
                                       assembler = self.assembler)
         self.fprm.pool = self.assembler.pool
+        self.vrm = VectorRegisterManager(self.longevity, frame_manager = self.fm,
+                                         assembler = self.assembler)
+        self.vrm.pool = self.assembler.pool
         return operations
 
     def prepare_loop(self, inputargs, operations, looptoken, allgcrefs):
@@ -470,13 +514,17 @@
         self.fm.finish_binding()
         self.rm._check_invariants()
         self.fprm._check_invariants()
+        self.vrm._check_invariants()
 
     def get_final_frame_depth(self):
         return self.fm.get_frame_depth()
 
     def possibly_free_var(self, var):
         if var is not None:
-            if var.type == FLOAT:
+            if var.is_vector():
+                if var.type != VOID:
+                    self.vrm.possibly_free_var(var)
+            elif var.type == FLOAT:
                 self.fprm.possibly_free_var(var)
             else:
                 self.rm.possibly_free_var(var)
@@ -533,6 +581,7 @@
             self.assembler.mc.mark_op(op)
             self.rm.position = i
             self.fprm.position = i
+            self.vrm.position = i
             opnum = op.getopnum()
             if rop.has_no_side_effect(opnum) and op not in self.longevity:
                 i += 1
@@ -541,7 +590,10 @@
             #
             for j in range(op.numargs()):
                 box = op.getarg(j)
-                if box.type != FLOAT:
+                if box.is_vector():
+                    if box.type != VOID:
+                        self.vrm.temp_boxes.append(box)
+                elif box.type != FLOAT:
                     self.rm.temp_boxes.append(box)
                 else:
                     self.fprm.temp_boxes.append(box)
@@ -555,6 +607,7 @@
             self.possibly_free_var(op)
             self.rm._check_invariants()
             self.fprm._check_invariants()
+            self.vrm._check_invariants()
             if self.assembler.mc.get_relative_pos() > self.limit_loop_break:
                 self.assembler.break_long_loop()
                 self.limit_loop_break = (self.assembler.mc.get_relative_pos() +
@@ -562,6 +615,7 @@
             i += 1
         assert not self.rm.reg_bindings
         assert not self.fprm.reg_bindings
+        assert not self.vrm.reg_bindings
         self.flush_loop()
         self.assembler.mc.mark_op(None) # end of the loop
         self.operations = None
@@ -677,6 +731,7 @@
         # temporary boxes and all the current operation's arguments
         self.rm.free_temp_vars()
         self.fprm.free_temp_vars()
+        self.vrm.free_temp_vars()
 
     def compute_hint_frame_locations(self, operations):
         # optimization only: fill in the 'hint_frame_locations' dictionary
diff --git a/rpython/jit/backend/zarch/registers.py b/rpython/jit/backend/zarch/registers.py
--- a/rpython/jit/backend/zarch/registers.py
+++ b/rpython/jit/backend/zarch/registers.py
@@ -1,8 +1,9 @@
-from rpython.jit.backend.zarch.locations import FloatRegisterLocation
-from rpython.jit.backend.zarch.locations import RegisterLocation
+from rpython.jit.backend.zarch.locations import (FloatRegisterLocation,
+    RegisterLocation, VectorRegisterLocation)
 
 registers = [RegisterLocation(i) for i in range(16)]
 fpregisters = [FloatRegisterLocation(i) for i in range(16)]
+vregisters = [VectorRegisterLocation(16+i) for i in range(16)]
 
 [r0,r1,r2,r3,r4,r5,r6,r7,r8,
  r9,r10,r11,r12,r13,r14,r15] = registers
@@ -30,6 +31,8 @@
 MANAGED_FP_REGS = fpregisters[:-1]
 VOLATILES_FLOAT = [f0,f1,f2,f3,f4,f5,f6,f7]
 
+MANAGED_VECTOR_REGS = vregisters
+
 # The JITFRAME_FIXED_SIZE is measured in words, and should be the
 # number of registers that need to be saved into the jitframe when
 # failing a guard, for example.
diff --git a/rpython/jit/backend/zarch/vector_ext.py b/rpython/jit/backend/zarch/vector_ext.py
--- a/rpython/jit/backend/zarch/vector_ext.py
+++ b/rpython/jit/backend/zarch/vector_ext.py
@@ -15,6 +15,7 @@
 import rpython.jit.backend.zarch.registers as r
 import rpython.jit.backend.zarch.conditions as c
 import rpython.jit.backend.zarch.locations as l
+from rpython.jit.backend.zarch.locations import imm
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.jit.codewriter import longlong
@@ -92,35 +93,16 @@
         pass
 
     def emit_vec_load_f(self, op, arglocs, regalloc):
-        resloc, baseloc, indexloc, size_loc, ofs, integer_loc = arglocs
-        indexloc = self._apply_offset(indexloc, ofs)
-        itemsize = size_loc.value
-        if integer_loc.value:
-            self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
-        elif itemsize == 4:
-            self.mc.lxvw4x(resloc.value, indexloc.value, baseloc.value)
-        elif itemsize == 8:
-            self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
-        else:
-            not_implemented("vec_load_f itemsize %d" % itemsize)
+        resloc, baseloc, indexloc, size_loc, offsetloc, integer_loc = arglocs
+        addrloc = self._load_address(baseloc, indexloc, offsetloc)
+        self.mc.VL(resloc, addrloc)
 
     emit_vec_load_i = emit_vec_load_f
 
     def emit_vec_store(self, op, arglocs, regalloc):
-        baseloc, indexloc, valueloc, sizeloc, baseofs, \
-            integer_loc = arglocs
-        indexloc = self._apply_offset(indexloc, baseofs)
-        assert baseofs.value == 0
-        if integer_loc.value:
-            self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
-        else:
-            itemsize = sizeloc.value
-            if itemsize == 4:
-                self.mc.stxvw4x(valueloc.value, indexloc.value, baseloc.value)
-            elif itemsize == 8:
-                self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
-            else:
-                not_implemented("vec_store itemsize %d" % itemsize)
+        baseloc, indexloc, valueloc, sizeloc, offsetloc, integer_loc = arglocs
+        addrloc = self._load_address(baseloc, indexloc, offsetloc)
+        self.mc.VST(valueloc, addrloc)
 
     def emit_vec_int_add(self, op, arglocs, regalloc):
         resloc, loc0, loc1, size_loc = arglocs
@@ -152,18 +134,18 @@
     def emit_vec_float_add(self, op, arglocs, regalloc):
         resloc, loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
-        if itemsize == 4:
-            self.mc.xvaddsp(resloc.value, loc0.value, loc1.value)
-        elif itemsize == 8:
-            self.mc.xvadddp(resloc.value, loc0.value, loc1.value)
+        if itemsize == 8:
+            self.mc.VFA(resloc, loc0, loc1, 3, 0)
+            return
+        not_implemented("vec_float_add of size %d" % itemsize)
 
     def emit_vec_float_sub(self, op, arglocs, regalloc):
         resloc, loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
-        if itemsize == 4:
-            self.mc.xvsubsp(resloc.value, loc0.value, loc1.value)
-        elif itemsize == 8:
-            self.mc.xvsubdp(resloc.value, loc0.value, loc1.value)
+        if itemsize == 8:
+            self.mc.VFS(resloc, loc0, loc1, 3, 0)
+            return
+        not_implemented("vec_float_sub of size %d" % itemsize)
 
     def emit_vec_float_mul(self, op, arglocs, regalloc):
         resloc, loc0, loc1, itemsize_loc = arglocs
diff --git a/rpython/translator/platform/arch/s390x.py b/rpython/translator/platform/arch/s390x.py
--- a/rpython/translator/platform/arch/s390x.py
+++ b/rpython/translator/platform/arch/s390x.py
@@ -38,6 +38,18 @@
     return ids
 
 
+def s390x_detect_vx():
+    with open("/proc/cpuinfo", "rb") as fd:
+        lines = fd.read().splitlines()
+        for line in lines:
+            if line.startswith("features"):
+                colonidx = line.find(':')
+                split = line[colonidx+1:].strip().split(' ')
+                if 'vx' in split:
+                    return True
+                break
+
+    return False
 
 def s390x_cpu_revision():
     # linux kernel does the same classification
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit