Author: Richard Plangger <[email protected]>
Branch: zarch-simd-support
Changeset: r87105:4802a50159bf
Date: 2016-09-14 12:06 +0200
http://bitbucket.org/pypy/pypy/changeset/4802a50159bf/
Log: load, store, add, subtract vector versions
diff --git a/rpython/jit/backend/zarch/detect_feature.py
b/rpython/jit/backend/zarch/detect_feature.py
--- a/rpython/jit/backend/zarch/detect_feature.py
+++ b/rpython/jit/backend/zarch/detect_feature.py
@@ -5,18 +5,16 @@
from rpython.rtyper.tool import rffi_platform
from rpython.rlib.rmmap import alloc, free
from rpython.rlib.rstruct.runpack import runpack
-from rpython.translator.platform.arch.s390x import s390x_cpu_revision
+from rpython.translator.platform.arch.s390x import s390x_detect_vx
SYSTEM = platform.system()
def detect_simd_z_linux():
- return False
+ return s390x_detect_vx()
def detect_simd_z():
if SYSTEM == 'Linux':
- machine = s390x_cpu_revision()
- if machine == "z13":
- return True
+ return detect_simd_z_linux()
return False
if __name__ == '__main__':
diff --git a/rpython/jit/backend/zarch/instruction_builder.py
b/rpython/jit/backend/zarch/instruction_builder.py
--- a/rpython/jit/backend/zarch/instruction_builder.py
+++ b/rpython/jit/backend/zarch/instruction_builder.py
@@ -5,7 +5,7 @@
def dummy_argument(arg):
""" NOT_RPYTHON """
- if arg in ('r', 'r/m', 'm', 'f', '-', 'eo'):
+ if arg in ('r', 'r/m', 'm', 'f', '-', 'eo', 'v'):
return 0
if arg.startswith('i') or arg.startswith('u'):
return 0
@@ -23,6 +23,7 @@
- - unused
f - floating point register
r - register
+ v - vector register (128 bit)
m - mask
eo - even odd pair (= the even register)
r/m - register or mask
@@ -447,7 +448,6 @@
return encode_rxe
def build_ris(mnemonic, (opcode1,opcode2), argtypes='r,i8,r/m,bd'):
- br = is_branch_relative(mnemonic)
@builder.arguments(argtypes)
def encode_rie_c(self, reg1, imm8, mask, basedisp):
self.writechar(opcode1)
@@ -459,6 +459,50 @@
self.writechar(opcode2)
return encode_rie_c
+def build_vrx(mnemonic, (opcode1,opcode2), argtypes='v,bid,m'):
+ @builder.arguments(argtypes)
+ def encode_vrx(self, v1, bid, mask=0):
+ self.writechar(opcode1)
+ rbx = (v1 >= 16) << 3
+ idx = bid.index
+ byte = (v1 & BIT_MASK_4) << 4 | (idx & BIT_MASK_4)
+ self.writechar(chr(byte))
+ encode_base_displace(self, bid)
+ self.writechar(chr((mask & BIT_MASK_4 << 4) | (rbx & BIT_MASK_4)))
+ self.writechar(opcode2)
+ return encode_vrx
+
+def build_vrr_a(mnemonic, (opcode1,opcode2), argtypes='v,v'):  # returns the six byte encoder for instructions taking two vector registers
+ @builder.arguments(argtypes)
+ def encode_vrr_a(self, v1, v2):
+ self.writechar(opcode1)
+ rbx = (v1 >= 16) << 3  # bit 3 records that v1 is register 16 or above
+ rbx |= (v2 >= 16) << 2  # bit 2 records the same for v2
+ byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)  # v1 in the high half, v2 in the low half (assumes BIT_MASK_4 keeps the low four bits - TODO confirm)
+ self.writechar(chr(byte))
+ self.writechar(chr(0))  # this byte and the next one are always written as zero by this encoder
+ self.writechar(chr(0))
+ self.writechar(chr(rbx & BIT_MASK_4))  # the bits collected in rbx go into the low half of the fifth byte
+ self.writechar(opcode2)
+ return encode_vrr_a
+
+def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m'):  # returns the six byte encoder for instructions taking three vector registers and two masks
+ @builder.arguments(argtypes)
+ def encode_vrr_c(self, v1, v2, v3, mask1, mask2):
+ self.writechar(opcode1)
+ rbx = (v1 >= 16) << 3  # bit 3 records that v1 is register 16 or above
+ rbx |= (v2 >= 16) << 2  # bit 2 records the same for v2
+ rbx |= (v3 >= 16) << 1  # bit 1 records the same for v3
+ byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)  # v1 in the high half, v2 in the low half
+ self.writechar(chr(byte))
+ byte = (v3 & BIT_MASK_4) << 4  # v3 in the high half, the low half stays zero
+ self.writechar(chr(byte))
+ self.writechar(chr(mask2 & BIT_MASK_4))  # mask2 is written before mask1, in the low half of its own byte
+ self.writechar(chr((mask1 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))  # mask1 shares its byte with the bits collected in rbx
+ self.writechar(opcode2)
+ return encode_vrr_c
+
+
def build_unpack_func(mnemonic, func):
@always_inline
def check_arg_type(arg, type):
@@ -502,7 +546,8 @@
if argtype == '-':
return 0
elif argtype == 'r' or argtype == 'r/m' or \
- argtype == 'f' or argtype == 'eo':
+ argtype == 'f' or argtype == 'eo' or \
+ argtype == 'v':
return arg.value
elif argtype.startswith('i') or argtype.startswith('u') or
argtype.startswith('h'):
return arg.value
diff --git a/rpython/jit/backend/zarch/instructions.py
b/rpython/jit/backend/zarch/instructions.py
--- a/rpython/jit/backend/zarch/instructions.py
+++ b/rpython/jit/backend/zarch/instructions.py
@@ -292,11 +292,26 @@
'STFLE': ('s', ['\xB2','\xB0']),
}
+
+vector_mnemonic_codes = {  # mnemonic -> (encoder kind, [opcode1, opcode2][, argtypes]); presumably the kind selects the build_<kind> function - TODO confirm
+ 'VL': ('vrx', ['\xE7','\x06'], 'v,bid'),  # emitted by emit_vec_load_f; argtypes given without the trailing mask
+ 'VLR': ('vrr_a', ['\xE7','\x56']),
+
+ 'VST': ('vrx', ['\xE7','\x0E'], 'v,bid'),  # emitted by emit_vec_store; argtypes given without the trailing mask
+
+ # floating point
+ 'VFA': ('vrr_c', ['\xE7','\xE3']),  # emitted by emit_vec_float_add
+ 'VFS': ('vrr_c', ['\xE7','\xE2']),  # emitted by emit_vec_float_sub
+
+ # '': ('', ['','']),
+}
+
all_mnemonic_codes.update(arith_mnemonic_codes)
all_mnemonic_codes.update(logic_mnemonic_codes)
all_mnemonic_codes.update(memory_mnemonic_codes)
all_mnemonic_codes.update(floatingpoint_mnemonic_codes)
all_mnemonic_codes.update(branch_mnemonic_codes)
+all_mnemonic_codes.update(vector_mnemonic_codes)
if __name__ == "__main__":
diff --git a/rpython/jit/backend/zarch/locations.py
b/rpython/jit/backend/zarch/locations.py
--- a/rpython/jit/backend/zarch/locations.py
+++ b/rpython/jit/backend/zarch/locations.py
@@ -107,6 +107,26 @@
def is_float(self):
return True
+class VectorRegisterLocation(RegisterLocation):  # location object for one 128 bit vector register
+ _immutable_ = True
+ type = FLOAT
+ width = DOUBLE_WORD*2
+
+ def __repr__(self):
+ return 'v%d' % self.value
+
+ def is_core_reg(self):
+ return False
+
+ def is_fp_reg(self):
+ return True
+
+ def as_key(self): # 48 <= as_key <= 63 for the registers created in registers.py (values 16..31)
+ return self.value + 32
+
+ def is_float(self):
+ return True
+
class ImmLocation(AssemblerLocation):
_immutable_ = True
width = WORD
@@ -176,6 +196,9 @@
if length:
self.length = length.value
+ def __repr__(self):  # debugging aid: prints base and index with an 'r' prefix, followed by the length
+ return 'addr(base=r%d,idx=r%d,len=%d)' % (self.base, self.index,
self.length)
+
class PoolLoc(AddressLocation):
_immutable_ = True
width = WORD
diff --git a/rpython/jit/backend/zarch/regalloc.py
b/rpython/jit/backend/zarch/regalloc.py
--- a/rpython/jit/backend/zarch/regalloc.py
+++ b/rpython/jit/backend/zarch/regalloc.py
@@ -109,6 +109,46 @@
self.temp_boxes.append(box)
return reg
+class VectorRegisterManager(RegisterManager):  # register manager handing out r.MANAGED_VECTOR_REGS
+ all_regs = r.MANAGED_VECTOR_REGS
+ box_types = [FLOAT, INT]
+ save_around_call_regs = [] # calling not allowed in vectorized traces!
+ assert set(save_around_call_regs).issubset(all_regs)
+ pool = None
+
+ def __init__(self, longevity, frame_manager=None, assembler=None):
+ RegisterManager.__init__(self, longevity, frame_manager, assembler)
+
+ def call_result_location(self, v):
+ return None  # no call result location is ever handed out by this manager
+
+ def convert_to_imm(self, c):
+ return l.pool(self.assembler.pool.get_offset(c), float=True)  # pool location at the offset the assembler's pool reports for c
+
+ def ensure_reg_or_pool(self, box):
+ if isinstance(box, Const):  # constants resolve to a pool location, not to a register
+ offset = self.assembler.pool.get_offset(box)
+ return l.pool(offset, float=True)
+ else:
+ assert box in self.temp_boxes  # the box must already be registered in temp_boxes
+ loc = self.make_sure_var_in_reg(box,
+ forbidden_vars=self.temp_boxes)
+ return loc
+
+ def ensure_reg(self, box):
+ assert box in self.temp_boxes  # same precondition as in ensure_reg_or_pool
+ loc = self.make_sure_var_in_reg(box,
+ forbidden_vars=self.temp_boxes)
+ return loc
+
+ def get_scratch_reg(self, selected_reg=None):
+ # TODO
+ box = TempFloat()  # fresh temporary box, remembered in temp_boxes below
+ reg = self.force_allocate_reg(box, forbidden_vars=self.temp_boxes,
selected_reg=selected_reg)
+ self.temp_boxes.append(box)
+ return reg
+
+
class ZARCHRegisterManager(RegisterManager):
all_regs = r.MANAGED_REGS
@@ -389,8 +429,9 @@
assert isinstance(loc, l.StackLocation)
return loc.position
+from rpython.jit.backend.zarch import vector_ext
-class Regalloc(BaseRegalloc):
+class Regalloc(BaseRegalloc, vector_ext.VectorRegalloc):
def __init__(self, assembler=None):
self.cpu = assembler.cpu
@@ -415,6 +456,9 @@
self.fprm = FPRegisterManager(self.longevity, frame_manager = self.fm,
assembler = self.assembler)
self.fprm.pool = self.assembler.pool
+ self.vrm = VectorRegisterManager(self.longevity, frame_manager =
self.fm,
+ assembler = self.assembler)
+ self.vrm.pool = self.assembler.pool
return operations
def prepare_loop(self, inputargs, operations, looptoken, allgcrefs):
@@ -470,13 +514,17 @@
self.fm.finish_binding()
self.rm._check_invariants()
self.fprm._check_invariants()
+ self.vrm._check_invariants()
def get_final_frame_depth(self):
return self.fm.get_frame_depth()
def possibly_free_var(self, var):
if var is not None:
- if var.type == FLOAT:
+ if var.is_vector():
+ if var.type != VOID:
+ self.vrm.possibly_free_var(var)
+ elif var.type == FLOAT:
self.fprm.possibly_free_var(var)
else:
self.rm.possibly_free_var(var)
@@ -533,6 +581,7 @@
self.assembler.mc.mark_op(op)
self.rm.position = i
self.fprm.position = i
+ self.vrm.position = i
opnum = op.getopnum()
if rop.has_no_side_effect(opnum) and op not in self.longevity:
i += 1
@@ -541,7 +590,10 @@
#
for j in range(op.numargs()):
box = op.getarg(j)
- if box.type != FLOAT:
+ if box.is_vector():
+ if box.type != VOID:
+ self.vrm.temp_boxes.append(box)
+ elif box.type != FLOAT:
self.rm.temp_boxes.append(box)
else:
self.fprm.temp_boxes.append(box)
@@ -555,6 +607,7 @@
self.possibly_free_var(op)
self.rm._check_invariants()
self.fprm._check_invariants()
+ self.vrm._check_invariants()
if self.assembler.mc.get_relative_pos() > self.limit_loop_break:
self.assembler.break_long_loop()
self.limit_loop_break = (self.assembler.mc.get_relative_pos() +
@@ -562,6 +615,7 @@
i += 1
assert not self.rm.reg_bindings
assert not self.fprm.reg_bindings
+ assert not self.vrm.reg_bindings
self.flush_loop()
self.assembler.mc.mark_op(None) # end of the loop
self.operations = None
@@ -677,6 +731,7 @@
# temporary boxes and all the current operation's arguments
self.rm.free_temp_vars()
self.fprm.free_temp_vars()
+ self.vrm.free_temp_vars()
def compute_hint_frame_locations(self, operations):
# optimization only: fill in the 'hint_frame_locations' dictionary
diff --git a/rpython/jit/backend/zarch/registers.py
b/rpython/jit/backend/zarch/registers.py
--- a/rpython/jit/backend/zarch/registers.py
+++ b/rpython/jit/backend/zarch/registers.py
@@ -1,8 +1,9 @@
-from rpython.jit.backend.zarch.locations import FloatRegisterLocation
-from rpython.jit.backend.zarch.locations import RegisterLocation
+from rpython.jit.backend.zarch.locations import (FloatRegisterLocation,
+ RegisterLocation, VectorRegisterLocation)
registers = [RegisterLocation(i) for i in range(16)]
fpregisters = [FloatRegisterLocation(i) for i in range(16)]
+vregisters = [VectorRegisterLocation(16+i) for i in range(16)]
[r0,r1,r2,r3,r4,r5,r6,r7,r8,
r9,r10,r11,r12,r13,r14,r15] = registers
@@ -30,6 +31,8 @@
MANAGED_FP_REGS = fpregisters[:-1]
VOLATILES_FLOAT = [f0,f1,f2,f3,f4,f5,f6,f7]
+MANAGED_VECTOR_REGS = vregisters
+
# The JITFRAME_FIXED_SIZE is measured in words, and should be the
# number of registers that need to be saved into the jitframe when
# failing a guard, for example.
diff --git a/rpython/jit/backend/zarch/vector_ext.py
b/rpython/jit/backend/zarch/vector_ext.py
--- a/rpython/jit/backend/zarch/vector_ext.py
+++ b/rpython/jit/backend/zarch/vector_ext.py
@@ -15,6 +15,7 @@
import rpython.jit.backend.zarch.registers as r
import rpython.jit.backend.zarch.conditions as c
import rpython.jit.backend.zarch.locations as l
+from rpython.jit.backend.zarch.locations import imm
from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.jit.codewriter import longlong
@@ -92,35 +93,16 @@
pass
def emit_vec_load_f(self, op, arglocs, regalloc):
- resloc, baseloc, indexloc, size_loc, ofs, integer_loc = arglocs
- indexloc = self._apply_offset(indexloc, ofs)
- itemsize = size_loc.value
- if integer_loc.value:
- self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
- elif itemsize == 4:
- self.mc.lxvw4x(resloc.value, indexloc.value, baseloc.value)
- elif itemsize == 8:
- self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
- else:
- not_implemented("vec_load_f itemsize %d" % itemsize)
+ resloc, baseloc, indexloc, size_loc, offsetloc, integer_loc = arglocs
+ addrloc = self._load_address(baseloc, indexloc, offsetloc)
+ self.mc.VL(resloc, addrloc)
emit_vec_load_i = emit_vec_load_f
def emit_vec_store(self, op, arglocs, regalloc):
- baseloc, indexloc, valueloc, sizeloc, baseofs, \
- integer_loc = arglocs
- indexloc = self._apply_offset(indexloc, baseofs)
- assert baseofs.value == 0
- if integer_loc.value:
- self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
- else:
- itemsize = sizeloc.value
- if itemsize == 4:
- self.mc.stxvw4x(valueloc.value, indexloc.value, baseloc.value)
- elif itemsize == 8:
- self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
- else:
- not_implemented("vec_store itemsize %d" % itemsize)
+ baseloc, indexloc, valueloc, sizeloc, offsetloc, integer_loc = arglocs
+ addrloc = self._load_address(baseloc, indexloc, offsetloc)
+ self.mc.VST(valueloc, addrloc)
def emit_vec_int_add(self, op, arglocs, regalloc):
resloc, loc0, loc1, size_loc = arglocs
@@ -152,18 +134,18 @@
def emit_vec_float_add(self, op, arglocs, regalloc):
resloc, loc0, loc1, itemsize_loc = arglocs
itemsize = itemsize_loc.value
- if itemsize == 4:
- self.mc.xvaddsp(resloc.value, loc0.value, loc1.value)
- elif itemsize == 8:
- self.mc.xvadddp(resloc.value, loc0.value, loc1.value)
+ if itemsize == 8:
+ self.mc.VFA(resloc, loc0, loc1, 3, 0)
+ return
+ not_implemented("vec_float_add of size %d" % itemsize)
def emit_vec_float_sub(self, op, arglocs, regalloc):
resloc, loc0, loc1, itemsize_loc = arglocs
itemsize = itemsize_loc.value
- if itemsize == 4:
- self.mc.xvsubsp(resloc.value, loc0.value, loc1.value)
- elif itemsize == 8:
- self.mc.xvsubdp(resloc.value, loc0.value, loc1.value)
+ if itemsize == 8:
+ self.mc.VFS(resloc, loc0, loc1, 3, 0)
+ return
+ not_implemented("vec_float_add of size %d" % itemsize)
def emit_vec_float_mul(self, op, arglocs, regalloc):
resloc, loc0, loc1, itemsize_loc = arglocs
diff --git a/rpython/translator/platform/arch/s390x.py
b/rpython/translator/platform/arch/s390x.py
--- a/rpython/translator/platform/arch/s390x.py
+++ b/rpython/translator/platform/arch/s390x.py
@@ -38,6 +38,18 @@
return ids
+def s390x_detect_vx():  # returns True when the "features" line of /proc/cpuinfo lists 'vx', False otherwise
+ with open("/proc/cpuinfo", "rb") as fd:
+ lines = fd.read().splitlines()
+ for line in lines:
+ if line.startswith("features"):
+ colonidx = line.find(':')
+ split = line[colonidx+1:].strip().split(' ')  # everything after the colon, split on single spaces
+ if 'vx' in split:
+ return True
+ break  # stop scanning the remaining lines
+
+ return False
def s390x_cpu_revision():
# linux kernel does the same classification
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit