[pypy-commit] pypy zarch-simd-support: load, store, add, subtract vector versions

plan_rich Wed, 14 Sep 2016 04:01:32 -0700

Author: Richard Plangger <[email protected]>
Branch: zarch-simd-support
Changeset: r87105:4802a50159bf
Date: 2016-09-14 12:06 +0200
http://bitbucket.org/pypy/pypy/changeset/4802a50159bf/


Log:    load, store, add, subtract vector versions

diff --git a/rpython/jit/backend/zarch/detect_feature.py b/rpython/jit/backend/zarch/detect_feature.py
--- a/rpython/jit/backend/zarch/detect_feature.py
+++ b/rpython/jit/backend/zarch/detect_feature.py
@@ -5,18 +5,16 @@
 from rpython.rtyper.tool import rffi_platform
 from rpython.rlib.rmmap import alloc, free
 from rpython.rlib.rstruct.runpack import runpack
-from rpython.translator.platform.arch.s390x import s390x_cpu_revision
+from rpython.translator.platform.arch.s390x import s390x_detect_vx
 
 SYSTEM = platform.system()
 
 def detect_simd_z_linux():
-    return False
+    return s390x_detect_vx()  # True when /proc/cpuinfo lists the 'vx' feature
 
 def detect_simd_z():
     if SYSTEM == 'Linux':
-        machine = s390x_cpu_revision()
-        if machine == "z13":
-            return True
+        return detect_simd_z_linux()  # feature probe replaces the "z13" machine-name match
     return False
 
 if __name__ == '__main__':
diff --git a/rpython/jit/backend/zarch/instruction_builder.py b/rpython/jit/backend/zarch/instruction_builder.py
--- a/rpython/jit/backend/zarch/instruction_builder.py
+++ b/rpython/jit/backend/zarch/instruction_builder.py
@@ -5,7 +5,7 @@
 
 def dummy_argument(arg):
     """ NOT_RPYTHON """
-    if arg in ('r', 'r/m', 'm', 'f', '-', 'eo'):
+    if arg in ('r', 'r/m', 'm', 'f', '-', 'eo', 'v'):
         return 0
     if arg.startswith('i') or arg.startswith('u'):
         return 0
@@ -23,6 +23,7 @@
         -      - unused
         f      - floating point register
         r      - register
+        v      - vector register (128 bit)
         m      - mask
         eo     - even odd pair (= the even register)
         r/m    - register or mask
@@ -447,7 +448,6 @@
     return encode_rxe
 
 def build_ris(mnemonic, (opcode1,opcode2), argtypes='r,i8,r/m,bd'):
-    br = is_branch_relative(mnemonic)
     @builder.arguments(argtypes)
     def encode_rie_c(self, reg1, imm8, mask, basedisp):
         self.writechar(opcode1)
@@ -459,6 +459,50 @@
         self.writechar(opcode2)
     return encode_rie_c
 
+def build_vrx(mnemonic, (opcode1,opcode2), argtypes='v,bid,m'):  # encoder factory: vector reg, base/index/displacement, mask
+    @builder.arguments(argtypes)
+    def encode_vrx(self, v1, bid, mask=0):  # mask stays 0 when argtypes omits the 'm' operand
+        self.writechar(opcode1)
+        rbx = (v1 >= 16) << 3  # bit 3 is set when v1 is register 16 or higher
+        idx = bid.index
+        byte = (v1 & BIT_MASK_4) << 4 | (idx & BIT_MASK_4)  # low 4 bits of v1, then the index register
+        self.writechar(chr(byte))
+        encode_base_displace(self, bid)
+        self.writechar(chr((mask & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))  # mask nibble first, THEN shift ('<<' binds tighter than '&')
+        self.writechar(opcode2)
+    return encode_vrx
+
+def build_vrr_a(mnemonic, (opcode1,opcode2), argtypes='v,v'):  # encoder factory for two vector register operands
+    @builder.arguments(argtypes)
+    def encode_vrr_a(self, v1, v2):
+        self.writechar(opcode1)
+        rbx =  (v1 >= 16) << 3  # bit 3: v1 is register 16 or higher
+        rbx |= (v2 >= 16) << 2  # bit 2: v2 is register 16 or higher
+        byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)  # low 4 bits of both register numbers
+        self.writechar(chr(byte))
+        self.writechar(chr(0))  # two zero bytes follow the register byte
+        self.writechar(chr(0))
+        self.writechar(chr(rbx & BIT_MASK_4))  # high nibble left 0, rbx in the low nibble
+        self.writechar(opcode2)
+    return encode_vrr_a
+
+def build_vrr_c(mnemonic, (opcode1,opcode2), argtypes='v,v,v,m,m'):  # encoder factory: three vector regs, two masks
+    @builder.arguments(argtypes)
+    def encode_vrr_c(self, v1, v2, v3, mask1, mask2):
+        self.writechar(opcode1)
+        rbx =  (v1 >= 16) << 3  # bit 3: v1 is register 16 or higher
+        rbx |= (v2 >= 16) << 2  # bit 2: same flag for v2
+        rbx |= (v3 >= 16) << 1  # bit 1: same flag for v3
+        byte = (v1 & BIT_MASK_4) << 4 | (v2 & BIT_MASK_4)  # low 4 bits of v1 and v2
+        self.writechar(chr(byte))
+        byte = (v3 & BIT_MASK_4) << 4  # low 4 bits of v3 in the high nibble, low nibble 0
+        self.writechar(chr(byte))
+        self.writechar(chr(mask2 & BIT_MASK_4))  # mask2 is written before mask1
+        self.writechar(chr((mask1 & BIT_MASK_4) << 4 | (rbx & BIT_MASK_4)))  # mask1 high nibble, rbx low nibble
+        self.writechar(opcode2)
+    return encode_vrr_c
+
+
 def build_unpack_func(mnemonic, func):
     @always_inline
     def check_arg_type(arg, type):
@@ -502,7 +546,8 @@
         if argtype == '-':
             return 0
         elif argtype == 'r' or argtype == 'r/m' or \
-             argtype == 'f' or argtype == 'eo':
+             argtype == 'f' or argtype == 'eo' or \
+             argtype == 'v':
             return arg.value
         elif argtype.startswith('i') or argtype.startswith('u') or argtype.startswith('h'):
             return arg.value
diff --git a/rpython/jit/backend/zarch/instructions.py b/rpython/jit/backend/zarch/instructions.py
--- a/rpython/jit/backend/zarch/instructions.py
+++ b/rpython/jit/backend/zarch/instructions.py
@@ -292,11 +292,26 @@
 
     'STFLE':   ('s',     ['\xB2','\xB0']),
 }
+
+vector_mnemonic_codes = {  # mnemonic -> (encoding format, [opcode1, opcode2][, argtypes])
+    'VL':     ('vrx', ['\xE7','\x06'], 'v,bid'),  # used by emit_vec_load_f; 'v,bid' leaves out the mask operand
+    'VLR':    ('vrr_a', ['\xE7','\x56']),  # presumably a register-to-register vector copy -- TODO confirm
+
+    'VST':    ('vrx', ['\xE7','\x0E'], 'v,bid'),  # used by emit_vec_store
+
+    # floating point
+    'VFA':   ('vrr_c', ['\xE7','\xE3']),  # used by emit_vec_float_add
+    'VFS':   ('vrr_c', ['\xE7','\xE2']),  # used by emit_vec_float_sub
+
+    # template for further entries: 'MNEMONIC': ('format', ['opcode1','opcode2']),
+}
+
 all_mnemonic_codes.update(arith_mnemonic_codes)
 all_mnemonic_codes.update(logic_mnemonic_codes)
 all_mnemonic_codes.update(memory_mnemonic_codes)
 all_mnemonic_codes.update(floatingpoint_mnemonic_codes)
 all_mnemonic_codes.update(branch_mnemonic_codes)
+all_mnemonic_codes.update(vector_mnemonic_codes)
 
 
 if __name__ == "__main__":
diff --git a/rpython/jit/backend/zarch/locations.py b/rpython/jit/backend/zarch/locations.py
--- a/rpython/jit/backend/zarch/locations.py
+++ b/rpython/jit/backend/zarch/locations.py
@@ -107,6 +107,26 @@
     def is_float(self):
         return True
 
+class VectorRegisterLocation(RegisterLocation):  # location object for one vector register
+    _immutable_ = True
+    type = FLOAT
+    width = DOUBLE_WORD*2  # twice the width of a double word
+
+    def __repr__(self):
+        return 'v%d' % self.value
+
+    def is_core_reg(self):
+        return False
+
+    def is_fp_reg(self):
+        return True
+
+    def as_key(self):            # value + 32: 48 <= as_key <= 63 for the registers v16..v31 created in registers.py
+        return self.value + 32
+
+    def is_float(self):
+        return True
+
 class ImmLocation(AssemblerLocation):
     _immutable_ = True
     width = WORD
@@ -176,6 +196,9 @@
         if length:
             self.length = length.value
 
+    def __repr__(self):  # debugging aid: prints base, index and length of the address
+        return 'addr(base=r%d,idx=r%d,len=%d)' % (self.base, self.index, self.length)
+
 class PoolLoc(AddressLocation):
     _immutable_ = True
     width = WORD
diff --git a/rpython/jit/backend/zarch/regalloc.py b/rpython/jit/backend/zarch/regalloc.py
--- a/rpython/jit/backend/zarch/regalloc.py
+++ b/rpython/jit/backend/zarch/regalloc.py
@@ -109,6 +109,46 @@
         self.temp_boxes.append(box)
         return reg
 
+class VectorRegisterManager(RegisterManager):  # register manager for the vector registers
+    all_regs              = r.MANAGED_VECTOR_REGS
+    box_types             = [FLOAT, INT]
+    save_around_call_regs = [] # calling not allowed in vectorized traces!
+    assert set(save_around_call_regs).issubset(all_regs)
+    pool = None  # Regalloc assigns assembler.pool right after creating the manager
+
+    def __init__(self, longevity, frame_manager=None, assembler=None):
+        RegisterManager.__init__(self, longevity, frame_manager, assembler)
+
+    def call_result_location(self, v):
+        return None  # no call result location is provided for vector registers
+
+    def convert_to_imm(self, c):
+        return l.pool(self.assembler.pool.get_offset(c), float=True)  # constants are addressed through the pool
+
+    def ensure_reg_or_pool(self, box):  # pool location for a Const, otherwise a register
+        if isinstance(box, Const):
+            offset = self.assembler.pool.get_offset(box)
+            return l.pool(offset, float=True)
+        else:
+            assert box in self.temp_boxes
+            loc = self.make_sure_var_in_reg(box,
+                    forbidden_vars=self.temp_boxes)
+        return loc
+
+    def ensure_reg(self, box):  # box must already be registered in temp_boxes
+        assert box in self.temp_boxes
+        loc = self.make_sure_var_in_reg(box,
+                forbidden_vars=self.temp_boxes)
+        return loc
+
+    def get_scratch_reg(self, selected_reg=None):
+        # TODO (left open by the author); the scratch box is tracked in temp_boxes
+        box = TempFloat()
+        reg = self.force_allocate_reg(box, forbidden_vars=self.temp_boxes, selected_reg=selected_reg)
+        self.temp_boxes.append(box)
+        return reg
+
+
 
 class ZARCHRegisterManager(RegisterManager):
     all_regs              = r.MANAGED_REGS
@@ -389,8 +429,9 @@
         assert isinstance(loc, l.StackLocation)
         return loc.position
 
+from rpython.jit.backend.zarch import vector_ext
 
-class Regalloc(BaseRegalloc):
+class Regalloc(BaseRegalloc, vector_ext.VectorRegalloc):
 
     def __init__(self, assembler=None):
         self.cpu = assembler.cpu
@@ -415,6 +456,9 @@
         self.fprm = FPRegisterManager(self.longevity, frame_manager = self.fm,
                                       assembler = self.assembler)
         self.fprm.pool = self.assembler.pool
+        self.vrm = VectorRegisterManager(self.longevity, frame_manager = self.fm,
+                                         assembler = self.assembler)
+        self.vrm.pool = self.assembler.pool
         return operations
 
     def prepare_loop(self, inputargs, operations, looptoken, allgcrefs):
@@ -470,13 +514,17 @@
         self.fm.finish_binding()
         self.rm._check_invariants()
         self.fprm._check_invariants()
+        self.vrm._check_invariants()
 
     def get_final_frame_depth(self):
         return self.fm.get_frame_depth()
 
     def possibly_free_var(self, var):
         if var is not None:
-            if var.type == FLOAT:
+            if var.is_vector():
+                if var.type != VOID:
+                    self.vrm.possibly_free_var(var)
+            elif var.type == FLOAT:
                 self.fprm.possibly_free_var(var)
             else:
                 self.rm.possibly_free_var(var)
@@ -533,6 +581,7 @@
             self.assembler.mc.mark_op(op)
             self.rm.position = i
             self.fprm.position = i
+            self.vrm.position = i
             opnum = op.getopnum()
             if rop.has_no_side_effect(opnum) and op not in self.longevity:
                 i += 1
@@ -541,7 +590,10 @@
             #
             for j in range(op.numargs()):
                 box = op.getarg(j)
-                if box.type != FLOAT:
+                if box.is_vector():
+                    if box.type != VOID:
+                        self.vrm.temp_boxes.append(box)
+                elif box.type != FLOAT:
                     self.rm.temp_boxes.append(box)
                 else:
                     self.fprm.temp_boxes.append(box)
@@ -555,6 +607,7 @@
             self.possibly_free_var(op)
             self.rm._check_invariants()
             self.fprm._check_invariants()
+            self.vrm._check_invariants()
             if self.assembler.mc.get_relative_pos() > self.limit_loop_break:
                 self.assembler.break_long_loop()
                 self.limit_loop_break = (self.assembler.mc.get_relative_pos() +
@@ -562,6 +615,7 @@
             i += 1
         assert not self.rm.reg_bindings
         assert not self.fprm.reg_bindings
+        assert not self.vrm.reg_bindings
         self.flush_loop()
         self.assembler.mc.mark_op(None) # end of the loop
         self.operations = None
@@ -677,6 +731,7 @@
         # temporary boxes and all the current operation's arguments
         self.rm.free_temp_vars()
         self.fprm.free_temp_vars()
+        self.vrm.free_temp_vars()
 
     def compute_hint_frame_locations(self, operations):
         # optimization only: fill in the 'hint_frame_locations' dictionary
diff --git a/rpython/jit/backend/zarch/registers.py b/rpython/jit/backend/zarch/registers.py
--- a/rpython/jit/backend/zarch/registers.py
+++ b/rpython/jit/backend/zarch/registers.py
@@ -1,8 +1,9 @@
-from rpython.jit.backend.zarch.locations import FloatRegisterLocation
-from rpython.jit.backend.zarch.locations import RegisterLocation
+from rpython.jit.backend.zarch.locations import (FloatRegisterLocation,
+        RegisterLocation, VectorRegisterLocation)
 
 registers = [RegisterLocation(i) for i in range(16)]
 fpregisters = [FloatRegisterLocation(i) for i in range(16)]
+vregisters = [VectorRegisterLocation(16+i) for i in range(16)]
 
 [r0,r1,r2,r3,r4,r5,r6,r7,r8,
  r9,r10,r11,r12,r13,r14,r15] = registers
@@ -30,6 +31,8 @@
 MANAGED_FP_REGS = fpregisters[:-1]
 VOLATILES_FLOAT = [f0,f1,f2,f3,f4,f5,f6,f7]
 
+MANAGED_VECTOR_REGS = vregisters
+
 # The JITFRAME_FIXED_SIZE is measured in words, and should be the
 # number of registers that need to be saved into the jitframe when
 # failing a guard, for example.
diff --git a/rpython/jit/backend/zarch/vector_ext.py b/rpython/jit/backend/zarch/vector_ext.py
--- a/rpython/jit/backend/zarch/vector_ext.py
+++ b/rpython/jit/backend/zarch/vector_ext.py
@@ -15,6 +15,7 @@
 import rpython.jit.backend.zarch.registers as r
 import rpython.jit.backend.zarch.conditions as c
 import rpython.jit.backend.zarch.locations as l
+from rpython.jit.backend.zarch.locations import imm
 from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
 from rpython.rtyper.lltypesystem import lltype, rffi
 from rpython.jit.codewriter import longlong
@@ -92,35 +93,16 @@
         pass
 
     def emit_vec_load_f(self, op, arglocs, regalloc):
-        resloc, baseloc, indexloc, size_loc, ofs, integer_loc = arglocs
-        indexloc = self._apply_offset(indexloc, ofs)
-        itemsize = size_loc.value
-        if integer_loc.value:
-            self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
-        elif itemsize == 4:
-            self.mc.lxvw4x(resloc.value, indexloc.value, baseloc.value)
-        elif itemsize == 8:
-            self.mc.lxvd2x(resloc.value, indexloc.value, baseloc.value)
-        else:
-            not_implemented("vec_load_f itemsize %d" % itemsize)
+        resloc, baseloc, indexloc, size_loc, offsetloc, integer_loc = arglocs  # size_loc and integer_loc are unpacked but not used
+        addrloc = self._load_address(baseloc, indexloc, offsetloc)  # helper not visible in this patch
+        self.mc.VL(resloc, addrloc)  # a single VL regardless of item size
 
     emit_vec_load_i = emit_vec_load_f
 
     def emit_vec_store(self, op, arglocs, regalloc):
-        baseloc, indexloc, valueloc, sizeloc, baseofs, \
-            integer_loc = arglocs
-        indexloc = self._apply_offset(indexloc, baseofs)
-        assert baseofs.value == 0
-        if integer_loc.value:
-            self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
-        else:
-            itemsize = sizeloc.value
-            if itemsize == 4:
-                self.mc.stxvw4x(valueloc.value, indexloc.value, baseloc.value)
-            elif itemsize == 8:
-                self.mc.stxvd2x(valueloc.value, indexloc.value, baseloc.value)
-            else:
-                not_implemented("vec_store itemsize %d" % itemsize)
+        baseloc, indexloc, valueloc, sizeloc, offsetloc, integer_loc = arglocs  # sizeloc and integer_loc are unpacked but not used
+        addrloc = self._load_address(baseloc, indexloc, offsetloc)  # helper not visible in this patch
+        self.mc.VST(valueloc, addrloc)  # a single VST regardless of item size
 
     def emit_vec_int_add(self, op, arglocs, regalloc):
         resloc, loc0, loc1, size_loc = arglocs
@@ -152,18 +134,18 @@
     def emit_vec_float_add(self, op, arglocs, regalloc):
         resloc, loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
-        if itemsize == 4:
-            self.mc.xvaddsp(resloc.value, loc0.value, loc1.value)
-        elif itemsize == 8:
-            self.mc.xvadddp(resloc.value, loc0.value, loc1.value)
+        if itemsize == 8:  # only 8-byte items are handled here
+            self.mc.VFA(resloc, loc0, loc1, 3, 0)  # 3 and 0 are the two mask operands; NOTE(review): bare ints, confirm the unpacker accepts them
+            return
+        not_implemented("vec_float_add of size %d" % itemsize)  # any other item size is rejected
 
     def emit_vec_float_sub(self, op, arglocs, regalloc):
         resloc, loc0, loc1, itemsize_loc = arglocs
         itemsize = itemsize_loc.value
-        if itemsize == 4:
-            self.mc.xvsubsp(resloc.value, loc0.value, loc1.value)
-        elif itemsize == 8:
-            self.mc.xvsubdp(resloc.value, loc0.value, loc1.value)
+        if itemsize == 8:  # only 8-byte items are handled here
+            self.mc.VFS(resloc, loc0, loc1, 3, 0)  # 3 and 0 are the two mask operands of the vrr_c encoding
+            return
+        not_implemented("vec_float_sub of size %d" % itemsize)  # message names this operation, not vec_float_add
 
     def emit_vec_float_mul(self, op, arglocs, regalloc):
         resloc, loc0, loc1, itemsize_loc = arglocs
diff --git a/rpython/translator/platform/arch/s390x.py b/rpython/translator/platform/arch/s390x.py
--- a/rpython/translator/platform/arch/s390x.py
+++ b/rpython/translator/platform/arch/s390x.py
@@ -38,6 +38,18 @@
 
     return ids
 
+def s390x_detect_vx():
+    """ Tell whether the first 'features' line of /proc/cpuinfo lists 'vx'. """
+    with open("/proc/cpuinfo", "rb") as fd:
+        content = fd.read()
+    for entry in content.splitlines():
+        if not entry.startswith("features"):
+            continue
+        # only the first matching line is inspected
+        flags = entry[entry.find(':')+1:].strip().split(' ')
+        return 'vx' in flags
+    return False
+
 
 def s390x_cpu_revision():
     # linux kernel does the same classification
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy zarch-simd-support: load, store, add, subtract vector versions

Reply via email to