Author: Richard Plangger <[email protected]>
Branch: vecopt
Changeset: r77902:eb3cc9cf75f4
Date: 2015-06-05 12:59 +0200
http://bitbucket.org/pypy/pypy/changeset/eb3cc9cf75f4/
Log: int expansion for int16 and int8 added, int32/16 test added first
already passes
diff --git a/pypy/module/micronumpy/test/test_zjit.py
b/pypy/module/micronumpy/test/test_zjit.py
--- a/pypy/module/micronumpy/test/test_zjit.py
+++ b/pypy/module/micronumpy/test/test_zjit.py
@@ -226,6 +226,32 @@
assert int(result) == 7+16+8+16
self.check_vectorized(2, 2)
+ def define_int16_expand():
+ return """
+ a = astype(|30|, int16)
+ c = astype(|1|, int16)
+ c[0] = 16i
+ b = a + c
+ sum(b -> 7:15)
+ """
+ def test_int16_expand(self):
+ result = self.run("int16_expand")
+ assert int(result) == 8*16 + sum(range(7,15)) # b[7:15]: 8 int16 items (one 16 byte vector), each a[i] + 16
+ self.check_vectorized(2, 2)
+
+ def define_int8_expand():
+ return """
+ a = astype(|30|, int8)
+ c = astype(|1|, int8)
+ c[0] = 8i
+ b = a + c
+ sum(b -> 0:17)
+ """
+ def test_int8_expand(self): # was a second test_int16_expand that shadowed the real int16 test
+ result = self.run("int8_expand")
+ assert int(result) == 17*8 + sum(range(0,17)) # b[0:17]: 17 int8 items, each a[i] + 8
+ self.check_vectorized(2, 2)
+
def define_int32_add_const():
return """
a = astype(|30|, int32)
diff --git a/rpython/jit/backend/x86/assembler.py
b/rpython/jit/backend/x86/assembler.py
--- a/rpython/jit/backend/x86/assembler.py
+++ b/rpython/jit/backend/x86/assembler.py
@@ -54,6 +54,7 @@
self.float_const_abs_addr = 0
self.single_float_const_neg_addr = 0
self.single_float_const_abs_addr = 0
+ self.expand_byte_mask_addr = 0
self.malloc_slowpath = 0
self.malloc_slowpath_varsize = 0
self.wb_slowpath = [0, 0, 0, 0, 0]
@@ -102,9 +103,11 @@
single_abs_const =
'\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F'
# 0x80000000800000008000000080000000
single_neg_const =
'\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80'
+ zero_const = '\x00' * 16
#
data = neg_const + abs_const + \
- single_neg_const + single_abs_const
+ single_neg_const + single_abs_const + \
+ zero_const
datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
float_constants = datablockwrapper.malloc_aligned(len(data),
alignment=16)
datablockwrapper.done()
@@ -115,6 +118,7 @@
self.float_const_abs_addr = float_constants + 16
self.single_float_const_neg_addr = float_constants + 32
self.single_float_const_abs_addr = float_constants + 48
+ self.expand_byte_mask_addr = float_constants + 64
def set_extra_stack_depth(self, mc, value):
if self._is_asmgcc():
@@ -2641,7 +2645,18 @@
assert isinstance(srcloc, RegLoc)
assert not srcloc.is_xmm
size = sizeloc.value
- if size == 8:
+ if size == 1:
+ self.mc.PINSRB_xri(resloc.value, srcloc.value, 0)
+ self.mc.PSHUFB(resloc, heap(self.expand_byte_mask_addr))
+ elif size == 2:
+ self.mc.PINSRW_xri(resloc.value, srcloc.value, 0)
+ self.mc.PINSRW_xri(resloc.value, srcloc.value, 4)
+ self.mc.PSHUFLW_xxi(resloc.value, resloc.value, 0)
+ self.mc.PSHUFHW_xxi(resloc.value, resloc.value, 0)
+ elif size == 4:
+ self.mc.PINSRD_xri(resloc.value, srcloc.value, 0)
+ self.mc.PSHUFD_xxi(resloc.value, resloc.value, 0)
+ elif size == 8:
self.mc.PINSRQ_xri(resloc.value, srcloc.value, 0)
self.mc.PINSRQ_xri(resloc.value, srcloc.value, 1)
else:
diff --git a/rpython/jit/backend/x86/regloc.py
b/rpython/jit/backend/x86/regloc.py
--- a/rpython/jit/backend/x86/regloc.py
+++ b/rpython/jit/backend/x86/regloc.py
@@ -715,6 +715,8 @@
PUNPCKLDQ = _binaryop('PUNPCKLDQ')
PUNPCKHDQ = _binaryop('PUNPCKHDQ')
+ PSHUFB = _binaryop('PSHUFB')
+
CALL = _relative_unaryop('CALL')
JMP = _relative_unaryop('JMP')
diff --git a/rpython/jit/backend/x86/rx86.py b/rpython/jit/backend/x86/rx86.py
--- a/rpython/jit/backend/x86/rx86.py
+++ b/rpython/jit/backend/x86/rx86.py
@@ -743,6 +743,11 @@
SHUFPD_xxi = xmminsn('\x66', rex_nw, '\x0F\xC6', register(1,8),
register(2), '\xC0', immediate(3, 'b'))
PSHUFD_xxi = xmminsn('\x66', rex_nw, '\x0F\x70', register(1,8),
register(2), '\xC0', immediate(3, 'b'))
+ PSHUFHW_xxi = xmminsn('\xF3', rex_nw, '\x0F\x70', register(1,8),
register(2), '\xC0', immediate(3, 'b'))
+ PSHUFLW_xxi = xmminsn('\xF2', rex_nw, '\x0F\x70', register(1,8),
register(2), '\xC0', immediate(3, 'b'))
+ PSHUFB_xx = xmminsn('\x66', rex_nw, '\x0F\x38\x00', register(1,8),
register(2), '\xC0')
+ PSHUFB_xm = xmminsn('\x66', rex_nw, '\x0F\x38\x00', register(1,8),
mem_reg_plus_const(2))
+
# following require SSE4_1
PEXTRQ_rxi = xmminsn('\x66', rex_w, '\x0F\x3A\x16', register(2,8),
register(1), '\xC0', immediate(3, 'b'))
diff --git a/rpython/jit/metainterp/history.py
b/rpython/jit/metainterp/history.py
--- a/rpython/jit/metainterp/history.py
+++ b/rpython/jit/metainterp/history.py
@@ -395,7 +395,7 @@
t = 'b'
self._str = '%s%d' % (t, Box._counter)
if self.type == VECTOR:
- self._str = '%s%d[%s%d#%d]' % (t, Box._counter, self.item_type,
+ self._str = '%s%d[%s%d|%d]' % (t, Box._counter, self.item_type,
self.item_size * 8,
self.item_count)
Box._counter += 1
return self._str
diff --git a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
--- a/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
+++ b/rpython/jit/metainterp/optimizeopt/test/test_schedule.py
@@ -22,7 +22,7 @@
'long': self.intarraydescr,
'int': self.int32arraydescr,
}
- loop = opparse("
[p0,p1,p2,p3,p4,p5,i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,f0,f1,f2,f3,f4,f5]\n" + source
+ \
+ loop = opparse("
[p0,p1,p2,p3,p4,p5,i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,f0,f1,f2,f3,f4,f5,v103204[i32|4]]\n"
+ source + \
"\n
jump(p0,p1,p2,p3,p4,p5,i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,f0,f1,f2,f3,f4,f5)",
cpu=self.cpu,
namespace=ns)
@@ -39,13 +39,15 @@
def pack(self, loop, l, r):
return [Node(op,1+l+i) for i,op in enumerate(loop.operations[1+l:1+r])]
- def schedule(self, loop_orig, packs, vec_reg_size=16,
prepend_invariant=False):
+ def schedule(self, loop_orig, packs, vec_reg_size=16,
prepend_invariant=False, getvboxfunc=None):
loop = get_model(False).ExtendedTreeLoop("loop")
loop.original_jitcell_token = loop_orig.original_jitcell_token
loop.inputargs = loop_orig.inputargs
ops = []
vsd = VecScheduleData(vec_reg_size)
+ if getvboxfunc is not None:
+ vsd.getvector_of_box = getvboxfunc
for pack in packs:
if len(pack) == 1:
ops.append(pack[0].getoperation())
@@ -73,7 +75,7 @@
pack1 = self.pack(loop1, 0, 6)
loop2 = self.schedule(loop1, [pack1])
loop3 = self.parse("""
- v1[i32#4] = vec_raw_load(p0, i0, 4, descr=float)
+ v10[i32|4] = vec_raw_load(p0, i0, 4, descr=float)
i14 = raw_load(p0, i4, descr=float)
i15 = raw_load(p0, i5, descr=float)
""", False)
@@ -90,9 +92,9 @@
pack2 = self.pack(loop1, 2, 4)
loop2 = self.schedule(loop1, [pack1, pack2])
loop3 = self.parse("""
- v1[i64#2] = vec_raw_load(p0, i0, 2, descr=long)
- v2[i32#2] = vec_int_signext(v1[i64#2], 4)
- v3[f64#2] = vec_cast_int_to_float(v2[i32#2])
+ v10[i64|2] = vec_raw_load(p0, i0, 2, descr=long)
+ v20[i32|2] = vec_int_signext(v10[i64|2], 4)
+ v30[f64|2] = vec_cast_int_to_float(v20[i32|2])
""", False)
self.assert_equal(loop2, loop3)
@@ -104,12 +106,12 @@
pack1 = self.pack(loop1, 0, 2)
loop2 = self.schedule(loop1, [pack1], prepend_invariant=True)
loop3 = self.parse("""
- v1[i64#2] = vec_box(2)
- v2[i64#2] = vec_int_pack(v1[i64#2], i0, 0, 1)
- v3[i64#2] = vec_int_pack(v2[i64#2], i1, 1, 1)
- v4[i64#2] = vec_int_expand(73)
+ v10[i64|2] = vec_box(2)
+ v20[i64|2] = vec_int_pack(v10[i64|2], i0, 0, 1)
+ v30[i64|2] = vec_int_pack(v20[i64|2], i1, 1, 1)
+ v40[i64|2] = vec_int_expand(73)
#
- v5[i64#2] = vec_int_add(v3[i64#2], v4[i64#2])
+ v50[i64|2] = vec_int_add(v30[i64|2], v40[i64|2])
""", False)
self.assert_equal(loop2, loop3)
@@ -120,12 +122,12 @@
pack1 = self.pack(loop1, 0, 2)
loop2 = self.schedule(loop1, [pack1], prepend_invariant=True)
loop3 = self.parse("""
- v1[f64#2] = vec_box(2)
- v2[f64#2] = vec_float_pack(v1[f64#2], f0, 0, 1)
- v3[f64#2] = vec_float_pack(v2[f64#2], f1, 1, 1)
- v4[f64#2] = vec_float_expand(73.0)
+ v10[f64|2] = vec_box(2)
+ v20[f64|2] = vec_float_pack(v10[f64|2], f0, 0, 1)
+ v30[f64|2] = vec_float_pack(v20[f64|2], f1, 1, 1)
+ v40[f64|2] = vec_float_expand(73.0)
#
- v5[f64#2] = vec_float_add(v3[f64#2], v4[f64#2])
+ v50[f64|2] = vec_float_add(v30[f64|2], v40[f64|2])
""", False)
self.assert_equal(loop2, loop3)
@@ -140,12 +142,35 @@
pack2 = self.pack(loop1, 2, 4)
loop2 = self.schedule(loop1, [pack1, pack2], prepend_invariant=True)
loop3 = self.parse("""
- v1[f64#2] = vec_box(2)
- v2[f64#2] = vec_float_pack(v1[f64#2], f0, 0, 1)
- v3[f64#2] = vec_float_pack(v2[f64#2], f1, 1, 1)
- v4[f64#2] = vec_float_expand(f5) # only expaned once
+ v10[f64|2] = vec_box(2)
+ v20[f64|2] = vec_float_pack(v10[f64|2], f0, 0, 1)
+ v30[f64|2] = vec_float_pack(v20[f64|2], f1, 1, 1)
+ v40[f64|2] = vec_float_expand(f5) # only expanded once
#
- v5[f64#2] = vec_float_add(v3[f64#2], v4[f64#2])
- v6[f64#2] = vec_float_add(v5[f64#2], v4[f64#2])
+ v50[f64|2] = vec_float_add(v30[f64|2], v40[f64|2])
+ v60[f64|2] = vec_float_add(v50[f64|2], v40[f64|2])
""", False)
self.assert_equal(loop2, loop3)
+
+ def find_input_arg(self, name, loop):
+ for arg in loop.inputargs:
+ if str(arg).startswith(name):
+ return arg
+ raise Exception("could not find %s in args %s" % (name,
loop.inputargs))
+
+ def test_signext_int16(self):
+ loop1 = self.parse("""
+ i10 = int_signext(i1, 2)
+ i11 = int_signext(i1, 2)
+ i12 = int_signext(i1, 2)
+ i13 = int_signext(i1, 2)
+ """)
+ pack1 = self.pack(loop1, 0, 4)
+ v103204 = self.find_input_arg('v103204', loop1)
+ def i1inv103204(var):
+ return 0, v103204
+ loop2 = self.schedule(loop1, [pack1], prepend_invariant=True,
getvboxfunc=i1inv103204)
+ loop3 = self.parse("""
+ v11[i16|4] = vec_int_signext(v103204[i32|4], 2)
+ """, False)
+ self.assert_equal(loop2, loop3)
diff --git a/rpython/jit/metainterp/optimizeopt/vectorize.py
b/rpython/jit/metainterp/optimizeopt/vectorize.py
--- a/rpython/jit/metainterp/optimizeopt/vectorize.py
+++ b/rpython/jit/metainterp/optimizeopt/vectorize.py
@@ -776,7 +776,7 @@
class PackType(object):
UNKNOWN_TYPE = '-'
- def __init__(self, type, size, signed, count=-1, scalar_cost=1,
vector_cost=1):
+ def __init__(self, type, size, signed, count=-1):
assert type in (FLOAT, INT, PackType.UNKNOWN_TYPE)
self.type = type
self.size = size
@@ -826,7 +826,6 @@
def clone(self):
return PackType(self.type, self.size, self.signed, self.count)
-
class OpToVectorOp(object):
def __init__(self, arg_ptypes, result_ptype):
self.arg_ptypes = [a for a in arg_ptypes] # do not use a tuple.
rpython cannot union
@@ -837,6 +836,9 @@
self.input_type = None
self.output_type = None
+ def clone_vbox_set_count(self, box, count):
+ return BoxVector(box.item_type, count, box.item_size, box.item_signed)
+
def is_vector_arg(self, i):
if i < 0 or i >= len(self.arg_ptypes):
return False
@@ -985,8 +987,7 @@
return vbox_cloned
def unpack(self, vbox, index, count, arg_ptype):
- vbox_cloned = vbox.clonebox()
- vbox_cloned.item_count = count
+ vbox_cloned = self.clone_vbox_set_count(vbox, count)
opnum = rop.VEC_FLOAT_UNPACK
if vbox.item_type == INT:
opnum = rop.VEC_INT_UNPACK
@@ -1012,8 +1013,8 @@
if pos == -1:
i += 1
continue
- new_box = tgt_box.clonebox()
- new_box.item_count += src_box.item_count
+ count = tgt_box.item_count + src_box.item_count
+ new_box = self.clone_vbox_set_count(tgt_box, count)
op = ResOperation(opnum, [tgt_box, src_box, ConstInt(i),
ConstInt(src_box.item_count)], new_box)
self.preamble_ops.append(op)
diff --git a/rpython/jit/tool/oparser.py b/rpython/jit/tool/oparser.py
--- a/rpython/jit/tool/oparser.py
+++ b/rpython/jit/tool/oparser.py
@@ -123,12 +123,12 @@
box = ts.BoxRef()
_box_counter_more_than(self.model, elem[1:])
elif elem.startswith('v'):
- pattern = re.compile('.*\[(u?)(i|f)(\d+)#(\d+)\]')
+ pattern = re.compile('.*\[(u?)(i|f)(\d+)(#|\|)(\d+)\]')
match = pattern.match(elem)
if match:
item_type = match.group(2)[0]
item_size = int(match.group(3)) // 8
- item_count = int(match.group(4))
+ item_count = int(match.group(5))
item_signed = not (match.group(1) == 'u')
box = self.model.BoxVector(item_type, item_count, item_size,
item_signed)
lbracket = elem.find('[')
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit