mbaret commented on a change in pull request #6711:
URL: https://github.com/apache/incubator-tvm/pull/6711#discussion_r513341812
##########
File path: python/tvm/topi/arm_cpu/depthwise_conv2d.py
##########
@@ -298,24 +299,29 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):
cfg["tile_c"] = SplitEntity([-1, 8])
cfg["tile_h"] = SplitEntity([-1, 2])
cfg["tile_w"] = SplitEntity([-1, 2])
- cfg["locate_output"] = OtherOptionEntity(1)
+ cfg["locate_output"] = OtherOptionEntity(2)
##### space definition end #####
def schedule_conv(conv):
conv_data = conv.op.input_tensors[0]
+ kernel_data = conv.op.input_tensors[1]
+ in_type = conv_data.dtype
+
+ _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape)
n, w, h, c = conv.op.axis
r_h, r_w = conv.op.reduce_axis
ho, hi = cfg["tile_h"].apply(s, conv, h)
wo, wi = cfg["tile_w"].apply(s, conv, w)
co, ci = cfg["tile_c"].apply(s, conv, c)
+ data_pad_value = -1
if conv_data.name == "data_pad":
assert isinstance(conv_data.op, tvm.te.ComputeOp)
# Define a policy for padding computation
cfg.define_knob("data_pad_inline", [1, 2, 3])
if cfg.is_fallback:
- cfg["data_pad_inline"] = OtherOptionEntity(3)
+ cfg["data_pad_inline"] = OtherOptionEntity(2)
Review comment:
Could we gate this change behind a check for whether we can actually tensorize? Just in case
it regresses default fp32 performance.
##########
File path: python/tvm/topi/arm_cpu/depthwise_conv2d.py
##########
@@ -324,24 +330,48 @@ def schedule_conv(conv):
s[conv_data].compute_at(s[conv], wo)
if cfg["data_pad_inline"].val == 3:
s[conv_data].compute_inline()
+ data_pad_value = cfg["data_pad_inline"].val
+
+ split_val = cfg["tile_c"].size[-1]
+ use_tensorization = (
+ (in_type == "int16")
+ and (split_val == 8)
+ and (IC % split_val == 0)
+ and (channel_multiplier == 1)
+ and (data_pad_value != 3)
+ and is_aarch64_arm()
+ )
+
+ if use_tensorization:
+ smlal = smlal_int16_int32()
+ s[conv].tensorize(ci, smlal)
+ else:
+ s[conv].vectorize(ci)
+
+ s[conv].unroll(r_h)
+ s[conv].unroll(r_w)
+ s[conv].unroll(wi)
+ s[conv].unroll(hi)
Review comment:
Please just check that these unrolls are an improvement for fp32 as well; otherwise, move
them inside the use_tensorization branch.
##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -879,6 +879,96 @@ def _instr(index):
)
+def smlal_int16_int32():
+ """
+ Intrinsic to be used in order to load two int16x8 vectors and multiply
+ them together through a pair of smlal/smlal2 instructions. The pseudo-code
+ for the algorithm is as follows:
+
+ vec_a = vld1q_s16(A)
+ vec_b = vld1q_s16(B)
+
+ vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction
+ vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction
+
+ So we load a single int16x8 vector and we accumulate its lower (0:4) and
+ higher part separately.
+ """
+ int16_lanes = 8
+ A = te.placeholder((int16_lanes,), dtype="int16", name="A")
+ B = te.placeholder((int16_lanes, 1), dtype="int16", name="B")
+ C = te.compute(
+ (int16_lanes,),
+ lambda i: A[i].astype("int32") * B[i, 0].astype("int32"),
+ name="C",
+ )
+
+ a_buffer = tvm.tir.decl_buffer(
+ A.shape, dtype="int16", name="a_buffer", offset_factor=1, strides=[1]
+ )
+ b_buffer = tvm.tir.decl_buffer(
+ B.shape,
+ dtype="int16",
+ name="b_buffer",
+ offset_factor=1,
+ strides=[te.var("sb"), 1],
+ )
+ c_buffer = tvm.tir.decl_buffer(
+ C.shape,
+ dtype="int32",
+ name="c_buffer",
+ offset_factor=1,
+ strides=[1],
+ )
+
+ def _intrin_func(ins, outs):
+ def _instr(index):
+ ib = tvm.tir.ir_builder.create()
+ if index == 1:
+ ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x8")))
+ return ib.get()
+
+ vec_a = ins[0].vload([0], "int16x8")
+ vec_b = ins[1].vload([0, 0], "int16x8")
+ inst = "llvm.aarch64.neon.smull"
+
+ # Lower part of the vector
+ vec_c_h = outs[0].vload([4], "int32x4")
+ vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a)
+ vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b)
+ vmull_h = tvm.tir.call_llvm_pure_intrin(
+ "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_h, vec_b_h
+ )
+ vec_out_h = vec_c_h + vmull_h
+
+ # Lower part of the vector
+ vec_c_l = outs[0].vload([0], "int32x4")
+ vec_a_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_a)
+ vec_b_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_b)
+ vmull_l = tvm.tir.call_llvm_pure_intrin(
+ "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_l, vec_b_l
+ )
+ vec_out_l = vec_c_l + vmull_l
+
+ # Combine higher and lower part in a single int32x8 vector to store
+ # (this will require two different STR instructions, since the
Review comment:
Nit: close the brackets, and STR -> STORE (for those of us who play with strings a
lot :) )
##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -879,6 +879,96 @@ def _instr(index):
)
+def smlal_int16_int32():
+ """
+ Intrinsic to be used in order to load two int16x8 vectors and multiply
+ them together through a pair of smlal/smlal2 instructions. The pseudo-code
+ for the algorithm is as follows:
+
+ vec_a = vld1q_s16(A)
+ vec_b = vld1q_s16(B)
Review comment:
Use a pseudo-instruction like vload maybe.
##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -879,6 +879,96 @@ def _instr(index):
)
+def smlal_int16_int32():
+ """
+ Intrinsic to be used in order to load two int16x8 vectors and multiply
+ them together through a pair of smlal/smlal2 instructions. The pseudo-code
+ for the algorithm is as follows:
+
+ vec_a = vld1q_s16(A)
+ vec_b = vld1q_s16(B)
+
+ vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction
+ vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction
+
+ So we load a single int16x8 vector and we accumulate its lower (0:4) and
+ higher part separately.
+ """
+ int16_lanes = 8
+ A = te.placeholder((int16_lanes,), dtype="int16", name="A")
+ B = te.placeholder((int16_lanes, 1), dtype="int16", name="B")
+ C = te.compute(
+ (int16_lanes,),
+ lambda i: A[i].astype("int32") * B[i, 0].astype("int32"),
+ name="C",
+ )
+
+ a_buffer = tvm.tir.decl_buffer(
+ A.shape, dtype="int16", name="a_buffer", offset_factor=1, strides=[1]
+ )
+ b_buffer = tvm.tir.decl_buffer(
+ B.shape,
+ dtype="int16",
+ name="b_buffer",
+ offset_factor=1,
+ strides=[te.var("sb"), 1],
+ )
+ c_buffer = tvm.tir.decl_buffer(
+ C.shape,
+ dtype="int32",
+ name="c_buffer",
+ offset_factor=1,
+ strides=[1],
+ )
+
+ def _intrin_func(ins, outs):
+ def _instr(index):
+ ib = tvm.tir.ir_builder.create()
+ if index == 1:
+ ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x8")))
+ return ib.get()
+
+ vec_a = ins[0].vload([0], "int16x8")
+ vec_b = ins[1].vload([0, 0], "int16x8")
+ inst = "llvm.aarch64.neon.smull"
+
+ # Lower part of the vector
Review comment:
Higher part of the vector.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]