mbaret commented on a change in pull request #6907:
URL: https://github.com/apache/incubator-tvm/pull/6907#discussion_r528691885
##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -447,28 +97,317 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type,
out_type):
C.shape, dtype="int32", name="c_buffer", offset_factor=1,
strides=[te.var("sc"), 1]
)
+ # Intrinsics used in the following algorithm
+ umull_intrin = "llvm.aarch64.neon.umull" if in_type == "uint8" else
"llvm.aarch64.neon.smull"
+ uaddlp_intrin = "llvm.aarch64.neon.uaddlp" if in_type == "uint8" else
"llvm.aarch64.neon.saddlp"
+ addp_intrin = "llvm.aarch64.neon.addp"
+
+ def uadalp(a, b):
+ """Add pair and accumulate
+
+ Parameters:
+ ----------
+ a: int16x8 vector
+ b: int16x8 vector
+
+ Returns:
+ --------
+ return a int32x4 vector
+
+ Pseudocode:
+ ----------
+ a += (b0+b1, b2+b3, b4+b5, b6+b7)
+ """
+
+ return a + tvm.tir.call_llvm_pure_intrin(
+ "int32x4", uaddlp_intrin, tvm.tir.const(1, "uint32"), b
+ )
+
+ def umull(a, b):
+ """Multiply long (higher part)
+
+ Parameters:
+ ----------
+ a: int8x16 vector
+ b: int8x16 vector
+
+ Returns:
+ --------
+ return a int16x8 vector
+
+ Pseudocode:
+ ----------
+ c = (a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7)
+ """
+ a_high = tvm.tir.call_intrin("int8x8", "tir.vectorhigh", a)
+ b_high = tvm.tir.call_intrin("int8x8", "tir.vectorhigh", b)
+ c = tvm.tir.call_llvm_pure_intrin(
+ "int16x8", umull_intrin, tvm.tir.const(2, "uint32"), a_high, b_high
+ )
+ return c
+
+ def umull2(a, b):
+ """Multiply long (lower part)
+
+ Parameters:
+ ----------
+ a: int8x16 vector
+ b: int8x16 vector
+
+ Returns:
+ --------
+ return a int16x8 vector
+
+ Pseudocode:
+ ----------
+ c = (a8*b8, a9*b9, a10*b10, a11*b11, a12*b12, a13*b13, a14*b14,
a15*b15)
+ """
+ a_low = tvm.tir.call_intrin("int8x8", "tir.vectorlow", a)
+ b_low = tvm.tir.call_intrin("int8x8", "tir.vectorlow", b)
+ c = tvm.tir.call_llvm_pure_intrin(
+ "int16x8", umull_intrin, tvm.tir.const(2, "uint32"), a_low, b_low
+ )
+ return c
+
+ def addp(a, b):
+ """Add two vectors in pairs
+
+ Parameters:
+ ----------
+ a: int32x4 vector
+ b: int32x4 vector
+
+ Returns:
+ --------
+ return a int32x4 vector
+
+ Pseudocode:
+ ----------
+ c = (a0+a1, a2+a3, b0+b1, b2+b3)
+ """
+ return tvm.tir.call_llvm_pure_intrin(
+ "int32x4", addp_intrin, tvm.tir.const(2, "uint32"), a, b
+ )
+
+ def accumulation_loop(M, N, ins, acc, tile_idx):
+ """Internal tile accumulation. This function
+ takes two arrays of int8 data type A[tile_idx][4][16] and
B[tile_idx][4][16], produces
+ a 4x4 matrix which is equal to A*B' and accumulates into C[4][4]
+
+ The pseudo code is as follows.
+
+ .. code-block:: c
+
+ void gemm_4x4_int8_int8_int32(int8 A[tile_idx][4][K],
+ int8 B[tile_idx][4][K],
+ int32 C[4][4]){
+ for (int i = 0; i < 4; i++){
+ for (int j = 0; j < 4; j++){
+ for (int k = 0; k < 16; k++){
+ C[i][j] += A[tile_idx][i][k] * B[tile_idx][j][k]
+ }
+ }
+ }
+
+ Notes:
+ * The tiling strategy is picked to maximize register usage.
+
+ Parameters:
+ ----------
+ M: number of total rows of the output matrix
+ N: number of total rows of the output matrix
Review comment:
I guess one of these should be 'columns' — both M and N are currently described as rows.
##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -403,8 +52,9 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type,
out_type):
columns of the matrix B
K: int
columns of matrix A
+ unroll: boolean
Review comment:
A couple of things: stylistically, NumPy docstrings would leave a space
on either side of the colon, and in Python the type would be 'bool'.
```suggestion
unroll : bool
```
##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -447,28 +97,317 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type,
out_type):
C.shape, dtype="int32", name="c_buffer", offset_factor=1,
strides=[te.var("sc"), 1]
)
+ # Intrinsics used in the following algorithm
+ umull_intrin = "llvm.aarch64.neon.umull" if in_type == "uint8" else
"llvm.aarch64.neon.smull"
+ uaddlp_intrin = "llvm.aarch64.neon.uaddlp" if in_type == "uint8" else
"llvm.aarch64.neon.saddlp"
+ addp_intrin = "llvm.aarch64.neon.addp"
+
+ def uadalp(a, b):
+ """Add pair and accumulate
+
+ Parameters:
+ ----------
+ a: int16x8 vector
+ b: int16x8 vector
+
+ Returns:
+ --------
+ return a int32x4 vector
+
+ Pseudocode:
+ ----------
+ a += (b0+b1, b2+b3, b4+b5, b6+b7)
+ """
+
+ return a + tvm.tir.call_llvm_pure_intrin(
+ "int32x4", uaddlp_intrin, tvm.tir.const(1, "uint32"), b
+ )
+
+ def umull(a, b):
+ """Multiply long (higher part)
+
+ Parameters:
+ ----------
+ a: int8x16 vector
+ b: int8x16 vector
+
+ Returns:
+ --------
+ return a int16x8 vector
+
+ Pseudocode:
+ ----------
+ c = (a0*b0, a1*b1, a2*b2, a3*b3, a4*b4, a5*b5, a6*b6, a7*b7)
+ """
+ a_high = tvm.tir.call_intrin("int8x8", "tir.vectorhigh", a)
+ b_high = tvm.tir.call_intrin("int8x8", "tir.vectorhigh", b)
+ c = tvm.tir.call_llvm_pure_intrin(
+ "int16x8", umull_intrin, tvm.tir.const(2, "uint32"), a_high, b_high
+ )
+ return c
+
+ def umull2(a, b):
+ """Multiply long (lower part)
+
+ Parameters:
+ ----------
+ a: int8x16 vector
+ b: int8x16 vector
+
+ Returns:
+ --------
+ return a int16x8 vector
+
+ Pseudocode:
+ ----------
+ c = (a8*b8, a9*b9, a10*b10, a11*b11, a12*b12, a13*b13, a14*b14,
a15*b15)
+ """
+ a_low = tvm.tir.call_intrin("int8x8", "tir.vectorlow", a)
+ b_low = tvm.tir.call_intrin("int8x8", "tir.vectorlow", b)
+ c = tvm.tir.call_llvm_pure_intrin(
+ "int16x8", umull_intrin, tvm.tir.const(2, "uint32"), a_low, b_low
+ )
+ return c
+
+ def addp(a, b):
+ """Add two vectors in pairs
+
+ Parameters:
+ ----------
+ a: int32x4 vector
+ b: int32x4 vector
+
+ Returns:
+ --------
+ return a int32x4 vector
+
+ Pseudocode:
+ ----------
+ c = (a0+a1, a2+a3, b0+b1, b2+b3)
+ """
+ return tvm.tir.call_llvm_pure_intrin(
+ "int32x4", addp_intrin, tvm.tir.const(2, "uint32"), a, b
+ )
+
+ def accumulation_loop(M, N, ins, acc, tile_idx):
+ """Internal tile accumulation. This function
+ takes two arrays of int8 data type A[tile_idx][4][16] and
B[tile_idx][4][16], produces
+ a 4x4 matrix which is equal to A*B' and accumulates into C[4][4]
+
+ The pseudo code is as follows.
+
+ .. code-block:: c
+
+ void gemm_4x4_int8_int8_int32(int8 A[tile_idx][4][K],
+ int8 B[tile_idx][4][K],
+ int32 C[4][4]){
+ for (int i = 0; i < 4; i++){
+ for (int j = 0; j < 4; j++){
+ for (int k = 0; k < 16; k++){
+ C[i][j] += A[tile_idx][i][k] * B[tile_idx][j][k]
+ }
+ }
+ }
+
+ Notes:
+ * The tiling strategy is picked to maximize register usage.
+
+ Parameters:
+ ----------
+ M: number of total rows of the output matrix
+ N: number of total rows of the output matrix
+ ins: input buffers
Review comment:
NumPy-style docstrings should put the description on a separate line (you
specify only the type after the colon, and are allowed to give no type).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]