mbaret commented on a change in pull request #6445:
URL: https://github.com/apache/incubator-tvm/pull/6445#discussion_r496745509



##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.

Review comment:
       This needs to explicitly reference the accumulation step, including in the
pseudo-code

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 
output[4][4]){
+            for (int i = 0; i < 4; i++){
+                for (int j = 0; i < 4; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * Matrix A is interleaved

Review comment:
       Matrix A is not interleaved at this stage

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 
output[4][4]){
+            for (int i = 0; i < 4; i++){
+                for (int j = 0; i < 4; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * Matrix A is interleaved
+    This function returns a TensorIntrin that can be used to tensorize a 
schedule.
+
+    Parameters
+    ----------
+    dtype: str, {"uint8", "int8"}
+        Whether it works on unsigned int or signed int
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Arm TensorIntrin that can be used in tensorizing schedule
+    """
+    data = te.placeholder((te.var("rows"), 4), dtype, name="data")
+    kernel = te.placeholder((4, 4), dtype, name="kernel")
+    dtype_vec = dtype + "x16"
+
+    k = te.reduce_axis((0, 4), name="k")
+    C = te.compute(
+        (te.var("rows"), 4),
+        lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, 
k].astype("int32"), axis=k),
+        name="C",
+    )
+
+    aa_buffer = tvm.tir.decl_buffer(
+        data.shape, dtype, name="aa_buffer", offset_factor=1, 
strides=[te.var("sa"), 1]
+    )
+    bb_buffer = tvm.tir.decl_buffer(
+        kernel.shape, dtype, name="bb_buffer", offset_factor=1, 
strides=[te.var("sb"), 1]
+    )
+    cc_buffer = tvm.tir.decl_buffer(
+        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, 
strides=[te.var("sc"), 1]
+    )
+
+    llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else 
"llvm.aarch64.neon.udot"
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.tir.ir_builder.create()
+            if index == 1:
+                for i in range(0, 4):
+                    ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 
"int32x4")))
+                return ib.get()
+
+            vec_a = ins[0].vload([0, 0], dtype_vec)
+            vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)]
+            vec_b = ins[1].vload([0, 0], dtype_vec)
+

Review comment:
       Could we include some comments here, maybe with an example?

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 
output[4][4]){
+            for (int i = 0; i < 4; i++){
+                for (int j = 0; i < 4; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * Matrix A is interleaved
+    This function returns a TensorIntrin that can be used to tensorize a 
schedule.
+
+    Parameters
+    ----------
+    dtype: str, {"uint8", "int8"}
+        Whether it works on unsigned int or signed int
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Arm TensorIntrin that can be used in tensorizing schedule
+    """
+    data = te.placeholder((te.var("rows"), 4), dtype, name="data")
+    kernel = te.placeholder((4, 4), dtype, name="kernel")
+    dtype_vec = dtype + "x16"
+
+    k = te.reduce_axis((0, 4), name="k")
+    C = te.compute(
+        (te.var("rows"), 4),
+        lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, 
k].astype("int32"), axis=k),
+        name="C",
+    )
+
+    aa_buffer = tvm.tir.decl_buffer(
+        data.shape, dtype, name="aa_buffer", offset_factor=1, 
strides=[te.var("sa"), 1]
+    )
+    bb_buffer = tvm.tir.decl_buffer(
+        kernel.shape, dtype, name="bb_buffer", offset_factor=1, 
strides=[te.var("sb"), 1]
+    )
+    cc_buffer = tvm.tir.decl_buffer(
+        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, 
strides=[te.var("sc"), 1]
+    )
+
+    llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else 
"llvm.aarch64.neon.udot"
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.tir.ir_builder.create()
+            if index == 1:
+                for i in range(0, 4):
+                    ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 
"int32x4")))
+                return ib.get()
+
+            vec_a = ins[0].vload([0, 0], dtype_vec)
+            vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)]
+            vec_b = ins[1].vload([0, 0], dtype_vec)
+
+            # Execute the dot product
+            for i in range(0, 4):
+                vec_c = outs[0].vload([i, 0], "int32x4")
+                vdot = tvm.tir.call_llvm_intrin(
+                    "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, 
vec_b, vec_aa[i],
+                )
+
+                # Store the result
+                ib.emit(outs[0].vstore([i, 0], vdot))
+
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    buffer_params = {"offset_factor": 1}
+    return te.decl_tensor_intrin(
+        C.op,
+        _intrin_func,
+        binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer},
+        default_buffer_params=buffer_params,
+    )
+
+
+def mmla_16x4_int8_int8_int32(dtype, rows):

Review comment:
       This needs at least changing to nx16, along with a description of how that
interacts with the `rows` parameter.

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 
output[4][4]){
+            for (int i = 0; i < 4; i++){
+                for (int j = 0; i < 4; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * Matrix A is interleaved
+    This function returns a TensorIntrin that can be used to tensorize a 
schedule.
+
+    Parameters
+    ----------
+    dtype: str, {"uint8", "int8"}
+        Whether it works on unsigned int or signed int
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Arm TensorIntrin that can be used in tensorizing schedule
+    """
+    data = te.placeholder((te.var("rows"), 4), dtype, name="data")

Review comment:
       Using te.var("rows") here instead of just 4 is strange — could you
explain?

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 
output[4][4]){
+            for (int i = 0; i < 4; i++){
+                for (int j = 0; i < 4; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * Matrix A is interleaved
+    This function returns a TensorIntrin that can be used to tensorize a 
schedule.
+
+    Parameters
+    ----------
+    dtype: str, {"uint8", "int8"}
+        Whether it works on unsigned int or signed int
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Arm TensorIntrin that can be used in tensorizing schedule
+    """
+    data = te.placeholder((te.var("rows"), 4), dtype, name="data")
+    kernel = te.placeholder((4, 4), dtype, name="kernel")

Review comment:
       I'd advise either adding a comment explaining how A/B map to data/kernel, or
changing these names to A/B.

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 
output[4][4]){
+            for (int i = 0; i < 4; i++){
+                for (int j = 0; i < 4; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * Matrix A is interleaved
+    This function returns a TensorIntrin that can be used to tensorize a 
schedule.
+
+    Parameters
+    ----------
+    dtype: str, {"uint8", "int8"}
+        Whether it works on unsigned int or signed int
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Arm TensorIntrin that can be used in tensorizing schedule
+    """
+    data = te.placeholder((te.var("rows"), 4), dtype, name="data")
+    kernel = te.placeholder((4, 4), dtype, name="kernel")
+    dtype_vec = dtype + "x16"
+
+    k = te.reduce_axis((0, 4), name="k")
+    C = te.compute(
+        (te.var("rows"), 4),
+        lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, 
k].astype("int32"), axis=k),
+        name="C",
+    )
+
+    aa_buffer = tvm.tir.decl_buffer(
+        data.shape, dtype, name="aa_buffer", offset_factor=1, 
strides=[te.var("sa"), 1]
+    )
+    bb_buffer = tvm.tir.decl_buffer(
+        kernel.shape, dtype, name="bb_buffer", offset_factor=1, 
strides=[te.var("sb"), 1]
+    )
+    cc_buffer = tvm.tir.decl_buffer(
+        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, 
strides=[te.var("sc"), 1]
+    )
+
+    llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else 
"llvm.aarch64.neon.udot"
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.tir.ir_builder.create()
+            if index == 1:
+                for i in range(0, 4):
+                    ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 
"int32x4")))
+                return ib.get()
+
+            vec_a = ins[0].vload([0, 0], dtype_vec)
+            vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)]
+            vec_b = ins[1].vload([0, 0], dtype_vec)
+
+            # Execute the dot product
+            for i in range(0, 4):
+                vec_c = outs[0].vload([i, 0], "int32x4")
+                vdot = tvm.tir.call_llvm_intrin(
+                    "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, 
vec_b, vec_aa[i],
+                )
+
+                # Store the result
+                ib.emit(outs[0].vstore([i, 0], vdot))
+
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    buffer_params = {"offset_factor": 1}
+    return te.decl_tensor_intrin(
+        C.op,
+        _intrin_func,
+        binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer},
+        default_buffer_params=buffer_params,
+    )
+
+
+def mmla_16x4_int8_int8_int32(dtype, rows):
+    """
+    Int8 16x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[rows][4] and
+    B[4][16] and produces a rowsx16 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_16x4_int8_int8_int32(int8 A[rows][4], int8 B[4][16], int32 
output[rows][16]){
+            for (int i = 0; i < rows; i++){
+                for (int j = 0; i < 16; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }

Review comment:
       indent

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):

Review comment:
       Suggestion: mmla -> gemm_acc? This avoids overloading the term mmla, which
also refers to a hardware intrinsic.

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 
output[4][4]){
+            for (int i = 0; i < 4; i++){
+                for (int j = 0; i < 4; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * Matrix A is interleaved
+    This function returns a TensorIntrin that can be used to tensorize a 
schedule.
+
+    Parameters
+    ----------
+    dtype: str, {"uint8", "int8"}
+        Whether it works on unsigned int or signed int
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Arm TensorIntrin that can be used in tensorizing schedule
+    """
+    data = te.placeholder((te.var("rows"), 4), dtype, name="data")
+    kernel = te.placeholder((4, 4), dtype, name="kernel")
+    dtype_vec = dtype + "x16"
+
+    k = te.reduce_axis((0, 4), name="k")
+    C = te.compute(
+        (te.var("rows"), 4),
+        lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, 
k].astype("int32"), axis=k),
+        name="C",
+    )
+
+    aa_buffer = tvm.tir.decl_buffer(
+        data.shape, dtype, name="aa_buffer", offset_factor=1, 
strides=[te.var("sa"), 1]
+    )
+    bb_buffer = tvm.tir.decl_buffer(
+        kernel.shape, dtype, name="bb_buffer", offset_factor=1, 
strides=[te.var("sb"), 1]
+    )
+    cc_buffer = tvm.tir.decl_buffer(
+        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, 
strides=[te.var("sc"), 1]
+    )
+
+    llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else 
"llvm.aarch64.neon.udot"
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.tir.ir_builder.create()
+            if index == 1:
+                for i in range(0, 4):
+                    ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 
"int32x4")))
+                return ib.get()
+
+            vec_a = ins[0].vload([0, 0], dtype_vec)
+            vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)]
+            vec_b = ins[1].vload([0, 0], dtype_vec)
+
+            # Execute the dot product
+            for i in range(0, 4):
+                vec_c = outs[0].vload([i, 0], "int32x4")
+                vdot = tvm.tir.call_llvm_intrin(
+                    "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, 
vec_b, vec_aa[i],
+                )
+
+                # Store the result
+                ib.emit(outs[0].vstore([i, 0], vdot))
+
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    buffer_params = {"offset_factor": 1}
+    return te.decl_tensor_intrin(
+        C.op,
+        _intrin_func,
+        binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer},
+        default_buffer_params=buffer_params,
+    )
+
+
+def mmla_16x4_int8_int8_int32(dtype, rows):
+    """
+    Int8 16x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[rows][4] and
+    B[4][16] and produces a rowsx16 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_16x4_int8_int8_int32(int8 A[rows][4], int8 B[4][16], int32 
output[rows][16]){
+            for (int i = 0; i < rows; i++){
+                for (int j = 0; i < 16; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * A is not interleaved, but used in its native form

Review comment:
       I think it's better to describe the properties this matrix has now 
rather than how it was produced.

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
+def select_word(vec, lane, dtype_vec):
+    """
+    Utility function used to select a int8x4 word within a int8x16 vector
+    and replicate 4 times.
+    The pseudo-code for this operation is:
+
+    v = [x0, ..., x15]
+    vsub(lane) = v[4*lane:4*lane+3]
+    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]
+
+    Note that 0<=lane<4
+
+     Parameters
+    ----------
+    vec: tvm.tir.Expr
+         int8x16 vector expression
+    lane: int
+        vector lane we want to replicate
+    dtype_vec: str
+        vector data type (e.g., int8x16)
+
+    Returns
+    ----------
+    output: tvm.tir.Expr
+        replicated vector
+    """
+    # Reinterpret vec_a as 4 int32 words
+    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
+    # Broadcast the lane-th word
+    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
+    # Convert back to uint8x16
+    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", 
vec_int32_shuffled)
+    return vec_int8_broadcast
+
+
+def mmla_4x4_int8_int8_int32(dtype):
+    """
+    Int8 4x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[4][4] and
+    B[4][4] and produces a 4x4 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 
output[4][4]){
+            for (int i = 0; i < 4; i++){
+                for (int j = 0; i < 4; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * Matrix A is interleaved
+    This function returns a TensorIntrin that can be used to tensorize a 
schedule.
+
+    Parameters
+    ----------
+    dtype: str, {"uint8", "int8"}
+        Whether it works on unsigned int or signed int
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Arm TensorIntrin that can be used in tensorizing schedule
+    """
+    data = te.placeholder((te.var("rows"), 4), dtype, name="data")
+    kernel = te.placeholder((4, 4), dtype, name="kernel")
+    dtype_vec = dtype + "x16"
+
+    k = te.reduce_axis((0, 4), name="k")
+    C = te.compute(
+        (te.var("rows"), 4),
+        lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, 
k].astype("int32"), axis=k),
+        name="C",
+    )
+
+    aa_buffer = tvm.tir.decl_buffer(
+        data.shape, dtype, name="aa_buffer", offset_factor=1, 
strides=[te.var("sa"), 1]
+    )
+    bb_buffer = tvm.tir.decl_buffer(
+        kernel.shape, dtype, name="bb_buffer", offset_factor=1, 
strides=[te.var("sb"), 1]
+    )
+    cc_buffer = tvm.tir.decl_buffer(
+        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, 
strides=[te.var("sc"), 1]
+    )
+
+    llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else 
"llvm.aarch64.neon.udot"
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.tir.ir_builder.create()
+            if index == 1:
+                for i in range(0, 4):
+                    ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, 
"int32x4")))
+                return ib.get()
+
+            vec_a = ins[0].vload([0, 0], dtype_vec)
+            vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)]
+            vec_b = ins[1].vload([0, 0], dtype_vec)
+
+            # Execute the dot product
+            for i in range(0, 4):
+                vec_c = outs[0].vload([i, 0], "int32x4")
+                vdot = tvm.tir.call_llvm_intrin(
+                    "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, 
vec_b, vec_aa[i],
+                )
+
+                # Store the result
+                ib.emit(outs[0].vstore([i, 0], vdot))
+
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    buffer_params = {"offset_factor": 1}
+    return te.decl_tensor_intrin(
+        C.op,
+        _intrin_func,
+        binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer},
+        default_buffer_params=buffer_params,
+    )
+
+
+def mmla_16x4_int8_int8_int32(dtype, rows):
+    """
+    Int8 16x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[rows][4] and

Review comment:
       [4] should be [16], change in other places too

##########
File path: python/tvm/topi/arm_cpu/tensor_intrin.py
##########
@@ -589,6 +587,236 @@ def _instr(index):
     )
 
 
def select_word(vec, lane, dtype_vec):
    """
    Utility function used to select a int8x4 word within a int8x16 vector
    and replicate 4 times.
    The pseudo-code for this operation is:

    v = [x0, ..., x15]
    vsub(lane) = v[4*lane:4*lane+3]
    replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)]

    Note that 0 <= lane < 4

    Parameters
    ----------
    vec : tvm.tir.Expr
        int8x16 vector expression
    lane : int
        vector lane we want to replicate
    dtype_vec : str
        vector data type (e.g., int8x16)

    Returns
    -------
    output : tvm.tir.Expr
        replicated vector
    """
    # Reinterpret vec as 4 int32 words so a whole int8x4 word can be
    # selected with a single shuffle index
    vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec)
    # Broadcast the lane-th word across all 4 int32 lanes
    vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane])
    # Reinterpret back to the original vector type (e.g., int8x16)
    vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", vec_int32_shuffled)
    return vec_int8_broadcast
+
+
def mmla_4x4_int8_int8_int32(dtype):
    """
    Int8 4x4 matrix multiplication and accumulation using sdot/udot
    instructions. This function takes two arrays of int8 datatype
    -- A[4][4] and B[4][4] -- and accumulates the 4x4 product A*B
    into the int32 output. The pseudo code is as follows.

    .. code-block:: c

        void mmla_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 output[4][4]){
            for (int i = 0; i < 4; i++){
                for (int j = 0; j < 4; j++){
                    for (int k = 0; k < 4; k++){
                        out[i][j] += A[i][k] * B[j][k]
                    }
                }
            }
        }

    Notes:
        * The rows of matrix B are transposed
        * Matrix A is interleaved
        * The body/update steps accumulate into the output buffer; the
          separate reset step zero-initializes it
    This function returns a TensorIntrin that can be used to tensorize a schedule.

    Parameters
    ----------
    dtype: str, {"uint8", "int8"}
        Whether it works on unsigned int or signed int

    Returns
    -------
    intrin : TensorIntrin
        The Arm TensorIntrin that can be used in tensorizing schedule
    """
    data = te.placeholder((te.var("rows"), 4), dtype, name="data")
    kernel = te.placeholder((4, 4), dtype, name="kernel")
    dtype_vec = dtype + "x16"

    k = te.reduce_axis((0, 4), name="k")
    C = te.compute(
        (te.var("rows"), 4),
        lambda i, j: te.sum(data[i, k].astype("int32") * kernel[j, k].astype("int32"), axis=k),
        name="C",
    )

    aa_buffer = tvm.tir.decl_buffer(
        data.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1]
    )
    bb_buffer = tvm.tir.decl_buffer(
        kernel.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1]
    )
    cc_buffer = tvm.tir.decl_buffer(
        C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1]
    )

    # Pick the signed or unsigned dot-product intrinsic to match dtype
    llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot"

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.tir.ir_builder.create()
            if index == 1:
                # Reset step: zero-initialize the 4x4 int32 accumulator
                for i in range(0, 4):
                    ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4")))
                return ib.get()

            # Load the 4x4 tile of A and replicate each int8x4 row
            # across a full vector (see select_word)
            vec_a = ins[0].vload([0, 0], dtype_vec)
            vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)]
            vec_b = ins[1].vload([0, 0], dtype_vec)

            # Execute the dot product, accumulating on top of the
            # current output values (vec_c)
            for i in range(0, 4):
                vec_c = outs[0].vload([i, 0], "int32x4")
                vdot = tvm.tir.call_llvm_intrin(
                    "int32x4", llvm_intrin, tvm.tir.const(3, "uint32"), vec_c, vec_b, vec_aa[i],
                )

                # Store the accumulated result
                ib.emit(outs[0].vstore([i, 0], vdot))

            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    buffer_params = {"offset_factor": 1}
    return te.decl_tensor_intrin(
        C.op,
        _intrin_func,
        binds={data: aa_buffer, kernel: bb_buffer, C: cc_buffer},
        default_buffer_params=buffer_params,
    )
+
+
+def mmla_16x4_int8_int8_int32(dtype, rows):
+    """
+    Int8 16x4 matrix multiplication using sdot/udot instructions
+    This function takes two arrays of int8 datatype -- A[rows][4] and
+    B[4][16] and produces a rowsx16 matrix which is equal to A*B
+    The pseudo code is as follows.
+
+    .. code-block:: c
+
+        void mmla_16x4_int8_int8_int32(int8 A[rows][4], int8 B[4][16], int32 
output[rows][16]){
+            for (int i = 0; i < rows; i++){
+                for (int j = 0; i < 16; i++){
+                    out[i][j] = 0;
+                    for (int k = 0; k < 4; k++){
+                        out[i][j] += A[i][k] * B[j][k]
+                    }
+            }
+        }
+
+    Notes:
+        * The rows of matrix B are transposed
+        * A is not interleaved, but used in its native form
+    This function returns a TensorIntrin that can be used to tensorize a 
schedule.
+
+    Parameters
+    ----------
+    dtype: str, {"uint8", "int8"}
+        Whether it works on unsigned int or signed int
+
+    Returns
+    -------
+    intrin : TensorIntrin
+        The Arm TensorIntrin that can be used in tensorizing schedule
+    """
+    data = te.placeholder((rows, 16), dtype, name="data")
+    kernel = te.placeholder((4, 16, 4), dtype, name="kernel")
+    dtype_vec = dtype + "x16"
+    idxm = tvm.tir.indexmod
+    k = te.reduce_axis((0, 16), name="k")
+    C = te.compute(
+        (rows, 16),
+        lambda i, j: te.sum(
+            data[i, k].astype("int32") * kernel[k // 4, j, idxm(k, 
4)].astype("int32"), axis=k
+        ),
+        name="C",
+    )

Review comment:
       This needs further explanation — in particular the interleaved (4, 16, 4) kernel layout and the `k // 4` / `idxm(k, 4)` index arithmetic used to address it.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to