comaniac commented on a change in pull request #9860:
URL: https://github.com/apache/tvm/pull/9860#discussion_r780528430



##########
File path: 
tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py
##########
@@ -0,0 +1,1554 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: 
disable=missing-module-docstring,missing-function-docstring,missing-class-docstring
+from typing import Callable, List
+
+from numpy.testing import assert_allclose
+import tvm
+from tvm import meta_schedule as ms, te, tir
+from tvm.script import tir as T
+
+N_FEATURES = 164
+
+
[email protected]_func
+def matmul(
+    A: T.Buffer[(512, 512), "float32"],
+    B: T.Buffer[(512, 512), "float32"],
+    C: T.Buffer[(512, 512), "float32"],
+) -> None:
+    # function attr dict
+    T.func_attr({"global_symbol": "main", "tir.noalias": True})
+    # body
+    # with T.block("root")
+    for i0, i1, i2 in T.grid(512, 512, 512):
+        with T.block("C"):
+            i, j, k = T.axis.remap("SSR", [i0, i1, i2])
+            T.reads(C[i, j], A[i, k], B[k, j])
+            T.writes(C[i, j])
+            with T.init():
+                C[i, j] = T.float32(0)
+            C[i, j] = C[i, j] + A[i, k] * B[k, j]
+
+
+def _make_context(target) -> ms.TuneContext:
+    return ms.TuneContext(
+        target=target,
+        num_threads=1,
+    )
+
+
+def _make_candidate(f_sch: Callable[[], tir.Schedule]) -> ms.MeasureCandidate:
+    return ms.MeasureCandidate(sch=f_sch(), args_info=[])
+
+
+def _feature_names(  # pylint: disable=invalid-name
+    buffers_per_store: int = 5,
+    arith_intensity_curve_num_samples: int = 10,
+) -> List[str]:
+    result = [
+        "float_mad",
+        "float_addsub",
+        "float_mul",
+        "float_divmod",
+        "float_cmp",
+        "float_mathfunc",
+        "float_otherfunc",
+        "int_mad",
+        "int_addsub",
+        "int_mul",
+        "int_divmod",
+        "int_cmp",
+        "int_mathfunc",
+        "int_otherfunc",
+        "bool_op",
+        "select_op",
+        "vec_num",
+        "vec_prod",
+        "vec_len",
+        "vec_type.kPosNone",
+        "vec_type.kPosInnerSpatial",
+        "vec_type.kPosMiddleSpatial",
+        "vec_type.kPosOuterSpatial",
+        "vec_type.kPosInnerReduce",
+        "vec_type.kPosMiddleReduce",
+        "vec_type.kPosOuterReduce",
+        "vec_type.kPosMixed",
+        "unroll_num",
+        "unroll_prod",
+        "unroll_len",
+        "unroll_type.kPosNone",
+        "unroll_type.kPosInnerSpatial",
+        "unroll_type.kPosMiddleSpatial",
+        "unroll_type.kPosOuterSpatial",
+        "unroll_type.kPosInnerReduce",
+        "unroll_type.kPosMiddleReduce",
+        "unroll_type.kPosOuterReduce",
+        "unroll_type.kPosMixed",
+        "parallel_num",
+        "parallel_prod",
+        "parallel_len",
+        "parallel_type.kPosNone",
+        "parallel_type.kPosInnerSpatial",
+        "parallel_type.kPosMiddleSpatial",
+        "parallel_type.kPosOuterSpatial",
+        "parallel_type.kPosInnerReduce",
+        "parallel_type.kPosMiddleReduce",
+        "parallel_type.kPosOuterReduce",
+        "parallel_type.kPosMixed",
+        "is_gpu",
+        "blockIdx_x_len",
+        "blockIdx_y_len",
+        "blockIdx_z_len",
+        "threadIdx_x_len",
+        "threadIdx_y_len",
+        "threadIdx_z_len",
+        "vthread_len",
+    ]
+    for i in range(buffers_per_store):
+        result.extend(
+            f"B{i}.{s}"
+            for s in [
+                "acc_type.kRead",
+                "acc_type.kWrite",
+                "acc_type.kReadWrite",
+                "bytes",
+                "unique_bytes",
+                "lines",
+                "unique_lines",
+                "reuse_type.kLoopMultipleRead",
+                "reuse_type.kSerialMultipleReadWrite",
+                "reuse_type.kNoReuse",
+                "reuse_dis_iter",
+                "reuse_dis_bytes",
+                "reuse_ct",
+                "bytes_d_reuse_ct",
+                "unique_bytes_d_reuse_ct",
+                "lines_d_reuse_ct",
+                "unique_lines_d_reuse_ct",
+                "stride",
+            ]
+        )
+    result.extend(f"arith_intensity_curve_{i}" for i in 
range(arith_intensity_curve_num_samples))
+    result.extend(
+        [
+            "alloc_size",
+            "alloc_prod",
+            "alloc_outer_prod",
+            "alloc_inner_prod",
+            "outer_prod",
+            "num_loops",
+            "auto_unroll_max_step",
+        ]
+    )
+    # 57 + 18 * 5 + 10 + 4 + 3
+    assert len(result) == N_FEATURES
+    return result
+
+
+def _zip_feature(feature, names):
+    assert feature.ndim == 1
+    assert feature.shape[0] == N_FEATURES
+    assert len(names) == N_FEATURES
+    return list(zip(names, feature))
+
+
+def _print_feature(feature, st, ed):  # pylint: disable=invalid-name
+    named_feature = _zip_feature(feature, _feature_names())
+    for k, v in named_feature[st:ed]:
+        print("\t", k, v)
+
+
+def test_cpu_matmul():
+    def _create_schedule():
+        func = matmul
+        sch = tir.Schedule(func, debug_mask="all")
+        block = sch.get_block("C")
+        i, j, k = sch.get_loops(block)
+        i_o, i_i = sch.split(i, factors=[None, 16])  # outer: 32
+        j_o, j_i = sch.split(j, factors=[None, 8])  # outer: 64
+        sch.reorder(i_o, j_o, k, j_i, i_i)
+        sch.vectorize(j_i)
+        sch.parallel(i_o)
+        sch.parallel(j_o)
+        sch.unroll(k)
+        return sch
+
+    extractor = ms.feature_extractor.PerStoreFeature()
+    (feature,) = extractor.extract_from(
+        _make_context(tvm.target.Target("llvm")),
+        candidates=[_make_candidate(_create_schedule)],
+    )
+    feature = feature.numpy()
+    assert feature.shape == (1, N_FEATURES)
+    f = feature[0]
+    # Group 1.1: arith
+    assert_allclose(
+        actual=f[0:16],
+        # fmt: off
+        desired=[
+            # float math ops
+            0, 27, 27, 0, 0, 0, 0,
+            # int math ops
+            0, 29, 29, 0, 0, 0, 0,
+            # bool/select ops
+            0, 0,
+        ],
+        # fmt: on
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.2: vectorize
+    assert_allclose(
+        actual=f[16:27],
+        desired=[1.0, 3.169924, 3.169924, 0, 0, 0, 0, 0, 0, 0, 1],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.3: unroll
+    assert_allclose(
+        actual=f[27:38],
+        desired=[1.0, 9.002815, 9.002815, 0, 0, 0, 0, 0, 0, 0, 1],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.4: parallel
+    assert_allclose(
+        actual=f[38:49],
+        desired=[1.58496, 11.0007, 6.022368, 0, 0, 0, 0, 0, 0, 0, 1],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.5: is_gpu, blockIdx.x/y/z, threadIdx.x/y/z, vthread
+    assert_allclose(
+        actual=f[49:57],
+        desired=[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.1: Buffer A
+    assert_allclose(
+        actual=f[57:75],
+        desired=[
+            1,
+            0,
+            0,
+            29,
+            20,
+            27,
+            14,
+            1,
+            0,
+            0,
+            4.087463,
+            7.0552826,
+            3.169925,
+            26,
+            17,
+            24,
+            11.0007038,
+            9.002815,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.2: Buffer C
+    assert_allclose(
+        actual=f[75:93],
+        desired=[
+            0.0,
+            0.0,
+            1.0,
+            29.0,
+            20.000001907348633,
+            27.0,
+            14.00008773803711,
+            1.0,
+            0.0,
+            0.0,
+            7.011227130889893,
+            9.250298500061035,
+            9.002815246582031,
+            20.000001907348633,
+            11.000703811645508,
+            18.0000057220459,
+            5.044394016265869,
+            9.002815246582031,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.3: Buffer B
+    assert_allclose(
+        actual=f[93:111],
+        desired=[
+            1.0,
+            0.0,
+            0.0,
+            29.0,
+            20.000001907348633,
+            19.000001907348633,
+            14.00008773803711,
+            1.0,
+            0.0,
+            0.0,
+            1.0,
+            3.700439691543579,
+            4.087462902069092,
+            25.0,
+            16.000022888183594,
+            15.000043869018555,
+            10.001408576965332,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.4: Dummy padding
+    assert_allclose(
+        actual=f[111:129],
+        desired=[0.0] * 18,
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.5: Dummy padding
+    assert_allclose(
+        actual=f[129:147],
+        desired=[0.0] * 18,
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 3: Arithmetic intensity
+    assert_allclose(
+        actual=f[147:157],
+        desired=[
+            0.7097842693328857,
+            0.7408391237258911,
+            0.8750449419021606,
+            0.9449487924575806,
+            1.0148526430130005,
+            1.0847564935684204,
+            1.113688349723816,
+            1.1394684314727783,
+            1.2119636535644531,
+            1.2971993684768677,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 4 & 5
+    assert_allclose(
+        actual=f[157:164],
+        desired=[
+            20.000001907348633,
+            18.0000057220459,
+            1.0,
+            27.0,
+            27.0,
+            2.5849626064300537,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+
+
+def test_cpu_fusion():
+    # pylint: disable=all
+    @T.prim_func
+    def func(a: T.handle, b: T.handle, c: T.handle) -> None:
+        A = T.match_buffer(a, [64, 32], dtype="float32")
+        B = T.match_buffer(b, [64, 32], dtype="float32")
+        C = T.match_buffer(c, [64, 32], dtype="float32")
+        for i, j in T.grid(64, 32):  # type: ignore
+            with T.block():
+                T.reads([A[i, j], B[i, j]])  # type: ignore
+                T.writes([B[i, j], C[i, j]])  # type: ignore
+                with T.block("B"):
+                    T.reads([A[i, j]])  # type: ignore
+                    T.writes([B[i, j]])  # type: ignore
+                    B[i, j] = A[i, j]  # type: ignore
+                with T.block("C"):
+                    T.reads([B[i, j]])  # type: ignore
+                    T.writes([C[i, j]])  # type: ignore
+                    C[i, j] = B[i, j]  # type: ignore
+
+    # pylint: enable=all
+
+    def _create_schedule():
+        return tir.Schedule(func, debug_mask="all")
+
+    extractor = ms.feature_extractor.PerStoreFeature()
+    (feature,) = extractor.extract_from(
+        _make_context(tvm.target.Target("llvm")),
+        candidates=[_make_candidate(_create_schedule)],
+    )
+    feature = feature.numpy()
+    assert feature.shape == (2, N_FEATURES)
+    ## Features for BufferStore(B)
+    f = feature[0]
+    # Group 1.1: arith
+    assert_allclose(
+        actual=f[0:16],
+        # fmt: off
+        desired=[0.0] * 16,
+        # fmt: on
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.2: vectorize
+    assert_allclose(
+        actual=f[16:27],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.3: unroll
+    assert_allclose(
+        actual=f[27:38],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.4: parallel
+    assert_allclose(
+        actual=f[38:49],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.5: is_gpu, blockIdx.x/y/z, threadIdx.x/y/z, vthread
+    assert_allclose(
+        actual=f[49:57],
+        desired=[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.1: Buffer A
+    assert_allclose(
+        actual=f[57:75],
+        desired=[
+            1.0,
+            0.0,
+            0.0,
+            13.000176429748535,
+            13.000176429748535,
+            7.011227130889893,
+            7.011227130889893,
+            0.0,
+            0.0,
+            1.0,
+            0.0,
+            0.0,
+            0.0,
+            14.00008773803711,
+            14.00008773803711,
+            8.005624771118164,
+            8.005624771118164,
+            1.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.2: Buffer B
+    assert_allclose(
+        actual=f[75:93],
+        desired=[
+            0.0,
+            1.0,
+            0.0,
+            13.000176429748535,
+            13.000176429748535,
+            7.011227130889893,
+            7.011227130889893,
+            0.0,
+            0.0,
+            1.0,
+            0.0,
+            0.0,
+            0.0,
+            14.00008773803711,
+            14.00008773803711,
+            8.005624771118164,
+            8.005624771118164,
+            1.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.3: Dummy padding
+    assert_allclose(
+        actual=f[93:111],
+        desired=[0.0] * 18,
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.4: Dummy padding
+    assert_allclose(
+        actual=f[111:129],
+        desired=[0.0] * 18,
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.5: Dummy padding
+    assert_allclose(
+        actual=f[129:147],
+        desired=[0.0] * 18,
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 3: Arithmetic intensity
+    assert_allclose(
+        actual=f[147:157],
+        desired=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 4 & 5
+    assert_allclose(
+        actual=f[157:164],
+        desired=[
+            13.000176,
+            11.000703811645508,
+            1.0,
+            11.000703811645508,
+            11.000703811645508,
+            1.5849624872207642,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    ## Features for BufferStore(C)
+    f = feature[1]
+    # Group 1.1: arith
+    assert_allclose(
+        actual=f[0:16],
+        # fmt: off
+        desired=[0.0] * 16,
+        # fmt: on
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.2: vectorize
+    assert_allclose(
+        actual=f[16:27],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.3: unroll
+    assert_allclose(
+        actual=f[27:38],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.4: parallel
+    assert_allclose(
+        actual=f[38:49],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.5: is_gpu, blockIdx.x/y/z, threadIdx.x/y/z, vthread
+    assert_allclose(
+        actual=f[49:57],
+        desired=[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.1: Buffer B
+    assert_allclose(
+        actual=f[57:75],
+        desired=[
+            1.0,
+            0.0,
+            0.0,
+            13.000176429748535,
+            13.000176429748535,
+            7.011227130889893,
+            7.011227130889893,
+            0.0,
+            1.0,
+            0.0,
+            1.0,
+            4.087462902069092,
+            1.0,
+            13.000176429748535,
+            13.000176429748535,
+            7.011227130889893,
+            7.011227130889893,
+            1.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.2: Buffer C
+    assert_allclose(
+        actual=f[75:93],
+        desired=[
+            0.0,
+            1.0,
+            0.0,
+            13.000176429748535,
+            13.000176429748535,
+            7.011227130889893,
+            7.011227130889893,
+            0.0,
+            0.0,
+            1.0,
+            0.0,
+            0.0,
+            0.0,
+            14.00008773803711,
+            14.00008773803711,
+            8.005624771118164,
+            8.005624771118164,
+            1.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.3: Dummy padding
+    assert_allclose(
+        actual=f[93:111],
+        desired=[0.0] * 18,
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.4: Dummy padding
+    assert_allclose(
+        actual=f[111:129],
+        desired=[0.0] * 18,
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.5: Dummy padding
+    assert_allclose(
+        actual=f[129:147],
+        desired=[0.0] * 18,
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 3: Arithmetic intensity
+    assert_allclose(
+        actual=f[147:157],
+        desired=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 4 & 5
+    assert_allclose(
+        actual=f[157:164],
+        desired=[
+            13.000176429748535,
+            11.000703811645508,
+            1.0,
+            11.000703811645508,
+            11.000703811645508,
+            1.5849624872207642,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+
+
+def test_gpu():
+    def _create_schedule():
+        func = matmul
+        sch = tir.Schedule(func, debug_mask="all")
+        c = sch.get_block("C")
+        c_local = sch.cache_write(c, 0, "local")
+        i, j, k = sch.get_loops(c)
+        # pylint: disable=invalid-name
+        i0, i1, i2, i3, i4 = sch.split(i, factors=[None, 1, 16, 32, 1])  # 
outer: 1
+        j0, j1, j2, j3, j4 = sch.split(j, factors=[None, 4, 1, 1, 16])  # 
outer: 8
+        k0, k1, k2 = sch.split(k, factors=[None, 1, 2])  # outer: 256
+        # pylint: enable=invalid-name
+        # fmt: off
+        sch.reorder(
+            i0, j0,  # S
+            i1, j1,  # S
+            i2, j2,  # S
+            k0,      # R
+            k1,      # R
+            i3, j3,  # S
+            k2,      # R
+            i4, j4,  # S
+        )
+        # fmt: on
+        # thread binding
+        i0_j0 = sch.fuse(i0, j0)
+        i1_j1 = sch.fuse(i1, j1)
+        i2_j2 = sch.fuse(i2, j2)
+        sch.bind(i0_j0, "blockIdx.x")
+        sch.bind(i1_j1, "vthread.x")
+        sch.bind(i2_j2, "threadIdx.x")
+        # fusion
+        sch.reverse_compute_at(c_local, i2_j2)
+        # cache read 'A'
+        a_shared = sch.cache_read(c, 1, "shared")
+        sch.compute_at(a_shared, k0)
+        _, _, _, _, a_i, a_j = sch.get_loops(a_shared)
+        a_ij = sch.fuse(a_i, a_j)
+        _, a_j = sch.split(a_ij, factors=[None, 16])  # outer: 64
+        sch.bind(a_j, "threadIdx.x")
+        # cache read 'B'
+        b_shared = sch.cache_read(c, 2, "shared")
+        sch.compute_at(b_shared, k0)
+        _, _, _, _, b_i, b_j = sch.get_loops(b_shared)
+        b_ij = sch.fuse(b_i, b_j)
+        _, b_j = sch.split(b_ij, factors=[None, 16])  # outer: 8
+        sch.bind(b_j, "threadIdx.x")
+        # auto unroll
+        sch.annotate(i0_j0, "pragma_auto_unroll_max_step", tir.IntImm("int32", 
1024))
+        sch.annotate(i0_j0, "pragma_unroll_explicit", tir.IntImm("int32", 1))
+        return sch
+
+    extractor = ms.feature_extractor.PerStoreFeature()
+    (feature,) = extractor.extract_from(
+        _make_context(tvm.target.Target("cuda")),
+        candidates=[_make_candidate(_create_schedule)],
+    )
+    feature = feature.numpy()
+    assert feature.shape == (4, N_FEATURES)
+    ### Check feature[0]: BufferStore(A_shared) <= A[...]
+    f = feature[0]
+    # Group 1.1: arith
+    assert_allclose(
+        actual=f[0:16],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            24.000000085991324,
+            24.000000085991324,
+            24.000000085991324,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.2: vectorize
+    assert_allclose(
+        actual=f[16:27],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.3: unroll
+    assert_allclose(
+        actual=f[27:38],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.4: parallel
+    assert_allclose(
+        actual=f[38:49],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.5: is_gpu, blockIdx.x/y/z, threadIdx.x/y/z, vthread
+    assert_allclose(
+        actual=f[49:57],
+        desired=[1.0, 3.169925001442312, 1.0, 1.0, 4.087462841250339, 1.0, 
1.0, 2.321928094887362],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.1: Buffer A
+    assert_allclose(
+        actual=f[57:75],
+        desired=[
+            1.0,
+            0.0,
+            0.0,
+            25.000000042995662,
+            20.000001375860553,
+            23.00000017198264,
+            14.000088052430122,
+            1.0,
+            0.0,
+            0.0,
+            18.00000550343433,
+            20.00562591970089,
+            2.321928094887362,
+            23.00000017198264,
+            18.00000550343433,
+            21.000000687930438,
+            12.0003521774803,
+            12.0003521774803,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.2: Buffer A.shared
+    assert_allclose(
+        actual=f[75:93],
+        desired=[
+            0.0,
+            1.0,
+            0.0,
+            25.000000042995662,
+            12.0003521774803,
+            23.00000017198264,
+            9.002815015607053,
+            1.0,
+            0.0,
+            0.0,
+            6.022367813028454,
+            11.98049663618346,
+            8.005624549193879,
+            17.000011006847668,
+            4.087462841250339,
+            15.000044026886828,
+            1.584962500721156,
+            4.087462841250339,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.3: Dummy padding
+    assert_allclose(
+        actual=f[93:111],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.4: Dummy padding
+    assert_allclose(
+        actual=f[111:129],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.5: Dummy padding
+    assert_allclose(
+        actual=f[129:147],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 3: Arithmetic intensity
+    assert_allclose(
+        actual=f[147:157],
+        desired=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 4 & 5
+    assert_allclose(
+        actual=f[157:164],
+        desired=[
+            12.0003521774803,
+            27.000000010748916,
+            17.000011006847668,
+            6.022367813028454,
+            23.00000017198264,
+            2.584962500721156,
+            10.001408,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    ### Check feature[1]: BufferStore(B_shared) <= B[...]
+    f = feature[1]
+    # Group 1.1: arith
+    assert_allclose(
+        actual=f[0:16],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            22.00000034396526,
+            22.00000034396526,
+            21.000000687930438,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.2: vectorize
+    assert_allclose(
+        actual=f[16:27],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.3: unroll
+    assert_allclose(
+        actual=f[27:38],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.4: parallel
+    assert_allclose(
+        actual=f[38:49],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.5: is_gpu, blockIdx.x/y/z, threadIdx.x/y/z, vthread
+    assert_allclose(
+        actual=f[49:57],
+        desired=[1.0, 3.169925001442312, 1.0, 1.0, 4.087462841250339, 1.0, 
1.0, 2.321928094887362],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.1: Buffer B
+    assert_allclose(
+        actual=f[57:75],
+        desired=[
+            1.0,
+            0.0,
+            0.0,
+            22.00000034396526,
+            20.000001375860553,
+            20.000001375860553,
+            14.000088052430122,
+            1.0,
+            0.0,
+            0.0,
+            15.000044026886828,
+            20.17555076886471,
+            2.321928094887362,
+            20.000001375860553,
+            18.00000550343433,
+            18.00000550343433,
+            12.0003521774803,
+            4.087462841250339,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.2: Buffer B.shared
+    assert_allclose(
+        actual=f[75:93],
+        desired=[
+            0.0,
+            1.0,
+            0.0,
+            22.00000034396526,
+            9.002815015607053,
+            20.000001375860553,
+            3.169925001442312,
+            1.0,
+            0.0,
+            0.0,
+            3.169925001442312,
+            10.001408194392809,
+            8.005624549193879,
+            14.000088052430122,
+            1.584962500721156,
+            12.0003521774803,
+            0.044394119358453436,
+            4.087462841250339,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.3: Dummy padding
+    assert_allclose(
+        actual=f[93:111],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.4: Dummy padding
+    assert_allclose(
+        actual=f[111:129],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.5: Dummy padding
+    assert_allclose(
+        actual=f[129:147],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 3: Arithmetic intensity
+    assert_allclose(
+        actual=f[147:157],
+        desired=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 4 & 5
+    assert_allclose(
+        actual=f[157:164],
+        desired=[
+            9.002815015607053,
+            24.000000085991324,
+            17.000011006847668,
+            3.169925001442312,
+            20.000001375860553,
+            2.584962500721156,
+            10.001408,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    ### Check feature[2]: BufferStore(C_local) <= C_local[...] + A_shared[...] 
* B_shared[...]
+    f = feature[2]
+    # Group 1.1: arith
+    assert_allclose(
+        actual=f[0:16],
+        desired=[
+            0.0,
+            27.000000010748916,
+            27.000000010748916,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            28.000000005374456,
+            28.000000005374456,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.2: vectorize
+    assert_allclose(
+        actual=f[16:27],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.3: unroll
+    assert_allclose(
+        actual=f[27:38],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.4: parallel
+    assert_allclose(
+        actual=f[38:49],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.5: is_gpu, blockIdx.x/y/z, threadIdx.x/y/z, vthread
+    assert_allclose(
+        actual=f[49:57],
+        desired=[1.0, 3.169925001442312, 1.0, 1.0, 4.087462841250339, 1.0, 
1.0, 2.321928094887362],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.1: Buffer B.shared
+    assert_allclose(
+        actual=f[57:75],
+        desired=[
+            1.0,
+            0.0,
+            0.0,
+            29.00000000268723,
+            9.002815015607053,
+            23.00000017198264,
+            3.169925001442312,
+            1.0,
+            0.0,
+            0.0,
+            5.044394119358453,
+            7.651051691178929,
+            5.044394119358453,
+            24.000000085991324,
+            4.087462841250339,
+            18.00000550343433,
+            0.32192809488736235,
+            1.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.2: Buffer C.local
+    assert_allclose(
+        actual=f[75:93],
+        desired=[
+            0.0,
+            0.0,
+            1.0,
+            29.00000000268723,
+            11.000704269011246,
+            23.00000017198264,
+            5.044394119358453,
+            1.0,
+            0.0,
+            0.0,
+            4.087462841250339,
+            7.05528243550119,
+            1.584962500721156,
+            28.000000005374456,
+            10.001408194392809,
+            22.00000034396526,
+            4.087462841250339,
+            1.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.3: Buffer A.shared
+    assert_allclose(
+        actual=f[93:111],
+        desired=[
+            1.0,
+            0.0,
+            0.0,
+            29.00000000268723,
+            12.0003521774803,
+            19.00000275171979,
+            9.002815015607053,
+            1.0,
+            0.0,
+            0.0,
+            1.0,
+            3.700439718141092,
+            4.087462841250339,
+            25.000000042995662,
+            8.005624549193879,
+            15.000044026886828,
+            5.044394119358453,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.4: Dummy padding
+    assert_allclose(
+        actual=f[111:129],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.5: Dummy padding
+    assert_allclose(
+        actual=f[129:147],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 3: Arithmetic intensity
+    assert_allclose(
+        actual=f[147:157],
+        desired=[
+            0.7097842504665767,
+            0.7548801745187567,
+            0.8775907547541741,
+            0.9957389916154509,
+            1.2446737395193135,
+            1.493608487423176,
+            1.7093103019954263,
+            1.8031580276850985,
+            1.9841832691827785,
+            2.204648076869754,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 4 & 5
+    assert_allclose(
+        actual=f[157:164],
+        desired=[
+            11.000704269011246,
+            18.00000550343433,
+            9.002815015607053,
+            18.00000550343433,
+            27.000000010748916,
+            3.0,
+            10.001408,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    ### Check feature[3]: BufferStore(C) <= C_local[...]
+    f = feature[3]
+    # Group 1.1: arith
+    assert_allclose(
+        actual=f[0:16],
+        desired=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.2: vectorize
+    assert_allclose(
+        actual=f[16:27],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.3: unroll
+    assert_allclose(
+        actual=f[27:38],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.4: parallel
+    assert_allclose(
+        actual=f[38:49],
+        desired=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 1.5: is_gpu, blockIdx.x/y/z, threadIdx.x/y/z, vthread
+    assert_allclose(
+        actual=f[49:57],
+        desired=[1.0, 3.169925001442312, 1.0, 1.0, 4.087462841250339, 1.0, 
1.0, 2.321928094887362],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.1: Buffer C
+    assert_allclose(
+        actual=f[57:75],
+        desired=[
+            0.0,
+            1.0,
+            0.0,
+            20.000001375860553,
+            20.000001375860553,
+            14.000088052430122,
+            14.000088052430122,
+            0.0,
+            0.0,
+            1.0,
+            0.0,
+            0.0,
+            0.0,
+            21.000000687930438,
+            21.000000687930438,
+            15.000044026886828,
+            15.000044026886828,
+            1.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.2: Buffer C.local
+    assert_allclose(
+        actual=f[75:93],
+        desired=[
+            1.0,
+            0.0,
+            0.0,
+            20.000001375860553,
+            11.000704269011246,
+            14.000088052430122,
+            5.044394119358453,
+            1.0,
+            0.0,
+            0.0,
+            9.002815015607053,
+            12.0003521774803,
+            4.087462841250339,
+            16.00002201361136,
+            7.011227255423254,
+            10.001408194392809,
+            1.584962500721156,
+            1.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.3: Dummy padding
+    assert_allclose(
+        actual=f[93:111],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.4: Dummy padding
+    assert_allclose(
+        actual=f[111:129],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 2.5: Dummy padding
+    assert_allclose(
+        actual=f[129:147],
+        desired=[
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+            0.0,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 3: Arithmetic intensity
+    assert_allclose(
+        actual=f[147:157],
+        desired=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    # Group 4 & 5
+    assert_allclose(
+        actual=f[157:164],
+        desired=[
+            20.000001375860553,
+            18.00000550343433,
+            1.0,
+            18.00000550343433,
+            18.00000550343433,
+            2.584962500721156,
+            10.001408,
+        ],
+        rtol=1e-5,
+        atol=1e-5,
+    )
+
+
+if __name__ == "__main__":
+    test_cpu_matmul()
+    test_cpu_fusion()
+    test_gpu()

Review comment:
       ```suggestion
       pytest.main([__file__])
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to