trevor-m opened a new issue #6691:
URL: https://github.com/apache/incubator-tvm/issues/6691
I've started noticing a large performance regression affecting Keras
MobileNetV2 caused by `INDEX_DEFAULT_I64=ON` (PR #6143). This is on an AWS
m5.12xlarge instance.
INDEX_DEFAULT_I64 | Frames per second
------------ | -------------
ON | 66.56
OFF | 435.49
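I profiled the ops and found the slowdown comes from the
`fused_nn_contrib_conv2d_NCHWc_add_add_1` and
`fused_nn_contrib_conv2d_NCHWc_add_add_2` ops: each of these nodes takes
roughly 23-30 us with `INDEX_DEFAULT_I64=OFF` but roughly 2190-3106 us with
`INDEX_DEFAULT_I64=ON`, while all other ops are essentially unchanged.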
## Profile with `INDEX_DEFAULT_I64=OFF` (fast)
```
Node Name Ops
Time(us) Time(%) Shape Inputs
Outputs
--------- ---
-------- ------- ----- ------
-------
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 64.704 3.571 (1, 9,
56, 56, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 53.362 2.945 (1, 2,
112, 112, 16) 3 1
fused_nn_pad_3 fused_nn_pad_3
50.582 2.791 (1, 6, 113, 113, 16) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 47.874 2.642 (1, 6,
56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_6
fused_nn_contrib_conv2d_NCHWc_add_clip_6 46.828 2.584 (1, 6,
112, 112, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 42.364 2.338 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_91
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 39.554 2.183 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_81
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 39.418 2.175 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_4
fused_nn_contrib_conv2d_NCHWc_add_add_4 38.871 2.145 (1, 2,
56, 56, 12) 4 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 37.926 2.093 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_5
fused_nn_contrib_conv2d_NCHWc_add_clip_5 37.407 2.064 (1, 9,
56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_51
fused_nn_contrib_conv2d_NCHWc_add_clip_5 35.349 1.951 (1, 9,
56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip
fused_nn_contrib_conv2d_NCHWc_add_clip 34.692 1.915 (1, 80,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_6
fused_nn_contrib_conv2d_NCHWc_add_6 34.052 1.879 (1, 1,
112, 112, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add
fused_nn_contrib_conv2d_NCHWc_add 33.58 1.853 (1, 20,
7, 7, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_21
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.298 1.838 (1, 24,
14, 14, 16) 3 1
fused_nn_pad_2 fused_nn_pad_2
33.201 1.832 (1, 9, 57, 57, 16) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_22
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.057 1.824 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 33.027 1.823 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_23
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 32.787 1.809 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_5
fused_nn_contrib_conv2d_NCHWc_add_5 32.332 1.784 (1, 2,
56, 56, 12) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 32.156 1.775 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 31.68 1.748 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip2
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.832 1.701 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_7
fused_nn_contrib_conv2d_NCHWc_add_clip_7 30.521 1.684 (1, 2,
112, 112, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_11
fused_nn_contrib_conv2d_NCHWc_add_add_1 30.012 1.656 (1, 6,
14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_1
fused_nn_contrib_conv2d_NCHWc_add_add_1 29.914 1.651 (1, 6,
14, 14, 16) 4 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 28.642 1.581 (1, 9,
28, 28, 16) 3 1
fused_nn_global_avg_pool2d
fused_nn_global_avg_pool2d 28.552 1.576 (1, 80,
1, 1, 16) 1 1
fused_layout_transform_40
fused_layout_transform_40 26.741 1.476 (1, 8,
56, 56, 12) 1 1
fused_layout_transform_41
fused_layout_transform_41 25.793 1.423 (1, 12,
56, 56, 12) 1 1
fused_nn_contrib_conv2d_NCHWc_add_add1
fused_nn_contrib_conv2d_NCHWc_add_add 25.759 1.422 (1, 10,
7, 7, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_2
fused_nn_contrib_conv2d_NCHWc_add_add_2 25.566 1.411 (1, 4,
14, 14, 16) 4 1
fused_nn_dense_add fused_nn_dense_add
25.52 1.408 (1, 1000) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add
fused_nn_contrib_conv2d_NCHWc_add_add 25.391 1.401 (1, 10,
7, 7, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_21
fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.345 1.399 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_2
fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.262 1.394 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_22
fused_nn_contrib_conv2d_NCHWc_add_clip_2 24.895 1.374 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_3
fused_nn_contrib_conv2d_NCHWc_add_add_3 24.679 1.362 (1, 2,
28, 28, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_31
fused_nn_contrib_conv2d_NCHWc_add_add_3 24.553 1.355 (1, 2,
28, 28, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_2
fused_nn_contrib_conv2d_NCHWc_add_2 23.364 1.289 (1, 6,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_21
fused_nn_contrib_conv2d_NCHWc_add_add_2 23.264 1.284 (1, 4,
14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_22
fused_nn_contrib_conv2d_NCHWc_add_add_2 23.006 1.27 (1, 4,
14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_11
fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.724 1.254 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_32
fused_nn_contrib_conv2d_NCHWc_add_clip_3 22.722 1.254 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_41
fused_nn_contrib_conv2d_NCHWc_add_clip_4 22.522 1.243 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_1
fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.247 1.228 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_33
fused_nn_contrib_conv2d_NCHWc_add_clip_3 21.648 1.195 (1, 24,
14, 14, 16) 3 1
fused_nn_pad fused_nn_pad
21.439 1.183 (1, 36, 15, 15, 16) 1 1
fused_nn_contrib_conv2d_NCHWc_add_clip_12
fused_nn_contrib_conv2d_NCHWc_add_clip_1 21.437 1.183 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_4
fused_nn_contrib_conv2d_NCHWc_add_4 21.426 1.182 (1, 2,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_1
fused_nn_contrib_conv2d_NCHWc_add_1 21.227 1.171 (1, 10,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_31
fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.739 1.145 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_3
fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.719 1.143 (1, 24,
14, 14, 16) 3 1
fused_nn_softmax fused_nn_softmax
19.798 1.093 (1, 1000) 1 1
fused_nn_contrib_conv2d_NCHWc_add_clip_42
fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.751 1.09 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_4
fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.679 1.086 (1, 12,
28, 28, 16) 3 1
fused_nn_pad_1 fused_nn_pad_1
18.729 1.034 (1, 12, 29, 29, 16) 1 1
fused_nn_contrib_conv2d_NCHWc_add_3
fused_nn_contrib_conv2d_NCHWc_add_3 18.411 1.016 (1, 4,
14, 14, 16) 3 1
fused_nn_pad_layout_transform
fused_nn_pad_layout_transform 18.159 1.002 (1, 1,
225, 225, 3) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 15.938 0.88 (1, 12,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 15.438 0.852 (1, 36,
7, 7, 16) 3 1
fused_layout_transform_transpose_nn_batch_flatten
fused_layout_transform_transpose_nn_batch_flatten 1.563 0.086 (1,
1280) 1 1
Total_time -
1812.033 - - - -
```
## Profile with `INDEX_DEFAULT_I64=ON` (slow)
```
Node Name Ops
Time(us) Time(%) Shape Inputs
Outputs
--------- ---
-------- ------- ----- ------
-------
fused_nn_contrib_conv2d_NCHWc_add_add_1
fused_nn_contrib_conv2d_NCHWc_add_add_1 3105.8 21.391 (1, 6,
14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_11
fused_nn_contrib_conv2d_NCHWc_add_add_1 3104.62 21.382 (1, 6,
14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_2
fused_nn_contrib_conv2d_NCHWc_add_add_2 2200.03 15.152 (1, 4,
14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_21
fused_nn_contrib_conv2d_NCHWc_add_add_2 2189.84 15.082 (1, 4,
14, 14, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_add_22
fused_nn_contrib_conv2d_NCHWc_add_add_2 2185.71 15.054 (1, 4,
14, 14, 16) 4 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_7 60.094 0.414 (1, 9,
56, 56, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_91
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 52.82 0.364 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_6 51.393 0.354 (1, 2,
112, 112, 16) 3 1
fused_nn_pad_3 fused_nn_pad_3
51.19 0.353 (1, 6, 113, 113, 16) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_5 49.058 0.338 (1, 6,
56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_6
fused_nn_contrib_conv2d_NCHWc_add_clip_6 46.637 0.321 (1, 6,
112, 112, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 43.381 0.299 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 40.165 0.277 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_23
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 39.355 0.271 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_22
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 39.205 0.27 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_4
fused_nn_contrib_conv2d_NCHWc_add_add_4 38.595 0.266 (1, 2,
56, 56, 12) 4 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_9 38.019 0.262 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_81
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_8 37.559 0.259 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_5
fused_nn_contrib_conv2d_NCHWc_add_clip_5 36.159 0.249 (1, 9,
56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_51
fused_nn_contrib_conv2d_NCHWc_add_clip_5 35.269 0.243 (1, 9,
56, 56, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip
fused_nn_contrib_conv2d_NCHWc_add_clip 34.755 0.239 (1, 80,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_2
fused_nn_contrib_conv2d_NCHWc_add_2 34.248 0.236 (1, 6,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_6
fused_nn_contrib_conv2d_NCHWc_add_6 33.65 0.232 (1, 1,
112, 112, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_7
fused_nn_contrib_conv2d_NCHWc_add_clip_7 33.163 0.228 (1, 2,
112, 112, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_21
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_2 32.593 0.224 (1, 24,
14, 14, 16) 3 1
fused_nn_pad_2 fused_nn_pad_2
32.542 0.224 (1, 9, 57, 57, 16) 1 1
fused_nn_contrib_conv2d_NCHWc_add
fused_nn_contrib_conv2d_NCHWc_add 32.471 0.224 (1, 20,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_5
fused_nn_contrib_conv2d_NCHWc_add_5 31.587 0.218 (1, 2,
56, 56, 12) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.659 0.211 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 30.109 0.207 (1, 60,
7, 7, 16) 3 1
fused_nn_pad fused_nn_pad
29.258 0.202 (1, 36, 15, 15, 16) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_4 29.083 0.2 (1, 9,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_2
fused_nn_contrib_conv2d_NCHWc_add_clip_2 28.273 0.195 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip2
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip 28.052 0.193 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_22
fused_nn_contrib_conv2d_NCHWc_add_clip_2 27.855 0.192 (1, 36,
14, 14, 16) 3 1
fused_layout_transform_40
fused_layout_transform_40 27.811 0.192 (1, 8,
56, 56, 12) 1 1
fused_nn_global_avg_pool2d
fused_nn_global_avg_pool2d 27.724 0.191 (1, 80,
1, 1, 16) 1 1
fused_layout_transform_41
fused_layout_transform_41 27.308 0.188 (1, 12,
56, 56, 12) 1 1
fused_nn_dense_add fused_nn_dense_add
26.655 0.184 (1, 1000) 3 1
fused_nn_contrib_conv2d_NCHWc_add_1
fused_nn_contrib_conv2d_NCHWc_add_1 26.406 0.182 (1, 10,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add
fused_nn_contrib_conv2d_NCHWc_add_add 25.447 0.175 (1, 10,
7, 7, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_21
fused_nn_contrib_conv2d_NCHWc_add_clip_2 25.433 0.175 (1, 36,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add1
fused_nn_contrib_conv2d_NCHWc_add_add 25.276 0.174 (1, 10,
7, 7, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_11
fused_nn_contrib_conv2d_NCHWc_add_clip_1 24.78 0.171 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_31
fused_nn_contrib_conv2d_NCHWc_add_add_3 24.132 0.166 (1, 2,
28, 28, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_12
fused_nn_contrib_conv2d_NCHWc_add_clip_1 23.359 0.161 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_add_3
fused_nn_contrib_conv2d_NCHWc_add_add_3 23.226 0.16 (1, 2,
28, 28, 16) 4 1
fused_nn_contrib_conv2d_NCHWc_add_clip_31
fused_nn_contrib_conv2d_NCHWc_add_clip_3 22.999 0.158 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_1
fused_nn_contrib_conv2d_NCHWc_add_clip_1 22.372 0.154 (1, 60,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_41
fused_nn_contrib_conv2d_NCHWc_add_clip_4 21.948 0.151 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_4
fused_nn_contrib_conv2d_NCHWc_add_4 21.359 0.147 (1, 2,
28, 28, 16) 3 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_1 21.269 0.146 (1, 36,
7, 7, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_33
fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.916 0.144 (1, 24,
14, 14, 16) 3 1
fused_nn_softmax fused_nn_softmax
20.415 0.141 (1, 1000) 1 1
fused_nn_contrib_conv2d_NCHWc_add_clip_3
fused_nn_contrib_conv2d_NCHWc_add_clip_3 20.37 0.14 (1, 24,
14, 14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_4
fused_nn_contrib_conv2d_NCHWc_add_clip_4 19.395 0.134 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_clip_32
fused_nn_contrib_conv2d_NCHWc_add_clip_3 19.306 0.133 (1, 24,
14, 14, 16) 3 1
fused_nn_pad_1 fused_nn_pad_1
19.284 0.133 (1, 12, 29, 29, 16) 1 1
fused_nn_contrib_conv2d_NCHWc_add_clip_42
fused_nn_contrib_conv2d_NCHWc_add_clip_4 18.807 0.13 (1, 12,
28, 28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_3
fused_nn_contrib_conv2d_NCHWc_add_3 17.728 0.122 (1, 4,
14, 14, 16) 3 1
fused_nn_pad_layout_transform
fused_nn_pad_layout_transform 15.683 0.108 (1, 1,
225, 225, 3) 1 1
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3
fused_nn_contrib_depthwise_conv2d_NCHWc_add_clip_3 15.236 0.105 (1, 12,
14, 14, 16) 3 1
fused_layout_transform_transpose_nn_batch_flatten
fused_layout_transform_transpose_nn_batch_flatten 1.607 0.011 (1,
1280) 1 1
Total_time -
14519.449 - - - -
```
Here is a script to reproduce:
```
import time
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_runtime
import tensorflow as tf

# Import Keras MobileNetV2 into Relay with a fixed input shape.
input_shape = (1, 3, 224, 224)
model = tf.keras.applications.MobileNetV2()
mod, params = relay.frontend.from_keras(model, shape={'input_1':
                                                      input_shape})
dtype = 'float32'

# Compile for the llvm target with -mcpu=skylake-avx512 at opt_level=3.
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(mod, "llvm -mcpu=skylake-avx512",
                                     params=params)

i_data = np.random.uniform(0, 1, input_shape).astype(dtype)
mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0))
mod.set_input(**params)

# Time
times = []
for i in range(100):
    start_time = time.time()
    mod.run(input_1=i_data)
    res = mod.get_output(0)
    times.append(time.time() - start_time)

# The first 10 iterations are excluded from the averages below.
print('Mean latency:', 1000.0 * np.mean(times[10:]))
print('Mean FPS:', 1.0 / np.mean(times[10:]))
```
Thanks!
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]