Lunderberg commented on issue #17248:
URL: https://github.com/apache/tvm/issues/17248#issuecomment-2272162235
Yup, confirmed that it's an effect of the data transfer and the small array
sizes.
<details>
<summary>Click to expand benchmark script</summary>
(Requires `pytest-benchmark` to be installed.)
```python
#!/usr/bin/env python3
import pytest
import tvm
from tvm import relax
import numpy as np
from tvm.script import ir as I
from tvm.script import tir as T
from tvm.script import relax as R
@pytest.mark.parametrize("target", ["llvm", "cuda"])
@pytest.mark.parametrize(
    "benchmark_step",
    ["data_to_device", "compute_on_device", "data_from_device", "end_to_end"],
)
@pytest.mark.parametrize(
    "array_sizes",
    [(1, 42), (64 * 1024 * 1024, 64 * 1024 * 1024)],
    ids=["small", "large"],
)
def test_func(target, array_sizes, benchmark_step, benchmark):
    """Benchmark each stage of a Relax end-to-end run separately.

    Stages measured (selected by ``benchmark_step``):
      - ``data_to_device``: host numpy arrays -> device ``tvm.nd.array``
      - ``compute_on_device``: VM call on device-resident inputs
      - ``data_from_device``: device outputs -> host numpy arrays
      - ``end_to_end``: all three combined, no explicit syncs

    Parameters come from the ``parametrize`` decorators; ``benchmark`` is the
    fixture provided by the ``pytest-benchmark`` plugin.
    """
    size_0, size_1 = array_sizes

    @I.ir_module
    class Module:
        @T.prim_func(private=True)
        def cast(
            v0_0: T.Buffer(T.int64(size_0), "int32"),
            compute: T.Buffer(T.int64(size_0), "float64"),
        ):
            T.func_attr({"tir.noalias": T.bool(True)})
            for i0 in range(T.int64(size_0)):
                with T.block("compute"):
                    v_i0 = T.axis.spatial(T.int64(size_0), i0)
                    compute[v_i0] = T.Cast("float64", v0_0[v_i0])

        @T.prim_func(private=True)
        def less(
            v1_0: T.Buffer(T.int64(size_1), "int32"),
            v1_0_1: T.Buffer(T.int64(size_1), "int32"),
            T_less: T.Buffer(T.int64(size_1), "bool"),
        ):
            T.func_attr({"tir.noalias": T.bool(True)})
            for ax0 in range(T.int64(size_1)):
                with T.block("T_less"):
                    v_ax0 = T.axis.spatial(T.int64(size_1), ax0)
                    T_less[v_ax0] = v1_0[v_ax0] < v1_0_1[v_ax0]

        @T.prim_func(private=True)
        def multiply(
            v1_0: T.Buffer(T.int64(size_1), "int32"),
            lv: T.Buffer(T.int64(size_0), "int32"),
            T_multiply: T.Buffer(T.int64(size_1), "int32"),
        ):
            T.func_attr({"tir.noalias": T.bool(True)})
            for ax0 in range(T.int64(size_1)):
                with T.block("T_multiply"):
                    v_ax0 = T.axis.spatial(T.int64(size_1), ax0)
                    T_multiply[v_ax0] = v1_0[v_ax0] * lv[T.int64(0)]

        @T.prim_func(private=True)
        def tir_negative(
            v0_0: T.Buffer(T.int64(size_0), "int32"),
            compute: T.Buffer(T.int64(size_0), "int32"),
        ):
            T.func_attr({"tir.noalias": T.bool(True)})
            for i0 in range(T.int64(size_0)):
                with T.block("compute"):
                    # BUGFIX: the block-iterator extent must match the loop
                    # extent.  It was hard-coded to T.int64(1), which is only
                    # correct for the "small" case where size_0 == 1; every
                    # sibling prim_func above uses the size variable.
                    v_i0 = T.axis.spatial(T.int64(size_0), i0)
                    compute[v_i0] = v0_0[v_i0] * -1

        @R.function
        def main(
            v0_0: R.Tensor([T.int64(size_0)], dtype="int32"),
            v1_0: R.Tensor([T.int64(size_1)], dtype="int32"),
        ) -> R.Tuple(
            R.Tensor([T.int64(size_0)], dtype="float64"),
            R.Tensor([T.int64(size_1)], dtype="bool"),
            R.Tensor([T.int64(size_1)], dtype="int32"),
        ):
            R.func_attr({"num_input": 2})
            cls = Module
            with R.dataflow():
                lv = R.call_tir(
                    cls.tir_negative,
                    (v0_0,),
                    out_sinfo=R.Tensor([T.int64(size_0)], dtype="int32"),
                )
                lv1 = R.call_tir(
                    cls.cast,
                    (v0_0,),
                    out_sinfo=R.Tensor([T.int64(size_0)], dtype="float64"),
                )
                lv2 = R.call_tir(
                    cls.less,
                    (v1_0, v1_0),
                    out_sinfo=R.Tensor([T.int64(size_1)], dtype="bool"),
                )
                lv3 = R.call_tir(
                    cls.multiply,
                    (v1_0, lv),
                    out_sinfo=R.Tensor([T.int64(size_1)], dtype="int32"),
                )
                gv = (lv1, lv2, lv3)
                R.output(gv)
            return gv

    mod = Module
    # GPU targets need an explicit thread binding before codegen.
    if "gpu" in tvm.target.Target(target).keys:
        with tvm.target.Target(target):
            mod = tvm.tir.transform.DefaultGPUSchedule()(mod)

    ex = relax.build(mod, target=target)
    dev = tvm.device(target)
    func = relax.VirtualMachine(ex, dev)["main"]

    np_inputs = [
        np.random.randint(10, size=[size_0]).astype("int32"),
        np.random.randint(10, size=[size_1]).astype("int32"),
    ]

    def data_to_device(np_input_0, np_input_1):
        tvm_input_0 = tvm.nd.array(np_input_0, dev)
        tvm_input_1 = tvm.nd.array(np_input_1, dev)
        # Sync so the benchmark measures the copy itself, not just the enqueue.
        dev.sync()
        return tvm_input_0, tvm_input_1

    def compute_on_device(tvm_input_0, tvm_input_1):
        tvm_outputs = func(tvm_input_0, tvm_input_1)
        dev.sync()
        return tvm_outputs

    def data_from_device(*tvm_outputs):
        # .numpy() blocks until the device-to-host copy completes.
        return [arr.numpy() for arr in tvm_outputs]

    def end_to_end(np_input_0, np_input_1):
        tvm_input_0 = tvm.nd.array(np_input_0, dev)
        tvm_input_1 = tvm.nd.array(np_input_1, dev)
        tvm_outputs = func(tvm_input_0, tvm_input_1)
        return [arr.numpy() for arr in tvm_outputs]

    if benchmark_step == "end_to_end":
        benchmark(end_to_end, *np_inputs)
        return

    # Run each earlier stage exactly once when it is not the one being timed,
    # so the timed stage receives realistic inputs.
    if benchmark_step == "data_to_device":
        tvm_inputs = benchmark(data_to_device, *np_inputs)
    else:
        tvm_inputs = data_to_device(*np_inputs)

    if benchmark_step == "compute_on_device":
        tvm_outputs = benchmark(compute_on_device, *tvm_inputs)
    else:
        tvm_outputs = compute_on_device(*tvm_inputs)

    if benchmark_step == "data_from_device":
        benchmark(data_from_device, *tvm_outputs)
    else:
        data_from_device(*tvm_outputs)
```
</details>
Benchmark results below. For small arrays (the same size as in your initial test),
LLVM beats CUDA by about 2x. For large arrays (256 MB for each input), the
CUDA result is about 4x faster (the `large-end_to_end-*` results). Excluding
the time required to transfer inputs to the GPU and retrieve results from the
GPU, CUDA is about 20x faster (the `large-compute_on_device-*` results).
```
------------------------------------------------------------------------------------------------------------
benchmark: 16 tests
------------------------------------------------------------------------------------------------------------
Name (time in us) Min
Max Mean StdDev Median
IQR Outliers OPS Rounds Iterations
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_func[small-compute_on_device-llvm] 9.0499 (1.0)
25.3399 (1.0) 9.4720 (1.0) 1.0820 (1.0) 9.3100
(1.0) 0.1104 (1.0) 962;1960 105,574.8046 (1.0) 36336
1
test_func[small-data_from_device-llvm] 17.0399 (1.88)
105.1610 (4.15) 18.3110 (1.93) 4.6396 (4.29) 17.6511
(1.90) 0.3211 (2.91) 539;2075 54,611.8976 (0.52) 24820
1
test_func[small-compute_on_device-cuda] 19.1112 (2.11)
378.0441 (14.92) 22.6536 (2.39) 21.9386 (20.28) 19.9701
(2.15) 0.5902 (5.35) 87;870 44,143.0052 (0.42) 7777
1
test_func[small-data_from_device-cuda] 37.2110 (4.11)
157.5621 (6.22) 44.3340 (4.68) 7.5103 (6.94) 42.9400
(4.61) 2.3299 (21.11) 666;1354 22,556.0667 (0.21) 13240
1
test_func[small-data_to_device-llvm] 46.5401 (5.14)
600.1082 (23.68) 50.1849 (5.30) 10.2509 (9.47) 49.0400
(5.27) 1.2708 (11.51) 92;340 19,926.3044 (0.19) 4510
1
test_func[small-end_to_end-llvm] 115.9220 (12.81)
225.7330 (8.91) 125.9927 (13.30) 11.9701 (11.06) 121.6519
(13.07) 10.1275 (91.77) 278;178 7,936.9686 (0.08) 4267
1
test_func[small-end_to_end-cuda] 236.7632 (26.16)
917.2419 (36.20) 263.0915 (27.78) 62.8217 (58.06) 244.7229
(26.29) 9.3799 (84.99) 109;288 3,800.9594 (0.04) 1715
1
test_func[small-data_to_device-cuda] 241.4340 (26.68)
260.8141 (10.29) 247.2537 (26.10) 5.9761 (5.52) 244.9939
(26.32) 9.4234 (85.39) 5;0 4,044.4295 (0.04) 25
1
test_func[large-compute_on_device-cuda] 5,481.5309 (605.70)
7,146.6921 (282.03) 5,645.7970 (596.05) 266.9570 (246.72)
5,544.0720 (595.50) 148.2225 (>1000.0) 13;16 177.1229 (0.00)
173 1
test_func[large-data_to_device-cuda] 47,258.1829 (>1000.0)
49,895.8279 (>1000.0) 48,068.3519 (>1000.0) 639.3231 (590.85)
48,020.0830 (>1000.0) 539.2472 (>1000.0) 5;2 20.8037 (0.00)
25 1
test_func[large-compute_on_device-llvm] 95,119.3729 (>1000.0)
338,263.1938 (>1000.0) 119,093.0710 (>1000.0) 72,722.1956 (>1000.0)
96,317.1280 (>1000.0) 4,397.9906 (>1000.0) 1;1 8.3968 (0.00)
11 1
test_func[large-data_from_device-llvm] 103,340.1210 (>1000.0)
110,499.1839 (>1000.0) 105,368.4036 (>1000.0) 2,165.9420 (>1000.0)
104,764.2034 (>1000.0) 2,236.1381 (>1000.0) 1;1 9.4905 (0.00)
10 1
test_func[large-data_from_device-cuda] 115,621.2708 (>1000.0)
118,050.2211 (>1000.0) 116,433.3510 (>1000.0) 765.6208 (707.57)
116,457.8821 (>1000.0) 1,002.1689 (>1000.0) 3;0 8.5886 (0.00)
9 1
test_func[large-end_to_end-cuda] 168,903.6440 (>1000.0)
173,889.3481 (>1000.0) 170,860.7842 (>1000.0) 1,647.2517 (>1000.0)
170,547.2330 (>1000.0) 1,900.8236 (>1000.0) 2;0 5.8527 (0.00)
7 1
test_func[large-data_to_device-llvm] 424,237.4769 (>1000.0)
427,762.2711 (>1000.0) 426,219.4017 (>1000.0) 1,536.2265 (>1000.0)
426,894.0398 (>1000.0) 2,607.3752 (>1000.0) 2;0 2.3462 (0.00)
5 1
test_func[large-end_to_end-llvm] 602,830.7430 (>1000.0)
611,205.6130 (>1000.0) 605,958.4046 (>1000.0) 3,588.9265 (>1000.0)
603,839.9071 (>1000.0) 5,400.2019 (>1000.0) 1;0 1.6503 (0.00)
5 1
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]