Re: [I] [Bug] [Relax] Hug inference performance difference on cuda and cpu [tvm]

via GitHub Tue, 06 Aug 2024 14:13:13 -0700


Lunderberg commented on issue #17248:
URL: https://github.com/apache/tvm/issues/17248#issuecomment-2272162235


   Yup, confirmed that it's an effect of the data transfer and the small array 
sizes.
   
   <details>
   <summary>Click to expand benchmark script</summary>
   
   (Requires `pytest-benchmark` to be installed.)
   
   ```python
   #!/usr/bin/env python3
   
   import pytest
   
   import tvm
   from tvm import relax
   import numpy as np
   from tvm.script import ir as I
   from tvm.script import tir as T
   from tvm.script import relax as R
   
   
   @pytest.mark.parametrize("target", ["llvm", "cuda"])
   @pytest.mark.parametrize(
       "benchmark_step", ["data_to_device", "compute_on_device", 
"data_from_device", "end_to_end"]
   )
   @pytest.mark.parametrize(
       "array_sizes", [(1, 42), (64 * 1024 * 1024, 64 * 1024 * 1024)], 
ids=["small", "large"]
   )
   def test_func(target, array_sizes, benchmark_step, benchmark):
       size_0, size_1 = array_sizes
   
       @I.ir_module
       class Module:
           @T.prim_func(private=True)
           def cast(
               v0_0: T.Buffer(T.int64(size_0), "int32"), compute: 
T.Buffer(T.int64(size_0), "float64")
           ):
               T.func_attr({"tir.noalias": T.bool(True)})
               for i0 in range(T.int64(size_0)):
                   with T.block("compute"):
                       v_i0 = T.axis.spatial(T.int64(size_0), i0)
                       compute[v_i0] = T.Cast("float64", v0_0[v_i0])
   
           @T.prim_func(private=True)
           def less(
               v1_0: T.Buffer(T.int64(size_1), "int32"),
               v1_0_1: T.Buffer(T.int64(size_1), "int32"),
               T_less: T.Buffer(T.int64(size_1), "bool"),
           ):
               T.func_attr({"tir.noalias": T.bool(True)})
               for ax0 in range(T.int64(size_1)):
                   with T.block("T_less"):
                       v_ax0 = T.axis.spatial(T.int64(size_1), ax0)
                       T_less[v_ax0] = v1_0[v_ax0] < v1_0_1[v_ax0]
   
           @T.prim_func(private=True)
           def multiply(
               v1_0: T.Buffer(T.int64(size_1), "int32"),
               lv: T.Buffer(T.int64(size_0), "int32"),
               T_multiply: T.Buffer(T.int64(size_1), "int32"),
           ):
               T.func_attr({"tir.noalias": T.bool(True)})
               for ax0 in range(T.int64(size_1)):
                   with T.block("T_multiply"):
                       v_ax0 = T.axis.spatial(T.int64(size_1), ax0)
                       T_multiply[v_ax0] = v1_0[v_ax0] * lv[T.int64(0)]
   
           @T.prim_func(private=True)
           def tir_negative(
               v0_0: T.Buffer(T.int64(size_0), "int32"), compute: 
T.Buffer(T.int64(size_0), "int32")
           ):
               T.func_attr({"tir.noalias": T.bool(True)})
               for i0 in range(T.int64(size_0)):
                   with T.block("compute"):
                       v_i0 = T.axis.spatial(T.int64(1), i0)
                       compute[v_i0] = v0_0[v_i0] * -1
   
           @R.function
           def main(
               v0_0: R.Tensor([T.int64(size_0)], dtype="int32"),
               v1_0: R.Tensor([T.int64(size_1)], dtype="int32"),
           ) -> R.Tuple(
               R.Tensor([T.int64(size_0)], dtype="float64"),
               R.Tensor([T.int64(size_1)], dtype="bool"),
               R.Tensor([T.int64(size_1)], dtype="int32"),
           ):
               R.func_attr({"num_input": 2})
               cls = Module
               with R.dataflow():
                   lv = R.call_tir(
                       cls.tir_negative, (v0_0,), 
out_sinfo=R.Tensor([T.int64(size_0)], dtype="int32")
                   )
                   lv1 = R.call_tir(
                       cls.cast, (v0_0,), out_sinfo=R.Tensor([T.int64(size_0)], 
dtype="float64")
                   )
                   lv2 = R.call_tir(
                       cls.less, (v1_0, v1_0), 
out_sinfo=R.Tensor([T.int64(size_1)], dtype="bool")
                   )
                   lv3 = R.call_tir(
                       cls.multiply, (v1_0, lv), 
out_sinfo=R.Tensor([T.int64(size_1)], dtype="int32")
                   )
                   gv = (lv1, lv2, lv3)
                   R.output(gv)
               return gv
   
       mod = Module
   
       if "gpu" in tvm.target.Target(target).keys:
           with tvm.target.Target(target):
               mod = tvm.tir.transform.DefaultGPUSchedule()(mod)
       ex = relax.build(mod, target=target)
   
       dev = tvm.device(target)
   
       func = relax.VirtualMachine(ex, dev)["main"]
   
       np_inputs = [
           np.random.randint(10, size=[size_0]).astype("int32"),
           np.random.randint(10, size=[size_1]).astype("int32"),
       ]
   
       def data_to_device(np_input_0, np_input_1):
           tvm_input_0 = tvm.nd.array(np_input_0, dev)
           tvm_input_1 = tvm.nd.array(np_input_1, dev)
           dev.sync()
           return tvm_input_0, tvm_input_1
   
       def compute_on_device(tvm_input_0, tvm_input_1):
           tvm_outputs = func(tvm_input_0, tvm_input_1)
           dev.sync()
           return tvm_outputs
   
       def data_from_device(*tvm_outputs):
           return [arr.numpy() for arr in tvm_outputs]
   
       def end_to_end(np_input_0, np_input_1):
           tvm_input_0 = tvm.nd.array(np_input_0, dev)
           tvm_input_1 = tvm.nd.array(np_input_1, dev)
           tvm_outputs = func(tvm_input_0, tvm_input_1)
           return [arr.numpy() for arr in tvm_outputs]
   
       if benchmark_step == "end_to_end":
           benchmark(end_to_end, *np_inputs)
           return
   
       if benchmark_step == "data_to_device":
           tvm_inputs = benchmark(data_to_device, *np_inputs)
       else:
           tvm_inputs = data_to_device(*np_inputs)
   
       if benchmark_step == "compute_on_device":
           tvm_outputs = benchmark(compute_on_device, *tvm_inputs)
       else:
           tvm_outputs = compute_on_device(*tvm_inputs)
   
       if benchmark_step == "data_from_device":
           np_outputs = benchmark(data_from_device, *tvm_outputs)
       else:
           np_outputs = data_from_device(*tvm_outputs)
   
   ```
   
   </details>
   
   Benchmark results below.  For small arrays (same size as your initial test) 
LLVM beats cuda by about 2x.  For large arrays (256 MB for each input), the 
CUDA result is about 4x faster.  (The `large-end_to_end-*` results.)  Excluding 
the time required to transfer inputs to the GPU and retrieve results from the 
GPU, CUDA is about 20x faster.  (The `large-compute_on_device-*` results.)
   
   ```
   
------------------------------------------------------------------------------------------------------------
 benchmark: 16 tests 
------------------------------------------------------------------------------------------------------------
   Name (time in us)                                    Min                     
Max                    Mean                 StdDev                  Median      
             IQR            Outliers           OPS            Rounds  Iterations
   
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   test_func[small-compute_on_device-llvm]           9.0499 (1.0)           
25.3399 (1.0)            9.4720 (1.0)           1.0820 (1.0)            9.3100 
(1.0)          0.1104 (1.0)      962;1960  105,574.8046 (1.0)       36336       
    1
   test_func[small-data_from_device-llvm]           17.0399 (1.88)         
105.1610 (4.15)          18.3110 (1.93)          4.6396 (4.29)          17.6511 
(1.90)         0.3211 (2.91)     539;2075   54,611.8976 (0.52)      24820       
    1
   test_func[small-compute_on_device-cuda]          19.1112 (2.11)         
378.0441 (14.92)         22.6536 (2.39)         21.9386 (20.28)         19.9701 
(2.15)         0.5902 (5.35)       87;870   44,143.0052 (0.42)       7777       
    1
   test_func[small-data_from_device-cuda]           37.2110 (4.11)         
157.5621 (6.22)          44.3340 (4.68)          7.5103 (6.94)          42.9400 
(4.61)         2.3299 (21.11)    666;1354   22,556.0667 (0.21)      13240       
    1
   test_func[small-data_to_device-llvm]             46.5401 (5.14)         
600.1082 (23.68)         50.1849 (5.30)         10.2509 (9.47)          49.0400 
(5.27)         1.2708 (11.51)      92;340   19,926.3044 (0.19)       4510       
    1
   test_func[small-end_to_end-llvm]                115.9220 (12.81)        
225.7330 (8.91)         125.9927 (13.30)        11.9701 (11.06)        121.6519 
(13.07)       10.1275 (91.77)     278;178    7,936.9686 (0.08)       4267       
    1
   test_func[small-end_to_end-cuda]                236.7632 (26.16)        
917.2419 (36.20)        263.0915 (27.78)        62.8217 (58.06)        244.7229 
(26.29)        9.3799 (84.99)     109;288    3,800.9594 (0.04)       1715       
    1
   test_func[small-data_to_device-cuda]            241.4340 (26.68)        
260.8141 (10.29)        247.2537 (26.10)         5.9761 (5.52)         244.9939 
(26.32)        9.4234 (85.39)         5;0    4,044.4295 (0.04)         25       
    1
   test_func[large-compute_on_device-cuda]       5,481.5309 (605.70)     
7,146.6921 (282.03)     5,645.7970 (596.05)      266.9570 (246.72)     
5,544.0720 (595.50)     148.2225 (>1000.0)     13;16      177.1229 (0.00)       
 173           1
   test_func[large-data_to_device-cuda]         47,258.1829 (>1000.0)   
49,895.8279 (>1000.0)   48,068.3519 (>1000.0)     639.3231 (590.85)    
48,020.0830 (>1000.0)    539.2472 (>1000.0)       5;2       20.8037 (0.00)      
   25           1
   test_func[large-compute_on_device-llvm]      95,119.3729 (>1000.0)  
338,263.1938 (>1000.0)  119,093.0710 (>1000.0)  72,722.1956 (>1000.0)   
96,317.1280 (>1000.0)  4,397.9906 (>1000.0)       1;1        8.3968 (0.00)      
   11           1
   test_func[large-data_from_device-llvm]      103,340.1210 (>1000.0)  
110,499.1839 (>1000.0)  105,368.4036 (>1000.0)   2,165.9420 (>1000.0)  
104,764.2034 (>1000.0)  2,236.1381 (>1000.0)       1;1        9.4905 (0.00)     
    10           1
   test_func[large-data_from_device-cuda]      115,621.2708 (>1000.0)  
118,050.2211 (>1000.0)  116,433.3510 (>1000.0)     765.6208 (707.57)   
116,457.8821 (>1000.0)  1,002.1689 (>1000.0)       3;0        8.5886 (0.00)     
     9           1
   test_func[large-end_to_end-cuda]            168,903.6440 (>1000.0)  
173,889.3481 (>1000.0)  170,860.7842 (>1000.0)   1,647.2517 (>1000.0)  
170,547.2330 (>1000.0)  1,900.8236 (>1000.0)       2;0        5.8527 (0.00)     
     7           1
   test_func[large-data_to_device-llvm]        424,237.4769 (>1000.0)  
427,762.2711 (>1000.0)  426,219.4017 (>1000.0)   1,536.2265 (>1000.0)  
426,894.0398 (>1000.0)  2,607.3752 (>1000.0)       2;0        2.3462 (0.00)     
     5           1
   test_func[large-end_to_end-llvm]            602,830.7430 (>1000.0)  
611,205.6130 (>1000.0)  605,958.4046 (>1000.0)   3,588.9265 (>1000.0)  
603,839.9071 (>1000.0)  5,400.2019 (>1000.0)       1;0        1.6503 (0.00)     
     5           1
   
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   ```
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [I] [Bug] [Relax] Hug inference performance difference on cuda and cpu [tvm]

Reply via email to