wrongtest commented on pull request #9482:
URL: https://github.com/apache/tvm/pull/9482#issuecomment-972543408


   BTW, the current split-host-device machinery seems to work quite well with common 
expression bindings!
   A test script is given below; it shows that the common expression is evaluated in the 
host function and passed as a kernel parameter to the device function.
   
   ```python
   import tvm
   from tvm.script import tir as T
   
   @T.prim_func
   def func(a: T.handle, b: T.handle, n: T.int32) -> None:
       threadIdx_x = T.env_thread("threadIdx.x")
       A = T.match_buffer(a, [256], dtype="int32")
       B = T.match_buffer(b, [256], dtype="int32")
       common_expr = T.var("int32")
       # for common_expr in range(n // 8, n // 8 + 1):
       with T.let(common_expr, n // 8):
           for i in T.serial(0, common_expr):
               T.launch_thread(threadIdx_x, 8)
               T.store(B.data, i * 8 + threadIdx_x, common_expr + 
T.load("int32", A.data, i * 8 + threadIdx_x), True)
   
   mod = tvm.IRModule.from_expr(func)
   mod = tvm.tir.transform.Apply(lambda f: f.with_attr({"global_symbol": 
"main", "target": tvm.target.Target("cuda")}))(mod)
   mod = tvm.tir.transform.SplitHostDevice()(mod)
   print(mod.script())
   
   
   # script for result mod 
   @tvm.script.ir_module
   class Module:
       @T.prim_func
       def main(a: T.handle, b: T.handle, n: T.int32) -> None:
           # function attr dict
           T.func_attr({"global_symbol": "main", "target": None})
           A = T.match_buffer(a, [256], dtype="int32")
           B = T.match_buffer(b, [256], dtype="int32")
           # body
           for common_expr in T.serial(n // 8, n // 8 + 1):
               for i in T.serial(0, common_expr):
                   T.evaluate(T.tvm_call_packed("main_kernel0", B.data, A.data, 
common_expr, i, 8, dtype="int32"))
   
       @T.prim_func
       def main_kernel0(B_1: T.Ptr[global T.int32], A_1: T.Ptr[global T.int32], 
common_expr: T.int32, i: T.int32) -> None:
           # function attr dict
           T.func_attr({"target": cuda -keys=cuda,gpu -max_num_threads=1024 
-thread_warp_size=32, "tir.noalias": 1, "global_symbol": "main_kernel0", 
"tir.device_thread_axis": [T.iter_var(threadIdx_x, [0:8], "ThreadIndex", 
"threadIdx.x")], "tir.is_global_func": 1, "calling_conv": 2})
           # var definition
           threadIdx_x = T.env_thread("threadIdx.x")
           # body
           T.launch_thread(threadIdx_x, 8)
           T.store(B_1, i * 8 + threadIdx_x, common_expr + T.load("int32", A_1, 
i * 8 + threadIdx_x), True)
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to