tsupei opened a new issue #7079:
URL: https://github.com/apache/tvm/issues/7079


   I am implementing a CUDA kernel which involves argmax in a scan operation. 
However, the program executes but terminates without giving any error message. 
If I don't compile it as a CUDA kernel, it properly produces correct 
results. Is there any misuse in my code?
   
   ```python
   def test_case():
       """
       Scan with argmax
   
       ! Command Terminated
       """
       def fcombine(x, y):
           lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0])
           rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1])
           return lhs, rhs
   
       def fidentity(t0, t1):
           return tvm.tir.const(-1, t0), tvm.tir.const(1e-10, t1)
   
       argmax = te.comm_reducer(fcombine, fidentity, name="argmax")
   
       n = te.var("n")
       m = te.var("m")
   
       data = te.placeholder((n, m), dtype="float32")
   
       state1 = te.placeholder((n, m), dtype="float32")
       state2 = te.placeholder((n, m), dtype="int32")
       state3 = te.placeholder((n, m), dtype="float32")
   
       init1 = te.compute((1, m), lambda _, i: tvm.tir.const(0.0))
       init2 = te.compute((1, m), lambda _, i: tvm.tir.const(0))
       init3 = te.compute((1, m), lambda _, i: tvm.tir.const(0.0))
   
       idx = te.compute((n, m), lambda t, i: i)
   
       update1 = te.compute((n, m), lambda t, i: tvm.tir.if_then_else(
           i == state2[t-1, i],
           state3[t-1, i],
           data[t, i]
       ))
   
       k = te.reduce_axis((0, m))
       update2, update3 = te.compute((n, m), lambda t, i: argmax((idx[t, k], 
state1[t-1, k]), axis=k))
   
       scan1, scan2, scan3 = te.scan([init1, init2, init3], [update1, update2, 
update3], [state1, state2, state3], inputs=[data])
   
       s = te.create_schedule(scan1.op)
   
       # s[init2].compute_inline()
   
       block_x = te.thread_axis("blockIdx.x")
       thread_x = te.thread_axis("threadIdx.x")
   
       num_thread = 32
   
       xo, xi = s[init1].split(init1.op.axis[1], factor=num_thread)
       s[init1].bind(xo, block_x)
       s[init1].bind(xi, thread_x)
   
       xo, xi = s[init2].split(init2.op.axis[1], factor=num_thread)
       s[init2].bind(xo, block_x)
       s[init2].bind(xi, thread_x)
   
       xo, xi = s[update1].split(update1.op.axis[1], factor=num_thread)
       s[update1].bind(xo, block_x)
       s[update1].bind(xi, thread_x)
   
       xo, xi = s[update2].split(update2.op.axis[1], factor=num_thread)
       s[update2].bind(xo, block_x)
       s[update2].bind(xi, thread_x)
   
       # ko, ki = s[update2].split(update2.op.reduce_axis[0], factor=num_thread)
       # bf = s.rfactor(update2, ki)
       # print(tvm.lower(s, [data, scan1]))
   
       tvm_scan = tvm.build(s, [data, scan1], "cuda", name="tvm_scan")
   
       ########
       # TEST #
       ########
       n = 100
       m = 50
   
       ctx = tvm.gpu(0)
       data = tvm.nd.array(torch.randn(n, m).numpy(), ctx)
       out = tvm.nd.array(torch.randn(n, m).numpy(), ctx)
   
       # Error occurs. Command Terminated!
       tvm_scan(data, out)
   ```
   
   ### Environment
   OS:  `Ubuntu 18.04.4 LTS`
   GPU: `GeForce GTX 1080` 
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to