tgxs002 opened a new issue, #11139:
URL: https://github.com/apache/tvm/issues/11139

   I am running the auto-scheduler on this function:
   
   ```
     @auto_scheduler.register_workload  # Note the auto_scheduler decorator
     def logit_forward(N, L, D, C, S, dtype):
         A = te.placeholder((N, C), name="A", dtype=dtype)
         I = te.placeholder((N, L), name='I', dtype='int32')
         K = te.placeholder((D, S, C), name="K", dtype=dtype)
         B = te.placeholder((D, S,), name="B", dtype=dtype)
         
         # this step is to make sure the index do not go over the bound
         # remember remove related code
         I_1 = topi.abs(I)
         I_2 = topi.mod(I_1, D)
     
         t = te.reduce_axis((0, C), name="t")
         key = te.compute(
             (N, L, S),
             lambda i, j, k: te.sum(A[i, t] * K[I_2[i, j], k, t], axis=t),
             name="compute_key",
             attrs={"layout_free_placeholders": [K]},  # enable automatic 
layout transform for tensor B
         )
         
         key_bias = te.compute(
             (N, L, S),
             lambda i, j, k: key[i, j, k] + B[I_2[i, j], k],
             name="add_bias",
         )
     
         return [A, I, K, B, key_bias]
   ```
   
   Note that I_2 is accessed twice. I can run the operator without using the 
scheduler, and it works perfectly, producing the correct result.
   ### Expected behavior
   
   I expect it to be automatically optimized by the following code.
   ```
   task = tvm.auto_scheduler.SearchTask(func=logit_forward, args=(N, L, D, C, 
S, "float32"), target=gpu_target)
   
   # Inspect the computational graph
   print("Computational DAG:")
   print(task.compute_dag)
   
   log_file = "ops/logit_forward.json"
   measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
   tune_option = auto_scheduler.TuningOptions(
       num_measure_trials=10,
       runner=measure_ctx.runner,
       measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
       verbose=2,
   )
   ```
   
   ### Actual behavior
   
   ```
   Exception has occurred: TVMError
   Traceback (most recent call last):
     7: TVMFuncCall
     6: std::_Function_handler<void (tvm::runtime::TVMArgs, 
tvm::runtime::TVMRetValue*), 
tvm::runtime::TypedPackedFunc<tvm::runtime::Array<tvm::runtime::ObjectRef, 
void> (tvm::auto_scheduler::SearchPolicy, 
tvm::auto_scheduler::TuningOptions)>::AssignTypedLambda<tvm::auto_scheduler::{lambda(tvm::auto_scheduler::SearchPolicy,
 
tvm::auto_scheduler::TuningOptions)#3}>(tvm::auto_scheduler::{lambda(tvm::auto_scheduler::SearchPolicy,
 tvm::auto_scheduler::TuningOptions)#3}, std::__cxx11::basic_string<char, 
std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs 
const&, tvm::runtime::TVMRetValue*)#1}>::_M_invoke(std::_Any_data const&, 
tvm::runtime::TVMArgs&&, tvm::runtime::TVMRetValue*&&)
     5: tvm::auto_scheduler::AutoSchedule(tvm::auto_scheduler::SearchPolicy, 
tvm::auto_scheduler::TuningOptions)
     4: tvm::auto_scheduler::SketchPolicyNode::Search(int, int, int, 
tvm::auto_scheduler::ProgramMeasurer)
     3: tvm::auto_scheduler::SketchPolicyNode::SearchOneRound(int, 
tvm::runtime::Array<tvm::auto_scheduler::State, void>*)
     2: tvm::auto_scheduler::SketchPolicyNode::GenerateSketches()
     1: 
tvm::auto_scheduler::RuleAddCacheRead::Apply(tvm::auto_scheduler::SketchPolicyNode
 const&, tvm::auto_scheduler::State const&, int) const
     0: 
tvm::auto_scheduler::GetLastReduceIteratorInOutermostReduceTile(tvm::auto_scheduler::Stage
 const&)
     File "/nfs/tvm/src/auto_scheduler/search_policy/utils.h", line 536
   TVMError: Cannot find the iterator.
     File "/nfs/ops/logit_kernel.py", line 98, in <module>
       task.tune(tune_option)
   ```
   
   ### My workaround 
   
   I duplicated I_2 by adding this line after the declaration of I_2:
   ```
       I_3 = topi.mod(I_1, D)
   ```
   And I access I_3 in the second te.compute function instead. This works fine, but it feels 
a little clumsy to me. Maybe there is something wrong?
   
   ### Environment
   
   OS: Ubuntu 18.04
   TVM version: 0.8
   
   ### Steps to reproduce
   
   The code I'm running:
   ```
   import numpy as np
   import tvm
   import tvm.testing
   from tvm import te, auto_scheduler, topi
   
   @auto_scheduler.register_workload  # Note the auto_scheduler decorator
   def logit_forward(N, L, D, C, S, dtype):
       A = te.placeholder((N, C), name="A", dtype=dtype)
       I = te.placeholder((N, L), name='I', dtype='int32')
       K = te.placeholder((D, S, C), name="K", dtype=dtype)
       B = te.placeholder((D, S,), name="B", dtype=dtype)
       
       # this step is to make sure the index do not go over the bound
       # remember remove related code
       I_1 = topi.abs(I)
       I_2 = topi.mod(I_1, D)
   
       t = te.reduce_axis((0, C), name="t")
       key = te.compute(
           (N, L, S),
           lambda i, j, k: te.sum(A[i, t] * K[I_2[i, j], k, t], axis=t),
           name="compute_key",
           attrs={"layout_free_placeholders": [K]},  # enable automatic layout 
transform for tensor B
       )
       
       key_bias = te.compute(
           (N, L, S),
           lambda i, j, k: key[i, j, k] + B[I_2[i, j], k],
           name="add_bias",
       )
   
       return [A, I, K, B, key_bias]
   
   cpu_target = tvm.target.Target("llvm")
   gpu_target = tvm.target.Target("cuda")
   N = 5
   L = 3
   D = 6
   C = 3
   S = 4
   
   def check_correctness():
       i_feat = np.random.rand(N, C).astype("float32")
       index = np.random.randint(0, D, (N, L)).astype("int32")
       linear = np.random.rand(D, S, C).astype("float32")
       bias = np.random.rand(D, S).astype("float32")
       
       act = np.einsum('nc,dsc->nds', i_feat, linear)
       act = act + bias[None]
       result = np.take_along_axis(act, index[:,:,None], axis=1)
       
       nodes = logit_forward(N, L, D, C, S, "float32")
       s = te.create_schedule(nodes[-1].op)
       func = tvm.build(s, nodes, cpu_target, name="act")
       
       dev = tvm.device(cpu_target.kind.name, 0)
       i_feat = tvm.nd.array(i_feat, dev)
       index = tvm.nd.array(index, dev)
       linear = tvm.nd.array(linear, dev)
       bias = tvm.nd.array(bias, dev)
       out = tvm.nd.array(np.zeros_like(result), dev)
       
       func(i_feat, index, linear, bias, out)
       
       tvm.testing.assert_allclose(out.numpy(), result)
   
   check_correctness()
   
   task = tvm.auto_scheduler.SearchTask(func=logit_forward, args=(N, L, D, C, 
S, "float32"), target=gpu_target)
   
   # Inspect the computational graph
   print("Computational DAG:")
   print(task.compute_dag)
   
   log_file = "ops/logit_forward.json"
   measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
   tune_option = auto_scheduler.TuningOptions(
       num_measure_trials=10,
       runner=measure_ctx.runner,
       measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
       verbose=2,
   )
   
   # Run auto-tuning (search)
   task.tune(tune_option)
   # Apply the best schedule
   sch, args = task.apply_best(log_file)
   
   print("Lowered TIR:")
   print(tvm.lower(sch, args, simple_mode=True))
   ```
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to