Hecmay opened a new issue #6869:
URL: https://github.com/apache/incubator-tvm/issues/6869


   ## Env
   * Ubuntu 18.04 LTS (LLVM 6.0.0 dev)
   * GPU: RTX2080 + CUDA 10.2
   
   ## Problem Description
   I was trying to construct a neural network using Relay's APIs and tune it with TVM's auto scheduler. During the tuning process, I can see from the log that some tasks fail every measurement (the log prints `*T*T*T*T*T*T`, i.e. all measurements time out). As a result, those tasks never get a schedule. Here is the error message I got from the Relay runtime when trying to run the network; I suppose it comes from the problematic task in the network.
   
   ```shell
   RuntimeError: Check failed: VerifyMemory(func): Direct host side access to device memory is detected. Did you forget to bind?
   PrimFunc([placeholder, placeholder, placeholder, T_relu]) attrs={"global_symbol": "fused_nn_dense_add_nn_relu_3", "tir.noalias": (bool)1, "target": cuda} {
     // attr [T_dense] storage_scope = "global"
     allocate T_dense[float32 * 4096]
     for (j, 0, 4096) {
       T_dense[j] = 0f
       for (k, 0, 25088) {
         T_dense[j] = (T_dense[j] + (placeholder[k]*placeholder[((j*25088) + k)]))
       }
     }
     for (ax1, 0, 4096) {
       T_dense[ax1] = (placeholder[ax1] + T_dense[ax1])
     }
     for (ax1, 0, 4096) {
       T_relu[ax1] = max(T_dense[ax1], 0f)
     }
   }
   ```
   ## Test case
   
   Here is a VGG11 network. I set the measurement timeout to a very large value, but for some tasks every measurement still fails with a timeout.
   
   ```python
   
   import os
   import numpy as np
   import tvm
   from tvm import relay
   from tvm import ansor as auto_scheduler
   from tvm.relay import testing
   import tvm.contrib.graph_runtime as runtime
   
   def build_graph():
     t1 = relay.var('I_1', shape=(1,3,224,224), dtype='float32')
     t2 = relay.var('I_2', shape=(64,3,3,3), dtype='float32')
     t3 = relay.var('I_3', shape=(64,), dtype='float32')
     t4 = relay.var('I_4', shape=(128,64,3,3), dtype='float32')
     t5 = relay.var('I_5', shape=(128,), dtype='float32')
     t6 = relay.var('I_6', shape=(256,128,3,3), dtype='float32')
     t7 = relay.var('I_7', shape=(256,), dtype='float32')
     t8 = relay.var('I_8', shape=(256,256,3,3), dtype='float32')
     t9 = relay.var('I_9', shape=(256,), dtype='float32')
     t10 = relay.var('I_10', shape=(512,256,3,3), dtype='float32')
     t11 = relay.var('I_11', shape=(512,), dtype='float32')
     t12 = relay.var('I_12', shape=(512,512,3,3), dtype='float32')
     t13 = relay.var('I_13', shape=(512,), dtype='float32')
     t14 = relay.var('I_14', shape=(512,512,3,3), dtype='float32')
     t15 = relay.var('I_15', shape=(512,), dtype='float32')
     t16 = relay.var('I_16', shape=(512,512,3,3), dtype='float32')
     t17 = relay.var('I_17', shape=(512,), dtype='float32')
     t18 = relay.var('I_18', shape=(4096,25088), dtype='float32')
     t19 = relay.var('I_19', shape=(4096,), dtype='float32')
     t20 = relay.var('I_20', shape=(4096,4096), dtype='float32')
     t21 = relay.var('I_21', shape=(4096,), dtype='float32')
     t22 = relay.var('I_22', shape=(1000,4096), dtype='float32')
     t23 = relay.var('I_23', shape=(1000,), dtype='float32')
     t24 = relay.nn.conv2d(t1, t2, padding=[1,1])
     t25 = relay.reshape(t3, (64,1,1))
     t26 = relay.reshape(t5, (128,1,1))
     t27 = relay.reshape(t7, (256,1,1))
     t28 = relay.reshape(t9, (256,1,1))
     t29 = relay.reshape(t11, (512,1,1))
     t30 = relay.reshape(t13, (512,1,1))
     t31 = relay.reshape(t15, (512,1,1))
     t32 = relay.reshape(t17, (512,1,1))
     t33 = relay.add(t24, t25)
     t34 = relay.nn.relu(t33)
     t35 = relay.nn.max_pool2d(t34, pool_size=[2,2], strides=[2,2], padding=[0,0])
     t36 = relay.nn.conv2d(t35, t4, padding=[1,1])
     t37 = relay.add(t36, t26)
     t38 = relay.nn.relu(t37)
     t39 = relay.nn.max_pool2d(t38, pool_size=[2,2], strides=[2,2], padding=[0,0])
     t40 = relay.nn.conv2d(t39, t6, padding=[1,1])
     t41 = relay.add(t40, t27)
     t42 = relay.nn.relu(t41)
     t43 = relay.nn.conv2d(t42, t8, padding=[1,1])
     t44 = relay.add(t43, t28)
     t45 = relay.nn.relu(t44)
     t46 = relay.nn.max_pool2d(t45, pool_size=[2,2], strides=[2,2], padding=[0,0])
     t47 = relay.nn.conv2d(t46, t10, padding=[1,1])
     t48 = relay.add(t47, t29)
     t49 = relay.nn.relu(t48)
     t50 = relay.nn.conv2d(t49, t12, padding=[1,1])
     t51 = relay.add(t50, t30)
     t52 = relay.nn.relu(t51)
     t53 = relay.nn.max_pool2d(t52, pool_size=[2,2], strides=[2,2], padding=[0,0])
     t54 = relay.nn.conv2d(t53, t14, padding=[1,1])
     t55 = relay.add(t54, t31)
     t56 = relay.nn.relu(t55)
     t57 = relay.nn.conv2d(t56, t16, padding=[1,1])
     t58 = relay.add(t57, t32)
     t59 = relay.nn.relu(t58)
     t60 = relay.nn.max_pool2d(t59, pool_size=[2,2], strides=[2,2], padding=[0,0])
     t61 = relay.nn.avg_pool2d(t60, pool_size=[1,1], strides=[1,1], padding=[0,0])
     t62 = relay.reshape(t61, (1,25088))
     t63 = relay.nn.dense(t62, t18)
     t64 = relay.add(t19, t63)
     t65 = relay.nn.relu(t64)
     t66 = t65
     t67 = relay.nn.dense(t66, t20)
     t68 = relay.add(t21, t67)
     t69 = relay.nn.relu(t68)
     t70 = t69
     t71 = relay.nn.dense(t70, t22)
     t72 = relay.add(t23, t71)
     return [t1,t2,t3,t4,t11,t5,t6,t7,t8,t9,t10,t22,t23,t12,t13,t14,t15,t16,t17,t18,t19,t20,t21], t72
   
   def compile_graph(inputs, output):
     name = "tvm_dag_test"
     log_file = "{}.json".format(name)
     
     ctx = tvm.context("llvm")
     target = tvm.target.create("cuda")
     target_host = tvm.target.create("llvm")
   
     mod = relay.Function(inputs, output)
     mod = tvm.IRModule.from_expr(mod)  
   
     # search optimal schedule
     workloads, wkl_weights = auto_scheduler.extract_from_program(
       mod, {}, target=target)
   
     # define objective function (latency) for task scheduler
     def objective_func(costs):
         return sum(c * w for c, w in zip(costs, wkl_weights))
   
     # create tasks and scheduler
     tasks = []
     for i, wkl_key in enumerate(workloads):
         dag = auto_scheduler.workload_key_to_dag(wkl_key)
          tasks.append(auto_scheduler.SearchTask(dag, wkl_key, target, target_host))
         print("---------- Task %d ---------- (key: %s) \n" % (i, wkl_key), dag)
     
     assert len(tasks) > 0, "Task extraction fail"
     tuner = auto_scheduler.SimpleTaskScheduler(tasks, objective_func)
     measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300)
     tune_option = auto_scheduler.TuneOption(
         n_trials=1000, runner=measure_ctx.runner,
         measure_callbacks=[auto_scheduler.LogToFile(log_file)])
     tuner.tune(tune_option)
     del measure_ctx
   ```  
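
   For completeness, the script above only runs the tuning. The run step that produced the `RuntimeError` is roughly the following sketch; `apply_history_best` is an assumption here (the exact way to apply the tuned log may differ on the ansor branch), and only `I_1` is fed with data:

   ```python
   def run_graph(inputs, output, log_file="tvm_dag_test.json"):
     # Rebuild the Relay module and apply the tuned log before compilation.
     mod = tvm.IRModule.from_expr(relay.Function(inputs, output))
     target = tvm.target.create("cuda")
     ctx = tvm.gpu(0)
     with auto_scheduler.apply_history_best(log_file):  # assumed API name
       with tvm.transform.PassContext(opt_level=3):
         lib = relay.build(mod, target=target, target_host="llvm")
     # Run with the graph runtime; this is where the VerifyMemory check fails.
     module = runtime.GraphModule(lib["default"](ctx))
     module.set_input("I_1", np.random.rand(1, 3, 224, 224).astype("float32"))
     module.run()

   if __name__ == "__main__":
     graph_inputs, graph_output = build_graph()
     compile_graph(graph_inputs, graph_output)
     run_graph(graph_inputs, graph_output)
   ```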
   

