jinfagang opened a new issue, #10956:
URL: https://github.com/apache/tvm/issues/10956
Here is the script:
```
import os
from functools import wraps
from time import perf_counter, time

import numpy as np
import tvm
from tvm import auto_scheduler, te
def timing(f):
    """Decorator: report the mean wall-clock time of *f* over 100 calls.

    Runs the wrapped callable 100 times, prints the per-call average as
    "func:<name> took: <secs> sec", and returns the last call's result.
    """
    @wraps(f)
    def wrap(*args, **kw):
        # perf_counter() is the recommended monotonic, high-resolution
        # benchmark clock; time() can jump if the wall clock is adjusted.
        start = perf_counter()
        result = None
        for _ in range(100):
            result = f(*args, **kw)
        elapsed = perf_counter() - start
        # The original bound the end timestamp to a local named `te`,
        # shadowing the `te` module imported from tvm in this script —
        # renamed to avoid the confusion.
        print("func:%r took: %2.4f sec" % (f.__name__, elapsed / 100))
        return result

    return wrap
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def matmul_add(N, L, M, dtype):
    """Declare the TE compute graph for out = A @ B + C.

    A is (N, L), B is (L, M), C and out are (N, M); `dtype` is the element
    type string (e.g. "float32"). Returns [A, B, C, out], the tensor list
    the auto-scheduler expects for this workload.
    """
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)
    C = te.placeholder((N, M), name="C", dtype=dtype)

    reduce_k = te.reduce_axis((0, L), name="k")
    matmul = te.compute(
        (N, M),
        lambda i, j: te.sum(A[i, reduce_k] * B[reduce_k, j], axis=reduce_k),
        name="matmul",
        # enable automatic layout transform for tensor B
        attrs={"layout_free_placeholders": [B]},
    )

    out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
    return [A, B, C, out]
@timing
def np_matmul(a_np, b_np, c_np):
    """NumPy baseline: dense matmul plus elementwise add, timed by @timing."""
    product = np.dot(a_np, b_np)
    return product + c_np
def test():
    """End-to-end demo: auto-schedule matmul_add, verify, and benchmark.

    Tunes the 1024x1024x1024 workload for 50 trials on the LLVM/CPU target,
    builds the best schedule found, checks the output against NumPy, then
    times the compiled operator and the NumPy baseline.
    """
    tgt = tvm.target.Target("llvm")
    N = L = M = 1024
    task = tvm.auto_scheduler.SearchTask(
        func=matmul_add, args=(N, L, M, "float32"), target=tgt
    )

    # Inspect the computational graph
    print("Computational DAG:")
    print(task.compute_dag)

    log_file = "matmul.json"
    options = auto_scheduler.TuningOptions(
        num_measure_trials=50,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
    )
    task.tune(options)
    print('tune done....!')

    # Apply the best schedule recorded in the tuning log, then compile it.
    sch, sch_args = task.apply_best(log_file)
    # print(tvm.lower(sch, sch_args, simple_mode=True))
    func = tvm.build(sch, sch_args, tgt)

    lhs = np.random.uniform(size=(N, L)).astype(np.float32)
    rhs = np.random.uniform(size=(L, M)).astype(np.float32)
    bias = np.random.uniform(size=(N, M)).astype(np.float32)
    expected = lhs.dot(rhs) + bias

    device = tvm.cpu()
    lhs_tvm = tvm.nd.array(lhs, device=device)
    rhs_tvm = tvm.nd.array(rhs, device=device)
    bias_tvm = tvm.nd.array(bias, device=device)
    out_tvm = tvm.nd.empty(expected.shape, device=device)

    print('calculate new func???')
    func(lhs_tvm, rhs_tvm, bias_tvm, out_tvm)
    print(out_tvm.numpy().shape)

    # Check results
    np.testing.assert_allclose(expected, out_tvm.numpy(), rtol=1e-3)

    # Evaluate execution time.
    evaluator = func.time_evaluator(func.entry_name, device, min_repeat_ms=500)
    print(
        "Execution time of this operator: %.3f ms"
        % (np.median(evaluator(lhs_tvm, rhs_tvm, bias_tvm, out_tvm).results) * 1000)
    )

    # Time the NumPy baseline on the same inputs for comparison.
    np_matmul(lhs, rhs, bias)
# Script entry point: run the tuning/benchmark demo when executed directly.
if __name__ == "__main__":
    test()
```
And this is the result:
```
tune done....!
calculate new func???
(1024, 1024)
Execution time of this operator: 16.308 ms
func:'np_matmul' took: 0.0071 sec
```
Clearly the NumPy version is much faster than the TVM one. Is that normal?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]