jinfagang opened a new issue, #10956:
URL: https://github.com/apache/tvm/issues/10956
Here is the script:
```
import os
from functools import wraps
from time import perf_counter, time

import numpy as np
import tvm
from tvm import auto_scheduler, te
def timing(f):
    """Decorator: report the mean wall-clock time of *f* over 100 calls.

    Runs the wrapped callable 100 times, prints the per-call average as
    "func:<name> took: <secs> sec", and returns the last call's result.
    """
    @wraps(f)
    def wrap(*args, **kw):
        # perf_counter() is the recommended monotonic, high-resolution
        # benchmark clock; time() can jump if the wall clock is adjusted.
        start = perf_counter()
        result = None
        for _ in range(100):
            result = f(*args, **kw)
        elapsed = perf_counter() - start
        # The original bound the end timestamp to a local named `te`,
        # shadowing the `te` module imported from tvm in this script —
        # renamed to avoid the confusion.
        print("func:%r took: %2.4f sec" % (f.__name__, elapsed / 100))
        return result

    return wrap
@auto_scheduler.register_workload  # Note the auto_scheduler decorator
def matmul_add(N, L, M, dtype):
    """Declare the TE compute graph for out = A @ B + C.

    A is (N, L), B is (L, M), C and out are (N, M); `dtype` is the element
    type string (e.g. "float32"). Returns [A, B, C, out], the tensor list
    the auto-scheduler expects for this workload.
    """
    A = te.placeholder((N, L), name="A", dtype=dtype)
    B = te.placeholder((L, M), name="B", dtype=dtype)
    C = te.placeholder((N, M), name="C", dtype=dtype)

    reduce_k = te.reduce_axis((0, L), name="k")
    matmul = te.compute(
        (N, M),
        lambda i, j: te.sum(A[i, reduce_k] * B[reduce_k, j], axis=reduce_k),
        name="matmul",
        # enable automatic layout transform for tensor B
        attrs={"layout_free_placeholders": [B]},
    )

    out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out")
    return [A, B, C, out]
@timing
def np_matmul(a_np, b_np, c_np):
    """NumPy baseline: dense matmul plus elementwise add, timed by @timing."""
    product = np.dot(a_np, b_np)
    return product + c_np
def test():
    """End-to-end demo: auto-schedule matmul_add, verify, and benchmark.

    Tunes the 1024x1024x1024 workload for 50 trials on the LLVM/CPU target,
    builds the best schedule found, checks the output against NumPy, then
    times the compiled operator and the NumPy baseline.
    """
    tgt = tvm.target.Target("llvm")
    N = L = M = 1024
    task = tvm.auto_scheduler.SearchTask(
        func=matmul_add, args=(N, L, M, "float32"), target=tgt
    )

    # Inspect the computational graph
    print("Computational DAG:")
    print(task.compute_dag)

    log_file = "matmul.json"
    options = auto_scheduler.TuningOptions(
        num_measure_trials=50,
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
        verbose=2,
    )
    task.tune(options)
    print('tune done....!')

    # Apply the best schedule recorded in the tuning log, then compile it.
    sch, sch_args = task.apply_best(log_file)
    # print(tvm.lower(sch, sch_args, simple_mode=True))
    func = tvm.build(sch, sch_args, tgt)

    lhs = np.random.uniform(size=(N, L)).astype(np.float32)
    rhs = np.random.uniform(size=(L, M)).astype(np.float32)
    bias = np.random.uniform(size=(N, M)).astype(np.float32)
    expected = lhs.dot(rhs) + bias

    device = tvm.cpu()
    lhs_tvm = tvm.nd.array(lhs, device=device)
    rhs_tvm = tvm.nd.array(rhs, device=device)
    bias_tvm = tvm.nd.array(bias, device=device)
    out_tvm = tvm.nd.empty(expected.shape, device=device)

    print('calculate new func???')
    func(lhs_tvm, rhs_tvm, bias_tvm, out_tvm)
    print(out_tvm.numpy().shape)

    # Check results
    np.testing.assert_allclose(expected, out_tvm.numpy(), rtol=1e-3)

    # Evaluate execution time.
    evaluator = func.time_evaluator(func.entry_name, device, min_repeat_ms=500)
    print(
        "Execution time of this operator: %.3f ms"
        % (np.median(evaluator(lhs_tvm, rhs_tvm, bias_tvm, out_tvm).results) * 1000)
    )

    # Time the NumPy baseline on the same inputs for comparison.
    np_matmul(lhs, rhs, bias)
# Script entry point: run the tuning/benchmark demo when executed directly.
if __name__ == "__main__":
    test()
```
And this is the result:
```
tune done....!
calculate new func???
(1024, 1024)
Execution time of this operator: 16.308 ms
func:'np_matmul' took: 0.0071 sec
```
Clearly the NumPy version is much faster than the TVM one. Is that normal?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]