LightricksNatanKaminsky opened a new issue #9242:
URL: https://github.com/apache/tvm/issues/9242
### Expected behavior
When I auto-schedule a model for a remote device with a Mali GPU, the outputs
of the compiled, tuned model differ significantly from the outputs of the same
model compiled without auto-scheduling, or compiled for the CPU of the
device.
### Environment
I ran this script on mac with macOS Big Sur 11.6, and my android device is
Samsung S21 5G.
### Steps to reproduce
The following code reproduces this problem:
```
import os
import tvm
from tvm.contrib import utils, ndk, graph_executor
import numpy as np
from tvm import relay, auto_scheduler, rpc
import tvm.relay.testing
from mxnet.gluon.model_zoo.vision import get_model
if __name__ == "__main__":
    # Load a pretrained MobileNet from the MXNet model zoo and convert it to Relay.
    block = get_model("mobilenet1.0", pretrained=True)
    shape_dict = {"data": (1, 3, 224, 224)}  # graph input is named "data"
    mod, params = relay.frontend.from_mxnet(block, shape_dict)

    # RPC tracker connection details for the remote Android device.
    tracker_host = "0.0.0.0"
    tracker_port = 9000
    key = 'android'
    # Cross-compiler used by the NDK build_func below.
    os.environ["TVM_NDK_CC"] = (
        '/users/nkaminsky/library/android/android-toolchain-arm64/bin/'
        'aarch64-linux-android-g++'
    )
    output_file = 'auto_scheduled_model_mxnet.json'

    # Device target: Mali GPU via OpenCL; host target: AArch64 Android (llvm).
    target = tvm.target.Target('opencl -device=mali', 'llvm -mtriple=arm64-linux-android')

    # Extract tuning tasks and auto-schedule them, measuring on the device over RPC.
    tasks, task_weights = tvm.auto_scheduler.extract_tasks(mod['main'], params, target)
    tuner = tvm.auto_scheduler.TaskScheduler(tasks, task_weights)
    builder = tvm.auto_scheduler.LocalBuilder(build_func='ndk')
    runner = tvm.auto_scheduler.RPCRunner(
        key=key,
        host=tracker_host,
        port=tracker_port,
        priority=0,
        number=3,
        repeat=1,
    )
    tune_options = tvm.auto_scheduler.TuningOptions(
        num_measure_trials=35,
        builder=builder,
        runner=runner,
        measure_callbacks=[tvm.auto_scheduler.RecordToFile(output_file)],
    )
    tuner.tune(tune_options)

    # Build the tuned module by replaying the best schedules from the log file.
    with tvm.auto_scheduler.ApplyHistoryBest(output_file):
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.backend.use_auto_scheduler": True}
        ):
            lib = relay.build(mod, target=target, params=params)

    # Deploy the compiled library to the remote device through the RPC tracker.
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=0)
    device = remote.cl(0)
    tmp = utils.tempdir()
    lib_fname = tmp.relpath("net.so")
    lib.export_library(lib_fname, ndk.create_shared)
    remote.upload(lib_fname)
    exported_lib = remote.load_module("net.so")
    module = graph_executor.GraphModule(exported_lib["default"](device))

    # Reference build: the same model, untuned, executed on the local CPU.
    local_device = tvm.device(str(tvm.target.Target('llvm')), 0)
    with tvm.transform.PassContext(opt_level=3):
        unoptimized_lib = relay.build(mod, target=tvm.target.Target('llvm'), params=params)
    unoptimized_module = graph_executor.GraphModule(unoptimized_lib["default"](local_device))

    # BUG FIX: the tensor must be keyed by the graph's input name ("data", as
    # declared in shape_dict above), not "input". With a name the graph does
    # not have, set_input presumably never binds the tensor (TODO confirm
    # against graph_executor docs), so both runs would compare outputs computed
    # from an unset input buffer.
    rng = np.random.default_rng(seed=0)
    dummy_input = {"data": rng.random([1, 3, 224, 224]).astype("float32")}
    tvm_dummy = {name: tvm.nd.array(arr) for name, arr in dummy_input.items()}

    # Run the remote tuned module.
    module.set_input(**tvm_dummy)
    module.run()
    module_output = module.get_output(0).numpy()

    # Run the local reference module on the identical input.
    unoptimized_module.set_input(**tvm_dummy)
    unoptimized_module.run()
    unoptimized_module_output = unoptimized_module.get_output(0).numpy()

    # Element-wise comparison between the tuned and reference outputs.
    diff = np.abs(module_output - unoptimized_module_output)
    assert (diff <= 1e-3).all(), (
        f'maximum element difference: {np.amax(diff)}, l2 diff: {np.linalg.norm(diff)}'
    )
```
The numerical error I get for the fine-tuned model is around L2 = 35. However,
if I build the module without applying the JSON log file produced by
auto-scheduling, the assert passes (L2 of less than 1). Any help will be
greatly appreciated!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]