tinywisdom opened a new issue, #19548:
URL: https://github.com/apache/tvm/issues/19548
### Description
I found that a minimal Relax program containing only `R.power(x, 2.0)` fails
to build for CUDA with the default Relax build pipeline.
The failure happens during TIR build / memory verification. The generated
TIR directly accesses buffers from host code and has no GPU thread binding:
```text
RuntimeError: Memory verification failed with the following errors:
Variable `T_power` is directly accessed by host memory (it is not
contained in a thread environment or in the function arguments.
Variable `x` is directly accessed by host memory (it is not contained in
a thread environment or in the function arguments.
Did you forget to bind?
```
However, if I explicitly apply `relax.transform.LegalizeOps()` followed by
`tir.transform.DefaultGPUSchedule()` under the CUDA target context, the same
module builds successfully. This suggests that the default CUDA Relax build
pipeline legalizes `R.power` to TIR but does not apply the required GPU
scheduling / thread binding before memory verification.
### Environment
TVM: 0.23.0
LLVM: 17.0.6
Python: 3.10.16 (from stack paths)
NumPy: 2.2.6
### Steps to reproduce
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import platform
import traceback
import tvm
from tvm import relax
from tvm.script import ir as I
from tvm.script import relax as R
# Minimal Relax module for the reproduction: `main` applies a single R.power
# with a float32 constant exponent of 2.0 to a (1, 2, 1, 1) float32 tensor.
# Only `#` comments are used so that no extra statements enter the TVMScript body.
@I.ir_module
class PowerModule:
    @R.function
    def main(
        x: R.Tensor((1, 2, 1, 1), dtype="float32")
    ) -> R.Tensor((1, 2, 1, 1), dtype="float32"):
        with R.dataflow():
            y: R.Tensor((1, 2, 1, 1), dtype="float32") = R.power(
                x, R.const(2.0, "float32")
            )
            R.output(y)
        return y
def build_relax_module(mod, target):
    """Build ``mod`` for ``target`` with whichever entry point this TVM exposes.

    Args:
        mod: The module to build.
        target: The build target, forwarded as the ``target`` keyword.

    Returns:
        Whatever ``tvm.compile`` returns when that attribute exists on ``tvm``;
        otherwise the result of ``relax.build``.
    """
    # Feature-detect instead of version-checking so the script runs on TVM
    # builds both with and without the top-level ``tvm.compile`` helper.
    if hasattr(tvm, "compile"):
        return tvm.compile(mod, target=target)
    return relax.build(mod, target=target)
def print_header(title):
    """Print ``title`` framed by two 100-character ``=`` rules.

    A blank line is emitted before the first rule so consecutive sections
    are visually separated in the output.

    Args:
        title: Text printed between the two rules.
    """
    rule = "=" * 100
    print("\n" + rule)
    print(title)
    print(rule)
def print_env(target):
    """Print interpreter, platform, TVM and CUDA-device diagnostics.

    Args:
        target: The build target; it is only echoed in the report.
    """
    print_header("Environment")
    print("python:", sys.version.replace("\n", " "))
    print("platform:", platform.platform())
    print("tvm version:", getattr(tvm, "__version__", "<unknown>"))
    print("tvm path:", getattr(tvm, "__file__", "<unknown>"))
    print("TVM_CUDA_TARGET env:", os.environ.get("TVM_CUDA_TARGET", "<unset>"))
    # Probing the device is best-effort: a failure here is reported as part of
    # the environment dump rather than aborting the reproduction.
    try:
        dev = tvm.cuda(0)
        print("tvm.cuda(0).exist:", dev.exist)
        print("tvm.cuda(0):", dev)
    except Exception as e:
        print("tvm.cuda(0): failed:", repr(e))
    print("target:", target)
    print("has tvm.compile:", hasattr(tvm, "compile"))
    print(
        "has relax.transform.LegalizeOps:",
        hasattr(relax.transform, "LegalizeOps"),
    )
    print(
        "has tir.transform.DefaultGPUSchedule:",
        hasattr(tvm.tir.transform, "DefaultGPUSchedule"),
    )
def print_exception(e):
    """Print the type, repr and traceback of the exception ``e``.

    The one-line summary and the ``Traceback:`` label go to stdout; the
    traceback itself is written to ``sys.stderr``.

    Args:
        e: The exception instance to report.
    """
    print(type(e).__name__, repr(e))
    print("\nTraceback:")
    # Format the traceback attached to ``e`` itself rather than relying on the
    # exception currently being handled, so the helper also works when it is
    # called after the ``except`` block has exited.
    traceback.print_exception(type(e), e, e.__traceback__)
def try_build_case(name, mod, target):
    """Attempt one build scenario and report the outcome.

    Prints a header with ``name``, then the module and the target, and then
    tries to build via ``build_relax_module``.

    Args:
        name: Title printed as the section header.
        mod: The module to print and build.
        target: The target to print and build for.

    Returns:
        ``True`` if the build completed without raising, ``False`` otherwise.
    """
    print_header(name)
    print("IRModule:")
    print(mod)
    print("\nTarget:")
    print(target)
    # Any failure is the observation being collected, so it is caught broadly,
    # reported, and turned into a boolean instead of stopping the script.
    try:
        build_relax_module(mod, target)
        print("\n[BUILD] OK")
        return True
    except Exception as e:
        print("\n[BUILD] FAILED")
        print_exception(e)
        return False
def try_transform_case(name, transform_fn, mod):
    """Apply ``transform_fn`` to ``mod`` and report the outcome.

    Args:
        name: Title printed as the section header.
        transform_fn: Callable invoked with ``mod`` as its only argument.
        mod: The module handed to ``transform_fn``.

    Returns:
        The value returned by ``transform_fn``, or ``None`` if the transform
        (or printing its result) raised.
    """
    print_header(name)
    # Printing the result stays inside the ``try`` so that a failure while
    # rendering the transformed module is reported like a transform failure.
    try:
        out = transform_fn(mod)
        print("[TRANSFORM] OK")
        print("\nTransformed IRModule:")
        print(out)
        return out
    except Exception as e:
        print("[TRANSFORM] FAILED")
        print_exception(e)
        return None
def _schedule_with_target(mod, target):
    """Run ``tvm.tir.transform.DefaultGPUSchedule`` on ``mod`` inside ``target``.

    Args:
        mod: The module to schedule.
        target: Object used as a context manager around the pass invocation.

    Returns:
        The module returned by the ``DefaultGPUSchedule`` pass.
    """
    # The pass is constructed and applied while ``target`` is the active context.
    with target:
        return tvm.tir.transform.DefaultGPUSchedule()(mod)
def main():
    """Run the three build scenarios for ``PowerModule`` and print a summary.

    Scenarios, each recorded as a boolean in ``results``:
      1. Build ``PowerModule`` directly for the CUDA target.
      2. Apply ``relax.transform.LegalizeOps`` and build the result.
      3. Additionally apply ``DefaultGPUSchedule`` under the target context
         and build the result.
    A scenario whose prerequisite transform failed is recorded as ``False``.
    """
    # The target string is split over two adjacent literals purely for line
    # length; the trailing space keeps the options separated.
    target = tvm.target.Target(
        "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 "
        "-thread_warp_size=32"
    )
    print_env(target)
    print_header("Original Relax IR")
    print(PowerModule)

    results = {}

    # Case 1: rely entirely on the default build pipeline.
    results["default_cuda_build"] = try_build_case(
        "CASE 1: default CUDA Relax build",
        PowerModule,
        target,
    )

    # Case 2: legalize explicitly, then build.
    legalized_mod = try_transform_case(
        "CASE 2A: relax.transform.LegalizeOps()",
        lambda mod: relax.transform.LegalizeOps()(mod),
        PowerModule,
    )
    if legalized_mod is not None:
        results["legalize_ops_then_cuda_build"] = try_build_case(
            "CASE 2B: build after LegalizeOps",
            legalized_mod,
            target,
        )
    else:
        results["legalize_ops_then_cuda_build"] = False

    # Case 3: legalize, apply the default GPU schedule, then build.
    scheduled_mod = None
    if legalized_mod is not None:
        scheduled_mod = try_transform_case(
            "CASE 3A: DefaultGPUSchedule after LegalizeOps under CUDA target "
            "context",
            lambda mod: _schedule_with_target(mod, target),
            legalized_mod,
        )
    if scheduled_mod is not None:
        results["legalize_ops_default_gpu_schedule_then_cuda_build"] = (
            try_build_case(
                "CASE 3B: build after LegalizeOps + DefaultGPUSchedule",
                scheduled_mod,
                target,
            )
        )
    else:
        results["legalize_ops_default_gpu_schedule_then_cuda_build"] = False

    print_header("Summary")
    for k, v in results.items():
        print(f"{k}: {'OK' if v else 'FAILED'}")
# Run the reproduction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
```
### Triage
Please refer to the list of label tags
[here](https://github.com/apache/tvm/wiki/Issue-Triage-Labels) to find the
relevant tags and add them below in a bullet format (example below).
* needs-triage
* bug
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]