This is an automated email from the ASF dual-hosted git repository.
syfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 3b976585c7 [DLIGHT][GPU] Enhance opencl thread limit for schedules
(#16972)
3b976585c7 is described below
commit 3b976585c725fbf607f9e5fafd464ddcb3edc8dd
Author: krishnaraj36 <[email protected]>
AuthorDate: Tue May 21 14:25:34 2024 +0530
[DLIGHT][GPU] Enhance opencl thread limit for schedules (#16972)
* [DLIGHT][GPU] Enhance opencl thread limit for schedules
Enhanced the opencl thread limit and improved the gpu schedules
for opencl targets.
It improves decode performance by 20% for a few models.
* Update the build test
* reverted opencl max_thread enhancement
* Fix OpenCL thread assignment
---
python/tvm/dlight/gpu/general_reduction.py | 3 +++
python/tvm/dlight/gpu/rmsnorm.py | 2 ++
python/tvm/dlight/gpu/transpose.py | 4 ++++
python/tvm/dlight/gpu/utils.py | 2 ++
4 files changed, 11 insertions(+)
diff --git a/python/tvm/dlight/gpu/general_reduction.py
b/python/tvm/dlight/gpu/general_reduction.py
index ef6bb1db91..404b73a6f0 100644
--- a/python/tvm/dlight/gpu/general_reduction.py
+++ b/python/tvm/dlight/gpu/general_reduction.py
@@ -40,6 +40,9 @@ class GeneralReduction(GPUScheduleRule):
if target.kind.name == "cuda":
len_tx = 256
unroll_depth = 256
+ elif target.kind.name == "opencl":
+ len_tx = 256
+ unroll_depth = 64
else:
len_tx = 64
unroll_depth = 64
diff --git a/python/tvm/dlight/gpu/rmsnorm.py b/python/tvm/dlight/gpu/rmsnorm.py
index f8b2bb4a17..4047721c9a 100644
--- a/python/tvm/dlight/gpu/rmsnorm.py
+++ b/python/tvm/dlight/gpu/rmsnorm.py
@@ -82,6 +82,8 @@ class RMSNorm(ScheduleRule):
) -> tir.Schedule:
if target.kind.name == "cuda":
num_tx = 512
+ elif target.kind.name == "opencl":
+ num_tx = 256
else:
num_tx = 64
diff --git a/python/tvm/dlight/gpu/transpose.py
b/python/tvm/dlight/gpu/transpose.py
index d4496756a2..3bef3d61e5 100644
--- a/python/tvm/dlight/gpu/transpose.py
+++ b/python/tvm/dlight/gpu/transpose.py
@@ -57,6 +57,10 @@ class Transpose(GPUScheduleRule):
len_tx = 16
len_ty = 8
unroll_depth = 256
+ elif target.kind.name == "opencl":
+ len_tx = 16
+ len_ty = 8
+ unroll_depth = 64
else:
len_tx = 8
len_ty = 4
diff --git a/python/tvm/dlight/gpu/utils.py b/python/tvm/dlight/gpu/utils.py
index 4f2df5cfa0..e27a6969ad 100644
--- a/python/tvm/dlight/gpu/utils.py
+++ b/python/tvm/dlight/gpu/utils.py
@@ -55,6 +55,8 @@ def suggest_threads_per_block(
threads = 256
elif target.kind.name == "metal":
threads = 256
+ elif target.kind.name == "opencl":
+ threads = 256
else:
threads = 64
results: List[Optional[int]] = []