This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new fb16d9487d [CODEGEN][OPENCL] Fix opencl codegen for few ops (#17273)
fb16d9487d is described below

commit fb16d9487d062353b1fed3b14729e9282da2b875
Author: krishnaraj36 <[email protected]>
AuthorDate: Wed Aug 14 18:25:09 2024 +0530

    [CODEGEN][OPENCL] Fix opencl codegen for few ops (#17273)
    
    * Compiler pass config to choose target clml support version
    
    Partition pass should choose off-loading ops based on target support.
    This config enables choosing the target version on the Python API as well
    as tvmc.
    
    * Update clml.py
    
    * Fix opencl codegen for few ops
    
    Fixed the opencl codegen for few operators -
    1. Atomic add for float - OpenCL doesn't support float atomic add;
    enabled a work-around for this operation with atomic_cmpxchg()
    2. fmodf - OpenCL only supports fmod for all floating point types
    3. nearbyint - OpenCL doesn't have this function and it was hence replaced
    with the round function.
    
    * Update test_relay_ops.py
    
    * Update codegen_opencl.cc
    
    * Update codegen_opencl.cc
    
    * Revert "Compiler pass config to choose target clml support version"
    
    This reverts commit bc955b02c436cdab7e397a2f1e66d828861da6e8.
    
    * Revert "Update clml.py"
    
    This reverts commit 4ff98a82dc463628f673292631df518e6831fd4e.
    
    ---------
    
    Co-authored-by: Siva <[email protected]>
    Co-authored-by: B, Siva Rama Krishna Reddy <[email protected]>
    Co-authored-by: Vegiraju, Krishna Raju <[email protected]>
---
 python/tvm/topi/cuda/nms.py                        |  4 +-
 src/target/source/codegen_opencl.cc                | 52 ++++++++++++++-
 src/target/source/codegen_opencl.h                 |  1 +
 .../python/relay/opencl_texture/test_relay_ops.py  | 73 ++++++++++++++++++++++
 4 files changed, 126 insertions(+), 4 deletions(-)

diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py
index e402c58889..f258bffc3e 100644
--- a/python/tvm/topi/cuda/nms.py
+++ b/python/tvm/topi/cuda/nms.py
@@ -50,7 +50,9 @@ def cuda_atomic_add_rule(op):
 def opencl_atomic_add_rule(op):
     if op.dtype == "int32":
         return tvm.tir.call_pure_extern("int32", "atomic_add", op.args[0], 
op.args[1])
-    raise RuntimeError("only support int32")
+    elif op.dtype == "float32":
+        return tvm.tir.call_pure_extern("float32", "atomic_add", op.args[0], 
op.args[1])
+    raise RuntimeError("only support int32, float32")
 
 
 register_intrin_lowering("tir.atomic_add", target="cuda", 
f=cuda_atomic_add_rule, level=99)
diff --git a/src/target/source/codegen_opencl.cc 
b/src/target/source/codegen_opencl.cc
index f17a452d5c..5933c9582c 100644
--- a/src/target/source/codegen_opencl.cc
+++ b/src/target/source/codegen_opencl.cc
@@ -129,6 +129,16 @@ std::string CodeGenOpenCL::Finish() {
   if (enable_atomics_) {
     decl_stream << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics 
: enable\n"
                    "#pragma OPENCL EXTENSION 
cl_khr_global_int32_extended_atomics : enable\n\n";
+    decl_stream << "__inline float atomic_add_float_emu(volatile __global 
float* sum, const float "
+                   "toAdd) {\n"
+                   "float next_value = 0;"
+                   "float prev_value = 0;"
+                   "do {\n"
+                   "prev_value =*(sum);\n"
+                   "next_value =prev_value + toAdd;\n"
+                   "} while(atomic_cmpxchg((volatile global int *)(sum), 
*((int*)&prev_value), "
+                   "*((int*)&next_value)) != *((int*)&prev_value));\n"
+                   "return next_value;\n}\n";
   }
 
   // Enable OpenCL 1.2 sampler-less texture reads, but utilize
@@ -458,13 +468,21 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, 
std::ostream& os) {
       this->PrintExpr(op->args.back(), os);
       os << "]";
     }
-  } else if (op->op.same_as(builtin_call_extern_)) {
+  } else if (op->op.same_as(builtin_call_extern_) || 
op->op.same_as(builtin_call_pure_extern_)) {
     auto func = Downcast<StringImm>(op->args[0]);
     // Enable atomics extension if used.
-    if (func->value == "atomic_add") {
+    if (func->value == "atomic_add" && op->dtype.is_float()) {
       enable_atomics_ = true;
+      this->PrintCallExtern(GetType(GetRef<PrimExpr>(op)), 
"atomic_add_float_emu", op->args, true,
+                            os);
+    } else if (func->value == "nearbyint") {
+      this->PrintCallExtern(GetType(GetRef<PrimExpr>(op)), "round", op->args, 
true, os);
+    } else {
+      if (func->value == "atomic_add") {
+        enable_atomics_ = true;
+      }
+      CodeGenC::VisitExpr_(op, os);
     }
-    CodeGenC::VisitExpr_(op, os);
   } else {
     CodeGenC::VisitExpr_(op, os);
   }
@@ -534,6 +552,34 @@ void CodeGenOpenCL::VisitExpr_(const MaxNode* op, 
std::ostream& os) {
   PrintBinaryExpr(op, "max", os, this);
 }
 
+void CodeGenOpenCL::VisitExpr_(const ModNode* op, std::ostream& os) {  // 
NOLINT(*)
+  std::string opstr;
+  if (op->dtype.is_int() || op->dtype.is_uint()) {
+    opstr = "%";
+  } else {
+    ICHECK(op->dtype.is_float()) << "Expected floating point or integer dtype 
in Mod, but got "
+                                 << op->dtype;
+    opstr = "fmod";
+  }
+  if (op->dtype.lanes() == 1) {
+    if (isalpha(opstr.c_str()[0])) {
+      os << opstr.c_str() << '(';
+      this->PrintExpr(op->a, os);
+      os << ", ";
+      this->PrintExpr(op->b, os);
+      os << ')';
+    } else {
+      os << '(';
+      this->PrintExpr(op->a, os);
+      os << ' ' << opstr.c_str() << ' ';
+      this->PrintExpr(op->b, os);
+      os << ')';
+    }
+  } else {
+    this->PrintVecBinaryOp(opstr.c_str(), op->dtype, op->a, op->b, os);
+  }
+}
+
 void CodeGenOpenCL::VisitExpr_(const AndNode* op, std::ostream& os) {
   std::ostringstream oss;
   os << "(";
diff --git a/src/target/source/codegen_opencl.h 
b/src/target/source/codegen_opencl.h
index 8b365f85d6..e668f75b2e 100644
--- a/src/target/source/codegen_opencl.h
+++ b/src/target/source/codegen_opencl.h
@@ -74,6 +74,7 @@ class CodeGenOpenCL final : public CodeGenC {
   void VisitExpr_(const AndNode* op, std::ostream& os) final;
   void VisitExpr_(const OrNode* op, std::ostream& os) final;
   void VisitExpr_(const SelectNode* op, std::ostream& os) final;
+  void VisitExpr_(const ModNode* op, std::ostream& os) final;
 
  private:
   // whether enable fp16 and fp64 extension
diff --git a/tests/python/relay/opencl_texture/test_relay_ops.py 
b/tests/python/relay/opencl_texture/test_relay_ops.py
new file mode 100644
index 0000000000..686a9a9b9e
--- /dev/null
+++ b/tests/python/relay/opencl_texture/test_relay_ops.py
@@ -0,0 +1,73 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+import tvm
+import numpy as np
+from tvm import relay
+from tvm.relay import testing
+from tvm.contrib import utils
+from utils.adreno_utils import gpu_preprocess, build_run_compare, 
build_run_compare_vm
+
+
+executor_type = tvm.testing.parameter("ge", "vm")
+dtype = tvm.testing.parameter("float32")
+
+
[email protected]_opencl
[email protected]_targets("opencl -device=adreno")
+def test_mod(remote, target, executor_type, dtype):
+    # NCHW
+    input_shape = (1, 25, 38, 64)
+    A = relay.var("data", shape=input_shape, dtype=dtype)
+    scale = relay.const(2.0, dtype=dtype)
+    op = relay.mod(A, scale)
+    mod = relay.Function([A], op)
+
+    if executor_type == "ge":
+        build_run_compare(remote, mod, {}, {"data": input_shape}, {"data": 
dtype}, target)
+    else:
+        build_run_compare_vm(remote, mod, {}, {"data": input_shape}, {"data": 
dtype}, target)
+
+
[email protected]_opencl
[email protected]_targets("opencl -device=adreno")
+def test_scatter_nd_add(remote, target, executor_type, dtype):
+    # NCHW
+
+    A = relay.var("data", shape=(6, 30, 30, 256), dtype=dtype)
+    indices = relay.const(tvm.nd.array(np.random.randint(0, 1, (2, 6, 30, 
30))), dtype="int64")
+    update = relay.const(
+        tvm.nd.array(np.random.uniform(-1, 1, size=(50, 50, 
256)).astype(dtype)), dtype=dtype
+    )
+    op = relay.scatter_nd(update, indices, A, mode="add")
+    mod = relay.Function([A], op)
+    shape_dict = {
+        "data": (6, 30, 30, 256),
+    }
+    dtype_dict = {
+        "data": dtype,
+    }
+
+    if executor_type == "ge":
+        build_run_compare(remote, mod, {}, shape_dict, dtype_dict, target)
+    else:
+        build_run_compare_vm(remote, mod, {}, shape_dict, dtype_dict, target)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()

Reply via email to