Re: [PR] [Fix][CodeGen][CUDA]: Gate fast math intrinsic lowering behind target option [tvm]

via GitHub Thu, 14 May 2026 02:23:48 -0700


gemini-code-assist[bot] commented on code in PR #19565:
URL: https://github.com/apache/tvm/pull/19565#discussion_r3240381023



##########
tests/python/codegen/test_target_codegen_cuda_fastmath.py:
##########
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.tirx as tirx
+from tvm.contrib.nvcc import have_fp16
+from tvm.ir.module import IRModule
+from tvm.runtime.executable import Executable
+from tvm.script import tirx as T
+
+VECTOR_N_INPUTS = 8
+
+
+def make_prim_func(
+    name: str,
+    dtype: str,
+    num_inputes: int,

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Typo in parameter name `num_inputes`. It should be `num_inputs`.
   
   ```suggestion
       num_inputs: int,
   ```



##########
tests/python/target/test_target_target.py:
##########
@@ -158,15 +159,19 @@ def test_target_tag_1():
     assert tgt.attrs["max_threads_per_block"] == 1024
     assert tgt.attrs["thread_warp_size"] == 32
     assert tgt.attrs["registers_per_block"] == 32768
+    assert not tgt.attrs["enable_fast_math"]
 
 
 def test_target_tag_override():
     """Test creating a target from a tag with attribute overrides."""
-    tgt = tvm.target.Target({"tag": "nvidia/nvidia-a100", 
"l2_cache_size_bytes": 12345})
+    tgt = tvm.target.Target(
+        {"tag": "nvidia/nvidia-a100", "l2_cache_size_bytes": 12345, 
"enable_fast_math": True}
+    )
     assert tgt.kind.name == "cuda"
     assert tgt.attrs["arch"] == "sm_80"
     # Override should take effect
     assert int(tgt.attrs["l2_cache_size_bytes"]) == 12345
+    assert not tgt.attrs["enable_fast_math"]

Review Comment:
   ![high](https://www.gstatic.com/codereviewagent/high-priority.svg)
   
   The assertion `assert not tgt.attrs["enable_fast_math"]` contradicts the 
target configuration on line 168 where `"enable_fast_math": True` is explicitly 
set. This assertion should be updated to verify that the override correctly 
took effect.
   
   ```suggestion
       assert tgt.attrs["enable_fast_math"]
   ```



##########
tests/python/codegen/test_target_codegen_cuda_fastmath.py:
##########
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.tirx as tirx
+from tvm.contrib.nvcc import have_fp16
+from tvm.ir.module import IRModule
+from tvm.runtime.executable import Executable
+from tvm.script import tirx as T
+
+VECTOR_N_INPUTS = 8
+
+
+def make_prim_func(
+    name: str,
+    dtype: str,
+    num_inputes: int,
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr],
+) -> tirx.PrimFunc:
+    """Make a primitive function that applies the given operation to the input 
buffer."""
+    if num_inputes == 1:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i])
+
+        return kernel
+    elif num_inputes == 2:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            E: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i], E[i])
+
+        return kernel
+    else:
+        raise ValueError(f"Unsupported number of inputs: {num_inputes}")
+
+
+@dataclass(frozen=True)
+class MathCase:
+    name: str
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr]
+    num_inputes: int

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Typo in field name `num_inputes`. It should be `num_inputs`.
   
   ```suggestion
       num_inputs: int
   ```



##########
tests/python/codegen/test_target_codegen_cuda_fastmath.py:
##########
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.tirx as tirx
+from tvm.contrib.nvcc import have_fp16
+from tvm.ir.module import IRModule
+from tvm.runtime.executable import Executable
+from tvm.script import tirx as T
+
+VECTOR_N_INPUTS = 8
+
+
+def make_prim_func(
+    name: str,
+    dtype: str,
+    num_inputes: int,
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr],
+) -> tirx.PrimFunc:
+    """Make a primitive function that applies the given operation to the input 
buffer."""
+    if num_inputes == 1:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i])
+
+        return kernel
+    elif num_inputes == 2:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            E: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i], E[i])
+
+        return kernel
+    else:
+        raise ValueError(f"Unsupported number of inputs: {num_inputes}")

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Update this usage to the renamed parameter `num_inputs` (previously `num_inputes`).
   
   ```suggestion
           raise ValueError(f"Unsupported number of inputs: {num_inputs}")
   ```



##########
tests/python/codegen/test_target_codegen_cuda_fastmath.py:
##########
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.tirx as tirx
+from tvm.contrib.nvcc import have_fp16
+from tvm.ir.module import IRModule
+from tvm.runtime.executable import Executable
+from tvm.script import tirx as T
+
+VECTOR_N_INPUTS = 8
+
+
+def make_prim_func(
+    name: str,
+    dtype: str,
+    num_inputes: int,
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr],
+) -> tirx.PrimFunc:
+    """Make a primitive function that applies the given operation to the input 
buffer."""
+    if num_inputes == 1:

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Update this usage to the renamed parameter `num_inputs` (previously `num_inputes`).
   
   ```suggestion
       if num_inputs == 1:
   ```



##########
tests/python/codegen/test_target_codegen_cuda_fastmath.py:
##########
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.tirx as tirx
+from tvm.contrib.nvcc import have_fp16
+from tvm.ir.module import IRModule
+from tvm.runtime.executable import Executable
+from tvm.script import tirx as T
+
+VECTOR_N_INPUTS = 8
+
+
+def make_prim_func(
+    name: str,
+    dtype: str,
+    num_inputes: int,
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr],
+) -> tirx.PrimFunc:
+    """Make a primitive function that applies the given operation to the input 
buffer."""
+    if num_inputes == 1:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i])
+
+        return kernel
+    elif num_inputes == 2:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            E: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i], E[i])
+
+        return kernel
+    else:
+        raise ValueError(f"Unsupported number of inputs: {num_inputes}")
+
+
+@dataclass(frozen=True)
+class MathCase:
+    name: str
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr]
+    num_inputes: int
+    default_intrinsic_f16: str
+    default_intrinsic_bf16: str
+    default_intrinsic_f32: str
+    default_intrinsic_f64: str
+    fast_math_intrinsic_f32: str
+    np_ref: object
+    rtol: float = 1e-5
+    atol: float = 1e-6
+
+
+MATH_CASES = [
+    MathCase(
+        "exp_case",
+        T.exp,
+        1,
+        "hexp",
+        "hexp",
+        "expf",
+        "exp",
+        "__expf",
+        lambda x: np.exp(x),
+    ),
+    MathCase(
+        "exp10_case",
+        T.exp10,
+        1,
+        "hexp10",
+        "hexp10",
+        "exp10f",
+        "exp10",
+        "__exp10f",
+        lambda x: np.power(10.0, x),
+    ),
+    MathCase(
+        "log_case",
+        T.log,
+        1,
+        "hlog",
+        "hlog",
+        "logf",
+        "log",
+        "__logf",
+        lambda x: np.log(x),
+    ),
+    MathCase(
+        "log2_case",
+        T.log2,
+        1,
+        "hlog2",
+        "hlog2",
+        "log2f",
+        "log2",
+        "__log2f",
+        lambda x: np.log2(x),
+    ),
+    MathCase(
+        "log10_case",
+        T.log10,
+        1,
+        "hlog10",
+        "hlog10",
+        "log10f",
+        "log10",
+        "__log10f",
+        lambda x: np.log10(x),
+    ),
+    MathCase(
+        "tan_case",
+        T.tan,
+        1,
+        "htan",
+        "htan",
+        "tanf",
+        "tan",
+        "tanf",
+        lambda x: np.tan(x),
+    ),
+    MathCase(
+        "cos_case",
+        T.cos,
+        1,
+        "hcos",
+        "hcos",
+        "cosf",
+        "cos",
+        "__cosf",
+        lambda x: np.cos(x),
+    ),
+    MathCase(
+        "sin_case",
+        T.sin,
+        1,
+        "hsin",
+        "hsin",
+        "sinf",
+        "sin",
+        "__sinf",
+        lambda x: np.sin(x),
+    ),
+    MathCase(
+        "tanh_case",
+        T.tanh,
+        1,
+        "htanh",
+        "htanh",
+        "tanhf",
+        "tanh",
+        "__tanhf",
+        lambda x: np.tanh(x),
+    ),
+    MathCase(
+        "pow_case",
+        T.pow,
+        2,
+        "hpow",
+        "hpow",
+        "powf",
+        "pow",
+        "__powf",
+        lambda x, y: np.power(x, y),
+    ),
+]
+
+
+def make_mod(
+    dtype: str, case: MathCase, enable_fast_math: bool
+) -> tuple[tvm.target.Target, tvm.IRModule]:
+    """Make a module for the given dtype and case."""
+    target = tvm.target.Target({"kind": "cuda", "enable_fast_math": 
enable_fast_math})
+    prim_func = make_prim_func(case.name, dtype, case.num_inputes, case.op)
+    return target, tvm.IRModule.from_expr(prim_func.with_attr("target", 
target))
+
+
+def expected_intrinsic(dtype: str, case: MathCase, enable_fast_math: bool) -> 
str:
+    """Get the expected intrinsic for the given dtype and case."""
+    if dtype == "float16":
+        return case.default_intrinsic_f16
+    elif dtype == "bfloat16":
+        return case.default_intrinsic_bf16
+    elif dtype == "float32":
+        return case.fast_math_intrinsic_f32 if enable_fast_math else 
case.default_intrinsic_f32
+    elif dtype == "float64":
+        return case.default_intrinsic_f64
+    else:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+
+def check_lowered_ir(
+    dtype: str, case: MathCase, enable_fast_math: bool
+) -> tuple[tvm.target.Target, IRModule]:
+    """Check the lowered IR for the given dtype and case."""
+    target, mod = make_mod(dtype, case, enable_fast_math)
+    lowered_mod = tvm.tirx.transform.LowerIntrin()(mod)
+    script = lowered_mod.script(show_meta=False)
+    expected = expected_intrinsic(dtype, case, enable_fast_math)
+    assert re.search(rf"""["']{re.escape(expected)}["']""", script)
+    return target, lowered_mod
+
+
+def check_cuda_source(
+    target: tvm.target.Target,
+    mod: IRModule,
+    dtype: str,
+    case: MathCase,
+    enable_fast_math: bool,
+) -> Executable:
+    """Check the CUDA source for the given dtype and case."""
+    executable = tvm.compile(mod, target=target)
+    source = executable.mod.imports[0].inspect_source()
+    expected = expected_intrinsic(dtype, case, enable_fast_math)
+    assert re.search(rf"(?<!_)\b{re.escape(expected)}\s*\(", source)
+    return executable
+
+
+def make_numpy_inputs(dtype: str, case: MathCase):
+    """Make the numpy inputs for the given dtype and case."""
+    lhs = np.array([0.25, 0.5, 1.0, 2.0, 4.0, 9.0, 16.0, 10.0], dtype=dtype)
+    if case.num_inputes == 1:
+        return [lhs]
+    elif case.num_inputes == 2:
+        rhs = np.array([2.0, 3.0, 0.5, 1.5, 0.25, 0.5, 2.0, 1.0], dtype=dtype)
+        return [lhs, rhs]
+    else:
+        raise ValueError(f"Unsupported number of inputs: {case.num_inputes}")

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Update this usage to the renamed field `num_inputs` (previously `num_inputes`).
   
   ```suggestion
           raise ValueError(f"Unsupported number of inputs: {case.num_inputs}")
   ```



##########
tests/python/codegen/test_target_codegen_cuda_fastmath.py:
##########
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.tirx as tirx
+from tvm.contrib.nvcc import have_fp16
+from tvm.ir.module import IRModule
+from tvm.runtime.executable import Executable
+from tvm.script import tirx as T
+
+VECTOR_N_INPUTS = 8
+
+
+def make_prim_func(
+    name: str,
+    dtype: str,
+    num_inputes: int,
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr],
+) -> tirx.PrimFunc:
+    """Make a primitive function that applies the given operation to the input 
buffer."""
+    if num_inputes == 1:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i])
+
+        return kernel
+    elif num_inputes == 2:

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Update this usage to the renamed parameter `num_inputs` (previously `num_inputes`).
   
   ```suggestion
       elif num_inputs == 2:
   ```



##########
tests/python/codegen/test_target_codegen_cuda_fastmath.py:
##########
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.tirx as tirx
+from tvm.contrib.nvcc import have_fp16
+from tvm.ir.module import IRModule
+from tvm.runtime.executable import Executable
+from tvm.script import tirx as T
+
+VECTOR_N_INPUTS = 8
+
+
+def make_prim_func(
+    name: str,
+    dtype: str,
+    num_inputes: int,
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr],
+) -> tirx.PrimFunc:
+    """Make a primitive function that applies the given operation to the input 
buffer."""
+    if num_inputes == 1:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i])
+
+        return kernel
+    elif num_inputes == 2:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            E: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i], E[i])
+
+        return kernel
+    else:
+        raise ValueError(f"Unsupported number of inputs: {num_inputes}")
+
+
+@dataclass(frozen=True)
+class MathCase:
+    name: str
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr]
+    num_inputes: int
+    default_intrinsic_f16: str
+    default_intrinsic_bf16: str
+    default_intrinsic_f32: str
+    default_intrinsic_f64: str
+    fast_math_intrinsic_f32: str
+    np_ref: object
+    rtol: float = 1e-5
+    atol: float = 1e-6
+
+
+MATH_CASES = [
+    MathCase(
+        "exp_case",
+        T.exp,
+        1,
+        "hexp",
+        "hexp",
+        "expf",
+        "exp",
+        "__expf",
+        lambda x: np.exp(x),
+    ),
+    MathCase(
+        "exp10_case",
+        T.exp10,
+        1,
+        "hexp10",
+        "hexp10",
+        "exp10f",
+        "exp10",
+        "__exp10f",
+        lambda x: np.power(10.0, x),
+    ),
+    MathCase(
+        "log_case",
+        T.log,
+        1,
+        "hlog",
+        "hlog",
+        "logf",
+        "log",
+        "__logf",
+        lambda x: np.log(x),
+    ),
+    MathCase(
+        "log2_case",
+        T.log2,
+        1,
+        "hlog2",
+        "hlog2",
+        "log2f",
+        "log2",
+        "__log2f",
+        lambda x: np.log2(x),
+    ),
+    MathCase(
+        "log10_case",
+        T.log10,
+        1,
+        "hlog10",
+        "hlog10",
+        "log10f",
+        "log10",
+        "__log10f",
+        lambda x: np.log10(x),
+    ),
+    MathCase(
+        "tan_case",
+        T.tan,
+        1,
+        "htan",
+        "htan",
+        "tanf",
+        "tan",
+        "tanf",
+        lambda x: np.tan(x),
+    ),
+    MathCase(
+        "cos_case",
+        T.cos,
+        1,
+        "hcos",
+        "hcos",
+        "cosf",
+        "cos",
+        "__cosf",
+        lambda x: np.cos(x),
+    ),
+    MathCase(
+        "sin_case",
+        T.sin,
+        1,
+        "hsin",
+        "hsin",
+        "sinf",
+        "sin",
+        "__sinf",
+        lambda x: np.sin(x),
+    ),
+    MathCase(
+        "tanh_case",
+        T.tanh,
+        1,
+        "htanh",
+        "htanh",
+        "tanhf",
+        "tanh",
+        "__tanhf",
+        lambda x: np.tanh(x),
+    ),
+    MathCase(
+        "pow_case",
+        T.pow,
+        2,
+        "hpow",
+        "hpow",
+        "powf",
+        "pow",
+        "__powf",
+        lambda x, y: np.power(x, y),
+    ),
+]
+
+
+def make_mod(
+    dtype: str, case: MathCase, enable_fast_math: bool
+) -> tuple[tvm.target.Target, tvm.IRModule]:
+    """Make a module for the given dtype and case."""
+    target = tvm.target.Target({"kind": "cuda", "enable_fast_math": 
enable_fast_math})
+    prim_func = make_prim_func(case.name, dtype, case.num_inputes, case.op)

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Update this usage to the renamed field `num_inputs` (previously `num_inputes`).
   
   ```suggestion
       prim_func = make_prim_func(case.name, dtype, case.num_inputs, case.op)
   ```



##########
tests/python/codegen/test_target_codegen_cuda_fastmath.py:
##########
@@ -0,0 +1,298 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+
+import tvm
+import tvm.testing
+import tvm.tirx as tirx
+from tvm.contrib.nvcc import have_fp16
+from tvm.ir.module import IRModule
+from tvm.runtime.executable import Executable
+from tvm.script import tirx as T
+
+VECTOR_N_INPUTS = 8
+
+
+def make_prim_func(
+    name: str,
+    dtype: str,
+    num_inputes: int,
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr],
+) -> tirx.PrimFunc:
+    """Make a primitive function that applies the given operation to the input 
buffer."""
+    if num_inputes == 1:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i])
+
+        return kernel
+    elif num_inputes == 2:
+
+        @T.prim_func
+        def kernel(
+            A: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            E: T.Buffer((VECTOR_N_INPUTS,), dtype),
+            B: T.Buffer((VECTOR_N_INPUTS,), dtype),
+        ):
+            T.func_attr({"global_symbol": name + "_kernel", "tirx.noalias": 
True})
+            for i in T.thread_binding(VECTOR_N_INPUTS, thread="threadIdx.x"):
+                B[i] = op(A[i], E[i])
+
+        return kernel
+    else:
+        raise ValueError(f"Unsupported number of inputs: {num_inputes}")
+
+
+@dataclass(frozen=True)
+class MathCase:
+    name: str
+    op: Callable[[tirx.PrimExpr, ...], tirx.PrimExpr]
+    num_inputes: int
+    default_intrinsic_f16: str
+    default_intrinsic_bf16: str
+    default_intrinsic_f32: str
+    default_intrinsic_f64: str
+    fast_math_intrinsic_f32: str
+    np_ref: object
+    rtol: float = 1e-5
+    atol: float = 1e-6
+
+
+MATH_CASES = [
+    MathCase(
+        "exp_case",
+        T.exp,
+        1,
+        "hexp",
+        "hexp",
+        "expf",
+        "exp",
+        "__expf",
+        lambda x: np.exp(x),
+    ),
+    MathCase(
+        "exp10_case",
+        T.exp10,
+        1,
+        "hexp10",
+        "hexp10",
+        "exp10f",
+        "exp10",
+        "__exp10f",
+        lambda x: np.power(10.0, x),
+    ),
+    MathCase(
+        "log_case",
+        T.log,
+        1,
+        "hlog",
+        "hlog",
+        "logf",
+        "log",
+        "__logf",
+        lambda x: np.log(x),
+    ),
+    MathCase(
+        "log2_case",
+        T.log2,
+        1,
+        "hlog2",
+        "hlog2",
+        "log2f",
+        "log2",
+        "__log2f",
+        lambda x: np.log2(x),
+    ),
+    MathCase(
+        "log10_case",
+        T.log10,
+        1,
+        "hlog10",
+        "hlog10",
+        "log10f",
+        "log10",
+        "__log10f",
+        lambda x: np.log10(x),
+    ),
+    MathCase(
+        "tan_case",
+        T.tan,
+        1,
+        "htan",
+        "htan",
+        "tanf",
+        "tan",
+        "tanf",
+        lambda x: np.tan(x),
+    ),
+    MathCase(
+        "cos_case",
+        T.cos,
+        1,
+        "hcos",
+        "hcos",
+        "cosf",
+        "cos",
+        "__cosf",
+        lambda x: np.cos(x),
+    ),
+    MathCase(
+        "sin_case",
+        T.sin,
+        1,
+        "hsin",
+        "hsin",
+        "sinf",
+        "sin",
+        "__sinf",
+        lambda x: np.sin(x),
+    ),
+    MathCase(
+        "tanh_case",
+        T.tanh,
+        1,
+        "htanh",
+        "htanh",
+        "tanhf",
+        "tanh",
+        "__tanhf",
+        lambda x: np.tanh(x),
+    ),
+    MathCase(
+        "pow_case",
+        T.pow,
+        2,
+        "hpow",
+        "hpow",
+        "powf",
+        "pow",
+        "__powf",
+        lambda x, y: np.power(x, y),
+    ),
+]
+
+
+def make_mod(
+    dtype: str, case: MathCase, enable_fast_math: bool
+) -> tuple[tvm.target.Target, tvm.IRModule]:
+    """Make a module for the given dtype and case."""
+    target = tvm.target.Target({"kind": "cuda", "enable_fast_math": 
enable_fast_math})
+    prim_func = make_prim_func(case.name, dtype, case.num_inputes, case.op)
+    return target, tvm.IRModule.from_expr(prim_func.with_attr("target", 
target))
+
+
+def expected_intrinsic(dtype: str, case: MathCase, enable_fast_math: bool) -> 
str:
+    """Get the expected intrinsic for the given dtype and case."""
+    if dtype == "float16":
+        return case.default_intrinsic_f16
+    elif dtype == "bfloat16":
+        return case.default_intrinsic_bf16
+    elif dtype == "float32":
+        return case.fast_math_intrinsic_f32 if enable_fast_math else 
case.default_intrinsic_f32
+    elif dtype == "float64":
+        return case.default_intrinsic_f64
+    else:
+        raise ValueError(f"Unsupported dtype: {dtype}")
+
+
+def check_lowered_ir(
+    dtype: str, case: MathCase, enable_fast_math: bool
+) -> tuple[tvm.target.Target, IRModule]:
+    """Check the lowered IR for the given dtype and case."""
+    target, mod = make_mod(dtype, case, enable_fast_math)
+    lowered_mod = tvm.tirx.transform.LowerIntrin()(mod)
+    script = lowered_mod.script(show_meta=False)
+    expected = expected_intrinsic(dtype, case, enable_fast_math)
+    assert re.search(rf"""["']{re.escape(expected)}["']""", script)
+    return target, lowered_mod
+
+
+def check_cuda_source(
+    target: tvm.target.Target,
+    mod: IRModule,
+    dtype: str,
+    case: MathCase,
+    enable_fast_math: bool,
+) -> Executable:
+    """Check the CUDA source for the given dtype and case."""
+    executable = tvm.compile(mod, target=target)
+    source = executable.mod.imports[0].inspect_source()
+    expected = expected_intrinsic(dtype, case, enable_fast_math)
+    assert re.search(rf"(?<!_)\b{re.escape(expected)}\s*\(", source)
+    return executable
+
+
+def make_numpy_inputs(dtype: str, case: MathCase):
+    """Make the numpy inputs for the given dtype and case."""
+    lhs = np.array([0.25, 0.5, 1.0, 2.0, 4.0, 9.0, 16.0, 10.0], dtype=dtype)
+    if case.num_inputes == 1:

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Update this usage to the renamed field `num_inputs` (previously `num_inputes`).
   
   ```suggestion
       if case.num_inputs == 1:
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [Fix][CodeGen][CUDA]: Gate fast math intrinsic lowering behind target option [tvm]

Reply via email to