This is an automated email from the ASF dual-hosted git repository.

syfeng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new da6d510ce0 [FFI][FEAT] AutoDLPack for taking external tensor objects (#17927)
da6d510ce0 is described below

commit da6d510ce0a48776f1bb6c79502be1f8ca43b35e
Author: Tianqi Chen <[email protected]>
AuthorDate: Wed May 7 22:45:09 2025 -0400

    [FFI][FEAT] AutoDLPack for taking external tensor objects (#17927)
    
    [FFI][FEAT] AutoDLPack to enable external tensor args.
    
    This PR introduces the autodlpack feature to the tvm ffi.
    When an ffi Function takes a Tensor argument that conforms to the
    DLPack protocol, the value is automatically imported into an NDArray
    and passed along as the argument.
    
    The feature allows a compiled function to directly take torch.Tensor
    as an input argument without an extra set of conversions. When a
    function returns an NDArray, the return value still needs to be
    converted back via torch.from_dlpack.
    
    A common use case, however, is destination passing, where all inputs
    and outputs are pre-allocated and passed into the function. AutoDLPack
    effectively enables zero-overhead support for a wide range of python
    arrays.
    
    We also added a benchmark script to measure the overall ffi overhead.
    One thing to note is that there are still contiguity and alignment
    requirements imposed by the underlying DSL compiler; as of now we use
    a single global value. So x.contiguous() is still needed before
    passing the argument if transpose or other ops were performed.
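    
    A minimal usage sketch (testing.echo is the echo function exercised by
    the tests in this PR; any ffi Function taking tensors behaves the same):
    
        import torch
        from tvm import ffi as tvm_ffi
    
        fecho = tvm_ffi.get_global_func("testing.echo")
        x = torch.arange(128)
        y = fecho(x)               # torch.Tensor auto-imported via DLPack
        x2 = torch.from_dlpack(y)  # returned NDArray converted back by hand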
---
 ffi/scripts/benchmark_dlpack.py    | 345 +++++++++++++++++++++++++++++++++++++
 python/tvm/ffi/convert.py          |   5 +
 python/tvm/ffi/cython/function.pxi |  16 ++
 python/tvm/ffi/cython/ndarray.pxi  |   2 +
 tests/python/ffi/test_ndarray.py   |  27 +++
 5 files changed, 395 insertions(+)

diff --git a/ffi/scripts/benchmark_dlpack.py b/ffi/scripts/benchmark_dlpack.py
new file mode 100644
index 0000000000..b19f566364
--- /dev/null
+++ b/ffi/scripts/benchmark_dlpack.py
@@ -0,0 +1,345 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+This script is used to benchmark the API overhead of different
+python FFI API calling overhead, through DLPack API.
+
+Specifically, we would like to understand the overall overhead
+python/C++ API calls. The general goal is to understand the overall
+space and get a sense of what are the possible operations.
+
+We pick function f(x, y, z) where x, y, z are length 1 tensors.
+The benchmark is running in eager mode so we can see what is possible.
+It is orthogonal to other optimizations. For example cudagraph can
+eliminate these overheads completely. So the goal is to get a sense
+of what is possible under eager mode.
+
+Summary of some takeaways:
+- numpy.add roughly takes 0.36 us per call, which gives roughly what can
+  be done in python env.
+- torch.add on gpu takes about 3.7us per call, giving us an idea of what
+  roughly we need to get to in eager mode.
+-
+
+"""
+import time
+
+import numpy as np
+import torch
+from tvm import ffi as tvm_ffi
+
+
+def print_speed(name, speed):
+    print(f"{name:<40} {speed} sec/call")
+
+
+def print_error(name, error):
+    print(f"{name:<40} {error}")
+
+
+def baseline_torch_add(repeat):
+    """Run torch.add with one element"""
+
+    def run_bench(device):
+        x = torch.arange(1, device=device)
+        y = torch.arange(1, device=device)
+        z = torch.arange(1, device=device)
+
+        torch.add(x, y, out=z)
+        if device == "cuda":
+            torch.cuda.synchronize()
+        start = time.time()
+        for i in range(repeat):
+            torch.add(x, y, out=z)
+        # note: we deliberately do not call torch.cuda.synchronize()
+        # because we want the python-side call overhead, not kernel time.
+        end = time.time()
+        print_speed(f"torch.add[{device}]", (end - start) / repeat)
+
+    # rough takeaway: add on cuda takes roughly 3e-6 sec/call
+    run_bench("cpu")
+    if torch.cuda.is_available():
+        run_bench("cuda")
+
+
+def baseline_numpy_add(repeat):
+    """Run numpy.add with one element"""
+    x = np.arange(1)
+    y = np.arange(1)
+    z = np.arange(1)
+
+    np.add(x, y, out=z)
+    start = time.time()
+    for i in range(repeat):
+        np.add(x, y, out=z)
+    end = time.time()
+    speed = (end - start) / repeat
+    print_speed("numpy.add", speed)
+
+
+def baseline_cupy_add(repeat):
+    """Run cupy.add with one element"""
+    try:
+        import cupy
+    except ImportError:
+        # skip if cupy is not installed
+        return
+    x = cupy.arange(1)
+    y = cupy.arange(1)
+    z = cupy.arange(1)
+
+    cupy.add(x, y, out=z)
+    start = time.time()
+    for i in range(repeat):
+        cupy.add(x, y, out=z)
+    end = time.time()
+    speed = (end - start) / repeat
+    print_speed("cupy.add", speed)
+
+
+def tvm_ffi_nop(repeat):
+    """Overhead of a tvm FFI python call, measured by calling a NOP.
+
+    testing.nop is defined in c++ and does nothing.
+    """
+    nop = tvm_ffi.get_global_func("testing.nop")
+    x = tvm_ffi.from_dlpack(torch.arange(1))
+    y = tvm_ffi.from_dlpack(torch.arange(1))
+    z = tvm_ffi.from_dlpack(torch.arange(1))
+    nop(x, y, z)
+    start = time.time()
+    for i in range(repeat):
+        # time the call itself; args are already NDArrays, so no
+        # per-call dlpack conversion is involved here
+        nop(x, y, z)
+    end = time.time()
+    print_speed("tvm.ffi.nop", (end - start) / repeat)
+
+
+def bench_ffi_nop_from_dlpack(name, x, y, z, repeat):
+    """run dlpack conversion + tvm.ffi.nop
+
+    Measures overhead of running dlpack for each args then invoke
+    """
+    nop = tvm_ffi.get_global_func("testing.nop")
+    tx = tvm_ffi.from_dlpack(x)
+    ty = tvm_ffi.from_dlpack(y)
+    tz = tvm_ffi.from_dlpack(z)
+    nop(tx, ty, tz)
+
+    start = time.time()
+    for i in range(repeat):
+        tx = tvm_ffi.from_dlpack(x)
+        ty = tvm_ffi.from_dlpack(y)
+        tz = tvm_ffi.from_dlpack(z)
+        nop(tx, ty, tz)
+    end = time.time()
+    print_speed(name, (end - start) / repeat)
+
+
+def tvm_ffi_nop_from_torch_dlpack(repeat):
+    """run dlpack conversion + tvm.ffi.nop
+
+    Measures overhead of running dlpack for each args then invoke
+    """
+    x = torch.arange(1)
+    y = torch.arange(1)
+    z = torch.arange(1)
+    bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(torch)", x, y, z, 
repeat)
+
+
+def tvm_ffi_nop_from_numpy_dlpack(repeat):
+    """run dlpack conversion + tvm.ffi.nop
+
+    Measures overhead of running dlpack for each args then invoke
+    """
+    x = np.arange(1)
+    y = np.arange(1)
+    z = np.arange(1)
+    bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(numpy)", x, y, z, 
repeat)
+
+
+def tvm_ffi_self_dlpack_nop(repeat):
+    """run dlpack conversion + tvm.ffi.nop
+
+    Measures overhead of running dlpack for each args then invoke
+    """
+    x = tvm_ffi.from_dlpack(torch.arange(1))
+    y = tvm_ffi.from_dlpack(torch.arange(1))
+    z = tvm_ffi.from_dlpack(torch.arange(1))
+    bench_ffi_nop_from_dlpack("tvm.ffi.nop+from_dlpack(tvm)", x, y, z, repeat)
+
+
+def tvm_ffi_nop_from_torch_utils_to_dlpack(repeat):
+    """
+    Measures overhead of running dlpack for each args then invoke
+    but uses the legacy torch.utils.dlpack.to_dlpack API
+
+    This helps to measure possible implementation overhead of torch.
+    """
+    nop = tvm_ffi.get_global_func("testing.nop")
+    x = torch.arange(1)
+    y = torch.arange(1)
+    z = torch.arange(1)
+
+    tx = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(x))
+    ty = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(y))
+    tz = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(z))
+    nop(tx, ty, tz)
+
+    start = time.time()
+    for i in range(repeat):
+        tx = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(x))
+        ty = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(y))
+        tz = tvm_ffi.from_dlpack(torch.utils.dlpack.to_dlpack(z))
+        nop(tx, ty, tz)
+    end = time.time()
+    speed = (end - start) / repeat
+    print_speed("tvm.ffi.nop+from_dlpack(torch.utils)", speed)
+
+
+def bench_tvm_ffi_nop_autodlpack(name, x, y, z, repeat):
+    """
+    Measures overhead of running dlpack via auto convert by directly
+    take torch.Tensor as inputs.
+    """
+    nop = tvm_ffi.get_global_func("testing.nop")
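+    # warm-up call outside the timed region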
+    nop(x, y, z)
+    start = time.time()
+    for i in range(repeat):
+        nop(x, y, z)
+    end = time.time()
+    speed = (end - start) / repeat
+    print_speed(name, speed)
+
+
+def tvm_ffi_nop_autodlpack_from_torch(repeat, device="cpu"):
+    """
+    Measures overhead of running dlpack via auto convert by directly
+    take torch.Tensor as inputs.
+    """
+    # use larger to ensure alignment req is met
+    x = torch.arange(1, device=device)
+    y = torch.arange(1, device=device)
+    z = torch.arange(1, device=device)
+    bench_tvm_ffi_nop_autodlpack(f"tvm.ffi.nop.autodlpack(torch[{device}])", 
x, y, z, repeat)
+
+
+def tvm_ffi_nop_autodlpack_from_numpy(repeat):
+    """
+    Measures overhead of running dlpack via auto convert by directly
+    take numpy.ndarray as inputs.
+    """
+    # use larger to ensure alignment req is met
+    x = np.arange(256)
+    y = np.arange(256)
+    z = np.arange(256)
+    bench_tvm_ffi_nop_autodlpack("tvm.ffi.nop.autodlpack(numpy)", x, y, z, 
repeat)
+
+
+def bench_to_dlpack(x, name, repeat):
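+    """Measures the overhead of calling x.__dlpack__() directly."""
+    # warm-up call outside the timed region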
+    x.__dlpack__()
+    start = time.time()
+    for i in range(repeat):
+        x.__dlpack__()
+    end = time.time()
+    speed = (end - start) / repeat
+    print_speed(name, speed)
+
+
+def bench_to_dlpack_versioned(x, name, repeat, max_version=(1, 1)):
+    """
+    Measures overhead of running dlpack with latest 1.1.
+    """
+    try:
+        x.__dlpack__(max_version=max_version)
+        start = time.time()
+        for i in range(repeat):
+            x.__dlpack__(max_version=max_version)
+        end = time.time()
+        speed = (end - start) / repeat
+        print_speed(name, speed)
+    except Exception as e:
+        print_error(name, e)
+
+
+def bench_torch_utils_to_dlpack(repeat):
+    """
+    Measures overhead of running torch.utils.dlpack.to_dlpack
+    """
+    x = torch.arange(1)
+    torch.utils.dlpack.to_dlpack(x)
+    start = time.time()
+    for i in range(repeat):
+        torch.utils.dlpack.to_dlpack(x)
+    end = time.time()
+    speed = (end - start) / repeat
+    print_speed("torch.utils.dlpack.to_dlpack", speed)
+
+
+def main():
+    repeat = 10000
+    print("-----------------------------")
+    print("Benchmark f(x, y, z) overhead")
+    print("-----------------------------")
+    baseline_numpy_add(repeat)
+    baseline_torch_add(repeat)
+    baseline_cupy_add(repeat)
+    tvm_ffi_nop(repeat)
+    tvm_ffi_nop_from_torch_dlpack(repeat)
+    tvm_ffi_nop_from_numpy_dlpack(repeat)
+    tvm_ffi_self_dlpack_nop(repeat)
+    tvm_ffi_nop_from_torch_utils_to_dlpack(repeat)
+    tvm_ffi_nop_autodlpack_from_torch(repeat, "cpu")
+    tvm_ffi_nop_autodlpack_from_torch(repeat, "cuda")
+    tvm_ffi_nop_autodlpack_from_numpy(repeat)
+    print("-------------------------------")
+    print("Benchmark x.__dlpack__ overhead")
+    print("-------------------------------")
+    bench_torch_utils_to_dlpack(repeat)
+    bench_to_dlpack(torch.arange(1), "torch.__dlpack__", repeat)
+    bench_to_dlpack(np.arange(1), "numpy.__dlpack__", repeat)
+    bench_to_dlpack(tvm_ffi.from_dlpack(torch.arange(1)), "tvm.__dlpack__", repeat)
+    print("---------------------------------------------------")
+    print("Benchmark x.__dlpack__(max_version=(1,1)) overhead")
+    print("---------------------------------------------------")
+    bench_to_dlpack_versioned(torch.arange(1), "torch.__dlpack__(max_version=(1,1))", repeat)
+    bench_to_dlpack_versioned(np.arange(1), "numpy.__dlpack__(max_version=(1,1))", repeat)
+    bench_to_dlpack_versioned(
+        tvm_ffi.from_dlpack(torch.arange(1)), "tvm.__dlpack__(max_version=(1,1))", repeat
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tvm/ffi/convert.py b/python/tvm/ffi/convert.py
index 467f7a2fb4..5b25ddae25 100644
--- a/python/tvm/ffi/convert.py
+++ b/python/tvm/ffi/convert.py
@@ -54,6 +54,11 @@ def convert(value: Any) -> Any:
         return core._convert_to_ffi_func(value)
     elif value is None:
         return None
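+    # auto-import any DLPack-compatible tensor object as an NDArray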
+    elif hasattr(value, "__dlpack__"):
+        return core.from_dlpack(
+            value,
+            required_alignment=core.__dlpack_auto_import_required_alignment__,
+        )
     elif isinstance(value, Exception):
         return core._convert_to_ffi_error(value)
     else:
diff --git a/python/tvm/ffi/cython/function.pxi b/python/tvm/ffi/cython/function.pxi
index be80023c85..294a1246b2 100644
--- a/python/tvm/ffi/cython/function.pxi
+++ b/python/tvm/ffi/cython/function.pxi
@@ -17,6 +17,11 @@
 import ctypes
 from numbers import Real, Integral
 
+try:
+    import torch
+except ImportError:
+    torch = None
+
 
 cdef inline object make_ret(TVMFFIAny result):
     """convert result to return value."""
@@ -71,6 +76,17 @@ cdef inline int make_args(tuple py_args, TVMFFIAny* out, list temp_args) except
         elif isinstance(arg, Object):
             out[i].type_index = TVMFFIObjectGetTypeIndex((<Object>arg).chandle)
             out[i].v_ptr = (<Object>arg).chandle
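+        # fast path: special-case torch.Tensor via the legacy
+        # torch.utils.dlpack.to_dlpack capsule API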
+        elif torch is not None and isinstance(arg, torch.Tensor):
+            arg = from_dlpack(torch.utils.dlpack.to_dlpack(arg),
+                              required_alignment=__dlpack_auto_import_required_alignment__)
+            out[i].type_index = kTVMFFINDArray
+            out[i].v_ptr = (<NDArray>arg).chandle
+            temp_args.append(arg)
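+        # generic path: auto-import any object that implements the
+        # DLPack protocol (e.g. numpy.ndarray) as an NDArray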
+        elif hasattr(arg, "__dlpack__"):
+            arg = from_dlpack(arg, required_alignment=__dlpack_auto_import_required_alignment__)
+            out[i].type_index = kTVMFFINDArray
+            out[i].v_ptr = (<NDArray>arg).chandle
+            temp_args.append(arg)
         elif isinstance(arg, PyNativeObject):
             arg = arg.__tvm_ffi_object__
             out[i].type_index = TVMFFIObjectGetTypeIndex((<Object>arg).chandle)
diff --git a/python/tvm/ffi/cython/ndarray.pxi b/python/tvm/ffi/cython/ndarray.pxi
index cadf3de4fd..b8534b41b3 100644
--- a/python/tvm/ffi/cython/ndarray.pxi
+++ b/python/tvm/ffi/cython/ndarray.pxi
@@ -16,8 +16,10 @@
 # under the License.
 
 __dlpack_version__ = (1, 1)
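+# alignment (in bytes) required of tensors auto-imported via dlpack;
+# a single global value for now, per the underlying DSL compiler's requirement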
+__dlpack_auto_import_required_alignment__ = 8
 _CLASS_NDARRAY = None
 
+
 def _set_class_ndarray(cls):
     global _CLASS_NDARRAY
     _CLASS_NDARRAY = cls
diff --git a/tests/python/ffi/test_ndarray.py b/tests/python/ffi/test_ndarray.py
index a5a6f5b074..5b75171b55 100644
--- a/tests/python/ffi/test_ndarray.py
+++ b/tests/python/ffi/test_ndarray.py
@@ -14,6 +14,12 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+import pytest
+
+try:
+    import torch
+except ImportError:
+    torch = None
 
 from tvm import ffi as tvm_ffi
 import numpy as np
@@ -47,3 +53,24 @@ def test_shape_object():
     shape3 = tvm_ffi.convert(shape)
     assert shape3.__tvm_ffi_object__.same_as(shape.__tvm_ffi_object__)
     assert isinstance(shape3, tvm_ffi.Shape)
+
+
+@pytest.mark.skipif(torch is None, reason="Torch is not installed")
+def test_ndarray_auto_dlpack():
+    def check(x, y):
+        assert isinstance(y, tvm_ffi.NDArray)
+        assert y.shape == (128,)
+        assert y.dtype == tvm_ffi.dtype("int64")
+        assert y.device.device_type == tvm_ffi.Device.kDLCPU
+        assert y.device.device_id == 0
+        x2 = torch.from_dlpack(y)
+        np.testing.assert_equal(x2.numpy(), x.numpy())
+
+    x = torch.arange(128)
+    fecho = tvm_ffi.get_global_func("testing.echo")
+    y = fecho(x)
+    check(x, y)
+
+    # pass in list of tensors
+    y = fecho([x])
+    check(x, y[0])
