masahi commented on a change in pull request #7334:
URL: https://github.com/apache/tvm/pull/7334#discussion_r564020478
##########
File path: python/tvm/topi/cuda/scan.py
##########
@@ -19,30 +19,36 @@
import tvm
from tvm import te
from tvm._ffi import get_global_func
-from ..transform import expand_dims, squeeze
-from ..utils import ceil_div
+from ..transform import expand_dims, squeeze, transpose, reshape
+from ..utils import ceil_div, swap, prod, get_const_int
from ..math import cast
from .. import tag
from .injective import schedule_injective_from_existing
-def exclusive_sum_scan2d_ir(data, output, reduction=None):
+binop_name_to_func = {"sum": tvm.tir.generic.add}
Review comment:
The problem with this is that, for the thrust path, I need to pass this
function to C++ and translate it to, for example, `thrust::plus()`. I didn't
see a good way to do that, so I opted for a dumb but simpler solution.
##########
File path: python/tvm/topi/cuda/scan.py
##########
@@ -251,99 +263,98 @@ def scan_thrust(data, output_dtype, exclusive=True,
return_reduction=False):
Whether or not do exclusive or inclusive scan.
return_reduction: bool, optional
- Whether or not return a 1-D tensor storing the reduction of each row.
+ Whether or not return a (N-1)-D tensor storing the reduction of each
scan axis.
Reductions are computed as part of the upsweep pass, so there is no
extra cost.
- If False, reductions are ignored.
+ If False, reductions are ignored. It must be False when exclusive is
False.
+
+ biop: string, optional
+ A string specifying which binary operator to use. Currently only "sum"
is supported.
Returns
-------
output : tvm.te.Tensor
- 1-D tensor that is the exclusive scan of the input, or
- 2-D tensor storing the exclusive scan of each row.
+ A N-D tensor of the same rank N and shape as the input data.
reduction : tvm.te.Tensor, optional
- 1-D tensor storing the reduction of each row.
+ (N-1)-D tensor storing the reduction of each scan axis.
Returned if return_reduction is True.
"""
data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf",
data_alignment=8)
output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf",
data_alignment=8)
+ binop_to_thrust_func_name = {"sum": "tvm.contrib.thrust.sum_scan"}
output = te.extern(
[data.shape],
[data],
lambda ins, outs: tvm.tir.call_packed(
- "tvm.contrib.thrust.sum_scan", ins[0], outs[0], exclusive
+ binop_to_thrust_func_name[binop], ins[0], outs[0], exclusive
),
dtype=[output_dtype],
in_buffers=[data_buf],
out_buffers=[output_buf],
- name="exclusive_sum_scan2d",
- tag="exclusive_sum_scan2d_gpu",
+ name="exclusive_scan_thrust",
+ tag="exclusive_scan_thrust_gpu",
)
if return_reduction:
assert exclusive, "return_reduction should be False for inclusive scan"
- reduction = get_reduction_from_exclusive_scan(data, output)
+ reduction = get_reduction_from_exclusive_scan(data, output, binop)
return output, reduction
return output
-def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None):
- """Do exclusive scan on 1D input or along rows of 2D input.
+def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None,
binop="sum"):
+ """Do exclusive scan on 1D or multidimensional input.
Parameters
----------
data : tvm.te.Tensor
- Input data. 1-D tensor with shape [scan_axis_size], or
- 2-D tensor with shape [batch_size, scan_axis_size].
+ Input data of any shape.
axis: int, optional
- The axis to do scan on. For now, only the inner most axis is supported.
+ The axis to do scan on. By default, scan is done on the innermost axis.
return_reduction: bool, optional
- Whether or not return a 1-D tensor storing the reduction of each row.
+ Whether or not return a tensor storing the reduction over each scan
axis.
+ If the input rank is N, this tensor is of rank N - 1.
Reductions are computed as part of the upsweep pass, so there is no
extra cost.
If False, reductions are ignored.
output_dtype: string, optional
The dtype of the output scan tensor. If not provided, the dtype of the
input is used.
+ biop: string, optional
+ A string specifying which binary operator to use. Currently only "sum"
is supported.
+
Returns
-------
output : tvm.te.Tensor
- 1-D tensor that is the exclusive scan of the input, or
- 2-D tensor storing the exclusive scan of each row.
+ A N-D tensor of the same rank N and shape as the input data.
reduction : tvm.te.Tensor, optional
- 1-D tensor storing the reduction of each row.
+ (N-1)-D tensor storing the reduction of each scan axis.
Returned if return_reduction is True.
"""
- # TODO(masahi): Support other binary operators
- ndim = len(data.shape)
- if axis < 0:
- axis += ndim
- assert axis == ndim - 1, "Only support scan on the inner most axis."
-
- if output_dtype is None:
- output_dtype = data.dtype
- target = tvm.target.Target.current()
- if target and target.kind.name == "cuda" and is_thrust_available():
- return scan_thrust(data, output_dtype, exclusive=True,
return_reduction=return_reduction)
+ def do_scan(data, output_dtype):
+ target = tvm.target.Target.current()
+ if target and target.kind.name == "cuda" and is_thrust_available():
Review comment:
It's a bit tricky: since `exclusive_scan` is called by other ops, I need
to introduce a separate implementation and strategy for every op that uses it.
Currently they are `get_valid_counts`, `argwhere` and `cumsum`. Soon I'll add
`unique`.
##########
File path: python/tvm/relay/op/transform.py
##########
@@ -1320,3 +1320,28 @@ def adv_index(inputs):
Output tensor.
"""
return _make.adv_index(Tuple(inputs))
+
+
+def cumsum(data, axis=None, dtype=None):
+ """Numpy style cumsum op. Return the cumulative sum of the elements along
a given axis.
+
+ Parameters
+ ----------
+ data : relay.Expr
+ The input data to the operator.
+
+ axis : int, optional
+ Axis along which the cumulative sum is computed. The default (None) is
to compute
+ the cumsum over the flattened array.
+
+ dtype : string, optional
+ Type of the returned array and of the accumulator in which the
elements are summed.
+ If dtype is not specified, it defaults to the dtype of data.
Review comment:
Added examples from the numpy doc and more.
##########
File path: python/tvm/topi/cuda/scan.py
##########
@@ -19,30 +19,36 @@
import tvm
from tvm import te
from tvm._ffi import get_global_func
-from ..transform import expand_dims, squeeze
-from ..utils import ceil_div
+from ..transform import expand_dims, squeeze, transpose, reshape
+from ..utils import ceil_div, swap, prod, get_const_int
from ..math import cast
from .. import tag
from .injective import schedule_injective_from_existing
-def exclusive_sum_scan2d_ir(data, output, reduction=None):
+binop_name_to_func = {"sum": tvm.tir.generic.add}
Review comment:
Interesting, didn't realize I can use a function as a dict key. I'll try
this, thanks.
##########
File path: python/tvm/topi/cumsum.py
##########
@@ -0,0 +1,105 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Cumsum operator"""
+from ..tir import decl_buffer, ir_builder
+from ..te import extern
+from .utils import prod, get_const_int
+from .math import cast
+
+
+def cumsum(data, axis=None, dtype=None):
+ """Numpy style cumsum op. Return the cumulative sum of the elements along
a given axis.
+
+ Parameters
+ ----------
+ data : tvm.te.Tensor
+ The input data to the operator.
+
+ axis : int, optional
+ Axis along which the cumulative sum is computed. The default (None) is
to compute
+ the cumsum over the flattened array.
+
+ dtype : string, optional
+ Type of the returned array and of the accumulator in which the
elements are summed.
+ If dtype is not specified, it defaults to the dtype of data.
+
+ Returns
+ -------
+ result : tvm.te.Tensor
+ The result has the same size as data, and the same shape as data if
axis is not None.
+ If axis is None, the result is a 1-d array.
+ """
+ if dtype is None or dtype == "":
+ dtype = data.dtype
+
+ def maybe_cast(x):
+ if dtype != data.dtype:
+ return cast(x, dtype)
+ return x
+
+ axis_mul_before = 1
+ axis_mul_after = 1
+
+ if axis is None:
+ axis = 0
+ cumsum_axis_len = prod(data.shape)
+ shape = (cumsum_axis_len,)
+ else:
+ if not isinstance(axis, int):
+ axis = get_const_int(axis)
+
+ shape = data.shape
+ cumsum_axis_len = shape[axis]
+
+ if axis < 0:
+ axis = len(shape) + axis
+
+ for i, value in enumerate(shape, 0):
+ if i < axis:
+ axis_mul_before *= value
+ elif i > axis:
+ axis_mul_after *= value
+
+ def gen_ir(data_buf, out_buf):
+ ib = ir_builder.create()
+ data_buf = ib.buffer_ptr(data_buf)
+ out_buf = ib.buffer_ptr(out_buf)
+
+ with ib.for_range(0, axis_mul_before, "i") as i:
Review comment:
Done. I fused the `i` and `j` loops into a single parallel loop and do some
math to recover `i` and `j`. Parallelizing the `i` loop alone doesn't help when
the scan axis is 0.
##########
File path: python/tvm/topi/cuda/scan.py
##########
@@ -19,30 +19,36 @@
import tvm
from tvm import te
from tvm._ffi import get_global_func
-from ..transform import expand_dims, squeeze
-from ..utils import ceil_div
+from ..transform import expand_dims, squeeze, transpose, reshape
+from ..utils import ceil_div, swap, prod, get_const_int
from ..math import cast
from .. import tag
from .injective import schedule_injective_from_existing
-def exclusive_sum_scan2d_ir(data, output, reduction=None):
+binop_name_to_func = {"sum": tvm.tir.generic.add}
+
+
+def exclusive_scan_ir(data, output, reduction=None, binop="sum"):
"""Low level IR to do exclusive sum scan along rows of 2D input.
Parameters
----------
data : Buffer
- Input data. 2-D Buffer with shape [batch_size, scan_axis_size].
+ Input N-D Buffer. Scan is done over the innermost axis.
output: Buffer
- A buffer to store the output scan, of the same size as data
+ A buffer to store the output scan, of the same shape as data
reduction: Buffer, optional
- 1D Buffer of size [batch_size], to store the sum of each row.
+ (N-1)-D Buffer, to store the sum of each scan axis.
+
+ biop: string, optional
+ A string specifying which binary operator to use. Currently only "sum"
is supported.
Review comment:
Yes, see the discussion at
https://github.com/apache/tvm/pull/7334#discussion_r563926496
I'll see if I can improve on this.
##########
File path: python/tvm/topi/cuda/scan.py
##########
@@ -353,28 +364,83 @@ def exclusive_scan(data, axis=-1, return_reduction=False,
output_dtype=None):
output = te.extern(
[data.shape],
[data],
- lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0]),
+ lambda ins, outs: exclusive_scan_ir(ins[0], outs[0],
binop=binop),
dtype=[output_dtype],
in_buffers=[data_buf],
out_buffers=[output_buf],
name="exclusive_scan",
tag="exclusive_scan_gpu",
)
reduction = None
- else:
- assert False, "Unsupported dimension {}".format(ndim)
- if ndim == 1:
- output = squeeze(output, 0)
+ if ndim == 1:
+ output = squeeze(output, 0)
+ if return_reduction:
+ reduction = squeeze(reduction, 0)
+
if return_reduction:
- reduction = squeeze(reduction, 0)
+ return output, reduction
+
+ return output
+
+ if output_dtype is None or output_dtype == "":
+ output_dtype = data.dtype
+
+ ndim = len(data.shape)
+ if axis < 0:
+ axis += ndim
+
+ # If scan axis is not the innermost one, swap the scan and the innermost
axes
+ # Scan is always done on the innermost axis, for performance reason.
+ if axis != ndim - 1:
+ axes = swap(list(range(ndim)), axis)
+ data = transpose(data, axes)
+
+ if return_reduction:
+ output, reduction = do_scan(data, output_dtype)
+ else:
+ output = do_scan(data, output_dtype)
+
+ if axis != ndim - 1:
+ axes = swap(list(range(ndim)), axis)
+ output = transpose(output, axes)
if return_reduction:
return output, reduction
return output
+def inclusive_scan(data, axis=-1, output_dtype=None, binop="sum"):
+ """Do inclusive scan on 1D or multidimensional input.
+
+ Parameters
+ ----------
+ data : tvm.te.Tensor
+ Input data of any shape.
+
+ axis: int, optional
+ The axis to do scan on. By default, scan is done on the innermost axis.
+
+ output_dtype: string, optional
+ The dtype of the output scan tensor. If not provided, the dtype of the
input is used.
+
+ biop: string, optional
+ A string specifying which binary operator to use. Currently only "sum"
is supported.
+
+ Returns
+ -------
+ output : tvm.te.Tensor
+ A N-D tensor of the same rank N as the input data.
+ """
+ ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype,
binop=binop)
+
+ if output_dtype is not None and data.dtype != output_dtype and
output_dtype != "":
+ data = cast(data, output_dtype)
+
+ return binop_name_to_func[binop](data, ex_scan)
Review comment:
prod makes sense, to support `cumprod`. But currently only "sum" is
supported. If I manage to convert the `binop` argument from a string to a
function, we don't need to worry about this issue.
##########
File path: python/tvm/topi/cuda/scan.py
##########
@@ -353,28 +364,83 @@ def exclusive_scan(data, axis=-1, return_reduction=False,
output_dtype=None):
output = te.extern(
[data.shape],
[data],
- lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0]),
+ lambda ins, outs: exclusive_scan_ir(ins[0], outs[0],
binop=binop),
dtype=[output_dtype],
in_buffers=[data_buf],
out_buffers=[output_buf],
name="exclusive_scan",
tag="exclusive_scan_gpu",
)
reduction = None
- else:
- assert False, "Unsupported dimension {}".format(ndim)
- if ndim == 1:
- output = squeeze(output, 0)
+ if ndim == 1:
+ output = squeeze(output, 0)
+ if return_reduction:
+ reduction = squeeze(reduction, 0)
+
if return_reduction:
- reduction = squeeze(reduction, 0)
+ return output, reduction
+
+ return output
+
+ if output_dtype is None or output_dtype == "":
+ output_dtype = data.dtype
+
+ ndim = len(data.shape)
+ if axis < 0:
+ axis += ndim
+
+ # If scan axis is not the innermost one, swap the scan and the innermost
axes
+ # Scan is always done on the innermost axis, for performance reason.
+ if axis != ndim - 1:
+ axes = swap(list(range(ndim)), axis)
+ data = transpose(data, axes)
+
+ if return_reduction:
+ output, reduction = do_scan(data, output_dtype)
+ else:
+ output = do_scan(data, output_dtype)
+
+ if axis != ndim - 1:
+ axes = swap(list(range(ndim)), axis)
+ output = transpose(output, axes)
if return_reduction:
return output, reduction
return output
+def inclusive_scan(data, axis=-1, output_dtype=None, binop="sum"):
+ """Do inclusive scan on 1D or multidimensional input.
+
+ Parameters
+ ----------
+ data : tvm.te.Tensor
+ Input data of any shape.
+
+ axis: int, optional
+ The axis to do scan on. By default, scan is done on the innermost axis.
+
+ output_dtype: string, optional
+ The dtype of the output scan tensor. If not provided, the dtype of the
input is used.
+
+ biop: string, optional
+ A string specifying which binary operator to use. Currently only "sum"
is supported.
+
+ Returns
+ -------
+ output : tvm.te.Tensor
+ A N-D tensor of the same rank N as the input data.
+ """
+ ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype,
binop=binop)
+
+ if output_dtype is not None and data.dtype != output_dtype and
output_dtype != "":
+ data = cast(data, output_dtype)
+
+ return binop_name_to_func[binop](data, ex_scan)
Review comment:
prod makes sense, to support `cumprod`. But currently only "sum" is
supported. If I manage to convert the `binop` argument to a function, we don't
need to worry about this issue.
##########
File path: python/tvm/topi/cuda/scan.py
##########
@@ -19,30 +19,36 @@
import tvm
from tvm import te
from tvm._ffi import get_global_func
-from ..transform import expand_dims, squeeze
-from ..utils import ceil_div
+from ..transform import expand_dims, squeeze, transpose, reshape
+from ..utils import ceil_div, swap, prod, get_const_int
from ..math import cast
from .. import tag
from .injective import schedule_injective_from_existing
-def exclusive_sum_scan2d_ir(data, output, reduction=None):
+binop_name_to_func = {"sum": tvm.tir.generic.add}
Review comment:
Made `binop` a function, and we now look up the thrust function using the tir
function as a key.
##########
File path: python/tvm/topi/cuda/scan.py
##########
@@ -19,30 +19,36 @@
import tvm
from tvm import te
from tvm._ffi import get_global_func
-from ..transform import expand_dims, squeeze
-from ..utils import ceil_div
+from ..transform import expand_dims, squeeze, transpose, reshape
+from ..utils import ceil_div, swap, prod, get_const_int
from ..math import cast
from .. import tag
from .injective import schedule_injective_from_existing
-def exclusive_sum_scan2d_ir(data, output, reduction=None):
+binop_name_to_func = {"sum": tvm.tir.generic.add}
Review comment:
Made the `binop` argument a function, and we now look up the thrust function
using the tir function as a key.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]