mbrookhart commented on a change in pull request #7334: URL: https://github.com/apache/tvm/pull/7334#discussion_r564068569
########## File path: python/tvm/topi/cumsum.py ########## @@ -0,0 +1,105 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Cumsum operator""" +from ..tir import decl_buffer, ir_builder +from ..te import extern +from .utils import prod, get_const_int +from .math import cast + + +def cumsum(data, axis=None, dtype=None): + """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. 
+ """ + if dtype is None or dtype == "": + dtype = data.dtype + + def maybe_cast(x): + if dtype != data.dtype: + return cast(x, dtype) + return x + + axis_mul_before = 1 + axis_mul_after = 1 + + if axis is None: + axis = 0 + cumsum_axis_len = prod(data.shape) + shape = (cumsum_axis_len,) + else: + if not isinstance(axis, int): + axis = get_const_int(axis) + + shape = data.shape + cumsum_axis_len = shape[axis] + + if axis < 0: + axis = len(shape) + axis + + for i, value in enumerate(shape, 0): + if i < axis: + axis_mul_before *= value + elif i > axis: + axis_mul_after *= value + + def gen_ir(data_buf, out_buf): + ib = ir_builder.create() + data_buf = ib.buffer_ptr(data_buf) + out_buf = ib.buffer_ptr(out_buf) + + with ib.for_range(0, axis_mul_before, "i") as i: Review comment: Make the outer loop parallel on CPU? ########## File path: python/tvm/topi/cuda/scan.py ########## @@ -19,30 +19,36 @@ import tvm from tvm import te from tvm._ffi import get_global_func -from ..transform import expand_dims, squeeze -from ..utils import ceil_div +from ..transform import expand_dims, squeeze, transpose, reshape +from ..utils import ceil_div, swap, prod, get_const_int from ..math import cast from .. import tag from .injective import schedule_injective_from_existing -def exclusive_sum_scan2d_ir(data, output, reduction=None): +binop_name_to_func = {"sum": tvm.tir.generic.add} + + +def exclusive_scan_ir(data, output, reduction=None, binop="sum"): """Low level IR to do exclusive sum scan along rows of 2D input. Parameters ---------- data : Buffer - Input data. 2-D Buffer with shape [batch_size, scan_axis_size]. + Input N-D Buffer. Scan is done over the innermost axis. output: Buffer - A buffer to store the output scan, of the same size as data + A buffer to store the output scan, of the same shape as data reduction: Buffer, optional - 1D Buffer of size [batch_size], to store the sum of each row. + (N-1)-D Buffer, to store the sum of each scan axis. 
+ + binop: string, optional + A string specifying which binary operator to use. Currently only "sum" is supported. Review comment: Maybe we should make this a lambda instead of a flag? I guess that's harder to support with thrust ########## File path: python/tvm/topi/cuda/scan.py ########## @@ -353,28 +364,83 @@ def exclusive_scan(data, axis=-1, return_reduction=False, output_dtype=None): output = te.extern( [data.shape], [data], - lambda ins, outs: exclusive_sum_scan2d_ir(ins[0], outs[0]), + lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], binop=binop), dtype=[output_dtype], in_buffers=[data_buf], out_buffers=[output_buf], name="exclusive_scan", tag="exclusive_scan_gpu", ) reduction = None - else: - assert False, "Unsupported dimension {}".format(ndim) - if ndim == 1: - output = squeeze(output, 0) + if ndim == 1: + output = squeeze(output, 0) + if return_reduction: + reduction = squeeze(reduction, 0) + if return_reduction: - reduction = squeeze(reduction, 0) + return output, reduction + + return output + + if output_dtype is None or output_dtype == "": + output_dtype = data.dtype + + ndim = len(data.shape) + if axis < 0: + axis += ndim + + # If scan axis is not the innermost one, swap the scan and the innermost axes + # Scan is always done on the innermost axis, for performance reasons. + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + if return_reduction: + output, reduction = do_scan(data, output_dtype) + else: + output = do_scan(data, output_dtype) + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) if return_reduction: return output, reduction return output +def inclusive_scan(data, axis=-1, output_dtype=None, binop="sum"): + """Do inclusive scan on 1D or multidimensional input. + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape. + + axis: int, optional + The axis to do scan on. By default, scan is done on the innermost axis. 
+ + output_dtype: string, optional + The dtype of the output scan tensor. If not provided, the dtype of the input is used. + + binop: string, optional + A string specifying which binary operator to use. Currently only "sum" is supported. + + Returns + ------- + output : tvm.te.Tensor + An N-D tensor of the same rank N as the input data. + """ + ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype, binop=binop) + + if output_dtype is not None and data.dtype != output_dtype and output_dtype != "": + data = cast(data, output_dtype) + + return binop_name_to_func[binop](data, ex_scan) Review comment: Is this valid for other ops, say mul? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: users@infra.apache.org
