bkmgit commented on a change in pull request #11882: URL: https://github.com/apache/arrow/pull/11882#discussion_r776936853
########## File path: cpp/src/arrow/compute/kernels/scalar_compare.cc ########## @@ -746,6 +787,155 @@ std::shared_ptr<ScalarFunction> MakeScalarMinMax(std::string name, return func; } +template <typename Op> +void AddIntegerBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* func) { + auto exec = GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, Op>(*ty); + DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec))); +} + +template <typename InType, typename Op> +void AddGenericBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* func) { + DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), + ScalarTernaryEqualTypes<BooleanType, InType, Op>::Exec)); +} + +template <typename OutType, typename ArgType, typename Op> +struct BetweenTimestamps : public ScalarTernaryEqualTypes<OutType, ArgType, Op> { + using Base = ScalarTernaryEqualTypes<OutType, ArgType, Op>; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const auto& var = checked_cast<const TimestampType&>(*batch[0].type()); + const auto& lhs = checked_cast<const TimestampType&>(*batch[1].type()); + const auto& rhs = checked_cast<const TimestampType&>(*batch[2].type()); + if ((var.timezone().empty() != lhs.timezone().empty()) || + (var.timezone().empty() != rhs.timezone().empty()) || + (lhs.timezone().empty() != rhs.timezone().empty())) { + return Status::Invalid("Cannot compare timestamps with and without timezones."); + } + return Base::Exec(ctx, batch, out); + } +}; + +template <typename Op> +std::shared_ptr<ScalarFunction> MakeBetweenFunction(std::string name, + const FunctionDoc* doc) { + auto func = std::make_shared<CompareFunction>(name, Arity::Ternary(), doc); + + DCHECK_OK(func->AddKernel( + {boolean(), boolean(), boolean()}, boolean(), + ScalarTernary<BooleanType, BooleanType, BooleanType, BooleanType, Op>::Exec)); + + for (const std::shared_ptr<DataType>& ty : IntTypes()) { + AddIntegerBetween<Op>(ty, func.get()); + } + + AddIntegerBetween<Op>(date32(), func.get()); + AddIntegerBetween<Op>(date64(), func.get()); + + AddGenericBetween<FloatType, Op>(float32(), func.get()); + AddGenericBetween<DoubleType, Op>(float64(), func.get()); + + // Add timestamp kernels + for (auto unit : TimeUnit::values()) { + InputType in_type(match::TimestampTypeUnit(unit)); + DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), + BetweenTimestamps<BooleanType, TimestampType, Op>::Exec)); + } + + // Duration + for (auto unit : TimeUnit::values()) { + InputType in_type(match::DurationTypeUnit(unit)); + auto exec = + GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, Op>(int64()); + DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), std::move(exec))); + } + + // Time32 and Time64 + for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) { + InputType in_type(match::Time32TypeUnit(unit)); + auto exec = + GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, Op>(int32()); + DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), std::move(exec))); + } + for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) { + InputType in_type(match::Time64TypeUnit(unit)); + auto exec = + GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, Op>(int64()); + DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), std::move(exec))); + } + + for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) { + auto exec = GenerateVarBinaryBase<ScalarTernaryEqualTypes, BooleanType, Op>(*ty); + DCHECK_OK(func->AddKernel( + {InputType::Array(ty), InputType::Scalar(ty), InputType::Scalar(ty)}, boolean(), + exec)); + DCHECK_OK(func->AddKernel( + {InputType::Array(ty), InputType::Array(ty), InputType::Array(ty)}, boolean(), + std::move(exec))); + } + + for (const auto id : {Type::DECIMAL128, Type::DECIMAL256}) { + auto exec = GenerateDecimal<ScalarTernaryEqualTypes, BooleanType, Op>(id); + DCHECK_OK(func->AddKernel({InputType(id), InputType(id), InputType(id)}, boolean(), + std::move(exec))); + } + + { + auto exec = ScalarTernaryEqualTypes<BooleanType, FixedSizeBinaryType, Op>::Exec; + auto ty = InputType(Type::FIXED_SIZE_BINARY); + DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec))); + } + + return func; +} + +const BetweenOptions* GetDefaultBetweenOptions() { + static const auto kBetweenOptions = BetweenOptions::Defaults(); + return &kBetweenOptions; +} + +const FunctionDoc between_doc{"Check if values are in a range, val betwen a and b", + ("A null on either side emits a null comparison result.\n" + "options are used to specify if the endpoints are\n" + "inclusive, possible values are NEITHER (a<val<b),\n" + "LEFT (a<=val<b), RIGHT (a<val<=b), and the default\n" + "if not specified BOTH (a<=val<=b)"), + {"val", "a", "b"}}; + +class BetweenMetaFunction : public MetaFunction { + public: + BetweenMetaFunction() + : MetaFunction("between", Arity::Ternary(), &between_doc, + GetDefaultBetweenOptions()) {} + + Result<Datum> ExecuteImpl(const std::vector<Datum>& args, + const FunctionOptions* options, + ExecContext* ctx) const override { + const BetweenOptions& between_options = static_cast<const BetweenOptions&>(*options); + Datum result; Review comment: Thanks for your feedback. Ok, on moving the functions to within the dispatch for between. Underlying APIs should support natural integration into other programming languages and not force Pandas/Python approach, but also be easy to maintain and at least consistent for related functions. Would it make sense to then also use one compare function, but have functions on top of this for `greater`, `less` etc.? The `not_between` function could then be implemented by calling `between` and reversing the result. I understand that Pandas has lot of influence. In addition to databases, there are other libraries such as - [Ruby Daru](https://github.com/ankane/rover) - [R Dplyr](https://dplyr.tidyverse.org/reference/between.html) - [Hive Between](https://sqlandhadoop.com/hive-between/) - [Ruby Rover](https://github.com/ankane/rover) - [Ruby CSV table](https://ruby-doc.org/stdlib-3.1.0/libdoc/csv/rdoc/CSV/Table.html) - [Link compare](https://docs.microsoft.com/en-us/dotnet/visual-basic/language-reference/operators/comparison-operators) - [Apache Calcite Algebra](https://calcite.apache.org/docs/algebra.html) For many of these, chained operations is the way to implement between. It is unclear if one can write a Kernel generator that would minimize loading data from memory for chained operations - Calcite approach seems useful, but need to understand it better. Will ask on mailing list for thoughts on these kind of approaches since they would be something new. Finally, I have not done much performance measurement and optimization of these kernels, but this can probably be done once tests and relevant functionality are working. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org