[GitHub] [arrow] bkmgit commented on a change in pull request #11882: ARROW-9843: [C++] Implement Between ternary kernel

GitBox Thu, 30 Dec 2021 22:34:37 -0800


bkmgit commented on a change in pull request #11882:
URL: https://github.com/apache/arrow/pull/11882#discussion_r776936853




##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -746,6 +787,155 @@ std::shared_ptr<ScalarFunction> 
MakeScalarMinMax(std::string name,
   return func;
 }
 
+template <typename Op>
+void AddIntegerBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  auto exec = GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(*ty);
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec)));
+}
+
+template <typename InType, typename Op>
+void AddGenericBetween(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(),
+                            ScalarTernaryEqualTypes<BooleanType, InType, 
Op>::Exec));
+}
+
+template <typename OutType, typename ArgType, typename Op>
+struct BetweenTimestamps : public ScalarTernaryEqualTypes<OutType, ArgType, 
Op> {
+  using Base = ScalarTernaryEqualTypes<OutType, ArgType, Op>;
+
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const auto& var = checked_cast<const TimestampType&>(*batch[0].type());
+    const auto& lhs = checked_cast<const TimestampType&>(*batch[1].type());
+    const auto& rhs = checked_cast<const TimestampType&>(*batch[2].type());
+    if ((var.timezone().empty() != lhs.timezone().empty()) ||
+        (var.timezone().empty() != rhs.timezone().empty()) ||
+        (lhs.timezone().empty() != rhs.timezone().empty())) {
+      return Status::Invalid("Cannot compare timestamps with and without 
timezones.");
+    }
+    return Base::Exec(ctx, batch, out);
+  }
+};
+
+template <typename Op>
+std::shared_ptr<ScalarFunction> MakeBetweenFunction(std::string name,
+                                                    const FunctionDoc* doc) {
+  auto func = std::make_shared<CompareFunction>(name, Arity::Ternary(), doc);
+
+  DCHECK_OK(func->AddKernel(
+      {boolean(), boolean(), boolean()}, boolean(),
+      ScalarTernary<BooleanType, BooleanType, BooleanType, BooleanType, 
Op>::Exec));
+
+  for (const std::shared_ptr<DataType>& ty : IntTypes()) {
+    AddIntegerBetween<Op>(ty, func.get());
+  }
+
+  AddIntegerBetween<Op>(date32(), func.get());
+  AddIntegerBetween<Op>(date64(), func.get());
+
+  AddGenericBetween<FloatType, Op>(float32(), func.get());
+  AddGenericBetween<DoubleType, Op>(float64(), func.get());
+
+  // Add timestamp kernels
+  for (auto unit : TimeUnit::values()) {
+    InputType in_type(match::TimestampTypeUnit(unit));
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(),
+                              BetweenTimestamps<BooleanType, TimestampType, 
Op>::Exec));
+  }
+
+  // Duration
+  for (auto unit : TimeUnit::values()) {
+    InputType in_type(match::DurationTypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int64());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+
+  // Time32 and Time64
+  for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) {
+    InputType in_type(match::Time32TypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int32());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+  for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) {
+    InputType in_type(match::Time64TypeUnit(unit));
+    auto exec =
+        GeneratePhysicalInteger<ScalarTernaryEqualTypes, BooleanType, 
Op>(int64());
+    DCHECK_OK(func->AddKernel({in_type, in_type, in_type}, boolean(), 
std::move(exec)));
+  }
+
+  for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+    auto exec = GenerateVarBinaryBase<ScalarTernaryEqualTypes, BooleanType, 
Op>(*ty);
+    DCHECK_OK(func->AddKernel(
+        {InputType::Array(ty), InputType::Scalar(ty), InputType::Scalar(ty)}, 
boolean(),
+        exec));
+    DCHECK_OK(func->AddKernel(
+        {InputType::Array(ty), InputType::Array(ty), InputType::Array(ty)}, 
boolean(),
+        std::move(exec)));
+  }
+
+  for (const auto id : {Type::DECIMAL128, Type::DECIMAL256}) {
+    auto exec = GenerateDecimal<ScalarTernaryEqualTypes, BooleanType, Op>(id);
+    DCHECK_OK(func->AddKernel({InputType(id), InputType(id), InputType(id)}, 
boolean(),
+                              std::move(exec)));
+  }
+
+  {
+    auto exec = ScalarTernaryEqualTypes<BooleanType, FixedSizeBinaryType, 
Op>::Exec;
+    auto ty = InputType(Type::FIXED_SIZE_BINARY);
+    DCHECK_OK(func->AddKernel({ty, ty, ty}, boolean(), std::move(exec)));
+  }
+
+  return func;
+}
+
+const BetweenOptions* GetDefaultBetweenOptions() {
+  static const auto kBetweenOptions = BetweenOptions::Defaults();
+  return &kBetweenOptions;
+}
+
+const FunctionDoc between_doc{"Check if values are in a range, val betwen a 
and b",
+                              ("A null on either side emits a null comparison 
result.\n"
+                               "options are used to specify if the endpoints 
are\n"
+                               "inclusive, possible values are NEITHER 
(a<val<b),\n"
+                               "LEFT (a<=val<b), RIGHT (a<val<=b), and the 
default\n"
+                               "if not specified BOTH (a<=val<=b)"),
+                              {"val", "a", "b"}};
+
+class BetweenMetaFunction : public MetaFunction {
+ public:
+  BetweenMetaFunction()
+      : MetaFunction("between", Arity::Ternary(), &between_doc,
+                     GetDefaultBetweenOptions()) {}
+
+  Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                            const FunctionOptions* options,
+                            ExecContext* ctx) const override {
+    const BetweenOptions& between_options = static_cast<const 
BetweenOptions&>(*options);
+    Datum result;

Review comment:
       Thanks for your feedback. Ok, on moving the functions to within the 
dispatch for between.  Underlying APIs should support natural integration into 
other programming languages and not force Pandas/Python approach, but also be 
easy to maintain and at least consistent for related functions. Would it make 
sense to then also use one compare function, but have functions on top of this 
for `greater`, `less` etc.? The `not_between` function could then be 
implemented by calling `between` and reversing the result.
   
   I understand that Pandas has lot of influence. In addition to databases, 
there are other libraries such as
   - [Ruby Daru](https://github.com/ankane/rover)
   - [R Dplyr](https://dplyr.tidyverse.org/reference/between.html)
   - [Hive Between](https://sqlandhadoop.com/hive-between/)
   - [Ruby Rover](https://github.com/ankane/rover)
   - [Ruby CSV 
table](https://ruby-doc.org/stdlib-3.1.0/libdoc/csv/rdoc/CSV/Table.html)
   - [Link 
compare](https://docs.microsoft.com/en-us/dotnet/visual-basic/language-reference/operators/comparison-operators)
   - [Apache Calcite Algebra](https://calcite.apache.org/docs/algebra.html)
   
   For many of these, chained operations is the way to implement between. It is 
unclear if one can write a Kernel generator that would minimize loading data 
from memory for chained operations - Calcite approach seems useful, but need to 
understand it better. Will ask on mailing list for thoughts on these kind of 
approaches since they would be something new.
   
   Finally, I have not done much performance measurement and optimization of 
these kernels, but this can probably be done once tests and relevant 
functionality are working.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] bkmgit commented on a change in pull request #11882: ARROW-9843: [C++] Implement Between ternary kernel

Reply via email to