This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1214083f7e ARROW-17135: [C++] Reduce code size in compute/kernels/scalar_compare.cc (#13654)
1214083f7e is described below

commit 1214083f7ece4e1797b7f3cdecfec1c2cfa8bf89
Author: Wes McKinney <[email protected]>
AuthorDate: Wed Jul 20 13:12:23 2022 -0700

    ARROW-17135: [C++] Reduce code size in compute/kernels/scalar_compare.cc (#13654)
    
    This "leaner" implementation reduces the generated code size of this C++
    file from 2307768 bytes to 1192608 bytes in gcc 10.3.0. The benchmarks are
    also faster (on my avx2 laptop):
    
    before
    
    ```
    -----------------------------------------------------------------------------------------------
    Benchmark                                     Time             CPU   Iterations UserCounters...
    -----------------------------------------------------------------------------------------------
    GreaterArrayArrayInt64/32768/10000         32.1 us         32.1 us        21533 items_per_second=1020.16M/s null_percent=0.01 size=32.768k
    GreaterArrayArrayInt64/32768/100           32.1 us         32.1 us        21603 items_per_second=1019.27M/s null_percent=1 size=32.768k
    GreaterArrayArrayInt64/32768/10            32.1 us         32.1 us        21479 items_per_second=1020.82M/s null_percent=10 size=32.768k
    GreaterArrayArrayInt64/32768/2             32.0 us         32.0 us        21468 items_per_second=1023.12M/s null_percent=50 size=32.768k
    GreaterArrayArrayInt64/32768/1             32.3 us         32.3 us        21720 items_per_second=1013.44M/s null_percent=100 size=32.768k
    GreaterArrayArrayInt64/32768/0             31.6 us         31.6 us        21828 items_per_second=1036.94M/s null_percent=0 size=32.768k
    GreaterArrayScalarInt64/32768/10000        20.8 us         20.8 us        33461 items_per_second=1.57238G/s null_percent=0.01 size=32.768k
    GreaterArrayScalarInt64/32768/100          20.9 us         20.9 us        33625 items_per_second=1.56611G/s null_percent=1 size=32.768k
    GreaterArrayScalarInt64/32768/10           20.8 us         20.8 us        33553 items_per_second=1.57338G/s null_percent=10 size=32.768k
    GreaterArrayScalarInt64/32768/2            20.9 us         20.9 us        33348 items_per_second=1.5687G/s null_percent=50 size=32.768k
    GreaterArrayScalarInt64/32768/1            20.9 us         20.9 us        33419 items_per_second=1.56879G/s null_percent=100 size=32.768k
    GreaterArrayScalarInt64/32768/0            20.5 us         20.5 us        34116 items_per_second=1.59837G/s null_percent=0 size=32.768k
    ```
    
    after
    
    ```
    -----------------------------------------------------------------------------------------------
    Benchmark                                     Time             CPU   Iterations UserCounters...
    -----------------------------------------------------------------------------------------------
    GreaterArrayArrayInt64/32768/10000         18.1 us         18.1 us        38751 items_per_second=1.81199G/s null_percent=0.01 size=32.768k
    GreaterArrayArrayInt64/32768/100           17.5 us         17.5 us        39374 items_per_second=1.86821G/s null_percent=1 size=32.768k
    GreaterArrayArrayInt64/32768/10            19.0 us         19.0 us        33941 items_per_second=1.72066G/s null_percent=10 size=32.768k
    GreaterArrayArrayInt64/32768/2             18.0 us         18.0 us        39589 items_per_second=1.81817G/s null_percent=50 size=32.768k
    GreaterArrayArrayInt64/32768/1             18.1 us         18.1 us        39061 items_per_second=1.80719G/s null_percent=100 size=32.768k
    GreaterArrayArrayInt64/32768/0             17.5 us         17.5 us        39813 items_per_second=1.87031G/s null_percent=0 size=32.768k
    GreaterArrayScalarInt64/32768/10000        16.3 us         16.3 us        42281 items_per_second=2.01525G/s null_percent=0.01 size=32.768k
    GreaterArrayScalarInt64/32768/100          16.5 us         16.5 us        42266 items_per_second=1.98195G/s null_percent=1 size=32.768k
    GreaterArrayScalarInt64/32768/10           16.5 us         16.5 us        41872 items_per_second=1.98615G/s null_percent=10 size=32.768k
    GreaterArrayScalarInt64/32768/2            16.3 us         16.3 us        42130 items_per_second=2.00447G/s null_percent=50 size=32.768k
    GreaterArrayScalarInt64/32768/1            16.2 us         16.2 us        42391 items_per_second=2.02296G/s null_percent=100 size=32.768k
    GreaterArrayScalarInt64/32768/0            15.9 us         15.9 us        43498 items_per_second=2.0614G/s null_percent=0 size=32.768k
    ```
    
    Authored-by: Wes McKinney <[email protected]>
    Signed-off-by: Wes McKinney <[email protected]>
---
 cpp/src/arrow/compute/kernels/codegen_internal.cc  |   4 -
 cpp/src/arrow/compute/kernels/codegen_internal.h   |  33 +--
 cpp/src/arrow/compute/kernels/scalar_arithmetic.cc |   8 +-
 cpp/src/arrow/compute/kernels/scalar_compare.cc    | 241 +++++++++++++++++----
 cpp/src/arrow/util/bit_util.h                      |  10 +
 cpp/tools/binary_symbol_explore.py                 |   1 +
 6 files changed, 226 insertions(+), 71 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc 
b/cpp/src/arrow/compute/kernels/codegen_internal.cc
index 66724727fd..7be51188e0 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -29,10 +29,6 @@ namespace arrow {
 namespace compute {
 namespace internal {
 
-Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
-  return Status::NotImplemented("This kernel is malformed");
-}
-
 const std::vector<std::shared_ptr<DataType>>& ExampleParametricTypes() {
   static DataTypeVector example_parametric_types = {
       decimal128(12, 2),
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h 
b/cpp/src/arrow/compute/kernels/codegen_internal.h
index f008314e8b..a6ede14176 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -964,8 +964,6 @@ struct FailFunctor<VectorKernel::ChunkedExec> {
   }
 };
 
-Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out);
-
 // GD for numeric types (integer and floating point)
 template <template <typename...> class Generator, typename Type0,
           typename KernelType = ArrayKernelExec, typename... Args>
@@ -1009,7 +1007,7 @@ ArrayKernelExec GenerateFloatingPoint(detail::GetTypeId 
get_id) {
       return Generator<Type0, DoubleType, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1037,7 +1035,7 @@ ArrayKernelExec GenerateInteger(detail::GetTypeId get_id) 
{
       return Generator<Type0, UInt64Type, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1068,7 +1066,7 @@ ArrayKernelExec GeneratePhysicalInteger(detail::GetTypeId 
get_id) {
       return Generator<Type0, UInt64Type, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1104,8 +1102,9 @@ KernelType ArithmeticExecFromOp(detail::GetTypeId get_id) 
{
   }
 }
 
-template <template <typename... Args> class Generator, typename... Args>
-ArrayKernelExec GeneratePhysicalNumeric(detail::GetTypeId get_id) {
+template <typename ReturnType, template <typename... Args> class Generator,
+          typename... Args>
+ReturnType GeneratePhysicalNumericGeneric(detail::GetTypeId get_id) {
   switch (get_id.id) {
     case Type::INT8:
       return Generator<Int8Type, Args...>::Exec;
@@ -1135,9 +1134,13 @@ ArrayKernelExec 
GeneratePhysicalNumeric(detail::GetTypeId get_id) {
       return Generator<DoubleType, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
+template <template <typename... Args> class Generator, typename... Args>
+ArrayKernelExec GeneratePhysicalNumeric(detail::GetTypeId get_id) {
+  return GeneratePhysicalNumericGeneric<ArrayKernelExec, Generator, 
Args...>(get_id);
+}
 
 // Generate a kernel given a templated functor for decimal types
 template <template <typename... Args> class Generator, typename... Args>
@@ -1149,7 +1152,7 @@ ArrayKernelExec 
GenerateDecimalToDecimal(detail::GetTypeId get_id) {
       return Generator<Decimal256Type, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1169,7 +1172,7 @@ ArrayKernelExec GenerateSignedInteger(detail::GetTypeId 
get_id) {
       return Generator<Type0, Int64Type, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1249,7 +1252,7 @@ ArrayKernelExec 
GenerateVarBinaryToVarBinary(detail::GetTypeId get_id) {
       return Generator<LargeStringType, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1270,7 +1273,7 @@ ArrayKernelExec GenerateVarBinaryBase(detail::GetTypeId 
get_id) {
       return Generator<Type0, LargeBinaryType, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1288,7 +1291,7 @@ ArrayKernelExec GenerateVarBinary(detail::GetTypeId 
get_id) {
       return Generator<Type0, LargeStringType, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1312,7 +1315,7 @@ ArrayKernelExec GenerateTemporal(detail::GetTypeId 
get_id) {
       return Generator<Type0, TimestampType, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1328,7 +1331,7 @@ ArrayKernelExec GenerateDecimal(detail::GetTypeId get_id) 
{
       return Generator<Type0, Decimal256Type, Args...>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc 
b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
index e513e07d49..984c3b5653 100644
--- a/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
@@ -1021,7 +1021,7 @@ ArrayKernelExec 
TypeAgnosticBitWiseExecFromOp(detail::GetTypeId get_id) {
       return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1046,7 +1046,7 @@ ArrayKernelExec ShiftExecFromOp(detail::GetTypeId get_id) 
{
       return KernelGenerator<UInt64Type, UInt64Type, Op>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1059,7 +1059,7 @@ ArrayKernelExec 
GenerateArithmeticFloatingPoint(detail::GetTypeId get_id) {
       return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
@@ -1188,7 +1188,7 @@ ArrayKernelExec 
GenerateArithmeticWithFixedIntOutType(detail::GetTypeId get_id)
       return KernelGenerator<DoubleType, DoubleType, Op>::Exec;
     default:
       DCHECK(false);
-      return ExecFail;
+      return nullptr;
   }
 }
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_compare.cc 
b/cpp/src/arrow/compute/kernels/scalar_compare.cc
index 07778ca113..f071986dd2 100644
--- a/cpp/src/arrow/compute/kernels/scalar_compare.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_compare.cc
@@ -158,11 +158,145 @@ struct Maximum {
 
 // Implement Less, LessEqual by flipping arguments to Greater, GreaterEqual
 
-template <typename OutType, typename ArgType, typename Op>
-struct CompareTimestamps
-    : public applicator::ScalarBinaryEqualTypes<OutType, ArgType, Op> {
-  using Base = applicator::ScalarBinaryEqualTypes<OutType, ArgType, Op>;
+template <typename Type, typename Op>
+struct ComparePrimitiveArrayArray {
+  using T = typename Type::c_type;
+  static void Exec(const void* left_values_void, const void* right_values_void,
+                   int64_t length, void* out_bitmap_void) {
+    const T* left_values = reinterpret_cast<const T*>(left_values_void);
+    const T* right_values = reinterpret_cast<const T*>(right_values_void);
+    uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_bitmap_void);
+    static constexpr int kBatchSize = 32;
+    int64_t num_batches = length / kBatchSize;
+    uint32_t temp_output[kBatchSize];
+    for (int64_t j = 0; j < num_batches; ++j) {
+      for (int i = 0; i < kBatchSize; ++i) {
+        temp_output[i] = Op::template Call<bool, T, T>(nullptr, *left_values++,
+                                                       *right_values++, 
nullptr);
+      }
+      bit_util::PackBits<kBatchSize>(temp_output, out_bitmap);
+      out_bitmap += kBatchSize / 8;
+    }
+    int64_t bit_index = 0;
+    for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+      bit_util::SetBitTo(out_bitmap, bit_index++,
+                         Op::template Call<bool, T, T>(nullptr, *left_values++,
+                                                       *right_values++, 
nullptr));
+    }
+  }
+};
+
+template <typename Type, typename Op>
+struct ComparePrimitiveArrayScalar {
+  using T = typename Type::c_type;
+  static void Exec(const void* left_values_void, const void* right_value_void,
+                   int64_t length, void* out_bitmap_void) {
+    const T* left_values = reinterpret_cast<const T*>(left_values_void);
+    const T right_value = *reinterpret_cast<const T*>(right_value_void);
+    uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_bitmap_void);
+    static constexpr int kBatchSize = 32;
+    int64_t num_batches = length / kBatchSize;
+    uint32_t temp_output[kBatchSize];
+    for (int64_t j = 0; j < num_batches; ++j) {
+      for (int i = 0; i < kBatchSize; ++i) {
+        temp_output[i] =
+            Op::template Call<bool, T, T>(nullptr, *left_values++, 
right_value, nullptr);
+      }
+      bit_util::PackBits<kBatchSize>(temp_output, out_bitmap);
+      out_bitmap += kBatchSize / 8;
+    }
+    int64_t bit_index = 0;
+    for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+      bit_util::SetBitTo(
+          out_bitmap, bit_index++,
+          Op::template Call<bool, T, T>(nullptr, *left_values++, right_value, 
nullptr));
+    }
+  }
+};
+
+template <typename Type, typename Op>
+struct ComparePrimitiveScalarArray {
+  using T = typename Type::c_type;
+  static void Exec(const void* left_value_void, const void* right_values_void,
+                   int64_t length, void* out_bitmap_void) {
+    const T left_value = *reinterpret_cast<const T*>(left_value_void);
+    const T* right_values = reinterpret_cast<const T*>(right_values_void);
+    uint8_t* out_bitmap = reinterpret_cast<uint8_t*>(out_bitmap_void);
+    static constexpr int kBatchSize = 32;
+    int64_t num_batches = length / kBatchSize;
+    uint32_t temp_output[kBatchSize];
+    for (int64_t j = 0; j < num_batches; ++j) {
+      for (int i = 0; i < kBatchSize; ++i) {
+        temp_output[i] =
+            Op::template Call<bool, T, T>(nullptr, left_value, 
*right_values++, nullptr);
+      }
+      bit_util::PackBits<kBatchSize>(temp_output, out_bitmap);
+      out_bitmap += kBatchSize / 8;
+    }
+    int64_t bit_index = 0;
+    for (int64_t j = kBatchSize * num_batches; j < length; ++j) {
+      bit_util::SetBitTo(
+          out_bitmap, bit_index++,
+          Op::template Call<bool, T, T>(nullptr, left_value, *right_values++, 
nullptr));
+    }
+  }
+};
+
+using BinaryKernel = void (*)(const void*, const void*, int64_t, void*);
 
+struct CompareData : public KernelState {
+  BinaryKernel func_aa;
+  BinaryKernel func_sa;
+  BinaryKernel func_as;
+  CompareData(BinaryKernel func_aa, BinaryKernel func_sa, BinaryKernel func_as)
+      : func_aa(func_aa), func_sa(func_sa), func_as(func_as) {}
+};
+
+template <typename Type>
+struct CompareKernel {
+  using T = typename Type::c_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto kernel = static_cast<const ScalarKernel*>(ctx->kernel());
+    DCHECK(kernel);
+    const auto kernel_data = checked_cast<const 
CompareData*>(kernel->data.get());
+
+    ArraySpan* out_arr = out->array_span();
+
+    // TODO: implement path for offset not multiple of 8
+    const bool out_is_byte_aligned = out_arr->offset % 8 == 0;
+
+    std::shared_ptr<Buffer> out_buffer_tmp;
+    uint8_t* out_buffer;
+    if (out_is_byte_aligned) {
+      out_buffer = out_arr->buffers[1].data + out_arr->offset / 8;
+    } else {
+      ARROW_ASSIGN_OR_RAISE(out_buffer_tmp,
+                            
ctx->Allocate(bit_util::BytesForBits(batch.length)));
+      out_buffer = out_buffer_tmp->mutable_data();
+    }
+    if (batch[0].is_array() && batch[1].is_array()) {
+      kernel_data->func_aa(batch[0].array.GetValues<T>(1), 
batch[1].array.GetValues<T>(1),
+                           batch.length, out_buffer);
+    } else if (batch[1].is_scalar()) {
+      T value = UnboxScalar<Type>::Unbox(*batch[1].scalar);
+      kernel_data->func_as(batch[0].array.GetValues<T>(1), &value, 
batch.length,
+                           out_buffer);
+    } else {
+      T value = UnboxScalar<Type>::Unbox(*batch[0].scalar);
+      kernel_data->func_sa(&value, batch[1].array.GetValues<T>(1), 
batch.length,
+                           out_buffer);
+    }
+    if (!out_is_byte_aligned) {
+      ::arrow::internal::CopyBitmap(out_buffer, /*offset=*/0, batch.length,
+                                    out_arr->buffers[1].data, out_arr->offset);
+    }
+    return Status::OK();
+  }
+};
+
+template <typename Op>
+struct CompareTimestamps {
   static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
     const auto& lhs = checked_cast<const TimestampType&>(*batch[0].type());
     const auto& rhs = checked_cast<const TimestampType&>(*batch[1].type());
@@ -171,22 +305,34 @@ struct CompareTimestamps
           "Cannot compare timestamp with timezone to timestamp without 
timezone, got: ",
           lhs, " and ", rhs);
     }
-    return Base::Exec(ctx, batch, out);
+    return CompareKernel<Int64Type>::Exec(ctx, batch, out);
   }
 };
 
 template <typename Op>
-void AddIntegerCompare(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
-  auto exec =
-      GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, BooleanType, 
Op>(*ty);
-  DCHECK_OK(func->AddKernel({ty, ty}, boolean(), std::move(exec)));
+ScalarKernel GetCompareKernel(InputType ty, Type::type compare_type,
+                              ArrayKernelExec exec) {
+  ScalarKernel kernel;
+  kernel.signature = KernelSignature::Make({ty, ty}, boolean());
+  BinaryKernel func_aa =
+      GeneratePhysicalNumericGeneric<BinaryKernel, ComparePrimitiveArrayArray, 
Op>(
+          compare_type);
+  BinaryKernel func_sa =
+      GeneratePhysicalNumericGeneric<BinaryKernel, 
ComparePrimitiveScalarArray, Op>(
+          compare_type);
+  BinaryKernel func_as =
+      GeneratePhysicalNumericGeneric<BinaryKernel, 
ComparePrimitiveArrayScalar, Op>(
+          compare_type);
+  kernel.data = std::make_shared<CompareData>(func_aa, func_sa, func_as);
+  kernel.exec = exec;
+  return kernel;
 }
 
-template <typename InType, typename Op>
-void AddGenericCompare(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
-  DCHECK_OK(
-      func->AddKernel({ty, ty}, boolean(),
-                      applicator::ScalarBinaryEqualTypes<BooleanType, InType, 
Op>::Exec));
+template <typename Op>
+void AddPrimitiveCompare(const std::shared_ptr<DataType>& ty, ScalarFunction* 
func) {
+  ArrayKernelExec exec = GeneratePhysicalNumeric<CompareKernel>(ty);
+  ScalarKernel kernel = GetCompareKernel<Op>(ty, ty->id(), exec);
+  DCHECK_OK(func->AddKernel(kernel));
 }
 
 struct CompareFunction : ScalarFunction {
@@ -247,45 +393,37 @@ std::shared_ptr<ScalarFunction> 
MakeCompareFunction(std::string name, FunctionDo
       {boolean(), boolean()}, boolean(),
       applicator::ScalarBinary<BooleanType, BooleanType, BooleanType, 
Op>::Exec));
 
-  for (const std::shared_ptr<DataType>& ty : IntTypes()) {
-    AddIntegerCompare<Op>(ty, func.get());
+  for (const std::shared_ptr<DataType>& ty : NumericTypes()) {
+    AddPrimitiveCompare<Op>(ty, func.get());
   }
-  AddIntegerCompare<Op>(date32(), func.get());
-  AddIntegerCompare<Op>(date64(), func.get());
-
-  AddGenericCompare<FloatType, Op>(float32(), func.get());
-  AddGenericCompare<DoubleType, Op>(float64(), func.get());
+  AddPrimitiveCompare<Op>(date32(), func.get());
+  AddPrimitiveCompare<Op>(date64(), func.get());
 
   // Add timestamp kernels
   for (auto unit : TimeUnit::values()) {
     InputType in_type(match::TimestampTypeUnit(unit));
-    DCHECK_OK(func->AddKernel({in_type, in_type}, boolean(),
-                              CompareTimestamps<BooleanType, TimestampType, 
Op>::Exec));
+    ScalarKernel kernel =
+        GetCompareKernel<Op>(in_type, Type::INT64, 
CompareTimestamps<Op>::Exec);
+    DCHECK_OK(func->AddKernel(kernel));
   }
 
   // Duration
   for (auto unit : TimeUnit::values()) {
     InputType in_type(match::DurationTypeUnit(unit));
-    auto exec =
-        GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, 
BooleanType, Op>(
-            int64());
-    DCHECK_OK(func->AddKernel({in_type, in_type}, boolean(), std::move(exec)));
+    ArrayKernelExec exec = GeneratePhysicalNumeric<CompareKernel>(int64());
+    DCHECK_OK(func->AddKernel(GetCompareKernel<Op>(in_type, Type::INT64, 
exec)));
   }
 
   // Time32 and Time64
   for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI}) {
     InputType in_type(match::Time32TypeUnit(unit));
-    auto exec =
-        GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, 
BooleanType, Op>(
-            int32());
-    DCHECK_OK(func->AddKernel({in_type, in_type}, boolean(), std::move(exec)));
+    ArrayKernelExec exec = GeneratePhysicalNumeric<CompareKernel>(int32());
+    DCHECK_OK(func->AddKernel(GetCompareKernel<Op>(in_type, Type::INT32, 
exec)));
   }
   for (auto unit : {TimeUnit::MICRO, TimeUnit::NANO}) {
     InputType in_type(match::Time64TypeUnit(unit));
-    auto exec =
-        GeneratePhysicalInteger<applicator::ScalarBinaryEqualTypes, 
BooleanType, Op>(
-            int64());
-    DCHECK_OK(func->AddKernel({in_type, in_type}, boolean(), std::move(exec)));
+    ArrayKernelExec exec = GeneratePhysicalNumeric<CompareKernel>(int64());
+    DCHECK_OK(func->AddKernel(GetCompareKernel<Op>(in_type, Type::INT64, 
exec)));
   }
 
   for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
@@ -310,30 +448,37 @@ std::shared_ptr<ScalarFunction> 
MakeCompareFunction(std::string name, FunctionDo
   return func;
 }
 
-struct FlippedData : public KernelState {
+struct FlippedData : public CompareData {
   ArrayKernelExec unflipped_exec;
-  explicit FlippedData(ArrayKernelExec unflipped_exec) : 
unflipped_exec(unflipped_exec) {}
+  explicit FlippedData(ArrayKernelExec unflipped_exec, BinaryKernel func_aa = 
nullptr,
+                       BinaryKernel func_sa = nullptr, BinaryKernel func_as = 
nullptr)
+      : CompareData{func_aa, func_sa, func_as}, unflipped_exec(unflipped_exec) 
{}
 };
 
-Status FlippedBinaryExec(KernelContext* ctx, const ExecSpan& span, ExecResult* 
out) {
+Status FlippedCompare(KernelContext* ctx, const ExecSpan& span, ExecResult* 
out) {
   const auto kernel = static_cast<const ScalarKernel*>(ctx->kernel());
-  DCHECK(kernel);
-  const auto kernel_data = static_cast<const FlippedData*>(kernel->data.get());
-
+  const auto kernel_data = checked_cast<const 
FlippedData*>(kernel->data.get());
   ExecSpan flipped_span = span;
   std::swap(flipped_span.values[0], flipped_span.values[1]);
   return kernel_data->unflipped_exec(ctx, flipped_span, out);
 }
 
-std::shared_ptr<ScalarFunction> MakeFlippedFunction(std::string name,
-                                                    const ScalarFunction& func,
-                                                    FunctionDoc doc) {
+std::shared_ptr<ScalarFunction> MakeFlippedCompare(std::string name,
+                                                   const ScalarFunction& func,
+                                                   FunctionDoc doc) {
   auto flipped_func =
       std::make_shared<CompareFunction>(name, Arity::Binary(), std::move(doc));
   for (const ScalarKernel* kernel : func.kernels()) {
     ScalarKernel flipped_kernel = *kernel;
-    flipped_kernel.data = std::make_shared<FlippedData>(kernel->exec);
-    flipped_kernel.exec = FlippedBinaryExec;
+    if (kernel->data) {
+      auto compare_data = checked_cast<const CompareData*>(kernel->data.get());
+      flipped_kernel.data =
+          std::make_shared<FlippedData>(kernel->exec, compare_data->func_aa,
+                                        compare_data->func_sa, 
compare_data->func_as);
+    } else {
+      flipped_kernel.data = std::make_shared<FlippedData>(kernel->exec);
+    }
+    flipped_kernel.exec = FlippedCompare;
     DCHECK_OK(flipped_func->AddKernel(std::move(flipped_kernel)));
   }
   return flipped_func;
@@ -750,8 +895,8 @@ void RegisterScalarComparison(FunctionRegistry* registry) {
   auto greater_equal =
       MakeCompareFunction<GreaterEqual>("greater_equal", greater_equal_doc);
 
-  auto less = MakeFlippedFunction("less", *greater, less_doc);
-  auto less_equal = MakeFlippedFunction("less_equal", *greater_equal, 
less_equal_doc);
+  auto less = MakeFlippedCompare("less", *greater, less_doc);
+  auto less_equal = MakeFlippedCompare("less_equal", *greater_equal, 
less_equal_doc);
   DCHECK_OK(registry->AddFunction(std::move(less)));
   DCHECK_OK(registry->AddFunction(std::move(less_equal)));
   DCHECK_OK(registry->AddFunction(std::move(greater)));
diff --git a/cpp/src/arrow/util/bit_util.h b/cpp/src/arrow/util/bit_util.h
index 8583e10b22..04ab07af1d 100644
--- a/cpp/src/arrow/util/bit_util.h
+++ b/cpp/src/arrow/util/bit_util.h
@@ -353,5 +353,15 @@ constexpr Word SpliceWord(int n, Word low, Word high) {
   return (high & ~PrecedingWordBitmask<Word>(n)) | (low & 
PrecedingWordBitmask<Word>(n));
 }
 
+/// \brief Pack integers into a bitmap in batches of 8
+template <int batch_size>
+void PackBits(const uint32_t* values, uint8_t* out) {
+  for (int i = 0; i < batch_size / 8; ++i) {
+    *out++ = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 |
+              values[4] << 4 | values[5] << 5 | values[6] << 6 | values[7] << 
7);
+    values += 8;
+  }
+}
+
 }  // namespace bit_util
 }  // namespace arrow
diff --git a/cpp/tools/binary_symbol_explore.py 
b/cpp/tools/binary_symbol_explore.py
index dfe81cea84..d7fa54da1b 100644
--- a/cpp/tools/binary_symbol_explore.py
+++ b/cpp/tools/binary_symbol_explore.py
@@ -114,6 +114,7 @@ if __name__ == '__main__':
                                                   'contender', 'diff'])
         pd.options.display.max_rows = 1000
         pd.options.display.max_colwidth = 150
+        print(diff[diff['diff'] < - 700])
         print(diff[diff['diff'] > 700])
     else:
         # TODO

Reply via email to