lidavidm commented on a change in pull request #11233:
URL: https://github.com/apache/arrow/pull/11233#discussion_r730876707



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -4173,48 +4238,50 @@ void MakeUnaryStringUTF8TransformKernel(std::string 
name, FunctionRegistry* regi
 using StringPredicate =
     std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
 
-template <typename Type>
-Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
-                      StringPredicate predicate, Datum* out) {
-  Status st = Status::OK();
-  EnsureLookupTablesFilled();
-  if (batch[0].kind() == Datum::ARRAY) {
-    const ArrayData& input = *batch[0].array();
-    ArrayIterator<Type> input_it(input);
-    ArrayData* out_arr = out->mutable_array();
-    ::arrow::internal::GenerateBitsUnrolled(
-        out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
-        [&]() -> bool {
-          util::string_view val = input_it();
-          return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), 
val.size(),
-                           &st);
-        });
-  } else {
-    const auto& input = checked_cast<const 
BaseBinaryScalar&>(*batch[0].scalar());
-    if (input.is_valid) {
-      bool boolean_result = predicate(ctx, input.value->data(),
-                                      
static_cast<size_t>(input.value->size()), &st);
-      // UTF decoding can lead to issues
-      if (st.ok()) {
-        out->value = std::make_shared<BooleanScalar>(boolean_result);
+template <typename Type, typename Predicate>
+struct StringPredicateFunctor {
+  static Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
+                               StringPredicate predicate, Datum* out) {
+    Status st = Status::OK();
+    EnsureLookupTablesFilled();
+    if (batch[0].kind() == Datum::ARRAY) {
+      const ArrayData& input = *batch[0].array();
+      ArrayIterator<Type> input_it(input);
+      ArrayData* out_arr = out->mutable_array();
+      ::arrow::internal::GenerateBitsUnrolled(
+          out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
+          [&]() -> bool {
+            util::string_view val = input_it();
+            return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()),
+                             val.size(), &st);
+          });
+    } else {
+      const auto& input = checked_cast<const 
BaseBinaryScalar&>(*batch[0].scalar());
+      if (input.is_valid) {
+        bool boolean_result = predicate(ctx, input.value->data(),
+                                        
static_cast<size_t>(input.value->size()), &st);
+        // UTF decoding can lead to issues
+        if (st.ok()) {
+          out->value = std::make_shared<BooleanScalar>(boolean_result);
+        }
       }
     }
+    return st;
   }
-  return st;
-}
+
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    return ApplyPredicate(ctx, batch, Predicate::Call, out);

Review comment:
       Why not get rid of the parameter and inline Predicate::Call above?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -88,19 +102,316 @@ class BaseTestStringKernels : public ::testing::Test {
   std::shared_ptr<DataType> offset_type() {
     return TypeTraits<OffsetType>::type_singleton();
   }
+
+  template <typename CType = const char*>
+  std::shared_ptr<Array> MakeArray(const std::vector<CType>& values,
+                                   const std::vector<bool>& is_valid = {}) {
+    return _MakeArray<TestType, CType>(type(), values, is_valid);
+  }
 };
 
+template <typename TestType>
+class TestBaseBinaryKernels : public BaseTestStringKernels<TestType> {};
+
+TYPED_TEST_SUITE(TestBaseBinaryKernels, BaseBinaryArrowTypes);
+
 template <typename TestType>
 class TestBinaryKernels : public BaseTestStringKernels<TestType> {};
 
 TYPED_TEST_SUITE(TestBinaryKernels, BinaryArrowTypes);
 
-TYPED_TEST(TestBinaryKernels, BinaryLength) {
+template <typename TestType>
+class TestStringKernels : public BaseTestStringKernels<TestType> {};
+
+TYPED_TEST_SUITE(TestStringKernels, StringArrowTypes);
+
+TYPED_TEST(TestBaseBinaryKernels, BinaryLength) {
   this->CheckUnary("binary_length", R"(["aaa", null, "áéíóú", "", "b"])",
                    this->offset_type(), "[3, null, 10, 0, 1]");
+
+  // Invalid UTF-8 inputs
+  this->CheckUnary("binary_length", this->MakeArray({"\xf7\x0f\xab", 
"\xff\x9b\xc3\xbb"}),
+                   this->offset_type(), "[3, 4]");
+
+  // Invalid UTF-8 inputs with null bytes
+  this->CheckUnary("binary_length",
+                   this->template MakeArray<std::string>(
+                       {{"\xf7\x00\xab", 3}, {"\x00\x9b\x00\xbb", 4}, 
{"\x00\x00", 2}}),
+                   this->offset_type(), "[3, 4, 2]");
+}
+
+// The NonUtf8XXX tests use kernels that do not accept invalid UTF-8 when
+// processing [Large]StringType data. These tests use invalid UTF-8 inputs.
+TYPED_TEST(TestBinaryKernels, NonUtf8) {
+#ifdef ARROW_WITH_RE2
+  for (auto ignore_case : {true, false}) {
+#else
+  for (auto ignore_case : {false}) {
+#endif
+    MatchSubstringOptions options("\xfc\x40", ignore_case);
+    this->CheckUnary(
+        "find_substring",
+        this->MakeArray({"\xfc\x40\xab", "\xff\x9b\xfc\x40\xab", 
"\x01\xfc\x41"}),
+        this->offset_type(), "[0, 2, -1]", &options);
+
+    // @ = \x40

Review comment:
       nit: why not just either consistently use \x40 or @?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -4173,48 +4238,50 @@ void MakeUnaryStringUTF8TransformKernel(std::string 
name, FunctionRegistry* regi
 using StringPredicate =
     std::function<bool(KernelContext*, const uint8_t*, size_t, Status*)>;
 
-template <typename Type>
-Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
-                      StringPredicate predicate, Datum* out) {
-  Status st = Status::OK();
-  EnsureLookupTablesFilled();
-  if (batch[0].kind() == Datum::ARRAY) {
-    const ArrayData& input = *batch[0].array();
-    ArrayIterator<Type> input_it(input);
-    ArrayData* out_arr = out->mutable_array();
-    ::arrow::internal::GenerateBitsUnrolled(
-        out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
-        [&]() -> bool {
-          util::string_view val = input_it();
-          return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), 
val.size(),
-                           &st);
-        });
-  } else {
-    const auto& input = checked_cast<const 
BaseBinaryScalar&>(*batch[0].scalar());
-    if (input.is_valid) {
-      bool boolean_result = predicate(ctx, input.value->data(),
-                                      
static_cast<size_t>(input.value->size()), &st);
-      // UTF decoding can lead to issues
-      if (st.ok()) {
-        out->value = std::make_shared<BooleanScalar>(boolean_result);
+template <typename Type, typename Predicate>
+struct StringPredicateFunctor {
+  static Status ApplyPredicate(KernelContext* ctx, const ExecBatch& batch,
+                               StringPredicate predicate, Datum* out) {
+    Status st = Status::OK();
+    EnsureLookupTablesFilled();
+    if (batch[0].kind() == Datum::ARRAY) {
+      const ArrayData& input = *batch[0].array();
+      ArrayIterator<Type> input_it(input);
+      ArrayData* out_arr = out->mutable_array();
+      ::arrow::internal::GenerateBitsUnrolled(
+          out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
+          [&]() -> bool {
+            util::string_view val = input_it();
+            return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()),
+                             val.size(), &st);
+          });
+    } else {
+      const auto& input = checked_cast<const 
BaseBinaryScalar&>(*batch[0].scalar());
+      if (input.is_valid) {
+        bool boolean_result = predicate(ctx, input.value->data(),
+                                        
static_cast<size_t>(input.value->size()), &st);
+        // UTF decoding can lead to issues
+        if (st.ok()) {
+          out->value = std::make_shared<BooleanScalar>(boolean_result);
+        }
       }
     }
+    return st;
   }
-  return st;
-}
+
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    return ApplyPredicate(ctx, batch, Predicate::Call, out);

Review comment:
       And since ApplyPredicate isn't really used outside this functor, why not 
just inline ApplyPredicate into Exec?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to