westonpace commented on a change in pull request #11233:
URL: https://github.com/apache/arrow/pull/11233#discussion_r728451115



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2902,9 +2929,9 @@ struct ExtractRegexData {
       // No input type specified => propagate shape
       return args[0];
     }
-    // Input type is either String or LargeString and is also the type of each
+    // Input type is [Large]Binary or [Large]String and is also the type of 
each

Review comment:
       Minor Nit: ...or fixed-size binary

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2601,10 +2625,11 @@ struct RegexSubStringReplacer {
 
   // Using RE2::FindAndConsume we can only find the pattern if it is a group, 
therefore
   // we have 2 regexes, one with () around it, one without.
-  explicit RegexSubStringReplacer(const ReplaceSubstringOptions& options)
+  explicit RegexSubstringReplacer(const ReplaceSubstringOptions& options)
       : options_(options),
-        regex_find_("(" + options_.pattern + ")", RE2::Quiet),
-        regex_replacement_(options_.pattern, RE2::Quiet) {}
+        regex_find_("(" + options_.pattern + ")",
+                    Type::is_utf8 ? RE2::Quiet : RE2::Latin1),
+        regex_replacement_(options_.pattern, Type::is_utf8 ? RE2::Quiet : 
RE2::Latin1) {}

Review comment:
       Can we do `RE2::Quiet | RE2::Latin1` if `Type::is_utf8`?  Or is the only 
possible error a code-point / decoding error which is just not possible with 
latin1?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -3038,23 +3062,17 @@ const FunctionDoc extract_regex_doc(
 void AddExtractRegex(FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>("extract_regex", Arity::Unary(),
                                                &extract_regex_doc);
-  using t32 = ExtractRegex<StringType>;
-  using t64 = ExtractRegex<LargeStringType>;
   OutputType out_ty(ResolveExtractRegexOutput);
-  ScalarKernel kernel;
-
-  // Null values will be computed based on regex match or not
-  kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
-  kernel.mem_allocation = MemAllocation::NO_PREALLOCATE;
-  kernel.signature.reset(new KernelSignature({utf8()}, out_ty));
-  kernel.exec = t32::Exec;
-  kernel.init = t32::State::Init;
-  DCHECK_OK(func->AddKernel(kernel));
-  kernel.signature.reset(new KernelSignature({large_utf8()}, out_ty));
-  kernel.exec = t64::Exec;
-  kernel.init = t64::State::Init;
-  DCHECK_OK(func->AddKernel(kernel));
-
+  for (const auto& ty : StringTypes()) {

Review comment:
       Should this include binary types?

##########
File path: cpp/src/arrow/testing/gtest_util.h
##########
@@ -175,6 +175,8 @@ using TemporalArrowTypes =
 
 using DecimalArrowTypes = ::testing::Types<Decimal128Type, Decimal256Type>;
 
+using BinaryBaseArrowTypes = ::testing::Types<BinaryType, LargeBinaryType>;
+
 using BinaryArrowTypes =

Review comment:
       Should we add FixedSizeBinaryType to this list?

##########
File path: cpp/src/arrow/testing/gtest_util.h
##########
@@ -175,6 +175,8 @@ using TemporalArrowTypes =
 
 using DecimalArrowTypes = ::testing::Types<Decimal128Type, Decimal256Type>;
 
+using BinaryBaseArrowTypes = ::testing::Types<BinaryType, LargeBinaryType>;
+

Review comment:
       Nit: This is a little misleading because string extends BaseBinaryType 
so I think of it as "binary base"

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -3592,15 +3617,15 @@ void AddBinaryLength(FunctionRegistry* registry) {
 void AddUtf8Length(FunctionRegistry* registry) {
   auto func =
       std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), 
&utf8_length_doc);
-
-  ArrayKernelExec exec_offset_32 =
-      applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
-  DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));
-
-  ArrayKernelExec exec_offset_64 =
-      applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, 
Utf8Length>::Exec;
-  DCHECK_OK(func->AddKernel({large_utf8()}, int64(), 
std::move(exec_offset_64)));
-
+  {
+    auto exec = applicator::ScalarUnaryNotNull<Int32Type, StringType, 
Utf8Length>::Exec;
+    DCHECK_OK(func->AddKernel({utf8()}, int32(), exec));
+  }
+  {
+    auto exec =
+        applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, 
Utf8Length>::Exec;
+    DCHECK_OK(func->AddKernel({large_utf8()}, int64(), exec));
+  }

Review comment:
       Should you use `std::move(exec)`?  It doesn't look like we move execs 
elsewhere but we were doing it previously.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to