[GitHub] [arrow] lidavidm commented on a change in pull request #11233: ARROW-13879: [C++] Mixed support for binary types in regex functions

GitBox Mon, 27 Sep 2021 06:27:43 -0700


lidavidm commented on a change in pull request #11233:
URL: https://github.com/apache/arrow/pull/11233#discussion_r716671796




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2201,11 +2201,11 @@ void AddSplitWhitespaceAscii(FunctionRegistry* 
registry) {
   auto func =
       std::make_shared<ScalarFunction>("ascii_split_whitespace", 
Arity::Unary(),
                                        &ascii_split_whitespace_doc, 
&default_options);
-  using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
-  using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
-  DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, 
t32::State::Init));
-  DCHECK_OK(
-      func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, 
t64::State::Init));
+
+  for (const auto& ty : StringTypes()) {
+    auto exec = GenerateTypeAgnosticVarBinary<SplitWhitespaceAsciiExec, 
ListType>(ty);
+    DCHECK_OK(func->AddKernel({ty}, {list(ty)}, exec, SplitState::Init));
+  }

Review comment:
       Ditto - I don't think this needs to support binary values.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -724,7 +724,7 @@ TYPED_TEST(TestStringKernels, IsUpperAscii) {
                    "[false, null, false, true, true, false, false]");
 }
 
-TYPED_TEST(TestStringKernels, MatchSubstring) {
+TYPED_TEST(TestBinaryKernels, MatchSubstring) {

Review comment:
       For any test that now includes binary values, we need to test invalid 
UTF-8.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2868,8 +2865,8 @@ struct ExtractRegex : public ExtractRegexBase {
       if (input.is_valid && Match(util::string_view(*input.value))) {
         result->value.reserve(group_count);
         for (int i = 0; i < group_count; i++) {
-          result->value.push_back(
-              std::make_shared<ScalarType>(found_values[i].as_string()));
+          result->value.push_back(std::make_shared<ScalarType>(
+              std::make_shared<Buffer>(ToStringView(found_values[i]))));

Review comment:
       This isn't safe, the string_view constructor is non-owning. We need to 
allocate a buffer/string here.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1628,12 +1627,10 @@ const FunctionDoc utf8_slice_codeunits_doc(
 void AddSlice(FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits", 
Arity::Unary(),
                                                &utf8_slice_codeunits_doc);
-  using t32 = SliceCodeunits<StringType>;
-  using t64 = SliceCodeunits<LargeStringType>;
-  DCHECK_OK(
-      func->AddKernel({utf8()}, utf8(), t32::Exec, 
SliceCodeunitsTransform::State::Init));
-  DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
-                            SliceCodeunitsTransform::State::Init));
+  for (const auto& ty : StringTypes()) {
+    auto exec = GenerateTypeAgnosticVarBinary<SliceCodeunits>(ty);

Review comment:
       This function is named utf8_slice_codeunits, it shouldn't support binary 
types. "codeunits" doesn't make sense on a non-Unicode string and the 
implementation uses UTF-8 functions to count bytes; it'll give incorrect 
results on an arbitrary binary string.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1132,52 +1132,51 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   {
     auto func = std::make_shared<ScalarFunction>("match_substring", 
Arity::Unary(),
                                                  &match_substring_doc);
-    auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
-    auto exec_64 = MatchSubstring<LargeStringType, 
PlainSubstringMatcher>::Exec;
-    DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, 
MatchSubstringState::Init));
-    DCHECK_OK(
-        func->AddKernel({large_utf8()}, boolean(), exec_64, 
MatchSubstringState::Init));
+    for (const auto& ty : BaseBinaryTypes()) {
+      auto exec =
+          GenerateTypeAgnosticVarBinaryBase<MatchSubstring, 
PlainSubstringMatcher>(ty);
+      DCHECK_OK(func->AddKernel({ty}, boolean(), exec, 
MatchSubstringState::Init));
+    }
     DCHECK_OK(registry->AddFunction(std::move(func)));
   }
   {
     auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
                                                  &match_substring_doc);
-    auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
-    auto exec_64 = MatchSubstring<LargeStringType, 
PlainStartsWithMatcher>::Exec;
-    DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, 
MatchSubstringState::Init));
-    DCHECK_OK(
-        func->AddKernel({large_utf8()}, boolean(), exec_64, 
MatchSubstringState::Init));
+    for (const auto& ty : BaseBinaryTypes()) {
+      auto exec =
+          GenerateTypeAgnosticVarBinaryBase<MatchSubstring, 
PlainStartsWithMatcher>(ty);
+      DCHECK_OK(func->AddKernel({ty}, boolean(), exec, 
MatchSubstringState::Init));
+    }

Review comment:
       I think for anything which supports an ignore_case flag, it should be 
disabled or ignored for the regex matcher. Note RE2 assumes input is either 
UTF-8 or Latin1 encoded; we should also test invalid UTF-8.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2201,11 +2201,11 @@ void AddSplitWhitespaceAscii(FunctionRegistry* 
registry) {
   auto func =
       std::make_shared<ScalarFunction>("ascii_split_whitespace", 
Arity::Unary(),
                                        &ascii_split_whitespace_doc, 
&default_options);
-  using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
-  using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
-  DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, 
t32::State::Init));
-  DCHECK_OK(
-      func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, 
t64::State::Init));
+
+  for (const auto& ty : StringTypes()) {
+    auto exec = GenerateTypeAgnosticVarBinary<SplitWhitespaceAsciiExec, 
ListType>(ty);
+    DCHECK_OK(func->AddKernel({ty}, {list(ty)}, exec, SplitState::Init));
+  }

Review comment:
       (etc. for kernels below labeled ascii_ or utf8_.)

##########
File path: cpp/src/arrow/compute/kernels/codegen_internal.h
##########
@@ -1188,6 +1188,24 @@ ArrayKernelExec 
GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
   }
 }
 
+// similar to GenerateTypeAgnosticPrimitive, but for variable binary and 
string types
+template <template <typename...> class Generator, typename... Args>
+ArrayKernelExec GenerateTypeAgnosticVarBinary(detail::GetTypeId get_id) {

Review comment:
       This is also essentially the same as GenerateVarBinary (we need a way to 
differentiate the kernels that have a Type0 argument)

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2868,8 +2865,8 @@ struct ExtractRegex : public ExtractRegexBase {
       if (input.is_valid && Match(util::string_view(*input.value))) {
         result->value.reserve(group_count);
         for (int i = 0; i < group_count; i++) {
-          result->value.push_back(
-              std::make_shared<ScalarType>(found_values[i].as_string()));
+          result->value.push_back(std::make_shared<ScalarType>(
+              std::make_shared<Buffer>(ToStringView(found_values[i]))));

Review comment:
       (I suppose it works in tests because RE2 might be returning a slice of 
the input?)

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1132,52 +1132,51 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   {
     auto func = std::make_shared<ScalarFunction>("match_substring", 
Arity::Unary(),
                                                  &match_substring_doc);
-    auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
-    auto exec_64 = MatchSubstring<LargeStringType, 
PlainSubstringMatcher>::Exec;
-    DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, 
MatchSubstringState::Init));
-    DCHECK_OK(
-        func->AddKernel({large_utf8()}, boolean(), exec_64, 
MatchSubstringState::Init));
+    for (const auto& ty : BaseBinaryTypes()) {
+      auto exec =
+          GenerateTypeAgnosticVarBinaryBase<MatchSubstring, 
PlainSubstringMatcher>(ty);
+      DCHECK_OK(func->AddKernel({ty}, boolean(), exec, 
MatchSubstringState::Init));
+    }
     DCHECK_OK(registry->AddFunction(std::move(func)));
   }
   {
     auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
                                                  &match_substring_doc);
-    auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
-    auto exec_64 = MatchSubstring<LargeStringType, 
PlainStartsWithMatcher>::Exec;
-    DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, 
MatchSubstringState::Init));
-    DCHECK_OK(
-        func->AddKernel({large_utf8()}, boolean(), exec_64, 
MatchSubstringState::Init));
+    for (const auto& ty : BaseBinaryTypes()) {
+      auto exec =
+          GenerateTypeAgnosticVarBinaryBase<MatchSubstring, 
PlainStartsWithMatcher>(ty);
+      DCHECK_OK(func->AddKernel({ty}, boolean(), exec, 
MatchSubstringState::Init));
+    }

Review comment:
       I believe we would also need to configure RE2 to assume Latin1 instead 
of UTF-8 on binary data as a result.

##########
File path: cpp/src/arrow/compute/kernels/codegen_internal.h
##########
@@ -1188,6 +1188,24 @@ ArrayKernelExec 
GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
   }
 }
 
+// similar to GenerateTypeAgnosticPrimitive, but for variable binary and 
string types
+template <template <typename...> class Generator, typename... Args>
+ArrayKernelExec GenerateTypeAgnosticVarBinary(detail::GetTypeId get_id) {

Review comment:
       This is no longer type agnostic. 

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -3421,41 +3415,45 @@ const FunctionDoc utf8_length_doc("Compute UTF8 string 
lengths",
 
 void AddStrptime(FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), 
&strptime_doc);

Review comment:
       Hmm, strptime explicitly works on string inputs - I don't think it makes 
sense to try to parse timestamps from a binary string? (Unless you have odd 
data with binary delimiters?) 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] lidavidm commented on a change in pull request #11233: ARROW-13879: [C++] Mixed support for binary types in regex functions

Reply via email to