lidavidm commented on a change in pull request #11233:
URL: https://github.com/apache/arrow/pull/11233#discussion_r716671796
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2201,11 +2201,11 @@ void AddSplitWhitespaceAscii(FunctionRegistry*
registry) {
auto func =
std::make_shared<ScalarFunction>("ascii_split_whitespace",
Arity::Unary(),
&ascii_split_whitespace_doc,
&default_options);
- using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
- using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec,
t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec,
t64::State::Init));
+
+ for (const auto& ty : StringTypes()) {
+ auto exec = GenerateTypeAgnosticVarBinary<SplitWhitespaceAsciiExec,
ListType>(ty);
+ DCHECK_OK(func->AddKernel({ty}, {list(ty)}, exec, SplitState::Init));
+ }
Review comment:
Ditto - I don't think this needs to support binary values.
##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -724,7 +724,7 @@ TYPED_TEST(TestStringKernels, IsUpperAscii) {
"[false, null, false, true, true, false, false]");
}
-TYPED_TEST(TestStringKernels, MatchSubstring) {
+TYPED_TEST(TestBinaryKernels, MatchSubstring) {
Review comment:
For any test that now includes binary values, we need to test invalid
UTF-8.
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2868,8 +2865,8 @@ struct ExtractRegex : public ExtractRegexBase {
if (input.is_valid && Match(util::string_view(*input.value))) {
result->value.reserve(group_count);
for (int i = 0; i < group_count; i++) {
- result->value.push_back(
- std::make_shared<ScalarType>(found_values[i].as_string()));
+ result->value.push_back(std::make_shared<ScalarType>(
+ std::make_shared<Buffer>(ToStringView(found_values[i]))));
Review comment:
This isn't safe, the string_view constructor is non-owning. We need to
allocate a buffer/string here.
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1628,12 +1627,10 @@ const FunctionDoc utf8_slice_codeunits_doc(
void AddSlice(FunctionRegistry* registry) {
auto func = std::make_shared<ScalarFunction>("utf8_slice_codeunits",
Arity::Unary(),
&utf8_slice_codeunits_doc);
- using t32 = SliceCodeunits<StringType>;
- using t64 = SliceCodeunits<LargeStringType>;
- DCHECK_OK(
- func->AddKernel({utf8()}, utf8(), t32::Exec,
SliceCodeunitsTransform::State::Init));
- DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), t64::Exec,
- SliceCodeunitsTransform::State::Init));
+ for (const auto& ty : StringTypes()) {
+ auto exec = GenerateTypeAgnosticVarBinary<SliceCodeunits>(ty);
Review comment:
This function is named utf8_slice_codeunits, it shouldn't support binary
types. "codeunits" doesn't make sense on a non-Unicode string and the
implementation uses UTF-8 functions to count bytes; it'll give incorrect
results on an arbitrary binary string.
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1132,52 +1132,51 @@ void AddMatchSubstring(FunctionRegistry* registry) {
{
auto func = std::make_shared<ScalarFunction>("match_substring",
Arity::Unary(),
&match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType,
PlainSubstringMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32,
MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64,
MatchSubstringState::Init));
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto exec =
+ GenerateTypeAgnosticVarBinaryBase<MatchSubstring,
PlainSubstringMatcher>(ty);
+ DCHECK_OK(func->AddKernel({ty}, boolean(), exec,
MatchSubstringState::Init));
+ }
DCHECK_OK(registry->AddFunction(std::move(func)));
}
{
auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
&match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType,
PlainStartsWithMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32,
MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64,
MatchSubstringState::Init));
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto exec =
+ GenerateTypeAgnosticVarBinaryBase<MatchSubstring,
PlainStartsWithMatcher>(ty);
+ DCHECK_OK(func->AddKernel({ty}, boolean(), exec,
MatchSubstringState::Init));
+ }
Review comment:
I think for anything which supports an ignore_case flag, it should be
disabled or ignored for the regex matcher. Note RE2 assumes input is either
UTF-8 or Latin1 encoded; we should also test invalid UTF-8.
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2201,11 +2201,11 @@ void AddSplitWhitespaceAscii(FunctionRegistry*
registry) {
auto func =
std::make_shared<ScalarFunction>("ascii_split_whitespace",
Arity::Unary(),
&ascii_split_whitespace_doc,
&default_options);
- using t32 = SplitWhitespaceAsciiExec<StringType, ListType>;
- using t64 = SplitWhitespaceAsciiExec<LargeStringType, ListType>;
- DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec,
t32::State::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec,
t64::State::Init));
+
+ for (const auto& ty : StringTypes()) {
+ auto exec = GenerateTypeAgnosticVarBinary<SplitWhitespaceAsciiExec,
ListType>(ty);
+ DCHECK_OK(func->AddKernel({ty}, {list(ty)}, exec, SplitState::Init));
+ }
Review comment:
(etc. for kernels below labeled ascii_ or utf8_.)
##########
File path: cpp/src/arrow/compute/kernels/codegen_internal.h
##########
@@ -1188,6 +1188,24 @@ ArrayKernelExec
GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
}
}
+// similar to GenerateTypeAgnosticPrimitive, but for variable binary and
string types
+template <template <typename...> class Generator, typename... Args>
+ArrayKernelExec GenerateTypeAgnosticVarBinary(detail::GetTypeId get_id) {
Review comment:
This is also essentially the same as GenerateVarBinary (we need a way to
differentiate the kernels that have a Type0 argument)
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2868,8 +2865,8 @@ struct ExtractRegex : public ExtractRegexBase {
if (input.is_valid && Match(util::string_view(*input.value))) {
result->value.reserve(group_count);
for (int i = 0; i < group_count; i++) {
- result->value.push_back(
- std::make_shared<ScalarType>(found_values[i].as_string()));
+ result->value.push_back(std::make_shared<ScalarType>(
+ std::make_shared<Buffer>(ToStringView(found_values[i]))));
Review comment:
(I suppose it works in tests because RE2 might be returning a slice of
the input?)
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1132,52 +1132,51 @@ void AddMatchSubstring(FunctionRegistry* registry) {
{
auto func = std::make_shared<ScalarFunction>("match_substring",
Arity::Unary(),
&match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType,
PlainSubstringMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32,
MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64,
MatchSubstringState::Init));
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto exec =
+ GenerateTypeAgnosticVarBinaryBase<MatchSubstring,
PlainSubstringMatcher>(ty);
+ DCHECK_OK(func->AddKernel({ty}, boolean(), exec,
MatchSubstringState::Init));
+ }
DCHECK_OK(registry->AddFunction(std::move(func)));
}
{
auto func = std::make_shared<ScalarFunction>("starts_with", Arity::Unary(),
&match_substring_doc);
- auto exec_32 = MatchSubstring<StringType, PlainStartsWithMatcher>::Exec;
- auto exec_64 = MatchSubstring<LargeStringType,
PlainStartsWithMatcher>::Exec;
- DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32,
MatchSubstringState::Init));
- DCHECK_OK(
- func->AddKernel({large_utf8()}, boolean(), exec_64,
MatchSubstringState::Init));
+ for (const auto& ty : BaseBinaryTypes()) {
+ auto exec =
+ GenerateTypeAgnosticVarBinaryBase<MatchSubstring,
PlainStartsWithMatcher>(ty);
+ DCHECK_OK(func->AddKernel({ty}, boolean(), exec,
MatchSubstringState::Init));
+ }
Review comment:
I believe we would also need to configure RE2 to assume Latin1 instead
of UTF-8 on binary data as a result.
##########
File path: cpp/src/arrow/compute/kernels/codegen_internal.h
##########
@@ -1188,6 +1188,24 @@ ArrayKernelExec
GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
}
}
+// similar to GenerateTypeAgnosticPrimitive, but for variable binary and
string types
+template <template <typename...> class Generator, typename... Args>
+ArrayKernelExec GenerateTypeAgnosticVarBinary(detail::GetTypeId get_id) {
Review comment:
This is no longer type agnostic.
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -3421,41 +3415,45 @@ const FunctionDoc utf8_length_doc("Compute UTF8 string
lengths",
void AddStrptime(FunctionRegistry* registry) {
auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(),
&strptime_doc);
Review comment:
Hmm, strptime explicitly works on string inputs - I don't think it makes
sense to try to parse timestamps from a binary string? (Unless you have odd
data with binary delimiters?)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]