pitrou commented on code in PR #45577:
URL: https://github.com/apache/arrow/pull/45577#discussion_r1987367006
##########
docs/source/cpp/compute.rst:
##########
@@ -1128,16 +1128,22 @@ when a positive ``max_splits`` is given.
String component extraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~
-+---------------+-------+------------------------+-------------+-------------------------------+-------+
-| Function name | Arity | Input types | Output type | Options class
| Notes |
-+===============+=======+========================+=============+===============================+=======+
-| extract_regex | Unary | Binary- or String-like | Struct |
:struct:`ExtractRegexOptions` | \(1) |
-+---------------+-------+------------------------+-------------+-------------------------------+-------+
++--------------------+-------+------------------------+-------------+-----------------------------------+-------+
+| Function name | Arity | Input types | Output type | Options
class | Notes |
++====================+=======+========================+=============+===================================+=======+
+| extract_regex | Unary | Binary- or String-like | Struct |
:struct:`ExtractRegexOptions` | \(1) |
++--------------------+-------+------------------------+-------------+-----------------------------------+-------+
+| extract_regex_span | Unary | Binary- or String-like | Struct |
:struct:`ExtractRegexSpanOptions` | \(2) |
++--------------------+-------+------------------------+-------------+-----------------------------------+-------+
* \(1) Extract substrings defined by a regular expression using the Google RE2
library. The output struct field names refer to the named capture groups,
e.g. 'letter' and 'digit' for the regular expression
``(?P<letter>[ab])(?P<digit>\\d)``.
+* \(2) Extract the offset and length of substrings defined by a regular
expression
+ using the Google RE2 library. The output struct field names refer to the
named
+ capture groups, e.g. 'letter' and 'digit' for the regular expression
+ ``(?P<letter>[ab])(?P<digit>\\d)``.
Review Comment:
We should also describe the output fields.
```suggestion
``(?P<letter>[ab])(?P<digit>\\d)``. Each output struct field is a fixed
size list
of two integers: the index to the start of the captured group and the
length
of the captured group, respectively.
```
##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2351,136 @@ void AddAsciiStringExtractRegex(FunctionRegistry*
registry) {
}
DCHECK_OK(registry->AddFunction(std::move(func)));
}
+struct ExtractRegexSpanData : public BaseExtractRegexData {
+ static Result<ExtractRegexSpanData> Make(const std::string& pattern,
+ bool is_utf8 = true) {
+ auto data = ExtractRegexSpanData(pattern, is_utf8);
+ ARROW_RETURN_NOT_OK(data.Init());
+ return data;
+ }
+
+ Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types)
const {
+ const DataType* input_type = types[0].type;
+ if (input_type == nullptr) {
+ return nullptr;
+ }
+ DCHECK(is_base_binary_like(input_type->id()));
+ const size_t field_count = num_groups();
+ FieldVector fields;
+ fields.reserve(field_count);
+ const auto owned_type = input_type->GetSharedPtr();
+ auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+ for (const auto& group_name : group_names) {
+ // size list is 2 as every span contains position and length
+ fields.push_back(field(group_name, fixed_size_list(type, 2)));
+ }
+ return struct_(fields);
Review Comment:
Can move the vector to avoid a temporary copy
```suggestion
return struct_(std::move(fields));
```
##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2351,136 @@ void AddAsciiStringExtractRegex(FunctionRegistry*
registry) {
}
DCHECK_OK(registry->AddFunction(std::move(func)));
}
+struct ExtractRegexSpanData : public BaseExtractRegexData {
+ static Result<ExtractRegexSpanData> Make(const std::string& pattern,
+ bool is_utf8 = true) {
+ auto data = ExtractRegexSpanData(pattern, is_utf8);
+ ARROW_RETURN_NOT_OK(data.Init());
+ return data;
+ }
+
+ Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types)
const {
+ const DataType* input_type = types[0].type;
+ if (input_type == nullptr) {
+ return nullptr;
+ }
+ DCHECK(is_base_binary_like(input_type->id()));
+ const size_t field_count = num_groups();
+ FieldVector fields;
+ fields.reserve(field_count);
+ const auto owned_type = input_type->GetSharedPtr();
+ auto type = is_binary_like(owned_type->id()) ? int32() : int64();
Review Comment:
This can be simplified:
```suggestion
auto index_type = is_binary_like(input_type->id()) ? int32() : int64();
```
##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2308,9 +2313,8 @@ struct ExtractRegex : public ExtractRegexBase {
RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
}
return struct_builder->Append();
- } else {
- return struct_builder->AppendNull();
}
+ return struct_builder->AppendNull();
Review Comment:
This change is harmless but it's also pointless, and it will make it more
difficult to navigate in the git history. Can you perhaps undo it?
##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2351,136 @@ void AddAsciiStringExtractRegex(FunctionRegistry*
registry) {
}
DCHECK_OK(registry->AddFunction(std::move(func)));
}
+struct ExtractRegexSpanData : public BaseExtractRegexData {
+ static Result<ExtractRegexSpanData> Make(const std::string& pattern,
+ bool is_utf8 = true) {
+ auto data = ExtractRegexSpanData(pattern, is_utf8);
+ ARROW_RETURN_NOT_OK(data.Init());
+ return data;
+ }
+
+ Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types)
const {
+ const DataType* input_type = types[0].type;
+ if (input_type == nullptr) {
+ return nullptr;
+ }
+ DCHECK(is_base_binary_like(input_type->id()));
+ const size_t field_count = num_groups();
+ FieldVector fields;
+ fields.reserve(field_count);
+ const auto owned_type = input_type->GetSharedPtr();
+ auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+ for (const auto& group_name : group_names) {
+ // size list is 2 as every span contains position and length
+ fields.push_back(field(group_name, fixed_size_list(type, 2)));
Review Comment:
```suggestion
fields.push_back(field(group_name, fixed_size_list(index_type, 2)));
```
##########
cpp/src/arrow/compute/kernels/scalar_string_test.cc:
##########
@@ -314,6 +314,7 @@ TYPED_TEST(TestBinaryKernels, NonUtf8Regex) {
this->MakeArray({"\xfc\x40", "this \xfc\x40 that
\xfc\x40"}),
this->MakeArray({"bazz", "this bazz that \xfc\x40"}),
&options);
}
+ // TODO the following test is broken
Review Comment:
We should still open an issue for this TODO and reference it here, could you
do it?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]