pitrou commented on code in PR #45577:
URL: https://github.com/apache/arrow/pull/45577#discussion_r1987367006


##########
docs/source/cpp/compute.rst:
##########
@@ -1128,16 +1128,22 @@ when a positive ``max_splits`` is given.
 String component extraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-+---------------+-------+------------------------+-------------+-------------------------------+-------+
-| Function name | Arity | Input types            | Output type | Options class 
                | Notes |
-+===============+=======+========================+=============+===============================+=======+
-| extract_regex | Unary | Binary- or String-like | Struct      | 
:struct:`ExtractRegexOptions` | \(1)  |
-+---------------+-------+------------------------+-------------+-------------------------------+-------+
++--------------------+-------+------------------------+-------------+-----------------------------------+-------+
+| Function name      | Arity | Input types            | Output type | Options 
class                     | Notes |
++====================+=======+========================+=============+===================================+=======+
+| extract_regex      | Unary | Binary- or String-like | Struct      | 
:struct:`ExtractRegexOptions`     | \(1)  |
++--------------------+-------+------------------------+-------------+-----------------------------------+-------+
+| extract_regex_span | Unary | Binary- or String-like | Struct      | 
:struct:`ExtractRegexSpanOptions` | \(2)  |
++--------------------+-------+------------------------+-------------+-----------------------------------+-------+
 
 * \(1) Extract substrings defined by a regular expression using the Google RE2
   library.  The output struct field names refer to the named capture groups,
   e.g. 'letter' and 'digit' for the regular expression
   ``(?P<letter>[ab])(?P<digit>\\d)``.
+* \(2) Extract  the offset and length of substrings defined by a regular 
expression
+  using the Google RE2 library.  The output struct field names refer to the 
named
+  capture groups, e.g. 'letter' and 'digit' for the regular expression
+  ``(?P<letter>[ab])(?P<digit>\\d)``.

Review Comment:
   We should also describe the output fields.
   ```suggestion
     ``(?P<letter>[ab])(?P<digit>\\d)``. Each output struct field is a fixed 
size list
     of two integers: the index to the start of the captured group and the 
length
     of the captured group, respectively.
   ```



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2351,136 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+struct ExtractRegexSpanData : public BaseExtractRegexData {
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern,
+                                           bool is_utf8 = true) {
+    auto data = ExtractRegexSpanData(pattern, is_utf8);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
+    const DataType* input_type = types[0].type;
+    if (input_type == nullptr) {
+      return nullptr;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = num_groups();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+    for (const auto& group_name : group_names) {
+      // size list is 2 as every span contains position and length
+      fields.push_back(field(group_name, fixed_size_list(type, 2)));
+    }
+    return struct_(fields);

Review Comment:
   Can move the vector to avoid a temporary copy
   ```suggestion
       return struct_(std::move(fields));
   ```



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2351,136 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+struct ExtractRegexSpanData : public BaseExtractRegexData {
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern,
+                                           bool is_utf8 = true) {
+    auto data = ExtractRegexSpanData(pattern, is_utf8);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
+    const DataType* input_type = types[0].type;
+    if (input_type == nullptr) {
+      return nullptr;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = num_groups();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    auto type = is_binary_like(owned_type->id()) ? int32() : int64();

Review Comment:
   This can be simplified:
   ```suggestion
       auto index_type = is_binary_like(input_type->id()) ? int32() : int64();
   ```



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2308,9 +2313,8 @@ struct ExtractRegex : public ExtractRegexBase {
           
RETURN_NOT_OK(field_builders[i]->Append(ToStringView(found_values[i])));
         }
         return struct_builder->Append();
-      } else {
-        return struct_builder->AppendNull();
       }
+      return struct_builder->AppendNull();

Review Comment:
   This change is harmless but it's also pointless, and it will make it more 
difficult to navigate in the git history. Can you perhaps undo it?



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2351,136 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+struct ExtractRegexSpanData : public BaseExtractRegexData {
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern,
+                                           bool is_utf8 = true) {
+    auto data = ExtractRegexSpanData(pattern, is_utf8);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
+    const DataType* input_type = types[0].type;
+    if (input_type == nullptr) {
+      return nullptr;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = num_groups();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+    for (const auto& group_name : group_names) {
+      // size list is 2 as every span contains position and length
+      fields.push_back(field(group_name, fixed_size_list(type, 2)));

Review Comment:
   ```suggestion
         fields.push_back(field(group_name, fixed_size_list(index_type, 2)));
   ```



##########
cpp/src/arrow/compute/kernels/scalar_string_test.cc:
##########
@@ -314,6 +314,7 @@ TYPED_TEST(TestBinaryKernels, NonUtf8Regex) {
                      this->MakeArray({"\xfc\x40", "this \xfc\x40 that 
\xfc\x40"}),
                      this->MakeArray({"bazz", "this bazz that \xfc\x40"}), 
&options);
   }
+  // TODO the following test is broken

Review Comment:
   We should still open an issue for this TODO and reference it here, could you 
do it?
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to