Re: [PR] GH-44615: [C++] Add possibility to extract spans/byte offsets directly for compute.extract_regex [arrow]

via GitHub Mon, 24 Feb 2025 06:37:12 -0800


pitrou commented on code in PR #45577:
URL: https://github.com/apache/arrow/pull/45577#discussion_r1967722863



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2185,51 +2186,61 @@ void AddAsciiStringReplaceSubstring(FunctionRegistry* 
registry) {
 using ExtractRegexState = OptionsWrapper<ExtractRegexOptions>;
 
 // TODO cache this once per ExtractRegexOptions
-struct ExtractRegexData {
-  // Use unique_ptr<> because RE2 is non-movable (for ARROW_ASSIGN_OR_RAISE)
-  std::unique_ptr<RE2> regex;
-  std::vector<std::string> group_names;
-
+class ExtractRegexData {
+ public:
   static Result<ExtractRegexData> Make(const ExtractRegexOptions& options,
                                        bool is_utf8 = true) {
     ExtractRegexData data(options.pattern, is_utf8);
-    RETURN_NOT_OK(RegexStatus(*data.regex));
-
-    const int group_count = data.regex->NumberOfCapturingGroups();
-    const auto& name_map = data.regex->CapturingGroupNames();
-    data.group_names.reserve(group_count);
-
-    for (int i = 0; i < group_count; i++) {
-      auto item = name_map.find(i + 1);  // re2 starts counting from 1
-      if (item == name_map.end()) {
-        // XXX should we instead just create fields with an empty name?
-        return Status::Invalid("Regular expression contains unnamed groups");
-      }
-      data.group_names.emplace_back(item->second);
-    }
+    ARROW_RETURN_NOT_OK(data.Init());
     return data;
   }
 
   Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
     const DataType* input_type = types[0].type;
-    if (input_type == nullptr) {
+    // as mentioned here
+    // 
https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci
+    // nullptr should not be used

Review Comment:
   `nullptr` is ok in `.cc` files, `NULLPTR` is only for `.h` files. Could you 
revert these changes?



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2357,126 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+class ExtractRegexSpanData : public ExtractRegexData {
+ public:
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern) {
+    auto data = ExtractRegexSpanData(pattern, true);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
+    const DataType* input_type = types[0].type;
+    if (input_type == NULLPTR) {
+      return NULLPTR;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = group_names_.size();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    for (const auto& group_name : group_names_) {
+      auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+      // size list is 2 as every span contains position and length
+      fields.push_back(field(group_name + "_span", fixed_size_list(type, 2)));
+    }
+    return struct_(fields);
+  }
+
+ private:
+  ExtractRegexSpanData(const std::string& pattern, const bool is_utf8)
+      : ExtractRegexData(pattern, is_utf8) {}
+};
+
+template <typename Type>
+struct ExtractRegexSpan : ExtractRegexBase {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ExtractRegexBase::ExtractRegexBase;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    auto options = OptionsWrapper<ExtractRegexSpanOptions>::Get(ctx);
+    ARROW_ASSIGN_OR_RAISE(auto data, 
ExtractRegexSpanData::Make(options.pattern));
+    return ExtractRegexSpan{data}.Extract(ctx, batch, out);
+  }
+  Status Extract(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    DCHECK_NE(out->array_data(), NULLPTR);
+    std::shared_ptr<DataType> out_type = out->array_data()->type;
+    DCHECK_NE(out_type, NULLPTR);
+    std::unique_ptr<ArrayBuilder> out_builder;
+    ARROW_RETURN_NOT_OK(
+        MakeBuilder(ctx->memory_pool(), out->type()->GetSharedPtr(), 
&out_builder));
+    auto struct_builder = 
checked_pointer_cast<StructBuilder>(std::move(out_builder));
+    std::vector<FixedSizeListBuilder*> span_builders;
+    std::vector<ArrayBuilder*> array_builders;
+    span_builders.reserve(group_count);
+    array_builders.reserve(group_count);
+    for (int i = 0; i < group_count; i++) {
+      span_builders.push_back(
+          
checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder(i)));
+      array_builders.push_back(span_builders[i]->value_builder());
+    }
+    auto visit_null = [&]() { return struct_builder->AppendNull(); };
+    auto visit_value = [&](std::string_view element) -> Status {
+      if (Match(element)) {
+        for (int i = 0; i < group_count; i++) {
+          // https://github.com/google/re2/issues/24#issuecomment-97653183
+          if (found_values[i].data() != NULLPTR) {
+            int64_t begin = found_values[i].data() - element.data();
+            int64_t size = found_values[i].size();
+            if (is_binary_like(batch.GetTypes()[0].id())) {
+              
ARROW_RETURN_NOT_OK(checked_cast<Int32Builder*>(array_builders[i])
+                                      
->AppendValues({static_cast<int32_t>(begin),
+                                                      
static_cast<int32_t>(size)}));
+            } else {
+              
ARROW_RETURN_NOT_OK(checked_cast<Int64Builder*>(array_builders[i])
+                                      ->AppendValues({begin, size}));
+            }
+
+            ARROW_RETURN_NOT_OK(span_builders[i]->Append());
+          } else {
+            ARROW_RETURN_NOT_OK(span_builders[i]->AppendNull());
+          }
+        }
+        ARROW_RETURN_NOT_OK(struct_builder->Append());
+      } else {
+        ARROW_RETURN_NOT_OK(struct_builder->AppendNull());
+      }
+      return Status::OK();
+    };
+    ARROW_RETURN_NOT_OK(
+        VisitArraySpanInline<Type>(batch[0].array, visit_value, visit_null));
+
+    ARROW_ASSIGN_OR_RAISE(auto out_array, struct_builder->Finish());
+    out->value = out_array->data();
+    return Status::OK();
+  }
+};
+
+const FunctionDoc extract_regex_doc_span(
+    "likes extract_regex; however, it contains the position and length of 
results", "",

Review Comment:
   Ok, please write an actual doc even if it copy-pastes from extract_regex :)



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2357,126 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+class ExtractRegexSpanData : public ExtractRegexData {

Review Comment:
   Hmm, instead of the following class diagram:
   ```
   ExtractRegexData (defines ResolveOutputType)
   |---- ExtractRegexSpanData (redefines ResolveOutputType)
   ```
   
   I think it would be more consistent to have:
   ```
   BaseExtractRegexData
   |---- ExtractRegexData (defines ResolveOutputType)
   |---- ExtractRegexSpanData (redefines ResolveOutputType)
   ```



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2357,126 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+class ExtractRegexSpanData : public ExtractRegexData {
+ public:
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern) {
+    auto data = ExtractRegexSpanData(pattern, true);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
+    const DataType* input_type = types[0].type;
+    if (input_type == NULLPTR) {
+      return NULLPTR;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = group_names_.size();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    for (const auto& group_name : group_names_) {
+      auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+      // size list is 2 as every span contains position and length
+      fields.push_back(field(group_name + "_span", fixed_size_list(type, 2)));
+    }
+    return struct_(fields);
+  }
+
+ private:
+  ExtractRegexSpanData(const std::string& pattern, const bool is_utf8)
+      : ExtractRegexData(pattern, is_utf8) {}
+};
+
+template <typename Type>
+struct ExtractRegexSpan : ExtractRegexBase {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ExtractRegexBase::ExtractRegexBase;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    auto options = OptionsWrapper<ExtractRegexSpanOptions>::Get(ctx);
+    ARROW_ASSIGN_OR_RAISE(auto data, 
ExtractRegexSpanData::Make(options.pattern));
+    return ExtractRegexSpan{data}.Extract(ctx, batch, out);
+  }
+  Status Extract(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    DCHECK_NE(out->array_data(), NULLPTR);
+    std::shared_ptr<DataType> out_type = out->array_data()->type;
+    DCHECK_NE(out_type, NULLPTR);
+    std::unique_ptr<ArrayBuilder> out_builder;
+    ARROW_RETURN_NOT_OK(
+        MakeBuilder(ctx->memory_pool(), out->type()->GetSharedPtr(), 
&out_builder));
+    auto struct_builder = 
checked_pointer_cast<StructBuilder>(std::move(out_builder));
+    std::vector<FixedSizeListBuilder*> span_builders;
+    std::vector<ArrayBuilder*> array_builders;
+    span_builders.reserve(group_count);
+    array_builders.reserve(group_count);
+    for (int i = 0; i < group_count; i++) {
+      span_builders.push_back(
+          
checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder(i)));
+      array_builders.push_back(span_builders[i]->value_builder());
+    }
+    auto visit_null = [&]() { return struct_builder->AppendNull(); };
+    auto visit_value = [&](std::string_view element) -> Status {
+      if (Match(element)) {
+        for (int i = 0; i < group_count; i++) {
+          // https://github.com/google/re2/issues/24#issuecomment-97653183
+          if (found_values[i].data() != NULLPTR) {
+            int64_t begin = found_values[i].data() - element.data();
+            int64_t size = found_values[i].size();
+            if (is_binary_like(batch.GetTypes()[0].id())) {

Review Comment:
   This is doing a runtime type check for each element, while it's probably 
known at compile-time?
   You can probably inspect `Type::offset_type`.
   
   Actually, above you can probably write:
   ```c++
   using offset_type = typename Type::offset_type;
   using OffsetBuilderType = typename TypeTraits<typename 
CTypeTraits<offset_type>::ArrowType>::BuilderType
   std::vector<OffsetBuilderType*> offset_builders;
   ```



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2185,51 +2186,61 @@ void AddAsciiStringReplaceSubstring(FunctionRegistry* 
registry) {
 using ExtractRegexState = OptionsWrapper<ExtractRegexOptions>;
 
 // TODO cache this once per ExtractRegexOptions
-struct ExtractRegexData {
-  // Use unique_ptr<> because RE2 is non-movable (for ARROW_ASSIGN_OR_RAISE)
-  std::unique_ptr<RE2> regex;
-  std::vector<std::string> group_names;
-
+class ExtractRegexData {
+ public:
   static Result<ExtractRegexData> Make(const ExtractRegexOptions& options,
                                        bool is_utf8 = true) {
     ExtractRegexData data(options.pattern, is_utf8);
-    RETURN_NOT_OK(RegexStatus(*data.regex));
-
-    const int group_count = data.regex->NumberOfCapturingGroups();
-    const auto& name_map = data.regex->CapturingGroupNames();
-    data.group_names.reserve(group_count);
-
-    for (int i = 0; i < group_count; i++) {
-      auto item = name_map.find(i + 1);  // re2 starts counting from 1
-      if (item == name_map.end()) {
-        // XXX should we instead just create fields with an empty name?
-        return Status::Invalid("Regular expression contains unnamed groups");
-      }
-      data.group_names.emplace_back(item->second);
-    }
+    ARROW_RETURN_NOT_OK(data.Init());
     return data;
   }
 
   Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
     const DataType* input_type = types[0].type;
-    if (input_type == nullptr) {
+    // as mentioned here
+    // 
https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci
+    // nullptr should not be used
+    if (input_type == NULLPTR) {
       // No input type specified
-      return nullptr;
+      return NULLPTR;
     }
     // Input type is either [Large]Binary or [Large]String and is also the type
     // of each field in the output struct type.
     DCHECK(is_base_binary_like(input_type->id()));
     FieldVector fields;
-    fields.reserve(group_names.size());
+    fields.reserve(group_names_.size());
     std::shared_ptr<DataType> owned_type = input_type->GetSharedPtr();
-    std::transform(group_names.begin(), group_names.end(), 
std::back_inserter(fields),
+    std::transform(group_names_.begin(), group_names_.end(), 
std::back_inserter(fields),
                    [&](const std::string& name) { return field(name, 
owned_type); });
-    return struct_(std::move(fields));
+    return struct_(fields);
   }
+  int64_t num_group() const { return group_names_.size(); }
+  std::shared_ptr<RE2> regex() const { return regex_; }
 
- private:
+ protected:
   explicit ExtractRegexData(const std::string& pattern, bool is_utf8 = true)
-      : regex(new RE2(pattern, MakeRE2Options(is_utf8))) {}
+      : regex_(new RE2(pattern, MakeRE2Options(is_utf8))) {}
+
+  Status Init() {
+    RETURN_NOT_OK(RegexStatus(*regex_));
+
+    const int group_count = regex_->NumberOfCapturingGroups();
+    const auto& name_map = regex_->CapturingGroupNames();
+    group_names_.reserve(group_count);
+
+    for (int i = 0; i < group_count; i++) {
+      auto item = name_map.find(i + 1);  // re2 starts counting from 1
+      if (item == name_map.end()) {
+        // XXX should we instead just create fields with an empty name?
+        return Status::Invalid("Regular expression contains unnamed groups");
+      }
+      group_names_.emplace_back(item->second);
+    }
+    return Status::OK();
+  }
+
+  std::shared_ptr<RE2> regex_;

Review Comment:
   Why `shared_ptr`? `unique_ptr` was fine.



##########
cpp/src/arrow/compute/kernels/scalar_string_test.cc:
##########
@@ -314,6 +314,7 @@ TYPED_TEST(TestBinaryKernels, NonUtf8Regex) {
                      this->MakeArray({"\xfc\x40", "this \xfc\x40 that 
\xfc\x40"}),
                      this->MakeArray({"bazz", "this bazz that \xfc\x40"}), 
&options);
   }
+  // TODO the following test is broken

Review Comment:
   Do you plan to fix it in this PR? Otherwise, can you please open a GH issue 
and reference it here?



##########
cpp/src/arrow/compute/kernels/scalar_string_test.cc:
##########
@@ -1958,6 +1960,29 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegex) {
                    R"([{"letter": "a", "digit": "1"}, {"letter": "b", "digit": 
"3"}])",
                    &options);
 }
+TYPED_TEST(TestBaseBinaryKernels, ExtractRegexSapn) {
+  ExtractRegexSpanOptions options{"(?P<letter>[ab])(?P<digit>\\d)"};

Review Comment:
   Can your example showcase an optional capture, and also failed matches to 
make sure that nulls are appropriately set in those sityations?



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2185,51 +2186,61 @@ void AddAsciiStringReplaceSubstring(FunctionRegistry* 
registry) {
 using ExtractRegexState = OptionsWrapper<ExtractRegexOptions>;
 
 // TODO cache this once per ExtractRegexOptions
-struct ExtractRegexData {
-  // Use unique_ptr<> because RE2 is non-movable (for ARROW_ASSIGN_OR_RAISE)
-  std::unique_ptr<RE2> regex;
-  std::vector<std::string> group_names;
-
+class ExtractRegexData {
+ public:
   static Result<ExtractRegexData> Make(const ExtractRegexOptions& options,
                                        bool is_utf8 = true) {
     ExtractRegexData data(options.pattern, is_utf8);
-    RETURN_NOT_OK(RegexStatus(*data.regex));
-
-    const int group_count = data.regex->NumberOfCapturingGroups();
-    const auto& name_map = data.regex->CapturingGroupNames();
-    data.group_names.reserve(group_count);
-
-    for (int i = 0; i < group_count; i++) {
-      auto item = name_map.find(i + 1);  // re2 starts counting from 1
-      if (item == name_map.end()) {
-        // XXX should we instead just create fields with an empty name?
-        return Status::Invalid("Regular expression contains unnamed groups");
-      }
-      data.group_names.emplace_back(item->second);
-    }
+    ARROW_RETURN_NOT_OK(data.Init());
     return data;
   }
 
   Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
     const DataType* input_type = types[0].type;
-    if (input_type == nullptr) {
+    // as mentioned here
+    // 
https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci
+    // nullptr should not be used
+    if (input_type == NULLPTR) {
       // No input type specified
-      return nullptr;
+      return NULLPTR;
     }
     // Input type is either [Large]Binary or [Large]String and is also the type
     // of each field in the output struct type.
     DCHECK(is_base_binary_like(input_type->id()));
     FieldVector fields;
-    fields.reserve(group_names.size());
+    fields.reserve(group_names_.size());
     std::shared_ptr<DataType> owned_type = input_type->GetSharedPtr();
-    std::transform(group_names.begin(), group_names.end(), 
std::back_inserter(fields),
+    std::transform(group_names_.begin(), group_names_.end(), 
std::back_inserter(fields),
                    [&](const std::string& name) { return field(name, 
owned_type); });
-    return struct_(std::move(fields));
+    return struct_(fields);

Review Comment:
   It's better to keep `std::move` here.



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2357,126 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+class ExtractRegexSpanData : public ExtractRegexData {
+ public:
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern) {
+    auto data = ExtractRegexSpanData(pattern, true);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
+    const DataType* input_type = types[0].type;
+    if (input_type == NULLPTR) {
+      return NULLPTR;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = group_names_.size();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    for (const auto& group_name : group_names_) {
+      auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+      // size list is 2 as every span contains position and length
+      fields.push_back(field(group_name + "_span", fixed_size_list(type, 2)));
+    }
+    return struct_(fields);
+  }
+
+ private:
+  ExtractRegexSpanData(const std::string& pattern, const bool is_utf8)
+      : ExtractRegexData(pattern, is_utf8) {}
+};
+
+template <typename Type>
+struct ExtractRegexSpan : ExtractRegexBase {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ExtractRegexBase::ExtractRegexBase;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    auto options = OptionsWrapper<ExtractRegexSpanOptions>::Get(ctx);
+    ARROW_ASSIGN_OR_RAISE(auto data, 
ExtractRegexSpanData::Make(options.pattern));
+    return ExtractRegexSpan{data}.Extract(ctx, batch, out);
+  }
+  Status Extract(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    DCHECK_NE(out->array_data(), NULLPTR);
+    std::shared_ptr<DataType> out_type = out->array_data()->type;
+    DCHECK_NE(out_type, NULLPTR);
+    std::unique_ptr<ArrayBuilder> out_builder;
+    ARROW_RETURN_NOT_OK(
+        MakeBuilder(ctx->memory_pool(), out->type()->GetSharedPtr(), 
&out_builder));
+    auto struct_builder = 
checked_pointer_cast<StructBuilder>(std::move(out_builder));
+    std::vector<FixedSizeListBuilder*> span_builders;
+    std::vector<ArrayBuilder*> array_builders;
+    span_builders.reserve(group_count);
+    array_builders.reserve(group_count);
+    for (int i = 0; i < group_count; i++) {
+      span_builders.push_back(
+          
checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder(i)));
+      array_builders.push_back(span_builders[i]->value_builder());

Review Comment:
   We know up front what the final array lengths will be, so we can probably 
   ```suggestion
         span_builders.push_back(
             
checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder(i)));
         array_builders.push_back(span_builders[i]->value_builder());
         RETURN_NOT_OK(span_builders.back()->Reserve(batch[0].array.length));
         RETURN_NOT_OK(array_builders.back()->Reserve(2 * 
batch[0].array.length));
   ```



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2357,126 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+class ExtractRegexSpanData : public ExtractRegexData {
+ public:
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern) {
+    auto data = ExtractRegexSpanData(pattern, true);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
+    const DataType* input_type = types[0].type;
+    if (input_type == NULLPTR) {
+      return NULLPTR;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = group_names_.size();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    for (const auto& group_name : group_names_) {
+      auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+      // size list is 2 as every span contains position and length
+      fields.push_back(field(group_name + "_span", fixed_size_list(type, 2)));
+    }
+    return struct_(fields);
+  }
+
+ private:
+  ExtractRegexSpanData(const std::string& pattern, const bool is_utf8)
+      : ExtractRegexData(pattern, is_utf8) {}
+};
+
+template <typename Type>
+struct ExtractRegexSpan : ExtractRegexBase {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ExtractRegexBase::ExtractRegexBase;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    auto options = OptionsWrapper<ExtractRegexSpanOptions>::Get(ctx);
+    ARROW_ASSIGN_OR_RAISE(auto data, 
ExtractRegexSpanData::Make(options.pattern));
+    return ExtractRegexSpan{data}.Extract(ctx, batch, out);
+  }
+  Status Extract(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    DCHECK_NE(out->array_data(), NULLPTR);
+    std::shared_ptr<DataType> out_type = out->array_data()->type;
+    DCHECK_NE(out_type, NULLPTR);
+    std::unique_ptr<ArrayBuilder> out_builder;
+    ARROW_RETURN_NOT_OK(
+        MakeBuilder(ctx->memory_pool(), out->type()->GetSharedPtr(), 
&out_builder));
+    auto struct_builder = 
checked_pointer_cast<StructBuilder>(std::move(out_builder));
+    std::vector<FixedSizeListBuilder*> span_builders;
+    std::vector<ArrayBuilder*> array_builders;
+    span_builders.reserve(group_count);
+    array_builders.reserve(group_count);
+    for (int i = 0; i < group_count; i++) {
+      span_builders.push_back(
+          
checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder(i)));
+      array_builders.push_back(span_builders[i]->value_builder());
+    }
+    auto visit_null = [&]() { return struct_builder->AppendNull(); };
+    auto visit_value = [&](std::string_view element) -> Status {
+      if (Match(element)) {
+        for (int i = 0; i < group_count; i++) {
+          // https://github.com/google/re2/issues/24#issuecomment-97653183
+          if (found_values[i].data() != NULLPTR) {
+            int64_t begin = found_values[i].data() - element.data();
+            int64_t size = found_values[i].size();
+            if (is_binary_like(batch.GetTypes()[0].id())) {
+              
ARROW_RETURN_NOT_OK(checked_cast<Int32Builder*>(array_builders[i])
+                                      
->AppendValues({static_cast<int32_t>(begin),

Review Comment:
   If we `Reserve`d enough space above, we can probably replace all `Append` 
calls with the corresponding `UnsafeAppend` calls that don't try to resize.



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2357,126 @@ void AddAsciiStringExtractRegex(FunctionRegistry* 
registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+class ExtractRegexSpanData : public ExtractRegexData {
+ public:
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern) {
+    auto data = ExtractRegexSpanData(pattern, true);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
+    const DataType* input_type = types[0].type;
+    if (input_type == NULLPTR) {
+      return NULLPTR;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = group_names_.size();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    for (const auto& group_name : group_names_) {
+      auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+      // size list is 2 as every span contains position and length
+      fields.push_back(field(group_name + "_span", fixed_size_list(type, 2)));
+    }
+    return struct_(fields);
+  }
+
+ private:
+  ExtractRegexSpanData(const std::string& pattern, const bool is_utf8)
+      : ExtractRegexData(pattern, is_utf8) {}
+};
+
+template <typename Type>
+struct ExtractRegexSpan : ExtractRegexBase {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ExtractRegexBase::ExtractRegexBase;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    auto options = OptionsWrapper<ExtractRegexSpanOptions>::Get(ctx);
+    ARROW_ASSIGN_OR_RAISE(auto data, 
ExtractRegexSpanData::Make(options.pattern));
+    return ExtractRegexSpan{data}.Extract(ctx, batch, out);
+  }
+  Status Extract(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    DCHECK_NE(out->array_data(), NULLPTR);
+    std::shared_ptr<DataType> out_type = out->array_data()->type;
+    DCHECK_NE(out_type, NULLPTR);
+    std::unique_ptr<ArrayBuilder> out_builder;
+    ARROW_RETURN_NOT_OK(
+        MakeBuilder(ctx->memory_pool(), out->type()->GetSharedPtr(), 
&out_builder));
+    auto struct_builder = 
checked_pointer_cast<StructBuilder>(std::move(out_builder));
+    std::vector<FixedSizeListBuilder*> span_builders;
+    std::vector<ArrayBuilder*> array_builders;
+    span_builders.reserve(group_count);
+    array_builders.reserve(group_count);
+    for (int i = 0; i < group_count; i++) {
+      span_builders.push_back(
+          
checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder(i)));
+      array_builders.push_back(span_builders[i]->value_builder());
+    }
+    auto visit_null = [&]() { return struct_builder->AppendNull(); };
+    auto visit_value = [&](std::string_view element) -> Status {
+      if (Match(element)) {
+        for (int i = 0; i < group_count; i++) {
+          // https://github.com/google/re2/issues/24#issuecomment-97653183
+          if (found_values[i].data() != NULLPTR) {
+            int64_t begin = found_values[i].data() - element.data();
+            int64_t size = found_values[i].size();
+            if (is_binary_like(batch.GetTypes()[0].id())) {
+              
ARROW_RETURN_NOT_OK(checked_cast<Int32Builder*>(array_builders[i])
+                                      
->AppendValues({static_cast<int32_t>(begin),
+                                                      
static_cast<int32_t>(size)}));
+            } else {
+              
ARROW_RETURN_NOT_OK(checked_cast<Int64Builder*>(array_builders[i])
+                                      ->AppendValues({begin, size}));
+            }
+
+            ARROW_RETURN_NOT_OK(span_builders[i]->Append());
+          } else {
+            ARROW_RETURN_NOT_OK(span_builders[i]->AppendNull());
+          }
+        }
+        ARROW_RETURN_NOT_OK(struct_builder->Append());
+      } else {
+        ARROW_RETURN_NOT_OK(struct_builder->AppendNull());
+      }
+      return Status::OK();
+    };
+    ARROW_RETURN_NOT_OK(
+        VisitArraySpanInline<Type>(batch[0].array, visit_value, visit_null));
+
+    ARROW_ASSIGN_OR_RAISE(auto out_array, struct_builder->Finish());
+    out->value = out_array->data();
+    return Status::OK();
+  }
+};
+
+const FunctionDoc extract_regex_doc_span(
+    "likes extract_regex; however, it contains the position and length of 
results", "",
+    {"strings"}, "ExtractRegexSpanOptions", true);
+
+Result<TypeHolder> resolver(KernelContext* ctx, const std::vector<TypeHolder>& 
types) {

Review Comment:
   let's call this `ResolveExtractRegexSpanOutputType` or something?



##########
cpp/src/arrow/compute/api_scalar.h:
##########
@@ -264,7 +264,17 @@ class ARROW_EXPORT ExtractRegexOptions : public 
FunctionOptions {
   /// Regular expression with named capture fields
   std::string pattern;
 };
+class ARROW_EXPORT ExtractRegexSpanOptions : public FunctionOptions {
+ public:
+  explicit ExtractRegexSpanOptions(std::string pattern);
+  ExtractRegexSpanOptions();
+  static constexpr char const kTypeName[] = "ExtractRegexSpanOptions";
 
+  /// Regular expression with named capture fields
+  std::string pattern;
+
+  /// Shows the matched string

Review Comment:
   Remove this comment?



##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2185,51 +2186,61 @@ void AddAsciiStringReplaceSubstring(FunctionRegistry* 
registry) {
 using ExtractRegexState = OptionsWrapper<ExtractRegexOptions>;
 
 // TODO cache this once per ExtractRegexOptions
-struct ExtractRegexData {
-  // Use unique_ptr<> because RE2 is non-movable (for ARROW_ASSIGN_OR_RAISE)
-  std::unique_ptr<RE2> regex;
-  std::vector<std::string> group_names;
-
+class ExtractRegexData {
+ public:
   static Result<ExtractRegexData> Make(const ExtractRegexOptions& options,
                                        bool is_utf8 = true) {
     ExtractRegexData data(options.pattern, is_utf8);
-    RETURN_NOT_OK(RegexStatus(*data.regex));
-
-    const int group_count = data.regex->NumberOfCapturingGroups();
-    const auto& name_map = data.regex->CapturingGroupNames();
-    data.group_names.reserve(group_count);
-
-    for (int i = 0; i < group_count; i++) {
-      auto item = name_map.find(i + 1);  // re2 starts counting from 1
-      if (item == name_map.end()) {
-        // XXX should we instead just create fields with an empty name?
-        return Status::Invalid("Regular expression contains unnamed groups");
-      }
-      data.group_names.emplace_back(item->second);
-    }
+    ARROW_RETURN_NOT_OK(data.Init());
     return data;
   }
 
   Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) 
const {
     const DataType* input_type = types[0].type;
-    if (input_type == nullptr) {
+    // as mentioned here
+    // 
https://arrow.apache.org/docs/developers/cpp/development.html#code-style-linting-and-ci
+    // nullptr should not be used
+    if (input_type == NULLPTR) {
       // No input type specified
-      return nullptr;
+      return NULLPTR;
     }
     // Input type is either [Large]Binary or [Large]String and is also the type
     // of each field in the output struct type.
     DCHECK(is_base_binary_like(input_type->id()));
     FieldVector fields;
-    fields.reserve(group_names.size());
+    fields.reserve(group_names_.size());
     std::shared_ptr<DataType> owned_type = input_type->GetSharedPtr();
-    std::transform(group_names.begin(), group_names.end(), 
std::back_inserter(fields),
+    std::transform(group_names_.begin(), group_names_.end(), 
std::back_inserter(fields),
                    [&](const std::string& name) { return field(name, 
owned_type); });
-    return struct_(std::move(fields));
+    return struct_(fields);
   }
+  int64_t num_group() const { return group_names_.size(); }
+  std::shared_ptr<RE2> regex() const { return regex_; }
 
- private:
+ protected:

Review Comment:
   Making the fields `protected` seems a bit gratuitous (this is only an 
internal helper class).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-44615: [C++] Add possibility to extract spans/byte offsets directly for compute.extract_regex [arrow]

Reply via email to