pitrou commented on code in PR #45577: URL: https://github.com/apache/arrow/pull/45577#discussion_r1969153467
##########
cpp/src/arrow/compute/kernels/scalar_string_ascii.cc:
##########
@@ -2347,6 +2357,126 @@ void AddAsciiStringExtractRegex(FunctionRegistry* registry) {
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
+class ExtractRegexSpanData : public ExtractRegexData {
+ public:
+  static Result<ExtractRegexSpanData> Make(const std::string& pattern) {
+    auto data = ExtractRegexSpanData(pattern, true);
+    ARROW_RETURN_NOT_OK(data.Init());
+    return data;
+  }
+
+  Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) const {
+    const DataType* input_type = types[0].type;
+    if (input_type == NULLPTR) {
+      return NULLPTR;
+    }
+    DCHECK(is_base_binary_like(input_type->id()));
+    const size_t field_count = group_names_.size();
+    FieldVector fields;
+    fields.reserve(field_count);
+    const auto owned_type = input_type->GetSharedPtr();
+    for (const auto& group_name : group_names_) {
+      auto type = is_binary_like(owned_type->id()) ? int32() : int64();
+      // size list is 2 as every span contains position and length
+      fields.push_back(field(group_name + "_span", fixed_size_list(type, 2)));
+    }
+    return struct_(fields);
+  }
+
+ private:
+  ExtractRegexSpanData(const std::string& pattern, const bool is_utf8)
+      : ExtractRegexData(pattern, is_utf8) {}
+};
+
+template <typename Type>
+struct ExtractRegexSpan : ExtractRegexBase {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ExtractRegexBase::ExtractRegexBase;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    auto options = OptionsWrapper<ExtractRegexSpanOptions>::Get(ctx);
+    ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexSpanData::Make(options.pattern));
+    return ExtractRegexSpan{data}.Extract(ctx, batch, out);
+  }
+  Status Extract(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    DCHECK_NE(out->array_data(), NULLPTR);
+    std::shared_ptr<DataType> out_type = out->array_data()->type;
+    DCHECK_NE(out_type, NULLPTR);
+    std::unique_ptr<ArrayBuilder> out_builder;
+    ARROW_RETURN_NOT_OK(
+        MakeBuilder(ctx->memory_pool(), out->type()->GetSharedPtr(), &out_builder));
+    auto struct_builder = checked_pointer_cast<StructBuilder>(std::move(out_builder));
+    std::vector<FixedSizeListBuilder*> span_builders;
+    std::vector<ArrayBuilder*> array_builders;
+    span_builders.reserve(group_count);
+    array_builders.reserve(group_count);
+    for (int i = 0; i < group_count; i++) {
+      span_builders.push_back(
+          checked_cast<FixedSizeListBuilder*>(struct_builder->field_builder(i)));
+      array_builders.push_back(span_builders[i]->value_builder());
+    }
+    auto visit_null = [&]() { return struct_builder->AppendNull(); };
+    auto visit_value = [&](std::string_view element) -> Status {
+      if (Match(element)) {
+        for (int i = 0; i < group_count; i++) {
+          // https://github.com/google/re2/issues/24#issuecomment-97653183
+          if (found_values[i].data() != NULLPTR) {
+            int64_t begin = found_values[i].data() - element.data();
+            int64_t size = found_values[i].size();
+            if (is_binary_like(batch.GetTypes()[0].id())) {
+              ARROW_RETURN_NOT_OK(checked_cast<Int32Builder*>(array_builders[i])
+                                      ->AppendValues({static_cast<int32_t>(begin),

Review Comment:
   Ah, even if `UnsafeAppend` doesn't exist for `FixedSizeListBuilder`, `Reserve` is still useful to avoid several reallocations when appending.

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org