xhochy commented on a change in pull request #7593:
URL: https://github.com/apache/arrow/pull/7593#discussion_r448909535
##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -259,6 +259,15 @@ Result<Datum> IsValid(const Datum& values, ExecContext*
ctx = NULLPTR);
ARROW_EXPORT
Result<Datum> IsNull(const Datum& values, ExecContext* ctx = NULLPTR);
+// ----------------------------------------------------------------------
+// String functions
+
+struct ARROW_EXPORT ContainsExactOptions : public FunctionOptions {
+ explicit ContainsExactOptions(std::string pattern = "") : pattern(pattern) {}
Review comment:
This is needed to be able to construct an instance of it in Cython. I
can remove this and switch the Cython instantiation to a pointer.
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -297,6 +297,116 @@ void AddAsciiLength(FunctionRegistry* registry) {
DCHECK_OK(registry->AddFunction(std::move(func)));
}
+// ----------------------------------------------------------------------
+// exact pattern detection
+
+using StrToBoolTransformFunc =
+ std::function<void(const void*, const uint8_t*, int64_t, int64_t,
uint8_t*)>;
+
+// Apply `transform` to input character data- this function cannot change the
+// length
+template <typename Type>
+void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
+ StrToBoolTransformFunc transform, Datum* out) {
+ using offset_type = typename Type::offset_type;
+
+ if (batch[0].kind() == Datum::ARRAY) {
+ const ArrayData& input = *batch[0].array();
+ ArrayData* out_arr = out->mutable_array();
+ if (input.length > 0) {
+ transform(
+ reinterpret_cast<const offset_type*>(input.buffers[1]->data()) +
input.offset,
+ input.buffers[2]->data(), input.length, out_arr->offset,
+ out_arr->buffers[1]->mutable_data());
+ }
+ } else {
+ const auto& input = checked_cast<const
BaseBinaryScalar&>(*batch[0].scalar());
+ auto result =
checked_pointer_cast<BooleanScalar>(MakeNullScalar(out->type()));
+ uint8_t result_value = 0;
+ if (input.is_valid) {
+ result->is_valid = true;
+ std::array<offset_type, 2> offsets{0,
+
static_cast<offset_type>(input.value->size())};
+ transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
+ &result_value);
+ out->value = std::make_shared<BooleanScalar>(result_value > 0);
+ }
+ }
+}
+
+template <typename offset_type>
+void TransformContainsExact(const uint8_t* pattern, int64_t pattern_length,
+ const offset_type* offsets, const uint8_t* data,
+ int64_t length, int64_t output_offset, uint8_t*
output) {
+ // This is an implementation of the Knuth-Morris-Pratt algorithm
+
+ // Phase 1: Build the prefix table
+ std::vector<offset_type> prefix_table(pattern_length + 1);
+ offset_type prefix_length = -1;
+ prefix_table[0] = -1;
+ for (offset_type pos = 0; pos < pattern_length; ++pos) {
+ // The prefix cannot be expanded, reset.
+ if (prefix_length >= 0 && pattern[pos] != pattern[prefix_length]) {
+ prefix_length = prefix_table[prefix_length];
+ }
+ prefix_length++;
+ prefix_table[pos + 1] = prefix_length;
+ }
+
+ // Phase 2: Find the prefix in the data
+ FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
+ for (int64_t i = 0; i < length; ++i) {
+ const uint8_t* current_data = data + offsets[i];
+ int64_t current_length = offsets[i + 1] - offsets[i];
+
+ int64_t pattern_pos = 0;
+ for (int64_t k = 0; k < current_length; k++) {
+ if (pattern[pattern_pos] == current_data[k]) {
+ pattern_pos++;
+ if (pattern_pos == pattern_length) {
+ bitmap_writer.Set();
+ break;
+ }
+ } else {
+ pattern_pos = std::max<offset_type>(0, prefix_table[pattern_pos]);
+ }
+ }
+ bitmap_writer.Next();
+ }
+ bitmap_writer.Finish();
Review comment:
The bit vector is 0-initialised, so this is not needed
(`FirstTimeBitmapWriter::Clear` is no-op).
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]