[GitHub] [arrow] maartenbreddels commented on a change in pull request #8271: ARROW-9991: [C++] split kernels for strings/binary

GitBox Wed, 07 Oct 2020 02:27:25 -0700


maartenbreddels commented on a change in pull request #8271:
URL: https://github.com/apache/arrow/pull/8271#discussion_r500868020




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -809,6 +809,475 @@ struct IsUpperAscii : CharacterPredicateAscii<IsUpperAscii> {
   }
 };
 
+// splitting
+
+template <typename Type, typename ListType, typename Options, typename Derived>
+struct SplitBaseTransform {
+  // TODO: assert offsets types are the same?
+  using offset_type = typename Type::offset_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ArrayListType = typename TypeTraits<ListType>::ArrayType;
+  using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  using Builder = typename TypeTraits<Type>::BuilderType;
+  using State = OptionsWrapper<Options>;
+
+  static void Split(const uint8_t* input_string, offset_type input_string_nbytes,
+                    offset_type** output_string_offsets, offset_type* string_output_count,
+                    offset_type* string_output_offset, uint8_t** output_string_data,
+                    const Options& options) {
+    const uint8_t* begin = input_string;
+    const uint8_t* end = begin + input_string_nbytes;
+
+    int64_t max_splits = options.max_splits;
+    // if there is no max splits, reversing does not make sense (and is probably less
+    // efficient), but is useful for testing
+    if (options.reverse) {
+      // note that i points 1 further than the 'current'
+      const uint8_t* i = end;
+      // we will record the parts in reverse order
+      std::vector<std::pair<const uint8_t*, const uint8_t*>> parts;
+      if (max_splits > -1) {
+        parts.reserve(max_splits + 1);
+      }
+      while (max_splits != 0) {
+        const uint8_t *separator_begin, *separator_end;
+        // find with whatever algo the part we will 'cut out'
+        if (Derived::FindReverse(begin, i, &separator_begin, &separator_end, options)) {
+          parts.emplace_back(separator_end, i);
+          i = separator_begin;
+          max_splits--;
+        } else {
+          // if we cannot find a separator, we're done
+          break;
+        }
+      }
+      parts.emplace_back(begin, i);
+      // now we do the copying
+      for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+        auto part = *it;
+        // copy the string data
+        for (auto j = part.first; j < part.second; j++) {
+          *(*output_string_data)++ = *j;
+          (*string_output_offset)++;
+        }
+        // write out the string entry (offset)
+        *(*output_string_offsets)++ = *string_output_offset;
+        (*string_output_count)++;
+      }
+    } else {
+      const uint8_t* i = begin;
+      while (max_splits != 0) {
+        const uint8_t *separator_begin, *separator_end;
+        // find with whatever algo the part we will 'cut out'
+        if (Derived::Find(i, end, &separator_begin, &separator_end, options)) {
+          // copy the part till the beginning of the 'cut'
+          while (i < separator_begin) {
+            *(*output_string_data)++ = *i++;
+            (*string_output_offset)++;
+          }
+          // 'finish' the string by writing the offset
+          *(*output_string_offsets)++ = *string_output_offset;
+          (*string_output_count)++;
+          // jump of the part we cut out
+          i = separator_end;
+          max_splits--;
+        } else {
+          // if we cannot find a separator, we're done
+          break;
+        }
+      }
+      // copy bytes after the pattern
+      while (i < end) {
+        *(*output_string_data)++ = *i++;
+        (*string_output_offset)++;
+      }
+      // and write out the trailing part (can be an empty string)
+      *(*output_string_offsets)++ = *string_output_offset;
+      (*string_output_count)++;
+    }
+  }
+  static Status CheckOptions(const Options& options) { return Status::OK(); }
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();  // only needed for unicode
+    Options options = State::Get(ctx);
+    KERNEL_RETURN_IF_ERROR(ctx, Derived::CheckOptions(options));
+
+    if (batch[0].kind() == Datum::ARRAY) {
+      const ArrayData& input = *batch[0].array();
+      ArrayType input_boxed(batch[0].array());
+      ArrayData* output_list_data = out->mutable_array();
+
+      offset_type input_nbytes = input_boxed.total_values_length();
+      offset_type input_nstrings = static_cast<offset_type>(input.length);
+
+      offset_type output_nbytes_max = input_nbytes;
+      int64_t output_nstrings_max = Derived::CalculateMaxSplits(input_boxed, options);
+      if (output_nstrings_max > std::numeric_limits<offset_type>::max()) {
+        ctx->SetStatus(
+            Status::CapacityError("Result might not fit in a 32bit list or string array, "
+                                  "convert to large_utf8"));
+        return;
+      }
+
+      // Why is the offset buffer not preallocated?

Review comment:
       done https://issues.apache.org/jira/browse/ARROW-10207




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] maartenbreddels commented on a change in pull request #8271: ARROW-9991: [C++] split kernels for strings/binary

Reply via email to