[GitHub] [arrow] pitrou commented on a change in pull request #8271: ARROW-9991: [C++] split kernels for strings/binary

GitBox Thu, 08 Oct 2020 03:22:24 -0700


pitrou commented on a change in pull request #8271:
URL: https://github.com/apache/arrow/pull/8271#discussion_r501601274




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -809,6 +812,418 @@ struct IsUpperAscii : 
CharacterPredicateAscii<IsUpperAscii> {
   }
 };
 
+// splitting
+
+template <typename Type, typename ListType, typename Options, typename Derived>
+struct SplitBaseTransform {
+  // TODO: assert offsets types are the same?

Review comment:
       They don't need to, so no?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -809,6 +812,418 @@ struct IsUpperAscii : 
CharacterPredicateAscii<IsUpperAscii> {
   }
 };
 
+// splitting
+
+template <typename Type, typename ListType, typename Options, typename Derived>
+struct SplitBaseTransform {
+  // TODO: assert offsets types are the same?
+  using string_offset_type = typename Type::offset_type;
+  using list_offset_type = typename ListType::offset_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ArrayListType = typename TypeTraits<ListType>::ArrayType;
+  using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
+  using State = OptionsWrapper<Options>;
+
+  std::vector<util::string_view> parts;
+  Options options;
+
+  explicit SplitBaseTransform(Options options) : options(options) {}
+
+  Status Split(const util::string_view& s, BuilderType* builder) {
+    const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
+    const uint8_t* end = begin + s.length();
+
+    int64_t max_splits = options.max_splits;
+    // if there is no max splits, reversing does not make sense (and is 
probably less
+    // efficient), but is useful for testing
+    if (options.reverse) {
+      // note that i points 1 further than the 'current'
+      const uint8_t* i = end;
+      // we will record the parts in reverse order
+      if (max_splits > -1) {
+        parts.reserve(max_splits + 1);
+      }
+      while (max_splits != 0) {
+        const uint8_t *separator_begin, *separator_end;
+        // find with whatever algo the part we will 'cut out'
+        if (static_cast<Derived&>(*this).FindReverse(begin, i, 
&separator_begin,
+                                                     &separator_end, options)) 
{
+          parts.emplace_back(reinterpret_cast<const char*>(separator_end),
+                             i - separator_end);
+          i = separator_begin;
+          max_splits--;
+        } else {
+          // if we cannot find a separator, we're done
+          break;
+        }
+      }
+      parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
+      // now we do the copying
+      for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+        RETURN_NOT_OK(builder->Append(*it));
+      }
+      parts.clear();
+    } else {
+      const uint8_t* i = begin;
+      while (max_splits != 0) {
+        const uint8_t *separator_begin, *separator_end;
+        // find with whatever algo the part we will 'cut out'
+        if (static_cast<Derived&>(*this).Find(i, end, &separator_begin, 
&separator_end,
+                                              options)) {
+          // the part till the beginning of the 'cut'
+          RETURN_NOT_OK(
+              builder->Append(i, 
static_cast<string_offset_type>(separator_begin - i)));
+          i = separator_end;
+          max_splits--;
+        } else {
+          // if we cannot find a separator, we're done
+          break;
+        }
+      }
+      // trailing part
+      RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - 
i)));
+    }
+    return Status::OK();
+  }
+  static Status CheckOptions(const Options& options) { return Status::OK(); }
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Options options = State::Get(ctx);
+    Derived splitter(options);  // we make an instance to reuse the parts 
vectors
+    splitter.Split(ctx, batch, out);
+  }
+
+  void Split(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();  // only needed for unicode
+    KERNEL_RETURN_IF_ERROR(ctx, Derived::CheckOptions(options));
+
+    if (batch[0].kind() == Datum::ARRAY) {
+      const ArrayData& input = *batch[0].array();
+      ArrayType input_boxed(batch[0].array());
+
+      string_offset_type input_nbytes = input_boxed.total_values_length();
+      string_offset_type input_nstrings = 
static_cast<string_offset_type>(input.length);
+
+      string_offset_type output_nbytes_max = input_nbytes;
+      int64_t output_nstrings_max = Derived::CalculateMaxSplits(input_boxed, 
options);
+      if (output_nstrings_max > std::numeric_limits<list_offset_type>::max()) {
+        ctx->SetStatus(
+            Status::CapacityError("Result might not fit in a 32bit list or 
string array, "
+                                  "convert to large_utf8"));
+        return;
+      }
+
+      BuilderType builder(input.type, ctx->memory_pool());
+      KERNEL_RETURN_IF_ERROR(ctx, builder.ReserveData(output_nbytes_max));
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Resize(output_nstrings_max));

Review comment:
       I'm skeptical this is a good idea to reserve `output_nstrings_max`, as 
this will allocate much more than necessary in most cases.

##########
File path: docs/source/cpp/compute.rst
##########
@@ -393,6 +393,34 @@ Containment tests
 * \(3) Output is true iff the corresponding input element is equal to one
   of the elements in :member:`SetLookupOptions::value_set`.
 
+
+String splitting
+~~~~~~~~~~~~~~~~
+
+These functions split strings into list of strings. All kernels take a
+max_splits and a reverse argument, where max_splits == -1 means no limit. When
+reverse is true, the splitting is done starting from the end of the string. 
Note
+that when max_splits == -1, reverse has no effect.
+
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| Function name            | Arity      | Input types             | Output 
type       | Options class                    | Notes   |
++==========================+============+=========================+===================+==================================+=========+
+| split_pattern            | Unary      | String-like             | List-like  
       | :struct:`SplitPatternOptions`    | \(1)    |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| utf8_split_whitespace    | Unary      | String-like             | List-like  
       | :struct:`SplitOptions`           | \(2)    |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| ascii_split_whitespace   | Unary      | String-like             | List-like  
       | :struct:`SplitOptions`           | \(3)    |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+
+* \(1) The string is split when an exact pattern is found (the pattern itself 
is
+not included in the output).
+
+* \(2) A non-zero length sequence of Unicode defined whitespace codepoints is 
seen as separator.
+
+* \(3) A non-zero length sequence of ASCII defined whitespace bytes (\t, \n \v,
+\f, \r  and ' ') is seen as separator.

Review comment:
       I think you'll need to escape the literal strings here (`\t` etc.).

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -809,6 +812,418 @@ struct IsUpperAscii : 
CharacterPredicateAscii<IsUpperAscii> {
   }
 };
 
+// splitting
+
+template <typename Type, typename ListType, typename Options, typename Derived>
+struct SplitBaseTransform {
+  // TODO: assert offsets types are the same?
+  using string_offset_type = typename Type::offset_type;
+  using list_offset_type = typename ListType::offset_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ArrayListType = typename TypeTraits<ListType>::ArrayType;
+  using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
+  using State = OptionsWrapper<Options>;
+
+  std::vector<util::string_view> parts;
+  Options options;
+
+  explicit SplitBaseTransform(Options options) : options(options) {}
+
+  Status Split(const util::string_view& s, BuilderType* builder) {
+    const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
+    const uint8_t* end = begin + s.length();
+
+    int64_t max_splits = options.max_splits;
+    // if there is no max splits, reversing does not make sense (and is 
probably less
+    // efficient), but is useful for testing
+    if (options.reverse) {
+      // note that i points 1 further than the 'current'
+      const uint8_t* i = end;
+      // we will record the parts in reverse order
+      if (max_splits > -1) {
+        parts.reserve(max_splits + 1);
+      }
+      while (max_splits != 0) {
+        const uint8_t *separator_begin, *separator_end;
+        // find with whatever algo the part we will 'cut out'
+        if (static_cast<Derived&>(*this).FindReverse(begin, i, 
&separator_begin,
+                                                     &separator_end, options)) 
{
+          parts.emplace_back(reinterpret_cast<const char*>(separator_end),
+                             i - separator_end);
+          i = separator_begin;
+          max_splits--;
+        } else {
+          // if we cannot find a separator, we're done
+          break;
+        }
+      }
+      parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
+      // now we do the copying
+      for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+        RETURN_NOT_OK(builder->Append(*it));
+      }
+      parts.clear();
+    } else {
+      const uint8_t* i = begin;
+      while (max_splits != 0) {
+        const uint8_t *separator_begin, *separator_end;
+        // find with whatever algo the part we will 'cut out'
+        if (static_cast<Derived&>(*this).Find(i, end, &separator_begin, 
&separator_end,
+                                              options)) {
+          // the part till the beginning of the 'cut'
+          RETURN_NOT_OK(
+              builder->Append(i, 
static_cast<string_offset_type>(separator_begin - i)));
+          i = separator_end;
+          max_splits--;
+        } else {
+          // if we cannot find a separator, we're done
+          break;
+        }
+      }
+      // trailing part
+      RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - 
i)));
+    }
+    return Status::OK();
+  }
+  static Status CheckOptions(const Options& options) { return Status::OK(); }
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Options options = State::Get(ctx);
+    Derived splitter(options);  // we make an instance to reuse the parts 
vectors
+    splitter.Split(ctx, batch, out);
+  }
+
+  void Split(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();  // only needed for unicode
+    KERNEL_RETURN_IF_ERROR(ctx, Derived::CheckOptions(options));
+
+    if (batch[0].kind() == Datum::ARRAY) {
+      const ArrayData& input = *batch[0].array();
+      ArrayType input_boxed(batch[0].array());
+
+      string_offset_type input_nbytes = input_boxed.total_values_length();
+      string_offset_type input_nstrings = 
static_cast<string_offset_type>(input.length);
+
+      string_offset_type output_nbytes_max = input_nbytes;
+      int64_t output_nstrings_max = Derived::CalculateMaxSplits(input_boxed, 
options);
+      if (output_nstrings_max > std::numeric_limits<list_offset_type>::max()) {

Review comment:
       I don't think this should be done statically. Instead, you should 
accumulate until the capacity limit is reached, and then raise the error.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -809,6 +812,418 @@ struct IsUpperAscii : 
CharacterPredicateAscii<IsUpperAscii> {
   }
 };
 
+// splitting
+
+template <typename Type, typename ListType, typename Options, typename Derived>
+struct SplitBaseTransform {
+  // TODO: assert offsets types are the same?
+  using string_offset_type = typename Type::offset_type;
+  using list_offset_type = typename ListType::offset_type;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ArrayListType = typename TypeTraits<ListType>::ArrayType;
+  using ListScalarType = typename TypeTraits<ListType>::ScalarType;
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
+  using State = OptionsWrapper<Options>;
+
+  std::vector<util::string_view> parts;
+  Options options;
+
+  explicit SplitBaseTransform(Options options) : options(options) {}
+
+  Status Split(const util::string_view& s, BuilderType* builder) {
+    const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
+    const uint8_t* end = begin + s.length();
+
+    int64_t max_splits = options.max_splits;
+    // if there is no max splits, reversing does not make sense (and is 
probably less
+    // efficient), but is useful for testing
+    if (options.reverse) {
+      // note that i points 1 further than the 'current'
+      const uint8_t* i = end;
+      // we will record the parts in reverse order
+      if (max_splits > -1) {
+        parts.reserve(max_splits + 1);
+      }
+      while (max_splits != 0) {
+        const uint8_t *separator_begin, *separator_end;
+        // find with whatever algo the part we will 'cut out'
+        if (static_cast<Derived&>(*this).FindReverse(begin, i, 
&separator_begin,
+                                                     &separator_end, options)) 
{
+          parts.emplace_back(reinterpret_cast<const char*>(separator_end),
+                             i - separator_end);
+          i = separator_begin;
+          max_splits--;
+        } else {
+          // if we cannot find a separator, we're done
+          break;
+        }
+      }
+      parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
+      // now we do the copying
+      for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
+        RETURN_NOT_OK(builder->Append(*it));
+      }
+      parts.clear();
+    } else {
+      const uint8_t* i = begin;
+      while (max_splits != 0) {
+        const uint8_t *separator_begin, *separator_end;
+        // find with whatever algo the part we will 'cut out'
+        if (static_cast<Derived&>(*this).Find(i, end, &separator_begin, 
&separator_end,
+                                              options)) {
+          // the part till the beginning of the 'cut'
+          RETURN_NOT_OK(
+              builder->Append(i, 
static_cast<string_offset_type>(separator_begin - i)));
+          i = separator_end;
+          max_splits--;
+        } else {
+          // if we cannot find a separator, we're done
+          break;
+        }
+      }
+      // trailing part
+      RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - 
i)));
+    }
+    return Status::OK();
+  }
+  static Status CheckOptions(const Options& options) { return Status::OK(); }
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Options options = State::Get(ctx);
+    Derived splitter(options);  // we make an instance to reuse the parts 
vectors
+    splitter.Split(ctx, batch, out);
+  }
+
+  void Split(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();  // only needed for unicode
+    KERNEL_RETURN_IF_ERROR(ctx, Derived::CheckOptions(options));
+
+    if (batch[0].kind() == Datum::ARRAY) {
+      const ArrayData& input = *batch[0].array();
+      ArrayType input_boxed(batch[0].array());
+
+      string_offset_type input_nbytes = input_boxed.total_values_length();
+      string_offset_type input_nstrings = 
static_cast<string_offset_type>(input.length);
+
+      string_offset_type output_nbytes_max = input_nbytes;
+      int64_t output_nstrings_max = Derived::CalculateMaxSplits(input_boxed, 
options);

Review comment:
       You should also take into account `options.max_splits`, no?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #8271: ARROW-9991: [C++] split kernels for strings/binary

Reply via email to