pitrou commented on a change in pull request #8990:
URL: https://github.com/apache/arrow/pull/8990#discussion_r603289347



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {

Review comment:
       For readability, can you split this function into different functions 
based on the various input kinds, e.g. `ExecArrayArray`, `ExecArrayScalar`, 
`ExecScalarArray`, `ExecScalarScalar`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {
+      const ListScalar& list = checked_cast<const 
ListScalar&>(*batch[0].scalar());
+      if (!list.is_valid) {
+        return;
+      }
+      if (batch[1].kind() == Datum::SCALAR) {
+        const BaseBinaryScalar& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+        auto Append = [&](util::string_view value) {
+          return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+                                static_cast<offset_type>(value.size()));
+        };
+
+        const ArrayType* strings = static_cast<const 
ArrayType*>(list.value.get());
+        if (strings->null_count() > 0) {
+          // since the input list is not null, the out datum needs to be 
assigned to
+          *out = MakeNullScalar(list.value->type());
+          return;
+        }
+        if (strings->length() > 0) {
+          KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(0)));
+          for (int64_t j = 1; j < strings->length(); j++) {
+            KERNEL_RETURN_IF_ERROR(ctx, Append(separator));
+            KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(j)));
+          }
+        }
+        std::shared_ptr<Buffer> string_buffer;
+
+        KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_buffer));
+        KERNEL_ASSIGN_OR_RAISE(auto scalar_right_type, ctx,
+                               MakeScalar<std::shared_ptr<Buffer>>(
+                                   list.value->type(), 
std::move(string_buffer)));
+        *out = scalar_right_type;
+      }  // do we want to support scalar[list[str]] with array[str] ?
+    } else {
+      const ListArrayType list(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      BuilderType builder(ctx->memory_pool());
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Reserve(list.length()));
+      if (batch[1].kind() == Datum::ARRAY) {
+        ArrayType separator_array(batch[1].array());
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i)) ||
+              separator_array.IsNull(i)) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());
+          } else {
+            const auto separator = separator_array.GetView(i);
+            if (strings->length() > 0) {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.Append(strings->GetView(0)));
+              for (int64_t j = 1; j < strings->length(); j++) {
+                KERNEL_RETURN_IF_ERROR(ctx, builder.AppendCurrent(separator));
+                KERNEL_RETURN_IF_ERROR(ctx, 
builder.AppendCurrent(strings->GetView(j)));
+              }
+            } else {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.AppendEmptyValue());
+            }
+          }
+        }
+      } else if (batch[1].kind() == Datum::SCALAR) {
+        const auto& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          KERNEL_ASSIGN_OR_RAISE(
+              auto nulls, ctx,
+              MakeArrayOfNull(list.value_type(), list.length(), 
ctx->memory_pool()));
+          *output = *nulls->data();
+          output->type = list.value_type();
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i))) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());
+          } else {
+            if (strings->length() > 0) {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.Append(strings->GetView(0)));
+              for (int64_t j = 1; j < strings->length(); j++) {
+                KERNEL_RETURN_IF_ERROR(ctx, builder.AppendCurrent(separator));
+                KERNEL_RETURN_IF_ERROR(ctx, 
builder.AppendCurrent(strings->GetView(j)));
+              }
+            } else {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.AppendEmptyValue());
+            }
+          }
+        }
+      }
+      std::shared_ptr<Array> string_array;
+
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_array));
+      *output = *string_array->data();
+      // correct the output type based on the input
+      output->type = list.value_type();
+    }
+  }
+};
+
+const FunctionDoc binary_join_doc(
+    "Join list of strings together with a `seperator` to form a single string",
+    ("Insert `seperator` between each list element, and concatenate them."),
+    {"list", "separator"});
+
+void AddJoin(FunctionRegistry* registry) {
+  auto func =
+      std::make_shared<ScalarFunction>("binary_join", Arity::Binary(), 
&binary_join_doc);
+  for (const std::shared_ptr<DataType>& ty : BaseBinaryTypes()) {
+    auto exec = GenerateTypeAgnosticVarBinaryBase<Join>(*ty);
+    DCHECK_OK(
+        func->AddKernel({InputType::Array(list(ty)), InputType::Scalar(ty)}, 
ty, exec));
+    DCHECK_OK(
+        func->AddKernel({InputType::Array(list(ty)), InputType::Array(ty)}, 
ty, exec));
+    // do we want to support scalar[list[str]] with array[str] ?

Review comment:
       It doesn't sound very useful. Either way, if it's not supported, then 
simply remove the commented out code.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {
+      const ListScalar& list = checked_cast<const 
ListScalar&>(*batch[0].scalar());
+      if (!list.is_valid) {
+        return;
+      }
+      if (batch[1].kind() == Datum::SCALAR) {
+        const BaseBinaryScalar& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+        auto Append = [&](util::string_view value) {
+          return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+                                static_cast<offset_type>(value.size()));
+        };
+
+        const ArrayType* strings = static_cast<const 
ArrayType*>(list.value.get());
+        if (strings->null_count() > 0) {
+          // since the input list is not null, the out datum needs to be 
assigned to
+          *out = MakeNullScalar(list.value->type());
+          return;
+        }
+        if (strings->length() > 0) {
+          KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(0)));
+          for (int64_t j = 1; j < strings->length(); j++) {
+            KERNEL_RETURN_IF_ERROR(ctx, Append(separator));
+            KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(j)));
+          }
+        }
+        std::shared_ptr<Buffer> string_buffer;
+
+        KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_buffer));
+        KERNEL_ASSIGN_OR_RAISE(auto scalar_right_type, ctx,
+                               MakeScalar<std::shared_ptr<Buffer>>(
+                                   list.value->type(), 
std::move(string_buffer)));
+        *out = scalar_right_type;
+      }  // do we want to support scalar[list[str]] with array[str] ?
+    } else {
+      const ListArrayType list(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      BuilderType builder(ctx->memory_pool());
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Reserve(list.length()));
+      if (batch[1].kind() == Datum::ARRAY) {
+        ArrayType separator_array(batch[1].array());
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i)) ||
+              separator_array.IsNull(i)) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());
+          } else {
+            const auto separator = separator_array.GetView(i);
+            if (strings->length() > 0) {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.Append(strings->GetView(0)));
+              for (int64_t j = 1; j < strings->length(); j++) {
+                KERNEL_RETURN_IF_ERROR(ctx, builder.AppendCurrent(separator));
+                KERNEL_RETURN_IF_ERROR(ctx, 
builder.AppendCurrent(strings->GetView(j)));
+              }
+            } else {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.AppendEmptyValue());
+            }
+          }
+        }
+      } else if (batch[1].kind() == Datum::SCALAR) {
+        const auto& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          KERNEL_ASSIGN_OR_RAISE(
+              auto nulls, ctx,
+              MakeArrayOfNull(list.value_type(), list.length(), 
ctx->memory_pool()));
+          *output = *nulls->data();
+          output->type = list.value_type();
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i))) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());
+          } else {
+            if (strings->length() > 0) {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.Append(strings->GetView(0)));
+              for (int64_t j = 1; j < strings->length(); j++) {
+                KERNEL_RETURN_IF_ERROR(ctx, builder.AppendCurrent(separator));
+                KERNEL_RETURN_IF_ERROR(ctx, 
builder.AppendCurrent(strings->GetView(j)));
+              }
+            } else {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.AppendEmptyValue());
+            }
+          }
+        }
+      }
+      std::shared_ptr<Array> string_array;
+
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_array));
+      *output = *string_array->data();
+      // correct the output type based on the input
+      output->type = list.value_type();
+    }
+  }
+};
+
+const FunctionDoc binary_join_doc(
+    "Join list of strings together with a `seperator` to form a single string",

Review comment:
       "separator"

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {
+      const ListScalar& list = checked_cast<const 
ListScalar&>(*batch[0].scalar());
+      if (!list.is_valid) {
+        return;
+      }
+      if (batch[1].kind() == Datum::SCALAR) {
+        const BaseBinaryScalar& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+        auto Append = [&](util::string_view value) {
+          return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+                                static_cast<offset_type>(value.size()));
+        };
+
+        const ArrayType* strings = static_cast<const 
ArrayType*>(list.value.get());

Review comment:
       Use `checked_cast`

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {

Review comment:
       Note that the computation logic doesn't depend on utf8 vs. binary, hence 
it would be nice to only instantiate two kernel routines (one per offset type), 
not four.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -428,6 +433,26 @@ TYPED_TEST(TestStringKernels, 
StrptimeDoesNotProvideDefaultOptions) {
   ASSERT_RAISES(Invalid, CallFunction("strptime", {input}));
 }
 
+TYPED_TEST(TestStringKernels, Join) {
+  auto separator = this->scalar("--");
+  CheckScalarBinary(
+      "binary_join", list(this->type()),
+      R"([["a", "bb", "ccc"], [], null, ["dd"], ["eee", null], ["ff", ""]])", 
separator,
+      this->type(), R"(["a--bb--ccc", "", null, "dd", null, "ff--"])");

Review comment:
       I'm ok with this behaviour. We can add an option later if we want to 
override it.

##########
File path: docs/source/cpp/compute.rst
##########
@@ -575,6 +575,22 @@ when a positive ``max_splits`` is given.
   as separator.
 
 
+String joining
+~~~~~~~~~~~~~~
+
+The inverse of string splitting:
+
++--------------------------+------------+-------------------------+-------------------+---------+
+| Function name            | Arity      | Input types             | Output 
type       | Notes   |
++==========================+============+=========================+===================+=========+
+| binary_join              | Binary     | List-like               | 
String-like       | \(1)    |

Review comment:
       Since the two inputs must be of different types, it would be nice to 
make this explicit here (perhaps by making two columns "first input type" and 
"second input type"?).

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {
+      const ListScalar& list = checked_cast<const 
ListScalar&>(*batch[0].scalar());
+      if (!list.is_valid) {
+        return;
+      }
+      if (batch[1].kind() == Datum::SCALAR) {
+        const BaseBinaryScalar& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+        auto Append = [&](util::string_view value) {
+          return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+                                static_cast<offset_type>(value.size()));
+        };
+
+        const ArrayType* strings = static_cast<const 
ArrayType*>(list.value.get());
+        if (strings->null_count() > 0) {
+          // since the input list is not null, the out datum needs to be 
assigned to
+          *out = MakeNullScalar(list.value->type());
+          return;
+        }
+        if (strings->length() > 0) {
+          KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(0)));
+          for (int64_t j = 1; j < strings->length(); j++) {
+            KERNEL_RETURN_IF_ERROR(ctx, Append(separator));
+            KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(j)));
+          }
+        }
+        std::shared_ptr<Buffer> string_buffer;
+
+        KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_buffer));
+        KERNEL_ASSIGN_OR_RAISE(auto scalar_right_type, ctx,
+                               MakeScalar<std::shared_ptr<Buffer>>(
+                                   list.value->type(), 
std::move(string_buffer)));
+        *out = scalar_right_type;
+      }  // do we want to support scalar[list[str]] with array[str] ?
+    } else {
+      const ListArrayType list(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      BuilderType builder(ctx->memory_pool());
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Reserve(list.length()));
+      if (batch[1].kind() == Datum::ARRAY) {
+        ArrayType separator_array(batch[1].array());
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i)) ||
+              separator_array.IsNull(i)) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());
+          } else {
+            const auto separator = separator_array.GetView(i);
+            if (strings->length() > 0) {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.Append(strings->GetView(0)));
+              for (int64_t j = 1; j < strings->length(); j++) {
+                KERNEL_RETURN_IF_ERROR(ctx, builder.AppendCurrent(separator));
+                KERNEL_RETURN_IF_ERROR(ctx, 
builder.AppendCurrent(strings->GetView(j)));
+              }
+            } else {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.AppendEmptyValue());
+            }
+          }
+        }
+      } else if (batch[1].kind() == Datum::SCALAR) {
+        const auto& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          KERNEL_ASSIGN_OR_RAISE(
+              auto nulls, ctx,
+              MakeArrayOfNull(list.value_type(), list.length(), 
ctx->memory_pool()));
+          *output = *nulls->data();
+          output->type = list.value_type();
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i))) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());
+          } else {
+            if (strings->length() > 0) {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.Append(strings->GetView(0)));
+              for (int64_t j = 1; j < strings->length(); j++) {
+                KERNEL_RETURN_IF_ERROR(ctx, builder.AppendCurrent(separator));
+                KERNEL_RETURN_IF_ERROR(ctx, 
builder.AppendCurrent(strings->GetView(j)));
+              }
+            } else {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.AppendEmptyValue());
+            }
+          }
+        }
+      }
+      std::shared_ptr<Array> string_array;
+
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_array));
+      *output = *string_array->data();
+      // correct the output type based on the input
+      output->type = list.value_type();
+    }
+  }
+};
+
+const FunctionDoc binary_join_doc(
+    "Join list of strings together with a `seperator` to form a single string",
+    ("Insert `seperator` between each list element, and concatenate them."),

Review comment:
       Can you mention the null behaviour as well?

##########
File path: cpp/src/arrow/array/builder_binary.h
##########
@@ -77,6 +77,25 @@ class BaseBinaryBuilder : public ArrayBuilder {
     return Append(value.data(), static_cast<offset_type>(value.size()));
   }
 
+  /// AppendCurrent does not add a new offset
+  Status AppendCurrent(const uint8_t* value, offset_type length) {

Review comment:
       Call this `AppendToCurrent`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {
+      const ListScalar& list = checked_cast<const 
ListScalar&>(*batch[0].scalar());
+      if (!list.is_valid) {
+        return;
+      }
+      if (batch[1].kind() == Datum::SCALAR) {
+        const BaseBinaryScalar& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+        auto Append = [&](util::string_view value) {
+          return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+                                static_cast<offset_type>(value.size()));
+        };
+
+        const ArrayType* strings = static_cast<const 
ArrayType*>(list.value.get());
+        if (strings->null_count() > 0) {
+          // since the input list is not null, the out datum needs to be 
assigned to
+          *out = MakeNullScalar(list.value->type());
+          return;
+        }
+        if (strings->length() > 0) {
+          KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(0)));
+          for (int64_t j = 1; j < strings->length(); j++) {
+            KERNEL_RETURN_IF_ERROR(ctx, Append(separator));
+            KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(j)));
+          }
+        }
+        std::shared_ptr<Buffer> string_buffer;
+
+        KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_buffer));
+        KERNEL_ASSIGN_OR_RAISE(auto scalar_right_type, ctx,
+                               MakeScalar<std::shared_ptr<Buffer>>(
+                                   list.value->type(), 
std::move(string_buffer)));
+        *out = scalar_right_type;
+      }  // do we want to support scalar[list[str]] with array[str] ?
+    } else {
+      const ListArrayType list(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      BuilderType builder(ctx->memory_pool());
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Reserve(list.length()));
+      if (batch[1].kind() == Datum::ARRAY) {
+        ArrayType separator_array(batch[1].array());
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i)) ||
+              separator_array.IsNull(i)) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());
+          } else {
+            const auto separator = separator_array.GetView(i);
+            if (strings->length() > 0) {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.Append(strings->GetView(0)));
+              for (int64_t j = 1; j < strings->length(); j++) {
+                KERNEL_RETURN_IF_ERROR(ctx, builder.AppendCurrent(separator));
+                KERNEL_RETURN_IF_ERROR(ctx, 
builder.AppendCurrent(strings->GetView(j)));
+              }
+            } else {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.AppendEmptyValue());

Review comment:
       or `UnsafeAppend("", 0)`...

##########
File path: docs/source/cpp/compute.rst
##########
@@ -575,6 +575,22 @@ when a positive ``max_splits`` is given.
   as separator.
 
 
+String joining
+~~~~~~~~~~~~~~
+
+The inverse of string splitting:
+
++--------------------------+------------+-------------------------+-------------------+---------+
+| Function name            | Arity      | Input types             | Output 
type       | Notes   |
++==========================+============+=========================+===================+=========+
+| binary_join              | Binary     | List-like               | 
String-like       | \(1)    |
++--------------------------+------------+-------------------------+-------------------+---------+
+
+* \(1) First argument must be an array, while the second argument (`separator`)
+can be a scalar or array. When the `separator` argument is an array, the join 
is
+performed by each corresponding element.

Review comment:
       Be careful with indentation in bullet lists, you need the continuation 
lines to be indented as the first line after the `*`:
   ```
   * \(1) First argument must be an array, while the second argument 
(`separator`)
     can be a scalar or array. When the `separator` argument is an array, the 
join is
     performed by each corresponding element.
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {
+      const ListScalar& list = checked_cast<const 
ListScalar&>(*batch[0].scalar());
+      if (!list.is_valid) {
+        return;
+      }
+      if (batch[1].kind() == Datum::SCALAR) {
+        const BaseBinaryScalar& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+        auto Append = [&](util::string_view value) {
+          return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+                                static_cast<offset_type>(value.size()));
+        };
+
+        const ArrayType* strings = static_cast<const 
ArrayType*>(list.value.get());
+        if (strings->null_count() > 0) {
+          // since the input list is not null, the out datum needs to be 
assigned to
+          *out = MakeNullScalar(list.value->type());
+          return;
+        }
+        if (strings->length() > 0) {
+          KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(0)));
+          for (int64_t j = 1; j < strings->length(); j++) {
+            KERNEL_RETURN_IF_ERROR(ctx, Append(separator));
+            KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(j)));
+          }
+        }
+        std::shared_ptr<Buffer> string_buffer;
+
+        KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_buffer));
+        KERNEL_ASSIGN_OR_RAISE(auto scalar_right_type, ctx,
+                               MakeScalar<std::shared_ptr<Buffer>>(
+                                   list.value->type(), 
std::move(string_buffer)));
+        *out = scalar_right_type;
+      }  // do we want to support scalar[list[str]] with array[str] ?
+    } else {
+      const ListArrayType list(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      BuilderType builder(ctx->memory_pool());
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Reserve(list.length()));
+      if (batch[1].kind() == Datum::ARRAY) {
+        ArrayType separator_array(batch[1].array());
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i)) ||
+              separator_array.IsNull(i)) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());

Review comment:
       Can be `UnsafeAppendNull`, since you reserved the space above.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {
+      const ListScalar& list = checked_cast<const 
ListScalar&>(*batch[0].scalar());
+      if (!list.is_valid) {
+        return;
+      }
+      if (batch[1].kind() == Datum::SCALAR) {
+        const BaseBinaryScalar& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+        auto Append = [&](util::string_view value) {
+          return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+                                static_cast<offset_type>(value.size()));
+        };
+
+        const ArrayType* strings = static_cast<const 
ArrayType*>(list.value.get());
+        if (strings->null_count() > 0) {
+          // since the input list is not null, the out datum needs to be 
assigned to
+          *out = MakeNullScalar(list.value->type());
+          return;
+        }
+        if (strings->length() > 0) {
+          KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(0)));
+          for (int64_t j = 1; j < strings->length(); j++) {
+            KERNEL_RETURN_IF_ERROR(ctx, Append(separator));
+            KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(j)));
+          }
+        }
+        std::shared_ptr<Buffer> string_buffer;
+
+        KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_buffer));
+        KERNEL_ASSIGN_OR_RAISE(auto scalar_right_type, ctx,
+                               MakeScalar<std::shared_ptr<Buffer>>(
+                                   list.value->type(), 
std::move(string_buffer)));
+        *out = scalar_right_type;
+      }  // do we want to support scalar[list[str]] with array[str] ?
+    } else {
+      const ListArrayType list(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      BuilderType builder(ctx->memory_pool());
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Reserve(list.length()));
+      if (batch[1].kind() == Datum::ARRAY) {
+        ArrayType separator_array(batch[1].array());
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);
+          const ArrayType* strings = static_cast<const 
ArrayType*>(slice.get());
+          if ((strings->null_count() > 0) || (list.IsNull(i)) ||
+              separator_array.IsNull(i)) {
+            KERNEL_RETURN_IF_ERROR(ctx, builder.AppendNull());
+          } else {
+            const auto separator = separator_array.GetView(i);
+            if (strings->length() > 0) {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.Append(strings->GetView(0)));
+              for (int64_t j = 1; j < strings->length(); j++) {
+                KERNEL_RETURN_IF_ERROR(ctx, builder.AppendCurrent(separator));
+                KERNEL_RETURN_IF_ERROR(ctx, 
builder.AppendCurrent(strings->GetView(j)));
+              }
+            } else {
+              KERNEL_RETURN_IF_ERROR(ctx, builder.AppendEmptyValue());
+            }
+          }
+        }
+      } else if (batch[1].kind() == Datum::SCALAR) {
+        const auto& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          KERNEL_ASSIGN_OR_RAISE(
+              auto nulls, ctx,
+              MakeArrayOfNull(list.value_type(), list.length(), 
ctx->memory_pool()));
+          *output = *nulls->data();
+          output->type = list.value_type();
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);

Review comment:
       Same comments as above wrt. efficiency.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1828,6 +1828,148 @@ void AddUtf8Length(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// binary join
+
+template <typename Type>
+struct Join {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ListArrayType = ListArray;
+  using offset_type = typename Type::offset_type;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::SCALAR) {
+      const ListScalar& list = checked_cast<const 
ListScalar&>(*batch[0].scalar());
+      if (!list.is_valid) {
+        return;
+      }
+      if (batch[1].kind() == Datum::SCALAR) {
+        const BaseBinaryScalar& separator_scalar =
+            checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
+        if (!separator_scalar.is_valid) {
+          return;
+        }
+        util::string_view separator(*separator_scalar.value);
+
+        TypedBufferBuilder<uint8_t> builder(ctx->memory_pool());
+        auto Append = [&](util::string_view value) {
+          return builder.Append(reinterpret_cast<const uint8_t*>(value.data()),
+                                static_cast<offset_type>(value.size()));
+        };
+
+        const ArrayType* strings = static_cast<const 
ArrayType*>(list.value.get());
+        if (strings->null_count() > 0) {
+          // since the input list is not null, the out datum needs to be 
assigned to
+          *out = MakeNullScalar(list.value->type());
+          return;
+        }
+        if (strings->length() > 0) {
+          KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(0)));
+          for (int64_t j = 1; j < strings->length(); j++) {
+            KERNEL_RETURN_IF_ERROR(ctx, Append(separator));
+            KERNEL_RETURN_IF_ERROR(ctx, Append(strings->GetView(j)));
+          }
+        }
+        std::shared_ptr<Buffer> string_buffer;
+
+        KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_buffer));
+        KERNEL_ASSIGN_OR_RAISE(auto scalar_right_type, ctx,
+                               MakeScalar<std::shared_ptr<Buffer>>(
+                                   list.value->type(), 
std::move(string_buffer)));
+        *out = scalar_right_type;
+      }  // do we want to support scalar[list[str]] with array[str] ?
+    } else {
+      const ListArrayType list(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      BuilderType builder(ctx->memory_pool());
+      KERNEL_RETURN_IF_ERROR(ctx, builder.Reserve(list.length()));
+      if (batch[1].kind() == Datum::ARRAY) {
+        ArrayType separator_array(batch[1].array());
+        for (int64_t i = 0; i < list.length(); ++i) {
+          const std::shared_ptr<Array> slice = list.value_slice(i);

Review comment:
       I think it would be much faster to avoid instantiating this temporary 
array for each list element.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to