pitrou commented on a change in pull request #10806:
URL: https://github.com/apache/arrow/pull/10806#discussion_r692134196
##########
File path: cpp/src/arrow/array/builder_base.h
##########
@@ -123,6 +123,23 @@ class ARROW_EXPORT ArrayBuilder {
Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
Status AppendScalars(const ScalarVector& scalars);
+ /// \brief Append a range of values from an array
+ Status AppendArraySlice(const ArrayData& array, const int64_t offset,
+ const int64_t length) {
+ if (!type()->Equals(*array.type)) {
+ // TODO: test this
+ return Status::TypeError("Expected array of type ", *type(),
+ " but got array of type ", *array.type);
+ }
+ return AppendArraySliceUnchecked(array, offset, length);
+ }
+ /// \brief Append a range of values from an array without checking type
compatibility
+ virtual Status AppendArraySliceUnchecked(const ArrayData& array, const
int64_t offset,
+ const int64_t length) {
+ return Status::NotImplemented("AppendArraySliceUnchecked for builder for
", *type());
+ }
+ // TODO: overloads for arrays
Review comment:
Either do it just here, or just remove the TODO, IMHO. It's not worth
keeping this comment for a trivial two-liner.
##########
File path: cpp/src/arrow/array/builder_base.h
##########
@@ -123,6 +123,23 @@ class ARROW_EXPORT ArrayBuilder {
Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
Status AppendScalars(const ScalarVector& scalars);
+ /// \brief Append a range of values from an array
+ Status AppendArraySlice(const ArrayData& array, const int64_t offset,
+ const int64_t length) {
+ if (!type()->Equals(*array.type)) {
+ // TODO: test this
+ return Status::TypeError("Expected array of type ", *type(),
+ " but got array of type ", *array.type);
+ }
+ return AppendArraySliceUnchecked(array, offset, length);
+ }
Review comment:
Hmm, I would not bother with the checked version personally. Just
mention in the docstring that the types are supposed to be identical.
##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -442,9 +442,13 @@ class NullArrayFactory {
// First buffer is always null
out_->buffers[0] = nullptr;
- // Type codes are all zero, so we can use buffer_ which has had it's memory
- // zeroed
out_->buffers[1] = buffer_;
+ // buffer_ is zeroed, but 0 may not be a valid type code
+ if (type.type_codes()[0] != 0) {
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1], AllocateBuffer(buffer_->size(),
pool_));
Review comment:
`length_` rather than `buffer_->size()`? (can `buffer_` be larger
because of the children or offsets?)
##########
File path: cpp/src/arrow/array/builder_binary.h
##########
@@ -274,6 +274,23 @@ class BaseBinaryBuilder : public ArrayBuilder {
return Status::OK();
}
+ Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset,
+ int64_t length) override {
+ auto bitmap = array.GetValues<uint8_t>(0, 0);
+ auto offsets = array.GetValues<offset_type>(1);
+ auto data = array.GetValues<uint8_t>(2, 0);
+ for (int64_t i = 0; i < length; i++) {
+ if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) {
Review comment:
May add a JIRA for this to do it later, though.
##########
File path: cpp/src/arrow/array/builder_union.h
##########
@@ -155,6 +156,21 @@ class ARROW_EXPORT DenseUnionBuilder : public
BasicUnionBuilder {
return offsets_builder_.Append(offset);
}
+ Status AppendArraySliceUnchecked(const ArrayData& array, const int64_t
offset,
+ const int64_t length) override {
Review comment:
The definition can probably moved to `builder_union.cc` instead?
(same for SparseUnionBuilder)
##########
File path: cpp/src/arrow/array/util.cc
##########
@@ -442,9 +442,13 @@ class NullArrayFactory {
// First buffer is always null
out_->buffers[0] = nullptr;
- // Type codes are all zero, so we can use buffer_ which has had it's memory
- // zeroed
out_->buffers[1] = buffer_;
+ // buffer_ is zeroed, but 0 may not be a valid type code
+ if (type.type_codes()[0] != 0) {
+ ARROW_ASSIGN_OR_RAISE(out_->buffers[1], AllocateBuffer(buffer_->size(),
pool_));
+ std::memset(out_->buffers[1]->mutable_data(), type.type_codes()[0],
+ buffer_->size());
Review comment:
`length_` here too?
##########
File path: cpp/src/arrow/array/builder_binary.h
##########
@@ -274,6 +274,23 @@ class BaseBinaryBuilder : public ArrayBuilder {
return Status::OK();
}
+ Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset,
+ int64_t length) override {
+ auto bitmap = array.GetValues<uint8_t>(0, 0);
+ auto offsets = array.GetValues<offset_type>(1);
+ auto data = array.GetValues<uint8_t>(2, 0);
+ for (int64_t i = 0; i < length; i++) {
+ if (!bitmap || BitUtil::GetBit(bitmap, array.offset + offset + i)) {
Review comment:
Perhaps you may use `VisitArrayDataInline`.
Or even better, using `BitRunReader` you could alternate bulk appends of
nulls and bulk memory copies of data (though the offsets still have to be
handled individually).
##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else_test.cc
##########
@@ -830,6 +867,607 @@ TEST(TestCaseWhen, FixedSizeBinary) {
ArrayFromJSON(type, R"([null, null, null, "efg"])"));
}
+template <typename Type>
+class TestCaseWhenBinary : public ::testing::Test {};
+
+TYPED_TEST_SUITE(TestCaseWhenBinary, BinaryArrowTypes);
+
+TYPED_TEST(TestCaseWhenBinary, Basics) {
+ auto type = default_type_instance<TypeParam>();
+ auto cond_true = ScalarFromJSON(boolean(), "true");
+ auto cond_false = ScalarFromJSON(boolean(), "false");
+ auto cond_null = ScalarFromJSON(boolean(), "null");
+ auto cond1 = ArrayFromJSON(boolean(), "[true, true, null, null]");
+ auto cond2 = ArrayFromJSON(boolean(), "[true, false, true, null]");
+ auto scalar_null = ScalarFromJSON(type, "null");
+ auto scalar1 = ScalarFromJSON(type, R"("aBxYz")");
+ auto scalar2 = ScalarFromJSON(type, R"("b")");
+ auto values_null = ArrayFromJSON(type, "[null, null, null, null]");
+ auto values1 = ArrayFromJSON(type, R"(["cDE", null, "degfhi", "efg"])");
+ auto values2 = ArrayFromJSON(type, R"(["fghijk", "ghi", null, "hi"])");
+
+ // CheckScalar("case_when", {MakeStruct({}), values1}, values1);
Review comment:
Did you forget to uncomment all these?
##########
File path: cpp/src/arrow/array/builder_nested.h
##########
@@ -467,6 +518,19 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
return Status::OK();
}
+ Status AppendArraySliceUnchecked(const ArrayData& array, int64_t offset,
+ int64_t length) override {
+ for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) {
+ children_[i]->AppendArraySliceUnchecked(*array.child_data[i],
array.offset + offset,
+ length);
+ }
+ const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data()
: nullptr;
+ for (int64_t row = offset; row < offset + length; row++) {
+ RETURN_NOT_OK(Append(!validity || BitUtil::GetBit(validity, array.offset
+ row)));
Review comment:
Can't we just do a bulk-append of the validity bitmap here?
##########
File path: cpp/src/arrow/compute/kernels/scalar_if_else.cc
##########
@@ -1413,6 +1417,588 @@ struct CaseWhenFunctor<NullType> {
}
};
+static Status ExecVarWidthScalarCaseWhen(KernelContext* ctx, const ExecBatch&
batch,
+ Datum* out) {
+ const auto& conds = checked_cast<const
StructScalar&>(*batch.values[0].scalar());
+ Datum result;
+ for (size_t i = 0; i < batch.values.size() - 1; i++) {
+ if (i < conds.value.size()) {
+ const Scalar& cond = *conds.value[i];
+ if (cond.is_valid && internal::UnboxScalar<BooleanType>::Unbox(cond)) {
+ result = batch[i + 1];
+ break;
+ }
+ } else {
+ // ELSE clause
+ result = batch[i + 1];
+ break;
+ }
+ }
+ if (out->is_scalar()) {
+ *out = result.is_scalar() ? result.scalar() : MakeNullScalar(out->type());
+ return Status::OK();
+ }
+ ArrayData* output = out->mutable_array();
+ if (!result.is_value()) {
+ // All conditions false, no 'else' argument
+ ARROW_ASSIGN_OR_RAISE(
+ auto array, MakeArrayOfNull(output->type, batch.length,
ctx->memory_pool()));
+ *output = *array->data();
+ } else if (result.is_scalar()) {
+ ARROW_ASSIGN_OR_RAISE(auto array, MakeArrayFromScalar(*result.scalar(),
batch.length,
+ ctx->memory_pool()));
+ *output = *array->data();
+ } else {
+ *output = *result.array();
+ }
+ return Status::OK();
+}
+
+template <typename ReserveData, typename AppendScalar, typename AppendArray>
+static Status ExecVarWidthArrayCaseWhen(KernelContext* ctx, const ExecBatch&
batch,
+ Datum* out, ReserveData reserve_data,
+ AppendScalar append_scalar,
+ AppendArray append_array) {
+ const auto& conds_array = *batch.values[0].array();
+ ArrayData* output = out->mutable_array();
+ const bool have_else_arg =
+ static_cast<size_t>(conds_array.type->num_fields()) <
(batch.values.size() - 1);
+ std::unique_ptr<ArrayBuilder> raw_builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), out->type(), &raw_builder));
+ RETURN_NOT_OK(raw_builder->Reserve(batch.length));
+ RETURN_NOT_OK(reserve_data(raw_builder.get()));
+
+ for (int64_t row = 0; row < batch.length; row++) {
+ int64_t selected = have_else_arg ?
static_cast<int64_t>(batch.values.size() - 1) : -1;
+ for (int64_t arg = 0; static_cast<size_t>(arg) <
conds_array.child_data.size();
+ arg++) {
+ const ArrayData& cond_array = *conds_array.child_data[arg];
+ if ((!cond_array.buffers[0] ||
+ BitUtil::GetBit(cond_array.buffers[0]->data(),
+ conds_array.offset + cond_array.offset + row)) &&
+ BitUtil::GetBit(cond_array.buffers[1]->data(),
+ conds_array.offset + cond_array.offset + row)) {
+ selected = arg + 1;
+ break;
+ }
+ }
+ if (selected < 0) {
+ RETURN_NOT_OK(raw_builder->AppendNull());
Review comment:
Ah, fair enough. Let's not bother then.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]