asuhan commented on a change in pull request #11793: URL: https://github.com/apache/arrow/pull/11793#discussion_r759907102
########## File path: cpp/src/arrow/compute/kernels/scalar_compare.cc ########## @@ -439,6 +472,330 @@ struct ScalarMinMax { } }; +template <typename Type, typename Op> +struct BinaryScalarMinMax { + using ArrayType = typename TypeTraits<Type>::ArrayType; + using BuilderType = typename TypeTraits<Type>::BuilderType; + using offset_type = typename Type::offset_type; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx); + if (std::all_of(batch.values.begin(), batch.values.end(), + [](const Datum& d) { return d.is_scalar(); })) { + return ExecOnlyScalar(ctx, options, batch, out); + } + return ExecContainingArrays(ctx, options, batch, out); + } + + static Status ExecOnlyScalar(KernelContext* ctx, + const ElementWiseAggregateOptions& options, + const ExecBatch& batch, Datum* out) { + if (batch.values.empty()) { + return Status::OK(); + } + auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get()); + if (!options.skip_nulls) { + // any nulls in the input will produce a null output + for (const auto& value : batch.values) { + if (!value.scalar()->is_valid) { + output->is_valid = false; + return Status::OK(); + } + } + } + const Scalar& first_scalar = *batch.values.front().scalar(); + string_view result = UnboxScalar<Type>::Unbox(first_scalar); + bool valid = first_scalar.is_valid; + for (size_t i = 1; i < batch.values.size(); i++) { + const Scalar& scalar = *batch[i].scalar(); + if (!scalar.is_valid) { + DCHECK(options.skip_nulls); + continue; + } else { + string_view value = UnboxScalar<Type>::Unbox(scalar); + result = !valid ? 
value : Op::Call(result, value); + valid = true; + } + } + if (valid) { + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size())); + uint8_t* buf = output->value->mutable_data(); + buf = std::copy(result.begin(), result.end(), buf); + output->is_valid = true; + } else { + output->is_valid = false; + } + return Status::OK(); + } + + static Status ExecContainingArrays(KernelContext* ctx, + const ElementWiseAggregateOptions& options, + const ExecBatch& batch, Datum* out) { + // Presize data to avoid reallocations + int64_t final_size = 0; + for (int64_t i = 0; i < batch.length; i++) { + auto size = CalculateRowSize(options, batch, i); + if (size > 0) final_size += size; + } + BuilderType builder(ctx->memory_pool()); + RETURN_NOT_OK(builder.Reserve(batch.length)); + RETURN_NOT_OK(builder.ReserveData(final_size)); + + std::vector<util::optional<string_view>> valid_cols(batch.values.size()); + for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) { + size_t num_valid = 0; + for (size_t col = 0; col < batch.values.size(); col++) { + if (batch[col].is_scalar()) { + const auto& scalar = *batch[col].scalar(); + if (scalar.is_valid) { + valid_cols[col] = UnboxScalar<Type>::Unbox(scalar); + num_valid++; + } else { + valid_cols[col] = util::nullopt; + } + } else { + const ArrayData& array = *batch[col].array(); + if (!array.MayHaveNulls() || + BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) { + const offset_type* offsets = array.GetValues<offset_type>(1); + const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0); + const int64_t length = offsets[row + 1] - offsets[row]; + valid_cols[col] = + string_view(reinterpret_cast<const char*>(data + offsets[row]), length); + num_valid++; + } else { + valid_cols[col] = util::nullopt; + } + } + } + + if (num_valid == 0 || (num_valid < batch.values.size() && !options.skip_nulls)) { + // We had some nulls + builder.UnsafeAppendNull(); + continue; + } + util::optional<string_view> 
result = valid_cols.front(); + for (size_t col = 1; col < batch.values.size(); ++col) { + util::optional<string_view> value = valid_cols[col]; + if (!value) { + DCHECK(options.skip_nulls); Review comment: This asserts that a null value can only be encountered here when `skip_nulls` is set. The case where some values are null and `skip_nulls` is false is already handled right above this loop. ########## File path: cpp/src/arrow/compute/kernels/scalar_compare.cc ########## @@ -439,6 +472,330 @@ struct ScalarMinMax { } }; +template <typename Type, typename Op> +struct BinaryScalarMinMax { + using ArrayType = typename TypeTraits<Type>::ArrayType; + using BuilderType = typename TypeTraits<Type>::BuilderType; + using offset_type = typename Type::offset_type; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx); + if (std::all_of(batch.values.begin(), batch.values.end(), + [](const Datum& d) { return d.is_scalar(); })) { + return ExecOnlyScalar(ctx, options, batch, out); + } + return ExecContainingArrays(ctx, options, batch, out); + } + + static Status ExecOnlyScalar(KernelContext* ctx, + const ElementWiseAggregateOptions& options, + const ExecBatch& batch, Datum* out) { + if (batch.values.empty()) { + return Status::OK(); + } + auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get()); + if (!options.skip_nulls) { + // any nulls in the input will produce a null output + for (const auto& value : batch.values) { + if (!value.scalar()->is_valid) { + output->is_valid = false; + return Status::OK(); + } + } + } + const Scalar& first_scalar = *batch.values.front().scalar(); + string_view result = UnboxScalar<Type>::Unbox(first_scalar); + bool valid = first_scalar.is_valid; + for (size_t i = 1; i < batch.values.size(); i++) { + const Scalar& scalar = *batch[i].scalar(); + if (!scalar.is_valid) { + DCHECK(options.skip_nulls); + continue; + } else { + string_view value = UnboxScalar<Type>::Unbox(scalar); + result = 
!valid ? value : Op::Call(result, value); + valid = true; + } + } + if (valid) { + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size())); + uint8_t* buf = output->value->mutable_data(); + buf = std::copy(result.begin(), result.end(), buf); + output->is_valid = true; + } else { + output->is_valid = false; + } + return Status::OK(); + } + + static Status ExecContainingArrays(KernelContext* ctx, + const ElementWiseAggregateOptions& options, + const ExecBatch& batch, Datum* out) { + // Presize data to avoid reallocations + int64_t final_size = 0; + for (int64_t i = 0; i < batch.length; i++) { + auto size = CalculateRowSize(options, batch, i); + if (size > 0) final_size += size; + } + BuilderType builder(ctx->memory_pool()); + RETURN_NOT_OK(builder.Reserve(batch.length)); + RETURN_NOT_OK(builder.ReserveData(final_size)); + + std::vector<util::optional<string_view>> valid_cols(batch.values.size()); + for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) { + size_t num_valid = 0; + for (size_t col = 0; col < batch.values.size(); col++) { + if (batch[col].is_scalar()) { + const auto& scalar = *batch[col].scalar(); + if (scalar.is_valid) { + valid_cols[col] = UnboxScalar<Type>::Unbox(scalar); + num_valid++; + } else { + valid_cols[col] = util::nullopt; + } + } else { + const ArrayData& array = *batch[col].array(); Review comment: Done. 
########## File path: cpp/src/arrow/compute/kernels/scalar_compare.cc ########## @@ -439,6 +472,330 @@ struct ScalarMinMax { } }; +template <typename Type, typename Op> +struct BinaryScalarMinMax { + using ArrayType = typename TypeTraits<Type>::ArrayType; + using BuilderType = typename TypeTraits<Type>::BuilderType; + using offset_type = typename Type::offset_type; + + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx); + if (std::all_of(batch.values.begin(), batch.values.end(), + [](const Datum& d) { return d.is_scalar(); })) { + return ExecOnlyScalar(ctx, options, batch, out); + } + return ExecContainingArrays(ctx, options, batch, out); + } + + static Status ExecOnlyScalar(KernelContext* ctx, + const ElementWiseAggregateOptions& options, + const ExecBatch& batch, Datum* out) { + if (batch.values.empty()) { + return Status::OK(); + } + auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get()); + if (!options.skip_nulls) { + // any nulls in the input will produce a null output + for (const auto& value : batch.values) { + if (!value.scalar()->is_valid) { + output->is_valid = false; + return Status::OK(); + } + } + } + const Scalar& first_scalar = *batch.values.front().scalar(); + string_view result = UnboxScalar<Type>::Unbox(first_scalar); + bool valid = first_scalar.is_valid; + for (size_t i = 1; i < batch.values.size(); i++) { + const Scalar& scalar = *batch[i].scalar(); + if (!scalar.is_valid) { + DCHECK(options.skip_nulls); + continue; + } else { + string_view value = UnboxScalar<Type>::Unbox(scalar); + result = !valid ? 
value : Op::Call(result, value); + valid = true; + } + } + if (valid) { + ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size())); + uint8_t* buf = output->value->mutable_data(); + buf = std::copy(result.begin(), result.end(), buf); + output->is_valid = true; + } else { + output->is_valid = false; + } + return Status::OK(); + } + + static Status ExecContainingArrays(KernelContext* ctx, + const ElementWiseAggregateOptions& options, + const ExecBatch& batch, Datum* out) { + // Presize data to avoid reallocations + int64_t final_size = 0; + for (int64_t i = 0; i < batch.length; i++) { + auto size = CalculateRowSize(options, batch, i); + if (size > 0) final_size += size; + } + BuilderType builder(ctx->memory_pool()); + RETURN_NOT_OK(builder.Reserve(batch.length)); + RETURN_NOT_OK(builder.ReserveData(final_size)); + + std::vector<util::optional<string_view>> valid_cols(batch.values.size()); + for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) { + size_t num_valid = 0; + for (size_t col = 0; col < batch.values.size(); col++) { + if (batch[col].is_scalar()) { + const auto& scalar = *batch[col].scalar(); + if (scalar.is_valid) { + valid_cols[col] = UnboxScalar<Type>::Unbox(scalar); + num_valid++; + } else { + valid_cols[col] = util::nullopt; + } + } else { + const ArrayData& array = *batch[col].array(); + if (!array.MayHaveNulls() || + BitUtil::GetBit(array.buffers[0]->data(), array.offset + row)) { + const offset_type* offsets = array.GetValues<offset_type>(1); + const uint8_t* data = array.GetValues<uint8_t>(2, /*absolute_offset=*/0); Review comment: Done. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org