felipecrv commented on code in PR #35036:
URL: https://github.com/apache/arrow/pull/35036#discussion_r1170058420
##########
cpp/src/arrow/compute/kernels/scalar_validity.cc:
##########
@@ -82,6 +84,71 @@ static void SetNanBits(const ArraySpan& arr, uint8_t*
out_bitmap, int64_t out_of
}
}
+static void SetSparseUnionLogicalNullBits(const ArraySpan& span, uint8_t*
out_bitmap,
+ int64_t out_offset) {
+ const auto* sparse_union_type =
+ ::arrow::internal::checked_cast<const SparseUnionType*>(span.type);
+ DCHECK_LE(span.child_data.size(), 128);
+
+ const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
+ for (int64_t i = 0; i < span.length; i++) {
+ const int8_t child_id = sparse_union_type->child_ids()[types[i]];
+ if (span.child_data[child_id].IsNull(i + span.offset)) {
+ bit_util::SetBit(out_bitmap, i + out_offset);
+ }
+ }
+}
+
+static void SetDenseUnionLogicalNullBits(const ArraySpan& span, uint8_t*
out_bitmap,
+ int64_t out_offset) {
+ const auto* dense_union_type =
+ ::arrow::internal::checked_cast<const DenseUnionType*>(span.type);
+ DCHECK_LE(span.child_data.size(), 128);
+
+ const int8_t* types = span.GetValues<int8_t>(1); // NOLINT
+ const int32_t* offsets = span.GetValues<int32_t>(2); // NOLINT
+ for (int64_t i = 0; i < span.length; i++) {
+ const int8_t child_id = dense_union_type->child_ids()[types[i]];
+ const int32_t offset = offsets[i];
+ if (span.child_data[child_id].IsNull(offset)) {
+ bit_util::SetBit(out_bitmap, i + out_offset);
+ }
+ }
+}
+
+template <typename RunEndCType>
+void SetREELogicalNullBits(const ArraySpan& span, uint8_t* out_bitmap,
+ int64_t out_offset) {
+ const auto& values = arrow::ree_util::ValuesArray(span);
+ const auto* values_bitmap = values.MayHaveNulls() ? values.buffers[0].data :
NULLPTR;
+
+ if (!values_bitmap) {
+ return;
+ }
+
+ arrow::ree_util::RunEndEncodedArraySpan<RunEndCType> ree_span(span);
+ auto end = ree_span.end();
+ for (auto it = ree_span.begin(); it != end; ++it) {
+ if (!bit_util::GetBit(values_bitmap, values.offset +
it.index_into_array())) {
+ bit_util::SetBitsTo(out_bitmap, it.logical_position() + out_offset,
it.run_length(),
+ true);
+ }
+ }
Review Comment:
I have an open PR implementing REE-based filters which can help make this
possible.
But note that each run in a `REE<Boolean>` costs `8 * sizeof(RunEndCType) + 1`
bits of memory. We need boolean runs longer than 17, 33, or 65 values (for
16-, 32-, and 64-bit run ends, respectively) for `REE<Boolean>` to start being
better than the bitmap encoding.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]