cyb70289 commented on a change in pull request #11674:
URL: https://github.com/apache/arrow/pull/11674#discussion_r748706286
##########
File path: cpp/src/arrow/util/bitmap_ops.cc
##########
@@ -237,71 +237,105 @@ bool OptionalBitmapEquals(const std::shared_ptr<Buffer>&
left, int64_t left_offs
namespace {
-template <template <typename> class BitOp>
-void AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t*
right,
- int64_t right_offset, uint8_t* out, int64_t out_offset,
- int64_t length) {
+template <template <typename> class BitOp, bool ComputeNewValidityCount>
+int64_t AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const
uint8_t* right,
+ int64_t right_offset, uint8_t* out, int64_t out_offset,
+ int64_t length) {
BitOp<uint8_t> op;
DCHECK_EQ(left_offset % 8, right_offset % 8);
DCHECK_EQ(left_offset % 8, out_offset % 8);
+ uint8_t* outFront = out;
const int64_t nbytes = BitUtil::BytesForBits(length + left_offset % 8);
left += left_offset / 8;
right += right_offset / 8;
out += out_offset / 8;
+ uint64_t outPopCount = 0;
for (int64_t i = 0; i < nbytes; ++i) {
out[i] = op(left[i], right[i]);
}
+ if (ComputeNewValidityCount) {
+ outPopCount = CountSetBits(outFront, out_offset, length);
+ }
+ return outPopCount;
}
-template <template <typename> class BitOp>
-void UnalignedBitmapOp(const uint8_t* left, int64_t left_offset, const
uint8_t* right,
- int64_t right_offset, uint8_t* out, int64_t out_offset,
- int64_t length) {
+template <template <typename> class BitOp, bool ComputeNewValidityCount>
+int64_t UnalignedBitmapOp(const uint8_t* left, int64_t left_offset, const
uint8_t* right,
+ int64_t right_offset, uint8_t* out, int64_t
out_offset,
+ int64_t length) {
BitOp<uint64_t> op_word;
BitOp<uint8_t> op_byte;
+ uint64_t outPopCount = 0;
auto left_reader = internal::BitmapWordReader<uint64_t>(left, left_offset,
length);
auto right_reader = internal::BitmapWordReader<uint64_t>(right,
right_offset, length);
auto writer = internal::BitmapWordWriter<uint64_t>(out, out_offset, length);
auto nwords = left_reader.words();
while (nwords--) {
- writer.PutNextWord(op_word(left_reader.NextWord(),
right_reader.NextWord()));
+ auto outWord = op_word(left_reader.NextWord(), right_reader.NextWord());
+ if (ComputeNewValidityCount) {
+ outPopCount += BitUtil::PopCount(outWord);
+ }
+ writer.PutNextWord(outWord);
}
auto nbytes = left_reader.trailing_bytes();
while (nbytes--) {
int left_valid_bits, right_valid_bits;
uint8_t left_byte = left_reader.NextTrailingByte(left_valid_bits);
uint8_t right_byte = right_reader.NextTrailingByte(right_valid_bits);
DCHECK_EQ(left_valid_bits, right_valid_bits);
- writer.PutNextTrailingByte(op_byte(left_byte, right_byte),
left_valid_bits);
+ auto outByte = op_byte(left_byte, right_byte);
Review comment:
use snake case `out_byte`
##########
File path: cpp/src/arrow/util/bitmap_ops.cc
##########
@@ -237,71 +237,105 @@ bool OptionalBitmapEquals(const std::shared_ptr<Buffer>&
left, int64_t left_offs
namespace {
-template <template <typename> class BitOp>
-void AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t*
right,
- int64_t right_offset, uint8_t* out, int64_t out_offset,
- int64_t length) {
+template <template <typename> class BitOp, bool ComputeNewValidityCount>
+int64_t AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const
uint8_t* right,
+ int64_t right_offset, uint8_t* out, int64_t out_offset,
+ int64_t length) {
BitOp<uint8_t> op;
DCHECK_EQ(left_offset % 8, right_offset % 8);
DCHECK_EQ(left_offset % 8, out_offset % 8);
+ uint8_t* outFront = out;
const int64_t nbytes = BitUtil::BytesForBits(length + left_offset % 8);
left += left_offset / 8;
right += right_offset / 8;
out += out_offset / 8;
+ uint64_t outPopCount = 0;
for (int64_t i = 0; i < nbytes; ++i) {
out[i] = op(left[i], right[i]);
}
+ if (ComputeNewValidityCount) {
+ outPopCount = CountSetBits(outFront, out_offset, length);
+ }
+ return outPopCount;
Review comment:
Move this new code to the caller side to avoid the additional template
instance.
We konw the trivial c++ loop above generates non-trivial machine code due to
auto vectorization.
##########
File path: cpp/src/arrow/util/bitmap_ops.h
##########
@@ -124,7 +124,8 @@ Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool,
const uint8_t* left,
/// the results in out starting at the given bit-offset.
ARROW_EXPORT
void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
- int64_t right_offset, int64_t length, int64_t out_offset,
uint8_t* out);
+ int64_t right_offset, int64_t length, int64_t out_offset,
uint8_t* out,
+ int64_t* const out_pop_count);
Review comment:
remove `const`, and make `nullptr` the default value as commented.
##########
File path: cpp/src/arrow/util/bit_util_test.cc
##########
@@ -1357,10 +1372,15 @@ class BitmapOp : public TestBase {
// Clear out buffer and try non-allocating version
std::memset(out->mutable_data(), 0, out->size());
+ int64_t expectedPopCount = 0;
ASSERT_OK(op.Call(left->mutable_data(), left_offset,
right->mutable_data(),
- right_offset, length, out_offset,
out->mutable_data()));
+ right_offset, length, out_offset,
out->mutable_data(),
+ &expectedPopCount));
reader = internal::BitmapReader(out->mutable_data(), out_offset,
length);
ASSERT_READER_VALUES(reader, result_bits);
+
+ auto refPopCount = SlowCountBits(out->mutable_data(), out_offset,
length);
Review comment:
We usaully call this `expected_pop_count` in arrow unit tests.
##########
File path: cpp/src/arrow/util/bitmap_ops.cc
##########
@@ -311,39 +345,49 @@ Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool*
pool, const uint8_t* left,
int64_t left_offset, const uint8_t*
right,
int64_t right_offset, int64_t length,
int64_t out_offset) {
+ // TODO(pradeep) Figure out the return type for this version of BitmapOps
+ int64_t out_pop_count = 0;
return BitmapOp<std::bit_and>(pool, left, left_offset, right, right_offset,
length,
- out_offset);
+ out_offset, &out_pop_count);
Review comment:
I don't understand why this `out_pop_count`. It looks useless.
##########
File path: cpp/src/arrow/util/bitmap_ops.cc
##########
@@ -237,71 +237,105 @@ bool OptionalBitmapEquals(const std::shared_ptr<Buffer>&
left, int64_t left_offs
namespace {
-template <template <typename> class BitOp>
-void AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const uint8_t*
right,
- int64_t right_offset, uint8_t* out, int64_t out_offset,
- int64_t length) {
+template <template <typename> class BitOp, bool ComputeNewValidityCount>
+int64_t AlignedBitmapOp(const uint8_t* left, int64_t left_offset, const
uint8_t* right,
+ int64_t right_offset, uint8_t* out, int64_t out_offset,
+ int64_t length) {
BitOp<uint8_t> op;
DCHECK_EQ(left_offset % 8, right_offset % 8);
DCHECK_EQ(left_offset % 8, out_offset % 8);
+ uint8_t* outFront = out;
const int64_t nbytes = BitUtil::BytesForBits(length + left_offset % 8);
left += left_offset / 8;
right += right_offset / 8;
out += out_offset / 8;
+ uint64_t outPopCount = 0;
for (int64_t i = 0; i < nbytes; ++i) {
out[i] = op(left[i], right[i]);
}
+ if (ComputeNewValidityCount) {
+ outPopCount = CountSetBits(outFront, out_offset, length);
+ }
+ return outPopCount;
}
-template <template <typename> class BitOp>
-void UnalignedBitmapOp(const uint8_t* left, int64_t left_offset, const
uint8_t* right,
- int64_t right_offset, uint8_t* out, int64_t out_offset,
- int64_t length) {
+template <template <typename> class BitOp, bool ComputeNewValidityCount>
Review comment:
From your test result, looks pop count calculation doesn't impact
performance.
If that's the case, I think we should remove this new template argument and
always calc and return pop count in this function. Caller can simply ignore the
return value if it don't need it.
##########
File path: cpp/src/arrow/array/array_nested.cc
##########
@@ -680,7 +680,7 @@ Result<std::shared_ptr<Array>>
SparseUnionArray::GetFlattenedField(
if (child_null_bitmap) {
BitmapAnd(flattened_null_bitmap->data(), child_offset,
child_null_bitmap->data(),
child_offset, child_data->length, child_offset,
- flattened_null_bitmap->mutable_data());
+ flattened_null_bitmap->mutable_data(), /*out pop count*/
nullptr);
Review comment:
Make`nullptr` the default value to this new argument to leave current
code untouched if they don't care about pop count.
##########
File path: cpp/src/arrow/util/bit_util_test.cc
##########
@@ -1357,10 +1372,15 @@ class BitmapOp : public TestBase {
// Clear out buffer and try non-allocating version
std::memset(out->mutable_data(), 0, out->size());
+ int64_t expectedPopCount = 0;
Review comment:
I think it should be simply `pop_count`, not expected.
##########
File path: cpp/src/arrow/util/bit_util_benchmark.cc
##########
@@ -159,9 +159,12 @@ static void BenchmarkAndImpl(benchmark::State& state,
DoAnd&& do_and) {
static void BenchmarkBitmapAnd(benchmark::State& state) {
BenchmarkAndImpl(state, [](const internal::Bitmap(&bitmaps)[2],
internal::Bitmap* out) {
+ int64_t outPopCount = 0;
Review comment:
use snake case
##########
File path: cpp/src/arrow/util/bit_util_benchmark.cc
##########
@@ -159,9 +159,12 @@ static void BenchmarkAndImpl(benchmark::State& state,
DoAnd&& do_and) {
static void BenchmarkBitmapAnd(benchmark::State& state) {
BenchmarkAndImpl(state, [](const internal::Bitmap(&bitmaps)[2],
internal::Bitmap* out) {
+ int64_t outPopCount = 0;
internal::BitmapAnd(bitmaps[0].buffer()->data(), bitmaps[0].offset(),
bitmaps[1].buffer()->data(), bitmaps[1].offset(),
- bitmaps[0].length(), 0, out->buffer()->mutable_data());
+ bitmaps[0].length(), 0, out->buffer()->mutable_data(),
+ &outPopCount);
+ benchmark::DoNotOptimize(outPopCount);
Review comment:
Augment current benchmark to add PopCount tests. Popcount is not used in
many cases.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]