pitrou commented on code in PR #13768:
URL: https://github.com/apache/arrow/pull/13768#discussion_r936950524
##########
go/arrow/bitutil/bitmaps.go:
##########
@@ -422,3 +423,81 @@ func CopyBitmap(src []byte, srcOffset, length int, dst
[]byte, dstOffset int) {
dst[nbytes-1] &= ^trailMask
dst[nbytes-1] |= lastData & trailMask
}
+
+type bitOp struct {
+ opWord func(uint64, uint64) uint64
+ opByte func(byte, byte) byte
+}
+
+var (
+ bitAndOp = bitOp{
+ opWord: func(l, r uint64) uint64 { return l & r },
+ opByte: func(l, r byte) byte { return l & r },
+ }
+ bitOrOp = bitOp{
+ opWord: func(l, r uint64) uint64 { return l | r },
+ opByte: func(l, r byte) byte { return l | r },
+ }
+)
+
+func alignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64, out
[]byte, outOffset int64, length int64) {
+ debug.Assert(lOffset%8 == rOffset%8, "aligned bitmap op called with
unaligned offsets")
+ debug.Assert(lOffset%8 == outOffset%8, "aligned bitmap op called with
unaligned output offset")
+
+ nbytes := BytesForBits(length + lOffset%8)
+ left = left[lOffset/8:]
+ right = right[rOffset/8:]
+ out = out[outOffset/8:]
+ for i := int64(0); i < nbytes; i++ {
+ out[i] = op.opByte(left[i], right[i])
+ }
+}
+
+func unalignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64,
out []byte, outOffset int64, length int64) {
+ leftRdr := NewBitmapWordReader(left, int(lOffset), int(length))
+ rightRdr := NewBitmapWordReader(right, int(rOffset), int(length))
+ writer := NewBitmapWordWriter(out, int(outOffset), int(length))
+
+ for nwords := leftRdr.Words(); nwords > 0; nwords-- {
+ writer.PutNextWord(op.opWord(leftRdr.NextWord(),
rightRdr.NextWord()))
Review Comment:
Well, if I run the C++ benchmarks, I get this:
```
BenchmarkBitmapAnd/32768/0 1696 ns 1696 ns 411775 bytes_per_second=17.9927G/s
BenchmarkBitmapAnd/131072/0 6843 ns 6842 ns 101856 bytes_per_second=17.8405G/s
BenchmarkBitmapAnd/32768/1 3999 ns 3998 ns 174766 bytes_per_second=7.63287G/s
BenchmarkBitmapAnd/131072/1 15546 ns 15544 ns 44802 bytes_per_second=7.85325G/s
BenchmarkBitmapAnd/32768/2 3988 ns 3987 ns 175050 bytes_per_second=7.65504G/s
BenchmarkBitmapAnd/131072/2 15601 ns 15599 ns 44678 bytes_per_second=7.82568G/s
```
... where the first benchmark parameter is the number of bytes and the
second is the bit offset, so you can see that the results show a performance
profile similar to the Go ones (with the aligned implementation being a couple
of times faster than the unaligned one).
The reason this works is probably that C++ compilers optimize hard enough to
turn the byte-wise loop into a word-wise loop plus byte-wise prolog/epilog.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]