pitrou commented on code in PR #13768:
URL: https://github.com/apache/arrow/pull/13768#discussion_r936950524


##########
go/arrow/bitutil/bitmaps.go:
##########
@@ -422,3 +423,81 @@ func CopyBitmap(src []byte, srcOffset, length int, dst 
[]byte, dstOffset int) {
        dst[nbytes-1] &= ^trailMask
        dst[nbytes-1] |= lastData & trailMask
 }
+
+type bitOp struct {
+       opWord func(uint64, uint64) uint64
+       opByte func(byte, byte) byte
+}
+
+var (
+       bitAndOp = bitOp{
+               opWord: func(l, r uint64) uint64 { return l & r },
+               opByte: func(l, r byte) byte { return l & r },
+       }
+       bitOrOp = bitOp{
+               opWord: func(l, r uint64) uint64 { return l | r },
+               opByte: func(l, r byte) byte { return l | r },
+       }
+)
+
+func alignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64, out 
[]byte, outOffset int64, length int64) {
+       debug.Assert(lOffset%8 == rOffset%8, "aligned bitmap op called with 
unaligned offsets")
+       debug.Assert(lOffset%8 == outOffset%8, "aligned bitmap op called with 
unaligned output offset")
+
+       nbytes := BytesForBits(length + lOffset%8)
+       left = left[lOffset/8:]
+       right = right[rOffset/8:]
+       out = out[outOffset/8:]
+       for i := int64(0); i < nbytes; i++ {
+               out[i] = op.opByte(left[i], right[i])
+       }
+}
+
+func unalignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64, 
out []byte, outOffset int64, length int64) {
+       leftRdr := NewBitmapWordReader(left, int(lOffset), int(length))
+       rightRdr := NewBitmapWordReader(right, int(rOffset), int(length))
+       writer := NewBitmapWordWriter(out, int(outOffset), int(length))
+
+       for nwords := leftRdr.Words(); nwords > 0; nwords-- {
+               writer.PutNextWord(op.opWord(leftRdr.NextWord(), 
rightRdr.NextWord()))

Review Comment:
   Well, if I run the C++ benchmarks, I get this:
   ```
   BenchmarkBitmapAnd/32768/0                   1696 ns         1696 ns       
411775 bytes_per_second=17.9927G/s
   BenchmarkBitmapAnd/131072/0                  6843 ns         6842 ns       
101856 bytes_per_second=17.8405G/s
   BenchmarkBitmapAnd/32768/1                   3999 ns         3998 ns       
174766 bytes_per_second=7.63287G/s
   BenchmarkBitmapAnd/131072/1                 15546 ns        15544 ns        
44802 bytes_per_second=7.85325G/s
   BenchmarkBitmapAnd/32768/2                   3988 ns         3987 ns       
175050 bytes_per_second=7.65504G/s
   BenchmarkBitmapAnd/131072/2                 15601 ns        15599 ns        
44678 bytes_per_second=7.82568G/s
   ```
   ... where the first benchmark parameter is the number of bytes and the 
second is the bit offset, so you can see the results show a performance 
profile similar to the Go ones (with the aligned implementation being a couple 
of times faster than the unaligned one).
   
   The reason this works is probably that C++ compilers optimize hard enough to 
turn the byte-wise loop into a word-wise loop plus byte-wise prolog/epilog.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to