pitrou commented on code in PR #13768: URL: https://github.com/apache/arrow/pull/13768#discussion_r936950524
########## go/arrow/bitutil/bitmaps.go: ########## @@ -422,3 +423,81 @@ func CopyBitmap(src []byte, srcOffset, length int, dst []byte, dstOffset int) { dst[nbytes-1] &= ^trailMask dst[nbytes-1] |= lastData & trailMask } + +type bitOp struct { + opWord func(uint64, uint64) uint64 + opByte func(byte, byte) byte +} + +var ( + bitAndOp = bitOp{ + opWord: func(l, r uint64) uint64 { return l & r }, + opByte: func(l, r byte) byte { return l & r }, + } + bitOrOp = bitOp{ + opWord: func(l, r uint64) uint64 { return l | r }, + opByte: func(l, r byte) byte { return l | r }, + } +) + +func alignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) { + debug.Assert(lOffset%8 == rOffset%8, "aligned bitmap op called with unaligned offsets") + debug.Assert(lOffset%8 == outOffset%8, "aligned bitmap op called with unaligned output offset") + + nbytes := BytesForBits(length + lOffset%8) + left = left[lOffset/8:] + right = right[rOffset/8:] + out = out[outOffset/8:] + for i := int64(0); i < nbytes; i++ { + out[i] = op.opByte(left[i], right[i]) + } +} + +func unalignedBitmapOp(op bitOp, left, right []byte, lOffset, rOffset int64, out []byte, outOffset int64, length int64) { + leftRdr := NewBitmapWordReader(left, int(lOffset), int(length)) + rightRdr := NewBitmapWordReader(right, int(rOffset), int(length)) + writer := NewBitmapWordWriter(out, int(outOffset), int(length)) + + for nwords := leftRdr.Words(); nwords > 0; nwords-- { + writer.PutNextWord(op.opWord(leftRdr.NextWord(), rightRdr.NextWord())) Review Comment: Well, if I run the C++ benchmarks, I get this: ``` BenchmarkBitmapAnd/32768/0 1696 ns 1696 ns 411775 bytes_per_second=17.9927G/s BenchmarkBitmapAnd/131072/0 6843 ns 6842 ns 101856 bytes_per_second=17.8405G/s BenchmarkBitmapAnd/32768/1 3999 ns 3998 ns 174766 bytes_per_second=7.63287G/s BenchmarkBitmapAnd/131072/1 15546 ns 15544 ns 44802 bytes_per_second=7.85325G/s BenchmarkBitmapAnd/32768/2 3988 ns 3987 ns 175050 
bytes_per_second=7.65504G/s BenchmarkBitmapAnd/131072/2 15601 ns 15599 ns 44678 bytes_per_second=7.82568G/s ``` ... where the first benchmark parameter is the number of bytes and the second is the bit offset, so you see the results show a performance profile similar to the Go ones (with the aligned implementation being a couple times faster than the unaligned one). The reason this works is probably that C++ compilers optimize hard enough to turn the byte-wise loop into a word-wise loop plus byte-wise prolog/epilog. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org