This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new 6006ca2 perf: Improved Bit (Un)packing Performance (#280)
6006ca2 is described below
commit 6006ca2736dda6dd54e8695cd2591d75456e65d2
Author: William Ayd <[email protected]>
AuthorDate: Fri Oct 6 09:56:34 2023 -0400
perf: Improved Bit (Un)packing Performance (#280)
I was very surprised by this, but getting rid of the shifting yielded a
huge performance boost for me locally. I was benchmarking some pandas
code that took ~500us to unpack 1 million boolean values; with this
simple change, that time fell to ~30us.
I am not an expert in assembly, but here is what godbolt produces to set
the output element at index 1 before the change:
```asm
movzx eax, BYTE PTR [rbp-1]
shr al
mov edx, eax
.loc 1 6 6
mov rax, QWORD PTR [rbp-32]
add rax, 1
.loc 1 6 24
and edx, 1
.loc 1 6 10
mov BYTE PTR [rax], dl
```
and after:
```asm
movzx eax, BYTE PTR [rbp-1]
and eax, 2
.loc 1 6 25
test eax, eax
setne dl
.loc 1 6 6
mov rax, QWORD PTR [rbp-32]
add rax, 1
.loc 1 6 10
mov BYTE PTR [rax], dl
```
My assumption is that the `shr` instruction is inefficient compared to the
`test` / `setne` approach taken in the latter.
---
src/nanoarrow/buffer_inline.h | 44 +++++++++++++++++++++++--------------------
1 file changed, 24 insertions(+), 20 deletions(-)
diff --git a/src/nanoarrow/buffer_inline.h b/src/nanoarrow/buffer_inline.h
index 232dd42..190943f 100644
--- a/src/nanoarrow/buffer_inline.h
+++ b/src/nanoarrow/buffer_inline.h
@@ -223,35 +223,39 @@ static inline int64_t _ArrowBytesForBits(int64_t bits) {
}
static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) {
- out[0] = (word >> 0) & 1;
- out[1] = (word >> 1) & 1;
- out[2] = (word >> 2) & 1;
- out[3] = (word >> 3) & 1;
- out[4] = (word >> 4) & 1;
- out[5] = (word >> 5) & 1;
- out[6] = (word >> 6) & 1;
- out[7] = (word >> 7) & 1;
+ out[0] = (word & 0x1) != 0;
+ out[1] = (word & 0x2) != 0;
+ out[2] = (word & 0x4) != 0;
+ out[3] = (word & 0x8) != 0;
+ out[4] = (word & 0x10) != 0;
+ out[5] = (word & 0x20) != 0;
+ out[6] = (word & 0x40) != 0;
+ out[7] = (word & 0x80) != 0;
}
static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) {
- out[0] = (word >> 0) & 1;
- out[1] = (word >> 1) & 1;
- out[2] = (word >> 2) & 1;
- out[3] = (word >> 3) & 1;
- out[4] = (word >> 4) & 1;
- out[5] = (word >> 5) & 1;
- out[6] = (word >> 6) & 1;
- out[7] = (word >> 7) & 1;
+ out[0] = (word & 0x1) != 0;
+ out[1] = (word & 0x2) != 0;
+ out[2] = (word & 0x4) != 0;
+ out[3] = (word & 0x8) != 0;
+ out[4] = (word & 0x10) != 0;
+ out[5] = (word & 0x20) != 0;
+ out[6] = (word & 0x40) != 0;
+ out[7] = (word & 0x80) != 0;
}
static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) {
- *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 |
values[4] << 4 |
- values[5] << 5 | values[6] << 6 | values[7] << 7);
+ *out = (values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
+ ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
+ ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
+ ((values[7] + 0x7f) & 0x80));
}
static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) {
- *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 |
values[4] << 4 |
- values[5] << 5 | values[6] << 6 | values[7] << 7);
+ *out = (values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
+ ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
+ ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
+ ((values[7] + 0x7f) & 0x80));
}
static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) {