This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 6006ca2  perf: Improved Bit (Un)packing Performance (#280)
6006ca2 is described below

commit 6006ca2736dda6dd54e8695cd2591d75456e65d2
Author: William Ayd <[email protected]>
AuthorDate: Fri Oct 6 09:56:34 2023 -0400

    perf: Improved Bit (Un)packing Performance (#280)
    
    I was very surprised by this, but getting rid of the shifting yielded a
    huge performance boost for me locally. I was benchmarking some pandas
    code that took ~500us to unpack 1 million boolean values; with this
    simple change, that time fell to ~30us.
    
    I'm not an expert in assembly, but here is what godbolt produces for
    setting `out[1]` before this change:
    
    ```asm
            movzx   eax, BYTE PTR [rbp-1]
            shr     al
            mov     edx, eax
            .loc 1 6 6
            mov     rax, QWORD PTR [rbp-32]
            add     rax, 1
            .loc 1 6 24
            and     edx, 1
            .loc 1 6 10
            mov     BYTE PTR [rax], dl
    ```
    
    and after:
    
    ```asm
            movzx   eax, BYTE PTR [rbp-1]
            and     eax, 2
            .loc 1 6 25
            test    eax, eax
            setne   dl
            .loc 1 6 6
            mov     rax, QWORD PTR [rbp-32]
            add     rax, 1
            .loc 1 6 10
            mov     BYTE PTR [rax], dl
    ```
    
    My assumption is that the `shr` instruction is inefficient compared to
    the `test` / `setne` approach taken in the latter.
---
 src/nanoarrow/buffer_inline.h | 44 +++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/src/nanoarrow/buffer_inline.h b/src/nanoarrow/buffer_inline.h
index 232dd42..190943f 100644
--- a/src/nanoarrow/buffer_inline.h
+++ b/src/nanoarrow/buffer_inline.h
@@ -223,35 +223,39 @@ static inline int64_t _ArrowBytesForBits(int64_t bits) {
 }
 
 static inline void _ArrowBitsUnpackInt8(const uint8_t word, int8_t* out) {
-  out[0] = (word >> 0) & 1;
-  out[1] = (word >> 1) & 1;
-  out[2] = (word >> 2) & 1;
-  out[3] = (word >> 3) & 1;
-  out[4] = (word >> 4) & 1;
-  out[5] = (word >> 5) & 1;
-  out[6] = (word >> 6) & 1;
-  out[7] = (word >> 7) & 1;
+  out[0] = (word & 0x1) != 0;
+  out[1] = (word & 0x2) != 0;
+  out[2] = (word & 0x4) != 0;
+  out[3] = (word & 0x8) != 0;
+  out[4] = (word & 0x10) != 0;
+  out[5] = (word & 0x20) != 0;
+  out[6] = (word & 0x40) != 0;
+  out[7] = (word & 0x80) != 0;
 }
 
 static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) {
-  out[0] = (word >> 0) & 1;
-  out[1] = (word >> 1) & 1;
-  out[2] = (word >> 2) & 1;
-  out[3] = (word >> 3) & 1;
-  out[4] = (word >> 4) & 1;
-  out[5] = (word >> 5) & 1;
-  out[6] = (word >> 6) & 1;
-  out[7] = (word >> 7) & 1;
+  out[0] = (word & 0x1) != 0;
+  out[1] = (word & 0x2) != 0;
+  out[2] = (word & 0x4) != 0;
+  out[3] = (word & 0x8) != 0;
+  out[4] = (word & 0x10) != 0;
+  out[5] = (word & 0x20) != 0;
+  out[6] = (word & 0x40) != 0;
+  out[7] = (word & 0x80) != 0;
 }
 
 static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) {
-  *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 |
-          values[5] << 5 | values[6] << 6 | values[7] << 7);
+  *out = (values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
+          ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
+          ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
+          ((values[7] + 0x7f) & 0x80));
 }
 
 static inline void _ArrowBitmapPackInt32(const int32_t* values, uint8_t* out) {
-  *out = (values[0] | values[1] << 1 | values[2] << 2 | values[3] << 3 | values[4] << 4 |
-          values[5] << 5 | values[6] << 6 | values[7] << 7);
+  *out = (values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
+          ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
+          ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
+          ((values[7] + 0x7f) & 0x80));
 }
 
 static inline int8_t ArrowBitGet(const uint8_t* bits, int64_t i) {

Reply via email to