cyb70289 commented on a change in pull request #9635: URL: https://github.com/apache/arrow/pull/9635#discussion_r592108324
########## File path: cpp/src/arrow/util/bit_run_reader.h ########## @@ -424,48 +452,15 @@ inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t nu return word << num_bits; } -template <> -inline BaseSetBitRunReader<false>::BaseSetBitRunReader(const uint8_t* bitmap, - int64_t start_offset, - int64_t length) - : bitmap_(bitmap + start_offset / 8), length_(length), remaining_(length_) { - const int8_t bit_offset = static_cast<int8_t>(start_offset % 8); - if (length > 0 && bit_offset) { - // Get MSBs from first byte - current_num_bits_ = - std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset)); - current_word_ = LoadPartialWord(bit_offset, current_num_bits_); - } else { - current_num_bits_ = 0; - current_word_ = 0; - } -} - -template <> -inline BaseSetBitRunReader<true>::BaseSetBitRunReader(const uint8_t* bitmap, - int64_t start_offset, - int64_t length) - : bitmap_(bitmap + (start_offset + length) / 8), - length_(length), - remaining_(length_) { - const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8); - if (length > 0 && end_bit_offset) { - // Get LSBs from last byte - ++bitmap_; - current_num_bits_ = - std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset)); - current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_); - } else { - current_num_bits_ = 0; - current_word_ = 0; - } -} - using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>; using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>; // Functional-style bit run visitors. +// XXX: Try to make this function small so the compiler can inline and optimize +// the `visit` function, which is normally a hot loop with vectorizable code. +// - don't inline SetBitRunReader constructor, it doesn't hurt performance +// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases Review comment: Compiler failed to inline VisitSetBitRuns function as it contains too complex codes from BaseSetBitRunReader. It turns `visit` into a function call, and compiler cannot optimize it (unroll loops, vectorize, etc.). Un-inline non performance critical code (BaseSetBitRunReader constructor and NextRun()) makes this function small. It does helps the compiler to inline `visit` and generates optimized code. It works cleanly for clang. Gcc doesn't respect this change, it still cannot inline `visit`. I tried some tricks to make gcc happy, but the changes are fragile and not explainable, so I will ignore them. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org