cyb70289 commented on a change in pull request #9635:
URL: https://github.com/apache/arrow/pull/9635#discussion_r592108324



##########
File path: cpp/src/arrow/util/bit_run_reader.h
##########
@@ -424,48 +452,15 @@ inline uint64_t 
BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t nu
   return word << num_bits;
 }
 
-template <>
-inline BaseSetBitRunReader<false>::BaseSetBitRunReader(const uint8_t* bitmap,
-                                                       int64_t start_offset,
-                                                       int64_t length)
-    : bitmap_(bitmap + start_offset / 8), length_(length), remaining_(length_) 
{
-  const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
-  if (length > 0 && bit_offset) {
-    // Get MSBs from first byte
-    current_num_bits_ =
-        std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - 
bit_offset));
-    current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
-  } else {
-    current_num_bits_ = 0;
-    current_word_ = 0;
-  }
-}
-
-template <>
-inline BaseSetBitRunReader<true>::BaseSetBitRunReader(const uint8_t* bitmap,
-                                                      int64_t start_offset,
-                                                      int64_t length)
-    : bitmap_(bitmap + (start_offset + length) / 8),
-      length_(length),
-      remaining_(length_) {
-  const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 
8);
-  if (length > 0 && end_bit_offset) {
-    // Get LSBs from last byte
-    ++bitmap_;
-    current_num_bits_ =
-        std::min(static_cast<int32_t>(length), 
static_cast<int32_t>(end_bit_offset));
-    current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
-  } else {
-    current_num_bits_ = 0;
-    current_word_ = 0;
-  }
-}
-
 using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
 using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
 
 // Functional-style bit run visitors.
 
+// XXX: Try to make this function small so the compiler can inline and optimize
+// the `visit` function, which is normally a hot loop with vectorizable code.
+// - don't inline SetBitRunReader constructor, it doesn't hurt performance
+// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases

Review comment:
       Compiler failed to inline VisitSetBitRuns function as it contains too 
complex codes from BaseSetBitRunReader. It turns `visit` into a function call, 
and compiler cannot optimize it (unroll loops, vectorize, etc.).
   
   Un-inline non performance critical code (BaseSetBitRunReader constructor and 
NextRun()) makes this function small. It does helps the compiler to inline 
visit and generates optimized code.
   
   It works cleanly for clang.
   Gcc doesn't respect this change, it still cannot inline visit. I tried some 
tricks to make gcc happy, but the changes are fragile and not explainable, so I 
will ignore them.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to