cyb70289 commented on a change in pull request #9635:
URL: https://github.com/apache/arrow/pull/9635#discussion_r592081807



##########
File path: cpp/src/arrow/util/bit_run_reader.h
##########
@@ -424,48 +383,16 @@ inline uint64_t 
BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t nu
   return word << num_bits;
 }
 
-template <>
-inline BaseSetBitRunReader<false>::BaseSetBitRunReader(const uint8_t* bitmap,
-                                                       int64_t start_offset,
-                                                       int64_t length)
-    : bitmap_(bitmap + start_offset / 8), length_(length), remaining_(length_) 
{
-  const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
-  if (length > 0 && bit_offset) {
-    // Get MSBs from first byte
-    current_num_bits_ =
-        std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - 
bit_offset));
-    current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
-  } else {
-    current_num_bits_ = 0;
-    current_word_ = 0;
-  }
-}
-
-template <>
-inline BaseSetBitRunReader<true>::BaseSetBitRunReader(const uint8_t* bitmap,
-                                                      int64_t start_offset,
-                                                      int64_t length)
-    : bitmap_(bitmap + (start_offset + length) / 8),
-      length_(length),
-      remaining_(length_) {
-  const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 
8);
-  if (length > 0 && end_bit_offset) {
-    // Get LSBs from last byte
-    ++bitmap_;
-    current_num_bits_ =
-        std::min(static_cast<int32_t>(length), 
static_cast<int32_t>(end_bit_offset));
-    current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
-  } else {
-    current_num_bits_ = 0;
-    current_word_ = 0;
-  }
-}
-
 using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
 using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
 
 // Functional-style bit run visitors.
 
+// XXX: Try to make this function small so the compiler can inline and optimize
+// the `visit` function, which is normally a hot loop with vectorizable code.
+// - don't inline SetBitRunReader constructor, it doesn't hurt performance
+// - un-inline NextRun makes it easier to inline more complex `visit` 
functions,
+//   it hurts 'half null' case a bit, but improves normal cases.

Review comment:
       Look compiler failed to inline VisitSetBitRuns function as it contains 
too complex codes from BaseSetBitRunReader. It turns `visit` into a function 
call, and compiler cannot optimize it (unroll loops, vectorize, etc.).
   
   Un-inline non performance critical code to make this function small. It does 
helps the compiler to inline `visit` and generates optimized code.
   
   It works cleanly for clang.
   Gcc doesn't respect this change, it still cannot inline `visit`. I tried 
some tricks to make gcc happy, but the changes are fragile and not explainable, 
so I will ignore them.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to