pitrou commented on a change in pull request #7143:
URL: https://github.com/apache/arrow/pull/7143#discussion_r434040540



##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -550,6 +558,109 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+struct BitRun {
+  int64_t length;
+  // Whether bits are set at this point.
+  bool set;
+
+  std::string ToString() const {
+    return std::string("{Length: ") + std::to_string(length) +
+           ", set=" + std::to_string(set) + "}";
+  }
+};
+
+static inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+class BitRunReaderScalar {
+ public:
+  BitRunReaderScalar(const uint8_t* bitmap, int64_t start_offset, int64_t 
length)
+      : reader_(bitmap, start_offset, length) {}
+
+  BitRun NextRun() {
+    BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of list.
+    while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) 
{
+      rl.length++;
+      reader_.Next();
+    }
+    return rl;
+  }
+
+ private:
+  BitmapReader reader_;
+};
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// A convenience class for counting the number of continguous set/unset bits
+/// in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  /// \brief Constructs new BitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  /// Returns a new BitRun containing the number of contiguous
+  /// bits with the same value.  length == 0 indicates the
+  /// end of the bitmap.
+  BitRun NextRun() {
+    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+      return {/*length=*/0, false};
+    }
+    // This implementation relies on a efficient implementations of
+    // CountTrailingZeros and assumes that runs are more often then
+    // not.  The logic is to incrementally find the next bit change
+    // from the current position.  This is done by zeroing all
+    // bits in word_ up to position_ and using the TrailingZeroCount
+    // to find the index of the next set bit.
+
+    // The runs alternate on each call, so flip the bit.
+    current_run_bit_set_ = !current_run_bit_set_;
+
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do it magic.
+    InvertRemainingBits();
+
+    int64_t start_position = position_;
+    // Go  forward until the next change from unset to set.
+    int64_t new_bits = BitUtil::CountTrailingZeros(word_) - (start_position % 
64);
+    position_ += new_bits;
+
+    if (ARROW_PREDICT_FALSE(position_ % 64 == 0) &&
+        ARROW_PREDICT_TRUE(position_ < length_)) {
+      // Continue extending position while we can advance an entire word.
+      // (updates position_ accordingly).
+      AdvanceTillChange();

Review comment:
       If you folded `AdvanceTillChange()` here, the code would probably be 
more readable (less duplicated conditions).

##########
File path: cpp/src/arrow/util/bit_util.cc
##########
@@ -411,6 +411,66 @@ Result<std::shared_ptr<Buffer>> BitmapOp(MemoryPool* pool, 
const uint8_t* left,
 
 }  // namespace
 
+BitRunReader::BitRunReader(const uint8_t* bitmap, int64_t start_offset, 
int64_t length)
+    : bitmap_(bitmap + (start_offset / 8)),
+      position_(start_offset % 8),
+      length_(position_ + length) {
+  if (ARROW_PREDICT_FALSE(length == 0)) {
+    word_ = 0;
+    return;
+  }
+
+  // Prepare for inversion in NextRun.
+  current_run_bit_set_ = !BitUtil::GetBit(bitmap, start_offset);
+  LoadWord();
+  // Clear out any preceding bits.
+  word_ = word_ & ~BitUtil::PartialWordMask(position_);
+}
+
+void BitRunReader::AdvanceTillChange() {
+  int64_t new_bits = 0;
+  do {
+    // Advance the position of the bitmap for loading.
+    bitmap_ += sizeof(uint64_t);
+    LoadWord();
+    new_bits = BitUtil::CountTrailingZeros(word_);
+    // Continue calculating run length.
+    position_ += new_bits;
+  } while (ARROW_PREDICT_FALSE(position_ % 64 == 0) &&
+           ARROW_PREDICT_TRUE(position_ < length_) && new_bits > 0);
+}
+
+void BitRunReader::LoadWord() {
+  word_ = 0;
+  // On the initial load if there is an offset we need to account for this when

Review comment:
       Ok, why don't you have a separate function for the initial load? You're 
paying the price for this on every subsequent load (especially as the function 
may not be inlined at all).

##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -550,6 +558,109 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+struct BitRun {
+  int64_t length;
+  // Whether bits are set at this point.
+  bool set;
+
+  std::string ToString() const {
+    return std::string("{Length: ") + std::to_string(length) +
+           ", set=" + std::to_string(set) + "}";
+  }
+};
+
+static inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+class BitRunReaderScalar {
+ public:
+  BitRunReaderScalar(const uint8_t* bitmap, int64_t start_offset, int64_t 
length)
+      : reader_(bitmap, start_offset, length) {}
+
+  BitRun NextRun() {
+    BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of list.
+    while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) 
{
+      rl.length++;
+      reader_.Next();
+    }
+    return rl;
+  }
+
+ private:
+  BitmapReader reader_;
+};
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// A convenience class for counting the number of continguous set/unset bits
+/// in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  /// \brief Constructs new BitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  /// Returns a new BitRun containing the number of contiguous
+  /// bits with the same value.  length == 0 indicates the
+  /// end of the bitmap.
+  BitRun NextRun() {
+    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+      return {/*length=*/0, false};
+    }
+    // This implementation relies on a efficient implementations of
+    // CountTrailingZeros and assumes that runs are more often then
+    // not.  The logic is to incrementally find the next bit change
+    // from the current position.  This is done by zeroing all
+    // bits in word_ up to position_ and using the TrailingZeroCount
+    // to find the index of the next set bit.
+
+    // The runs alternate on each call, so flip the bit.
+    current_run_bit_set_ = !current_run_bit_set_;
+
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do it magic.
+    InvertRemainingBits();
+
+    int64_t start_position = position_;
+    // Go  forward until the next change from unset to set.
+    int64_t new_bits = BitUtil::CountTrailingZeros(word_) - (start_position % 
64);
+    position_ += new_bits;
+
+    if (ARROW_PREDICT_FALSE(position_ % 64 == 0) &&

Review comment:
       Be aware that signed modulo is slightly slower than unsigned modulo (the 
compiler doesn't know that `position_` is positive).

##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -550,6 +558,109 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+struct BitRun {
+  int64_t length;
+  // Whether bits are set at this point.
+  bool set;
+
+  std::string ToString() const {
+    return std::string("{Length: ") + std::to_string(length) +
+           ", set=" + std::to_string(set) + "}";
+  }
+};
+
+static inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+class BitRunReaderScalar {
+ public:
+  BitRunReaderScalar(const uint8_t* bitmap, int64_t start_offset, int64_t 
length)
+      : reader_(bitmap, start_offset, length) {}
+
+  BitRun NextRun() {
+    BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of list.
+    while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) 
{
+      rl.length++;
+      reader_.Next();
+    }
+    return rl;
+  }
+
+ private:
+  BitmapReader reader_;
+};
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// A convenience class for counting the number of continguous set/unset bits
+/// in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  /// \brief Constructs new BitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  /// Returns a new BitRun containing the number of contiguous
+  /// bits with the same value.  length == 0 indicates the
+  /// end of the bitmap.
+  BitRun NextRun() {
+    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+      return {/*length=*/0, false};
+    }
+    // This implementation relies on a efficient implementations of
+    // CountTrailingZeros and assumes that runs are more often then
+    // not.  The logic is to incrementally find the next bit change
+    // from the current position.  This is done by zeroing all
+    // bits in word_ up to position_ and using the TrailingZeroCount
+    // to find the index of the next set bit.
+
+    // The runs alternate on each call, so flip the bit.
+    current_run_bit_set_ = !current_run_bit_set_;
+
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do it magic.
+    InvertRemainingBits();
+
+    int64_t start_position = position_;
+    // Go  forward until the next change from unset to set.
+    int64_t new_bits = BitUtil::CountTrailingZeros(word_) - (start_position % 
64);
+    position_ += new_bits;
+
+    if (ARROW_PREDICT_FALSE(position_ % 64 == 0) &&
+        ARROW_PREDICT_TRUE(position_ < length_)) {
+      // Continue extending position while we can advance an entire word.
+      // (updates position_ accordingly).
+      AdvanceTillChange();
+    }
+
+    return {/*length=*/position_ - start_position, current_run_bit_set_};
+  }
+
+ private:
+  void AdvanceTillChange();
+  void InvertRemainingBits() {
+    // Mask applied everying above the lowest bit.

Review comment:
       ???

##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -550,6 +558,109 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+struct BitRun {
+  int64_t length;
+  // Whether bits are set at this point.
+  bool set;
+
+  std::string ToString() const {
+    return std::string("{Length: ") + std::to_string(length) +
+           ", set=" + std::to_string(set) + "}";
+  }
+};
+
+static inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+class BitRunReaderScalar {
+ public:
+  BitRunReaderScalar(const uint8_t* bitmap, int64_t start_offset, int64_t 
length)
+      : reader_(bitmap, start_offset, length) {}
+
+  BitRun NextRun() {
+    BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of list.
+    while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) 
{
+      rl.length++;
+      reader_.Next();
+    }
+    return rl;
+  }
+
+ private:
+  BitmapReader reader_;
+};
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// A convenience class for counting the number of continguous set/unset bits
+/// in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  /// \brief Constructs new BitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  /// Returns a new BitRun containing the number of contiguous
+  /// bits with the same value.  length == 0 indicates the
+  /// end of the bitmap.
+  BitRun NextRun() {
+    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+      return {/*length=*/0, false};
+    }
+    // This implementation relies on a efficient implementations of
+    // CountTrailingZeros and assumes that runs are more often then
+    // not.  The logic is to incrementally find the next bit change
+    // from the current position.  This is done by zeroing all
+    // bits in word_ up to position_ and using the TrailingZeroCount
+    // to find the index of the next set bit.
+
+    // The runs alternate on each call, so flip the bit.
+    current_run_bit_set_ = !current_run_bit_set_;
+
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do it magic.
+    InvertRemainingBits();
+
+    int64_t start_position = position_;
+    // Go  forward until the next change from unset to set.
+    int64_t new_bits = BitUtil::CountTrailingZeros(word_) - (start_position % 
64);
+    position_ += new_bits;
+
+    if (ARROW_PREDICT_FALSE(position_ % 64 == 0) &&

Review comment:
       You could instead maintain a bit offset and/or a word mask. Could also 
perhaps help deconvolute `InvertRemainingBits`...




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to