emkornfield commented on a change in pull request #7143:
URL: https://github.com/apache/arrow/pull/7143#discussion_r434316303



##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -550,6 +558,109 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+struct BitRun {
+  int64_t length;
+  // Whether bits are set at this point.
+  bool set;
+
+  std::string ToString() const {
+    return std::string("{Length: ") + std::to_string(length) +
+           ", set=" + std::to_string(set) + "}";
+  }
+};
+
+static inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+class BitRunReaderScalar {
+ public:
+  BitRunReaderScalar(const uint8_t* bitmap, int64_t start_offset, int64_t 
length)
+      : reader_(bitmap, start_offset, length) {}
+
+  BitRun NextRun() {
+    BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of list.
+    while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) 
{
+      rl.length++;
+      reader_.Next();
+    }
+    return rl;
+  }
+
+ private:
+  BitmapReader reader_;
+};
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// A convenience class for counting the number of continguous set/unset bits
+/// in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  /// \brief Constructs new BitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  /// Returns a new BitRun containing the number of contiguous
+  /// bits with the same value.  length == 0 indicates the
+  /// end of the bitmap.
+  BitRun NextRun() {
+    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+      return {/*length=*/0, false};
+    }
+    // This implementation relies on a efficient implementations of
+    // CountTrailingZeros and assumes that runs are more often then
+    // not.  The logic is to incrementally find the next bit change
+    // from the current position.  This is done by zeroing all
+    // bits in word_ up to position_ and using the TrailingZeroCount
+    // to find the index of the next set bit.
+
+    // The runs alternate on each call, so flip the bit.
+    current_run_bit_set_ = !current_run_bit_set_;
+
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do it magic.
+    InvertRemainingBits();
+
+    int64_t start_position = position_;
+    // Go  forward until the next change from unset to set.
+    int64_t new_bits = BitUtil::CountTrailingZeros(word_) - (start_position % 
64);
+    position_ += new_bits;
+
+    if (ARROW_PREDICT_FALSE(position_ % 64 == 0) &&

Review comment:
       Thanks for the tip.  changing to position_ & 63.  Antoine, I'm not sure 
what you are thinking for maintaining a bitmask could you provide some more 
details?

##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -550,6 +558,109 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+struct BitRun {
+  int64_t length;
+  // Whether bits are set at this point.
+  bool set;
+
+  std::string ToString() const {
+    return std::string("{Length: ") + std::to_string(length) +
+           ", set=" + std::to_string(set) + "}";
+  }
+};
+
+static inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+class BitRunReaderScalar {
+ public:
+  BitRunReaderScalar(const uint8_t* bitmap, int64_t start_offset, int64_t 
length)
+      : reader_(bitmap, start_offset, length) {}
+
+  BitRun NextRun() {
+    BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of list.
+    while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) 
{
+      rl.length++;
+      reader_.Next();
+    }
+    return rl;
+  }
+
+ private:
+  BitmapReader reader_;
+};
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// A convenience class for counting the number of continguous set/unset bits
+/// in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  /// \brief Constructs new BitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  /// Returns a new BitRun containing the number of contiguous
+  /// bits with the same value.  length == 0 indicates the
+  /// end of the bitmap.
+  BitRun NextRun() {
+    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+      return {/*length=*/0, false};
+    }
+    // This implementation relies on a efficient implementations of
+    // CountTrailingZeros and assumes that runs are more often then
+    // not.  The logic is to incrementally find the next bit change
+    // from the current position.  This is done by zeroing all
+    // bits in word_ up to position_ and using the TrailingZeroCount
+    // to find the index of the next set bit.
+
+    // The runs alternate on each call, so flip the bit.
+    current_run_bit_set_ = !current_run_bit_set_;
+
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do it magic.
+    InvertRemainingBits();
+
+    int64_t start_position = position_;
+    // Go  forward until the next change from unset to set.
+    int64_t new_bits = BitUtil::CountTrailingZeros(word_) - (start_position % 
64);
+    position_ += new_bits;
+
+    if (ARROW_PREDICT_FALSE(position_ % 64 == 0) &&
+        ARROW_PREDICT_TRUE(position_ < length_)) {
+      // Continue extending position while we can advance an entire word.
+      // (updates position_ accordingly).
+      AdvanceTillChange();

Review comment:
       By folded do you mean inline?  This make a significant performance hit 
on parquet benchmarks.  I added a comment.

##########
File path: cpp/src/arrow/util/bit_util.h
##########
@@ -550,6 +558,109 @@ class BitmapReader {
   int64_t bit_offset_;
 };
 
+struct BitRun {
+  int64_t length;
+  // Whether bits are set at this point.
+  bool set;
+
+  std::string ToString() const {
+    return std::string("{Length: ") + std::to_string(length) +
+           ", set=" + std::to_string(set) + "}";
+  }
+};
+
+static inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+class BitRunReaderScalar {
+ public:
+  BitRunReaderScalar(const uint8_t* bitmap, int64_t start_offset, int64_t 
length)
+      : reader_(bitmap, start_offset, length) {}
+
+  BitRun NextRun() {
+    BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of list.
+    while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) 
{
+      rl.length++;
+      reader_.Next();
+    }
+    return rl;
+  }
+
+ private:
+  BitmapReader reader_;
+};
+
+#if defined(ARROW_LITTLE_ENDIAN)
+/// A convenience class for counting the number of continguous set/unset bits
+/// in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  /// \brief Constructs new BitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  /// Returns a new BitRun containing the number of contiguous
+  /// bits with the same value.  length == 0 indicates the
+  /// end of the bitmap.
+  BitRun NextRun() {
+    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+      return {/*length=*/0, false};
+    }
+    // This implementation relies on a efficient implementations of
+    // CountTrailingZeros and assumes that runs are more often then
+    // not.  The logic is to incrementally find the next bit change
+    // from the current position.  This is done by zeroing all
+    // bits in word_ up to position_ and using the TrailingZeroCount
+    // to find the index of the next set bit.
+
+    // The runs alternate on each call, so flip the bit.
+    current_run_bit_set_ = !current_run_bit_set_;
+
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do it magic.
+    InvertRemainingBits();
+
+    int64_t start_position = position_;
+    // Go  forward until the next change from unset to set.
+    int64_t new_bits = BitUtil::CountTrailingZeros(word_) - (start_position % 
64);
+    position_ += new_bits;
+
+    if (ARROW_PREDICT_FALSE(position_ % 64 == 0) &&
+        ARROW_PREDICT_TRUE(position_ < length_)) {
+      // Continue extending position while we can advance an entire word.
+      // (updates position_ accordingly).
+      AdvanceTillChange();
+    }
+
+    return {/*length=*/position_ - start_position, current_run_bit_set_};
+  }
+
+ private:
+  void AdvanceTillChange();
+  void InvertRemainingBits() {
+    // Mask applied everying above the lowest bit.

Review comment:
       tried to clarify.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to