binmahone commented on code in PR #39608:
URL: https://github.com/apache/arrow/pull/39608#discussion_r1461440137


##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
                                           int32_t* dict_len) = 0;
 };
 
+// Represents a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+  static IntervalRange Intersection(const IntervalRange& left,
+                                    const IntervalRange& right) {
+    if (left.start <= right.start) {
+      if (left.end >= right.start) {
+        return {right.start, std::min(left.end, right.end)};
+      }
+    } else if (right.end >= left.start) {
+      return {left.start, std::min(left.end, right.end)};
+    }
+    return {-1, -1};  // Return a default Range object if no intersection 
range found
+  }
+
+  IntervalRange(const int64_t start_, const int64_t end_) : start(start_), 
end(end_) {
+    if (start > end) {
+      throw ParquetException("Invalid range with start: " + 
std::to_string(start) +
+                             " and end: " + std::to_string(end));
+    }
+  }
+
+  size_t Count() const {
+    if(!IsValid()) {
+      throw ParquetException("Invalid range with start: " + 
std::to_string(start) +
+                             " and end: " + std::to_string(end));
+    }
+    return end - start + 1;
+  }
+
+  bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+  bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+  bool IsOverlap(const IntervalRange& other) const {
+    return !IsBefore(other) && !IsAfter(other);
+  }
+
+  bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+  std::string ToString() const {
+    return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+  }
+
+  // inclusive
+  int64_t start = -1;
+  // inclusive
+  int64_t end = -1;
+};
+
+struct BitmapRange {
+  int64_t offset;
+  // Padded with zeros if there are fewer than 64 elements left in the column.
+  uint64_t bitmap;
+};
+
+struct End {};
+
+// Represents a set of ranges to read. The ranges are sorted and 
non-overlapping.
+class RowRanges {
+ public:
+  RowRanges() = default;
+  virtual ~RowRanges() = default;
+  virtual size_t RowCount() const = 0;
+  virtual int64_t LastRow() const = 0;
+  virtual bool IsValid() const = 0;
+  virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0;
+  // Given a RowRanges with rows across all RGs, split it into N RowRanges, 
where N = number of RGs
+  // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested 
in RowRanges [90-110], then
+  // this function will return 2 RowRanges, each relative to its own RG: [90-99] and [0-10]
+  virtual std::vector<std::unique_ptr<RowRanges>> SplitByRowGroups(const 
std::vector<int64_t>& rows_per_rg) const = 0;
+  virtual std::string ToString() const = 0;
+
+  // Returns a vector of PageLocations that must all be read to get values for
+  // all rows included in this range: virtual std::vector<PageLocation>
+  // PageIndexesToInclude(const std::vector<PageLocation>& all_pages) = 0;
+
+  class Iterator {
+  public:
+    virtual std::variant<IntervalRange, BitmapRange, End> NextRange() = 0;
+    virtual ~Iterator() = default;
+  };
+  virtual std::unique_ptr<Iterator> NewIterator() const = 0;
+
+};
+
+class IntervalRanges : public RowRanges {

Review Comment:
   To me, IntervalRanges is not very "internal". Clients need to construct 
their own IntervalRanges from classes like IntervalRange to pass into the API.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to