wgtmac commented on code in PR #39608:
URL: https://github.com/apache/arrow/pull/39608#discussion_r1456716619
##########
cpp/src/parquet/column_reader.h:
##########
@@ -22,6 +22,7 @@
#include <utility>
#include <vector>
+#include "page_index.h"
Review Comment:
Is this required? Could we use a forward declaration instead?
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+ virtual ~RowRanges() = default;
+ virtual size_t RowCount() const = 0;
+ virtual int64_t LastRow() const = 0;
+ virtual bool IsValid() const = 0;
Review Comment:
Do we actually need `IsValid()`? Is it possible to prohibit constructing
invalid row ranges from the constructor?
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+ virtual ~RowRanges() = default;
+ virtual size_t RowCount() const = 0;
Review Comment:
```suggestion
/// \brief Total number of rows in the row ranges.
virtual size_t num_rows() const = 0;
```
Trivial getter functions like this should use snake case. We also need to add
docstrings to user-facing public APIs.
Same for similar APIs below.
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
Review Comment:
It would be good to move all row range stuff to a separate
parquet/arrow/row_range.h
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+ virtual ~RowRanges() = default;
+ virtual size_t RowCount() const = 0;
+ virtual int64_t LastRow() const = 0;
Review Comment:
```suggestion
virtual int64_t last_row() const = 0;
```
For completeness, should we also provide `first_row()`?
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+ virtual ~RowRanges() = default;
+ virtual size_t RowCount() const = 0;
+ virtual int64_t LastRow() const = 0;
+ virtual bool IsValid() const = 0;
+ virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0;
Review Comment:
```suggestion
virtual bool IsOverlapping(const RowRanges& other) const = 0;
```
IMO, `IntervalRange` and `BitmapRange` should not appear in the public
function. We can add this in the implementation class.
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+ virtual ~RowRanges() = default;
+ virtual size_t RowCount() const = 0;
+ virtual int64_t LastRow() const = 0;
+ virtual bool IsValid() const = 0;
+ virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0;
+ // Given a RowRanges with rows accross all RGs, split it into N RowRanges,
where N = number of RGs
+ // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested
in RowRanges [90-110], then
+ // this function will return 2 RowRanges: [90-99] and [0-10]
+ virtual std::vector<std::unique_ptr<RowRanges>> SplitByRowGroups(const
std::vector<int64_t>& rows_per_rg) const = 0;
Review Comment:
```suggestion
/// \brief Split the row ranges into sub row ranges according to the
/// specified number of rows per sub row ranges.
///
/// \param num_rows_per_row_ranges number of rows per sub row range after
split.
/// \throw ParquetException if num_rows_per_row_ranges does not match total
/// number of rows in the row range.
virtual std::vector<std::unique_ptr<RowRanges>> SplitByRowCount(const
std::vector<int64_t>& num_rows_per_row_ranges) const = 0;
```
IMO, we need to make it more generic by avoiding the concept of row group
here.
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
Review Comment:
Should we avoid the default values as well as the constructor definition to
support aggregate initialization? We can add a static function to create a valid
IntervalRange.
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+ virtual ~RowRanges() = default;
+ virtual size_t RowCount() const = 0;
+ virtual int64_t LastRow() const = 0;
+ virtual bool IsValid() const = 0;
+ virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0;
+ // Given a RowRanges with rows accross all RGs, split it into N RowRanges,
where N = number of RGs
+ // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested
in RowRanges [90-110], then
+ // this function will return 2 RowRanges: [90-99] and [0-10]
+ virtual std::vector<std::unique_ptr<RowRanges>> SplitByRowGroups(const
std::vector<int64_t>& rows_per_rg) const = 0;
+ virtual std::string ToString() const = 0;
+
+ // Returns a vector of PageLocations that must be read all to get values for
Review Comment:
Remove this if it is not required at the moment.
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
Review Comment:
My personal preference is to simply define it as below
```
struct IntervalRange {
int64_t start;
int64_t end;
};
```
Then move all operations to a separate IntervalRangeUtil class. Users do not
care about these operations.
##########
cpp/src/parquet/column_reader.h:
##########
@@ -424,6 +567,10 @@ class PARQUET_EXPORT RecordReader {
/// \brief True if reading dense for nullable columns.
bool read_dense_for_nullable() const { return read_dense_for_nullable_; }
+ void reset_current_rg_processed_records() { current_rg_processed_records_ =
0; }
+
+ void set_record_skipper(const std::shared_ptr<RecordSkipper>& skipper) {
skipper_ = skipper; }
Review Comment:
```suggestion
void set_record_skipper(std::shared_ptr<RecordSkipper> skipper) { skipper_
= std::move(skipper); }
```
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+ virtual ~RowRanges() = default;
+ virtual size_t RowCount() const = 0;
+ virtual int64_t LastRow() const = 0;
+ virtual bool IsValid() const = 0;
+ virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0;
+ // Given a RowRanges with rows accross all RGs, split it into N RowRanges,
where N = number of RGs
+ // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested
in RowRanges [90-110], then
+ // this function will return 2 RowRanges: [90-99] and [0-10]
+ virtual std::vector<std::unique_ptr<RowRanges>> SplitByRowGroups(const
std::vector<int64_t>& rows_per_rg) const = 0;
+ virtual std::string ToString() const = 0;
+
+ // Returns a vector of PageLocations that must be read all to get values for
+ // all included in this range virtual std::vector<PageLocation>
+ // PageIndexesToInclude(const std::vector<PageLocation>& all_pages) = 0;
+
+ class Iterator {
+ public:
+ virtual std::variant<IntervalRange, BitmapRange, End> NextRange() = 0;
+ virtual ~Iterator() = default;
+ };
+ virtual std::unique_ptr<Iterator> NewIterator() const = 0;
+
+};
+
+class IntervalRanges : public RowRanges {
Review Comment:
What about adding a separate `row_range_internal.h` to hold this class and
its friends?
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+ virtual ~RowRanges() = default;
+ virtual size_t RowCount() const = 0;
+ virtual int64_t LastRow() const = 0;
+ virtual bool IsValid() const = 0;
+ virtual bool IsOverlapping(const IntervalRange& searchRange) const = 0;
+ // Given a RowRanges with rows accross all RGs, split it into N RowRanges,
where N = number of RGs
+ // e.g.: suppose we have 2 RGs: [0-99] and [100-199], and user is interested
in RowRanges [90-110], then
+ // this function will return 2 RowRanges: [90-99] and [0-10]
+ virtual std::vector<std::unique_ptr<RowRanges>> SplitByRowGroups(const
std::vector<int64_t>& rows_per_rg) const = 0;
+ virtual std::string ToString() const = 0;
+
+ // Returns a vector of PageLocations that must be read all to get values for
+ // all included in this range virtual std::vector<PageLocation>
+ // PageIndexesToInclude(const std::vector<PageLocation>& all_pages) = 0;
+
+ class Iterator {
+ public:
+ virtual std::variant<IntervalRange, BitmapRange, End> NextRange() = 0;
+ virtual ~Iterator() = default;
+ };
+ virtual std::unique_ptr<Iterator> NewIterator() const = 0;
+
+};
+
+class IntervalRanges : public RowRanges {
+ public:
+ IntervalRanges();
+ explicit IntervalRanges(const IntervalRange& range);
+ explicit IntervalRanges(const std::vector<IntervalRange>& ranges);
+ std::unique_ptr<Iterator> NewIterator() const override;
+ size_t RowCount() const override;
+ int64_t LastRow() const override;
+ bool IsValid() const override;
+ bool IsOverlapping(const IntervalRange& searchRange) const override;
+ std::string ToString() const override;
+ std::vector<std::unique_ptr<RowRanges>> SplitByRowGroups(
+ const std::vector<int64_t>& rows_per_rg) const override;
+ static IntervalRanges Intersection(const IntervalRanges& left,
+ const IntervalRanges& right);
+ void Add(const IntervalRange& range);
+ const std::vector<IntervalRange>& GetRanges() const;
+
+ private:
+ std::vector<IntervalRange> ranges_;
+};
+
+class IntervalRowRangesIterator : public RowRanges::Iterator {
+ public:
+ IntervalRowRangesIterator(const std::vector<IntervalRange>& ranges);
+ ~IntervalRowRangesIterator() override;
+ std::variant<IntervalRange, BitmapRange, End> NextRange() override;
+
+ private:
+ const std::vector<IntervalRange>& ranges_;
+ size_t current_index_ = 0;
+};
+
namespace internal {
+// A RecordSkipper is used to skip uncessary rows within each pages.
+class PARQUET_EXPORT RecordSkipper {
Review Comment:
It seems we can use a forward declaration here and move the class definition to the cpp file?
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
Review Comment:
Remove the default ctor?
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,8 +303,150 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const {
+ if(!IsValid()) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ return end - start + 1;
+ }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "(" + std::to_string(start) + ", " + std::to_string(end) + ")";
+ }
+
+ // inclusive
+ int64_t start = -1;
+ // inclusive
+ int64_t end = -1;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
Review Comment:
BTW, we need some utility function to make it easy for users to create row
ranges in the common case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]