mapleFU commented on code in PR #38867:
URL: https://github.com/apache/arrow/pull/38867#discussion_r1444429627
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,7 +300,284 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const { return end - start + 1; }
Review Comment:
`IntervalRange({-1, -1}).Count() == 1` ? Would this be weird?
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,7 +300,284 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const { return end - start + 1; }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "[" + std::to_string(start) + ", " + std::to_string(end) + "]";
+ }
+
+ // inclusive
+ int64_t start;
+ // inclusive
+ int64_t end;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+
+ explicit RowRanges(const IntervalRange& range) { ranges.push_back(range); }
+
+ RowRanges(const std::vector<IntervalRange>& ranges) { this->ranges = ranges;
}
+
+ RowRanges(const RowRanges& other) { ranges = other.ranges; }
+
+ RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); }
+
+ static RowRanges Intersection(const RowRanges& left, const RowRanges& right)
{
+ RowRanges result;
+
+ size_t rightIndex = 0;
+ for (const IntervalRange& l : left.ranges) {
+ for (size_t i = rightIndex, n = right.ranges.size(); i < n; ++i) {
+ const IntervalRange& r = right.ranges[i];
+ if (l.IsBefore(r)) {
+ break;
+ } else if (l.IsAfter(r)) {
+ rightIndex = i + 1;
+ continue;
+ }
+ result.Add(IntervalRange::Intersection(l, r));
+ }
+ }
+
+ return result;
+ }
+
+ void Add(const IntervalRange& range) {
+ const IntervalRange rangeToAdd = range;
+ if (ranges.size() > 1 && rangeToAdd.start <= ranges.back().end) {
+ throw ParquetException("Ranges must be added in order");
+ }
+ ranges.push_back(rangeToAdd);
+ }
+
+ size_t RowCount() const {
+ size_t cnt = 0;
+ for (const IntervalRange& range : ranges) {
+ cnt += range.Count();
+ }
+ return cnt;
+ }
+
+ bool IsValid() const {
+ if (ranges.size() == 0) return true;
+ if (ranges[0].start < 0) {
+ return false;
+ }
+ for (size_t i = 1; i < ranges.size(); i++) {
+ if (ranges[i].start <= ranges[i - 1].end) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool IsOverlapping(int64_t start, int64_t end) const {
+ const IntervalRange searchRange(start, end);
+ return IsOverlapping(searchRange);
+ }
+
+ bool IsOverlapping(const IntervalRange& searchRange) const {
+ auto it = std::lower_bound(
+ ranges.begin(), ranges.end(), searchRange,
+ [](const IntervalRange& r1, const IntervalRange& r2) { return
r1.IsBefore(r2); });
+ return it != ranges.end() && !(*it).IsAfter(searchRange);
+ }
+
+ std::vector<IntervalRange>& GetRanges() { return ranges; }
+
+ const std::vector<IntervalRange>& GetRanges() const { return ranges; }
+
+ // Split the ranges into N+1 parts at the given split point, where N =
+ // split_points.size() The RowRows object itself is not modified
+ std::vector<RowRanges> SplitAt(const std::vector<int64_t>& split_points)
const {
+ if (split_points.size() == 0) {
+ return {*this};
+ }
+
+ std::vector<RowRanges> result;
+ int64_t last_split_point = -1;
+ for (const int64_t split_point : split_points) {
+ if (split_point <= 0) {
+ throw ParquetException("Invalid split point " +
std::to_string(split_point));
+ }
+ if (split_point <= last_split_point) {
+ throw ParquetException("Split points must be in ascending order");
+ }
+ last_split_point = split_point;
+ }
+
+ RowRanges spaces;
+ for (size_t i = 0; i < split_points.size(); ++i) {
+ auto start = i == 0 ? 0 : split_points[i - 1];
+ auto end = split_points[i] - 1;
+ spaces.Add({start, end});
+ }
+ spaces.Add(
+ {split_points[split_points.size() - 1],
std::numeric_limits<int64_t>::max()});
+
+ for (IntervalRange space : spaces.GetRanges()) {
+ RowRanges intersection = RowRanges::Intersection(RowRanges(space),
*this);
+ result.push_back(intersection);
+ }
+
+ return result;
+ }
+
+ const IntervalRange& operator[](size_t index) const {
+ // check index
+ if (index >= ranges.size() || index < 0) {
+ throw ParquetException("Index out of range");
+ }
+ return ranges[index];
+ }
+
+ RowRanges shift(const int64_t offset) const {
+ RowRanges result;
+ for (const IntervalRange& range : ranges) {
+ result.Add({range.start + offset, range.end + offset});
+ }
+ return result;
+ }
+
+ std::string ToString() const {
+ std::string result = "[";
+ for (const IntervalRange& range : ranges) {
+ result +=
+ "(" + std::to_string(range.start) + ", " + std::to_string(range.end)
+ "), ";
Review Comment:
Uses `range.ToString()`?
##########
cpp/src/parquet/column_reader.h:
##########
@@ -302,7 +300,284 @@ class TypedColumnReader : public ColumnReader {
int32_t* dict_len) = 0;
};
+// Represent a range to read. The range is inclusive on both ends.
+struct IntervalRange {
+ static IntervalRange Intersection(const IntervalRange& left,
+ const IntervalRange& right) {
+ if (left.start <= right.start) {
+ if (left.end >= right.start) {
+ return {right.start, std::min(left.end, right.end)};
+ }
+ } else if (right.end >= left.start) {
+ return {left.start, std::min(left.end, right.end)};
+ }
+ return {-1, -1}; // Return a default Range object if no intersection
range found
+ }
+
+ IntervalRange(const int64_t start_, const int64_t end_) : start(start_),
end(end_) {
+ if (start > end) {
+ throw ParquetException("Invalid range with start: " +
std::to_string(start) +
+ " and end: " + std::to_string(end));
+ }
+ }
+
+ size_t Count() const { return end - start + 1; }
+
+ bool IsBefore(const IntervalRange& other) const { return end < other.start; }
+
+ bool IsAfter(const IntervalRange& other) const { return start > other.end; }
+
+ bool IsOverlap(const IntervalRange& other) const {
+ return !IsBefore(other) && !IsAfter(other);
+ }
+
+ bool IsValid() const { return start >= 0 && end >= 0 && end >= start; }
+
+ std::string ToString() const {
+ return "[" + std::to_string(start) + ", " + std::to_string(end) + "]";
+ }
+
+ // inclusive
+ int64_t start;
+ // inclusive
+ int64_t end;
+};
+
+struct BitmapRange {
+ int64_t offset;
+ // zero added to, if there are less than 64 elements left in the column.
+ uint64_t bitmap;
+};
+
+struct End {};
+
+// Represent a set of ranges to read. The ranges are sorted and
non-overlapping.
+class RowRanges {
+ public:
+ RowRanges() = default;
+
+ explicit RowRanges(const IntervalRange& range) { ranges.push_back(range); }
+
+ RowRanges(const std::vector<IntervalRange>& ranges) { this->ranges = ranges;
}
+
+ RowRanges(const RowRanges& other) { ranges = other.ranges; }
+
+ RowRanges(RowRanges&& other) noexcept { ranges = std::move(other.ranges); }
Review Comment:
= default?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]