binmahone commented on code in PR #38867:
URL: https://github.com/apache/arrow/pull/38867#discussion_r1451694236
##########
cpp/src/parquet/arrow/reader.cc:
##########
@@ -336,19 +342,66 @@ class FileReaderImpl : public FileReader {
return ReadRowGroup(i, Iota(reader_->metadata()->num_columns()), table);
}
+ // This is a internal API owned by FileReaderImpl, not exposed in FileReader
+ Status GetRecordBatchReaderWithRowRanges(const std::vector<int>&
row_group_indices,
+ const std::vector<int>&
column_indices,
+ const
std::shared_ptr<std::vector<RowRanges>> & row_ranges_map,
+ std::unique_ptr<RecordBatchReader>*
out);
+
+ Status GetRecordBatchReader(const RowRanges& rows_to_return,
+ const std::vector<int>& column_indices,
+ std::unique_ptr<RecordBatchReader>* out)
override {
+ const auto metadata = reader_->metadata();
+ // check if the row ranges are valid
+ if (!rows_to_return.IsValid()) {
+ return Status::Invalid("The provided row range is invalid, keep it
monotone and non-interleaving: " +
+ rows_to_return.ToString());
+ }
+ // check if the row ranges are within the row group boundaries
+ if (rows_to_return.RowCount() != 0 &&
rows_to_return.GetRanges().back().end >= metadata->num_rows()) {
+ return Status::Invalid("The provided row range " +
rows_to_return.ToString() +
+ " exceeds the number of rows in the file: " +
+ std::to_string(metadata->num_rows()));
+ }
+
+ std::vector<int64_t> split_points;
+ int64_t rows_so_far = 0;
+ for (int i = 0 ; i < metadata->num_row_groups() - 1; i++) {
+ rows_so_far += metadata->RowGroup(i)->num_rows();
+ split_points.push_back(rows_so_far);
+ }
+ // We'll assign a RowRanges for each RG, even if it's not required to
return any rows
+ const std::vector<RowRanges> splits = rows_to_return.SplitAt(split_points);
+ // Call row_ranges_map because array index is the row group index
+ const std::shared_ptr<std::vector<RowRanges>> row_ranges_map =
+ std::make_shared<std::vector<RowRanges>>();
+ rows_so_far = 0;
+ std::vector<int> row_group_indices;
+ for (int i = 0 ; i < metadata->num_row_groups(); i++) {
+ row_ranges_map->push_back(splits[i].shift(-rows_so_far));
+ rows_so_far += metadata->RowGroup(i)->num_rows();
+ if (row_ranges_map->at(i).RowCount() > 0)
+ row_group_indices.push_back(i);
+ }
+
+ return GetRecordBatchReaderWithRowRanges(row_group_indices,
column_indices, row_ranges_map, out);
Review Comment:
I'm trying to reuse the existing `GetRecordBatchReaderWithRowRanges` function,
so `row_group_indices` must be kept for its other usages. `row_ranges_map` itself is
indexed by row group number; in `LeafReader::NextRowGroup`, the empty-RG check is
performed by `if (row_ranges.RowCount() != 0)`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]