Re: [PR] GH-39392: [C++][Parquet] Support page pruning [arrow]

via GitHub Mon, 08 Jan 2024 18:51:09 -0800


wgtmac commented on code in PR #39393:
URL: https://github.com/apache/arrow/pull/39393#discussion_r1444125279



##########
cpp/src/parquet/arrow/reader_internal.h:
##########
@@ -68,15 +72,34 @@ class FileColumnIterator {
         schema_(reader->metadata()->schema()),
         row_groups_(row_groups.begin(), row_groups.end()) {}
 
+  explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
+                              std::vector<int> row_groups, RowRangesOpt 
row_ranges)

Review Comment:
   Consider to combine row_groups and row_ranges.



##########
cpp/src/parquet/arrow/reader.h:
##########
@@ -198,6 +204,10 @@ class PARQUET_EXPORT FileReader {
   ::arrow::Status GetRecordBatchReader(const std::vector<int>& 
row_group_indices,
                                        
std::shared_ptr<::arrow::RecordBatchReader>* out);
   ::arrow::Status 
GetRecordBatchReader(std::shared_ptr<::arrow::RecordBatchReader>* out);
+  ::arrow::Status GetRecordBatchReader(

Review Comment:
   ditto



##########
cpp/src/parquet/arrow/reader.h:
##########
@@ -176,6 +177,11 @@ class PARQUET_EXPORT FileReader {
   ///
   /// \returns error Status if either row_group_indices or column_indices
   ///     contains an invalid index
+  virtual ::arrow::Status GetRecordBatchReader(
+      const std::vector<int>& row_group_indices, const std::vector<int>& 
column_indices,
+      const std::optional<std::unordered_map<int, RowRanges>>& row_ranges,
+      std::unique_ptr<::arrow::RecordBatchReader>* out) = 0;

Review Comment:
   It looks duplicate to provide `row_group_indices` and `row_ranges` at the 
same time.



##########
cpp/src/parquet/arrow/reader.cc:
##########
@@ -973,10 +998,15 @@ Status GetReader(const SchemaField& field, const 
std::shared_ptr<ReaderContext>&
 
 Status FileReaderImpl::GetRecordBatchReader(const std::vector<int>& row_groups,
                                             const std::vector<int>& 
column_indices,
+                                            const RowRangesOpt& row_ranges,
                                             
std::unique_ptr<RecordBatchReader>* out) {
   RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
 
-  if (reader_properties_.pre_buffer()) {
+  // When row_ranges has value, only the data of hit pages should be load,

Review Comment:
   If the IO coalescing process is predictable, we can still enable prebuffer 
here with some code change. If we need to implement this, some logic of 
ChunkBufferedInputStream should be refactored out to be shared by prebuffer.



##########
cpp/src/parquet/arrow/reader.cc:
##########
@@ -157,6 +159,13 @@ class FileReaderImpl : public FileReader {
     };
   }
 
+  FileColumnIteratorFactory SomeRowGroupsFactory(std::vector<int> row_groups,

Review Comment:
   I'd suggest to use interface below:
   ```
   FileColumnIteratorFactory SomeRowGroupsFactory(std::map<int, RowRanges> 
ranges);
   ```
   
   In this way, `GetFieldReader` and `GetFieldReaders` can be changed without 
adding extra optional parameter.
   
   ```
     // RowGroups can be either std::vector<int> or std::map<int, RowRanges>
     template <typename RowGroups>
     Status GetFieldReader(int i,
                           const std::shared_ptr<std::unordered_set<int>>& 
included_leaves,
                           const RowGroups& row_groups,
                           std::unique_ptr<ColumnReaderImpl>* out)
   
     template <typename RowGroups>
     Status GetFieldReaders(const std::vector<int>& column_indices,
                            const RowGroups& row_groups,
                            std::vector<std::shared_ptr<ColumnReaderImpl>>* out,
                            std::shared_ptr<::arrow::Schema>* out_schema)
   ```



##########
cpp/src/parquet/arrow/reader_internal.h:
##########
@@ -68,15 +72,34 @@ class FileColumnIterator {
         schema_(reader->metadata()->schema()),
         row_groups_(row_groups.begin(), row_groups.end()) {}
 
+  explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
+                              std::vector<int> row_groups, RowRangesOpt 
row_ranges)
+      : column_index_(column_index),
+        reader_(reader),
+        schema_(reader->metadata()->schema()),
+        row_groups_(row_groups.begin(), row_groups.end()),
+        row_ranges_(std::move(row_ranges)) {}
+
   virtual ~FileColumnIterator() {}
 
   std::unique_ptr<::parquet::PageReader> NextChunk() {
     if (row_groups_.empty()) {
       return nullptr;
     }
 
-    auto row_group_reader = reader_->RowGroup(row_groups_.front());
+    const int row_group_ordinal = row_groups_.front();
+    auto row_group_reader = reader_->RowGroup(row_group_ordinal);
     row_groups_.pop_front();
+    if (row_ranges_.has_value()) {
+      auto row_ranges_map = row_ranges_.value();
+      auto it = row_ranges_map.find(row_group_ordinal);
+      if (it != row_ranges_map.end()) {
+        auto index_reader = 
reader_->GetPageIndexReader()->RowGroup(row_group_ordinal);

Review Comment:
   Error handling is required here when index_reader is not available.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-39392: [C++][Parquet] Support page pruning [arrow]

Reply via email to