Hor911 commented on issue #34460:
URL: https://github.com/apache/arrow/issues/34460#issuecomment-1455218992

   Here is an implementation of the two new methods, plus a small change to
ReadRowGroups to avoid duplicating code. I tested it with large, complex
Parquet files and it works very well.
   
   ```
   /// Validate the requested row groups and columns, then pre-buffer them if
   /// pre-buffering is enabled.
   ///
   /// Both index lists are passed to BoundsCheck first and any error it reports
   /// is returned unchanged. When reader_properties_.pre_buffer() is true, the
   /// same lists are forwarded to parquet_reader()->PreBuffer() together with
   /// the reader's io_context() and cache_options(); otherwise nothing further
   /// is done.
   ///
   /// \param[in] row_groups row group indices, checked by BoundsCheck
   /// \param[in] column_indices column indices, checked by BoundsCheck
   /// \return the BoundsCheck error if there is one, Status::OK() at the end
   ///   NOTE(review): BEGIN/END_PARQUET_CATCH_EXCEPTIONS presumably convert a
   ///   thrown Parquet exception into an error Status -- confirm.
   Status FileReaderImpl::WillNeedRowGroups(const std::vector<int>& row_groups,
                                            const std::vector<int>& column_indices) {
     RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
   
     // PARQUET-1698/PARQUET-1820: pre-buffer row groups/column chunks if enabled
     if (reader_properties_.pre_buffer()) {
       BEGIN_PARQUET_CATCH_EXCEPTIONS
       parquet_reader()->PreBuffer(row_groups, column_indices,
                                   reader_properties_.io_context(),
                                   reader_properties_.cache_options());
       END_PARQUET_CATCH_EXCEPTIONS
     }
     return Status::OK();
   }
   
   // Blocking variant of DecodeRowGroups: checks the index lists with
   // BoundsCheck, then runs the four-argument DecodeRowGroups overload with a
   // null `self` and a null `cpu_executor` and stores its result in *out.
   // Any error from the bounds check or from the decode result is returned.
   Status FileReaderImpl::DecodeRowGroups(
       const std::vector<int>& row_groups, const std::vector<int>& column_indices,
       std::shared_ptr<::arrow::Table>* out) {
     RETURN_NOT_OK(BoundsCheck(row_groups, column_indices));
   
     // Hold the returned future in a local and take its result from there.
     auto decoded = DecodeRowGroups(
         /*self=*/nullptr, row_groups, column_indices, /*cpu_executor=*/nullptr);
     ARROW_ASSIGN_OR_RAISE(*out, decoded.MoveResult());
     return Status::OK();
   }
   
   // Reads the selected row groups/columns into *out in two steps:
   //   1. WillNeedRowGroups(row_groups, column_indices); its error, if any, is
   //      returned immediately.
   //   2. The four-argument DecodeRowGroups overload, called with a null `self`
   //      and a null `cpu_executor`; its result is moved into *out.
   Status FileReaderImpl::ReadRowGroups(
       const std::vector<int>& row_groups, const std::vector<int>& column_indices,
       std::shared_ptr<Table>* out) {
     RETURN_NOT_OK(WillNeedRowGroups(row_groups, column_indices));
   
     // Hold the returned future in a local and take its result from there.
     auto decoded = DecodeRowGroups(
         /*self=*/nullptr, row_groups, column_indices, /*cpu_executor=*/nullptr);
     ARROW_ASSIGN_OR_RAISE(*out, decoded.MoveResult());
     return Status::OK();
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to