bkietz commented on a change in pull request #8507:
URL: https://github.com/apache/arrow/pull/8507#discussion_r512771339



##########
File path: cpp/src/arrow/dataset/file_parquet.h
##########
@@ -208,43 +146,34 @@ class ARROW_DS_EXPORT RowGroupInfo : public util::EqualityComparable<RowGroupInf
 /// number of scanned RowGroups, or to partition the scans across multiple
 /// threads.
 ///
-/// It can also attach optional statistics with each RowGroups, providing
-/// pushdown predicate benefits before invoking any heavy IO. This can induce
+/// Metadata can be explicitly provided, enabling pushdown predicate benefits without
+/// the potentially heavy IO of loading Metadata from the file system. This can induce
 /// significant performance boost when scanning high latency file systems.
 class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
  public:
   Result<FragmentVector> SplitByRowGroup(const std::shared_ptr<Expression>& predicate);
 
-  /// \brief Return the RowGroups selected by this fragment, or nullptr
-  /// if all RowGroups in the parquet file are selected.
-  const std::vector<RowGroupInfo>* row_groups();
-
-  /// \brief Return the number of row groups selected by this fragment.
-  Result<int> GetNumRowGroups();
+  /// \brief Return the RowGroups selected by this fragment.
+  const std::vector<int>& row_groups() const { return row_groups_; }
 
-  /// \brief Indicate if the attached statistics are complete and the physical schema
-  /// is cached.
-  ///
-  /// The statistics are complete if the provided RowGroups (see `row_groups()`)
-  /// is not empty / and all RowGroup return true on `RowGroup::HasStatistics()`.
-  bool HasCompleteMetadata() const { return has_complete_metadata_; }
+  /// \brief Return the FileMetaData associated with this fragment.
+  const std::shared_ptr<parquet::FileMetaData>& metadata() const { return metadata_; }
 
-  /// \brief Ensure attached statistics are complete and the physical schema is cached.
+  /// \brief Ensure this fragment's FileMetaData is in memory.
   Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR);
 
   /// \brief Return a filtered subset of the ParquetFileFragment.
   Result<std::shared_ptr<Fragment>> Subset(const std::shared_ptr<Expression>& predicate);
-  Result<std::shared_ptr<Fragment>> Subset(const std::vector<int> row_group_ids);
+  Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);
 
  private:
   ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
                       std::shared_ptr<Expression> partition_expression,
                       std::shared_ptr<Schema> physical_schema,
-                      std::vector<RowGroupInfo> row_groups);
+                      std::vector<int> row_groups, bool select_all_row_groups = false);

Review comment:
       Will do




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to