Dandandan commented on code in PR #7739:
URL: https://github.com/apache/arrow-datafusion/pull/7739#discussion_r1350756179
##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -559,19 +565,35 @@ pub trait ParquetFileReaderFactory: Debug + Send + Sync +
'static {
#[derive(Debug)]
pub struct DefaultParquetFileReaderFactory {
store: Arc<dyn ObjectStore>,
+ /// A cache of [`SharedMetaData`] used to avoid fetching the metadata of
a file
+ /// multiple times, as might occur if a file is repartitioned into
multiple groups
+ metadata_cache: MetadataCache,
}
impl DefaultParquetFileReaderFactory {
/// Create a factory.
pub fn new(store: Arc<dyn ObjectStore>) -> Self {
- Self { store }
+ Self {
+ store,
+ metadata_cache: Default::default(),
+ }
}
}
+/// A shared cache of [`SharedMetaData`]
+type MetadataCache = Arc<parking_lot::Mutex<HashMap<Path, SharedMetaData>>>;
+
+/// A shared [`ParquetMetaData`] used to only fetch metadata once
+type SharedMetaData = Arc<futures::lock::Mutex<Option<Arc<ParquetMetaData>>>>;
+
/// Implements [`AsyncFileReader`] for a parquet file in object storage
struct ParquetFileReader {
file_metrics: ParquetFileMetrics,
inner: ParquetObjectReader,
+ /// The [`SharedMetaData`] for this object
+ ///
+ /// This avoids looking up metadata multiple times for the same file
+ metadata: SharedMetaData,
Review Comment:
Can we make this metadata cache configurable, so users can opt in to it
depending on their workloads / use cases?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]