tustvold commented on code in PR #7739:
URL: https://github.com/apache/arrow-datafusion/pull/7739#discussion_r1345811732


##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -559,19 +565,35 @@ pub trait ParquetFileReaderFactory: Debug + Send + Sync + 
'static {
 #[derive(Debug)]
 pub struct DefaultParquetFileReaderFactory {
     store: Arc<dyn ObjectStore>,
+    /// An cache of [`SharedMetaData`] used to avoid fetching the metadata of 
a file
+    /// multiple times, as might occur if a file is repartitioned into 
multiple groups
+    metadata_cache: MetadataCache,
 }
 
 impl DefaultParquetFileReaderFactory {
     /// Create a factory.
     pub fn new(store: Arc<dyn ObjectStore>) -> Self {
-        Self { store }
+        Self {
+            store,
+            metadata_cache: Default::default(),
+        }
     }
 }
 
+/// A shared cache of [`SharedMetaData`]
+type MetadataCache = Arc<parking_lot::Mutex<HashMap<Path, SharedMetaData>>>;
+
+/// A shared [`ParquetMetaData`] used to only fetch metadata once
+type SharedMetaData = Arc<futures::lock::Mutex<Option<Arc<ParquetMetaData>>>>;
+
 /// Implements [`AsyncFileReader`] for a parquet file in object storage
 struct ParquetFileReader {
     file_metrics: ParquetFileMetrics,
     inner: ParquetObjectReader,
+    /// The [`SharedMetaData`] for this object
+    ///
+    /// This avoids looking up metadata multiple times for the same file
+    metadata: SharedMetaData,

Review Comment:
   The major downside of this approach is that it entails keeping the metadata, 
which may not be small, around for the entire duration of the query. I'm not sure 
this is a good thing either...



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to