Re: [PR] Don't fetch and decode parquet metadata multiple times [arrow-datafusion]

via GitHub Mon, 09 Oct 2023 13:19:18 -0700


Dandandan commented on code in PR #7739:
URL: https://github.com/apache/arrow-datafusion/pull/7739#discussion_r1350756179



##########
datafusion/core/src/datasource/physical_plan/parquet.rs:
##########
@@ -559,19 +565,35 @@ pub trait ParquetFileReaderFactory: Debug + Send + Sync + 'static {
 #[derive(Debug)]
 pub struct DefaultParquetFileReaderFactory {
     store: Arc<dyn ObjectStore>,
+    /// A cache of [`SharedMetaData`] used to avoid fetching the metadata of a file
+    /// multiple times, as might occur if a file is repartitioned into multiple groups
+    metadata_cache: MetadataCache,
 }
 
 impl DefaultParquetFileReaderFactory {
     /// Create a factory.
     pub fn new(store: Arc<dyn ObjectStore>) -> Self {
-        Self { store }
+        Self {
+            store,
+            metadata_cache: Default::default(),
+        }
     }
 }
 
+/// A shared cache of [`SharedMetaData`]
+type MetadataCache = Arc<parking_lot::Mutex<HashMap<Path, SharedMetaData>>>;
+
+/// A shared [`ParquetMetaData`] used to only fetch metadata once
+type SharedMetaData = Arc<futures::lock::Mutex<Option<Arc<ParquetMetaData>>>>;
+
 /// Implements [`AsyncFileReader`] for a parquet file in object storage
 struct ParquetFileReader {
     file_metrics: ParquetFileMetrics,
     inner: ParquetObjectReader,
+    /// The [`SharedMetaData`] for this object
+    ///
+    /// This avoids looking up metadata multiple times for the same file
+    metadata: SharedMetaData,

Review Comment:
   Can we make this metadata cache configurable, so that users can opt in to it
depending on their workloads / use cases?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Don't fetch and decode parquet metadata multiple times [arrow-datafusion]

Reply via email to