nuno-faria commented on code in PR #17133: URL: https://github.com/apache/datafusion/pull/17133#discussion_r2269170815
########## datafusion/core/src/datasource/listing/table.rs: ########## @@ -861,16 +874,23 @@ impl ListingOptions { /// If the query has a predicate like `WHERE date = '2024-06-01'` /// only the corresponding directory will be read. /// -/// `ListingTable` also supports limit, filter and projection pushdown for formats that -/// support it as such as Parquet. -/// /// # See Also /// /// 1. [`ListingTableConfig`]: Configuration options /// 1. [`DataSourceExec`]: `ExecutionPlan` used by `ListingTable` /// /// [`DataSourceExec`]: crate::datasource::source::DataSourceExec /// +/// # Caching Metadata +/// +/// Some formats, such as Parquet, use the `FileMetadataCache` to cache file +/// metadata that is needed to execute but expensive to read, such as row Review Comment: ```suggestion /// metadata that is needed to execute but expensive to read, such as row ``` ########## datafusion/core/src/datasource/listing/table.rs: ########## @@ -861,16 +874,23 @@ impl ListingOptions { /// If the query has a predicate like `WHERE date = '2024-06-01'` /// only the corresponding directory will be read. /// -/// `ListingTable` also supports limit, filter and projection pushdown for formats that -/// support it as such as Parquet. -/// /// # See Also /// /// 1. [`ListingTableConfig`]: Configuration options /// 1. [`DataSourceExec`]: `ExecutionPlan` used by `ListingTable` /// /// [`DataSourceExec`]: crate::datasource::source::DataSourceExec /// +/// # Caching Metadata +/// +/// Some formats, such as Parquet, use the `FileMetadataCache` to cache file +/// metadata that is needed to execute but expensive to read, such as row +/// groups and statistics. The cache is scoped to the [`SessionContext`] and can +/// be configured via the [runtime config options] Review Comment: ```suggestion /// be configured via the [runtime config options]. ``` ########## datafusion/execution/src/cache/cache_manager.rs: ########## @@ -24,27 +24,55 @@ use std::any::Any; use std::fmt::{Debug, Formatter}; use std::sync::Arc; -/// The cache of listing files statistics. -/// if set [`CacheManagerConfig::with_files_statistics_cache`] -/// Will avoid infer same file statistics repeatedly during the session lifetime, -/// this cache will store in [`crate::runtime_env::RuntimeEnv`]. +/// A cache for [`Statistics`]. +/// +/// If enabled via [`CacheManagerConfig::with_files_statistics_cache`] this +/// cache avoids inferring the same file statistics repeatedly during the +/// session lifetime. +/// +/// See [`crate::runtime_env::RuntimeEnv`] for more details pub type FileStatisticsCache = Arc<dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta>>; +/// Cache for storing the [`ObjectMeta`]s that result from listing a path +/// +/// Listing a path means doing an object store "list" operation or `ls` +/// command on the local filesystem. This operation can be expensive, +/// especially when done over remote object stores. +/// +/// See [`crate::runtime_env::RuntimeEnv`] for more details pub type ListFilesCache = Arc<dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta>>; -/// Represents generic file-embedded metadata. +/// Generic file-embedded metadata used with [`FileMetadataCache`]. +/// +/// For example, Parquet footers and page metadata can be represented +/// using this trait. +/// +/// See [`crate::runtime_env::RuntimeEnv`] for more details pub trait FileMetadata: Any + Send + Sync { - /// Returns the file metadata as [`Any`] so that it can be downcasted to a specific + /// Returns the file metadata as [`Any`] so that it can be downcast to a specific /// implementation. fn as_any(&self) -> &dyn Any; /// Returns the size of the metadata in bytes. fn memory_size(&self) -> usize; } -/// Cache to store file-embedded metadata. +/// Cache for file-embedded metadata. +/// +/// This cache stores per-file metadata in the form of [`FileMetadata`], +/// +/// For example, the built in [`ListingTable`] uses this cache to avoid parsing +/// Parquet footers multiple times for the same file. +/// +/// DataFusion provides a default implementation, [`DefaultFilesMetadataCache`], +/// and users can also provide their own implementations to implement custom +/// caching strategies. +/// +/// See [`crate::runtime_env::RuntimeEnv`] for more details Review Comment: ```suggestion /// See [`crate::runtime_env::RuntimeEnv`] for more details. ``` ########## datafusion/execution/src/cache/cache_manager.rs: ########## @@ -169,11 +207,18 @@ impl CacheManagerConfig { self } + /// Set the cache for listing files. + /// + /// Default is `None` (disabled). + Review Comment: ```suggestion ``` ########## datafusion/execution/src/cache/cache_unit.rs: ########## @@ -271,21 +281,36 @@ impl DefaultFilesMetadataCacheState { } } +/// Default implementation of [`FileMetadataCache`] +/// /// Collected file embedded metadata cache. -/// The metadata for some file is invalided when the file size or last modification time have been -/// changed. -/// The `memory_limit` passed in the constructor controls the maximum size of the cache, which uses -/// a Least Recently Used eviction algorithm. -/// Users should use the `get` and `put` methods. The `get_with_extra` and `put_with_extra` methods -/// simply call `get` and `put`, respectively. +/// +/// The metadata for each file is invalided when the file size or last Review Comment: My original comment had a typo: ```suggestion /// The metadata for each file is invalidated when the file size or last ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org