This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 0808f3a8d2 Improvements to `list_files_cache` table function (#19703)
0808f3a8d2 is described below

commit 0808f3a8d2646c9435557db059759653c3f2c383
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jan 14 16:55:10 2026 -0500

    Improvements to `list_files_cache` table function (#19703)
    
    ## Which issue does this PR close?
    
    - Follow on to https://github.com/apache/datafusion/pull/19616
    
    ## Rationale for this change
    
    I had a few minor comments / suggestions while reviewing
    https://github.com/apache/datafusion/pull/19616 from @jizezhang, but they
    weren't needed for the initial merge, so I would like to propose them
    in a follow-up PR.
    
    ## What changes are included in this PR?
    
    1. Improve documentation
    2. Improve handling of `table_ref` in `ListingTableUrl`
    3. Use NULL rather than the string `"NULL"` in the `list_files_cache` table function
    
    I can break this into separate PRs if that would help
    
    ## Are these changes tested?
    
    Yes, by CI
    
    ## Are there any user-facing changes?
    
    The `list_files_cache` table function may now return NULL in its `table`
    column where it previously returned the string `"NULL"`.
---
 datafusion-cli/src/functions.rs                         | 13 ++++++++-----
 datafusion/core/src/datasource/listing_table_factory.rs |  4 +---
 datafusion/datasource/src/url.rs                        | 12 +++++++-----
 datafusion/execution/src/cache/cache_manager.rs         |  1 +
 datafusion/execution/src/cache/list_files_cache.rs      |  5 +++++
 5 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs
index 6a97c5355f..e50339d296 100644
--- a/datafusion-cli/src/functions.rs
+++ b/datafusion-cli/src/functions.rs
@@ -703,10 +703,13 @@ impl TableFunctionImpl for StatisticsCacheFunc {
     }
 }
 
-// Implementation of the `list_files_cache` table function in datafusion-cli.
+/// Implementation of the `list_files_cache` table function in datafusion-cli.
+///
+/// This function returns the cached results of running a LIST command on a
+/// particular object store path for a table. The object metadata is returned as
+/// a List of Structs, with one Struct for each object. DataFusion uses these
+/// cached results to plan queries against external tables.
 ///
-/// This function returns the cached results of running a LIST command on a particular object store path for a table. The object metadata is returned as a List of Structs, with one Struct for each object.
-/// DataFusion uses these cached results to plan queries against external tables.
 /// # Schema
 /// ```sql
 /// > describe select * from list_files_cache();
@@ -788,7 +791,7 @@ impl TableFunctionImpl for ListFilesCacheFunc {
             Field::new("metadata", DataType::Struct(nested_fields.clone()), true);
 
         let schema = Arc::new(Schema::new(vec![
-            Field::new("table", DataType::Utf8, false),
+            Field::new("table", DataType::Utf8, true),
             Field::new("path", DataType::Utf8, false),
             Field::new("metadata_size_bytes", DataType::UInt64, false),
             // expires field in ListFilesEntry has type Instant when set, from which we cannot get "the number of seconds", hence using Duration instead of Timestamp as data type.
@@ -821,7 +824,7 @@ impl TableFunctionImpl for ListFilesCacheFunc {
             let mut current_offset: i32 = 0;
 
             for (path, entry) in list_files_cache.list_entries() {
-                table_arr.push(path.table.map_or("NULL".to_string(), |t| t.to_string()));
+                table_arr.push(path.table.map(|t| t.to_string()));
                 path_arr.push(path.path.to_string());
                 metadata_size_bytes_arr.push(entry.size_bytes as u64);
                 // calculates time left before entry expires
diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs
index 86af691fd7..98f61a8528 100644
--- a/datafusion/core/src/datasource/listing_table_factory.rs
+++ b/datafusion/core/src/datasource/listing_table_factory.rs
@@ -161,9 +161,7 @@ impl TableProviderFactory for ListingTableFactory {
                         }
                         None => format!("*.{}", cmd.file_type.to_lowercase()),
                     };
-                    table_path = table_path
-                        .with_glob(glob.as_ref())?
-                        .with_table_ref(cmd.name.clone());
+                    table_path = table_path.with_glob(glob.as_ref())?;
                 }
                 let schema = options.infer_schema(session_state, &table_path).await?;
                 let df_schema = Arc::clone(&schema).to_dfschema()?;
diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs
index 678bd280fc..0c274806c0 100644
--- a/datafusion/datasource/src/url.rs
+++ b/datafusion/datasource/src/url.rs
@@ -43,7 +43,7 @@ pub struct ListingTableUrl {
     prefix: Path,
     /// An optional glob expression used to filter files
     glob: Option<Pattern>,
-
+    /// Optional table reference for the table this url belongs to
     table_ref: Option<TableReference>,
 }
 
@@ -341,17 +341,19 @@ impl ListingTableUrl {
     }
 
     /// Returns a copy of current [`ListingTableUrl`] with a specified `glob`
-    pub fn with_glob(self, glob: &str) -> Result<Self> {
-        let glob =
-            Pattern::new(glob).map_err(|e| DataFusionError::External(Box::new(e)))?;
-        Self::try_new(self.url, Some(glob))
+    pub fn with_glob(mut self, glob: &str) -> Result<Self> {
+        self.glob =
+            Some(Pattern::new(glob).map_err(|e| DataFusionError::External(Box::new(e)))?);
+        Ok(self)
     }
 
+    /// Set the table reference for this [`ListingTableUrl`]
     pub fn with_table_ref(mut self, table_ref: TableReference) -> Self {
         self.table_ref = Some(table_ref);
         self
     }
 
+    /// Return the table reference for this [`ListingTableUrl`]
     pub fn get_table_ref(&self) -> &Option<TableReference> {
         &self.table_ref
     }
diff --git a/datafusion/execution/src/cache/cache_manager.rs b/datafusion/execution/src/cache/cache_manager.rs
index 4cc5586440..bd34c441bd 100644
--- a/datafusion/execution/src/cache/cache_manager.rs
+++ b/datafusion/execution/src/cache/cache_manager.rs
@@ -196,6 +196,7 @@ pub trait ListFilesCache: CacheAccessor<TableScopedPath, CachedFileList> {
     /// Retrieves the information about the entries currently cached.
     fn list_entries(&self) -> HashMap<TableScopedPath, ListFilesEntry>;
 
+    /// Drop all entries for the given table reference.
     fn drop_table_entries(&self, table_ref: &Option<TableReference>) -> Result<()>;
 }
 
diff --git a/datafusion/execution/src/cache/list_files_cache.rs b/datafusion/execution/src/cache/list_files_cache.rs
index c86a03574e..b1b8e6b500 100644
--- a/datafusion/execution/src/cache/list_files_cache.rs
+++ b/datafusion/execution/src/cache/list_files_cache.rs
@@ -139,6 +139,11 @@ pub const DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT: usize = 1024 * 1024; // 1MiB
 /// The default cache TTL for the [`DefaultListFilesCache`]
 pub const DEFAULT_LIST_FILES_CACHE_TTL: Option<Duration> = None; // Infinite
 
+/// Key for [`DefaultListFilesCache`]
+///
+/// Each entry is scoped to its use within a specific table so that the cache
+/// can differentiate between identical paths in different tables and support
+/// table-level cache invalidation.
 #[derive(PartialEq, Eq, Hash, Clone, Debug)]
 pub struct TableScopedPath {
     pub table: Option<TableReference>,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to