This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch branch-46
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/branch-46 by this push:
     new 76d833ac21 Improve documentation for `DataSourceExec`, 
`FileScanConfig`, `DataSource` etc (#14941) (#14965)
76d833ac21 is described below

commit 76d833ac215053e102424617f754946ea198388f
Author: Andrew Lamb <[email protected]>
AuthorDate: Sun Mar 2 08:23:39 2025 -0500

    Improve documentation for `DataSourceExec`, `FileScanConfig`, `DataSource` 
etc (#14941) (#14965)
---
 datafusion/core/src/datasource/listing/table.rs | 13 ++++++++++---
 datafusion/core/src/lib.rs                      | 10 +++++-----
 datafusion/datasource/src/file.rs               |  4 ++--
 datafusion/datasource/src/source.rs             | 23 +++++++++++++++++++++--
 4 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs 
b/datafusion/core/src/datasource/listing/table.rs
index 41e939d60b..a983f0696e 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -616,6 +616,7 @@ impl ListingOptions {
 /// using an  [`ObjectStore`] instance, for example from local files or objects
 /// from AWS S3.
 ///
+/// # Reading Directories
 /// For example, given the `table1` directory (or object store prefix)
 ///
 /// ```text
@@ -651,13 +652,19 @@ impl ListingOptions {
 /// If the query has a predicate like `WHERE date = '2024-06-01'`
 /// only the corresponding directory will be read.
 ///
-/// `ListingTable` also supports filter and projection pushdown for formats 
that
+/// `ListingTable` also supports limit, filter and projection pushdown for 
formats that
 /// support it as such as Parquet.
 ///
+/// # Implementation
+///
+/// `ListingTable` uses [`DataSourceExec`] to execute the data. See that struct
+/// for more details.
+///
+/// [`DataSourceExec`]: crate::datasource::source::DataSourceExec
+///
 /// # Example
 ///
-/// Here is an example of reading a directory of parquet files using a
-/// [`ListingTable`]:
+/// To read a directory of parquet files using a [`ListingTable`]:
 ///
 /// ```no_run
 /// # use datafusion::prelude::SessionContext;
diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs
index 9a0d0157c1..dfd171082f 100644
--- a/datafusion/core/src/lib.rs
+++ b/datafusion/core/src/lib.rs
@@ -298,10 +298,10 @@
 //!         (built in or user provided)    ExecutionPlan
 //! ```
 //!
-//! DataFusion includes several built in data sources for common use
-//! cases, and can be extended by implementing the [`TableProvider`]
-//! trait. A [`TableProvider`] provides information for planning and
-//! an [`ExecutionPlan`]s for execution.
+//! A [`TableProvider`] provides information for planning and
+//! an [`ExecutionPlan`] for execution. DataFusion includes [`ListingTable`]
+//! which supports reading several common file formats, and you can support any
+//! new file format by implementing the [`TableProvider`] trait. See also:
 //!
 //! 1. [`ListingTable`]: Reads data from Parquet, JSON, CSV, or AVRO
 //!    files.  Supports single files or multiple files with HIVE style
@@ -314,7 +314,7 @@
 //!
 //! [`ListingTable`]: crate::datasource::listing::ListingTable
 //! [`MemTable`]: crate::datasource::memory::MemTable
-//! [`StreamingTable`]: datafusion_catalog::streaming::StreamingTable
+//! [`StreamingTable`]: crate::catalog::streaming::StreamingTable
 //!
 //! ## Plan Representations
 //!
diff --git a/datafusion/datasource/src/file.rs 
b/datafusion/datasource/src/file.rs
index 8d8cbbc67b..0066f39801 100644
--- a/datafusion/datasource/src/file.rs
+++ b/datafusion/datasource/src/file.rs
@@ -33,9 +33,9 @@ use datafusion_physical_plan::DisplayFormatType;
 
 use object_store::ObjectStore;
 
-/// Common behaviors that every file format needs to implement.
+/// Common behaviors that file formats need to implement.
 ///
-/// See initialization examples on `ParquetSource`, `CsvSource`
+/// See implementation examples such as `ParquetSource`, `CsvSource`
 pub trait FileSource: Send + Sync {
     /// Creates a `dyn FileOpener` based on given parameters
     fn create_file_opener(
diff --git a/datafusion/datasource/src/source.rs 
b/datafusion/datasource/src/source.rs
index b3089a6e59..07cee7fba0 100644
--- a/datafusion/datasource/src/source.rs
+++ b/datafusion/datasource/src/source.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! [`DataSource`] and [`DataSourceExec`]
+
 use std::any::Any;
 use std::fmt;
 use std::fmt::{Debug, Formatter};
@@ -34,9 +36,15 @@ use datafusion_physical_expr::{EquivalenceProperties, 
Partitioning};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
 
 /// Common behaviors in Data Sources for both from Files and Memory.
-/// See `DataSourceExec` for physical plan implementation
 ///
+/// # See Also
+/// * [`DataSourceExec`] for physical plan implementation
+/// * [`FileSource`] for file format implementations (Parquet, Json, etc)
+///
+/// # Notes
 /// Requires `Debug` to assist debugging
+///
+/// [`FileSource`]: crate::file::FileSource
 pub trait DataSource: Send + Sync + Debug {
     fn open(
         &self,
@@ -71,10 +79,21 @@ pub trait DataSource: Send + Sync + Debug {
     ) -> datafusion_common::Result<Option<Arc<dyn ExecutionPlan>>>;
 }
 
-/// Unified data source for file formats like JSON, CSV, AVRO, ARROW, PARQUET
+/// [`ExecutionPlan`] that handles different file formats like JSON, CSV, AVRO, 
ARROW, PARQUET
+///
+/// `DataSourceExec` implements common functionality such as applying 
projections,
+/// and caching plan properties.
+///
+/// The [`DataSource`] trait describes where to find the data for this data
+/// source (for example what files or what in memory partitions). Format
+/// specifics are implemented with the [`FileSource`] trait.
+///
+/// [`FileSource`]: crate::file::FileSource
 #[derive(Clone, Debug)]
 pub struct DataSourceExec {
+    /// The source of the data -- for example, `FileScanConfig` or 
`MemorySourceConfig`
     data_source: Arc<dyn DataSource>,
+    /// Cached plan properties such as sort order
     cache: PlanProperties,
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to