This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch branch-46
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/branch-46 by this push:
new 76d833ac21 Improve documentation for `DataSourceExec`,
`FileScanConfig`, `DataSource` etc (#14941) (#14965)
76d833ac21 is described below
commit 76d833ac215053e102424617f754946ea198388f
Author: Andrew Lamb <[email protected]>
AuthorDate: Sun Mar 2 08:23:39 2025 -0500
Improve documentation for `DataSourceExec`, `FileScanConfig`, `DataSource`
etc (#14941) (#14965)
---
datafusion/core/src/datasource/listing/table.rs | 13 ++++++++++---
datafusion/core/src/lib.rs | 10 +++++-----
datafusion/datasource/src/file.rs | 4 ++--
datafusion/datasource/src/source.rs | 23 +++++++++++++++++++++--
4 files changed, 38 insertions(+), 12 deletions(-)
diff --git a/datafusion/core/src/datasource/listing/table.rs
b/datafusion/core/src/datasource/listing/table.rs
index 41e939d60b..a983f0696e 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -616,6 +616,7 @@ impl ListingOptions {
/// using an [`ObjectStore`] instance, for example from local files or objects
/// from AWS S3.
///
+/// # Reading Directories
/// For example, given the `table1` directory (or object store prefix)
///
/// ```text
@@ -651,13 +652,19 @@ impl ListingOptions {
/// If the query has a predicate like `WHERE date = '2024-06-01'`
/// only the corresponding directory will be read.
///
-/// `ListingTable` also supports filter and projection pushdown for formats that
+/// `ListingTable` also supports limit, filter and projection pushdown for formats that
/// support it, such as Parquet.
///
+/// # Implementation
+///
+/// `ListingTable` uses [`DataSourceExec`] to read the data. See that struct
+/// for more details.
+///
+/// [`DataSourceExec`]: crate::datasource::source::DataSourceExec
+///
/// # Example
///
-/// Here is an example of reading a directory of parquet files using a
-/// [`ListingTable`]:
+/// To read a directory of parquet files using a [`ListingTable`]:
///
/// ```no_run
/// # use datafusion::prelude::SessionContext;
diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs
index 9a0d0157c1..dfd171082f 100644
--- a/datafusion/core/src/lib.rs
+++ b/datafusion/core/src/lib.rs
@@ -298,10 +298,10 @@
//! (built in or user provided) ExecutionPlan
//! ```
//!
-//! DataFusion includes several built in data sources for common use
-//! cases, and can be extended by implementing the [`TableProvider`]
-//! trait. A [`TableProvider`] provides information for planning and
-//! an [`ExecutionPlan`]s for execution.
+//! A [`TableProvider`] provides information for planning and
+//! an [`ExecutionPlan`] for execution. DataFusion includes [`ListingTable`]
+//! which supports reading several common file formats, and you can support any
+//! new file format by implementing the [`TableProvider`] trait. See also:
//!
//! 1. [`ListingTable`]: Reads data from Parquet, JSON, CSV, or AVRO
//! files. Supports single files or multiple files with HIVE style
@@ -314,7 +314,7 @@
//!
//! [`ListingTable`]: crate::datasource::listing::ListingTable
//! [`MemTable`]: crate::datasource::memory::MemTable
-//! [`StreamingTable`]: datafusion_catalog::streaming::StreamingTable
+//! [`StreamingTable`]: crate::catalog::streaming::StreamingTable
//!
//! ## Plan Representations
//!
diff --git a/datafusion/datasource/src/file.rs
b/datafusion/datasource/src/file.rs
index 8d8cbbc67b..0066f39801 100644
--- a/datafusion/datasource/src/file.rs
+++ b/datafusion/datasource/src/file.rs
@@ -33,9 +33,9 @@ use datafusion_physical_plan::DisplayFormatType;
use object_store::ObjectStore;
-/// Common behaviors that every file format needs to implement.
+/// Common behaviors that file formats need to implement.
///
-/// See initialization examples on `ParquetSource`, `CsvSource`
+/// See implementation examples such as `ParquetSource`, `CsvSource`
pub trait FileSource: Send + Sync {
/// Creates a `dyn FileOpener` based on given parameters
fn create_file_opener(
diff --git a/datafusion/datasource/src/source.rs
b/datafusion/datasource/src/source.rs
index b3089a6e59..07cee7fba0 100644
--- a/datafusion/datasource/src/source.rs
+++ b/datafusion/datasource/src/source.rs
@@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.
+//! [`DataSource`] and [`DataSourceExec`]
+
use std::any::Any;
use std::fmt;
use std::fmt::{Debug, Formatter};
@@ -34,9 +36,15 @@ use datafusion_physical_expr::{EquivalenceProperties,
Partitioning};
use datafusion_physical_expr_common::sort_expr::LexOrdering;
/// Common behaviors in Data Sources for both Files and Memory.
-/// See `DataSourceExec` for physical plan implementation
///
+/// # See Also
+/// * [`DataSourceExec`] for physical plan implementation
+/// * [`FileSource`] for file format implementations (Parquet, Json, etc)
+///
+/// # Notes
/// Requires `Debug` to assist debugging
+///
+/// [`FileSource`]: crate::file::FileSource
pub trait DataSource: Send + Sync + Debug {
fn open(
&self,
@@ -71,10 +79,21 @@ pub trait DataSource: Send + Sync + Debug {
) -> datafusion_common::Result<Option<Arc<dyn ExecutionPlan>>>;
}
-/// Unified data source for file formats like JSON, CSV, AVRO, ARROW, PARQUET
+/// [`ExecutionPlan`] that handles different file formats like JSON, CSV, AVRO, ARROW, PARQUET
+///
+/// `DataSourceExec` implements common functionality such as applying projections,
+/// and caching plan properties.
+///
+/// The [`DataSource`] trait describes where to find the data for this data
+/// source (for example what files or what in memory partitions). Format
+/// specifics are implemented with the [`FileSource`] trait.
+///
+/// [`FileSource`]: crate::file::FileSource
#[derive(Clone, Debug)]
pub struct DataSourceExec {
+ /// The source of the data -- for example, `FileScanConfig` or `MemorySourceConfig`
data_source: Arc<dyn DataSource>,
+ /// Cached plan properties such as sort order
cache: PlanProperties,
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]