This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new e11a8ca00b refactor: Make catalog datasource (#14643)
e11a8ca00b is described below
commit e11a8ca00b19dee395b8a73e625b65b237e18c50
Author: logan-keede <[email protected]>
AuthorDate: Fri Feb 14 16:37:37 2025 +0530
refactor: Make catalog datasource (#14643)
* First Iteration
* Some more stuff
* cargo fmt
* fixing name catlog-datasource => datasource
* Update datafusion/datasource/README.md
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
Cargo.lock | 46 +++-
Cargo.toml | 3 +-
datafusion/catalog-listing/Cargo.toml | 24 +-
datafusion/catalog-listing/src/helpers.rs | 4 +-
datafusion/catalog-listing/src/mod.rs | 266 ---------------------
datafusion/core/Cargo.toml | 3 +-
datafusion/core/src/datasource/file_format/mod.rs | 4 +-
datafusion/core/src/datasource/listing/mod.rs | 5 +-
.../datasource/physical_plan/file_scan_config.rs | 2 +-
.../src/datasource/physical_plan/file_stream.rs | 6 +-
.../core/src/datasource/physical_plan/mod.rs | 6 +-
.../{catalog-listing => datasource}/Cargo.toml | 8 +-
datafusion/datasource/LICENSE.txt | 1 +
datafusion/datasource/NOTICE.txt | 1 +
datafusion/datasource/README.md | 24 ++
.../src/file_compression_type.rs | 0
.../src/file_groups.rs | 0
.../src/file_meta.rs | 0
.../src/file_scan_config.rs | 0
.../src/file_sink_config.rs | 0
.../src/file_stream.rs | 0
.../{catalog-listing => datasource}/src/mod.rs | 1 -
.../{catalog-listing => datasource}/src/url.rs | 2 +-
.../src/write/demux.rs | 0
.../src/write/mod.rs | 0
.../src/write/orchestration.rs | 0
26 files changed, 82 insertions(+), 324 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 0a7407b503..a03b295dab 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1733,6 +1733,7 @@ dependencies = [
"datafusion-catalog-listing",
"datafusion-common",
"datafusion-common-runtime",
+ "datafusion-datasource",
"datafusion-doc",
"datafusion-execution",
"datafusion-expr",
@@ -1823,32 +1824,20 @@ name = "datafusion-catalog-listing"
version = "45.0.0"
dependencies = [
"arrow",
- "async-compression",
"async-trait",
- "bytes",
- "bzip2 0.5.1",
- "chrono",
"datafusion-catalog",
"datafusion-common",
- "datafusion-common-runtime",
+ "datafusion-datasource",
"datafusion-execution",
"datafusion-expr",
"datafusion-physical-expr",
"datafusion-physical-expr-common",
"datafusion-physical-plan",
- "flate2",
"futures",
- "glob",
- "itertools 0.14.0",
"log",
"object_store",
- "rand 0.8.5",
"tempfile",
"tokio",
- "tokio-util",
- "url",
- "xz2",
- "zstd",
]
[[package]]
@@ -1912,6 +1901,37 @@ dependencies = [
"tokio",
]
+[[package]]
+name = "datafusion-datasource"
+version = "45.0.0"
+dependencies = [
+ "arrow",
+ "async-compression",
+ "async-trait",
+ "bytes",
+ "bzip2 0.5.1",
+ "chrono",
+ "datafusion-catalog",
+ "datafusion-common",
+ "datafusion-common-runtime",
+ "datafusion-execution",
+ "datafusion-expr",
+ "datafusion-physical-plan",
+ "flate2",
+ "futures",
+ "glob",
+ "itertools 0.14.0",
+ "log",
+ "object_store",
+ "rand 0.8.5",
+ "tempfile",
+ "tokio",
+ "tokio-util",
+ "url",
+ "xz2",
+ "zstd",
+]
+
[[package]]
name = "datafusion-doc"
version = "45.0.0"
diff --git a/Cargo.toml b/Cargo.toml
index 1e35b7f420..099e5f2297 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -99,9 +99,10 @@ ctor = "0.2.9"
dashmap = "6.0.1"
datafusion = { path = "datafusion/core", version = "45.0.0", default-features = false }
datafusion-catalog = { path = "datafusion/catalog", version = "45.0.0" }
-datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "45.0.0", default-features = false }
+datafusion-catalog-listing = { path = "datafusion/catalog-listing", version = "45.0.0" }
datafusion-common = { path = "datafusion/common", version = "45.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "45.0.0" }
+datafusion-datasource = { path = "datafusion/datasource", version = "45.0.0", default-features = false }
datafusion-doc = { path = "datafusion/doc", version = "45.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "45.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "45.0.0" }
diff --git a/datafusion/catalog-listing/Cargo.toml
b/datafusion/catalog-listing/Cargo.toml
index 0aa2083ebc..68d0ca3a14 100644
--- a/datafusion/catalog-listing/Cargo.toml
+++ b/datafusion/catalog-listing/Cargo.toml
@@ -27,43 +27,21 @@ repository.workspace = true
rust-version.workspace = true
version.workspace = true
-[features]
-compression = ["async-compression", "xz2", "bzip2", "flate2", "zstd", "tokio-util"]
-default = ["compression"]
-
[dependencies]
arrow = { workspace = true }
-async-compression = { version = "0.4.0", features = [
- "bzip2",
- "gzip",
- "xz",
- "zstd",
- "tokio",
-], optional = true }
async-trait = { workspace = true }
-bytes = { workspace = true }
-bzip2 = { version = "0.5.1", optional = true }
-chrono = { workspace = true }
datafusion-catalog = { workspace = true }
datafusion-common = { workspace = true, features = ["object_store"] }
-datafusion-common-runtime = { workspace = true }
+datafusion-datasource = { workspace = true }
datafusion-execution = { workspace = true }
datafusion-expr = { workspace = true }
datafusion-physical-expr = { workspace = true }
datafusion-physical-expr-common = { workspace = true }
datafusion-physical-plan = { workspace = true }
-flate2 = { version = "1.0.24", optional = true }
futures = { workspace = true }
-glob = "0.3.0"
-itertools = { workspace = true }
log = { workspace = true }
object_store = { workspace = true }
-rand = { workspace = true }
tokio = { workspace = true }
-tokio-util = { version = "0.7.4", features = ["io"], optional = true }
-url = { workspace = true }
-xz2 = { version = "0.1", optional = true, features = ["static"] }
-zstd = { version = "0.13", optional = true, default-features = false }
[dev-dependencies]
tempfile = { workspace = true }
diff --git a/datafusion/catalog-listing/src/helpers.rs
b/datafusion/catalog-listing/src/helpers.rs
index ceacde2494..cf47526353 100644
--- a/datafusion/catalog-listing/src/helpers.rs
+++ b/datafusion/catalog-listing/src/helpers.rs
@@ -20,11 +20,11 @@
use std::mem;
use std::sync::Arc;
-use super::ListingTableUrl;
-use super::PartitionedFile;
use datafusion_catalog::Session;
use datafusion_common::internal_err;
use datafusion_common::{HashMap, Result, ScalarValue};
+use datafusion_datasource::ListingTableUrl;
+use datafusion_datasource::PartitionedFile;
use datafusion_expr::{BinaryExpr, Operator};
use arrow::{
diff --git a/datafusion/catalog-listing/src/mod.rs
b/datafusion/catalog-listing/src/mod.rs
index 9eb79ec07a..b98790e864 100644
--- a/datafusion/catalog-listing/src/mod.rs
+++ b/datafusion/catalog-listing/src/mod.rs
@@ -15,270 +15,4 @@
// specific language governing permissions and limitations
// under the License.
-//! A table that uses the `ObjectStore` listing capability
-//! to get the list of files to process.
-
-pub mod file_compression_type;
-pub mod file_groups;
-pub mod file_meta;
-pub mod file_scan_config;
-pub mod file_sink_config;
-pub mod file_stream;
pub mod helpers;
-pub mod url;
-pub mod write;
-use chrono::TimeZone;
-use datafusion_common::Result;
-use datafusion_common::{ScalarValue, Statistics};
-use futures::Stream;
-use object_store::{path::Path, ObjectMeta};
-use std::pin::Pin;
-use std::sync::Arc;
-
-pub use self::url::ListingTableUrl;
-
-/// Stream of files get listed from object store
-pub type PartitionedFileStream =
- Pin<Box<dyn Stream<Item = Result<PartitionedFile>> + Send + Sync + 'static>>;
-
-/// Only scan a subset of Row Groups from the Parquet file whose data "midpoint"
-/// lies within the [start, end) byte offsets. This option can be used to scan non-overlapping
-/// sections of a Parquet file in parallel.
-#[derive(Debug, Clone, PartialEq, Hash, Eq, PartialOrd, Ord)]
-pub struct FileRange {
- /// Range start
- pub start: i64,
- /// Range end
- pub end: i64,
-}
-
-impl FileRange {
- /// returns true if this file range contains the specified offset
- pub fn contains(&self, offset: i64) -> bool {
- offset >= self.start && offset < self.end
- }
-}
-
-#[derive(Debug, Clone)]
-/// A single file or part of a file that should be read, along with its schema, statistics
-/// and partition column values that need to be appended to each row.
-pub struct PartitionedFile {
- /// Path for the file (e.g. URL, filesystem path, etc)
- pub object_meta: ObjectMeta,
- /// Values of partition columns to be appended to each row.
- ///
- /// These MUST have the same count, order, and type than the [`table_partition_cols`].
- ///
- /// You may use [`wrap_partition_value_in_dict`] to wrap them if you have used [`wrap_partition_type_in_dict`] to wrap the column type.
- ///
- ///
- /// [`wrap_partition_type_in_dict`]: https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/physical_plan/file_scan_config.rs#L55
- /// [`wrap_partition_value_in_dict`]: https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/physical_plan/file_scan_config.rs#L62
- /// [`table_partition_cols`]: https://github.com/apache/datafusion/blob/main/datafusion/core/src/datasource/file_format/options.rs#L190
- pub partition_values: Vec<ScalarValue>,
- /// An optional file range for a more fine-grained parallel execution
- pub range: Option<FileRange>,
- /// Optional statistics that describe the data in this file if known.
- ///
- /// DataFusion relies on these statistics for planning (in particular to sort file groups),
- /// so if they are incorrect, incorrect answers may result.
- pub statistics: Option<Statistics>,
- /// An optional field for user defined per object metadata
- pub extensions: Option<Arc<dyn std::any::Any + Send + Sync>>,
- /// The estimated size of the parquet metadata, in bytes
- pub metadata_size_hint: Option<usize>,
-}
-
-impl PartitionedFile {
- /// Create a simple file without metadata or partition
- pub fn new(path: impl Into<String>, size: u64) -> Self {
- Self {
- object_meta: ObjectMeta {
- location: Path::from(path.into()),
- last_modified: chrono::Utc.timestamp_nanos(0),
- size: size as usize,
- e_tag: None,
- version: None,
- },
- partition_values: vec![],
- range: None,
- statistics: None,
- extensions: None,
- metadata_size_hint: None,
- }
- }
-
- /// Create a file range without metadata or partition
- pub fn new_with_range(path: String, size: u64, start: i64, end: i64) -> Self {
- Self {
- object_meta: ObjectMeta {
- location: Path::from(path),
- last_modified: chrono::Utc.timestamp_nanos(0),
- size: size as usize,
- e_tag: None,
- version: None,
- },
- partition_values: vec![],
- range: Some(FileRange { start, end }),
- statistics: None,
- extensions: None,
- metadata_size_hint: None,
- }
- .with_range(start, end)
- }
-
- /// Provide a hint to the size of the file metadata. If a hint is provided
- /// the reader will try and fetch the last `size_hint` bytes of the parquet file optimistically.
- /// Without an appropriate hint, two read may be required to fetch the metadata.
- pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self {
- self.metadata_size_hint = Some(metadata_size_hint);
- self
- }
-
- /// Return a file reference from the given path
- pub fn from_path(path: String) -> Result<Self> {
- let size = std::fs::metadata(path.clone())?.len();
- Ok(Self::new(path, size))
- }
-
- /// Return the path of this partitioned file
- pub fn path(&self) -> &Path {
- &self.object_meta.location
- }
-
- /// Update the file to only scan the specified range (in bytes)
- pub fn with_range(mut self, start: i64, end: i64) -> Self {
- self.range = Some(FileRange { start, end });
- self
- }
-
- /// Update the user defined extensions for this file.
- ///
- /// This can be used to pass reader specific information.
- pub fn with_extensions(
- mut self,
- extensions: Arc<dyn std::any::Any + Send + Sync>,
- ) -> Self {
- self.extensions = Some(extensions);
- self
- }
-}
-
-impl From<ObjectMeta> for PartitionedFile {
- fn from(object_meta: ObjectMeta) -> Self {
- PartitionedFile {
- object_meta,
- partition_values: vec![],
- range: None,
- statistics: None,
- extensions: None,
- metadata_size_hint: None,
- }
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::ListingTableUrl;
- use datafusion_execution::object_store::{
- DefaultObjectStoreRegistry, ObjectStoreRegistry,
- };
- use object_store::{local::LocalFileSystem, path::Path};
- use std::{ops::Not, sync::Arc};
- use url::Url;
-
- #[test]
- fn test_object_store_listing_url() {
- let listing = ListingTableUrl::parse("file:///").unwrap();
- let store = listing.object_store();
- assert_eq!(store.as_str(), "file:///");
-
- let listing = ListingTableUrl::parse("s3://bucket/").unwrap();
- let store = listing.object_store();
- assert_eq!(store.as_str(), "s3://bucket/");
- }
-
- #[test]
- fn test_get_store_hdfs() {
- let sut = DefaultObjectStoreRegistry::default();
- let url = Url::parse("hdfs://localhost:8020").unwrap();
- sut.register_store(&url, Arc::new(LocalFileSystem::new()));
- let url = ListingTableUrl::parse("hdfs://localhost:8020/key").unwrap();
- sut.get_store(url.as_ref()).unwrap();
- }
-
- #[test]
- fn test_get_store_s3() {
- let sut = DefaultObjectStoreRegistry::default();
- let url = Url::parse("s3://bucket/key").unwrap();
- sut.register_store(&url, Arc::new(LocalFileSystem::new()));
- let url = ListingTableUrl::parse("s3://bucket/key").unwrap();
- sut.get_store(url.as_ref()).unwrap();
- }
-
- #[test]
- fn test_get_store_file() {
- let sut = DefaultObjectStoreRegistry::default();
- let url = ListingTableUrl::parse("file:///bucket/key").unwrap();
- sut.get_store(url.as_ref()).unwrap();
- }
-
- #[test]
- fn test_get_store_local() {
- let sut = DefaultObjectStoreRegistry::default();
- let url = ListingTableUrl::parse("../").unwrap();
- sut.get_store(url.as_ref()).unwrap();
- }
-
- #[test]
- fn test_url_contains() {
- let url = ListingTableUrl::parse("file:///var/data/mytable/").unwrap();
-
- // standard case with default config
- assert!(url.contains(
- &Path::parse("/var/data/mytable/data.parquet").unwrap(),
- true
- ));
-
- // standard case with `ignore_subdirectory` set to false
- assert!(url.contains(
- &Path::parse("/var/data/mytable/data.parquet").unwrap(),
- false
- ));
-
- // as per documentation, when `ignore_subdirectory` is true, we should ignore files that aren't
- // a direct child of the `url`
- assert!(url
- .contains(
- &Path::parse("/var/data/mytable/mysubfolder/data.parquet").unwrap(),
- true
- )
- .not());
-
- // when we set `ignore_subdirectory` to false, we should not ignore the file
- assert!(url.contains(
- &Path::parse("/var/data/mytable/mysubfolder/data.parquet").unwrap(),
- false
- ));
-
- // as above, `ignore_subdirectory` is false, so we include the file
- assert!(url.contains(
- &Path::parse("/var/data/mytable/year=2024/data.parquet").unwrap(),
- false
- ));
-
- // in this case, we include the file even when `ignore_subdirectory` is true because the
- // path segment is a hive partition which doesn't count as a subdirectory for the purposes
- // of `Url::contains`
- assert!(url.contains(
- &Path::parse("/var/data/mytable/year=2024/data.parquet").unwrap(),
- true
- ));
-
- // testing an empty path with default config
- assert!(url.contains(&Path::parse("/var/data/mytable/").unwrap(), true));
-
- // testing an empty path with `ignore_subdirectory` set to false
- assert!(url.contains(&Path::parse("/var/data/mytable/").unwrap(), false));
- }
-}
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index 6492e828e6..ead9c90b7e 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -43,7 +43,7 @@ array_expressions = ["nested_expressions"]
# Used to enable the avro format
avro = ["apache-avro", "num-traits", "datafusion-common/avro"]
backtrace = ["datafusion-common/backtrace"]
-compression = ["xz2", "bzip2", "flate2", "zstd", "datafusion-catalog-listing/compression"]
+compression = ["xz2", "bzip2", "flate2", "zstd", "datafusion-datasource/compression"]
crypto_expressions = ["datafusion-functions/crypto_expressions"]
datetime_expressions = ["datafusion-functions/datetime_expressions"]
default = [
@@ -95,6 +95,7 @@ datafusion-catalog = { workspace = true }
datafusion-catalog-listing = { workspace = true }
datafusion-common = { workspace = true, features = ["object_store"] }
datafusion-common-runtime = { workspace = true }
+datafusion-datasource = { workspace = true }
datafusion-execution = { workspace = true }
datafusion-expr = { workspace = true }
datafusion-functions = { workspace = true }
diff --git a/datafusion/core/src/datasource/file_format/mod.rs
b/datafusion/core/src/datasource/file_format/mod.rs
index dd48a95371..657fe6ca55 100644
--- a/datafusion/core/src/datasource/file_format/mod.rs
+++ b/datafusion/core/src/datasource/file_format/mod.rs
@@ -28,8 +28,8 @@ pub mod json;
pub mod options;
#[cfg(feature = "parquet")]
pub mod parquet;
-pub use datafusion_catalog_listing::file_compression_type;
-pub use datafusion_catalog_listing::write;
+pub use datafusion_datasource::file_compression_type;
+pub use datafusion_datasource::write;
use std::any::Any;
use std::collections::{HashMap, VecDeque};
diff --git a/datafusion/core/src/datasource/listing/mod.rs
b/datafusion/core/src/datasource/listing/mod.rs
index 39323b993d..a58db55bcc 100644
--- a/datafusion/core/src/datasource/listing/mod.rs
+++ b/datafusion/core/src/datasource/listing/mod.rs
@@ -19,5 +19,8 @@
//! to get the list of files to process.
mod table;
-pub use datafusion_catalog_listing::*;
+pub use datafusion_catalog_listing::helpers;
+pub use datafusion_datasource::{
+ FileRange, ListingTableUrl, PartitionedFile, PartitionedFileStream,
+};
pub use table::{ListingOptions, ListingTable, ListingTableConfig};
diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
index 3708fe6abd..e979eb49d0 100644
--- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs
@@ -35,7 +35,7 @@ use datafusion_common::{ColumnStatistics, Constraints, Statistics};
use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, Partitioning};
use crate::datasource::data_source::FileSource;
-pub use datafusion_catalog_listing::file_scan_config::*;
+pub use datafusion_datasource::file_scan_config::*;
use datafusion_execution::{SendableRecordBatchStream, TaskContext};
use datafusion_physical_plan::display::{display_orderings, ProjectSchemaDisplay};
use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
diff --git a/datafusion/core/src/datasource/physical_plan/file_stream.rs
b/datafusion/core/src/datasource/physical_plan/file_stream.rs
index c88d4c4458..7944d6fa90 100644
--- a/datafusion/core/src/datasource/physical_plan/file_stream.rs
+++ b/datafusion/core/src/datasource/physical_plan/file_stream.rs
@@ -37,11 +37,9 @@ use crate::physical_plan::RecordBatchStream;
use arrow::datatypes::SchemaRef;
use arrow::error::ArrowError;
use arrow::record_batch::RecordBatch;
-pub use datafusion_catalog_listing::file_stream::{FileOpenFuture, FileOpener, OnError};
-use datafusion_catalog_listing::file_stream::{
- FileStreamMetrics, FileStreamState, NextOpen,
-};
use datafusion_common::ScalarValue;
+pub use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener, OnError};
+use datafusion_datasource::file_stream::{FileStreamMetrics, FileStreamState, NextOpen};
use futures::{ready, FutureExt, Stream, StreamExt};
diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs
b/datafusion/core/src/datasource/physical_plan/mod.rs
index 18174bd54e..953c99322e 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -50,9 +50,9 @@ pub use avro::AvroSource;
#[allow(deprecated)]
pub use csv::{CsvExec, CsvExecBuilder};
pub use csv::{CsvOpener, CsvSource};
-pub use datafusion_catalog_listing::file_groups::FileGroupPartitioner;
-pub use datafusion_catalog_listing::file_meta::FileMeta;
-pub use datafusion_catalog_listing::file_sink_config::*;
+pub use datafusion_datasource::file_groups::FileGroupPartitioner;
+pub use datafusion_datasource::file_meta::FileMeta;
+pub use datafusion_datasource::file_sink_config::*;
pub use file_scan_config::{
wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig,
};
diff --git a/datafusion/catalog-listing/Cargo.toml
b/datafusion/datasource/Cargo.toml
similarity index 91%
copy from datafusion/catalog-listing/Cargo.toml
copy to datafusion/datasource/Cargo.toml
index 0aa2083ebc..caf1c60a78 100644
--- a/datafusion/catalog-listing/Cargo.toml
+++ b/datafusion/datasource/Cargo.toml
@@ -16,8 +16,8 @@
# under the License.
[package]
-name = "datafusion-catalog-listing"
-description = "datafusion-catalog-listing"
+name = "datafusion-datasource"
+description = "datafusion-datasource"
authors.workspace = true
edition.workspace = true
homepage.workspace = true
@@ -49,8 +49,6 @@ datafusion-common = { workspace = true, features = ["object_store"] }
datafusion-common-runtime = { workspace = true }
datafusion-execution = { workspace = true }
datafusion-expr = { workspace = true }
-datafusion-physical-expr = { workspace = true }
-datafusion-physical-expr-common = { workspace = true }
datafusion-physical-plan = { workspace = true }
flate2 = { version = "1.0.24", optional = true }
futures = { workspace = true }
@@ -72,5 +70,5 @@ tempfile = { workspace = true }
workspace = true
[lib]
-name = "datafusion_catalog_listing"
+name = "datafusion_datasource"
path = "src/mod.rs"
diff --git a/datafusion/datasource/LICENSE.txt
b/datafusion/datasource/LICENSE.txt
new file mode 120000
index 0000000000..1ef648f64b
--- /dev/null
+++ b/datafusion/datasource/LICENSE.txt
@@ -0,0 +1 @@
+../../LICENSE.txt
\ No newline at end of file
diff --git a/datafusion/datasource/NOTICE.txt b/datafusion/datasource/NOTICE.txt
new file mode 120000
index 0000000000..fb051c92b1
--- /dev/null
+++ b/datafusion/datasource/NOTICE.txt
@@ -0,0 +1 @@
+../../NOTICE.txt
\ No newline at end of file
diff --git a/datafusion/datasource/README.md b/datafusion/datasource/README.md
new file mode 100644
index 0000000000..2479a28ae6
--- /dev/null
+++ b/datafusion/datasource/README.md
@@ -0,0 +1,24 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# DataFusion datasource
+
+[DataFusion][df] is an extensible query execution framework, written in Rust, that uses Apache Arrow as its in-memory format.
+
+This crate is a submodule of DataFusion that defines common DataSource related components like FileScanConfig, FileCompression etc.
diff --git a/datafusion/catalog-listing/src/file_compression_type.rs
b/datafusion/datasource/src/file_compression_type.rs
similarity index 100%
rename from datafusion/catalog-listing/src/file_compression_type.rs
rename to datafusion/datasource/src/file_compression_type.rs
diff --git a/datafusion/catalog-listing/src/file_groups.rs
b/datafusion/datasource/src/file_groups.rs
similarity index 100%
rename from datafusion/catalog-listing/src/file_groups.rs
rename to datafusion/datasource/src/file_groups.rs
diff --git a/datafusion/catalog-listing/src/file_meta.rs
b/datafusion/datasource/src/file_meta.rs
similarity index 100%
rename from datafusion/catalog-listing/src/file_meta.rs
rename to datafusion/datasource/src/file_meta.rs
diff --git a/datafusion/catalog-listing/src/file_scan_config.rs
b/datafusion/datasource/src/file_scan_config.rs
similarity index 100%
rename from datafusion/catalog-listing/src/file_scan_config.rs
rename to datafusion/datasource/src/file_scan_config.rs
diff --git a/datafusion/catalog-listing/src/file_sink_config.rs
b/datafusion/datasource/src/file_sink_config.rs
similarity index 100%
rename from datafusion/catalog-listing/src/file_sink_config.rs
rename to datafusion/datasource/src/file_sink_config.rs
diff --git a/datafusion/catalog-listing/src/file_stream.rs
b/datafusion/datasource/src/file_stream.rs
similarity index 100%
rename from datafusion/catalog-listing/src/file_stream.rs
rename to datafusion/datasource/src/file_stream.rs
diff --git a/datafusion/catalog-listing/src/mod.rs
b/datafusion/datasource/src/mod.rs
similarity index 99%
copy from datafusion/catalog-listing/src/mod.rs
copy to datafusion/datasource/src/mod.rs
index 9eb79ec07a..c735c3108b 100644
--- a/datafusion/catalog-listing/src/mod.rs
+++ b/datafusion/datasource/src/mod.rs
@@ -24,7 +24,6 @@ pub mod file_meta;
pub mod file_scan_config;
pub mod file_sink_config;
pub mod file_stream;
-pub mod helpers;
pub mod url;
pub mod write;
use chrono::TimeZone;
diff --git a/datafusion/catalog-listing/src/url.rs
b/datafusion/datasource/src/url.rs
similarity index 99%
rename from datafusion/catalog-listing/src/url.rs
rename to datafusion/datasource/src/url.rs
index 2e6415ba3b..89e73a8a2b 100644
--- a/datafusion/catalog-listing/src/url.rs
+++ b/datafusion/datasource/src/url.rs
@@ -193,7 +193,7 @@ impl ListingTableUrl {
///
/// Examples:
/// ```rust
- /// use datafusion_catalog_listing::ListingTableUrl;
+ /// use datafusion_datasource::ListingTableUrl;
/// let url = ListingTableUrl::parse("file:///foo/bar.csv").unwrap();
/// assert_eq!(url.file_extension(), Some("csv"));
/// let url = ListingTableUrl::parse("file:///foo/bar").unwrap();
diff --git a/datafusion/catalog-listing/src/write/demux.rs
b/datafusion/datasource/src/write/demux.rs
similarity index 100%
rename from datafusion/catalog-listing/src/write/demux.rs
rename to datafusion/datasource/src/write/demux.rs
diff --git a/datafusion/catalog-listing/src/write/mod.rs
b/datafusion/datasource/src/write/mod.rs
similarity index 100%
rename from datafusion/catalog-listing/src/write/mod.rs
rename to datafusion/datasource/src/write/mod.rs
diff --git a/datafusion/catalog-listing/src/write/orchestration.rs
b/datafusion/datasource/src/write/orchestration.rs
similarity index 100%
rename from datafusion/catalog-listing/src/write/orchestration.rs
rename to datafusion/datasource/src/write/orchestration.rs
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]