paleolimbot commented on code in PR #251: URL: https://github.com/apache/sedona-db/pull/251#discussion_r2499614666
########## rust/sedona-datasource/src/spec.rs: ########## @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{collections::HashMap, fmt::Debug, sync::Arc}; + +use arrow_array::RecordBatchReader; +use arrow_schema::{Schema, SchemaRef}; +use async_trait::async_trait; + +use datafusion::{config::TableOptions, datasource::listing::FileRange}; +use datafusion_common::{Result, Statistics}; +use datafusion_execution::object_store::ObjectStoreUrl; +use datafusion_physical_expr::PhysicalExpr; +use object_store::{ObjectMeta, ObjectStore}; + +/// Simple file format specification +/// +/// In DataFusion, various parts of the file format are split among the +/// FileFormatFactory, the FileFormat, the FileSource, the FileOpener, +/// and a few other traits. This trait is designed to provide a few +/// important features of a natively implemented FileFormat while consolidating +/// the components of implementing the format in the same place. This is +/// intended to provide a less verbose way to implement readers for a wide +/// variety of spatial formats. 
+#[async_trait] +pub trait ExternalFormatSpec: Debug + Send + Sync { + /// Infer a schema for a given file + /// + /// Given a single file, infer what schema [ExternalFormatSpec::open_reader] + /// would produce in the absence of any other guidance. + async fn infer_schema(&self, location: &Object) -> Result<Schema>; + + /// Open a [RecordBatchReader] for a given file + /// + /// The implementation must handle the `file_projection`; however, it + /// need not handle the `filters` (but may use them for pruning). + async fn open_reader(&self, args: &OpenReaderArgs) + -> Result<Box<dyn RecordBatchReader + Send>>; Review Comment: This is the user-facing API that would be used to implement readers ########## rust/sedona-datasource/src/provider.rs: ########## @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::sync::Arc; + +use arrow_schema::SchemaRef; +use async_trait::async_trait; +use datafusion::{ + config::TableOptions, + datasource::listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl}, + execution::{options::ReadOptions, SessionState}, + prelude::{SessionConfig, SessionContext}, +}; +use datafusion_common::{exec_err, Result}; + +use crate::{format::ExternalFileFormat, spec::ExternalFormatSpec}; + +/// Create a [ListingTable] from an [ExternalFormatSpec] and one or more URLs +/// +/// This can be used to resolve a format specification into a TableProvider that +/// may be registered with a [SessionContext]. +pub async fn external_listing_table( + spec: Arc<dyn ExternalFormatSpec>, + context: &SessionContext, + table_paths: Vec<ListingTableUrl>, + check_extension: bool, +) -> Result<ListingTable> { Review Comment: This is the wrapper we'd use to implement `read_xxxx()` in Python ########## rust/sedona-datasource/src/format.rs: ########## @@ -0,0 +1,717 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::{any::Any, collections::HashMap, fmt::Debug, sync::Arc}; + +use arrow_schema::{Schema, SchemaRef}; +use async_trait::async_trait; +use datafusion::{ + config::ConfigOptions, + datasource::{ + file_format::{file_compression_type::FileCompressionType, FileFormat, FileFormatFactory}, + listing::PartitionedFile, + physical_plan::{ + FileGroupPartitioner, FileMeta, FileOpenFuture, FileOpener, FileScanConfig, + FileSinkConfig, FileSource, + }, + }, +}; +use datafusion_catalog::{memory::DataSourceExec, Session}; +use datafusion_common::{not_impl_err, DataFusionError, GetExt, Result, Statistics}; +use datafusion_physical_expr::{LexOrdering, LexRequirement, PhysicalExpr}; +use datafusion_physical_plan::{ + filter_pushdown::{FilterPushdownPropagation, PushedDown}, + metrics::ExecutionPlanMetricsSet, + ExecutionPlan, +}; +use futures::{StreamExt, TryStreamExt}; +use object_store::{ObjectMeta, ObjectStore}; +use sedona_common::sedona_internal_err; + +use crate::spec::{ExternalFormatSpec, Object, OpenReaderArgs, SupportsRepartition}; + +/// Create a [FileFormatFactory] from an [ExternalFormatSpec] +/// +/// The FileFormatFactory is the object that may be registered with a +/// SessionStateBuilder to allow SQL queries to access this format. +#[derive(Debug)] +pub struct ExternalFormatFactory { + spec: Arc<dyn ExternalFormatSpec>, +} Review Comment: This is what we'd use to register a format with the session so that things like `SELECT * FROM 'gpkg/*.gpkg'` work in SQL. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
