alamb commented on code in PR #12920: URL: https://github.com/apache/datafusion/pull/12920#discussion_r1811062815
########## datafusion/ffi/README.md: ########## @@ -0,0 +1,81 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +# `datafusion-ffi`: Apache DataFusion Foreign Function Interface + +This crate contains code to allow interoperability of Apache [DataFusion] +with functions from other languages using a stable interface. Review Comment: It may also be good to point out that the ffi interface allows different *versions* of datafusion to interact with each other over stable interfaces in the intro ```suggestion This crate contains code to allow interoperability of Apache [DataFusion] with functions from other languages and/or versions using a stable interface. ``` ########## datafusion/ffi/src/plan_properties.rs: ########## @@ -0,0 +1,330 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ffi::c_void, sync::Arc}; + +use abi_stable::{ + std_types::{ + RResult::{self, RErr, ROk}, + RStr, RVec, + }, + StableAbi, +}; +use arrow::{ + datatypes::{Schema, SchemaRef}, + ffi::FFI_ArrowSchema, +}; +use datafusion::{ + error::{DataFusionError, Result}, + physical_expr::EquivalenceProperties, + physical_plan::{ExecutionMode, PlanProperties}, + prelude::SessionContext, +}; +use datafusion_proto::{ + physical_plan::{ + from_proto::{parse_physical_sort_exprs, parse_protobuf_partitioning}, + to_proto::{serialize_partitioning, serialize_physical_sort_exprs}, + DefaultPhysicalExtensionCodec, + }, + protobuf::{Partitioning, PhysicalSortExprNodeCollection}, +}; +use log::error; +use prost::Message; + +// TODO: should we just make ExecutionMode repr(C)? 
+#[repr(C)] +#[allow(non_camel_case_types)] +#[derive(Clone, StableAbi)] +pub enum FFI_ExecutionMode { + Bounded, + Unbounded, + PipelineBreaking, +} + +impl From<ExecutionMode> for FFI_ExecutionMode { + fn from(value: ExecutionMode) -> Self { + match value { + ExecutionMode::Bounded => FFI_ExecutionMode::Bounded, + ExecutionMode::Unbounded => FFI_ExecutionMode::Unbounded, + ExecutionMode::PipelineBreaking => FFI_ExecutionMode::PipelineBreaking, + } + } +} + +impl From<FFI_ExecutionMode> for ExecutionMode { + fn from(value: FFI_ExecutionMode) -> Self { + match value { + FFI_ExecutionMode::Bounded => ExecutionMode::Bounded, + FFI_ExecutionMode::Unbounded => ExecutionMode::Unbounded, + FFI_ExecutionMode::PipelineBreaking => ExecutionMode::PipelineBreaking, + } + } +} + +#[repr(C)] +#[derive(Debug, StableAbi)] +pub struct WrappedSchema(#[sabi(unsafe_opaque_field)] pub FFI_ArrowSchema); + +impl From<SchemaRef> for WrappedSchema { + fn from(value: SchemaRef) -> Self { + let ffi_schema = match FFI_ArrowSchema::try_from(value.as_ref()) { + Ok(s) => s, + Err(e) => { + error!("Unable to convert DataFusion Schema to FFI_ArrowSchema in FFI_PlanProperties. {}", e); + FFI_ArrowSchema::empty() + } + }; + + WrappedSchema(ffi_schema) + } +} + +impl From<WrappedSchema> for SchemaRef { + fn from(value: WrappedSchema) -> Self { + let schema = match Schema::try_from(&value.0) { + Ok(s) => s, + Err(e) => { + error!("Unable to convert from FFI_ArrowSchema to DataFusion Schema in FFI_PlanProperties. {}", e); + Schema::empty() + } + }; + Arc::new(schema) + } +} + +#[repr(C)] +#[derive(Debug, StableAbi)] +#[allow(missing_docs)] +#[allow(non_camel_case_types)] +pub struct FFI_PlanProperties { Review Comment: Do we need to expose PlanProperties? This seems pretty low level and a large API surface area. 
Unless it is all needed I suggest we don't expose it at first and only do so when needed ########## datafusion/ffi/src/table_provider.rs: ########## @@ -0,0 +1,419 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{any::Any, ffi::c_void, sync::Arc}; + +use abi_stable::{ + std_types::{ROption, RResult, RString, RVec}, + StableAbi, +}; +use arrow::datatypes::SchemaRef; +use async_ffi::{FfiFuture, FutureExt}; +use async_trait::async_trait; +use datafusion::{ + catalog::{Session, TableProvider}, + datasource::TableType, + error::DataFusionError, + execution::session_state::SessionStateBuilder, + logical_expr::TableProviderFilterPushDown, + physical_plan::ExecutionPlan, + prelude::{Expr, SessionContext}, +}; +use datafusion_proto::{ + logical_plan::{ + from_proto::parse_exprs, to_proto::serialize_exprs, DefaultLogicalExtensionCodec, + }, + protobuf::LogicalExprList, +}; +use prost::Message; + +use crate::{ + plan_properties::WrappedSchema, + session_config::ForeignSessionConfig, + table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}, +}; + +use super::{ + execution_plan::{FFI_ExecutionPlan, ForeignExecutionPlan}, + session_config::FFI_SessionConfig, +}; +use datafusion::error::Result; + +/// A 
stable interface for creating a DataFusion TableProvider. +#[repr(C)] +#[derive(Debug, StableAbi)] +#[allow(non_camel_case_types)] +pub struct FFI_TableProvider { + pub schema: unsafe extern "C" fn(provider: &Self) -> WrappedSchema, + pub scan: unsafe extern "C" fn( + provider: &Self, + session_config: &FFI_SessionConfig, + projections: RVec<usize>, + filters_serialized: RVec<u8>, Review Comment: it would probably help to document what these types are (e.g. that these are protobuf-serialized bytes) ########## datafusion/ffi/README.md: ########## @@ -0,0 +1,81 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +# `datafusion-ffi`: Apache DataFusion Foreign Function Interface + +This crate contains code to allow interoperability of Apache [DataFusion] +with functions from other languages using a stable interface. + +See [API Docs] for details and examples. + +We expect this crate may be used by both sides of the FFI. This allows users +to create modules that can interoperate with the necessity of using the same +version of DataFusion. The driving use case has been the `datafusion-python` +repository, but many other use cases may exist. We envision at least two +use cases. + +1. 
`datafusion-python` which will use the FFI to provide external services such + as a `TableProvider` without needing to re-export the entire `datafusion-python` + code base. With `datafusion-ffi` these packages do not need `datafusion-python` + as a dependency at all. +2. Users may want to create a modular interface that allows runtime loading of + libraries. + +## Struct Layout Review Comment: I suggest we move this discussion of code layout / struct naming into the rust code so it stays closer to the code and has less chance to get out of date ########## datafusion/ffi/README.md: ########## @@ -0,0 +1,81 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +# `datafusion-ffi`: Apache DataFusion Foreign Function Interface + +This crate contains code to allow interoperability of Apache [DataFusion] +with functions from other languages using a stable interface. + +See [API Docs] for details and examples. + +We expect this crate may be used by both sides of the FFI. This allows users +to create modules that can interoperate with the necessity of using the same +version of DataFusion. The driving use case has been the `datafusion-python` +repository, but many other use cases may exist. We envision at least two +use cases. + +1. 
`datafusion-python` which will use the FFI to provide external services such + as a `TableProvider` without needing to re-export the entire `datafusion-python` + code base. With `datafusion-ffi` these packages do not need `datafusion-python` + as a dependency at all. +2. Users may want to create a modular interface that allows runtime loading of + libraries. + +## Struct Layout + +In this crate we have a variety of structs which closely mimic the behavior of +their internal counterparts. In the following example, we will refer to the +`TableProvider`, but the same pattern exists for other structs. + +Each of the exposted structs in this crate is provided with a variant prefixed +with `Foreign`. This variant is designed to be used by the consumer of the +foreign code. The `Foreign` structs should _never_ access the `private_data` +fields. Instead they should only access the data returned through the function +calls defined on the `FFI_` structs. The second purpose of the `Foreign` +structs is to contain additional data that may be needed by the traits that +are implemented on them. Some of these traits require borrowing data which +can be far more convienent to be locally stored. + +For example, we have a struct `FFI_TableProvider` to give access to the +`TableProvider` functions like `table_type()` and `scan()`. If we write a +library that wishes to expose it's `TableProvider`, then we can access the +private data that contains the Arc reference to the `TableProvider` via +`FFI_TableProvider`. This data is local to the library. + +If we have a program that accesses a `TableProvider` via FFI, then it +will use `ForeignTableProvider`. When using `ForeignTableProvider` we **must** +not attempt to access the `private_data` field in `FFI_TableProvider`. If a +user is testing locally, you may be able to successfully access this field, but +it will only work if you are building against the exact same version of +`DataFusion` for both libraries **and** the same compiler. 
It will not work +in general. + +It is worth noting that which library is the `local` and which is `foreign` Review Comment: Maybe we should name this "DataFusion C API" or something instead of using the rust FFI term 🤔 ########## datafusion/ffi/README.md: ########## @@ -0,0 +1,81 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +# `datafusion-ffi`: Apache DataFusion Foreign Function Interface + +This crate contains code to allow interoperability of Apache [DataFusion] +with functions from other languages using a stable interface. + +See [API Docs] for details and examples. + +We expect this crate may be used by both sides of the FFI. This allows users +to create modules that can interoperate with the necessity of using the same +version of DataFusion. The driving use case has been the `datafusion-python` Review Comment: I think this text is backwards -- it is "without the necessity". Maybe we could rephrase like ```suggestion We expect this crate may be used by both sides of the FFI. This allows users to create modules that can interoperate using different versions of DataFusion. The driving use case has been the `datafusion-python` ``` -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
