This is an automated email from the ASF dual-hosted git repository. comphead pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push: new 8e8c116f5c [datafusion-spark] Example of using Spark compatible function library (#16384) 8e8c116f5c is described below commit 8e8c116f5c6359482b429664ca46265111f2c77a Author: Andrew Lamb <and...@nerdnetworks.org> AuthorDate: Sun Jun 15 01:40:13 2025 -0400 [datafusion-spark] Example of using Spark compatible function library (#16384) * [datafusion-spark] Example of using Spark compatible function library * tweak * taplo format --- datafusion/core/src/lib.rs | 10 +++- datafusion/spark/Cargo.toml | 2 +- datafusion/spark/src/lib.rs | 61 +++++++++++++++++++--- docs/source/index.rst | 2 +- .../{ => functions}/adding-udfs.md | 0 docs/source/library-user-guide/functions/index.rst | 25 +++++++++ docs/source/library-user-guide/functions/spark.md | 29 ++++++++++ .../library-user-guide/working-with-exprs.md | 2 +- 8 files changed, 120 insertions(+), 11 deletions(-) diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 6956108e2d..989799b1f8 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -1047,8 +1047,14 @@ doc_comment::doctest!( #[cfg(doctest)] doc_comment::doctest!( - "../../../docs/source/library-user-guide/adding-udfs.md", - library_user_guide_adding_udfs + "../../../docs/source/library-user-guide/functions/adding-udfs.md", + library_user_guide_functions_adding_udfs +); + +#[cfg(doctest)] +doc_comment::doctest!( + "../../../docs/source/library-user-guide/functions/spark.md", + library_user_guide_functions_spark ); #[cfg(doctest)] diff --git a/datafusion/spark/Cargo.toml b/datafusion/spark/Cargo.toml index 1ded8c40aa..2c46cac6b7 100644 --- a/datafusion/spark/Cargo.toml +++ b/datafusion/spark/Cargo.toml @@ -41,6 +41,6 @@ datafusion-catalog = { workspace = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } -datafusion-functions = { workspace = true } +datafusion-functions = { 
workspace = true, features = ["crypto_expressions"] } datafusion-macros = { workspace = true } log = { workspace = true } diff --git a/datafusion/spark/src/lib.rs b/datafusion/spark/src/lib.rs index 1fe5b6ecac..4ce9be1263 100644 --- a/datafusion/spark/src/lib.rs +++ b/datafusion/spark/src/lib.rs @@ -25,19 +25,68 @@ //! Spark Expression packages for [DataFusion]. //! -//! This crate contains a collection of various Spark expression packages for DataFusion, +//! This crate contains a collection of various Spark function packages for DataFusion, //! implemented using the extension API. //! //! [DataFusion]: https://crates.io/crates/datafusion //! -//! # Available Packages +//! +//! # Available Function Packages //! See the list of [modules](#modules) in this crate for available packages. //! -//! # Using A Package -//! You can register all functions in all packages using the [`register_all`] function. +//! # Example: using all function packages +//! +//! You can register all the functions in all packages using the [`register_all`] +//! function as shown below. +//! +//! ``` +//! # use datafusion_execution::FunctionRegistry; +//! # use datafusion_expr::{ScalarUDF, AggregateUDF, WindowUDF}; +//! # use datafusion_expr::planner::ExprPlanner; +//! # use datafusion_common::Result; +//! # use std::collections::HashSet; +//! # use std::sync::Arc; +//! # // Note: We can't use a real SessionContext here because the +//! # // `datafusion_spark` crate has no dependence on the DataFusion crate +//! # // thus use a dummy SessionContext that has enough of the implementation +//! # struct SessionContext {} +//! # impl FunctionRegistry for SessionContext { +//! # fn register_udf(&mut self, _udf: Arc<ScalarUDF>) -> Result<Option<Arc<ScalarUDF>>> { Ok (None) } +//! # fn udfs(&self) -> HashSet<String> { unimplemented!() } +//! # fn udf(&self, _name: &str) -> Result<Arc<ScalarUDF>> { unimplemented!() } +//! # fn udaf(&self, name: &str) -> Result<Arc<AggregateUDF>> {unimplemented!() } +//! 
# fn udwf(&self, name: &str) -> Result<Arc<WindowUDF>> { unimplemented!() } +//! # fn expr_planners(&self) -> Vec<Arc<dyn ExprPlanner>> { unimplemented!() } +//! # } +//! # impl SessionContext { +//! # fn new() -> Self { SessionContext {} } +//! # async fn sql(&mut self, _query: &str) -> Result<()> { Ok(()) } +//! # } +//! # +//! # async fn stub() -> Result<()> { +//! // Create a new session context +//! let mut ctx = SessionContext::new(); +//! // register all spark functions with the context +//! datafusion_spark::register_all(&mut ctx)?; +//! // run a query. Note the `sha2` function is now available which +//! // has Spark semantics +//! let df = ctx.sql("SELECT sha2('The input String', 256)").await?; +//! # Ok(()) +//! # } +//! ``` +//! +//! # Example: calling a specific function in Rust +//! +//! Each package also exports an `expr_fn` submodule that creates [`Expr`]s for +//! invoking functions via Rust using a fluent style. For example, to invoke the +//! `sha2` function, you can use the following code: //! -//! Each package also exports an `expr_fn` submodule to help create [`Expr`]s that invoke -//! functions using a fluent style. For example: +//! ```rust +//! # use datafusion_expr::{col, lit}; +//! use datafusion_spark::expr_fn::sha2; +//! // Create the expression `sha2(my_data, 256)` +//! let expr = sha2(col("my_data"), lit(256)); +//!``` //! 
//![`Expr`]: datafusion_expr::Expr diff --git a/docs/source/index.rst b/docs/source/index.rst index 4b407e4e49..021a426e4c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -133,7 +133,7 @@ To get started, see library-user-guide/using-the-dataframe-api library-user-guide/building-logical-plans library-user-guide/catalogs - library-user-guide/adding-udfs + library-user-guide/functions/index library-user-guide/custom-table-providers library-user-guide/table-constraints library-user-guide/extending-operators diff --git a/docs/source/library-user-guide/adding-udfs.md b/docs/source/library-user-guide/functions/adding-udfs.md similarity index 100% rename from docs/source/library-user-guide/adding-udfs.md rename to docs/source/library-user-guide/functions/adding-udfs.md diff --git a/docs/source/library-user-guide/functions/index.rst b/docs/source/library-user-guide/functions/index.rst new file mode 100644 index 0000000000..d6127446c2 --- /dev/null +++ b/docs/source/library-user-guide/functions/index.rst @@ -0,0 +1,25 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Functions +============= + +.. 
toctree:: + :maxdepth: 2 + + adding-udfs + spark diff --git a/docs/source/library-user-guide/functions/spark.md b/docs/source/library-user-guide/functions/spark.md new file mode 100644 index 0000000000..c371ae1cb5 --- /dev/null +++ b/docs/source/library-user-guide/functions/spark.md @@ -0,0 +1,29 @@ +<!--- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +# Spark Compatible Functions + +The [`datafusion-spark`] crate provides Apache Spark-compatible expressions for +use with DataFusion. + +[`datafusion-spark`]: https://crates.io/crates/datafusion-spark + +Please see the documentation for the [`datafusion-spark` crate] for more details. + +[`datafusion-spark` crate]: https://docs.rs/datafusion-spark/latest/datafusion_spark/ diff --git a/docs/source/library-user-guide/working-with-exprs.md b/docs/source/library-user-guide/working-with-exprs.md index df4e5e3940..ce3d42cd13 100644 --- a/docs/source/library-user-guide/working-with-exprs.md +++ b/docs/source/library-user-guide/working-with-exprs.md @@ -75,7 +75,7 @@ Please see [expr_api.rs](https://github.com/apache/datafusion/blob/main/datafusi ## A Scalar UDF Example -We'll use a `ScalarUDF` expression as our example. 
This necessitates implementing an actual UDF, and for ease we'll use the same example from the [adding UDFs](./adding-udfs.md) guide. +We'll use a `ScalarUDF` expression as our example. This necessitates implementing an actual UDF, and for ease we'll use the same example from the [adding UDFs](functions/adding-udfs.md) guide. So assuming you've written that function, you can use it to create an `Expr`: --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org