This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new f3e0fa2861 Consolidate Examples: memtable.rs and
parquet_multiple_files.rs (#13913)
f3e0fa2861 is described below
commit f3e0fa286120315f75782a7edc3b1fd82f252f83
Author: Andrew Lamb <[email protected]>
AuthorDate: Sat Dec 28 09:44:07 2024 -0500
Consolidate Examples: memtable.rs and parquet_multiple_files.rs (#13913)
---
datafusion-examples/README.md | 3 +-
datafusion-examples/examples/memtable.rs | 74 -----------------
...{parquet_sql_multiple_files.rs => sql_query.rs} | 94 +++++++++++++++++++---
3 files changed, 83 insertions(+), 88 deletions(-)
diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md
index b06148ce26..3ec008a602 100644
--- a/datafusion-examples/README.md
+++ b/datafusion-examples/README.md
@@ -64,10 +64,8 @@ cargo run --example dataframe
- [`flight_sql_server.rs`](examples/flight/flight_sql_server.rs): Run
DataFusion as a standalone process and execute SQL queries from JDBC clients
- [`function_factory.rs`](examples/function_factory.rs): Register `CREATE
FUNCTION` handler to implement SQL macros
- [`make_date.rs`](examples/make_date.rs): Examples of using the make_date
function
-- [`memtable.rs`](examples/memtable.rs): Create an query data in memory using
SQL and `RecordBatch`es
- [`optimizer_rule.rs`](examples/optimizer_rule.rs): Use a custom
OptimizerRule to replace certain predicates
- [`parquet_index.rs`](examples/parquet_index.rs): Create an secondary index
over several parquet files and use it to speed up queries
-- [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs):
Build and run a query plan from a SQL statement against multiple local Parquet
files
- [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract
statistics by visiting an ExecutionPlan after execution
- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into
DataFusion `Expr`.
- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion
`Expr` and `LogicalPlan`
@@ -83,6 +81,7 @@ cargo run --example dataframe
- [`sql_analysis.rs`](examples/sql_analysis.rs): Analyse SQL queries with
DataFusion structures
- [`sql_frontend.rs`](examples/sql_frontend.rs): Create LogicalPlans (only)
from sql strings
- [`sql_dialect.rs`](examples/sql_dialect.rs): Example of implementing a
custom SQL dialect on top of `DFParser`
+- [`sql_query.rs`](examples/sql_query.rs): Query data using SQL (in memory
`RecordBatch`es, local Parquet files)
- [`to_char.rs`](examples/to_char.rs): Examples of using the to_char function
- [`to_timestamp.rs`](examples/to_timestamp.rs): Examples of using
to_timestamp functions
diff --git a/datafusion-examples/examples/memtable.rs
b/datafusion-examples/examples/memtable.rs
deleted file mode 100644
index bb0b720eff..0000000000
--- a/datafusion-examples/examples/memtable.rs
+++ /dev/null
@@ -1,74 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use datafusion::arrow::array::{UInt64Array, UInt8Array};
-use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
-use datafusion::arrow::record_batch::RecordBatch;
-use datafusion::datasource::MemTable;
-use datafusion::error::Result;
-use datafusion::prelude::SessionContext;
-use std::sync::Arc;
-use std::time::Duration;
-use tokio::time::timeout;
-
-/// This example demonstrates executing a simple query against a [`MemTable`]
-#[tokio::main]
-async fn main() -> Result<()> {
- let mem_table = create_memtable()?;
-
- // create local execution context
- let ctx = SessionContext::new();
-
- // Register the in-memory table containing the data
- ctx.register_table("users", Arc::new(mem_table))?;
-
- let dataframe = ctx.sql("SELECT * FROM users;").await?;
-
- timeout(Duration::from_secs(10), async move {
- let result = dataframe.collect().await.unwrap();
- let record_batch = result.first().unwrap();
-
- assert_eq!(1, record_batch.column(0).len());
- dbg!(record_batch.columns());
- })
- .await
- .unwrap();
-
- Ok(())
-}
-
-fn create_memtable() -> Result<MemTable> {
- MemTable::try_new(get_schema(), vec![vec![create_record_batch()?]])
-}
-
-fn create_record_batch() -> Result<RecordBatch> {
- let id_array = UInt8Array::from(vec![1]);
- let account_array = UInt64Array::from(vec![9000]);
-
- Ok(RecordBatch::try_new(
- get_schema(),
- vec![Arc::new(id_array), Arc::new(account_array)],
- )
- .unwrap())
-}
-
-fn get_schema() -> SchemaRef {
- SchemaRef::new(Schema::new(vec![
- Field::new("id", DataType::UInt8, false),
- Field::new("bank_account", DataType::UInt64, true),
- ]))
-}
diff --git a/datafusion-examples/examples/parquet_sql_multiple_files.rs
b/datafusion-examples/examples/sql_query.rs
similarity index 54%
rename from datafusion-examples/examples/parquet_sql_multiple_files.rs
rename to datafusion-examples/examples/sql_query.rs
index b0d3922a32..f6d3936568 100644
--- a/datafusion-examples/examples/parquet_sql_multiple_files.rs
+++ b/datafusion-examples/examples/sql_query.rs
@@ -15,21 +15,90 @@
// specific language governing permissions and limitations
// under the License.
+use datafusion::arrow::array::{UInt64Array, UInt8Array};
+use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::arrow::record_batch::RecordBatch;
+use datafusion::datasource::file_format::parquet::ParquetFormat;
+use datafusion::datasource::listing::ListingOptions;
+use datafusion::datasource::MemTable;
+use datafusion::error::{DataFusionError, Result};
+use datafusion::prelude::SessionContext;
+use datafusion_common::exec_datafusion_err;
+use object_store::local::LocalFileSystem;
use std::path::Path;
use std::sync::Arc;
+use std::time::Duration;
+use tokio::time::timeout;
+
+/// Examples of various ways to execute queries using SQL
+///
+/// [`query_memtable`]: a simple query against a [`MemTable`]
+/// [`query_parquet`]: a simple query against a directory with multiple
Parquet files
+///
+#[tokio::main]
+async fn main() -> Result<()> {
+ query_memtable().await?;
+ query_parquet().await?;
+ Ok(())
+}
-use datafusion::datasource::file_format::parquet::ParquetFormat;
-use datafusion::datasource::listing::ListingOptions;
-use datafusion::prelude::*;
+/// Run a simple query against a [`MemTable`]
+pub async fn query_memtable() -> Result<()> {
+ let mem_table = create_memtable()?;
-use object_store::local::LocalFileSystem;
+ // create local execution context
+ let ctx = SessionContext::new();
-/// This example demonstrates executing a simple query against an Arrow data
source (a directory
-/// with multiple Parquet files) and fetching results. The query is run twice,
once showing
-/// how to used `register_listing_table` with an absolute path, and once
registering an
-/// ObjectStore to use a relative path.
-#[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
+ // Register the in-memory table containing the data
+ ctx.register_table("users", Arc::new(mem_table))?;
+
+ let dataframe = ctx.sql("SELECT * FROM users;").await?;
+
+ timeout(Duration::from_secs(10), async move {
+ let result = dataframe.collect().await.unwrap();
+ let record_batch = result.first().unwrap();
+
+ assert_eq!(1, record_batch.column(0).len());
+ dbg!(record_batch.columns());
+ })
+ .await
+ .unwrap();
+
+ Ok(())
+}
+
+fn create_memtable() -> Result<MemTable> {
+ MemTable::try_new(get_schema(), vec![vec![create_record_batch()?]])
+}
+
+fn create_record_batch() -> Result<RecordBatch> {
+ let id_array = UInt8Array::from(vec![1]);
+ let account_array = UInt64Array::from(vec![9000]);
+
+ Ok(RecordBatch::try_new(
+ get_schema(),
+ vec![Arc::new(id_array), Arc::new(account_array)],
+ )
+ .unwrap())
+}
+
+fn get_schema() -> SchemaRef {
+ SchemaRef::new(Schema::new(vec![
+ Field::new("id", DataType::UInt8, false),
+ Field::new("bank_account", DataType::UInt64, true),
+ ]))
+}
+
+/// The simplest way to query parquet files is to use the
+/// [`SessionContext::read_parquet`] API
+///
+/// For more control, you can use the lower level [`ListingOptions`] and
+/// [`ListingTable`] APIs
+///
+/// This example shows how to use relative and absolute paths.
+///
+/// [`ListingTable`]: datafusion::datasource::listing::ListingTable
+async fn query_parquet() -> Result<()> {
// create local execution context
let ctx = SessionContext::new();
@@ -73,13 +142,14 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let test_data_path = Path::new(&test_data);
let test_data_path_parent = test_data_path
.parent()
- .ok_or("test_data path needs a parent")?;
+ .ok_or(exec_datafusion_err!("test_data path needs a parent"))?;
std::env::set_current_dir(test_data_path_parent)?;
let local_fs = Arc::new(LocalFileSystem::default());
- let u = url::Url::parse("file://./")?;
+ let u = url::Url::parse("file://./")
+ .map_err(|e| DataFusionError::External(Box::new(e)))?;
ctx.register_object_store(&u, local_fs);
// Register a listing table - this will use all files in the directory as
data sources
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]