alamb commented on a change in pull request #7972:
URL: https://github.com/apache/arrow/pull/7972#discussion_r471094243
########## File path: rust/datafusion/src/dataframe.rs ##########

```diff
@@ -15,42 +15,46 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Table API for building a logical query plan. This is similar to the Table API in Ibis
-//! and the DataFrame API in Apache Spark
+//! DataFrame API for building and executing query plans.
 
 use crate::arrow::record_batch::RecordBatch;
 use crate::error::Result;
-use crate::execution::context::ExecutionContext;
 use crate::logicalplan::{Expr, LogicalPlan};
 use arrow::datatypes::Schema;
 use std::sync::Arc;
 
-/// Table is an abstraction of a logical query plan
-pub trait Table {
+/// DataFrame is an abstraction of a logical query plan
+pub trait DataFrame {
     /// Select columns by name
-    fn select_columns(&self, columns: Vec<&str>) -> Result<Arc<dyn Table>>;
+    fn select_columns(&self, columns: Vec<&str>) -> Result<Arc<dyn DataFrame>>;
 
     /// Create a projection based on arbitrary expressions
-    fn select(&self, expr: Vec<Expr>) -> Result<Arc<dyn Table>>;
+    fn select(&self, expr: Vec<Expr>) -> Result<Arc<dyn DataFrame>>;
 
     /// Create a selection based on a filter expression
-    fn filter(&self, expr: Expr) -> Result<Arc<dyn Table>>;
+    fn filter(&self, expr: Expr) -> Result<Arc<dyn DataFrame>>;
 
     /// Perform an aggregate query
     fn aggregate(
         &self,
         group_expr: Vec<Expr>,
         aggr_expr: Vec<Expr>,
-    ) -> Result<Arc<dyn Table>>;
+    ) -> Result<Arc<dyn DataFrame>>;
 
     /// limit the number of rows
-    fn limit(&self, n: usize) -> Result<Arc<dyn Table>>;
+    fn limit(&self, n: usize) -> Result<Arc<dyn DataFrame>>;
 
     /// Return the logical plan
     fn to_logical_plan(&self) -> LogicalPlan;
 
-    /// Return an expression representing a column within this table
-    fn col(&self, name: &str) -> Result<Expr>;
+    /// Collects the result as a vector of RecordBatch.
+    fn collect(&self, batch_size: usize) -> Result<Vec<RecordBatch>>;
+
+    /// Returns the schema
```

Review comment:
```suggestion
    /// Returns the schema (names and types of columns) in this DataFrame
```

########## File path: rust/datafusion/src/execution/context.rs ##########

```diff
@@ -89,12 +91,18 @@ impl ExecutionConfig {
 }
 
 /// Execution context for registering data sources and executing queries
-pub struct ExecutionContext {
-    datasources: HashMap<String, Box<dyn TableProvider + Send + Sync>>,
-    scalar_functions: HashMap<String, Box<ScalarFunction>>,
+#[derive(Clone)]
+pub struct ExecutionContextState {
+    datasources: Rc<RefCell<HashMap<String, Box<dyn TableProvider + Send + Sync>>>>,
```

Review comment:
Can you explain the rationale for using `RefCell` here? It seems to me that `Arc<HashMap<...>>` is easier to reason about. I can't think of a multi-threaded scenario in which you would want one thread to borrow mutably -- it seems like an `ExecutionContextState` would be created and configured by one thread, and then be read-only once shared.
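For illustration, here is a minimal, self-contained sketch of the pattern the comment describes: the map is built and configured by one thread, then shared read-only behind an `Arc`, with no `RefCell` and no runtime borrow checks involved. The `State` struct and the `String` value type are stand-ins invented for this example, not DataFusion's actual types:

```rust
use std::collections::HashMap;
use std::sync::Arc;
use std::thread;

// Toy stand-in for ExecutionContextState: configured once, then shared read-only.
struct State {
    datasources: Arc<HashMap<String, String>>,
}

fn main() {
    // One thread builds and configures the state...
    let mut datasources = HashMap::new();
    datasources.insert("t".to_string(), "memory table".to_string());
    let state = State {
        datasources: Arc::new(datasources),
    };

    // ...and afterwards clones of the Arc are handed to other threads,
    // which only ever read from the shared map.
    let handles: Vec<_> = (0..2)
        .map(|_| {
            let shared = Arc::clone(&state.datasources);
            thread::spawn(move || {
                assert!(shared.contains_key("t"));
            })
        })
        .collect();

    for h in handles {
        h.join().unwrap();
    }
}
```

A side benefit of this design: `Arc<HashMap<...>>` is `Send + Sync` when its contents are, so it compiles for multi-threaded use, whereas `Rc<RefCell<...>>` is neither.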
########## File path: rust/datafusion/src/dataframe.rs ##########

```diff
@@ -15,42 +15,46 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Table API for building a logical query plan. This is similar to the Table API in Ibis
-//! and the DataFrame API in Apache Spark
+//! DataFrame API for building and executing query plans.
 
 use crate::arrow::record_batch::RecordBatch;
 use crate::error::Result;
-use crate::execution::context::ExecutionContext;
 use crate::logicalplan::{Expr, LogicalPlan};
 use arrow::datatypes::Schema;
 use std::sync::Arc;
 
-/// Table is an abstraction of a logical query plan
-pub trait Table {
+/// DataFrame is an abstraction of a logical query plan
```

Review comment:
```suggestion
/// DataFrame represents a logical set of rows with the same named columns.
/// Similar to a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) or [Spark DataFrame](https://spark.apache.org/docs/latest/sql-programming-guide.html)
```

########## File path: rust/datafusion/src/execution/context.rs ##########

```diff
@@ -17,9 +17,11 @@
 //! ExecutionContext contains methods for registering data sources and executing queries
 
+use std::cell::RefCell;
 use std::collections::{HashMap, HashSet};
 use std::fs;
 use std::path::Path;
+use std::rc::Rc;
```

Review comment:
As mentioned by @jorgecarleitao -- I suggest using `Arc` in this file unless there is a good reason to use `Rc`.

########## File path: rust/datafusion/examples/memory_table_api.rs ##########

```diff
@@ -51,15 +51,15 @@ fn main() -> Result<()> {
     // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
     let provider = MemTable::new(schema, vec![vec![batch]])?;
     ctx.register_table("t", Box::new(provider));
-    let t = ctx.table("t")?;
+    let df = ctx.table("t")?;
```

Review comment:
FWIW, I think `table` is still reasonable from a DataFusion point of view, as the data forms a logical table in the context of a query while being backed by a DataFrame. Changing to `register_dataframe` would be fine too, in my opinion; I could go either way.
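To make the renaming concrete, here is a rough sketch of how the example's DataFrame handle might read end to end, written against only the trait methods shown in the hunks above. The `run` helper, the column names `"a"`/`"b"`, and the `4096` batch size are invented for illustration, and the exact `ExecutionContext` signatures may differ from the final PR:

```rust
use datafusion::error::Result;
use datafusion::execution::context::ExecutionContext;

fn run(mut ctx: ExecutionContext) -> Result<()> {
    // `table` returns a DataFrame wrapping a logical plan over "t"
    // (registered earlier via register_table, as in the example above).
    let df = ctx.table("t")?;

    // Build up the plan with the trait's builder-style methods...
    let df = df.select_columns(vec!["a", "b"])?.limit(10)?;

    // ...then execute it, collecting the results as RecordBatches.
    let batches = df.collect(4096)?;
    println!("got {} record batches", batches.len());
    Ok(())
}
```

Since every builder method returns `Result<Arc<dyn DataFrame>>`, the calls chain naturally with `?`, which is the main ergonomic win of the trait rename over the old `Table` API.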