tustvold commented on code in PR #2226: URL: https://github.com/apache/arrow-datafusion/pull/2226#discussion_r851361531
########## datafusion/scheduler/src/pipeline/mod.rs: ########## @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::task::{Context, Poll}; + +use arrow::record_batch::RecordBatch; + +use crate::ArrowResult; + +pub mod execution; +pub mod repartition; + +/// A push-based interface used by the scheduler to drive query execution +/// +/// A pipeline processes data from one or more input partitions, producing output +/// to one or more output partitions. As a [`Pipeline`] may draw on input from +/// more than one upstream [`Pipeline`], input partitions are identified by both +/// a child index, and a partition index, whereas output partitions are only +/// identified by a partition index. +/// +/// This is not intended as an eventual replacement for the physical plan representation +/// within DataFusion, [`ExecutionPlan`], but rather a generic interface that +/// parts of the physical plan are "compiled" into by the scheduler. 
+/// +/// # Push vs Pull Execution +/// +/// Whilst the interface exposed to the scheduler is push-based, in which member functions +/// computation is performed is intentionally left as an implementation detail of the [`Pipeline`] +/// +/// This allows flexibility to support the following different patterns, and potentially more: +/// +/// An eager, push-based pipeline, that processes a batch synchronously in [`Pipeline::push`] +/// and immediately wakes the corresponding output partition. +/// +/// A parallel, push-based pipeline, that enqueues the processing of a batch to the rayon +/// thread pool in [`Pipeline::push`], and wakes the corresponding output partition when +/// the job completes. Order and non-order preserving variants are possible +/// +/// A merge pipeline which combines data from one or more input partitions into one or +/// more output partitions. [`Pipeline::push`] adds data to an input buffer, and wakes +/// any output partitions that may now be able to make progress. This may be none +/// if the operator is waiting on data from a different input partition +/// +/// An aggregation pipeline which combines data from one or more input partitions into +/// a single output partition. [`Pipeline::push`] would eagerly update the computed +/// aggregates, and the final [`Pipeline::close`] triggers flushing these to the output Review Comment: Good shout, will include. Whilst enumerating the full list of possibilities is not the intent, this can act as a wishlist of sorts 😂 ########## datafusion/scheduler/src/pipeline/mod.rs: ########## @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::task::{Context, Poll}; + +use arrow::record_batch::RecordBatch; + +use crate::ArrowResult; + +pub mod execution; +pub mod repartition; + +/// A push-based interface used by the scheduler to drive query execution +/// +/// A pipeline processes data from one or more input partitions, producing output +/// to one or more output partitions. As a [`Pipeline`] may draw on input from +/// more than one upstream [`Pipeline`], input partitions are identified by both +/// a child index, and a partition index, whereas output partitions are only +/// identified by a partition index. +/// +/// This is not intended as an eventual replacement for the physical plan representation +/// within DataFusion, [`ExecutionPlan`], but rather a generic interface that +/// parts of the physical plan are "compiled" into by the scheduler. +/// +/// # Push vs Pull Execution +/// +/// Whilst the interface exposed to the scheduler is push-based, in which member functions +/// computation is performed is intentionally left as an implementation detail of the [`Pipeline`] +/// +/// This allows flexibility to support the following different patterns, and potentially more: +/// +/// An eager, push-based pipeline, that processes a batch synchronously in [`Pipeline::push`] +/// and immediately wakes the corresponding output partition. +/// +/// A parallel, push-based pipeline, that enqueues the processing of a batch to the rayon +/// thread pool in [`Pipeline::push`], and wakes the corresponding output partition when +/// the job completes. 
Order and non-order preserving variants are possible +/// +/// A merge pipeline which combines data from one or more input partitions into one or +/// more output partitions. [`Pipeline::push`] adds data to an input buffer, and wakes +/// any output partitions that may now be able to make progress. This may be none +/// if the operator is waiting on data from a different input partition +/// +/// An aggregation pipeline which combines data from one or more input partitions into +/// a single output partition. [`Pipeline::push`] would eagerly update the computed +/// aggregates, and the final [`Pipeline::close`] triggers flushing these to the output Review Comment: Good shout, will include. Whilst enumerating the full list of possibilities is not the intent, this can act as a wishlist of sorts 😂 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
