This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new dd3a003c1c Minor: Improve documentation of `MemoryPool` (#6388)
dd3a003c1c is described below
commit dd3a003c1ca4e2109f33277d13f2b0b2fa500337
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon May 22 09:39:34 2023 -0400
Minor: Improve documentation of `MemoryPool` (#6388)
---
datafusion/execution/src/memory_pool/mod.rs | 25 ++++++++++++++++++++++++-
datafusion/execution/src/memory_pool/pool.rs | 16 +++++++++++++++-
2 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/datafusion/execution/src/memory_pool/mod.rs b/datafusion/execution/src/memory_pool/mod.rs
index 31117dda4f..d002cda8d8 100644
--- a/datafusion/execution/src/memory_pool/mod.rs
+++ b/datafusion/execution/src/memory_pool/mod.rs
@@ -25,7 +25,30 @@ pub mod proxy;
pub use pool::*;
-/// The pool of memory on which [`MemoryReservation`] record their memory reservations
+/// The pool of memory on which [`MemoryReservation`]s record their
+/// memory reservations.
+///
+/// DataFusion is a streaming query engine, processing most queries
+/// without buffering the entire input. However, certain operations
+/// such as sorting and grouping/joining with a large number of
+/// distinct groups/keys, can require buffering intermediate results
+/// and for large datasets this can require large amounts of memory.
+///
+/// In order to avoid allocating memory until the OS or the container
+/// system kills the process, DataFusion operators only allocate
+/// memory they are able to reserve from the configured
+/// [`MemoryPool`]. Once the memory tracked by the pool is exhausted,
+/// operators must either free memory by spilling to local disk or
+/// error.
+///
+/// A `MemoryPool` can be shared by concurrently executing plans in
+/// the same process to control memory usage in a multi-tenant system.
+///
+/// The following memory pool implementations are available:
+///
+/// * [`UnboundedMemoryPool`](pool::UnboundedMemoryPool)
+/// * [`GreedyMemoryPool`](pool::GreedyMemoryPool)
+/// * [`FairSpillPool`](pool::FairSpillPool)
pub trait MemoryPool: Send + Sync + std::fmt::Debug {
/// Registers a new [`MemoryConsumer`]
///
diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs
index 7bb9fa4253..7b68a86244 100644
--- a/datafusion/execution/src/memory_pool/pool.rs
+++ b/datafusion/execution/src/memory_pool/pool.rs
@@ -17,6 +17,7 @@
use crate::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation};
use datafusion_common::{DataFusionError, Result};
+use log::debug;
use parking_lot::Mutex;
use std::sync::atomic::{AtomicUsize, Ordering};
@@ -45,7 +46,11 @@ impl MemoryPool for UnboundedMemoryPool {
}
}
-/// A [`MemoryPool`] that implements a greedy first-come first-serve limit
+/// A [`MemoryPool`] that implements a greedy first-come first-serve limit.
+///
+/// This pool works well for queries that do not need to spill or have
+/// a single spillable operator. See [`FairSpillPool`] if there are
+/// multiple spillable operators that all will spill.
#[derive(Debug)]
pub struct GreedyMemoryPool {
pool_size: usize,
@@ -55,6 +60,7 @@ pub struct GreedyMemoryPool {
impl GreedyMemoryPool {
/// Allocate up to `pool_size` bytes
pub fn new(pool_size: usize) -> Self {
+ debug!("Created new GreedyMemoryPool(pool_size={pool_size})");
Self {
pool_size,
used: AtomicUsize::new(0),
@@ -92,6 +98,13 @@ impl MemoryPool for GreedyMemoryPool {
/// an even fraction of the available memory sans any unspillable reservations
/// (i.e. `(pool_size - unspillable_memory) / num_spillable_reservations`)
///
+/// This pool works best when you know beforehand the query has
+/// multiple spillable operators that will likely all need to
+/// spill. Sometimes it will cause spills even when there is
+/// sufficient memory (reserved for other operators) to avoid doing
+/// so.
+///
+/// ```text
/// ┌───────────────────────z──────────────────────z───────────────┐
/// │ z z │
/// │ z z │
@@ -100,6 +113,7 @@ impl MemoryPool for GreedyMemoryPool {
/// │ z z │
/// │ z z │
/// └───────────────────────z──────────────────────z───────────────┘
+/// ```
///
/// Unspillable memory is allocated in a first-come, first-serve fashion
#[derive(Debug)]