This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new dd3a003c1c Minor: Improve documentation of `MemoryPool` (#6388)
dd3a003c1c is described below

commit dd3a003c1ca4e2109f33277d13f2b0b2fa500337
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon May 22 09:39:34 2023 -0400

    Minor: Improve documentation of `MemoryPool` (#6388)
---
 datafusion/execution/src/memory_pool/mod.rs  | 25 ++++++++++++++++++++++++-
 datafusion/execution/src/memory_pool/pool.rs | 16 +++++++++++++++-
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/datafusion/execution/src/memory_pool/mod.rs b/datafusion/execution/src/memory_pool/mod.rs
index 31117dda4f..d002cda8d8 100644
--- a/datafusion/execution/src/memory_pool/mod.rs
+++ b/datafusion/execution/src/memory_pool/mod.rs
@@ -25,7 +25,30 @@ pub mod proxy;
 
 pub use pool::*;
 
-/// The pool of memory on which [`MemoryReservation`] record their memory reservations
+/// The pool of memory on which [`MemoryReservation`]s record their
+/// memory reservations.
+///
+/// DataFusion is a streaming query engine, processing most queries
+/// without buffering the entire input. However, certain operations,
+/// such as sorting and grouping/joining with a large number of
+/// distinct groups/keys, can require buffering intermediate results,
+/// and for large datasets this can require large amounts of memory.
+///
+/// In order to avoid allocating memory until the OS or the container
+/// system kills the process, DataFusion operators only allocate
+/// memory they are able to reserve from the configured
+/// [`MemoryPool`]. Once the memory tracked by the pool is exhausted,
+/// operators must either free memory by spilling to local disk or
+/// error.
+///
+/// A `MemoryPool` can be shared by concurrently executing plans in
+/// the same process to control memory usage in a multi-tenant system.
+///
+/// The following memory pool implementations are available:
+///
+/// * [`UnboundedMemoryPool`](pool::UnboundedMemoryPool)
+/// * [`GreedyMemoryPool`](pool::GreedyMemoryPool)
+/// * [`FairSpillPool`](pool::FairSpillPool)
 pub trait MemoryPool: Send + Sync + std::fmt::Debug {
     /// Registers a new [`MemoryConsumer`]
     ///
diff --git a/datafusion/execution/src/memory_pool/pool.rs b/datafusion/execution/src/memory_pool/pool.rs
index 7bb9fa4253..7b68a86244 100644
--- a/datafusion/execution/src/memory_pool/pool.rs
+++ b/datafusion/execution/src/memory_pool/pool.rs
@@ -17,6 +17,7 @@
 
 use crate::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation};
 use datafusion_common::{DataFusionError, Result};
+use log::debug;
 use parking_lot::Mutex;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
@@ -45,7 +46,11 @@ impl MemoryPool for UnboundedMemoryPool {
     }
 }
 
-/// A [`MemoryPool`] that implements a greedy first-come first-serve limit
+/// A [`MemoryPool`] that implements a greedy first-come first-serve limit.
+///
+/// This pool works well for queries that do not need to spill or have
+/// a single spillable operator. See [`FairSpillPool`] if there are
+/// multiple spillable operators that all will spill.
 #[derive(Debug)]
 pub struct GreedyMemoryPool {
     pool_size: usize,
@@ -55,6 +60,7 @@ pub struct GreedyMemoryPool {
 impl GreedyMemoryPool {
     /// Allocate up to `limit` bytes
     pub fn new(pool_size: usize) -> Self {
+        debug!("Created new GreedyMemoryPool(pool_size={pool_size})");
         Self {
             pool_size,
             used: AtomicUsize::new(0),
@@ -92,6 +98,13 @@ impl MemoryPool for GreedyMemoryPool {
 /// an even fraction of the available memory sans any unspillable reservations
 /// (i.e. `(pool_size - unspillable_memory) / num_spillable_reservations`)
 ///
+/// This pool works best when you know beforehand the query has
+/// multiple spillable operators that will likely all need to
+/// spill. Sometimes it will cause spills even when there was
+/// sufficient memory (reserved for other operators) to avoid doing
+/// so.
+///
+/// ```text
 ///    ┌───────────────────────z──────────────────────z───────────────┐
 ///    │                       z                      z               │
 ///    │                       z                      z               │
@@ -100,6 +113,7 @@ impl MemoryPool for GreedyMemoryPool {
 ///    │                       z                      z               │
 ///    │                       z                      z               │
 ///    └───────────────────────z──────────────────────z───────────────┘
+/// ```
 ///
 /// Unspillable memory is allocated in a first-come, first-serve fashion
 #[derive(Debug)]

Reply via email to