From db5104d0aadc09510857eaec4c2fc98afa2b73a5 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 5 Dec 2022 18:29:40 +0000 Subject: [PATCH 01/11] Simplify MemoryManager --- datafusion/core/src/execution/context.rs | 7 +- .../core/src/execution/memory_manager/mod.rs | 617 ++++-------------- .../src/execution/memory_manager/proxy.rs | 87 --- datafusion/core/src/execution/mod.rs | 4 +- datafusion/core/src/execution/runtime_env.rs | 27 +- .../core/src/physical_plan/aggregates/hash.rs | 31 +- .../physical_plan/aggregates/no_grouping.rs | 27 +- .../src/physical_plan/aggregates/row_hash.rs | 31 +- datafusion/core/src/physical_plan/common.rs | 2 +- datafusion/core/src/physical_plan/explain.rs | 3 +- .../src/physical_plan/metrics/composite.rs | 12 +- .../core/src/physical_plan/metrics/tracker.rs | 46 +- .../core/src/physical_plan/sorts/sort.rs | 190 ++---- .../sorts/sort_preserving_merge.rs | 8 +- datafusion/core/tests/memory_limit.rs | 26 +- datafusion/core/tests/order_spill_fuzz.rs | 38 +- .../core/tests/provider_filter_pushdown.rs | 5 +- test-utils/src/data_gen.rs | 55 +- 18 files changed, 352 insertions(+), 864 deletions(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index a9eb42bab6fc..6dc0fd62d5bf 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -74,7 +74,7 @@ use crate::config::{ ConfigOptions, OPT_BATCH_SIZE, OPT_COALESCE_BATCHES, OPT_COALESCE_TARGET_BATCH_SIZE, OPT_FILTER_NULL_JOIN_KEYS, OPT_OPTIMIZER_MAX_PASSES, OPT_OPTIMIZER_SKIP_FAILED_RULES, }; -use crate::execution::{runtime_env::RuntimeEnv, FunctionRegistry}; +use crate::execution::{runtime_env::RuntimeEnv, FunctionRegistry, MemoryManager}; use crate::physical_optimizer::enforcement::BasicEnforcement; use crate::physical_plan::file_format::{plan_to_csv, plan_to_json, plan_to_parquet}; use crate::physical_plan::planner::DefaultPhysicalPlanner; @@ -1912,6 +1912,11 @@ impl TaskContext { self.task_id.clone() } + /// Return the [`MemoryManager`] associated with this [TaskContext] + pub fn memory_manager(&self) -> &Arc { + &self.runtime.memory_manager + } + /// Return the [RuntimeEnv] associated with this [TaskContext] pub fn runtime_env(&self) -> Arc { self.runtime.clone() diff --git a/datafusion/core/src/execution/memory_manager/mod.rs b/datafusion/core/src/execution/memory_manager/mod.rs index c3ff444ebc27..4aef4d2b7617 100644 --- a/datafusion/core/src/execution/memory_manager/mod.rs +++ b/datafusion/core/src/execution/memory_manager/mod.rs @@ -18,20 +18,12 @@ //! Manages all available memory during query execution use crate::error::{DataFusionError, Result}; -use async_trait::async_trait; -use hashbrown::HashSet; -use log::{debug, warn}; -use parking_lot::{Condvar, Mutex}; -use std::fmt; -use std::fmt::{Debug, Display, Formatter}; +use log::debug; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use std::time::{Duration, Instant}; pub mod proxy; -static CONSUMER_ID: AtomicUsize = AtomicUsize::new(0); - #[derive(Debug, Clone)] /// Configuration information for memory management pub enum MemoryManagerConfig { @@ -97,339 +89,154 @@ impl MemoryManagerConfig { memory_fraction, }) } - - /// return the maximum size of the memory, in bytes, this config will allow - fn pool_size(&self) -> usize { - match self { - MemoryManagerConfig::Existing(existing) => existing.pool_size, - MemoryManagerConfig::New { - max_memory, - memory_fraction, - } => (*max_memory as f64 * *memory_fraction) as usize, - } - } -} - -fn next_id() -> usize { - CONSUMER_ID.fetch_add(1, Ordering::SeqCst) -} - -/// Type of the memory consumer -pub enum ConsumerType { - /// consumers that can grow its memory usage by requesting more from the memory manager or - /// shrinks its memory usage when we can no more assign available memory to it. - /// Examples are spillable sorter, spillable hashmap, etc. - Requesting, - /// consumers that are not spillable, counting in for only tracking purpose. - Tracking, -} - -#[derive(Clone, Debug, Hash, Eq, PartialEq)] -/// Id that uniquely identifies a Memory Consumer -pub struct MemoryConsumerId { - /// partition the consumer belongs to - pub partition_id: usize, - /// unique id - pub id: usize, -} - -impl MemoryConsumerId { - /// Auto incremented new Id - pub fn new(partition_id: usize) -> Self { - let id = next_id(); - Self { partition_id, id } - } -} - -impl Display for MemoryConsumerId { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!(f, "{}:{}", self.partition_id, self.id) - } -} - -#[async_trait] -/// A memory consumer that either takes up memory (of type `ConsumerType::Tracking`) -/// or grows/shrinks memory usage based on available memory (of type `ConsumerType::Requesting`). -pub trait MemoryConsumer: Send + Sync { - /// Display name of the consumer - fn name(&self) -> String; - - /// Unique id of the consumer - fn id(&self) -> &MemoryConsumerId; - - /// Ptr to MemoryManager - fn memory_manager(&self) -> Arc; - - /// Partition that the consumer belongs to - fn partition_id(&self) -> usize { - self.id().partition_id - } - - /// Type of the consumer - fn type_(&self) -> &ConsumerType; - - /// Grow memory by `required` to buffer more data in memory, - /// this may trigger spill before grow when the memory threshold is - /// reached for this consumer. - async fn try_grow(&self, required: usize) -> Result<()> { - let current = self.mem_used(); - debug!( - "trying to acquire {} whiling holding {} from consumer {}", - human_readable_size(required), - human_readable_size(current), - self.id(), - ); - - let can_grow_directly = - self.memory_manager().can_grow_directly(required, current); - if !can_grow_directly { - debug!( - "Failed to grow memory of {} directly from consumer {}, spilling first ...", - human_readable_size(required), - self.id() - ); - let freed = self.spill().await?; - self.memory_manager() - .record_free_then_acquire(freed, required); - } - Ok(()) - } - - /// Grow without spilling to the disk. It grows the memory directly - /// so it should be only used when the consumer already allocated the - /// memory and it is safe to grow without spilling. - fn grow(&self, required: usize) { - self.memory_manager().record_free_then_acquire(0, required); - } - - /// Return `freed` memory to the memory manager, - /// may wake up other requesters waiting for their minimum memory quota. - fn shrink(&self, freed: usize) { - self.memory_manager().record_free(freed); - } - - /// Spill in-memory buffers to disk, free memory, return the previous used - async fn spill(&self) -> Result; - - /// Current memory used by this consumer - fn mem_used(&self) -> usize; } -impl Debug for dyn MemoryConsumer { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!( - f, - "{}[{}]: {}", - self.name(), - self.id(), - human_readable_size(self.mem_used()) - ) - } -} - -impl Display for dyn MemoryConsumer { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!(f, "{}[{}]", self.name(), self.id(),) - } -} - -/* -The memory management architecture is the following: - -1. User designates max execution memory by setting RuntimeConfig.max_memory and RuntimeConfig.memory_fraction (float64 between 0..1). - The actual max memory DataFusion could use `pool_size = max_memory * memory_fraction`. -2. The entities that take up memory during its execution are called 'Memory Consumers'. Operators or others are encouraged to - register themselves to the memory manager and report its usage through `mem_used()`. -3. There are two kinds of consumers: - - 'Requesting' consumers that would acquire memory during its execution and release memory through `spill` if no more memory is available. - - 'Tracking' consumers that exist for reporting purposes to provide a more accurate memory usage estimation for memory consumers. -4. Requesting and tracking consumers share the pool. Each controlling consumer could acquire a maximum of - (pool_size - all_tracking_used) / active_num_controlling_consumers. - - Memory Space for the DataFusion Lib / Process of `pool_size` - ┌──────────────────────────────────────────────z─────────────────────────────┐ - │ z │ - │ z │ - │ Requesting z Tracking │ - │ Memory Consumers z Memory Consumers │ - │ z │ - │ z │ - └──────────────────────────────────────────────z─────────────────────────────┘ -*/ - -/// Manage memory usage during physical plan execution +/// The memory manager maintains a fixed size pool of memory +/// from which portions can be allocated #[derive(Debug)] pub struct MemoryManager { - requesters: Arc>>, - pool_size: usize, - requesters_total: Arc>, - trackers_total: AtomicUsize, - cv: Condvar, + state: Arc, } impl MemoryManager { /// Create new memory manager based on the configuration - #[allow(clippy::mutex_atomic)] pub fn new(config: MemoryManagerConfig) -> Arc { - let pool_size = config.pool_size(); - match config { MemoryManagerConfig::Existing(manager) => manager, - MemoryManagerConfig::New { .. } => { + MemoryManagerConfig::New { + max_memory, + memory_fraction, + } => { + let pool_size = (max_memory as f64 * memory_fraction) as usize; debug!( "Creating memory manager with initial size {}", human_readable_size(pool_size) ); Arc::new(Self { - requesters: Arc::new(Mutex::new(HashSet::new())), - pool_size, - requesters_total: Arc::new(Mutex::new(0)), - trackers_total: AtomicUsize::new(0), - cv: Condvar::new(), + state: Arc::new(MemoryManagerState { + pool_size, + used: AtomicUsize::new(0), + }), }) } } } - fn get_tracker_total(&self) -> usize { - self.trackers_total.load(Ordering::SeqCst) - } - - pub(crate) fn grow_tracker_usage(&self, delta: usize) { - self.trackers_total.fetch_add(delta, Ordering::SeqCst); - } - - pub(crate) fn shrink_tracker_usage(&self, delta: usize) { - let update = - self.trackers_total - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |x| { - if x >= delta { - Some(x - delta) - } else { - None - } - }); - update.unwrap_or_else(|_| { - panic!( - "Tracker total memory shrink by {} underflow, current value is ", - delta - ) - }); + /// Returns the maximum pool size + /// + /// Note: this can be less than the amount allocated as a result of [`MemoryManager::allocate`] + pub fn pool_size(&self) -> usize { + self.state.pool_size } - /// Return the total memory usage for all requesters - pub fn get_requester_total(&self) -> usize { - *self.requesters_total.lock() + /// Returns the number of allocated bytes + /// + /// Note: this can exceed the pool size as a result of [`MemoryManager::allocate`] + pub fn allocated(&self) -> usize { + self.state.used.load(Ordering::Relaxed) } - /// Register a new memory requester - pub(crate) fn register_requester(&self, requester_id: &MemoryConsumerId) { - self.requesters.lock().insert(requester_id.clone()); + /// Returns a new empty allocation identified by `name` + pub fn new_tracked_allocation(&self, name: String) -> TrackedAllocation { + TrackedAllocation::new_empty(name, Arc::clone(&self.state)) } +} - fn max_mem_for_requesters(&self) -> usize { - let trk_total = self.get_tracker_total(); - self.pool_size.saturating_sub(trk_total) +impl std::fmt::Display for MemoryManager { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!( + f, + "MemoryManager(capacity: {}, allocated: {})", + human_readable_size(self.state.pool_size), + human_readable_size(self.allocated()), + ) } +} - /// Grow memory attempt from a consumer, return if we could grant that much to it - fn can_grow_directly(&self, required: usize, current: usize) -> bool { - let num_rqt = self.requesters.lock().len(); - let mut rqt_current_used = self.requesters_total.lock(); - let mut rqt_max = self.max_mem_for_requesters(); - - let granted; - loop { - let max_per_rqt = rqt_max / num_rqt; - let min_per_rqt = max_per_rqt / 2; - - if required + current >= max_per_rqt { - granted = false; - break; - } +#[derive(Debug)] +struct MemoryManagerState { + pool_size: usize, + used: AtomicUsize, +} - let remaining = rqt_max.checked_sub(*rqt_current_used).unwrap_or_default(); - if remaining >= required { - granted = true; - *rqt_current_used += required; - break; - } else if current < min_per_rqt { - // if we cannot acquire at lease 1/2n memory, just wait for others - // to spill instead spill self frequently with limited total mem - debug!( - "Cannot acquire a minimum amount of {} memory from the manager of total {}, waiting for others to spill ...", - human_readable_size(min_per_rqt), human_readable_size(self.pool_size)); - let now = Instant::now(); - self.cv.wait(&mut rqt_current_used); - let elapsed = now.elapsed(); - if elapsed > Duration::from_secs(10) { - warn!("Elapsed on waiting for spilling: {:.2?}", elapsed); - } - } else { - granted = false; - break; - } +/// A [`TrackedAllocation`] tracks a reservation of memory in a [`MemoryManager`] +/// that is freed back to the memory pool on drop +#[derive(Debug)] +pub struct TrackedAllocation { + name: String, + size: usize, + state: Arc, +} - rqt_max = self.max_mem_for_requesters(); +impl TrackedAllocation { + fn new_empty(name: String, state: Arc) -> Self { + Self { + name, + size: 0, + state, } + } - granted + /// Returns the size of this [`TrackedAllocation`] in bytes + pub fn size(&self) -> usize { + self.size } - fn record_free_then_acquire(&self, freed: usize, acquired: usize) { - let mut requesters_total = self.requesters_total.lock(); - debug!( - "free_then_acquire: total {}, freed {}, acquired {}", - human_readable_size(*requesters_total), - human_readable_size(freed), - human_readable_size(acquired) - ); - assert!(*requesters_total >= freed); - *requesters_total -= freed; - *requesters_total += acquired; - self.cv.notify_all(); + /// Frees all bytes from this allocation returning the number of bytes freed + pub fn free(&mut self) -> usize { + let size = self.size; + if size != 0 { + self.shrink(size) + } + size + } + + /// Frees `capacity` bytes from this allocation + /// + /// # Panics + /// + /// Panics if `capacity` exceeds [`Self::size`] + pub fn shrink(&mut self, capacity: usize) { + let new_size = self.size.checked_sub(capacity).unwrap(); + self.state.used.fetch_sub(capacity, Ordering::SeqCst); + self.size = new_size + } + + /// Sets the size of this allocation to `capacity` + pub fn resize(&mut self, capacity: usize) { + use std::cmp::Ordering; + match capacity.cmp(&self.size) { + Ordering::Greater => self.grow(capacity - self.size), + Ordering::Less => self.shrink(self.size - capacity), + _ => {} + } } - fn record_free(&self, freed: usize) { - let mut requesters_total = self.requesters_total.lock(); - debug!( - "free: total {}, freed {}", - human_readable_size(*requesters_total), - human_readable_size(freed) - ); - assert!(*requesters_total >= freed); - *requesters_total -= freed; - self.cv.notify_all(); + /// Increase the size of this by `capacity` bytes + pub fn grow(&mut self, capacity: usize) { + self.state.used.fetch_add(capacity, Ordering::SeqCst); + self.size += capacity; } - /// Drop a memory consumer and reclaim the memory - pub(crate) fn drop_consumer(&self, id: &MemoryConsumerId, mem_used: usize) { - // find in requesters first - { - let mut requesters = self.requesters.lock(); - if requesters.remove(id) { - let mut total = self.requesters_total.lock(); - assert!(*total >= mem_used); - *total -= mem_used; - self.cv.notify_all(); - return; - } - } - self.shrink_tracker_usage(mem_used); - self.cv.notify_all(); + /// Try to increase the size of this [`TrackedAllocation`] by `capacity` bytes + pub fn try_grow(&mut self, capacity: usize) -> Result<()> { + let pool_size = self.state.pool_size; + self.state + .used + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |used| { + let new_used = used + capacity; + (new_used <= pool_size).then_some(new_used) + }) + .map_err(|used| DataFusionError::ResourcesExhausted(format!("Failed to allocate additional {} bytes for {} with {} bytes already allocated - maximum available is {}", capacity, self.name, self.size, used)))?; + self.size += capacity; + Ok(()) } } -impl Display for MemoryManager { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - write!(f, - "MemoryManager usage statistics: total {}, trackers used {}, total {} requesters used: {}", - human_readable_size(self.pool_size), - human_readable_size(self.get_tracker_total()), - self.requesters.lock().len(), - human_readable_size(self.get_requester_total()), - ) +impl Drop for TrackedAllocation { + fn drop(&mut self) { + self.free(); } } @@ -460,205 +267,67 @@ pub fn human_readable_size(size: usize) -> String { #[cfg(test)] mod tests { use super::*; - use crate::error::Result; - use crate::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; - use crate::execution::MemoryConsumer; - use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MemTrackingMetrics}; - use async_trait::async_trait; - use std::sync::atomic::{AtomicUsize, Ordering}; - use std::sync::Arc; - - struct DummyRequester { - id: MemoryConsumerId, - runtime: Arc, - spills: AtomicUsize, - mem_used: AtomicUsize, - } - - impl DummyRequester { - fn new(partition: usize, runtime: Arc) -> Self { - Self { - id: MemoryConsumerId::new(partition), - runtime, - spills: AtomicUsize::new(0), - mem_used: AtomicUsize::new(0), - } - } - - async fn do_with_mem(&self, grow: usize) -> Result<()> { - self.try_grow(grow).await?; - self.mem_used.fetch_add(grow, Ordering::SeqCst); - Ok(()) - } - fn get_spills(&self) -> usize { - self.spills.load(Ordering::SeqCst) - } - } - - #[async_trait] - impl MemoryConsumer for DummyRequester { - fn name(&self) -> String { - "dummy".to_owned() - } - - fn id(&self) -> &MemoryConsumerId { - &self.id - } - - fn memory_manager(&self) -> Arc { - self.runtime.memory_manager.clone() - } - - fn type_(&self) -> &ConsumerType { - &ConsumerType::Requesting - } - - async fn spill(&self) -> Result { - self.spills.fetch_add(1, Ordering::SeqCst); - let used = self.mem_used.swap(0, Ordering::SeqCst); - Ok(used) - } - - fn mem_used(&self) -> usize { - self.mem_used.load(Ordering::SeqCst) - } - } - - struct DummyTracker { - id: MemoryConsumerId, - runtime: Arc, - mem_used: usize, - } - - impl DummyTracker { - fn new(partition: usize, runtime: Arc, mem_used: usize) -> Self { - runtime.grow_tracker_usage(mem_used); - Self { - id: MemoryConsumerId::new(partition), - runtime, - mem_used, - } - } - } - - #[async_trait] - impl MemoryConsumer for DummyTracker { - fn name(&self) -> String { - "dummy".to_owned() - } - - fn id(&self) -> &MemoryConsumerId { - &self.id - } - - fn memory_manager(&self) -> Arc { - self.runtime.memory_manager.clone() - } - - fn type_(&self) -> &ConsumerType { - &ConsumerType::Tracking - } - - async fn spill(&self) -> Result { - Ok(0) - } - - fn mem_used(&self) -> usize { - self.mem_used - } - } - - #[tokio::test] - async fn basic_functionalities() { - let config = RuntimeConfig::new() - .with_memory_manager(MemoryManagerConfig::try_new_limit(100, 1.0).unwrap()); - let runtime = Arc::new(RuntimeEnv::new(config).unwrap()); - - DummyTracker::new(0, runtime.clone(), 5); - assert_eq!(runtime.memory_manager.get_tracker_total(), 5); - - let tracker1 = DummyTracker::new(0, runtime.clone(), 10); - assert_eq!(runtime.memory_manager.get_tracker_total(), 15); - - DummyTracker::new(0, runtime.clone(), 15); - assert_eq!(runtime.memory_manager.get_tracker_total(), 30); - - runtime.drop_consumer(tracker1.id(), tracker1.mem_used); - assert_eq!(runtime.memory_manager.get_tracker_total(), 20); - - // MemTrackingMetrics as an easy way to track memory - let ms = ExecutionPlanMetricsSet::new(); - let tracking_metric = MemTrackingMetrics::new_with_rt(&ms, 0, runtime.clone()); - tracking_metric.init_mem_used(15); - assert_eq!(runtime.memory_manager.get_tracker_total(), 35); - - drop(tracking_metric); - assert_eq!(runtime.memory_manager.get_tracker_total(), 20); - - let requester1 = DummyRequester::new(0, runtime.clone()); - runtime.register_requester(requester1.id()); - - // first requester entered, should be able to use any of the remaining 80 - requester1.do_with_mem(40).await.unwrap(); - requester1.do_with_mem(10).await.unwrap(); - assert_eq!(requester1.get_spills(), 0); - assert_eq!(requester1.mem_used(), 50); - assert_eq!(*runtime.memory_manager.requesters_total.lock(), 50); - - let requester2 = DummyRequester::new(0, runtime.clone()); - runtime.register_requester(requester2.id()); - - requester2.do_with_mem(20).await.unwrap(); - requester2.do_with_mem(30).await.unwrap(); - assert_eq!(requester2.get_spills(), 1); - assert_eq!(requester2.mem_used(), 30); - - requester1.do_with_mem(10).await.unwrap(); - assert_eq!(requester1.get_spills(), 1); - assert_eq!(requester1.mem_used(), 10); - - assert_eq!(*runtime.memory_manager.requesters_total.lock(), 40); - } - - #[tokio::test] + #[test] #[should_panic(expected = "invalid max_memory. Expected greater than 0, got 0")] - async fn test_try_new_with_limit_0() { + fn test_try_new_with_limit_0() { MemoryManagerConfig::try_new_limit(0, 1.0).unwrap(); } - #[tokio::test] + #[test] #[should_panic( expected = "invalid fraction. Expected greater than 0 and less than 1.0, got -9.6" )] - async fn test_try_new_with_limit_neg_fraction() { + fn test_try_new_with_limit_neg_fraction() { MemoryManagerConfig::try_new_limit(100, -9.6).unwrap(); } - #[tokio::test] + #[test] #[should_panic( expected = "invalid fraction. Expected greater than 0 and less than 1.0, got 9.6" )] - async fn test_try_new_with_limit_too_large() { + fn test_try_new_with_limit_too_large() { MemoryManagerConfig::try_new_limit(100, 9.6).unwrap(); } - #[tokio::test] - async fn test_try_new_with_limit_pool_size() { + #[test] + fn test_try_new_with_limit_pool_size() { let config = MemoryManagerConfig::try_new_limit(100, 0.5).unwrap(); - assert_eq!(config.pool_size(), 50); + let manager = MemoryManager::new(config); + assert_eq!(manager.pool_size(), 50); let config = MemoryManagerConfig::try_new_limit(100000, 0.1).unwrap(); - assert_eq!(config.pool_size(), 10000); + let manager = MemoryManager::new(config); + assert_eq!(manager.pool_size(), 10000); } - #[tokio::test] - async fn test_memory_manager_underflow() { + #[test] + fn test_memory_manager_underflow() { let config = MemoryManagerConfig::try_new_limit(100, 0.5).unwrap(); let manager = MemoryManager::new(config); - manager.grow_tracker_usage(100); + let mut a1 = manager.new_tracked_allocation("a1".to_string()); + assert_eq!(manager.allocated(), 0); + + a1.grow(100); + assert_eq!(manager.allocated(), 100); + + assert_eq!(a1.free(), 100); + assert_eq!(manager.allocated(), 0); + + a1.try_grow(100).unwrap_err(); + assert_eq!(manager.allocated(), 0); + + a1.try_grow(30).unwrap(); + assert_eq!(manager.allocated(), 30); + + let mut a2 = manager.new_tracked_allocation("a2".to_string()); + a2.try_grow(25).unwrap_err(); + assert_eq!(manager.allocated(), 30); + + drop(a1); + assert_eq!(manager.allocated(), 0); - manager.register_requester(&MemoryConsumerId::new(1)); - assert!(!manager.can_grow_directly(20, 0)); + a2.try_grow(25).unwrap(); + assert_eq!(manager.allocated(), 25); } } diff --git a/datafusion/core/src/execution/memory_manager/proxy.rs b/datafusion/core/src/execution/memory_manager/proxy.rs index 2a5bd2507357..43532f9a81f1 100644 --- a/datafusion/core/src/execution/memory_manager/proxy.rs +++ b/datafusion/core/src/execution/memory_manager/proxy.rs @@ -16,96 +16,9 @@ // under the License. //! Utilities that help with tracking of memory allocations. -use std::sync::Arc; -use async_trait::async_trait; -use datafusion_common::DataFusionError; use hashbrown::raw::{Bucket, RawTable}; -use super::{ConsumerType, MemoryConsumer, MemoryConsumerId, MemoryManager}; - -/// Accounting proxy for memory usage. -/// -/// This is helpful when calculating memory usage on the actual data structure is expensive but it is easy to track -/// allocations while processing data. -/// -/// This consumer will NEVER spill. -pub struct MemoryConsumerProxy { - /// Name - name: String, - - /// Consumer ID. - id: MemoryConsumerId, - - /// Linked memory manager. - memory_manager: Arc, - - /// Currently used size in bytes. - used: usize, -} - -impl MemoryConsumerProxy { - /// Create new proxy consumer and register it with the given memory manager. - pub fn new( - name: impl Into, - id: MemoryConsumerId, - memory_manager: Arc, - ) -> Self { - memory_manager.register_requester(&id); - - Self { - name: name.into(), - id, - memory_manager, - used: 0, - } - } - - /// Try to allocate given amount of memory. - pub async fn alloc(&mut self, bytes: usize) -> Result<(), DataFusionError> { - self.try_grow(bytes).await?; - self.used = self.used.checked_add(bytes).expect("overflow"); - Ok(()) - } -} - -#[async_trait] -impl MemoryConsumer for MemoryConsumerProxy { - fn name(&self) -> String { - self.name.clone() - } - - fn id(&self) -> &crate::execution::MemoryConsumerId { - &self.id - } - - fn memory_manager(&self) -> Arc { - Arc::clone(&self.memory_manager) - } - - fn type_(&self) -> &ConsumerType { - &ConsumerType::Tracking - } - - async fn spill(&self) -> Result { - Err(DataFusionError::ResourcesExhausted(format!( - "Cannot spill {}", - self.name - ))) - } - - fn mem_used(&self) -> usize { - self.used - } -} - -impl Drop for MemoryConsumerProxy { - fn drop(&mut self) { - self.memory_manager - .drop_consumer(self.id(), self.mem_used()); - } -} - /// Extension trait for [`Vec`] to account for allocations. pub trait VecAllocExt { /// Item type. diff --git a/datafusion/core/src/execution/mod.rs b/datafusion/core/src/execution/mod.rs index 024980dee059..d827afe46d57 100644 --- a/datafusion/core/src/execution/mod.rs +++ b/datafusion/core/src/execution/mod.rs @@ -48,7 +48,5 @@ pub mod registry; pub mod runtime_env; pub use disk_manager::DiskManager; -pub use memory_manager::{ - human_readable_size, MemoryConsumer, MemoryConsumerId, MemoryManager, -}; +pub use memory_manager::{human_readable_size, MemoryManager, TrackedAllocation}; pub use registry::FunctionRegistry; diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs index 64da4a103b16..c48de86efea1 100644 --- a/datafusion/core/src/execution/runtime_env.rs +++ b/datafusion/core/src/execution/runtime_env.rs @@ -20,16 +20,15 @@ use crate::{ error::Result, - execution::{ - disk_manager::{DiskManager, DiskManagerConfig}, - memory_manager::{MemoryConsumerId, MemoryManager, MemoryManagerConfig}, - }, + execution::disk_manager::{DiskManager, DiskManagerConfig}, }; use std::collections::HashMap; use crate::datasource::datasource::TableProviderFactory; use crate::datasource::listing_table_factory::ListingTableFactory; use crate::datasource::object_store::ObjectStoreRegistry; +use crate::execution::memory_manager::MemoryManagerConfig; +use crate::execution::MemoryManager; use datafusion_common::DataFusionError; use object_store::ObjectStore; use std::fmt::{Debug, Formatter}; @@ -74,26 +73,6 @@ impl RuntimeEnv { }) } - /// Register the consumer to get it tracked - pub fn register_requester(&self, id: &MemoryConsumerId) { - self.memory_manager.register_requester(id); - } - - /// Drop the consumer from get tracked, reclaim memory - pub fn drop_consumer(&self, id: &MemoryConsumerId, mem_used: usize) { - self.memory_manager.drop_consumer(id, mem_used) - } - - /// Grow tracker memory of `delta` - pub fn grow_tracker_usage(&self, delta: usize) { - self.memory_manager.grow_tracker_usage(delta) - } - - /// Shrink tracker memory of `delta` - pub fn shrink_tracker_usage(&self, delta: usize) { - self.memory_manager.shrink_tracker_usage(delta) - } - /// Registers a custom `ObjectStore` to be used when accessing a /// specific scheme and host. This allows DataFusion to create /// external tables from urls that do not have built in support diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs index 8bf929630f05..33e93afeeb3d 100644 --- a/datafusion/core/src/physical_plan/aggregates/hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -28,10 +28,7 @@ use futures::stream::{Stream, StreamExt}; use crate::error::Result; use crate::execution::context::TaskContext; -use crate::execution::memory_manager::proxy::{ - MemoryConsumerProxy, RawTableAllocExt, VecAllocExt, -}; -use crate::execution::MemoryConsumerId; +use crate::execution::memory_manager::proxy::{RawTableAllocExt, VecAllocExt}; use crate::physical_plan::aggregates::{ evaluate_group_by, evaluate_many, AccumulatorItem, AggregateMode, PhysicalGroupBy, }; @@ -41,6 +38,7 @@ use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; use crate::scalar::ScalarValue; +use crate::execution::memory_manager::TrackedAllocation; use arrow::{array::ArrayRef, compute, compute::cast}; use arrow::{ array::{Array, UInt32Builder}, @@ -125,6 +123,10 @@ impl GroupedHashAggregateStream { timer.done(); + let allocation = context + .memory_manager() + .new_tracked_allocation(format!("GroupedHashAggregateStream[{}]", partition)); + let inner = GroupedHashAggregateStreamInner { schema: Arc::clone(&schema), mode, @@ -134,11 +136,7 @@ impl GroupedHashAggregateStream { baseline_metrics, aggregate_expressions, accumulators: Accumulators { - memory_consumer: MemoryConsumerProxy::new( - "GroupBy Hash Accumulators", - MemoryConsumerId::new(partition), - Arc::clone(&context.runtime_env().memory_manager), - ), + allocation, map: RawTable::with_capacity(0), group_states: Vec::with_capacity(0), }, @@ -172,15 +170,10 @@ impl GroupedHashAggregateStream { // allocate memory // This happens AFTER we actually used the memory, but simplifies the whole accounting and we are OK with // overshooting a bit. Also this means we either store the whole record batch or not. - let result = match result { - Ok(allocated) => { - this.accumulators.memory_consumer.alloc(allocated).await - } - Err(e) => Err(e), - }; - - match result { - Ok(()) => continue, + match result.and_then(|allocated| { + this.accumulators.allocation.try_grow(allocated) + }) { + Ok(_) => continue, Err(e) => Err(ArrowError::ExternalError(Box::new(e))), } } @@ -441,7 +434,7 @@ struct GroupState { /// The state of all the groups struct Accumulators { - memory_consumer: MemoryConsumerProxy, + allocation: TrackedAllocation, /// Logically maps group values to an index in `group_states` /// diff --git a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs index 64cc4f569c8c..d254f85bd577 100644 --- a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs +++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs @@ -18,8 +18,6 @@ //! Aggregate without grouping columns use crate::execution::context::TaskContext; -use crate::execution::memory_manager::proxy::MemoryConsumerProxy; -use crate::execution::MemoryConsumerId; use crate::physical_plan::aggregates::{ aggregate_expressions, create_accumulators, finalize_aggregation, AccumulatorItem, AggregateMode, @@ -35,6 +33,7 @@ use futures::stream::BoxStream; use std::sync::Arc; use std::task::{Context, Poll}; +use crate::execution::memory_manager::TrackedAllocation; use futures::stream::{Stream, StreamExt}; /// stream struct for aggregation without grouping columns @@ -55,7 +54,7 @@ struct AggregateStreamInner { baseline_metrics: BaselineMetrics, aggregate_expressions: Vec>>, accumulators: Vec, - memory_consumer: MemoryConsumerProxy, + allocation: TrackedAllocation, finished: bool, } @@ -69,14 +68,13 @@ impl AggregateStream { baseline_metrics: BaselineMetrics, context: Arc, partition: usize, - ) -> datafusion_common::Result { + ) -> Result { let aggregate_expressions = aggregate_expressions(&aggr_expr, &mode, 0)?; let accumulators = create_accumulators(&aggr_expr)?; - let memory_consumer = MemoryConsumerProxy::new( - "GroupBy None Accumulators", - MemoryConsumerId::new(partition), - Arc::clone(&context.runtime_env().memory_manager), - ); + + let allocation = context + .memory_manager() + .new_tracked_allocation(format!("AggregateStream[{}]", partition)); let inner = AggregateStreamInner { schema: Arc::clone(&schema), @@ -85,7 +83,7 @@ impl AggregateStream { baseline_metrics, aggregate_expressions, accumulators, - memory_consumer, + allocation, finished: false, }; let stream = futures::stream::unfold(inner, |mut this| async move { @@ -111,12 +109,9 @@ impl AggregateStream { // allocate memory // This happens AFTER we actually used the memory, but simplifies the whole accounting and we are OK with // overshooting a bit. Also this means we either store the whole record batch or not. - let result = match result { - Ok(allocated) => this.memory_consumer.alloc(allocated).await, - Err(e) => Err(e), - }; - - match result { + match result + .and_then(|allocated| this.allocation.try_grow(allocated)) + { Ok(_) => continue, Err(e) => Err(ArrowError::ExternalError(Box::new(e))), } diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index c73fa3da0c2e..8308c4a4e723 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -27,10 +27,7 @@ use futures::stream::{Stream, StreamExt}; use crate::error::Result; use crate::execution::context::TaskContext; -use crate::execution::memory_manager::proxy::{ - MemoryConsumerProxy, RawTableAllocExt, VecAllocExt, -}; -use crate::execution::MemoryConsumerId; +use crate::execution::memory_manager::proxy::{RawTableAllocExt, VecAllocExt}; use crate::physical_plan::aggregates::{ evaluate_group_by, evaluate_many, group_schema, AccumulatorItemV2, AggregateMode, PhysicalGroupBy, @@ -40,6 +37,7 @@ use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; +use crate::execution::memory_manager::TrackedAllocation; use arrow::compute::cast; use arrow::datatypes::Schema; use arrow::{array::ArrayRef, compute}; @@ -141,13 +139,13 @@ impl GroupedHashAggregateStreamV2 { let aggr_schema = aggr_state_schema(&aggr_expr)?; let aggr_layout = Arc::new(RowLayout::new(&aggr_schema, RowType::WordAligned)); + let allocation = context.memory_manager().new_tracked_allocation(format!( + "GroupedHashAggregateStreamV2[{}]", + partition + )); let aggr_state = AggregationState { - memory_consumer: MemoryConsumerProxy::new( - "GroupBy Hash (Row) AggregationState", - MemoryConsumerId::new(partition), - Arc::clone(&context.runtime_env().memory_manager), - ), + allocation, map: RawTable::with_capacity(0), group_states: Vec::with_capacity(0), }; @@ -196,15 +194,10 @@ impl GroupedHashAggregateStreamV2 { // allocate memory // This happens AFTER we actually used the memory, but simplifies the whole accounting and we are OK with // overshooting a bit. Also this means we either store the whole record batch or not. - let result = match result { - Ok(allocated) => { - this.aggr_state.memory_consumer.alloc(allocated).await - } - Err(e) => Err(e), - }; - - match result { - Ok(()) => continue, + match result.and_then(|allocated| { + this.aggr_state.allocation.try_grow(allocated) + }) { + Ok(_) => continue, Err(e) => Err(ArrowError::ExternalError(Box::new(e))), } } @@ -465,7 +458,7 @@ struct RowGroupState { /// The state of all the groups struct AggregationState { - memory_consumer: MemoryConsumerProxy, + allocation: TrackedAllocation, /// Logically maps group values to an index in `group_states` /// diff --git a/datafusion/core/src/physical_plan/common.rs b/datafusion/core/src/physical_plan/common.rs index b4db3a32b522..b29dc0cb8c11 100644 --- a/datafusion/core/src/physical_plan/common.rs +++ b/datafusion/core/src/physical_plan/common.rs @@ -51,7 +51,7 @@ impl SizedRecordBatchStream { pub fn new( schema: SchemaRef, batches: Vec>, - metrics: MemTrackingMetrics, + mut metrics: MemTrackingMetrics, ) -> Self { let size = batches.iter().map(|b| batch_byte_size(b)).sum::(); metrics.init_mem_used(size); diff --git a/datafusion/core/src/physical_plan/explain.rs b/datafusion/core/src/physical_plan/explain.rs index ac350b1837e8..6689eb5b9d6d 100644 --- a/datafusion/core/src/physical_plan/explain.rs +++ b/datafusion/core/src/physical_plan/explain.rs @@ -152,7 +152,8 @@ impl ExecutionPlan for ExplainExec { )?; let metrics = ExecutionPlanMetricsSet::new(); - let tracking_metrics = MemTrackingMetrics::new(&metrics, partition); + let tracking_metrics = + MemTrackingMetrics::new(&metrics, context.memory_manager(), partition); debug!( "Before returning SizedRecordBatch in ExplainExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()); diff --git a/datafusion/core/src/physical_plan/metrics/composite.rs b/datafusion/core/src/physical_plan/metrics/composite.rs index cd4d5c38a9ec..0e3a8edd3e73 100644 --- a/datafusion/core/src/physical_plan/metrics/composite.rs +++ b/datafusion/core/src/physical_plan/metrics/composite.rs @@ -17,7 +17,7 @@ //! Metrics common for complex operators with multiple steps. -use crate::execution::runtime_env::RuntimeEnv; +use crate::execution::MemoryManager; use crate::physical_plan::metrics::tracker::MemTrackingMetrics; use crate::physical_plan::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricValue, MetricsSet, Time, @@ -32,7 +32,7 @@ use std::time::Duration; /// Collects all metrics during a complex operation, which is composed of multiple steps and /// each stage reports its statistics separately. /// Give sort as an example, when the dataset is more significant than available memory, it will report -/// multiple in-mem sort metrics and final merge-sort metrics from `SortPreservingMergeStream`. +/// multiple in-mem sort metrics and final merge-sort metrics from `SortPreservingMergeStream`. /// Therefore, We need a separation of metrics for which are final metrics (for output_rows accumulation), /// and which are intermediate metrics that we only account for elapsed_compute time. pub struct CompositeMetricsSet { @@ -69,18 +69,18 @@ impl CompositeMetricsSet { pub fn new_intermediate_tracking( &self, partition: usize, - runtime: Arc, + memory_manager: &MemoryManager, ) -> MemTrackingMetrics { - MemTrackingMetrics::new_with_rt(&self.mid, partition, runtime) + MemTrackingMetrics::new(&self.mid, memory_manager, partition) } /// create a new final memory tracking metrics pub fn new_final_tracking( &self, partition: usize, - runtime: Arc, + memory_manager: &MemoryManager, ) -> MemTrackingMetrics { - MemTrackingMetrics::new_with_rt(&self.final_, partition, runtime) + MemTrackingMetrics::new(&self.final_, memory_manager, partition) } fn merge_compute_time(&self, dest: &Time) { diff --git a/datafusion/core/src/physical_plan/metrics/tracker.rs b/datafusion/core/src/physical_plan/metrics/tracker.rs index d8017b95ae8d..d0811afd31c4 100644 --- a/datafusion/core/src/physical_plan/metrics/tracker.rs +++ b/datafusion/core/src/physical_plan/metrics/tracker.rs @@ -17,14 +17,13 @@ //! Metrics with memory usage tracking capability -use crate::execution::runtime_env::RuntimeEnv; -use crate::execution::MemoryConsumerId; use crate::physical_plan::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, Time, }; -use std::sync::Arc; use std::task::Poll; +use crate::execution::memory_manager::TrackedAllocation; +use crate::execution::MemoryManager; use arrow::{error::ArrowError, record_batch::RecordBatch}; /// Simplified version of tracking memory consumer, @@ -34,34 +33,22 @@ use arrow::{error::ArrowError, record_batch::RecordBatch}; /// and get the memory usage bookkeeping in the memory manager easily. #[derive(Debug)] pub struct MemTrackingMetrics { - id: MemoryConsumerId, - runtime: Option>, + allocation: TrackedAllocation, metrics: BaselineMetrics, } /// Delegates most of the metrics functionalities to the inner BaselineMetrics, /// intercept memory metrics functionalities and do memory manager bookkeeping. impl MemTrackingMetrics { - /// Create metrics similar to [BaselineMetrics] - pub fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self { - let id = MemoryConsumerId::new(partition); - Self { - id, - runtime: None, - metrics: BaselineMetrics::new(metrics, partition), - } - } - - /// Create memory tracking metrics with reference to runtime - pub fn new_with_rt( + /// Create memory tracking metrics with reference to memory manager + pub fn new( metrics: &ExecutionPlanMetricsSet, + memory_manager: &MemoryManager, partition: usize, - runtime: Arc, ) -> Self { - let id = MemoryConsumerId::new(partition); Self { - id, - runtime: Some(runtime), + allocation: memory_manager + .new_tracked_allocation(format!("MemTrackingMetrics[{}]", partition)), metrics: BaselineMetrics::new(metrics, partition), } } @@ -77,11 +64,9 @@ impl MemTrackingMetrics { } /// setup initial memory usage and register it with memory manager - pub fn init_mem_used(&self, size: usize) { + pub fn init_mem_used(&mut self, size: usize) { self.metrics.mem_used().set(size); - if let Some(rt) = self.runtime.as_ref() { - rt.memory_manager.grow_tracker_usage(size); - } + self.allocation.resize(size) } /// return the metric for the total number of output rows produced @@ -118,14 +103,3 @@ impl MemTrackingMetrics { self.metrics.record_poll(poll) } } - -impl Drop for MemTrackingMetrics { - fn drop(&mut self) { - self.metrics.try_done(); - if self.mem_used() != 0 { - if let Some(rt) = self.runtime.as_ref() { - rt.drop_consumer(&self.id, self.mem_used()); - } - } - } -} diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index b6c37d109859..f1a7771f966f 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -21,10 +21,9 @@ use crate::error::{DataFusionError, Result}; use crate::execution::context::TaskContext; -use crate::execution::memory_manager::{ - human_readable_size, ConsumerType, MemoryConsumer, MemoryConsumerId, MemoryManager, -}; +use crate::execution::memory_manager::human_readable_size; use crate::execution::runtime_env::RuntimeEnv; +use crate::execution::TrackedAllocation; use crate::physical_plan::common::{batch_byte_size, IPCWriter, SizedRecordBatchStream}; use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::metrics::{ @@ -45,9 +44,7 @@ use arrow::datatypes::SchemaRef; use arrow::error::{ArrowError, Result as ArrowResult}; use arrow::ipc::reader::FileReader; use arrow::record_batch::RecordBatch; -use async_trait::async_trait; use datafusion_physical_expr::EquivalenceProperties; -use futures::lock::Mutex; use futures::{Stream, StreamExt, TryFutureExt, TryStreamExt}; use log::{debug, error}; use std::any::Any; @@ -73,10 +70,9 @@ use tokio::task; /// buffer the batch in memory, go to 1. /// 3. when input is exhausted, merge all in memory batches and spills to get a total order. struct ExternalSorter { - id: MemoryConsumerId, schema: SchemaRef, - in_mem_batches: Mutex>, - spills: Mutex>, + in_mem_batches: Vec, + spills: Vec, /// Sort expressions expr: Vec, session_config: Arc, @@ -84,6 +80,8 @@ struct ExternalSorter { metrics_set: CompositeMetricsSet, metrics: BaselineMetrics, fetch: Option, + allocation: TrackedAllocation, + partition_id: usize, } impl ExternalSorter { @@ -97,31 +95,38 @@ impl ExternalSorter { fetch: Option, ) -> Self { let metrics = metrics_set.new_intermediate_baseline(partition_id); + let allocation = runtime + .memory_manager + .new_tracked_allocation(format!("ExternalSorter[{}]", partition_id)); + Self { - id: MemoryConsumerId::new(partition_id), schema, - in_mem_batches: Mutex::new(vec![]), - spills: Mutex::new(vec![]), + in_mem_batches: vec![], + spills: vec![], expr, session_config, runtime, metrics_set, metrics, fetch, + allocation, + partition_id, } } async fn insert_batch( - &self, + &mut self, input: RecordBatch, tracking_metrics: &MemTrackingMetrics, ) -> Result<()> { if input.num_rows() > 0 { let size = batch_byte_size(&input); - debug!("Inserting {} rows of {} bytes", input.num_rows(), size); - self.try_grow(size).await?; + if self.allocation.try_grow(size).is_err() { + self.spill().await?; + self.allocation.try_grow(size)? + } + self.metrics.mem_used().add(size); - let mut in_mem_batches = self.in_mem_batches.lock().await; // NB timer records time taken on drop, so there are no // calls to `timer.done()` below. let _timer = tracking_metrics.elapsed_compute().timer(); @@ -136,61 +141,58 @@ impl ExternalSorter { // We don't have to call try_grow here, since we have already used the // memory (so spilling right here wouldn't help at all for the current // operation). But we still have to record it so that other requesters - // would know about this unexpected increase in memory consuption. + // would know about this unexpected increase in memory consumption. let new_size_delta = new_size - size; - self.grow(new_size_delta); + self.allocation.grow(new_size_delta); self.metrics.mem_used().add(new_size_delta); } Ordering::Less => { let size_delta = size - new_size; - self.shrink(size_delta); + self.allocation.shrink(size_delta); self.metrics.mem_used().sub(size_delta); } Ordering::Equal => {} } - in_mem_batches.push(partial); + self.in_mem_batches.push(partial); } Ok(()) } async fn spilled_before(&self) -> bool { - let spills = self.spills.lock().await; - !spills.is_empty() + !self.spills.is_empty() } /// MergeSort in mem batches as well as spills into total order with `SortPreservingMergeStream`. - async fn sort(&self) -> Result { - let partition = self.partition_id(); + async fn sort(&mut self) -> Result { let batch_size = self.session_config.batch_size(); - let mut in_mem_batches = self.in_mem_batches.lock().await; if self.spilled_before().await { - let tracking_metrics = self - .metrics_set - .new_intermediate_tracking(partition, self.runtime.clone()); + let tracking_metrics = self.metrics_set.new_intermediate_tracking( + self.partition_id, + self.runtime.memory_manager.as_ref(), + ); let mut streams: Vec = vec![]; - if in_mem_batches.len() > 0 { + if !self.in_mem_batches.is_empty() { let in_mem_stream = in_mem_partial_sort( - &mut in_mem_batches, + &mut self.in_mem_batches, self.schema.clone(), &self.expr, batch_size, tracking_metrics, self.fetch, )?; - let prev_used = self.free_all_memory(); + let prev_used = self.allocation.free(); streams.push(SortedStream::new(in_mem_stream, prev_used)); } - let mut spills = self.spills.lock().await; - - for spill in spills.drain(..) { + for spill in self.spills.drain(..) { let stream = read_spill_as_stream(spill, self.schema.clone())?; streams.push(SortedStream::new(stream, 0)); } - let tracking_metrics = self - .metrics_set - .new_final_tracking(partition, self.runtime.clone()); + let tracking_metrics = self.metrics_set.new_final_tracking( + self.partition_id, + self.runtime.memory_manager.as_ref(), + ); Ok(Box::pin(SortPreservingMergeStream::new_from_streams( streams, self.schema.clone(), @@ -198,12 +200,13 @@ impl ExternalSorter { tracking_metrics, self.session_config.batch_size(), )?)) - } else if in_mem_batches.len() > 0 { - let tracking_metrics = self - .metrics_set - .new_final_tracking(partition, self.runtime.clone()); + } else if !self.in_mem_batches.is_empty() { + let tracking_metrics = self.metrics_set.new_final_tracking( + self.partition_id, + self.runtime.memory_manager.as_ref(), + ); let result = in_mem_partial_sort( - &mut in_mem_batches, + &mut self.in_mem_batches, self.schema.clone(), &self.expr, batch_size, @@ -211,19 +214,13 @@ impl ExternalSorter { self.fetch, ); // Report to the memory manager we are no longer using memory - self.free_all_memory(); + self.allocation.free(); result } else { Ok(Box::pin(EmptyRecordBatchStream::new(self.schema.clone()))) } } - fn free_all_memory(&self) -> usize { - let used = self.metrics.mem_used().set(0); - self.shrink(used); - used - } - fn used(&self) -> usize { self.metrics.mem_used().value() } @@ -235,66 +232,23 @@ impl ExternalSorter { fn spill_count(&self) -> usize { self.metrics.spill_count().value() } -} - -impl Debug for ExternalSorter { - fn fmt(&self, f: &mut Formatter) -> fmt::Result { - f.debug_struct("ExternalSorter") - .field("id", &self.id()) - .field("memory_used", &self.used()) - .field("spilled_bytes", &self.spilled_bytes()) - .field("spill_count", &self.spill_count()) - .finish() - } -} -impl Drop for ExternalSorter { - fn drop(&mut self) { - self.runtime.drop_consumer(self.id(), self.used()); - } -} - -#[async_trait] -impl MemoryConsumer for ExternalSorter { - fn name(&self) -> String { - "ExternalSorter".to_owned() - } - - fn id(&self) -> &MemoryConsumerId { - &self.id - } - - fn memory_manager(&self) -> Arc { - self.runtime.memory_manager.clone() - } - - fn type_(&self) -> &ConsumerType { - &ConsumerType::Requesting - } - - async fn spill(&self) -> Result { - let partition = self.partition_id(); - let mut in_mem_batches = self.in_mem_batches.lock().await; + async fn spill(&mut self) -> Result { // we could always get a chance to free some memory as long as we are holding some - if in_mem_batches.len() == 0 { + if self.in_mem_batches.is_empty() { return Ok(0); } - debug!( - "{}[{}] spilling sort data of {} to disk while inserting ({} time(s) so far)", - self.name(), - self.id(), - self.used(), - self.spill_count() - ); + debug!("Spilling sort data of ExternalSorter to disk whilst inserting"); - let tracking_metrics = self - .metrics_set - .new_intermediate_tracking(partition, self.runtime.clone()); + let tracking_metrics = self.metrics_set.new_intermediate_tracking( + self.partition_id, + self.runtime.memory_manager.as_ref(), + ); let spillfile = self.runtime.disk_manager.create_tmp_file("Sorting")?; let stream = in_mem_partial_sort( - &mut in_mem_batches, + &mut self.in_mem_batches, self.schema.clone(), &self.expr, self.session_config.batch_size(), @@ -304,15 +258,21 @@ impl MemoryConsumer for ExternalSorter { spill_partial_sorted_stream(&mut stream?, spillfile.path(), self.schema.clone()) .await?; - let mut spills = self.spills.lock().await; + self.allocation.free(); let used = self.metrics.mem_used().set(0); self.metrics.record_spill(used); - spills.push(spillfile); + self.spills.push(spillfile); Ok(used) } +} - fn mem_used(&self) -> usize { - self.metrics.mem_used().value() +impl Debug for ExternalSorter { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("ExternalSorter") + .field("memory_used", &self.used()) + .field("spilled_bytes", &self.spilled_bytes()) + .field("spill_count", &self.spill_count()) + .finish() } } @@ -528,7 +488,7 @@ impl SortedSizedRecordBatchStream { schema: SchemaRef, batches: Vec, sorted_iter: SortedIterator, - metrics: MemTrackingMetrics, + mut metrics: MemTrackingMetrics, ) -> Self { let size = batches.iter().map(batch_byte_size).sum::() + sorted_iter.memory_size(); @@ -911,8 +871,8 @@ async fn do_sort( ); let schema = input.schema(); let tracking_metrics = - metrics_set.new_intermediate_tracking(partition_id, context.runtime_env()); - let sorter = ExternalSorter::new( + metrics_set.new_intermediate_tracking(partition_id, context.memory_manager()); + let mut sorter = ExternalSorter::new( partition_id, schema.clone(), expr, @@ -921,7 +881,6 @@ async fn do_sort( context.runtime_env(), fetch, ); - context.runtime_env().register_requester(sorter.id()); while let Some(batch) = input.next().await { let batch = batch?; sorter.insert_batch(batch, &tracking_metrics).await?; @@ -1005,10 +964,7 @@ mod tests { assert_eq!(c7.value(c7.len() - 1), 254,); assert_eq!( - session_ctx - .runtime_env() - .memory_manager - .get_requester_total(), + session_ctx.runtime_env().memory_manager.allocated(), 0, "The sort should have returned all memory used back to the memory manager" ); @@ -1077,10 +1033,7 @@ mod tests { assert_eq!(c7.value(c7.len() - 1), 254,); assert_eq!( - session_ctx - .runtime_env() - .memory_manager - .get_requester_total(), + session_ctx.runtime_env().memory_manager.allocated(), 0, "The sort should have returned all memory used back to the memory manager" ); @@ -1100,7 +1053,7 @@ mod tests { // all the batches we are processing, we expect it to spill. (None, true), // When we have a limit however, the buffered size of batches should fit in memory - // since it is much lover than the total size of the input batch. + // since it is much lower than the total size of the input batch. (Some(1), false), ]; @@ -1331,10 +1284,7 @@ mod tests { assert_strong_count_converges_to_zero(refs).await; assert_eq!( - session_ctx - .runtime_env() - .memory_manager - .get_requester_total(), + session_ctx.runtime_env().memory_manager.allocated(), 0, "The sort should have returned all memory used back to the memory manager" ); diff --git a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs index 212c4c955b32..d733f73a84a2 100644 --- a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs +++ b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs @@ -169,7 +169,8 @@ impl ExecutionPlan for SortPreservingMergeExec { ))); } - let tracking_metrics = MemTrackingMetrics::new(&self.metrics, partition); + let tracking_metrics = + MemTrackingMetrics::new(&self.metrics, context.memory_manager(), partition); let input_partitions = self.input.output_partitioning().partition_count(); debug!( @@ -342,7 +343,7 @@ impl SortPreservingMergeStream { streams: Vec, schema: SchemaRef, expressions: &[PhysicalSortExpr], - tracking_metrics: MemTrackingMetrics, + mut tracking_metrics: MemTrackingMetrics, batch_size: usize, ) -> Result { let stream_count = streams.len(); @@ -1258,7 +1259,8 @@ mod tests { } let metrics = ExecutionPlanMetricsSet::new(); - let tracking_metrics = MemTrackingMetrics::new(&metrics, 0); + let tracking_metrics = + MemTrackingMetrics::new(&metrics, task_ctx.memory_manager(), 0); let merge_stream = SortPreservingMergeStream::new_from_streams( streams, diff --git a/datafusion/core/tests/memory_limit.rs b/datafusion/core/tests/memory_limit.rs index 20ad555d66e1..91d66e884623 100644 --- a/datafusion/core/tests/memory_limit.rs +++ b/datafusion/core/tests/memory_limit.rs @@ -19,14 +19,13 @@ use std::sync::Arc; -use arrow::record_batch::RecordBatch; use datafusion::datasource::MemTable; use datafusion::execution::disk_manager::DiskManagerConfig; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion_common::assert_contains; use datafusion::prelude::{SessionConfig, SessionContext}; -use test_utils::{stagger_batch, AccessLogGenerator}; +use test_utils::AccessLogGenerator; #[cfg(test)] #[ctor::ctor] @@ -39,6 +38,7 @@ async fn oom_sort() { run_limit_test( "select * from t order by host DESC", "Resources exhausted: Memory Exhausted while Sorting (DiskManager is disabled)", + 200_000, ) .await } @@ -47,7 +47,8 @@ async fn oom_sort() { async fn group_by_none() { run_limit_test( "select median(image) from t", - "Resources exhausted: Cannot spill GroupBy None Accumulators", + "Resources exhausted: Failed to allocate additional", + 20_000, ) .await } @@ -56,7 +57,8 @@ async fn group_by_none() { async fn group_by_row_hash() { run_limit_test( "select count(*) from t GROUP BY response_bytes", - "Resources exhausted: Cannot spill GroupBy Hash (Row) AggregationState", + "Resources exhausted: Failed to allocate additional", + 2_000, ) .await } @@ -66,23 +68,21 @@ async fn group_by_hash() { run_limit_test( // group by dict column "select count(*) from t GROUP BY service, host, pod, container", - "Resources exhausted: Cannot spill GroupBy Hash Accumulators", + "Resources exhausted: Failed to allocate additional", + 1_000, ) .await } /// 50 byte memory limit -const MEMORY_LIMIT_BYTES: usize = 50; const MEMORY_FRACTION: f64 = 0.95; /// runs the specified query against 1000 rows with a 50 /// byte memory limit and no disk manager enabled. -async fn run_limit_test(query: &str, expected_error: &str) { - let generator = AccessLogGenerator::new().with_row_limit(Some(1000)); - - let batches: Vec = generator - // split up into more than one batch, as the size limit in sort is not enforced until the second batch - .flat_map(stagger_batch) +async fn run_limit_test(query: &str, expected_error: &str, memory_limit: usize) { + let batches: Vec<_> = AccessLogGenerator::new() + .with_row_limit(1000) + .with_max_batch_size(50) .collect(); let table = MemTable::try_new(batches[0].schema(), vec![batches]).unwrap(); @@ -91,7 +91,7 @@ async fn run_limit_test(query: &str, expected_error: &str) { // do not allow spilling .with_disk_manager(DiskManagerConfig::Disabled) // Only allow 50 bytes - .with_memory_limit(MEMORY_LIMIT_BYTES, MEMORY_FRACTION); + .with_memory_limit(memory_limit, MEMORY_FRACTION); let runtime = RuntimeEnv::new(rt_config).unwrap(); diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index cc700d5d2cb7..eb1f3d6de3b6 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -31,18 +31,18 @@ use datafusion::physical_plan::{collect, ExecutionPlan}; use datafusion::prelude::{SessionConfig, SessionContext}; use rand::Rng; use std::sync::Arc; -use test_utils::{batches_to_vec, partitions_to_sorted_vec, stagger_batch_with_seed}; +use test_utils::{batches_to_vec, partitions_to_sorted_vec}; #[tokio::test] #[cfg_attr(tarpaulin, ignore)] async fn test_sort_1k_mem() { - run_sort(1024, vec![(5, false), (2000, true), (1000000, true)]).await + run_sort(10240, vec![(5, false), (20000, true), (1000000, true)]).await } #[tokio::test] #[cfg_attr(tarpaulin, ignore)] async fn test_sort_100k_mem() { - run_sort(102400, vec![(5, false), (2000, false), (1000000, true)]).await + run_sort(102400, vec![(5, false), (20000, false), (1000000, true)]).await } #[tokio::test] @@ -95,10 +95,7 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { } assert_eq!( - session_ctx - .runtime_env() - .memory_manager - .get_requester_total(), + session_ctx.runtime_env().memory_manager.allocated(), 0, "The sort should have returned all memory used back to the memory manager" ); @@ -110,12 +107,23 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { /// with randomized i32 content fn make_staggered_batches(len: usize) -> Vec { let mut rng = rand::thread_rng(); - let mut input: Vec = vec![0; len]; - rng.fill(&mut input[..]); - let input = Int32Array::from_iter_values(input.into_iter()); - - // split into several record batches - let batch = - RecordBatch::try_from_iter(vec![("x", Arc::new(input) as ArrayRef)]).unwrap(); - stagger_batch_with_seed(batch, 42) + let max_batch = 1024; + + let mut batches = vec![]; + let mut remaining = len; + while remaining != 0 { + let to_read = rng.gen_range(0..=remaining.min(max_batch)); + remaining -= to_read; + + batches.push( + RecordBatch::try_from_iter(vec![( + "x", + Arc::new(Int32Array::from_iter_values( + std::iter::from_fn(|| Some(rng.gen())).take(to_read), + )) as ArrayRef, + )]) + .unwrap(), + ) + } + batches } diff --git a/datafusion/core/tests/provider_filter_pushdown.rs b/datafusion/core/tests/provider_filter_pushdown.rs index 7276820f6f59..1ed3a5913609 100644 --- a/datafusion/core/tests/provider_filter_pushdown.rs +++ b/datafusion/core/tests/provider_filter_pushdown.rs @@ -90,10 +90,11 @@ impl ExecutionPlan for CustomPlan { fn execute( &self, partition: usize, - _context: Arc, + context: Arc, ) -> Result { let metrics = ExecutionPlanMetricsSet::new(); - let tracking_metrics = MemTrackingMetrics::new(&metrics, partition); + let tracking_metrics = + MemTrackingMetrics::new(&metrics, context.memory_manager(), partition); Ok(Box::pin(SizedRecordBatchStream::new( self.schema(), self.batches.clone(), diff --git a/test-utils/src/data_gen.rs b/test-utils/src/data_gen.rs index c82d56ef21f9..19db65400a17 100644 --- a/test-utils/src/data_gen.rs +++ b/test-utils/src/data_gen.rs @@ -78,6 +78,13 @@ impl BatchBuilder { ])) } + fn is_finished(&self) -> bool { + self.row_limit + .as_ref() + .map(|x| *x <= self.row_count) + .unwrap_or_default() + } + fn append(&mut self, rng: &mut StdRng, host: &str, service: &str) { let num_pods = rng.gen_range(1..15); let pods = generate_sorted_strings(rng, num_pods, 30..40); @@ -91,6 +98,10 @@ impl BatchBuilder { let num_entries = rng.gen_range(1024..8192); for i in 0..num_entries { + if self.is_finished() { + return; + } + let time = i as i64 * 1024; self.append_row(rng, host, &pod, service, &container, &image, time); } @@ -109,12 +120,6 @@ impl BatchBuilder { image: &str, time: i64, ) { - // skip if over limit - if let Some(limit) = self.row_limit { - if self.row_count >= limit { - return; - } - } self.row_count += 1; let methods = &["GET", "PUT", "POST", "HEAD", "PATCH", "DELETE"]; @@ -237,7 +242,9 @@ pub struct AccessLogGenerator { rng: StdRng, host_idx: usize, /// optional number of rows produced - row_limit: Option, + row_limit: usize, + /// maximum rows per batch + max_batch_size: usize, /// How many rows have been returned so far row_count: usize, } @@ -259,7 +266,8 @@ impl AccessLogGenerator { schema: BatchBuilder::schema(), host_idx: 0, rng: StdRng::from_seed(seed), - row_limit: None, + row_limit: usize::MAX, + max_batch_size: usize::MAX, row_count: 0, } } @@ -269,8 +277,14 @@ impl AccessLogGenerator { self.schema.clone() } + /// Limit the maximum batch size + pub fn with_max_batch_size(mut self, batch_size: usize) -> Self { + self.max_batch_size = batch_size; + self + } + /// Return up to row_limit rows; - pub fn with_row_limit(mut self, row_limit: Option) -> Self { + pub fn with_row_limit(mut self, row_limit: usize) -> Self { self.row_limit = row_limit; self } @@ -280,15 +294,13 @@ impl Iterator for AccessLogGenerator { type Item = RecordBatch; fn next(&mut self) -> Option { - // if we have a limit and have passed it, stop generating - if let Some(limit) = self.row_limit { - if self.row_count >= limit { - return None; - } + if self.row_count == self.row_limit { + return None; } - let mut builder = BatchBuilder::default() - .with_row_limit(self.row_limit.map(|limit| limit - self.row_count)); + let mut builder = BatchBuilder::default().with_row_limit(Some( + self.max_batch_size.min(self.row_limit - self.row_count), + )); let host = format!( "i-{:016x}.ec2.internal", @@ -300,19 +312,14 @@ impl Iterator for AccessLogGenerator { if self.rng.gen_bool(0.5) { continue; } + if builder.is_finished() { + break; + } builder.append(&mut self.rng, &host, service); } let batch = builder.finish(Arc::clone(&self.schema)); - // limit batch if needed to stay under row limit - let batch = if let Some(limit) = self.row_limit { - let num_rows = limit - self.row_count; - batch.slice(0, num_rows) - } else { - batch - }; - self.row_count += batch.num_rows(); Some(batch) } From 34c4318e0beee13744801d384a5f669a2ab5a998 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 6 Dec 2022 10:52:28 +0000 Subject: [PATCH 02/11] Fix tests --- datafusion/core/tests/parquet/filter_pushdown.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/parquet/filter_pushdown.rs b/datafusion/core/tests/parquet/filter_pushdown.rs index 999becafd0a5..fc74d7ded7e7 100644 --- a/datafusion/core/tests/parquet/filter_pushdown.rs +++ b/datafusion/core/tests/parquet/filter_pushdown.rs @@ -57,7 +57,7 @@ async fn single_file() { let tempdir = TempDir::new().unwrap(); - let generator = AccessLogGenerator::new().with_row_limit(Some(NUM_ROWS)); + let generator = AccessLogGenerator::new().with_row_limit(NUM_ROWS); // default properties let props = WriterProperties::builder().build(); @@ -236,7 +236,7 @@ async fn single_file() { async fn single_file_small_data_pages() { let tempdir = TempDir::new().unwrap(); - let generator = AccessLogGenerator::new().with_row_limit(Some(NUM_ROWS)); + let generator = AccessLogGenerator::new().with_row_limit(NUM_ROWS); // set the max page rows with arbitrary sizes 8311 to increase // effectiveness of page filtering From 21dc95d54245ffeac3d4a528838fd83a2c52a5f1 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 16 Dec 2022 18:01:31 +0000 Subject: [PATCH 03/11] Add MemoryPool abstraction --- .../core/src/execution/memory_manager/mod.rs | 250 ++++++---------- .../core/src/execution/memory_manager/pool.rs | 282 ++++++++++++++++++ datafusion/core/src/execution/runtime_env.rs | 30 +- datafusion/core/src/lib.rs | 1 + .../core/src/physical_plan/aggregates/hash.rs | 2 +- .../physical_plan/aggregates/no_grouping.rs | 2 +- .../src/physical_plan/aggregates/row_hash.rs | 7 +- .../core/src/physical_plan/metrics/tracker.rs | 2 +- .../core/src/physical_plan/sorts/sort.rs | 9 +- datafusion/core/tests/order_spill_fuzz.rs | 2 +- 10 files changed, 403 insertions(+), 184 deletions(-) create mode 100644 datafusion/core/src/execution/memory_manager/pool.rs diff --git a/datafusion/core/src/execution/memory_manager/mod.rs b/datafusion/core/src/execution/memory_manager/mod.rs index 4aef4d2b7617..d1ed5c975541 100644 --- a/datafusion/core/src/execution/memory_manager/mod.rs +++ b/datafusion/core/src/execution/memory_manager/mod.rs @@ -17,164 +17,126 @@ //! Manages all available memory during query execution -use crate::error::{DataFusionError, Result}; -use log::debug; -use std::sync::atomic::{AtomicUsize, Ordering}; +use crate::error::Result; use std::sync::Arc; +mod pool; pub mod proxy; -#[derive(Debug, Clone)] -/// Configuration information for memory management -pub enum MemoryManagerConfig { - /// Use the existing [MemoryManager] - Existing(Arc), - - /// Create a new [MemoryManager] that will use up to some - /// fraction of total system memory. - New { - /// Max execution memory allowed for DataFusion. Defaults to - /// `usize::MAX`, which will not attempt to limit the memory - /// used during plan execution. - max_memory: usize, - - /// The fraction of `max_memory` that the memory manager will - /// use for execution. - /// - /// The purpose of this config is to set aside memory for - /// untracked data structures, and imprecise size estimation - /// during memory acquisition. Defaults to 0.7 - memory_fraction: f64, - }, -} +pub use pool::*; -impl Default for MemoryManagerConfig { - fn default() -> Self { - Self::New { - max_memory: usize::MAX, - memory_fraction: 0.7, - } - } -} +/// The pool of memory from which [`MemoryManager`] and [`TrackedAllocation`] allocate +pub trait MemoryPool: Send + Sync + std::fmt::Debug { + /// Records the creation of a new [`TrackedAllocation`] with [`AllocationOptions`] + fn allocate(&self, _options: &AllocationOptions) {} -impl MemoryManagerConfig { - /// Create a new memory [MemoryManager] with no limit on the - /// memory used - pub fn new() -> Self { - Default::default() - } + /// Records the destruction of a [`TrackedAllocation`] with [`AllocationOptions`] + fn free(&self, _options: &AllocationOptions) {} - /// Create a configuration based on an existing [MemoryManager] - pub fn new_existing(existing: Arc) -> Self { - Self::Existing(existing) - } + /// Infallibly grow the provided `allocation` by `additional` bytes + /// + /// This must always succeed + fn grow(&self, allocation: &TrackedAllocation, additional: usize); - /// Create a new [MemoryManager] with a `max_memory` and `fraction` - pub fn try_new_limit(max_memory: usize, memory_fraction: f64) -> Result { - if max_memory == 0 { - return Err(DataFusionError::Plan(format!( - "invalid max_memory. Expected greater than 0, got {}", - max_memory - ))); - } - if !(memory_fraction > 0f64 && memory_fraction <= 1f64) { - return Err(DataFusionError::Plan(format!( - "invalid fraction. Expected greater than 0 and less than 1.0, got {}", - memory_fraction - ))); - } + /// Infallibly shrink the provided `allocation` by `shrink` bytes + fn shrink(&self, allocation: &TrackedAllocation, shrink: usize); - Ok(Self::New { - max_memory, - memory_fraction, - }) - } + /// Attempt to grow the provided `allocation` by `additional` bytes + /// + /// On error the `allocation` will not be increased in size + fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()>; + + /// Return the total amount of memory allocated + fn allocated(&self) -> usize; } -/// The memory manager maintains a fixed size pool of memory -/// from which portions can be allocated +/// A cooperative MemoryManager which tracks memory in a cooperative fashion. +/// `ExecutionPlan` nodes such as `SortExec`, which require large amounts of memory +/// register their memory requests with the MemoryManager which then tracks the total +/// memory that has been allocated across all such nodes. +/// +/// The associated [`MemoryPool`] determines how to respond to memory allocation +/// requests, and any associated fairness control #[derive(Debug)] pub struct MemoryManager { - state: Arc, + pool: Arc, } impl MemoryManager { /// Create new memory manager based on the configuration - pub fn new(config: MemoryManagerConfig) -> Arc { - match config { - MemoryManagerConfig::Existing(manager) => manager, - MemoryManagerConfig::New { - max_memory, - memory_fraction, - } => { - let pool_size = (max_memory as f64 * memory_fraction) as usize; - debug!( - "Creating memory manager with initial size {}", - human_readable_size(pool_size) - ); - - Arc::new(Self { - state: Arc::new(MemoryManagerState { - pool_size, - used: AtomicUsize::new(0), - }), - }) - } - } - } - - /// Returns the maximum pool size - /// - /// Note: this can be less than the amount allocated as a result of [`MemoryManager::allocate`] - pub fn pool_size(&self) -> usize { - self.state.pool_size + pub fn new(pool: Arc) -> Self { + Self { pool } } /// Returns the number of allocated bytes /// /// Note: this can exceed the pool size as a result of [`MemoryManager::allocate`] pub fn allocated(&self) -> usize { - self.state.used.load(Ordering::Relaxed) + self.pool.allocated() } /// Returns a new empty allocation identified by `name` - pub fn new_tracked_allocation(&self, name: String) -> TrackedAllocation { - TrackedAllocation::new_empty(name, Arc::clone(&self.state)) + pub fn new_allocation(&self, name: String) -> TrackedAllocation { + self.new_allocation_with_options(AllocationOptions::new(name)) } -} -impl std::fmt::Display for MemoryManager { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!( - f, - "MemoryManager(capacity: {}, allocated: {})", - human_readable_size(self.state.pool_size), - human_readable_size(self.allocated()), - ) + /// Returns a new empty allocation with the provided [`AllocationOptions`] + pub fn new_allocation_with_options( + &self, + options: AllocationOptions, + ) -> TrackedAllocation { + TrackedAllocation::new_empty(options, Arc::clone(&self.pool)) } } +/// Options associated with a [`TrackedAllocation`] #[derive(Debug)] -struct MemoryManagerState { - pool_size: usize, - used: AtomicUsize, +pub struct AllocationOptions { + name: String, + can_spill: bool, +} + +impl AllocationOptions { + /// Create a new [`AllocationOptions`] + pub fn new(name: String) -> Self { + Self { + name, + can_spill: false, + } + } + + /// Set whether this allocation can be spilled to disk + pub fn with_can_spill(self, can_spill: bool) -> Self { + Self { can_spill, ..self } + } + + /// Returns true if this allocation can spill to disk + pub fn can_spill(&self) -> bool { + self.can_spill + } + + /// Returns the name associated with this allocation + pub fn name(&self) -> &str { + &self.name + } } /// A [`TrackedAllocation`] tracks a reservation of memory in a [`MemoryManager`] /// that is freed back to the memory pool on drop #[derive(Debug)] pub struct TrackedAllocation { - name: String, + options: AllocationOptions, size: usize, - state: Arc, + policy: Arc, } impl TrackedAllocation { - fn new_empty(name: String, state: Arc) -> Self { + fn new_empty(options: AllocationOptions, policy: Arc) -> Self { + policy.allocate(&options); Self { - name, + options, size: 0, - state, + policy, } } @@ -183,6 +145,11 @@ impl TrackedAllocation { self.size } + /// Returns this allocations [`AllocationOptions`] + pub fn options(&self) -> &AllocationOptions { + &self.options + } + /// Frees all bytes from this allocation returning the number of bytes freed pub fn free(&mut self) -> usize { let size = self.size; @@ -199,7 +166,7 @@ impl TrackedAllocation { /// Panics if `capacity` exceeds [`Self::size`] pub fn shrink(&mut self, capacity: usize) { let new_size = self.size.checked_sub(capacity).unwrap(); - self.state.used.fetch_sub(capacity, Ordering::SeqCst); + self.policy.shrink(self, capacity); self.size = new_size } @@ -215,20 +182,13 @@ impl TrackedAllocation { /// Increase the size of this by `capacity` bytes pub fn grow(&mut self, capacity: usize) { - self.state.used.fetch_add(capacity, Ordering::SeqCst); + self.policy.grow(self, capacity); self.size += capacity; } /// Try to increase the size of this [`TrackedAllocation`] by `capacity` bytes pub fn try_grow(&mut self, capacity: usize) -> Result<()> { - let pool_size = self.state.pool_size; - self.state - .used - .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |used| { - let new_used = used + capacity; - (new_used <= pool_size).then_some(new_used) - }) - .map_err(|used| DataFusionError::ResourcesExhausted(format!("Failed to allocate additional {} bytes for {} with {} bytes already allocated - maximum available is {}", capacity, self.name, self.size, used)))?; + self.policy.try_grow(self, capacity)?; self.size += capacity; Ok(()) } @@ -237,6 +197,7 @@ impl TrackedAllocation { impl Drop for TrackedAllocation { fn drop(&mut self) { self.free(); + self.policy.free(&self.options); } } @@ -268,44 +229,11 @@ pub fn human_readable_size(size: usize) -> String { mod tests { use super::*; - #[test] - #[should_panic(expected = "invalid max_memory. Expected greater than 0, got 0")] - fn test_try_new_with_limit_0() { - MemoryManagerConfig::try_new_limit(0, 1.0).unwrap(); - } - - #[test] - #[should_panic( - expected = "invalid fraction. Expected greater than 0 and less than 1.0, got -9.6" - )] - fn test_try_new_with_limit_neg_fraction() { - MemoryManagerConfig::try_new_limit(100, -9.6).unwrap(); - } - - #[test] - #[should_panic( - expected = "invalid fraction. Expected greater than 0 and less than 1.0, got 9.6" - )] - fn test_try_new_with_limit_too_large() { - MemoryManagerConfig::try_new_limit(100, 9.6).unwrap(); - } - - #[test] - fn test_try_new_with_limit_pool_size() { - let config = MemoryManagerConfig::try_new_limit(100, 0.5).unwrap(); - let manager = MemoryManager::new(config); - assert_eq!(manager.pool_size(), 50); - - let config = MemoryManagerConfig::try_new_limit(100000, 0.1).unwrap(); - let manager = MemoryManager::new(config); - assert_eq!(manager.pool_size(), 10000); - } - #[test] fn test_memory_manager_underflow() { - let config = MemoryManagerConfig::try_new_limit(100, 0.5).unwrap(); - let manager = MemoryManager::new(config); - let mut a1 = manager.new_tracked_allocation("a1".to_string()); + let policy = Arc::new(GreedyMemoryPool::new(50)); + let manager = MemoryManager::new(policy); + let mut a1 = manager.new_allocation("a1".to_string()); assert_eq!(manager.allocated(), 0); a1.grow(100); @@ -320,7 +248,7 @@ mod tests { a1.try_grow(30).unwrap(); assert_eq!(manager.allocated(), 30); - let mut a2 = manager.new_tracked_allocation("a2".to_string()); + let mut a2 = manager.new_allocation("a2".to_string()); a2.try_grow(25).unwrap_err(); assert_eq!(manager.allocated(), 30); diff --git a/datafusion/core/src/execution/memory_manager/pool.rs b/datafusion/core/src/execution/memory_manager/pool.rs new file mode 100644 index 000000000000..c742253d19ed --- /dev/null +++ b/datafusion/core/src/execution/memory_manager/pool.rs @@ -0,0 +1,282 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::execution::memory_manager::{AllocationOptions, MemoryPool}; +use crate::execution::TrackedAllocation; +use datafusion_common::{DataFusionError, Result}; +use parking_lot::Mutex; +use std::sync::atomic::{AtomicUsize, Ordering}; + +/// A [`MemoryPool`] that enforces no limit +#[derive(Debug, Default)] +pub struct UnboundedMemoryPool { + used: AtomicUsize, +} + +impl MemoryPool for UnboundedMemoryPool { + fn grow(&self, _allocation: &TrackedAllocation, additional: usize) { + self.used.fetch_add(additional, Ordering::SeqCst); + } + + fn shrink(&self, _allocation: &TrackedAllocation, shrink: usize) { + self.used.fetch_sub(shrink, Ordering::SeqCst); + } + + fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()> { + self.grow(allocation, additional); + Ok(()) + } + + fn allocated(&self) -> usize { + self.used.load(Ordering::Relaxed) + } +} + +/// A [`MemoryPool`] that implements a greedy first-come first-serve limit +#[derive(Debug)] +pub struct GreedyMemoryPool { + pool_size: usize, + used: AtomicUsize, +} + +impl GreedyMemoryPool { + /// Allocate up to `limit` bytes + pub fn new(pool_size: usize) -> Self { + Self { + pool_size, + used: AtomicUsize::new(0), + } + } +} + +impl MemoryPool for GreedyMemoryPool { + fn grow(&self, _allocation: &TrackedAllocation, additional: usize) { + self.used.fetch_add(additional, Ordering::Relaxed); + } + + fn shrink(&self, _allocation: &TrackedAllocation, shrink: usize) { + self.used.fetch_sub(shrink, Ordering::Relaxed); + } + + fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()> { + self.used + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |used| { + let new_used = used + additional; + (new_used <= self.pool_size).then_some(new_used) + }) + .map_err(|used| { + insufficient_capacity_err(allocation, additional, self.pool_size - used) + })?; + Ok(()) + } + + fn allocated(&self) -> usize { + self.used.load(Ordering::Relaxed) + } +} + +/// A [`MemoryPool`] that prevents spillable allocations from using more than +/// an even fraction of the available memory sans any unspillable allocations +/// (i.e. `(pool_size - unspillable_memory) / num_spillable_allocations`) +/// +/// ┌───────────────────────z──────────────────────z───────────────┐ +/// │ z z │ +/// │ z z │ +/// │ Spillable z Unspillable z Free │ +/// │ Memory z Memory z Memory │ +/// │ z z │ +/// │ z z │ +/// └───────────────────────z──────────────────────z───────────────┘ +/// +/// Unspillable memory is allocated in a first-come, first-serve fashion +#[derive(Debug)] +pub struct FairSpillPool { + /// The total memory limit + pool_size: usize, + + state: Mutex, +} + +#[derive(Debug)] +struct FairSpillPoolState { + /// The number of allocations that can spill + num_spill: usize, + + /// The total amount of memory allocated that can be spilled + spillable: usize, + + /// The total amount of memory allocated by consumers that cannot spill + unspillable: usize, +} + +impl FairSpillPool { + /// Allocate up to `limit` bytes + pub fn new(pool_size: usize) -> Self { + Self { + pool_size, + state: Mutex::new(FairSpillPoolState { + num_spill: 0, + spillable: 0, + unspillable: 0, + }), + } + } +} + +impl MemoryPool for FairSpillPool { + fn allocate(&self, options: &AllocationOptions) { + if options.can_spill { + self.state.lock().num_spill += 1; + } + } + + fn free(&self, options: &AllocationOptions) { + if options.can_spill { + self.state.lock().num_spill -= 1; + } + } + + fn grow(&self, allocation: &TrackedAllocation, additional: usize) { + let mut state = self.state.lock(); + match allocation.options.can_spill { + true => state.spillable += additional, + false => state.unspillable += additional, + } + } + + fn shrink(&self, allocation: &TrackedAllocation, shrink: usize) { + let mut state = self.state.lock(); + match allocation.options.can_spill { + true => state.spillable -= shrink, + false => state.unspillable -= shrink, + } + } + + fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()> { + let mut state = self.state.lock(); + + match allocation.options.can_spill { + true => { + // The total amount of memory available to spilling consumers + let spill_available = self.pool_size.saturating_sub(state.unspillable); + + // No spiller may use more than their fraction of the memory available + let available = spill_available + .checked_div(state.num_spill) + .unwrap_or(spill_available); + + if allocation.size + additional > available { + return Err(insufficient_capacity_err( + allocation, additional, available, + )); + } + state.spillable += additional; + } + false => { + let available = self + .pool_size + .saturating_sub(state.unspillable + state.unspillable); + + if available < additional { + return Err(insufficient_capacity_err( + allocation, additional, available, + )); + } + state.unspillable += additional; + } + } + Ok(()) + } + + fn allocated(&self) -> usize { + let state = self.state.lock(); + state.spillable + state.unspillable + } +} + +fn insufficient_capacity_err( + allocation: &TrackedAllocation, + additional: usize, + available: usize, +) -> DataFusionError { + DataFusionError::ResourcesExhausted(format!("Failed to allocate additional {} bytes for {} with {} bytes already allocated - maximum available is {}", additional, allocation.options.name, allocation.size, available)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::execution::memory_manager::AllocationOptions; + use crate::execution::MemoryManager; + use std::sync::Arc; + + #[test] + fn test_fair() { + let manager = MemoryManager::new(Arc::new(FairSpillPool::new(100))); + + let mut a1 = manager.new_allocation("unspillable".to_string()); + // Can grow beyond capacity of pool + a1.grow(2000); + assert_eq!(manager.allocated(), 2000); + + let options = AllocationOptions::new("s1".to_string()).with_can_spill(true); + let mut a2 = manager.new_allocation_with_options(options); + // Can grow beyond capacity of pool + a2.grow(2000); + + assert_eq!(manager.allocated(), 4000); + + let err = a2.try_grow(1).unwrap_err().to_string(); + assert_eq!(err, "Resources exhausted: Failed to allocate additional 1 bytes for s1 with 2000 bytes already allocated - maximum available is 0"); + + let err = a2.try_grow(1).unwrap_err().to_string(); + assert_eq!(err, "Resources exhausted: Failed to allocate additional 1 bytes for s1 with 2000 bytes already allocated - maximum available is 0"); + + a1.shrink(1990); + a2.shrink(2000); + + assert_eq!(manager.allocated(), 10); + + a1.try_grow(10).unwrap(); + assert_eq!(manager.allocated(), 20); + + // Can grow a2 to 80 as only spilling consumer + a2.try_grow(80).unwrap(); + assert_eq!(manager.allocated(), 100); + + a2.shrink(70); + + assert_eq!(a1.size(), 20); + assert_eq!(a2.size(), 10); + assert_eq!(manager.allocated(), 30); + + let options = AllocationOptions::new("s2".to_string()).with_can_spill(true); + let mut a3 = manager.new_allocation_with_options(options); + + let err = a3.try_grow(70).unwrap_err().to_string(); + assert_eq!(err, "Resources exhausted: Failed to allocate additional 70 bytes for s2 with 0 bytes already allocated - maximum available is 40"); + + //Shrinking a2 to zero doesn't allow a3 to allocate more than 45 + a2.free(); + let err = a3.try_grow(70).unwrap_err().to_string(); + assert_eq!(err, "Resources exhausted: Failed to allocate additional 70 bytes for s2 with 0 bytes already allocated - maximum available is 40"); + + // But dropping a2 does + drop(a2); + assert_eq!(manager.allocated(), 20); + a3.try_grow(80).unwrap(); + } +} diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs index c48de86efea1..cc90d11592c6 100644 --- a/datafusion/core/src/execution/runtime_env.rs +++ b/datafusion/core/src/execution/runtime_env.rs @@ -27,7 +27,9 @@ use std::collections::HashMap; use crate::datasource::datasource::TableProviderFactory; use crate::datasource::listing_table_factory::ListingTableFactory; use crate::datasource::object_store::ObjectStoreRegistry; -use crate::execution::memory_manager::MemoryManagerConfig; +use crate::execution::memory_manager::{ + GreedyMemoryPool, MemoryPool, UnboundedMemoryPool, +}; use crate::execution::MemoryManager; use datafusion_common::DataFusionError; use object_store::ObjectStore; @@ -59,14 +61,17 @@ impl RuntimeEnv { /// Create env based on configuration pub fn new(config: RuntimeConfig) -> Result { let RuntimeConfig { - memory_manager, + memory_pool, disk_manager, object_store_registry, table_factories, } = config; + let pool = + memory_pool.unwrap_or_else(|| Arc::new(UnboundedMemoryPool::default())); + Ok(Self { - memory_manager: MemoryManager::new(memory_manager), + memory_manager: Arc::new(MemoryManager::new(pool)), disk_manager: DiskManager::try_new(disk_manager)?, object_store_registry, table_factories, @@ -121,8 +126,10 @@ impl Default for RuntimeEnv { pub struct RuntimeConfig { /// DiskManager to manage temporary disk file usage pub disk_manager: DiskManagerConfig, - /// MemoryManager to limit access to memory - pub memory_manager: MemoryManagerConfig, + /// [`MemoryPool`] from which to allocate memory + /// + /// Defaults to using an [`UnboundedMemoryPool`] if `None` + pub memory_pool: Option>, /// ObjectStoreRegistry to get object store based on url pub object_store_registry: Arc, /// Custom table factories for things like deltalake that are not part of core datafusion @@ -151,9 +158,9 @@ impl RuntimeConfig { self } - /// Customize memory manager - pub fn with_memory_manager(mut self, memory_manager: MemoryManagerConfig) -> Self { - self.memory_manager = memory_manager; + /// Customize memory policy + pub fn with_memory_policy(mut self, memory_pool: Arc) -> Self { + self.memory_pool = Some(memory_pool); self } @@ -178,11 +185,12 @@ impl RuntimeConfig { /// Specify the total memory to use while running the DataFusion /// plan to `max_memory * memory_fraction` in bytes. /// + /// This defaults to using [`GreedyMemoryPool`] + /// /// Note DataFusion does not yet respect this limit in all cases. pub fn with_memory_limit(self, max_memory: usize, memory_fraction: f64) -> Self { - self.with_memory_manager( - MemoryManagerConfig::try_new_limit(max_memory, memory_fraction).unwrap(), - ) + let pool_size = (max_memory as f64 * memory_fraction) as usize; + self.with_memory_policy(Arc::new(GreedyMemoryPool::new(pool_size))) } /// Use the specified path to create any needed temporary files diff --git a/datafusion/core/src/lib.rs b/datafusion/core/src/lib.rs index 30bf6dc7eb7d..09b6c6691f72 100644 --- a/datafusion/core/src/lib.rs +++ b/datafusion/core/src/lib.rs @@ -212,6 +212,7 @@ /// DataFusion crate version pub const DATAFUSION_VERSION: &str = env!("CARGO_PKG_VERSION"); +extern crate core; extern crate sqlparser; pub mod avro_to_arrow; diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs index 33e93afeeb3d..16dfdf31fd79 100644 --- a/datafusion/core/src/physical_plan/aggregates/hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -125,7 +125,7 @@ impl GroupedHashAggregateStream { let allocation = context .memory_manager() - .new_tracked_allocation(format!("GroupedHashAggregateStream[{}]", partition)); + .new_allocation(format!("GroupedHashAggregateStream[{}]", partition)); let inner = GroupedHashAggregateStreamInner { schema: Arc::clone(&schema), diff --git a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs index d254f85bd577..00b1e316629d 100644 --- a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs +++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs @@ -74,7 +74,7 @@ impl AggregateStream { let allocation = context .memory_manager() - .new_tracked_allocation(format!("AggregateStream[{}]", partition)); + .new_allocation(format!("AggregateStream[{}]", partition)); let inner = AggregateStreamInner { schema: Arc::clone(&schema), diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 8308c4a4e723..c5be249ee7f2 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -139,10 +139,9 @@ impl GroupedHashAggregateStreamV2 { let aggr_schema = aggr_state_schema(&aggr_expr)?; let aggr_layout = Arc::new(RowLayout::new(&aggr_schema, RowType::WordAligned)); - let allocation = context.memory_manager().new_tracked_allocation(format!( - "GroupedHashAggregateStreamV2[{}]", - partition - )); + let allocation = context + .memory_manager() + .new_allocation(format!("GroupedHashAggregateStreamV2[{}]", partition)); let aggr_state = AggregationState { allocation, diff --git a/datafusion/core/src/physical_plan/metrics/tracker.rs b/datafusion/core/src/physical_plan/metrics/tracker.rs index d0811afd31c4..40be3714f364 100644 --- a/datafusion/core/src/physical_plan/metrics/tracker.rs +++ b/datafusion/core/src/physical_plan/metrics/tracker.rs @@ -48,7 +48,7 @@ impl MemTrackingMetrics { ) -> Self { Self { allocation: memory_manager - .new_tracked_allocation(format!("MemTrackingMetrics[{}]", partition)), + .new_allocation(format!("MemTrackingMetrics[{}]", partition)), metrics: BaselineMetrics::new(metrics, partition), } } diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index f1a7771f966f..53886594b498 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -21,7 +21,7 @@ use crate::error::{DataFusionError, Result}; use crate::execution::context::TaskContext; -use crate::execution::memory_manager::human_readable_size; +use crate::execution::memory_manager::{human_readable_size, AllocationOptions}; use crate::execution::runtime_env::RuntimeEnv; use crate::execution::TrackedAllocation; use crate::physical_plan::common::{batch_byte_size, IPCWriter, SizedRecordBatchStream}; @@ -95,9 +95,10 @@ impl ExternalSorter { fetch: Option, ) -> Self { let metrics = metrics_set.new_intermediate_baseline(partition_id); - let allocation = runtime - .memory_manager - .new_tracked_allocation(format!("ExternalSorter[{}]", partition_id)); + let options = AllocationOptions::new(format!("ExternalSorter[{}]", partition_id)) + .with_can_spill(true); + + let allocation = runtime.memory_manager.new_allocation_with_options(options); Self { schema, diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index eb1f3d6de3b6..ce4384a3cd79 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -76,7 +76,7 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { let exec = MemoryExec::try_new(&input, schema, None).unwrap(); let sort = Arc::new(SortExec::try_new(sort, Arc::new(exec), None).unwrap()); - let runtime_config = RuntimeConfig::new().with_memory_manager( + let runtime_config = RuntimeConfig::new().with_memory_policy( MemoryManagerConfig::try_new_limit(pool_size, 1.0).unwrap(), ); let runtime = Arc::new(RuntimeEnv::new(runtime_config).unwrap()); From e39c09f6001b1bc0df94d18f353e0362f23944bd Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 16 Dec 2022 18:05:03 +0000 Subject: [PATCH 04/11] Misc fixes --- datafusion/core/src/execution/memory_manager/pool.rs | 4 ++-- datafusion/core/tests/order_spill_fuzz.rs | 7 +++---- datafusion/core/tests/sql/create_drop.rs | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/datafusion/core/src/execution/memory_manager/pool.rs b/datafusion/core/src/execution/memory_manager/pool.rs index c742253d19ed..fe7cd37a6371 100644 --- a/datafusion/core/src/execution/memory_manager/pool.rs +++ b/datafusion/core/src/execution/memory_manager/pool.rs @@ -29,11 +29,11 @@ pub struct UnboundedMemoryPool { impl MemoryPool for UnboundedMemoryPool { fn grow(&self, _allocation: &TrackedAllocation, additional: usize) { - self.used.fetch_add(additional, Ordering::SeqCst); + self.used.fetch_add(additional, Ordering::Relaxed); } fn shrink(&self, _allocation: &TrackedAllocation, shrink: usize) { - self.used.fetch_sub(shrink, Ordering::SeqCst); + self.used.fetch_sub(shrink, Ordering::Relaxed); } fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()> { diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index ce4384a3cd79..9491e7cd1a02 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -22,7 +22,7 @@ use arrow::{ compute::SortOptions, record_batch::RecordBatch, }; -use datafusion::execution::memory_manager::MemoryManagerConfig; +use datafusion::execution::memory_manager::GreedyMemoryPool; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion::physical_plan::expressions::{col, PhysicalSortExpr}; use datafusion::physical_plan::memory::MemoryExec; @@ -76,9 +76,8 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { let exec = MemoryExec::try_new(&input, schema, None).unwrap(); let sort = Arc::new(SortExec::try_new(sort, Arc::new(exec), None).unwrap()); - let runtime_config = RuntimeConfig::new().with_memory_policy( - MemoryManagerConfig::try_new_limit(pool_size, 1.0).unwrap(), - ); + let runtime_config = RuntimeConfig::new() + .with_memory_policy(Arc::new(GreedyMemoryPool::new(pool_size))); let runtime = Arc::new(RuntimeEnv::new(runtime_config).unwrap()); let session_ctx = SessionContext::with_config_rt(SessionConfig::new(), runtime); diff --git a/datafusion/core/tests/sql/create_drop.rs b/datafusion/core/tests/sql/create_drop.rs index fa2cbc00ddb8..353e9ae025d6 100644 --- a/datafusion/core/tests/sql/create_drop.rs +++ b/datafusion/core/tests/sql/create_drop.rs @@ -360,7 +360,7 @@ async fn create_partitioned_external_table() -> Result<()> { .read(true) .write(true) .create(true) - .open(&file_path0) + .open(file_path0) .expect("creating temp file") .write_all(data0.as_bytes()) .expect("writing data"); @@ -372,7 +372,7 @@ async fn create_partitioned_external_table() -> Result<()> { .read(true) .write(true) .create(true) - .open(&file_path1) + .open(file_path1) .expect("creating temp file") .write_all(data1.as_bytes()) .expect("writing data"); From 718a94b72a0b65486d26ceb482e9b3393ee3dd5e Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 16 Dec 2022 18:39:57 +0000 Subject: [PATCH 05/11] Remove MemoryManager --- datafusion/core/src/execution/context.rs | 31 ++++---- .../core/src/execution/memory_manager/mod.rs | 77 ++++++------------- .../core/src/execution/memory_manager/pool.rs | 28 +++---- datafusion/core/src/execution/mod.rs | 1 - datafusion/core/src/execution/runtime_env.rs | 7 +- .../core/src/physical_plan/aggregates/hash.rs | 7 +- .../physical_plan/aggregates/no_grouping.rs | 7 +- .../src/physical_plan/aggregates/row_hash.rs | 7 +- datafusion/core/src/physical_plan/explain.rs | 2 +- .../src/physical_plan/metrics/composite.rs | 10 +-- .../core/src/physical_plan/metrics/tracker.rs | 10 +-- .../core/src/physical_plan/sorts/sort.rs | 44 +++++------ .../sorts/sort_preserving_merge.rs | 4 +- datafusion/core/tests/order_spill_fuzz.rs | 4 +- .../core/tests/provider_filter_pushdown.rs | 2 +- 15 files changed, 107 insertions(+), 134 deletions(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index b9eb904b6a1a..0ef531282cb5 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -78,7 +78,7 @@ use crate::config::{ ConfigOptions, OPT_BATCH_SIZE, OPT_COALESCE_BATCHES, OPT_COALESCE_TARGET_BATCH_SIZE, OPT_FILTER_NULL_JOIN_KEYS, OPT_OPTIMIZER_MAX_PASSES, OPT_OPTIMIZER_SKIP_FAILED_RULES, }; -use crate::execution::{runtime_env::RuntimeEnv, FunctionRegistry, MemoryManager}; +use crate::execution::{runtime_env::RuntimeEnv, FunctionRegistry}; use crate::physical_optimizer::enforcement::BasicEnforcement; use crate::physical_plan::file_format::{plan_to_csv, plan_to_json, plan_to_parquet}; use crate::physical_plan::planner::DefaultPhysicalPlanner; @@ -99,6 +99,7 @@ use url::Url; use crate::catalog::listing_schema::ListingSchemaProvider; use crate::datasource::object_store::ObjectStoreUrl; +use crate::execution::memory_manager::MemoryPool; use uuid::Uuid; use super::options::{ @@ -1961,9 +1962,9 @@ impl TaskContext { self.task_id.clone() } - /// Return the [`MemoryManager`] associated with this [TaskContext] - pub fn memory_manager(&self) -> &Arc { - &self.runtime.memory_manager + /// Return the [`MemoryPool`] associated with this [TaskContext] + pub fn memory_pool(&self) -> &Arc { + &self.runtime.memory_pool } /// Return the [RuntimeEnv] associated with this [TaskContext] @@ -2031,6 +2032,7 @@ mod tests { use super::*; use crate::assert_batches_eq; use crate::execution::context::QueryPlanner; + use crate::execution::memory_manager::TrackedAllocation; use crate::execution::runtime_env::RuntimeConfig; use crate::physical_plan::expressions::AvgAccumulator; use crate::test; @@ -2056,20 +2058,23 @@ mod tests { let ctx1 = SessionContext::new(); // configure with same memory / disk manager - let memory_manager = ctx1.runtime_env().memory_manager.clone(); + let memory_pool = ctx1.runtime_env().memory_pool.clone(); + + let mut allocation = TrackedAllocation::new(&memory_pool, "test".to_string()); + allocation.grow(100); + let disk_manager = ctx1.runtime_env().disk_manager.clone(); let ctx2 = SessionContext::with_config_rt(SessionConfig::new(), ctx1.runtime_env()); - assert!(std::ptr::eq( - Arc::as_ptr(&memory_manager), - Arc::as_ptr(&ctx1.runtime_env().memory_manager) - )); - assert!(std::ptr::eq( - Arc::as_ptr(&memory_manager), - Arc::as_ptr(&ctx2.runtime_env().memory_manager) - )); + assert_eq!(ctx1.runtime_env().memory_pool.allocated(), 100); + assert_eq!(ctx2.runtime_env().memory_pool.allocated(), 100); + + drop(allocation); + + assert_eq!(ctx1.runtime_env().memory_pool.allocated(), 0); + assert_eq!(ctx2.runtime_env().memory_pool.allocated(), 0); assert!(std::ptr::eq( Arc::as_ptr(&disk_manager), diff --git a/datafusion/core/src/execution/memory_manager/mod.rs b/datafusion/core/src/execution/memory_manager/mod.rs index d1ed5c975541..2c6c33724549 100644 --- a/datafusion/core/src/execution/memory_manager/mod.rs +++ b/datafusion/core/src/execution/memory_manager/mod.rs @@ -50,45 +50,6 @@ pub trait MemoryPool: Send + Sync + std::fmt::Debug { fn allocated(&self) -> usize; } -/// A cooperative MemoryManager which tracks memory in a cooperative fashion. -/// `ExecutionPlan` nodes such as `SortExec`, which require large amounts of memory -/// register their memory requests with the MemoryManager which then tracks the total -/// memory that has been allocated across all such nodes. -/// -/// The associated [`MemoryPool`] determines how to respond to memory allocation -/// requests, and any associated fairness control -#[derive(Debug)] -pub struct MemoryManager { - pool: Arc, -} - -impl MemoryManager { - /// Create new memory manager based on the configuration - pub fn new(pool: Arc) -> Self { - Self { pool } - } - - /// Returns the number of allocated bytes - /// - /// Note: this can exceed the pool size as a result of [`MemoryManager::allocate`] - pub fn allocated(&self) -> usize { - self.pool.allocated() - } - - /// Returns a new empty allocation identified by `name` - pub fn new_allocation(&self, name: String) -> TrackedAllocation { - self.new_allocation_with_options(AllocationOptions::new(name)) - } - - /// Returns a new empty allocation with the provided [`AllocationOptions`] - pub fn new_allocation_with_options( - &self, - options: AllocationOptions, - ) -> TrackedAllocation { - TrackedAllocation::new_empty(options, Arc::clone(&self.pool)) - } -} - /// Options associated with a [`TrackedAllocation`] #[derive(Debug)] pub struct AllocationOptions { @@ -131,12 +92,21 @@ pub struct TrackedAllocation { } impl TrackedAllocation { - fn new_empty(options: AllocationOptions, policy: Arc) -> Self { - policy.allocate(&options); + /// Create a new [`TrackedAllocation`] in the provided [`MemoryPool`] + pub fn new(pool: &Arc, name: String) -> Self { + Self::new_with_options(pool, AllocationOptions::new(name)) + } + + /// Create a new [`TrackedAllocation`] in the provided [`MemoryPool`] + pub fn new_with_options( + pool: &Arc, + options: AllocationOptions, + ) -> Self { + pool.allocate(&options); Self { options, size: 0, - policy, + policy: Arc::clone(pool), } } @@ -231,31 +201,30 @@ mod tests { #[test] fn test_memory_manager_underflow() { - let policy = Arc::new(GreedyMemoryPool::new(50)); - let manager = MemoryManager::new(policy); - let mut a1 = manager.new_allocation("a1".to_string()); - assert_eq!(manager.allocated(), 0); + let pool = Arc::new(GreedyMemoryPool::new(50)) as _; + let mut a1 = TrackedAllocation::new(&pool, "a1".to_string()); + assert_eq!(pool.allocated(), 0); a1.grow(100); - assert_eq!(manager.allocated(), 100); + assert_eq!(pool.allocated(), 100); assert_eq!(a1.free(), 100); - assert_eq!(manager.allocated(), 0); + assert_eq!(pool.allocated(), 0); a1.try_grow(100).unwrap_err(); - assert_eq!(manager.allocated(), 0); + assert_eq!(pool.allocated(), 0); a1.try_grow(30).unwrap(); - assert_eq!(manager.allocated(), 30); + assert_eq!(pool.allocated(), 30); - let mut a2 = manager.new_allocation("a2".to_string()); + let mut a2 = TrackedAllocation::new(&pool, "a2".to_string()); a2.try_grow(25).unwrap_err(); - assert_eq!(manager.allocated(), 30); + assert_eq!(pool.allocated(), 30); drop(a1); - assert_eq!(manager.allocated(), 0); + assert_eq!(pool.allocated(), 0); a2.try_grow(25).unwrap(); - assert_eq!(manager.allocated(), 25); + assert_eq!(pool.allocated(), 25); } } diff --git a/datafusion/core/src/execution/memory_manager/pool.rs b/datafusion/core/src/execution/memory_manager/pool.rs index fe7cd37a6371..e1ee2a0e9f18 100644 --- a/datafusion/core/src/execution/memory_manager/pool.rs +++ b/datafusion/core/src/execution/memory_manager/pool.rs @@ -15,8 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::execution::memory_manager::{AllocationOptions, MemoryPool}; -use crate::execution::TrackedAllocation; +use crate::execution::memory_manager::{ + AllocationOptions, MemoryPool, TrackedAllocation, +}; use datafusion_common::{DataFusionError, Result}; use parking_lot::Mutex; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -220,24 +221,23 @@ fn insufficient_capacity_err( mod tests { use super::*; use crate::execution::memory_manager::AllocationOptions; - use crate::execution::MemoryManager; use std::sync::Arc; #[test] fn test_fair() { - let manager = MemoryManager::new(Arc::new(FairSpillPool::new(100))); + let pool = Arc::new(FairSpillPool::new(100)) as _; - let mut a1 = manager.new_allocation("unspillable".to_string()); + let mut a1 = TrackedAllocation::new(&pool, "unspillable".to_string()); // Can grow beyond capacity of pool a1.grow(2000); - assert_eq!(manager.allocated(), 2000); + assert_eq!(pool.allocated(), 2000); let options = AllocationOptions::new("s1".to_string()).with_can_spill(true); - let mut a2 = manager.new_allocation_with_options(options); + let mut a2 = TrackedAllocation::new_with_options(&pool, options); // Can grow beyond capacity of pool a2.grow(2000); - assert_eq!(manager.allocated(), 4000); + assert_eq!(pool.allocated(), 4000); let err = a2.try_grow(1).unwrap_err().to_string(); assert_eq!(err, "Resources exhausted: Failed to allocate additional 1 bytes for s1 with 2000 bytes already allocated - maximum available is 0"); @@ -248,23 +248,23 @@ mod tests { a1.shrink(1990); a2.shrink(2000); - assert_eq!(manager.allocated(), 10); + assert_eq!(pool.allocated(), 10); a1.try_grow(10).unwrap(); - assert_eq!(manager.allocated(), 20); + assert_eq!(pool.allocated(), 20); // Can grow a2 to 80 as only spilling consumer a2.try_grow(80).unwrap(); - assert_eq!(manager.allocated(), 100); + assert_eq!(pool.allocated(), 100); a2.shrink(70); assert_eq!(a1.size(), 20); assert_eq!(a2.size(), 10); - assert_eq!(manager.allocated(), 30); + assert_eq!(pool.allocated(), 30); let options = AllocationOptions::new("s2".to_string()).with_can_spill(true); - let mut a3 = manager.new_allocation_with_options(options); + let mut a3 = TrackedAllocation::new_with_options(&pool, options); let err = a3.try_grow(70).unwrap_err().to_string(); assert_eq!(err, "Resources exhausted: Failed to allocate additional 70 bytes for s2 with 0 bytes already allocated - maximum available is 40"); @@ -276,7 +276,7 @@ mod tests { // But dropping a2 does drop(a2); - assert_eq!(manager.allocated(), 20); + assert_eq!(pool.allocated(), 20); a3.try_grow(80).unwrap(); } } diff --git a/datafusion/core/src/execution/mod.rs b/datafusion/core/src/execution/mod.rs index d827afe46d57..d4ace40378bc 100644 --- a/datafusion/core/src/execution/mod.rs +++ b/datafusion/core/src/execution/mod.rs @@ -48,5 +48,4 @@ pub mod registry; pub mod runtime_env; pub use disk_manager::DiskManager; -pub use memory_manager::{human_readable_size, MemoryManager, TrackedAllocation}; pub use registry::FunctionRegistry; diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs index cc90d11592c6..1370cc37afba 100644 --- a/datafusion/core/src/execution/runtime_env.rs +++ b/datafusion/core/src/execution/runtime_env.rs @@ -30,7 +30,6 @@ use crate::datasource::object_store::ObjectStoreRegistry; use crate::execution::memory_manager::{ GreedyMemoryPool, MemoryPool, UnboundedMemoryPool, }; -use crate::execution::MemoryManager; use datafusion_common::DataFusionError; use object_store::ObjectStore; use std::fmt::{Debug, Formatter}; @@ -42,7 +41,7 @@ use url::Url; /// Execution runtime environment. pub struct RuntimeEnv { /// Runtime memory management - pub memory_manager: Arc, + pub memory_pool: Arc, /// Manage temporary files during query execution pub disk_manager: Arc, /// Object Store Registry @@ -67,11 +66,11 @@ impl RuntimeEnv { table_factories, } = config; - let pool = + let memory_pool = memory_pool.unwrap_or_else(|| Arc::new(UnboundedMemoryPool::default())); Ok(Self { - memory_manager: Arc::new(MemoryManager::new(pool)), + memory_pool, disk_manager: DiskManager::try_new(disk_manager)?, object_store_registry, table_factories, diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs index f0f6af87a2bc..0492ccc2571c 100644 --- a/datafusion/core/src/physical_plan/aggregates/hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -124,9 +124,10 @@ impl GroupedHashAggregateStream { timer.done(); - let allocation = context - .memory_manager() - .new_allocation(format!("GroupedHashAggregateStream[{}]", partition)); + let allocation = TrackedAllocation::new( + context.memory_pool(), + format!("GroupedHashAggregateStream[{}]", partition), + ); let inner = GroupedHashAggregateStreamInner { schema: Arc::clone(&schema), diff --git a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs index 00b1e316629d..3f9568e0d8b0 100644 --- a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs +++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs @@ -72,9 +72,10 @@ impl AggregateStream { let aggregate_expressions = aggregate_expressions(&aggr_expr, &mode, 0)?; let accumulators = create_accumulators(&aggr_expr)?; - let allocation = context - .memory_manager() - .new_allocation(format!("AggregateStream[{}]", partition)); + let allocation = TrackedAllocation::new( + context.memory_pool(), + format!("AggregateStream[{}]", partition), + ); let inner = AggregateStreamInner { schema: Arc::clone(&schema), diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index c5be249ee7f2..4c03b75cef49 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -139,9 +139,10 @@ impl GroupedHashAggregateStreamV2 { let aggr_schema = aggr_state_schema(&aggr_expr)?; let aggr_layout = Arc::new(RowLayout::new(&aggr_schema, RowType::WordAligned)); - let allocation = context - .memory_manager() - .new_allocation(format!("GroupedHashAggregateStreamV2[{}]", partition)); + let allocation = TrackedAllocation::new( + context.memory_pool(), + format!("GroupedHashAggregateStreamV2[{}]", partition), + ); let aggr_state = AggregationState { allocation, diff --git a/datafusion/core/src/physical_plan/explain.rs b/datafusion/core/src/physical_plan/explain.rs index 6689eb5b9d6d..077ed8dcc461 100644 --- a/datafusion/core/src/physical_plan/explain.rs +++ b/datafusion/core/src/physical_plan/explain.rs @@ -153,7 +153,7 @@ impl ExecutionPlan for ExplainExec { let metrics = ExecutionPlanMetricsSet::new(); let tracking_metrics = - MemTrackingMetrics::new(&metrics, context.memory_manager(), partition); + MemTrackingMetrics::new(&metrics, context.memory_pool(), partition); debug!( "Before returning SizedRecordBatch in ExplainExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()); diff --git a/datafusion/core/src/physical_plan/metrics/composite.rs b/datafusion/core/src/physical_plan/metrics/composite.rs index 0e3a8edd3e73..f5f81c16356a 100644 --- a/datafusion/core/src/physical_plan/metrics/composite.rs +++ b/datafusion/core/src/physical_plan/metrics/composite.rs @@ -17,7 +17,7 @@ //! Metrics common for complex operators with multiple steps. -use crate::execution::MemoryManager; +use crate::execution::memory_manager::MemoryPool; use crate::physical_plan::metrics::tracker::MemTrackingMetrics; use crate::physical_plan::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricValue, MetricsSet, Time, @@ -69,18 +69,18 @@ impl CompositeMetricsSet { pub fn new_intermediate_tracking( &self, partition: usize, - memory_manager: &MemoryManager, + pool: &Arc, ) -> MemTrackingMetrics { - MemTrackingMetrics::new(&self.mid, memory_manager, partition) + MemTrackingMetrics::new(&self.mid, pool, partition) } /// create a new final memory tracking metrics pub fn new_final_tracking( &self, partition: usize, - memory_manager: &MemoryManager, + pool: &Arc, ) -> MemTrackingMetrics { - MemTrackingMetrics::new(&self.final_, memory_manager, partition) + MemTrackingMetrics::new(&self.final_, pool, partition) } fn merge_compute_time(&self, dest: &Time) { diff --git a/datafusion/core/src/physical_plan/metrics/tracker.rs b/datafusion/core/src/physical_plan/metrics/tracker.rs index 40be3714f364..4161f7c2beb7 100644 --- a/datafusion/core/src/physical_plan/metrics/tracker.rs +++ b/datafusion/core/src/physical_plan/metrics/tracker.rs @@ -20,10 +20,10 @@ use crate::physical_plan::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, Time, }; +use std::sync::Arc; use std::task::Poll; -use crate::execution::memory_manager::TrackedAllocation; -use crate::execution::MemoryManager; +use crate::execution::memory_manager::{MemoryPool, TrackedAllocation}; use arrow::{error::ArrowError, record_batch::RecordBatch}; /// Simplified version of tracking memory consumer, @@ -43,12 +43,12 @@ impl MemTrackingMetrics { /// Create memory tracking metrics with reference to memory manager pub fn new( metrics: &ExecutionPlanMetricsSet, - memory_manager: &MemoryManager, + pool: &Arc, partition: usize, ) -> Self { + let name = format!("MemTrackingMetrics[{}]", partition); Self { - allocation: memory_manager - .new_allocation(format!("MemTrackingMetrics[{}]", partition)), + allocation: TrackedAllocation::new(pool, name), metrics: BaselineMetrics::new(metrics, partition), } } diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 53886594b498..c2a04a487a19 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -21,9 +21,10 @@ use crate::error::{DataFusionError, Result}; use crate::execution::context::TaskContext; -use crate::execution::memory_manager::{human_readable_size, AllocationOptions}; +use crate::execution::memory_manager::{ + human_readable_size, AllocationOptions, TrackedAllocation, +}; use crate::execution::runtime_env::RuntimeEnv; -use crate::execution::TrackedAllocation; use crate::physical_plan::common::{batch_byte_size, IPCWriter, SizedRecordBatchStream}; use crate::physical_plan::expressions::PhysicalSortExpr; use crate::physical_plan::metrics::{ @@ -98,7 +99,8 @@ impl ExternalSorter { let options = AllocationOptions::new(format!("ExternalSorter[{}]", partition_id)) .with_can_spill(true); - let allocation = runtime.memory_manager.new_allocation_with_options(options); + let allocation = + TrackedAllocation::new_with_options(&runtime.memory_pool, options); Self { schema, @@ -168,10 +170,9 @@ impl ExternalSorter { let batch_size = self.session_config.batch_size(); if self.spilled_before().await { - let tracking_metrics = self.metrics_set.new_intermediate_tracking( - self.partition_id, - self.runtime.memory_manager.as_ref(), - ); + let tracking_metrics = self + .metrics_set + .new_intermediate_tracking(self.partition_id, &self.runtime.memory_pool); let mut streams: Vec = vec![]; if !self.in_mem_batches.is_empty() { let in_mem_stream = in_mem_partial_sort( @@ -190,10 +191,9 @@ impl ExternalSorter { let stream = read_spill_as_stream(spill, self.schema.clone())?; streams.push(SortedStream::new(stream, 0)); } - let tracking_metrics = self.metrics_set.new_final_tracking( - self.partition_id, - self.runtime.memory_manager.as_ref(), - ); + let tracking_metrics = self + .metrics_set + .new_final_tracking(self.partition_id, &self.runtime.memory_pool); Ok(Box::pin(SortPreservingMergeStream::new_from_streams( streams, self.schema.clone(), @@ -202,10 +202,9 @@ impl ExternalSorter { self.session_config.batch_size(), )?)) } else if !self.in_mem_batches.is_empty() { - let tracking_metrics = self.metrics_set.new_final_tracking( - self.partition_id, - self.runtime.memory_manager.as_ref(), - ); + let tracking_metrics = self + .metrics_set + .new_final_tracking(self.partition_id, &self.runtime.memory_pool); let result = in_mem_partial_sort( &mut self.in_mem_batches, self.schema.clone(), @@ -242,10 +241,9 @@ impl ExternalSorter { debug!("Spilling sort data of ExternalSorter to disk whilst inserting"); - let tracking_metrics = self.metrics_set.new_intermediate_tracking( - self.partition_id, - self.runtime.memory_manager.as_ref(), - ); + let tracking_metrics = self + .metrics_set + .new_intermediate_tracking(self.partition_id, &self.runtime.memory_pool); let spillfile = self.runtime.disk_manager.create_tmp_file("Sorting")?; let stream = in_mem_partial_sort( @@ -872,7 +870,7 @@ async fn do_sort( ); let schema = input.schema(); let tracking_metrics = - metrics_set.new_intermediate_tracking(partition_id, context.memory_manager()); + metrics_set.new_intermediate_tracking(partition_id, context.memory_pool()); let mut sorter = ExternalSorter::new( partition_id, schema.clone(), @@ -965,7 +963,7 @@ mod tests { assert_eq!(c7.value(c7.len() - 1), 254,); assert_eq!( - session_ctx.runtime_env().memory_manager.allocated(), + session_ctx.runtime_env().memory_pool.allocated(), 0, "The sort should have returned all memory used back to the memory manager" ); @@ -1034,7 +1032,7 @@ mod tests { assert_eq!(c7.value(c7.len() - 1), 254,); assert_eq!( - session_ctx.runtime_env().memory_manager.allocated(), + session_ctx.runtime_env().memory_pool.allocated(), 0, "The sort should have returned all memory used back to the memory manager" ); @@ -1285,7 +1283,7 @@ mod tests { assert_strong_count_converges_to_zero(refs).await; assert_eq!( - session_ctx.runtime_env().memory_manager.allocated(), + session_ctx.runtime_env().memory_pool.allocated(), 0, "The sort should have returned all memory used back to the memory manager" ); diff --git a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs index d733f73a84a2..f069cc5b007c 100644 --- a/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs +++ b/datafusion/core/src/physical_plan/sorts/sort_preserving_merge.rs @@ -170,7 +170,7 @@ impl ExecutionPlan for SortPreservingMergeExec { } let tracking_metrics = - MemTrackingMetrics::new(&self.metrics, context.memory_manager(), partition); + MemTrackingMetrics::new(&self.metrics, context.memory_pool(), partition); let input_partitions = self.input.output_partitioning().partition_count(); debug!( @@ -1260,7 +1260,7 @@ mod tests { let metrics = ExecutionPlanMetricsSet::new(); let tracking_metrics = - MemTrackingMetrics::new(&metrics, task_ctx.memory_manager(), 0); + MemTrackingMetrics::new(&metrics, task_ctx.memory_pool(), 0); let merge_stream = SortPreservingMergeStream::new_from_streams( streams, diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index 9491e7cd1a02..547c6de4f966 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -94,9 +94,9 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { } assert_eq!( - session_ctx.runtime_env().memory_manager.allocated(), + session_ctx.runtime_env().memory_pool.allocated(), 0, - "The sort should have returned all memory used back to the memory manager" + "The sort should have returned all memory used back to the memory pool" ); assert_eq!(expected, actual, "failure in @ pool_size {}", pool_size); } diff --git a/datafusion/core/tests/provider_filter_pushdown.rs b/datafusion/core/tests/provider_filter_pushdown.rs index 1ed3a5913609..13160fd52e1a 100644 --- a/datafusion/core/tests/provider_filter_pushdown.rs +++ b/datafusion/core/tests/provider_filter_pushdown.rs @@ -94,7 +94,7 @@ impl ExecutionPlan for CustomPlan { ) -> Result { let metrics = ExecutionPlanMetricsSet::new(); let tracking_metrics = - MemTrackingMetrics::new(&metrics, context.memory_manager(), partition); + MemTrackingMetrics::new(&metrics, context.memory_pool(), partition); Ok(Box::pin(SizedRecordBatchStream::new( self.schema(), self.batches.clone(), From 15c6a4e0ad970b2b2bd0755fa80708f82135740c Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 16 Dec 2022 18:42:39 +0000 Subject: [PATCH 06/11] Tweak doc --- datafusion/core/src/execution/context.rs | 2 +- datafusion/core/src/execution/memory_manager/mod.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 0ef531282cb5..4c76114d26b3 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -2054,7 +2054,7 @@ mod tests { #[tokio::test] async fn shared_memory_and_disk_manager() { // Demonstrate the ability to share DiskManager and - // MemoryManager between two different executions. + // MemoryPool between two different executions. let ctx1 = SessionContext::new(); // configure with same memory / disk manager diff --git a/datafusion/core/src/execution/memory_manager/mod.rs b/datafusion/core/src/execution/memory_manager/mod.rs index 2c6c33724549..f3698ea0e1a3 100644 --- a/datafusion/core/src/execution/memory_manager/mod.rs +++ b/datafusion/core/src/execution/memory_manager/mod.rs @@ -25,7 +25,7 @@ pub mod proxy; pub use pool::*; -/// The pool of memory from which [`MemoryManager`] and [`TrackedAllocation`] allocate +/// The pool of memory from which [`TrackedAllocation`] allocate pub trait MemoryPool: Send + Sync + std::fmt::Debug { /// Records the creation of a new [`TrackedAllocation`] with [`AllocationOptions`] fn allocate(&self, _options: &AllocationOptions) {} @@ -82,8 +82,8 @@ impl AllocationOptions { } } -/// A [`TrackedAllocation`] tracks a reservation of memory in a [`MemoryManager`] -/// that is freed back to the memory pool on drop +/// A [`TrackedAllocation`] tracks a reservation of memory in a [`MemoryPool`] +/// that is freed back to the pool on drop #[derive(Debug)] pub struct TrackedAllocation { options: AllocationOptions, From 239555650735d969b6338a1b9594f7b853c230e6 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 16 Dec 2022 18:46:33 +0000 Subject: [PATCH 07/11] Rename module --- datafusion/core/src/execution/context.rs | 4 ++-- .../core/src/execution/{memory_manager => memory_pool}/mod.rs | 2 +- .../src/execution/{memory_manager => memory_pool}/pool.rs | 4 ++-- .../src/execution/{memory_manager => memory_pool}/proxy.rs | 0 datafusion/core/src/execution/mod.rs | 2 +- datafusion/core/src/execution/runtime_env.rs | 2 +- datafusion/core/src/physical_plan/aggregates/hash.rs | 4 ++-- datafusion/core/src/physical_plan/aggregates/no_grouping.rs | 2 +- datafusion/core/src/physical_plan/aggregates/row_hash.rs | 4 ++-- datafusion/core/src/physical_plan/metrics/composite.rs | 2 +- datafusion/core/src/physical_plan/metrics/tracker.rs | 4 ++-- datafusion/core/src/physical_plan/sorts/sort.rs | 2 +- datafusion/core/tests/order_spill_fuzz.rs | 2 +- 13 files changed, 17 insertions(+), 17 deletions(-) rename datafusion/core/src/execution/{memory_manager => memory_pool}/mod.rs (99%) rename datafusion/core/src/execution/{memory_manager => memory_pool}/pool.rs (98%) rename datafusion/core/src/execution/{memory_manager => memory_pool}/proxy.rs (100%) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 4c76114d26b3..8e7a69d51c8d 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -99,7 +99,7 @@ use url::Url; use crate::catalog::listing_schema::ListingSchemaProvider; use crate::datasource::object_store::ObjectStoreUrl; -use crate::execution::memory_manager::MemoryPool; +use crate::execution::memory_pool::MemoryPool; use uuid::Uuid; use super::options::{ @@ -2032,7 +2032,7 @@ mod tests { use super::*; use crate::assert_batches_eq; use crate::execution::context::QueryPlanner; - use crate::execution::memory_manager::TrackedAllocation; + use crate::execution::memory_pool::TrackedAllocation; use crate::execution::runtime_env::RuntimeConfig; use crate::physical_plan::expressions::AvgAccumulator; use crate::test; diff --git a/datafusion/core/src/execution/memory_manager/mod.rs b/datafusion/core/src/execution/memory_pool/mod.rs similarity index 99% rename from datafusion/core/src/execution/memory_manager/mod.rs rename to datafusion/core/src/execution/memory_pool/mod.rs index f3698ea0e1a3..9359af10a608 100644 --- a/datafusion/core/src/execution/memory_manager/mod.rs +++ b/datafusion/core/src/execution/memory_pool/mod.rs @@ -200,7 +200,7 @@ mod tests { use super::*; #[test] - fn test_memory_manager_underflow() { + fn test_memory_pool_underflow() { let pool = Arc::new(GreedyMemoryPool::new(50)) as _; let mut a1 = TrackedAllocation::new(&pool, "a1".to_string()); assert_eq!(pool.allocated(), 0); diff --git a/datafusion/core/src/execution/memory_manager/pool.rs b/datafusion/core/src/execution/memory_pool/pool.rs similarity index 98% rename from datafusion/core/src/execution/memory_manager/pool.rs rename to datafusion/core/src/execution/memory_pool/pool.rs index e1ee2a0e9f18..db704a0b209e 100644 --- a/datafusion/core/src/execution/memory_manager/pool.rs +++ b/datafusion/core/src/execution/memory_pool/pool.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::execution::memory_manager::{ +use crate::execution::memory_pool::{ AllocationOptions, MemoryPool, TrackedAllocation, }; use datafusion_common::{DataFusionError, Result}; @@ -220,7 +220,7 @@ fn insufficient_capacity_err( #[cfg(test)] mod tests { use super::*; - use crate::execution::memory_manager::AllocationOptions; + use crate::execution::memory_pool::AllocationOptions; use std::sync::Arc; #[test] diff --git a/datafusion/core/src/execution/memory_manager/proxy.rs b/datafusion/core/src/execution/memory_pool/proxy.rs similarity index 100% rename from datafusion/core/src/execution/memory_manager/proxy.rs rename to datafusion/core/src/execution/memory_pool/proxy.rs diff --git a/datafusion/core/src/execution/mod.rs b/datafusion/core/src/execution/mod.rs index d4ace40378bc..5eb859df9304 100644 --- a/datafusion/core/src/execution/mod.rs +++ b/datafusion/core/src/execution/mod.rs @@ -42,7 +42,7 @@ pub mod context; pub mod disk_manager; -pub mod memory_manager; +pub mod memory_pool; pub mod options; pub mod registry; pub mod runtime_env; diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs index 1370cc37afba..db4a5d6591f3 100644 --- a/datafusion/core/src/execution/runtime_env.rs +++ b/datafusion/core/src/execution/runtime_env.rs @@ -27,7 +27,7 @@ use std::collections::HashMap; use crate::datasource::datasource::TableProviderFactory; use crate::datasource::listing_table_factory::ListingTableFactory; use crate::datasource::object_store::ObjectStoreRegistry; -use crate::execution::memory_manager::{ +use crate::execution::memory_pool::{ GreedyMemoryPool, MemoryPool, UnboundedMemoryPool, }; use datafusion_common::DataFusionError; diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs index 0492ccc2571c..e6ceb1d95806 100644 --- a/datafusion/core/src/physical_plan/aggregates/hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -29,7 +29,7 @@ use futures::stream::{Stream, StreamExt}; use crate::error::Result; use crate::execution::context::TaskContext; -use crate::execution::memory_manager::proxy::{RawTableAllocExt, VecAllocExt}; +use crate::execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt}; use crate::physical_plan::aggregates::{ evaluate_group_by, evaluate_many, AccumulatorItem, AggregateMode, PhysicalGroupBy, }; @@ -39,7 +39,7 @@ use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; use crate::scalar::ScalarValue; -use crate::execution::memory_manager::TrackedAllocation; +use crate::execution::memory_pool::TrackedAllocation; use arrow::{array::ArrayRef, compute, compute::cast}; use arrow::{ array::{Array, UInt32Builder}, diff --git a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs index 3f9568e0d8b0..ae5b672964a7 100644 --- a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs +++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs @@ -33,7 +33,7 @@ use futures::stream::BoxStream; use std::sync::Arc; use std::task::{Context, Poll}; -use crate::execution::memory_manager::TrackedAllocation; +use crate::execution::memory_pool::TrackedAllocation; use futures::stream::{Stream, StreamExt}; /// stream struct for aggregation without grouping columns diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index 4c03b75cef49..a220507645dc 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -27,7 +27,7 @@ use futures::stream::{Stream, StreamExt}; use crate::error::Result; use crate::execution::context::TaskContext; -use crate::execution::memory_manager::proxy::{RawTableAllocExt, VecAllocExt}; +use crate::execution::memory_pool::proxy::{RawTableAllocExt, VecAllocExt}; use crate::physical_plan::aggregates::{ evaluate_group_by, evaluate_many, group_schema, AccumulatorItemV2, AggregateMode, PhysicalGroupBy, @@ -37,7 +37,7 @@ use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; -use crate::execution::memory_manager::TrackedAllocation; +use crate::execution::memory_pool::TrackedAllocation; use arrow::compute::cast; use arrow::datatypes::Schema; use arrow::{array::ArrayRef, compute}; diff --git a/datafusion/core/src/physical_plan/metrics/composite.rs b/datafusion/core/src/physical_plan/metrics/composite.rs index f5f81c16356a..3c257805d2c5 100644 --- a/datafusion/core/src/physical_plan/metrics/composite.rs +++ b/datafusion/core/src/physical_plan/metrics/composite.rs @@ -17,7 +17,7 @@ //! Metrics common for complex operators with multiple steps. -use crate::execution::memory_manager::MemoryPool; +use crate::execution::memory_pool::MemoryPool; use crate::physical_plan::metrics::tracker::MemTrackingMetrics; use crate::physical_plan::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricValue, MetricsSet, Time, diff --git a/datafusion/core/src/physical_plan/metrics/tracker.rs b/datafusion/core/src/physical_plan/metrics/tracker.rs index 4161f7c2beb7..2fc8a6cd7c7a 100644 --- a/datafusion/core/src/physical_plan/metrics/tracker.rs +++ b/datafusion/core/src/physical_plan/metrics/tracker.rs @@ -23,11 +23,11 @@ use crate::physical_plan::metrics::{ use std::sync::Arc; use std::task::Poll; -use crate::execution::memory_manager::{MemoryPool, TrackedAllocation}; +use crate::execution::memory_pool::{MemoryPool, TrackedAllocation}; use arrow::{error::ArrowError, record_batch::RecordBatch}; /// Simplified version of tracking memory consumer, -/// see also: [`Tracking`](crate::execution::memory_manager::ConsumerType::Tracking) +/// see also: [`Tracking`](crate::execution::memory_pool::ConsumerType::Tracking) /// /// You could use this to replace [BaselineMetrics], report the memory, /// and get the memory usage bookkeeping in the memory manager easily. diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index c2a04a487a19..5e6f1dc260e4 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -21,7 +21,7 @@ use crate::error::{DataFusionError, Result}; use crate::execution::context::TaskContext; -use crate::execution::memory_manager::{ +use crate::execution::memory_pool::{ human_readable_size, AllocationOptions, TrackedAllocation, }; use crate::execution::runtime_env::RuntimeEnv; diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index 547c6de4f966..63a96c578db9 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -22,7 +22,7 @@ use arrow::{ compute::SortOptions, record_batch::RecordBatch, }; -use datafusion::execution::memory_manager::GreedyMemoryPool; +use datafusion::execution::memory_pool::GreedyMemoryPool; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion::physical_plan::expressions::{col, PhysicalSortExpr}; use datafusion::physical_plan::memory::MemoryExec; From 40346a66571c7f8748f772d60516e73ac0b023e7 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 16 Dec 2022 18:50:05 +0000 Subject: [PATCH 08/11] Format --- datafusion/core/src/execution/memory_pool/pool.rs | 4 +--- datafusion/core/src/execution/runtime_env.rs | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/datafusion/core/src/execution/memory_pool/pool.rs b/datafusion/core/src/execution/memory_pool/pool.rs index db704a0b209e..da2ebc52ad73 100644 --- a/datafusion/core/src/execution/memory_pool/pool.rs +++ b/datafusion/core/src/execution/memory_pool/pool.rs @@ -15,9 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::execution::memory_pool::{ - AllocationOptions, MemoryPool, TrackedAllocation, -}; +use crate::execution::memory_pool::{AllocationOptions, MemoryPool, TrackedAllocation}; use datafusion_common::{DataFusionError, Result}; use parking_lot::Mutex; use std::sync::atomic::{AtomicUsize, Ordering}; diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs index db4a5d6591f3..5a6be6a25138 100644 --- a/datafusion/core/src/execution/runtime_env.rs +++ b/datafusion/core/src/execution/runtime_env.rs @@ -27,9 +27,7 @@ use std::collections::HashMap; use crate::datasource::datasource::TableProviderFactory; use crate::datasource::listing_table_factory::ListingTableFactory; use crate::datasource::object_store::ObjectStoreRegistry; -use crate::execution::memory_pool::{ - GreedyMemoryPool, MemoryPool, UnboundedMemoryPool, -}; +use crate::execution::memory_pool::{GreedyMemoryPool, MemoryPool, UnboundedMemoryPool}; use datafusion_common::DataFusionError; use object_store::ObjectStore; use std::fmt::{Debug, Formatter}; From 62723a2a1d2bfbc8b6569524a7998df4dd3f1126 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 19 Dec 2022 09:50:03 +0000 Subject: [PATCH 09/11] Review feedback --- datafusion/core/src/execution/context.rs | 8 +- .../core/src/execution/memory_pool/mod.rs | 97 +++++++--------- .../core/src/execution/memory_pool/pool.rs | 109 +++++++++--------- datafusion/core/src/execution/runtime_env.rs | 4 +- .../core/src/physical_plan/aggregates/hash.rs | 15 ++- .../physical_plan/aggregates/no_grouping.rs | 14 +-- .../src/physical_plan/aggregates/row_hash.rs | 15 ++- .../core/src/physical_plan/metrics/tracker.rs | 19 ++- .../core/src/physical_plan/sorts/sort.rs | 36 +++--- datafusion/core/tests/order_spill_fuzz.rs | 2 +- 10 files changed, 153 insertions(+), 166 deletions(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 8e7a69d51c8d..1e733d7161c4 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -2032,7 +2032,7 @@ mod tests { use super::*; use crate::assert_batches_eq; use crate::execution::context::QueryPlanner; - use crate::execution::memory_pool::TrackedAllocation; + use crate::execution::memory_pool::MemoryConsumer; use crate::execution::runtime_env::RuntimeConfig; use crate::physical_plan::expressions::AvgAccumulator; use crate::test; @@ -2060,8 +2060,8 @@ mod tests { // configure with same memory / disk manager let memory_pool = ctx1.runtime_env().memory_pool.clone(); - let mut allocation = TrackedAllocation::new(&memory_pool, "test".to_string()); - allocation.grow(100); + let mut reservation = MemoryConsumer::new("test").register(&memory_pool); + reservation.grow(100); let disk_manager = ctx1.runtime_env().disk_manager.clone(); @@ -2071,7 +2071,7 @@ mod tests { assert_eq!(ctx1.runtime_env().memory_pool.allocated(), 100); assert_eq!(ctx2.runtime_env().memory_pool.allocated(), 100); - drop(allocation); + drop(reservation); assert_eq!(ctx1.runtime_env().memory_pool.allocated(), 0); assert_eq!(ctx2.runtime_env().memory_pool.allocated(), 0); diff --git a/datafusion/core/src/execution/memory_pool/mod.rs b/datafusion/core/src/execution/memory_pool/mod.rs index 9359af10a608..a093afae631a 100644 --- a/datafusion/core/src/execution/memory_pool/mod.rs +++ b/datafusion/core/src/execution/memory_pool/mod.rs @@ -25,43 +25,43 @@ pub mod proxy; pub use pool::*; -/// The pool of memory from which [`TrackedAllocation`] allocate +/// The pool of memory on which [`MemoryReservation`] record their memory usage pub trait MemoryPool: Send + Sync + std::fmt::Debug { - /// Records the creation of a new [`TrackedAllocation`] with [`AllocationOptions`] - fn allocate(&self, _options: &AllocationOptions) {} + /// Records the creation of a new [`MemoryReservation`] with [`MemoryConsumer`] + fn register(&self, _consumer: &MemoryConsumer) {} - /// Records the destruction of a [`TrackedAllocation`] with [`AllocationOptions`] - fn free(&self, _options: &AllocationOptions) {} + /// Records the destruction of a [`MemoryReservation`] with [`MemoryConsumer`] + fn unregister(&self, _consumer: &MemoryConsumer) {} - /// Infallibly grow the provided `allocation` by `additional` bytes + /// Infallibly grow the provided `reservation` by `additional` bytes /// /// This must always succeed - fn grow(&self, allocation: &TrackedAllocation, additional: usize); + fn grow(&self, reservation: &MemoryReservation, additional: usize); - /// Infallibly shrink the provided `allocation` by `shrink` bytes - fn shrink(&self, allocation: &TrackedAllocation, shrink: usize); + /// Infallibly shrink the provided `reservation` by `shrink` bytes + fn shrink(&self, reservation: &MemoryReservation, shrink: usize); - /// Attempt to grow the provided `allocation` by `additional` bytes + /// Attempt to grow the provided `reservation` by `additional` bytes /// /// On error the `allocation` will not be increased in size - fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()>; + fn try_grow(&self, reservation: &MemoryReservation, additional: usize) -> Result<()>; /// Return the total amount of memory allocated fn allocated(&self) -> usize; } -/// Options associated with a [`TrackedAllocation`] +/// A memory consumer that can be tracked by [`MemoryReservation`] in a [`MemoryPool`] #[derive(Debug)] -pub struct AllocationOptions { +pub struct MemoryConsumer { name: String, can_spill: bool, } -impl AllocationOptions { - /// Create a new [`AllocationOptions`] - pub fn new(name: String) -> Self { +impl MemoryConsumer { + /// Create a new empty [`MemoryConsumer`] that can be grown using [`MemoryReservation`] + pub fn new(name: impl Into) -> Self { Self { - name, + name: name.into(), can_spill: false, } } @@ -80,47 +80,35 @@ impl AllocationOptions { pub fn name(&self) -> &str { &self.name } + + /// Registers this [`MemoryConsumer`] with the provided [`MemoryPool`] returning + /// a [`MemoryReservation`] that can be used to grow or shrink the memory reservation + pub fn register(self, pool: &Arc) -> MemoryReservation { + pool.register(&self); + MemoryReservation { + consumer: self, + size: 0, + policy: Arc::clone(pool), + } + } } -/// A [`TrackedAllocation`] tracks a reservation of memory in a [`MemoryPool`] +/// A [`MemoryReservation`] tracks a reservation of memory in a [`MemoryPool`] /// that is freed back to the pool on drop #[derive(Debug)] -pub struct TrackedAllocation { - options: AllocationOptions, +pub struct MemoryReservation { + consumer: MemoryConsumer, size: usize, policy: Arc, } -impl TrackedAllocation { - /// Create a new [`TrackedAllocation`] in the provided [`MemoryPool`] - pub fn new(pool: &Arc, name: String) -> Self { - Self::new_with_options(pool, AllocationOptions::new(name)) - } - - /// Create a new [`TrackedAllocation`] in the provided [`MemoryPool`] - pub fn new_with_options( - pool: &Arc, - options: AllocationOptions, - ) -> Self { - pool.allocate(&options); - Self { - options, - size: 0, - policy: Arc::clone(pool), - } - } - - /// Returns the size of this [`TrackedAllocation`] in bytes +impl MemoryReservation { + /// Returns the size of this reservation in bytes pub fn size(&self) -> usize { self.size } - /// Returns this allocations [`AllocationOptions`] - pub fn options(&self) -> &AllocationOptions { - &self.options - } - - /// Frees all bytes from this allocation returning the number of bytes freed + /// Frees all bytes from this reservation returning the number of bytes freed pub fn free(&mut self) -> usize { let size = self.size; if size != 0 { @@ -129,7 +117,7 @@ impl TrackedAllocation { size } - /// Frees `capacity` bytes from this allocation + /// Frees `capacity` bytes from this reservation /// /// # Panics /// @@ -140,7 +128,7 @@ impl TrackedAllocation { self.size = new_size } - /// Sets the size of this allocation to `capacity` + /// Sets the size of this reservation to `capacity` pub fn resize(&mut self, capacity: usize) { use std::cmp::Ordering; match capacity.cmp(&self.size) { @@ -150,13 +138,13 @@ impl TrackedAllocation { } } - /// Increase the size of this by `capacity` bytes + /// Increase the size of this reservation by `capacity` bytes pub fn grow(&mut self, capacity: usize) { self.policy.grow(self, capacity); self.size += capacity; } - /// Try to increase the size of this [`TrackedAllocation`] by `capacity` bytes + /// Try to increase the size of this reservation by `capacity` bytes pub fn try_grow(&mut self, capacity: usize) -> Result<()> { self.policy.try_grow(self, capacity)?; self.size += capacity; @@ -164,10 +152,9 @@ impl TrackedAllocation { } } -impl Drop for TrackedAllocation { +impl Drop for MemoryReservation { fn drop(&mut self) { - self.free(); - self.policy.free(&self.options); + self.policy.unregister(&self.consumer); } } @@ -202,7 +189,7 @@ mod tests { #[test] fn test_memory_pool_underflow() { let pool = Arc::new(GreedyMemoryPool::new(50)) as _; - let mut a1 = TrackedAllocation::new(&pool, "a1".to_string()); + let mut a1 = MemoryConsumer::new("a1").register(&pool); assert_eq!(pool.allocated(), 0); a1.grow(100); @@ -217,7 +204,7 @@ mod tests { a1.try_grow(30).unwrap(); assert_eq!(pool.allocated(), 30); - let mut a2 = TrackedAllocation::new(&pool, "a2".to_string()); + let mut a2 = MemoryConsumer::new("a2").register(&pool); a2.try_grow(25).unwrap_err(); assert_eq!(pool.allocated(), 30); diff --git a/datafusion/core/src/execution/memory_pool/pool.rs b/datafusion/core/src/execution/memory_pool/pool.rs index da2ebc52ad73..9042bedd1316 100644 --- a/datafusion/core/src/execution/memory_pool/pool.rs +++ b/datafusion/core/src/execution/memory_pool/pool.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use crate::execution::memory_pool::{AllocationOptions, MemoryPool, TrackedAllocation}; +use crate::execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation}; use datafusion_common::{DataFusionError, Result}; use parking_lot::Mutex; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -27,16 +27,16 @@ pub struct UnboundedMemoryPool { } impl MemoryPool for UnboundedMemoryPool { - fn grow(&self, _allocation: &TrackedAllocation, additional: usize) { + fn grow(&self, _reservation: &MemoryReservation, additional: usize) { self.used.fetch_add(additional, Ordering::Relaxed); } - fn shrink(&self, _allocation: &TrackedAllocation, shrink: usize) { + fn shrink(&self, _reservation: &MemoryReservation, shrink: usize) { self.used.fetch_sub(shrink, Ordering::Relaxed); } - fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()> { - self.grow(allocation, additional); + fn try_grow(&self, reservation: &MemoryReservation, additional: usize) -> Result<()> { + self.grow(reservation, additional); Ok(()) } @@ -63,22 +63,22 @@ impl GreedyMemoryPool { } impl MemoryPool for GreedyMemoryPool { - fn grow(&self, _allocation: &TrackedAllocation, additional: usize) { + fn grow(&self, _reservation: &MemoryReservation, additional: usize) { self.used.fetch_add(additional, Ordering::Relaxed); } - fn shrink(&self, _allocation: &TrackedAllocation, shrink: usize) { + fn shrink(&self, _reservation: &MemoryReservation, shrink: usize) { self.used.fetch_sub(shrink, Ordering::Relaxed); } - fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()> { + fn try_grow(&self, reservation: &MemoryReservation, additional: usize) -> Result<()> { self.used .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |used| { let new_used = used + additional; (new_used <= self.pool_size).then_some(new_used) }) .map_err(|used| { - insufficient_capacity_err(allocation, additional, self.pool_size - used) + insufficient_capacity_err(reservation, additional, self.pool_size - used) })?; Ok(()) } @@ -88,9 +88,9 @@ impl MemoryPool for GreedyMemoryPool { } } -/// A [`MemoryPool`] that prevents spillable allocations from using more than -/// an even fraction of the available memory sans any unspillable allocations -/// (i.e. `(pool_size - unspillable_memory) / num_spillable_allocations`) +/// A [`MemoryPool`] that prevents spillable reservations from using more than +/// an even fraction of the available memory sans any unspillable reservations +/// (i.e. `(pool_size - unspillable_memory) / num_spillable_reservations`) /// /// ┌───────────────────────z──────────────────────z───────────────┐ /// │ z z │ @@ -112,13 +112,13 @@ pub struct FairSpillPool { #[derive(Debug)] struct FairSpillPoolState { - /// The number of allocations that can spill + /// The number of consumers that can spill num_spill: usize, - /// The total amount of memory allocated that can be spilled + /// The total amount of memory reserved that can be spilled spillable: usize, - /// The total amount of memory allocated by consumers that cannot spill + /// The total amount of memory reserved by consumers that cannot spill unspillable: usize, } @@ -137,38 +137,38 @@ impl FairSpillPool { } impl MemoryPool for FairSpillPool { - fn allocate(&self, options: &AllocationOptions) { - if options.can_spill { + fn register(&self, consumer: &MemoryConsumer) { + if consumer.can_spill { self.state.lock().num_spill += 1; } } - fn free(&self, options: &AllocationOptions) { - if options.can_spill { + fn unregister(&self, consumer: &MemoryConsumer) { + if consumer.can_spill { self.state.lock().num_spill -= 1; } } - fn grow(&self, allocation: &TrackedAllocation, additional: usize) { + fn grow(&self, reservation: &MemoryReservation, additional: usize) { let mut state = self.state.lock(); - match allocation.options.can_spill { + match reservation.consumer.can_spill { true => state.spillable += additional, false => state.unspillable += additional, } } - fn shrink(&self, allocation: &TrackedAllocation, shrink: usize) { + fn shrink(&self, reservation: &MemoryReservation, shrink: usize) { let mut state = self.state.lock(); - match allocation.options.can_spill { + match reservation.consumer.can_spill { true => state.spillable -= shrink, false => state.unspillable -= shrink, } } - fn try_grow(&self, allocation: &TrackedAllocation, additional: usize) -> Result<()> { + fn try_grow(&self, reservation: &MemoryReservation, additional: usize) -> Result<()> { let mut state = self.state.lock(); - match allocation.options.can_spill { + match reservation.consumer.can_spill { true => { // The total amount of memory available to spilling consumers let spill_available = self.pool_size.saturating_sub(state.unspillable); @@ -178,9 +178,11 @@ impl MemoryPool for FairSpillPool { .checked_div(state.num_spill) .unwrap_or(spill_available); - if allocation.size + additional > available { + if reservation.size + additional > available { return Err(insufficient_capacity_err( - allocation, additional, available, + reservation, + additional, + available, )); } state.spillable += additional; @@ -192,7 +194,9 @@ impl MemoryPool for FairSpillPool { if available < additional { return Err(insufficient_capacity_err( - allocation, additional, available, + reservation, + additional, + available, )); } state.unspillable += additional; @@ -208,73 +212,74 @@ impl MemoryPool for FairSpillPool { } fn insufficient_capacity_err( - allocation: &TrackedAllocation, + reservation: &MemoryReservation, additional: usize, available: usize, ) -> DataFusionError { - DataFusionError::ResourcesExhausted(format!("Failed to allocate additional {} bytes for {} with {} bytes already allocated - maximum available is {}", additional, allocation.options.name, allocation.size, available)) + DataFusionError::ResourcesExhausted(format!("Failed to allocate additional {} bytes for {} with {} bytes already allocated - maximum available is {}", additional, reservation.consumer.name, reservation.size, available)) } #[cfg(test)] mod tests { use super::*; - use crate::execution::memory_pool::AllocationOptions; use std::sync::Arc; #[test] fn test_fair() { let pool = Arc::new(FairSpillPool::new(100)) as _; - let mut a1 = TrackedAllocation::new(&pool, "unspillable".to_string()); + let mut r1 = MemoryConsumer::new("unspillable").register(&pool); // Can grow beyond capacity of pool - a1.grow(2000); + r1.grow(2000); assert_eq!(pool.allocated(), 2000); - let options = AllocationOptions::new("s1".to_string()).with_can_spill(true); - let mut a2 = TrackedAllocation::new_with_options(&pool, options); + let mut r2 = MemoryConsumer::new("s1") + .with_can_spill(true) + .register(&pool); // Can grow beyond capacity of pool - a2.grow(2000); + r2.grow(2000); assert_eq!(pool.allocated(), 4000); - let err = a2.try_grow(1).unwrap_err().to_string(); + let err = r2.try_grow(1).unwrap_err().to_string(); assert_eq!(err, "Resources exhausted: Failed to allocate additional 1 bytes for s1 with 2000 bytes already allocated - maximum available is 0"); - let err = a2.try_grow(1).unwrap_err().to_string(); + let err = r2.try_grow(1).unwrap_err().to_string(); assert_eq!(err, "Resources exhausted: Failed to allocate additional 1 bytes for s1 with 2000 bytes already allocated - maximum available is 0"); - a1.shrink(1990); - a2.shrink(2000); + r1.shrink(1990); + r2.shrink(2000); assert_eq!(pool.allocated(), 10); - a1.try_grow(10).unwrap(); + r1.try_grow(10).unwrap(); assert_eq!(pool.allocated(), 20); // Can grow a2 to 80 as only spilling consumer - a2.try_grow(80).unwrap(); + r2.try_grow(80).unwrap(); assert_eq!(pool.allocated(), 100); - a2.shrink(70); + r2.shrink(70); - assert_eq!(a1.size(), 20); - assert_eq!(a2.size(), 10); + assert_eq!(r1.size(), 20); + assert_eq!(r2.size(), 10); assert_eq!(pool.allocated(), 30); - let options = AllocationOptions::new("s2".to_string()).with_can_spill(true); - let mut a3 = TrackedAllocation::new_with_options(&pool, options); + let mut r3 = MemoryConsumer::new("s2") + .with_can_spill(true) + .register(&pool); - let err = a3.try_grow(70).unwrap_err().to_string(); + let err = r3.try_grow(70).unwrap_err().to_string(); assert_eq!(err, "Resources exhausted: Failed to allocate additional 70 bytes for s2 with 0 bytes already allocated - maximum available is 40"); //Shrinking a2 to zero doesn't allow a3 to allocate more than 45 - a2.free(); - let err = a3.try_grow(70).unwrap_err().to_string(); + r2.free(); + let err = r3.try_grow(70).unwrap_err().to_string(); assert_eq!(err, "Resources exhausted: Failed to allocate additional 70 bytes for s2 with 0 bytes already allocated - maximum available is 40"); // But dropping a2 does - drop(a2); + drop(r2); assert_eq!(pool.allocated(), 20); - a3.try_grow(80).unwrap(); + r3.try_grow(80).unwrap(); } } diff --git a/datafusion/core/src/execution/runtime_env.rs b/datafusion/core/src/execution/runtime_env.rs index 5a6be6a25138..d559e7c7fa35 100644 --- a/datafusion/core/src/execution/runtime_env.rs +++ b/datafusion/core/src/execution/runtime_env.rs @@ -156,7 +156,7 @@ impl RuntimeConfig { } /// Customize memory policy - pub fn with_memory_policy(mut self, memory_pool: Arc) -> Self { + pub fn with_memory_pool(mut self, memory_pool: Arc) -> Self { self.memory_pool = Some(memory_pool); self } @@ -187,7 +187,7 @@ impl RuntimeConfig { /// Note DataFusion does not yet respect this limit in all cases. pub fn with_memory_limit(self, max_memory: usize, memory_fraction: f64) -> Self { let pool_size = (max_memory as f64 * memory_fraction) as usize; - self.with_memory_policy(Arc::new(GreedyMemoryPool::new(pool_size))) + self.with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))) } /// Use the specified path to create any needed temporary files diff --git a/datafusion/core/src/physical_plan/aggregates/hash.rs b/datafusion/core/src/physical_plan/aggregates/hash.rs index e6ceb1d95806..64b21ecf9a6b 100644 --- a/datafusion/core/src/physical_plan/aggregates/hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/hash.rs @@ -39,7 +39,7 @@ use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; use crate::scalar::ScalarValue; -use crate::execution::memory_pool::TrackedAllocation; +use crate::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use arrow::{array::ArrayRef, compute, compute::cast}; use arrow::{ array::{Array, UInt32Builder}, @@ -124,10 +124,9 @@ impl GroupedHashAggregateStream { timer.done(); - let allocation = TrackedAllocation::new( - context.memory_pool(), - format!("GroupedHashAggregateStream[{}]", partition), - ); + let reservation = + MemoryConsumer::new(format!("GroupedHashAggregateStream[{}]", partition)) + .register(context.memory_pool()); let inner = GroupedHashAggregateStreamInner { schema: Arc::clone(&schema), @@ -138,7 +137,7 @@ impl GroupedHashAggregateStream { baseline_metrics, aggregate_expressions, accumulators: Some(Accumulators { - allocation, + reservation, map: RawTable::with_capacity(0), group_states: Vec::with_capacity(0), }), @@ -175,7 +174,7 @@ impl GroupedHashAggregateStream { // This happens AFTER we actually used the memory, but simplifies the whole accounting and we are OK with // overshooting a bit. Also this means we either store the whole record batch or not. match result.and_then(|allocated| { - accumulators.allocation.try_grow(allocated) + accumulators.reservation.try_grow(allocated) }) { Ok(_) => continue, Err(e) => Err(ArrowError::ExternalError(Box::new(e))), @@ -439,7 +438,7 @@ struct GroupState { /// The state of all the groups struct Accumulators { - allocation: TrackedAllocation, + reservation: MemoryReservation, /// Logically maps group values to an index in `group_states` /// diff --git a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs index ae5b672964a7..8a312abafd9b 100644 --- a/datafusion/core/src/physical_plan/aggregates/no_grouping.rs +++ b/datafusion/core/src/physical_plan/aggregates/no_grouping.rs @@ -33,7 +33,7 @@ use futures::stream::BoxStream; use std::sync::Arc; use std::task::{Context, Poll}; -use crate::execution::memory_pool::TrackedAllocation; +use crate::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use futures::stream::{Stream, StreamExt}; /// stream struct for aggregation without grouping columns @@ -54,7 +54,7 @@ struct AggregateStreamInner { baseline_metrics: BaselineMetrics, aggregate_expressions: Vec>>, accumulators: Vec, - allocation: TrackedAllocation, + reservation: MemoryReservation, finished: bool, } @@ -72,10 +72,8 @@ impl AggregateStream { let aggregate_expressions = aggregate_expressions(&aggr_expr, &mode, 0)?; let accumulators = create_accumulators(&aggr_expr)?; - let allocation = TrackedAllocation::new( - context.memory_pool(), - format!("AggregateStream[{}]", partition), - ); + let reservation = MemoryConsumer::new(format!("AggregateStream[{}]", partition)) + .register(context.memory_pool()); let inner = AggregateStreamInner { schema: Arc::clone(&schema), @@ -84,7 +82,7 @@ impl AggregateStream { baseline_metrics, aggregate_expressions, accumulators, - allocation, + reservation, finished: false, }; let stream = futures::stream::unfold(inner, |mut this| async move { @@ -111,7 +109,7 @@ impl AggregateStream { // This happens AFTER we actually used the memory, but simplifies the whole accounting and we are OK with // overshooting a bit. Also this means we either store the whole record batch or not. match result - .and_then(|allocated| this.allocation.try_grow(allocated)) + .and_then(|allocated| this.reservation.try_grow(allocated)) { Ok(_) => continue, Err(e) => Err(ArrowError::ExternalError(Box::new(e))), diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash.rs b/datafusion/core/src/physical_plan/aggregates/row_hash.rs index a220507645dc..e769397871ef 100644 --- a/datafusion/core/src/physical_plan/aggregates/row_hash.rs +++ b/datafusion/core/src/physical_plan/aggregates/row_hash.rs @@ -37,7 +37,7 @@ use crate::physical_plan::metrics::{BaselineMetrics, RecordOutput}; use crate::physical_plan::{aggregates, AggregateExpr, PhysicalExpr}; use crate::physical_plan::{RecordBatchStream, SendableRecordBatchStream}; -use crate::execution::memory_pool::TrackedAllocation; +use crate::execution::memory_pool::{MemoryConsumer, MemoryReservation}; use arrow::compute::cast; use arrow::datatypes::Schema; use arrow::{array::ArrayRef, compute}; @@ -139,13 +139,12 @@ impl GroupedHashAggregateStreamV2 { let aggr_schema = aggr_state_schema(&aggr_expr)?; let aggr_layout = Arc::new(RowLayout::new(&aggr_schema, RowType::WordAligned)); - let allocation = TrackedAllocation::new( - context.memory_pool(), - format!("GroupedHashAggregateStreamV2[{}]", partition), - ); + let reservation = + MemoryConsumer::new(format!("GroupedHashAggregateStreamV2[{}]", partition)) + .register(context.memory_pool()); let aggr_state = AggregationState { - allocation, + reservation, map: RawTable::with_capacity(0), group_states: Vec::with_capacity(0), }; @@ -195,7 +194,7 @@ impl GroupedHashAggregateStreamV2 { // This happens AFTER we actually used the memory, but simplifies the whole accounting and we are OK with // overshooting a bit. Also this means we either store the whole record batch or not. match result.and_then(|allocated| { - this.aggr_state.allocation.try_grow(allocated) + this.aggr_state.reservation.try_grow(allocated) }) { Ok(_) => continue, Err(e) => Err(ArrowError::ExternalError(Box::new(e))), @@ -458,7 +457,7 @@ struct RowGroupState { /// The state of all the groups struct AggregationState { - allocation: TrackedAllocation, + reservation: MemoryReservation, /// Logically maps group values to an index in `group_states` /// diff --git a/datafusion/core/src/physical_plan/metrics/tracker.rs b/datafusion/core/src/physical_plan/metrics/tracker.rs index 2fc8a6cd7c7a..c61398c65810 100644 --- a/datafusion/core/src/physical_plan/metrics/tracker.rs +++ b/datafusion/core/src/physical_plan/metrics/tracker.rs @@ -23,17 +23,13 @@ use crate::physical_plan::metrics::{ use std::sync::Arc; use std::task::Poll; -use crate::execution::memory_pool::{MemoryPool, TrackedAllocation}; +use crate::execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation}; use arrow::{error::ArrowError, record_batch::RecordBatch}; -/// Simplified version of tracking memory consumer, -/// see also: [`Tracking`](crate::execution::memory_pool::ConsumerType::Tracking) -/// -/// You could use this to replace [BaselineMetrics], report the memory, -/// and get the memory usage bookkeeping in the memory manager easily. +/// Wraps a [`BaselineMetrics`] and records memory usage on a [`MemoryReservation`] #[derive(Debug)] pub struct MemTrackingMetrics { - allocation: TrackedAllocation, + reservation: MemoryReservation, metrics: BaselineMetrics, } @@ -46,9 +42,12 @@ impl MemTrackingMetrics { pool: &Arc, partition: usize, ) -> Self { - let name = format!("MemTrackingMetrics[{}]", partition); + let reservation = + MemoryConsumer::new(format!("MemTrackingMetrics[{}]", partition)) + .register(pool); + Self { - allocation: TrackedAllocation::new(pool, name), + reservation, metrics: BaselineMetrics::new(metrics, partition), } } @@ -66,7 +65,7 @@ impl MemTrackingMetrics { /// setup initial memory usage and register it with memory manager pub fn init_mem_used(&mut self, size: usize) { self.metrics.mem_used().set(size); - self.allocation.resize(size) + self.reservation.resize(size) } /// return the metric for the total number of output rows produced diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 5e6f1dc260e4..12649fda33e3 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -22,7 +22,7 @@ use crate::error::{DataFusionError, Result}; use crate::execution::context::TaskContext; use crate::execution::memory_pool::{ - human_readable_size, AllocationOptions, TrackedAllocation, + human_readable_size, MemoryConsumer, MemoryReservation, }; use crate::execution::runtime_env::RuntimeEnv; use crate::physical_plan::common::{batch_byte_size, IPCWriter, SizedRecordBatchStream}; @@ -81,7 +81,7 @@ struct ExternalSorter { metrics_set: CompositeMetricsSet, metrics: BaselineMetrics, fetch: Option, - allocation: TrackedAllocation, + reservation: MemoryReservation, partition_id: usize, } @@ -96,11 +96,11 @@ impl ExternalSorter { fetch: Option, ) -> Self { let metrics = metrics_set.new_intermediate_baseline(partition_id); - let options = AllocationOptions::new(format!("ExternalSorter[{}]", partition_id)) - .with_can_spill(true); - let allocation = - TrackedAllocation::new_with_options(&runtime.memory_pool, options); + let reservation = + MemoryConsumer::new(format!("ExternalSorter[{}]", partition_id)) + .with_can_spill(true) + .register(&runtime.memory_pool); Self { schema, @@ -112,7 +112,7 @@ impl ExternalSorter { metrics_set, metrics, fetch, - allocation, + reservation, partition_id, } } @@ -124,9 +124,9 @@ impl ExternalSorter { ) -> Result<()> { if input.num_rows() > 0 { let size = batch_byte_size(&input); - if self.allocation.try_grow(size).is_err() { + if self.reservation.try_grow(size).is_err() { self.spill().await?; - self.allocation.try_grow(size)? + self.reservation.try_grow(size)? } self.metrics.mem_used().add(size); @@ -146,12 +146,12 @@ impl ExternalSorter { // operation). But we still have to record it so that other requesters // would know about this unexpected increase in memory consumption. let new_size_delta = new_size - size; - self.allocation.grow(new_size_delta); + self.reservation.grow(new_size_delta); self.metrics.mem_used().add(new_size_delta); } Ordering::Less => { let size_delta = size - new_size; - self.allocation.shrink(size_delta); + self.reservation.shrink(size_delta); self.metrics.mem_used().sub(size_delta); } Ordering::Equal => {} @@ -161,15 +161,15 @@ impl ExternalSorter { Ok(()) } - async fn spilled_before(&self) -> bool { + fn spilled_before(&self) -> bool { !self.spills.is_empty() } /// MergeSort in mem batches as well as spills into total order with `SortPreservingMergeStream`. - async fn sort(&mut self) -> Result { + fn sort(&mut self) -> Result { let batch_size = self.session_config.batch_size(); - if self.spilled_before().await { + if self.spilled_before() { let tracking_metrics = self .metrics_set .new_intermediate_tracking(self.partition_id, &self.runtime.memory_pool); @@ -183,7 +183,7 @@ impl ExternalSorter { tracking_metrics, self.fetch, )?; - let prev_used = self.allocation.free(); + let prev_used = self.reservation.free(); streams.push(SortedStream::new(in_mem_stream, prev_used)); } @@ -214,7 +214,7 @@ impl ExternalSorter { self.fetch, ); // Report to the memory manager we are no longer using memory - self.allocation.free(); + self.reservation.free(); result } else { Ok(Box::pin(EmptyRecordBatchStream::new(self.schema.clone()))) @@ -257,7 +257,7 @@ impl ExternalSorter { spill_partial_sorted_stream(&mut stream?, spillfile.path(), self.schema.clone()) .await?; - self.allocation.free(); + self.reservation.free(); let used = self.metrics.mem_used().set(0); self.metrics.record_spill(used); self.spills.push(spillfile); @@ -884,7 +884,7 @@ async fn do_sort( let batch = batch?; sorter.insert_batch(batch, &tracking_metrics).await?; } - let result = sorter.sort().await; + let result = sorter.sort(); debug!( "End do_sort for partition {} of context session_id {} and task_id {:?}", partition_id, diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index 63a96c578db9..ab1fe0e07dfa 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -77,7 +77,7 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { let sort = Arc::new(SortExec::try_new(sort, Arc::new(exec), None).unwrap()); let runtime_config = RuntimeConfig::new() - .with_memory_policy(Arc::new(GreedyMemoryPool::new(pool_size))); + .with_memory_pool(Arc::new(GreedyMemoryPool::new(pool_size))); let runtime = Arc::new(RuntimeEnv::new(runtime_config).unwrap()); let session_ctx = SessionContext::with_config_rt(SessionConfig::new(), runtime); From 773e9835c1cb895761988fd6f2e9bfe2d87f33d8 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 19 Dec 2022 09:54:15 +0000 Subject: [PATCH 10/11] Further tweaks --- datafusion/core/src/execution/context.rs | 8 +++--- .../core/src/execution/memory_pool/mod.rs | 28 +++++++++++-------- .../core/src/execution/memory_pool/pool.rs | 20 ++++++------- .../core/src/physical_plan/sorts/sort.rs | 6 ++-- datafusion/core/tests/order_spill_fuzz.rs | 2 +- 5 files changed, 34 insertions(+), 30 deletions(-) diff --git a/datafusion/core/src/execution/context.rs b/datafusion/core/src/execution/context.rs index 1e733d7161c4..344d8b43b799 100644 --- a/datafusion/core/src/execution/context.rs +++ b/datafusion/core/src/execution/context.rs @@ -2068,13 +2068,13 @@ mod tests { let ctx2 = SessionContext::with_config_rt(SessionConfig::new(), ctx1.runtime_env()); - assert_eq!(ctx1.runtime_env().memory_pool.allocated(), 100); - assert_eq!(ctx2.runtime_env().memory_pool.allocated(), 100); + assert_eq!(ctx1.runtime_env().memory_pool.reserved(), 100); + assert_eq!(ctx2.runtime_env().memory_pool.reserved(), 100); drop(reservation); - assert_eq!(ctx1.runtime_env().memory_pool.allocated(), 0); - assert_eq!(ctx2.runtime_env().memory_pool.allocated(), 0); + assert_eq!(ctx1.runtime_env().memory_pool.reserved(), 0); + assert_eq!(ctx2.runtime_env().memory_pool.reserved(), 0); assert!(std::ptr::eq( Arc::as_ptr(&disk_manager), diff --git a/datafusion/core/src/execution/memory_pool/mod.rs b/datafusion/core/src/execution/memory_pool/mod.rs index a093afae631a..fd608fd59054 100644 --- a/datafusion/core/src/execution/memory_pool/mod.rs +++ b/datafusion/core/src/execution/memory_pool/mod.rs @@ -25,12 +25,16 @@ pub mod proxy; pub use pool::*; -/// The pool of memory on which [`MemoryReservation`] record their memory usage +/// The pool of memory on which [`MemoryReservation`] record their memory reservations pub trait MemoryPool: Send + Sync + std::fmt::Debug { - /// Records the creation of a new [`MemoryReservation`] with [`MemoryConsumer`] + /// Registers a new [`MemoryConsumer`] + /// + /// Note: Subsequent calls to [`Self::grow`] must be made to reserve memory fn register(&self, _consumer: &MemoryConsumer) {} /// Records the destruction of a [`MemoryReservation`] with [`MemoryConsumer`] + /// + /// Note: Prior calls to [`Self::shrink`] must be made to free any reserved memory fn unregister(&self, _consumer: &MemoryConsumer) {} /// Infallibly grow the provided `reservation` by `additional` bytes @@ -46,8 +50,8 @@ pub trait MemoryPool: Send + Sync + std::fmt::Debug { /// On error the `allocation` will not be increased in size fn try_grow(&self, reservation: &MemoryReservation, additional: usize) -> Result<()>; - /// Return the total amount of memory allocated - fn allocated(&self) -> usize; + /// Return the total amount of memory reserved + fn reserved(&self) -> usize; } /// A memory consumer that can be tracked by [`MemoryReservation`] in a [`MemoryPool`] @@ -190,28 +194,28 @@ mod tests { fn test_memory_pool_underflow() { let pool = Arc::new(GreedyMemoryPool::new(50)) as _; let mut a1 = MemoryConsumer::new("a1").register(&pool); - assert_eq!(pool.allocated(), 0); + assert_eq!(pool.reserved(), 0); a1.grow(100); - assert_eq!(pool.allocated(), 100); + assert_eq!(pool.reserved(), 100); assert_eq!(a1.free(), 100); - assert_eq!(pool.allocated(), 0); + assert_eq!(pool.reserved(), 0); a1.try_grow(100).unwrap_err(); - assert_eq!(pool.allocated(), 0); + assert_eq!(pool.reserved(), 0); a1.try_grow(30).unwrap(); - assert_eq!(pool.allocated(), 30); + assert_eq!(pool.reserved(), 30); let mut a2 = MemoryConsumer::new("a2").register(&pool); a2.try_grow(25).unwrap_err(); - assert_eq!(pool.allocated(), 30); + assert_eq!(pool.reserved(), 30); drop(a1); - assert_eq!(pool.allocated(), 0); + assert_eq!(pool.reserved(), 0); a2.try_grow(25).unwrap(); - assert_eq!(pool.allocated(), 25); + assert_eq!(pool.reserved(), 25); } } diff --git a/datafusion/core/src/execution/memory_pool/pool.rs b/datafusion/core/src/execution/memory_pool/pool.rs index 9042bedd1316..5d28629be9c2 100644 --- a/datafusion/core/src/execution/memory_pool/pool.rs +++ b/datafusion/core/src/execution/memory_pool/pool.rs @@ -40,7 +40,7 @@ impl MemoryPool for UnboundedMemoryPool { Ok(()) } - fn allocated(&self) -> usize { + fn reserved(&self) -> usize { self.used.load(Ordering::Relaxed) } } @@ -83,7 +83,7 @@ impl MemoryPool for GreedyMemoryPool { Ok(()) } - fn allocated(&self) -> usize { + fn reserved(&self) -> usize { self.used.load(Ordering::Relaxed) } } @@ -205,7 +205,7 @@ impl MemoryPool for FairSpillPool { Ok(()) } - fn allocated(&self) -> usize { + fn reserved(&self) -> usize { let state = self.state.lock(); state.spillable + state.unspillable } @@ -231,7 +231,7 @@ mod tests { let mut r1 = MemoryConsumer::new("unspillable").register(&pool); // Can grow beyond capacity of pool r1.grow(2000); - assert_eq!(pool.allocated(), 2000); + assert_eq!(pool.reserved(), 2000); let mut r2 = MemoryConsumer::new("s1") .with_can_spill(true) @@ -239,7 +239,7 @@ mod tests { // Can grow beyond capacity of pool r2.grow(2000); - assert_eq!(pool.allocated(), 4000); + assert_eq!(pool.reserved(), 4000); let err = r2.try_grow(1).unwrap_err().to_string(); assert_eq!(err, "Resources exhausted: Failed to allocate additional 1 bytes for s1 with 2000 bytes already allocated - maximum available is 0"); @@ -250,20 +250,20 @@ mod tests { r1.shrink(1990); r2.shrink(2000); - assert_eq!(pool.allocated(), 10); + assert_eq!(pool.reserved(), 10); r1.try_grow(10).unwrap(); - assert_eq!(pool.allocated(), 20); + assert_eq!(pool.reserved(), 20); // Can grow a2 to 80 as only spilling consumer r2.try_grow(80).unwrap(); - assert_eq!(pool.allocated(), 100); + assert_eq!(pool.reserved(), 100); r2.shrink(70); assert_eq!(r1.size(), 20); assert_eq!(r2.size(), 10); - assert_eq!(pool.allocated(), 30); + assert_eq!(pool.reserved(), 30); let mut r3 = MemoryConsumer::new("s2") .with_can_spill(true) @@ -279,7 +279,7 @@ mod tests { // But dropping a2 does drop(r2); - assert_eq!(pool.allocated(), 20); + assert_eq!(pool.reserved(), 20); r3.try_grow(80).unwrap(); } } diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 12649fda33e3..85eca5450849 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -963,7 +963,7 @@ mod tests { assert_eq!(c7.value(c7.len() - 1), 254,); assert_eq!( - session_ctx.runtime_env().memory_pool.allocated(), + session_ctx.runtime_env().memory_pool.reserved(), 0, "The sort should have returned all memory used back to the memory manager" ); @@ -1032,7 +1032,7 @@ mod tests { assert_eq!(c7.value(c7.len() - 1), 254,); assert_eq!( - session_ctx.runtime_env().memory_pool.allocated(), + session_ctx.runtime_env().memory_pool.reserved(), 0, "The sort should have returned all memory used back to the memory manager" ); @@ -1283,7 +1283,7 @@ mod tests { assert_strong_count_converges_to_zero(refs).await; assert_eq!( - session_ctx.runtime_env().memory_pool.allocated(), + session_ctx.runtime_env().memory_pool.reserved(), 0, "The sort should have returned all memory used back to the memory manager" ); diff --git a/datafusion/core/tests/order_spill_fuzz.rs b/datafusion/core/tests/order_spill_fuzz.rs index ab1fe0e07dfa..923b44e2681b 100644 --- a/datafusion/core/tests/order_spill_fuzz.rs +++ b/datafusion/core/tests/order_spill_fuzz.rs @@ -94,7 +94,7 @@ async fn run_sort(pool_size: usize, size_spill: Vec<(usize, bool)>) { } assert_eq!( - session_ctx.runtime_env().memory_pool.allocated(), + session_ctx.runtime_env().memory_pool.reserved(), 0, "The sort should have returned all memory used back to the memory pool" ); From e6a575046f060f2107dd21962c7c027246523fa0 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 19 Dec 2022 12:49:20 +0000 Subject: [PATCH 11/11] Fix Drop --- datafusion/core/src/execution/memory_pool/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/execution/memory_pool/mod.rs b/datafusion/core/src/execution/memory_pool/mod.rs index fd608fd59054..6369cda4d149 100644 --- a/datafusion/core/src/execution/memory_pool/mod.rs +++ b/datafusion/core/src/execution/memory_pool/mod.rs @@ -158,6 +158,7 @@ impl MemoryReservation { impl Drop for MemoryReservation { fn drop(&mut self) { + self.free(); self.policy.unregister(&self.consumer); } }