diff --git a/crates/subspace-farmer/src/plotter/gpu/cuda.rs b/crates/subspace-farmer/src/plotter/gpu/cuda.rs index d53467203a..370439edf5 100644 --- a/crates/subspace-farmer/src/plotter/gpu/cuda.rs +++ b/crates/subspace-farmer/src/plotter/gpu/cuda.rs @@ -3,7 +3,8 @@ use crate::plotter::gpu::GpuRecordsEncoder; use async_lock::Mutex as AsyncMutex; use parking_lot::Mutex; -use rayon::{ThreadPool, ThreadPoolBuildError, ThreadPoolBuilder}; +use rayon::{current_thread_index, ThreadPool, ThreadPoolBuildError, ThreadPoolBuilder}; +use std::process::exit; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use subspace_core_primitives::pieces::{PieceOffset, Record}; @@ -93,8 +94,25 @@ impl CudaRecordsEncoder { global_mutex: Arc<AsyncMutex<()>>, ) -> Result<Self, ThreadPoolBuildError> { let id = cuda_device.id(); + let thread_name = move |thread_index| format!("cuda-{id}.{thread_index}"); + // TODO: remove this panic handler when rayon logs panic_info + // https://github.com/rayon-rs/rayon/issues/1208 + let panic_handler = move |panic_info| { + if let Some(index) = current_thread_index() { + eprintln!("panic on thread {}: {:?}", thread_name(index), panic_info); + } else { + // We want to guarantee exit, rather than panicking in a panic handler. + eprintln!( + "rayon panic handler called on non-rayon thread: {:?}", + panic_info + ); + } + exit(1); + }; + let thread_pool = ThreadPoolBuilder::new() - .thread_name(move |thread_index| format!("cuda-{id}.{thread_index}")) + .thread_name(thread_name) + .panic_handler(panic_handler) // Make sure there is overlap between records, so GPU is almost always busy .num_threads(2) .build()?; diff --git a/crates/subspace-farmer/src/utils.rs b/crates/subspace-farmer/src/utils.rs index 6824504102..540b87ec92 100644 --- a/crates/subspace-farmer/src/utils.rs +++ b/crates/subspace-farmer/src/utils.rs @@ -8,11 +8,14 @@ use crate::thread_pool_manager::{PlottingThreadPoolManager, PlottingThreadPoolPa use futures::channel::oneshot; use futures::channel::oneshot::Canceled; use futures::future::Either; -use rayon::{ThreadBuilder, ThreadPool, ThreadPoolBuildError, ThreadPoolBuilder}; +use rayon::{ + current_thread_index, ThreadBuilder, ThreadPool, ThreadPoolBuildError, ThreadPoolBuilder, +}; use std::future::Future; use std::num::NonZeroUsize; use std::ops::Deref; use std::pin::{pin, Pin}; +use std::process::exit; use std::task::{Context, Poll}; use std::{fmt, io, iter, thread}; use thread_priority::{set_current_thread_priority, ThreadPriority}; @@ -488,11 +491,28 @@ fn create_plotting_thread_pool_manager_thread_pool_pair( cpu_core_set: CpuCoreSet, thread_priority: Option<ThreadPriority>, ) -> Result<ThreadPool, ThreadPoolBuildError> { + let thread_name = + move |thread_index| format!("{thread_prefix}-{thread_pool_index}.{thread_index}"); + // TODO: remove this panic handler when rayon logs panic_info + // https://github.com/rayon-rs/rayon/issues/1208 + // (we'll lose the thread name, because it's not stored within rayon's WorkerThread) + let panic_handler = move |panic_info| { + if let Some(index) = current_thread_index() { + eprintln!("panic on thread {}: {:?}", thread_name(index), panic_info); + } else { + // We want to guarantee exit, rather than panicking in a panic handler. + eprintln!( + "rayon panic handler called on non-rayon thread: {:?}", + panic_info + ); + } + exit(1); + }; + ThreadPoolBuilder::new() - .thread_name(move |thread_index| { - format!("{thread_prefix}-{thread_pool_index}.{thread_index}") - }) + .thread_name(thread_name) .num_threads(cpu_core_set.cpu_cores().len()) + .panic_handler(panic_handler) .spawn_handler({ let handle = Handle::current();