Skip to content

Commit

Permalink
Allow zeusd dev and testing on MacOS (#82)
Browse files Browse the repository at this point in the history
  • Loading branch information
jaywonchung authored May 27, 2024
1 parent b2e3f55 commit 2cf328e
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 71 deletions.
4 changes: 3 additions & 1 deletion zeusd/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ name = "zeusd"

[dependencies]
actix-web = "4"
nvml-wrapper = "0.10"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
thiserror = "1"
clap = { version = "4.5.4", features = ["derive"] }
Expand All @@ -28,6 +27,9 @@ tracing-actix-web = "0.7.10"
nix = { version = "0.29", default-features = false, features = ["user"] }
paste = "1"

[target.'cfg(target_os = "linux")'.dependencies]
nvml-wrapper = "0.10"

[dev-dependencies]
once_cell = "1.7.2"
reqwest = { version = "0.11", default-features = false, features = ["json"] }
Expand Down
75 changes: 75 additions & 0 deletions zeusd/src/devices/gpu/linux.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
use nvml_wrapper::enums::device::GpuLockedClocksSetting;
use nvml_wrapper::{Device, Nvml};

use crate::devices::gpu::GpuManager;
use crate::error::ZeusdError;

/// A GPU handle backed by NVML, used as the Linux implementation of `GpuManager`.
///
/// Pairs the `Nvml` instance with the `Device` obtained from it so that the
/// `Nvml` reference the device depends on is kept alongside it.
#[cfg(target_os = "linux")]
pub struct NvmlGpu<'n> {
/// The `Nvml` instance that `device` was obtained from. Only stored, never
/// read by the code in this file (hence the leading underscore).
_nvml: &'static Nvml,
/// The NVML device handle that all management calls are forwarded to.
device: Device<'n>,
}

#[cfg(target_os = "linux")]
impl NvmlGpu<'static> {
pub fn init(index: u32) -> Result<Self, ZeusdError> {
// `Device` needs to hold a reference to `Nvml`, meaning that `Nvml` must outlive `Device`.
// We can achieve this by leaking a `Box` containing `Nvml` and holding a reference to it.
// `Nvml` will actually live until the server terminates inside the GPU management task.
let _nvml = Box::leak(Box::new(Nvml::init()?));
let device = _nvml.device_by_index(index)?;
Ok(Self { _nvml, device })
}
}

#[cfg(target_os = "linux")]
impl GpuManager for NvmlGpu<'static> {
fn device_count() -> Result<u32, ZeusdError> {
let nvml = Nvml::init()?;
Ok(nvml.device_count()?)
}

#[inline]
fn set_persistent_mode(&mut self, enabled: bool) -> Result<(), ZeusdError> {
Ok(self.device.set_persistent(enabled)?)
}

#[inline]
fn set_power_management_limit(&mut self, power_limit_mw: u32) -> Result<(), ZeusdError> {
Ok(self.device.set_power_management_limit(power_limit_mw)?)
}

#[inline]
fn set_gpu_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError> {
let setting = GpuLockedClocksSetting::Numeric {
min_clock_mhz,
max_clock_mhz,
};
Ok(self.device.set_gpu_locked_clocks(setting)?)
}

#[inline]
fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
Ok(self.device.reset_gpu_locked_clocks()?)
}

#[inline]
fn set_mem_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError> {
Ok(self
.device
.set_mem_locked_clocks(min_clock_mhz, max_clock_mhz)?)
}

#[inline]
fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
Ok(self.device.reset_mem_locked_clocks()?)
}
}
48 changes: 48 additions & 0 deletions zeusd/src/devices/gpu/macos.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
use crate::devices::gpu::GpuManager;
use crate::error::ZeusdError;

/// Stateless stand-in for the NVML-backed GPU handle, used when building on macOS.
///
/// It carries no data; it exists so that code written against `NvmlGpu` compiles
/// on macOS, where the `nvml-wrapper` dependency is not pulled in.
pub struct NvmlGpu;

impl NvmlGpu {
pub fn init(_index: u32) -> Result<Self, ZeusdError> {
Ok(Self)
}
}

impl GpuManager for NvmlGpu {
    /// Reports a single placeholder device without querying any hardware.
    fn device_count() -> Result<u32, ZeusdError> {
        // Fixed device count exposed by the stub.
        const STUB_DEVICE_COUNT: u32 = 1;
        Ok(STUB_DEVICE_COUNT)
    }

    /// No-op: ignores its argument and always succeeds.
    fn set_persistent_mode(&mut self, _enabled: bool) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: ignores its argument and always succeeds.
    fn set_power_management_limit(&mut self, _power_limit_mw: u32) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: ignores its arguments and always succeeds.
    fn set_gpu_locked_clocks(
        &mut self,
        _min_clock_mhz: u32,
        _max_clock_mhz: u32,
    ) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: always succeeds.
    fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: ignores its arguments and always succeeds.
    fn set_mem_locked_clocks(
        &mut self,
        _min_clock_mhz: u32,
        _max_clock_mhz: u32,
    ) -> Result<(), ZeusdError> {
        Ok(())
    }

    /// No-op: always succeeds.
    fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
        Ok(())
    }
}
82 changes: 12 additions & 70 deletions zeusd/src/devices/gpu.rs → zeusd/src/devices/gpu/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
//! GPU management module that interfaces with NVML
use std::time::Instant;
#[cfg(target_os = "linux")]
mod linux;

#[cfg(target_os = "linux")]
pub use linux::NvmlGpu;

#[cfg(target_os = "macos")]
mod macos;

use nvml_wrapper::enums::device::GpuLockedClocksSetting;
use nvml_wrapper::{Device, Nvml};
#[cfg(target_os = "macos")]
pub use macos::NvmlGpu;

use std::time::Instant;
use tokio::sync::mpsc::{Sender, UnboundedReceiver, UnboundedSender};
use tracing::Span;

Expand Down Expand Up @@ -33,73 +42,6 @@ pub trait GpuManager {
fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError>;
}

/// A GPU handle backed by NVML.
///
/// Pairs the `Nvml` instance with the `Device` obtained from it so that the
/// `Nvml` reference the device depends on is kept alongside it.
pub struct NvmlGpu<'n> {
/// The `Nvml` instance that `device` was obtained from. Only stored, never
/// read by the code shown here (hence the leading underscore).
_nvml: &'static Nvml,
/// The NVML device handle that all management calls are forwarded to.
device: Device<'n>,
}

impl NvmlGpu<'static> {
pub fn init(index: u32) -> Result<Self, ZeusdError> {
// `Device` needs to hold a reference to `Nvml`, meaning that `Nvml` must outlive `Device`.
// We can achieve this by leaking a `Box` containing `Nvml` and holding a reference to it.
// `Nvml` will actually live until the server terminates inside the GPU management task.
let _nvml = Box::leak(Box::new(Nvml::init()?));
let device = _nvml.device_by_index(index)?;
Ok(Self { _nvml, device })
}
}

impl GpuManager for NvmlGpu<'static> {
fn device_count() -> Result<u32, ZeusdError> {
let nvml = Nvml::init()?;
Ok(nvml.device_count()?)
}

#[inline]
fn set_persistent_mode(&mut self, enabled: bool) -> Result<(), ZeusdError> {
Ok(self.device.set_persistent(enabled)?)
}

#[inline]
fn set_power_management_limit(&mut self, power_limit_mw: u32) -> Result<(), ZeusdError> {
Ok(self.device.set_power_management_limit(power_limit_mw)?)
}

#[inline]
fn set_gpu_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError> {
let setting = GpuLockedClocksSetting::Numeric {
min_clock_mhz,
max_clock_mhz,
};
Ok(self.device.set_gpu_locked_clocks(setting)?)
}

#[inline]
fn reset_gpu_locked_clocks(&mut self) -> Result<(), ZeusdError> {
Ok(self.device.reset_gpu_locked_clocks()?)
}

#[inline]
fn set_mem_locked_clocks(
&mut self,
min_clock_mhz: u32,
max_clock_mhz: u32,
) -> Result<(), ZeusdError> {
Ok(self
.device
.set_mem_locked_clocks(min_clock_mhz, max_clock_mhz)?)
}

#[inline]
fn reset_mem_locked_clocks(&mut self) -> Result<(), ZeusdError> {
Ok(self.device.reset_mem_locked_clocks()?)
}
}

/// A request to execute a GPU command.
///
/// This is the type that is sent to the GPU management background task.
Expand Down

0 comments on commit 2cf328e

Please sign in to comment.