Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce cpu load during init #188

Merged
merged 3 commits into from
Feb 16, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 42 additions & 9 deletions scrypt-ocl/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use ocl::{
builders::ProgramBuilder,
enums::{DeviceInfo, DeviceInfoResult, KernelWorkGroupInfo, KernelWorkGroupInfoResult},
Buffer, Device, DeviceType, Kernel, MemFlags, Platform, ProQue, SpatialDims,
Buffer, Context, Device, DeviceType, Event, Kernel, MemFlags, Platform, ProQue, Queue,
SpatialDims,
};
use post::initialize::{Initialize, VrfNonce, ENTIRE_LABEL_SIZE, LABEL_SIZE};
use std::{cmp::min, fmt::Display, io::Write, ops::Range};
Expand Down Expand Up @@ -155,15 +156,20 @@ impl Scrypter {
max_mem_alloc_size / 1024 / 1024,
);

let context = Context::builder()
.platform(platform)
.devices(device)
.build()?;

let src = include_str!("scrypt-jane.cl");
let program_builder = ProgramBuilder::new()
.source(src)
.cmplr_def("LOOKUP_GAP", LOOKUP_GAP as i32)
.clone();

let read_queue = Queue::new(&context, device, None)?;
let pro_que = ProQue::builder()
.platform(platform)
.device(device)
.context(context)
.prog_bldr(program_builder)
.dims(1)
.build()?;
Expand Down Expand Up @@ -203,16 +209,21 @@ impl Scrypter {
log::info!("Allocating buffer for input: {INPUT_SIZE} bytes");
let input = Buffer::<u32>::builder()
.len(INPUT_SIZE / 4)
.flags(MemFlags::new().read_only())
.flags(MemFlags::new().read_only().host_write_only())
.queue(pro_que.queue().clone())
.build()?;

let output_size = global_work_size * ENTIRE_LABEL_SIZE;
log::info!("Allocating buffer for output: {output_size} bytes");
let output = Buffer::<u8>::builder()
.len(output_size)
.flags(MemFlags::new().write_only())
.queue(pro_que.queue().clone())
.flags(
MemFlags::new()
.alloc_host_ptr()
.write_only()
.host_read_only(),
)
.queue(read_queue)
.build()?;

let lookup_size = global_work_size * kernel_lookup_mem_size;
Expand Down Expand Up @@ -255,7 +266,10 @@ impl Scrypter {
let mut best_nonce = None;
let labels_end = labels.end;

for index in labels.step_by(self.global_work_size) {
let mut total_kernel_duration = std::time::Duration::ZERO;
let mut last_kernel_duration = std::time::Duration::ZERO;

for (iter, index) in labels.step_by(self.global_work_size).enumerate() {
self.kernel.set_arg(1, index)?;

let index_end = min(index + self.global_work_size as u64, labels_end);
Expand All @@ -271,13 +285,32 @@ impl Scrypter {
self.kernel
.set_default_global_work_size(SpatialDims::One(gws));

let mut kernel_event = Event::empty();
unsafe {
self.kernel.enq()?;
self.kernel.cmd().enew(&mut kernel_event).enq()?;
}

let read_start = std::time::Instant::now();
// On some platforms (eg. Nvidia), the read command will spin CPU 100% until the kernel finishes.
// Hence we wait a bit before reading the buffer.
// The wait time is based on the average kernel duration, with some margin.
if iter > 0 {
let average = total_kernel_duration.div_f32(iter as f32);
let wait = (last_kernel_duration + average).div_f32(2.0).mul_f32(0.9);
log::trace!("waiting for kernel to finish for {wait:?}");
std::thread::sleep(wait);
}

let labels_buffer =
&mut self.labels_buffer.as_mut_slice()[..labels_to_init * ENTIRE_LABEL_SIZE];
self.output.read(labels_buffer.as_mut()).enq()?;
self.output
.cmd()
.ewait(&kernel_event)
.read(labels_buffer.as_mut())
.enq()?;

last_kernel_duration = read_start.elapsed();
total_kernel_duration += last_kernel_duration;

// Look for VRF nonce if enabled
// TODO: run in background / in parallel to GPU
Expand Down
Loading