spacemeshos · poszu · Feb 16, 2024 · Feb 15, 2024 · Feb 15, 2024 · Feb 15, 2024
diff --git a/scrypt-ocl/src/lib.rs b/scrypt-ocl/src/lib.rs
@@ -1,7 +1,8 @@
 use ocl::{
     builders::ProgramBuilder,
     enums::{DeviceInfo, DeviceInfoResult, KernelWorkGroupInfo, KernelWorkGroupInfoResult},
-    Buffer, Device, DeviceType, Kernel, MemFlags, Platform, ProQue, SpatialDims,
+    Buffer, Context, Device, DeviceType, Event, Kernel, MemFlags, Platform, ProQue, Queue,
+    SpatialDims,
 };
 use post::initialize::{Initialize, VrfNonce, ENTIRE_LABEL_SIZE, LABEL_SIZE};
 use std::{cmp::min, fmt::Display, io::Write, ops::Range};
@@ -155,15 +156,20 @@ impl Scrypter {
             max_mem_alloc_size / 1024 / 1024,
         );
 
+        let context = Context::builder()
+            .platform(platform)
+            .devices(device)
+            .build()?;
+
         let src = include_str!("scrypt-jane.cl");
         let program_builder = ProgramBuilder::new()
             .source(src)
             .cmplr_def("LOOKUP_GAP", LOOKUP_GAP as i32)
             .clone();
 
+        let read_queue = Queue::new(&context, device, None)?;
         let pro_que = ProQue::builder()
-            .platform(platform)
-            .device(device)
+            .context(context)
             .prog_bldr(program_builder)
             .dims(1)
             .build()?;
@@ -203,16 +209,21 @@ impl Scrypter {
         log::info!("Allocating buffer for input: {INPUT_SIZE} bytes");
         let input = Buffer::<u32>::builder()
             .len(INPUT_SIZE / 4)
-            .flags(MemFlags::new().read_only())
+            .flags(MemFlags::new().read_only().host_write_only())
             .queue(pro_que.queue().clone())
             .build()?;
 
         let output_size = global_work_size * ENTIRE_LABEL_SIZE;
         log::info!("Allocating buffer for output: {output_size} bytes");
         let output = Buffer::<u8>::builder()
             .len(output_size)
-            .flags(MemFlags::new().write_only())
-            .queue(pro_que.queue().clone())
+            .flags(
+                MemFlags::new()
+                    .alloc_host_ptr()
+                    .write_only()
+                    .host_read_only(),
+            )
+            .queue(read_queue)
             .build()?;
 
         let lookup_size = global_work_size * kernel_lookup_mem_size;
@@ -255,7 +266,10 @@ impl Scrypter {
         let mut best_nonce = None;
         let labels_end = labels.end;
 
-        for index in labels.step_by(self.global_work_size) {
+        let mut total_kernel_duration = std::time::Duration::ZERO;
+        let mut last_kernel_duration = std::time::Duration::ZERO;
+
+        for (iter, index) in labels.step_by(self.global_work_size).enumerate() {
             self.kernel.set_arg(1, index)?;
 
             let index_end = min(index + self.global_work_size as u64, labels_end);
@@ -271,13 +285,32 @@ impl Scrypter {
             self.kernel
                 .set_default_global_work_size(SpatialDims::One(gws));
 
+            let mut kernel_event = Event::empty();
             unsafe {
-                self.kernel.enq()?;
+                self.kernel.cmd().enew(&mut kernel_event).enq()?;
+            }
+
+            let read_start = std::time::Instant::now();
+            // On some platforms (eg. Nvidia), the read command will spin CPU 100% until the kernel finishes.
+            // Hence we wait a bit before reading the buffer.
+            // The wait time is based on the average kernel duration, with some margin.
+            if iter > 0 {
+                let average = total_kernel_duration.div_f32(iter as f32);
+                let wait = (last_kernel_duration + average).div_f32(2.0).mul_f32(0.9);
+                log::trace!("waiting for kernel to finish for {wait:?}");
+                std::thread::sleep(wait);
             }
 
             let labels_buffer =
                 &mut self.labels_buffer.as_mut_slice()[..labels_to_init * ENTIRE_LABEL_SIZE];
-            self.output.read(labels_buffer.as_mut()).enq()?;
+            self.output
+                .cmd()
+                .ewait(&kernel_event)
+                .read(labels_buffer.as_mut())
+                .enq()?;
+
+            last_kernel_duration = read_start.elapsed();
+            total_kernel_duration += last_kernel_duration;
 
             // Look for VRF nonce if enabled
             // TODO: run in background / in parallel to GPU