Skip to content

Commit

Permalink
WIP: A draft of the pipeline, bearing in mind that use_cpu_from is useful
Browse files Browse the repository at this point in the history
  • Loading branch information
DJMcNab committed Feb 11, 2025
1 parent 40f24b2 commit 02206ff
Show file tree
Hide file tree
Showing 3 changed files with 197 additions and 41 deletions.
116 changes: 75 additions & 41 deletions vello/src/cpu.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::ops::ControlFlow;

use vello_encoding::{
BinHeader, BufferSize, BumpAllocators, Clip, ClipBbox, ClipBic, ClipElement, DrawBbox,
DrawMonoid, Encoding, IndirectCount, Layout, LineSoup, Path, PathBbox, PathMonoid, PathSegment,
DrawMonoid, Encoding, IndirectCount, LineSoup, Path, PathBbox, PathMonoid, PathSegment,
RenderConfig, Resolver, SegmentCount, Tile,
};
use vello_shaders::{
Expand Down Expand Up @@ -28,7 +30,7 @@ pub struct Buffer<T: bytemuck::Zeroable + bytemuck::NoUninit> {
}

impl<T: bytemuck::Zeroable + bytemuck::NoUninit> Buffer<T> {
fn to_fit(&mut self, size: BufferSize<T>) -> &mut [T] {
fn fit_slice(&mut self, size: BufferSize<T>) -> &mut [T] {
self.inner
.resize_with(size.len().try_into().expect("32 bit platform"), || {
T::zeroed()
Expand Down Expand Up @@ -67,11 +69,36 @@ pub struct CoarseBuffers {
ptcl: Buffer<u32>,
}

/// The stages of the coarse CPU rasterization pipeline, in execution order.
///
/// NOTE(review): not referenced elsewhere in this diff — presumably intended
/// for `use_cpu_from`-style stage selection (run on CPU up to stage X);
/// confirm before relying on it. The ordering derives make "is this stage
/// before/after the handoff point?" a plain comparison, mirroring how
/// `PipelineStep` is compared in `v2.rs`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Stages {
    PathTagReduce,
    PathTagScan,
    BboxClear,
    Flatten,
    DrawReduce,
    DrawLeaf,
    ClipReduce,
    ClipLeaf,
    Binning,
    TileAlloc,
    PathCount,
    Backdrop,
    Coarse,
    PathTiling,
}

/// Public entry point for running the coarse pipeline stages on the CPU.
///
/// Thin wrapper over `run_coarse_cpu_internal`; the internal driver reports
/// an early exit through `ControlFlow`, which this entry point has no use
/// for and deliberately discards.
pub fn run_coarse_cpu(
    params: &RenderParams,
    buffers: &mut CoarseBuffers,
    cpu_config: &RenderConfig,
) {
    let _flow = run_coarse_cpu_internal(params, buffers, cpu_config);
}

fn run_coarse_cpu_internal(
params: &RenderParams,
buffers: &mut CoarseBuffers,
cpu_config: &RenderConfig,
) -> ControlFlow<()> {
let packed = &mut buffers.packed;

// HACK: The coarse workgroup counts is the number of active bins.
Expand All @@ -90,47 +117,52 @@ pub fn run_coarse_cpu(
let buffer_sizes = &cpu_config.buffer_sizes;
let wg_counts = &cpu_config.workgroup_counts;

// Some buffers are marked as "write-only". This means that they will be performant to
// write into https://docs.rs/wgpu/latest/wgpu/struct.BufferViewMut.html directly (not yet set up)

// TODO: This is an alignment hazard, which just happens to work on mainstream platforms
// Maybe don't merge as-is?
let scene_buf = bytemuck::cast_slice(packed);
let config_buf = cpu_config.gpu;
let info_bin_data_buf = buffers.bin_data.to_fit(buffer_sizes.bin_data);
let tile_buf = buffers.tiles.to_fit(buffer_sizes.tiles);
let segments_buf = buffers.segments.to_fit(buffer_sizes.segments);
let info_bin_data_buf = buffers.bin_data.fit_slice(buffer_sizes.bin_data);
let tile_buf = buffers.tiles.fit_slice(buffer_sizes.tiles);
// Write-only
let segments_buf = buffers.segments.fit_slice(buffer_sizes.segments);
// Write-only
let ptcl_buf = buffers.ptcl.fit_slice(buffer_sizes.ptcl);

let ptcl_buf = buffers.ptcl.to_fit(buffer_sizes.ptcl);
let reduced_buf = buffers.path_reduced.to_fit(buffer_sizes.path_reduced);
let reduced_buf = buffers.path_reduced.fit_slice(buffer_sizes.path_reduced);

pathtag_reduce_main(wg_counts.path_reduce.0, &config_buf, scene_buf, reduced_buf);

let tagmonoid_buf = buffers.path_monoids.to_fit(buffer_sizes.path_monoids);
let tagmonoid_buf = buffers.path_monoids.fit_slice(buffer_sizes.path_monoids);

pathtag_scan_main(
wg_counts.path_scan.0,
&config_buf,
scene_buf,
reduced_buf,
&*reduced_buf,
tagmonoid_buf,
);

// Could re-use `reduced_buf` from this point

let path_bbox_buf = buffers.path_bboxes.to_fit(buffer_sizes.path_bboxes);
let path_bbox_buf = buffers.path_bboxes.fit_slice(buffer_sizes.path_bboxes);

bbox_clear_main(&config_buf, path_bbox_buf);
let bump_buf = &mut buffers.bump_alloc;
let lines_buf = buffers.lines.to_fit(buffer_sizes.lines);
let lines_buf = buffers.lines.fit_slice(buffer_sizes.lines);
flatten_main(
wg_counts.flatten.0,
&config_buf,
scene_buf,
tagmonoid_buf,
&*tagmonoid_buf,
path_bbox_buf,
bump_buf,
lines_buf,
);

let draw_reduced_buf = buffers.draw_reduced.to_fit(buffer_sizes.draw_reduced);
let draw_reduced_buf = buffers.draw_reduced.fit_slice(buffer_sizes.draw_reduced);

draw_reduce_main(
wg_counts.draw_reduce.0,
Expand All @@ -139,58 +171,58 @@ pub fn run_coarse_cpu(
draw_reduced_buf,
);

let draw_monoid_buf = buffers.draw_monoids.to_fit(buffer_sizes.draw_monoids);
let clip_inp_buf = buffers.clip_inps.to_fit(buffer_sizes.clip_inps);
let draw_monoid_buf = buffers.draw_monoids.fit_slice(buffer_sizes.draw_monoids);
let clip_inp_buf = buffers.clip_inps.fit_slice(buffer_sizes.clip_inps);
draw_leaf_main(
wg_counts.draw_leaf.0,
&config_buf,
scene_buf,
draw_reduced_buf,
path_bbox_buf,
&*draw_reduced_buf,
&*path_bbox_buf,
draw_monoid_buf,
info_bin_data_buf,
clip_inp_buf,
);

// Could re-use `draw_reduced_buf` from this point

let clip_el_buf = buffers.clip_els.to_fit(buffer_sizes.clip_els);
let clip_el_buf = buffers.clip_els.fit_slice(buffer_sizes.clip_els);

let clip_bic_buf = buffers.clip_bics.to_fit(buffer_sizes.clip_bics);
let clip_bic_buf = buffers.clip_bics.fit_slice(buffer_sizes.clip_bics);

if wg_counts.clip_reduce.0 > 0 {
clip_reduce_main(
wg_counts.clip_reduce.0,
clip_inp_buf,
path_bbox_buf,
&*clip_inp_buf,
&*path_bbox_buf,
clip_bic_buf,
clip_el_buf,
);
}
let clip_bbox_buf = buffers.clip_bboxes.to_fit(buffer_sizes.clip_bboxes);
let clip_bbox_buf = buffers.clip_bboxes.fit_slice(buffer_sizes.clip_bboxes);

if wg_counts.clip_leaf.0 > 0 {
clip_leaf_main(
&config_buf,
clip_inp_buf,
path_bbox_buf,
&*path_bbox_buf,
draw_monoid_buf,
clip_bbox_buf,
);
}

// Could re-use `clip_inp_buf`, `clip_bic_buf`, and `clip_el_buf` from this point

let draw_bbox_buf = buffers.draw_bboxes.to_fit(buffer_sizes.draw_bboxes);
let draw_bbox_buf = buffers.draw_bboxes.fit_slice(buffer_sizes.draw_bboxes);

let bin_header_buf = buffers.bin_headers.to_fit(buffer_sizes.bin_headers);
let bin_header_buf = buffers.bin_headers.fit_slice(buffer_sizes.bin_headers);

binning_main(
wg_counts.binning.0,
&config_buf,
draw_monoid_buf,
path_bbox_buf,
clip_bbox_buf,
&*draw_monoid_buf,
&*path_bbox_buf,
&*clip_bbox_buf,
draw_bbox_buf,
bump_buf,
info_bin_data_buf,
Expand All @@ -202,11 +234,11 @@ pub fn run_coarse_cpu(
// TODO: What does this comment mean?
// Note: this only needs to be rounded up because of the workaround to store the tile_offset
// in storage rather than workgroup memory.
let path_buf = buffers.paths.to_fit(buffer_sizes.paths);
let path_buf = buffers.paths.fit_slice(buffer_sizes.paths);
tile_alloc_main(
&config_buf,
scene_buf,
draw_bbox_buf,
&*draw_bbox_buf,
bump_buf,
path_buf,
tile_buf,
Expand All @@ -218,36 +250,38 @@ pub fn run_coarse_cpu(

path_count_setup_main(bump_buf, &mut indirect_count_buf);

let seg_counts_buf = buffers.seg_counts.to_fit(buffer_sizes.seg_counts);
path_count_main(bump_buf, lines_buf, path_buf, tile_buf, seg_counts_buf);
let seg_counts_buf = buffers.seg_counts.fit_slice(buffer_sizes.seg_counts);
path_count_main(bump_buf, &*lines_buf, &*path_buf, tile_buf, seg_counts_buf);

backdrop_main(&config_buf, bump_buf, path_buf, tile_buf);
backdrop_main(&config_buf, &*bump_buf, &*path_buf, tile_buf);

coarse_main(
&config_buf,
scene_buf,
draw_monoid_buf,
bin_header_buf,
info_bin_data_buf,
path_buf,
&*draw_monoid_buf,
&*bin_header_buf,
&*info_bin_data_buf,
&*path_buf,
tile_buf,
bump_buf,
ptcl_buf,
);

// TODO: Remove
path_tiling_setup_main(
bump_buf,
&mut indirect_count_buf, /* ptcl_buf (for forwarding errors to fine)*/
);

path_tiling_main(
bump_buf,
seg_counts_buf,
lines_buf,
path_buf,
tile_buf,
&*seg_counts_buf,
&*lines_buf,
&*path_buf,
&*tile_buf,
segments_buf,
);
ControlFlow::Continue(())
}

pub fn render_to_texture(
Expand Down
1 change: 1 addition & 0 deletions vello/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ mod shaders;
pub mod cpu;
#[cfg(feature = "wgpu")]
pub mod util;
pub mod v2;
#[cfg(feature = "wgpu")]
mod wgpu_engine;

Expand Down
121 changes: 121 additions & 0 deletions vello/src/v2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Thinking about it: what do we need?

// Use case: Runs on demand?
// 1) CPU then GPU

// Use case: Debug stages
// 1) Run single stage with fixed input
// 2) Maybe generate inputs on CPU first
// 3) Download results

use std::ops::ControlFlow;

/// Controls how far the pipeline runs on the CPU before handing off to the GPU.
pub struct CpuSteps {
    // Last step executed on the CPU; later steps break out of the CPU
    // pipeline (compared in `start_stage`).
    end_cpu_after: PipelineStep,
    // Whether stages actually execute work, as opposed to only recording
    // their buffer-access pattern — NOTE(review): inferred from how
    // `Buffer::read`/`write` branch on `StepMeta::run`; confirm.
    run: bool,
}

/// Per-stage metadata handed out by `CpuSteps::start_stage`.
#[derive(Clone, Copy)]
struct StepMeta {
    // Copied from `CpuSteps::run`: whether the stage should do real work.
    run: bool,
}

/// Ordered identifiers for the modelled pipeline's stages.
///
/// The ordering derives are load-bearing: `CpuSteps::start_stage` compares a
/// step against `end_cpu_after` to decide where the CPU portion ends.
/// `Debug` is added so steps can appear in diagnostics and assertions.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
enum PipelineStep {
    One,
    Two,
}

impl CpuSteps {
    /// Begins `step`, deciding whether the CPU pipeline should keep going.
    ///
    /// Yields the per-step metadata while `step` is at or before the
    /// configured handoff point (`end_cpu_after`); once past it, returns
    /// `Break` so `?` in the callers unwinds the rest of the CPU pipeline.
    fn start_stage(&mut self, step: PipelineStep) -> ControlFlow<(), StepMeta> {
        if step <= self.end_cpu_after {
            ControlFlow::Continue(StepMeta { run: self.run })
        } else {
            // Past the final CPU step: hand control back to the caller.
            ControlFlow::Break(())
        }
    }
}
// A pipeline buffer with a CPU-side copy plus GPU-side resources.
//
// The `*_count` fields appear to be populated by a counting pass
// (`StepMeta::run == false`) and the `remaining_*` fields consumed by the
// executing pass — NOTE(review): inferred from the increments/decrements in
// the `impl` below; confirm against the eventual driver code.
struct Buffer<T> {
    // Accesses recorded while only counting (run == false).
    cpu_write_count: u16,
    cpu_read_count: u16,
    // Accesses still expected while actually executing (run == true).
    remaining_writes_cpu: u16,
    remaining_reads_cpu: u16,
    // CPU-side contents served to stages by `read`/`write`/`read_write`.
    cpu_content: Vec<T>,
    staging_buffer: wgpu::Buffer,
    // Only touched by a commented-out sketch in `write`; unused so far.
    staging_written: bool,

    gpu_written: bool,
    gpu_buffer: wgpu::Buffer,
    // NOTE(review): not read anywhere in this file — presumably for
    // in-flight staging uploads; confirm.
    staging_queue: Vec<wgpu::Buffer>,
}
impl<T> Buffer<T> {
    /// Borrows the CPU contents for reading.
    ///
    /// When the stage actually runs (`stage.run`), one expected read is
    /// consumed from `remaining_reads_cpu`. Otherwise this is a counting
    /// pass: the access is recorded in `cpu_read_count` and an empty slice
    /// is returned, so the caller has nothing to operate on.
    ///
    /// NOTE(review): `remaining_reads_cpu -= 1` will panic on underflow in
    /// debug builds if a run performs more reads than were counted — confirm
    /// that mismatch is meant to be a hard error.
    fn read(&mut self, stage: StepMeta) -> &[T] {
        if stage.run {
            self.remaining_reads_cpu -= 1;
            &self.cpu_content
        } else {
            self.cpu_read_count += 1;
            &[]
        }
    }
    /// Borrows the CPU contents for writing. Mirrors `read`: records the
    /// access when counting, consumes one expected write when running.
    fn write(&mut self, stage: StepMeta) -> &mut [T] {
        if stage.run {
            self.remaining_writes_cpu -= 1;
            if self.remaining_reads_cpu == 0 && self.remaining_writes_cpu == 0 {
                // Sketch of a planned optimisation: on the last CPU access,
                // write straight into the mapped staging buffer instead of
                // `cpu_content` (kept commented out in this draft).
                // self.staging_written = true;
                // return self
                //     .staging_buffer
                //     .slice(..)
                //     .get_mapped_range_mut()
                //     .deref_mut();
            }
            &mut self.cpu_content
        } else {
            self.cpu_write_count += 1;
            &mut []
        }
    }
    /// Borrows for combined read+write; counts (or consumes) both one read
    /// and one write in the respective pass.
    fn read_write(&mut self, stage: StepMeta) -> &mut [T] {
        if stage.run {
            self.remaining_reads_cpu -= 1;
            self.remaining_writes_cpu -= 1;
            &mut self.cpu_content
        } else {
            self.cpu_write_count += 1;
            self.cpu_read_count += 1;
            &mut []
        }
    }
}

// The set of buffers flowing through the modelled pipeline.
// NOTE(review): `a`/`b`/`c` look like stand-ins for the real vello buffers
// while this module is a sketch — confirm before building on the names.
struct Buffers {
    a: Buffer<u8>,
    b: Buffer<u16>,
    c: Buffer<u16>,
}

/// Miniature model of the full pipeline driver.
///
/// Drives each stage in order, stopping (via `?`) as soon as one reports
/// that the CPU portion of the pipeline has ended.
///
/// NOTE(review): all three slots currently invoke stage 1 — presumably a
/// placeholder until more stages exist; confirm.
pub fn tiny_pipeline_model(mut stages: CpuSteps, buffers: &mut Buffers) -> ControlFlow<()> {
    for _ in 0..3 {
        cpu_stage_1(&mut stages, buffers)?;
    }
    ControlFlow::Continue(())
}

/// Runs (or merely accounts for) stage 1 of the modelled pipeline.
fn cpu_stage_1(stages: &mut CpuSteps, buffers: &mut Buffers) -> ControlFlow<()> {
    // Propagates `Break` upward if the CPU portion ended before this step.
    let meta = stages.start_stage(PipelineStep::One)?;
    // Declare every buffer access up front. When `meta.run` is false these
    // calls only record the access pattern and hand back empty slices.
    let src = buffers.a.read(meta);
    let dst = buffers.b.write(meta);
    let scratch = buffers.c.read_write(meta);
    if meta.run {
        stage_1::stage_1(src, &*dst, scratch);
    }
    ControlFlow::Continue(())
}

/// Placeholder module for the real stage-1 kernel.
mod stage_1 {
    /// Stub implementation: currently a no-op.
    ///
    /// Parameters are underscore-prefixed so the stub compiles without
    /// unused-variable warnings until the body is written.
    pub fn stage_1(_a: &[u8], _b: &[u16], _c: &mut [u16]) {
        // TODO: implement the actual stage-1 computation.
    }
}

0 comments on commit 02206ff

Please sign in to comment.