Skip to content

Commit

Permalink
WIP: A draft of the pipeline, bearing in mind that use_cpu_from is useful
Browse files Browse the repository at this point in the history
  • Loading branch information
DJMcNab committed Feb 11, 2025
1 parent 40f24b2 commit 02206ff
Show file tree
Hide file tree
Showing 3 changed files with 197 additions and 41 deletions.
116 changes: 75 additions & 41 deletions vello/src/cpu.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use std::ops::ControlFlow;

use vello_encoding::{
BinHeader, BufferSize, BumpAllocators, Clip, ClipBbox, ClipBic, ClipElement, DrawBbox,
DrawMonoid, Encoding, IndirectCount, Layout, LineSoup, Path, PathBbox, PathMonoid, PathSegment,
DrawMonoid, Encoding, IndirectCount, LineSoup, Path, PathBbox, PathMonoid, PathSegment,
RenderConfig, Resolver, SegmentCount, Tile,
};
use vello_shaders::{
Expand Down Expand Up @@ -28,7 +30,7 @@ pub struct Buffer<T: bytemuck::Zeroable + bytemuck::NoUninit> {
}

impl<T: bytemuck::Zeroable + bytemuck::NoUninit> Buffer<T> {
fn to_fit(&mut self, size: BufferSize<T>) -> &mut [T] {
fn fit_slice(&mut self, size: BufferSize<T>) -> &mut [T] {
self.inner
.resize_with(size.len().try_into().expect("32 bit platform"), || {
T::zeroed()
Expand Down Expand Up @@ -67,11 +69,36 @@ pub struct CoarseBuffers {
ptcl: Buffer<u32>,
}

/// The stages of the coarse CPU rasterization pipeline, in execution order.
///
/// NOTE(review): not referenced elsewhere in this diff — presumably intended
/// for `use_cpu_from`-style stage selection (run on CPU up to stage X);
/// confirm before relying on it. The ordering derives make "is this stage
/// before/after the handoff point?" a plain comparison, mirroring how
/// `PipelineStep` is compared in `v2.rs`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum Stages {
    PathTagReduce,
    PathTagScan,
    BboxClear,
    Flatten,
    DrawReduce,
    DrawLeaf,
    ClipReduce,
    ClipLeaf,
    Binning,
    TileAlloc,
    PathCount,
    Backdrop,
    Coarse,
    PathTiling,
}

/// Public entry point for running the coarse pipeline stages on the CPU.
///
/// Thin wrapper over `run_coarse_cpu_internal`; the internal driver reports
/// an early exit through `ControlFlow`, which this entry point has no use
/// for and deliberately discards.
pub fn run_coarse_cpu(
    params: &RenderParams,
    buffers: &mut CoarseBuffers,
    cpu_config: &RenderConfig,
) {
    let _flow = run_coarse_cpu_internal(params, buffers, cpu_config);
}

fn run_coarse_cpu_internal(
params: &RenderParams,
buffers: &mut CoarseBuffers,
cpu_config: &RenderConfig,
) -> ControlFlow<()> {
let packed = &mut buffers.packed;

// HACK: The coarse workgroup counts is the number of active bins.
Expand All @@ -90,47 +117,52 @@ pub fn run_coarse_cpu(
let buffer_sizes = &cpu_config.buffer_sizes;
let wg_counts = &cpu_config.workgroup_counts;

// Some buffers are marked as "write-only". This means that they will be performant to
// write into https://docs.rs/wgpu/latest/wgpu/struct.BufferViewMut.html directly (not yet set up)

// TODO: This is an alignment hazard, which just happens to work on mainstream platforms
// Maybe don't merge as-is?
let scene_buf = bytemuck::cast_slice(packed);
let config_buf = cpu_config.gpu;
let info_bin_data_buf = buffers.bin_data.to_fit(buffer_sizes.bin_data);
let tile_buf = buffers.tiles.to_fit(buffer_sizes.tiles);
let segments_buf = buffers.segments.to_fit(buffer_sizes.segments);
let info_bin_data_buf = buffers.bin_data.fit_slice(buffer_sizes.bin_data);
let tile_buf = buffers.tiles.fit_slice(buffer_sizes.tiles);
// Write-only
let segments_buf = buffers.segments.fit_slice(buffer_sizes.segments);
// Write-only
let ptcl_buf = buffers.ptcl.fit_slice(buffer_sizes.ptcl);

let ptcl_buf = buffers.ptcl.to_fit(buffer_sizes.ptcl);
let reduced_buf = buffers.path_reduced.to_fit(buffer_sizes.path_reduced);
let reduced_buf = buffers.path_reduced.fit_slice(buffer_sizes.path_reduced);

pathtag_reduce_main(wg_counts.path_reduce.0, &config_buf, scene_buf, reduced_buf);

let tagmonoid_buf = buffers.path_monoids.to_fit(buffer_sizes.path_monoids);
let tagmonoid_buf = buffers.path_monoids.fit_slice(buffer_sizes.path_monoids);

pathtag_scan_main(
wg_counts.path_scan.0,
&config_buf,
scene_buf,
reduced_buf,
&*reduced_buf,
tagmonoid_buf,
);

// Could re-use `reduced_buf` from this point

let path_bbox_buf = buffers.path_bboxes.to_fit(buffer_sizes.path_bboxes);
let path_bbox_buf = buffers.path_bboxes.fit_slice(buffer_sizes.path_bboxes);

bbox_clear_main(&config_buf, path_bbox_buf);
let bump_buf = &mut buffers.bump_alloc;
let lines_buf = buffers.lines.to_fit(buffer_sizes.lines);
let lines_buf = buffers.lines.fit_slice(buffer_sizes.lines);
flatten_main(
wg_counts.flatten.0,
&config_buf,
scene_buf,
tagmonoid_buf,
&*tagmonoid_buf,
path_bbox_buf,
bump_buf,
lines_buf,
);

let draw_reduced_buf = buffers.draw_reduced.to_fit(buffer_sizes.draw_reduced);
let draw_reduced_buf = buffers.draw_reduced.fit_slice(buffer_sizes.draw_reduced);

draw_reduce_main(
wg_counts.draw_reduce.0,
Expand All @@ -139,58 +171,58 @@ pub fn run_coarse_cpu(
draw_reduced_buf,
);

let draw_monoid_buf = buffers.draw_monoids.to_fit(buffer_sizes.draw_monoids);
let clip_inp_buf = buffers.clip_inps.to_fit(buffer_sizes.clip_inps);
let draw_monoid_buf = buffers.draw_monoids.fit_slice(buffer_sizes.draw_monoids);
let clip_inp_buf = buffers.clip_inps.fit_slice(buffer_sizes.clip_inps);
draw_leaf_main(
wg_counts.draw_leaf.0,
&config_buf,
scene_buf,
draw_reduced_buf,
path_bbox_buf,
&*draw_reduced_buf,
&*path_bbox_buf,
draw_monoid_buf,
info_bin_data_buf,
clip_inp_buf,
);

// Could re-use `draw_reduced_buf` from this point

let clip_el_buf = buffers.clip_els.to_fit(buffer_sizes.clip_els);
let clip_el_buf = buffers.clip_els.fit_slice(buffer_sizes.clip_els);

let clip_bic_buf = buffers.clip_bics.to_fit(buffer_sizes.clip_bics);
let clip_bic_buf = buffers.clip_bics.fit_slice(buffer_sizes.clip_bics);

if wg_counts.clip_reduce.0 > 0 {
clip_reduce_main(
wg_counts.clip_reduce.0,
clip_inp_buf,
path_bbox_buf,
&*clip_inp_buf,
&*path_bbox_buf,
clip_bic_buf,
clip_el_buf,
);
}
let clip_bbox_buf = buffers.clip_bboxes.to_fit(buffer_sizes.clip_bboxes);
let clip_bbox_buf = buffers.clip_bboxes.fit_slice(buffer_sizes.clip_bboxes);

if wg_counts.clip_leaf.0 > 0 {
clip_leaf_main(
&config_buf,
clip_inp_buf,
path_bbox_buf,
&*path_bbox_buf,
draw_monoid_buf,
clip_bbox_buf,
);
}

// Could re-use `clip_inp_buf`, `clip_bic_buf`, and `clip_el_buf` from this point

let draw_bbox_buf = buffers.draw_bboxes.to_fit(buffer_sizes.draw_bboxes);
let draw_bbox_buf = buffers.draw_bboxes.fit_slice(buffer_sizes.draw_bboxes);

let bin_header_buf = buffers.bin_headers.to_fit(buffer_sizes.bin_headers);
let bin_header_buf = buffers.bin_headers.fit_slice(buffer_sizes.bin_headers);

binning_main(
wg_counts.binning.0,
&config_buf,
draw_monoid_buf,
path_bbox_buf,
clip_bbox_buf,
&*draw_monoid_buf,
&*path_bbox_buf,
&*clip_bbox_buf,
draw_bbox_buf,
bump_buf,
info_bin_data_buf,
Expand All @@ -202,11 +234,11 @@ pub fn run_coarse_cpu(
// TODO: What does this comment mean?
// Note: this only needs to be rounded up because of the workaround to store the tile_offset
// in storage rather than workgroup memory.
let path_buf = buffers.paths.to_fit(buffer_sizes.paths);
let path_buf = buffers.paths.fit_slice(buffer_sizes.paths);
tile_alloc_main(
&config_buf,
scene_buf,
draw_bbox_buf,
&*draw_bbox_buf,
bump_buf,
path_buf,
tile_buf,
Expand All @@ -218,36 +250,38 @@ pub fn run_coarse_cpu(

path_count_setup_main(bump_buf, &mut indirect_count_buf);

let seg_counts_buf = buffers.seg_counts.to_fit(buffer_sizes.seg_counts);
path_count_main(bump_buf, lines_buf, path_buf, tile_buf, seg_counts_buf);
let seg_counts_buf = buffers.seg_counts.fit_slice(buffer_sizes.seg_counts);
path_count_main(bump_buf, &*lines_buf, &*path_buf, tile_buf, seg_counts_buf);

backdrop_main(&config_buf, bump_buf, path_buf, tile_buf);
backdrop_main(&config_buf, &*bump_buf, &*path_buf, tile_buf);

coarse_main(
&config_buf,
scene_buf,
draw_monoid_buf,
bin_header_buf,
info_bin_data_buf,
path_buf,
&*draw_monoid_buf,
&*bin_header_buf,
&*info_bin_data_buf,
&*path_buf,
tile_buf,
bump_buf,
ptcl_buf,
);

// TODO: Remove
path_tiling_setup_main(
bump_buf,
&mut indirect_count_buf, /* ptcl_buf (for forwarding errors to fine)*/
);

path_tiling_main(
bump_buf,
seg_counts_buf,
lines_buf,
path_buf,
tile_buf,
&*seg_counts_buf,
&*lines_buf,
&*path_buf,
&*tile_buf,
segments_buf,
);
ControlFlow::Continue(())
}

pub fn render_to_texture(
Expand Down
1 change: 1 addition & 0 deletions vello/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ mod shaders;
pub mod cpu;
#[cfg(feature = "wgpu")]
pub mod util;
pub mod v2;
#[cfg(feature = "wgpu")]
mod wgpu_engine;

Expand Down
121 changes: 121 additions & 0 deletions vello/src/v2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

// Thinking about it: what do we need?

// Use case: Runs on demand?
// 1) CPU then GPU

// Use case: Debug stages
// 1) Run single stage with fixed input
// 2) Maybe generate inputs on CPU first
// 3) Download results

use std::ops::ControlFlow;

/// Controls how far the pipeline runs on the CPU before handing off to the GPU.
pub struct CpuSteps {
    // Last step executed on the CPU; later steps break out of the CPU
    // pipeline (compared in `start_stage`).
    end_cpu_after: PipelineStep,
    // Whether stages actually execute work, as opposed to only recording
    // their buffer-access pattern — NOTE(review): inferred from how
    // `Buffer::read`/`write` branch on `StepMeta::run`; confirm.
    run: bool,
}

/// Per-stage metadata handed out by `CpuSteps::start_stage`.
#[derive(Clone, Copy)]
struct StepMeta {
    // Copied from `CpuSteps::run`: whether the stage should do real work.
    run: bool,
}

/// Ordered identifiers for the modelled pipeline's stages.
///
/// The ordering derives are load-bearing: `CpuSteps::start_stage` compares a
/// step against `end_cpu_after` to decide where the CPU portion ends.
/// `Debug` is added so steps can appear in diagnostics and assertions.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Eq, Ord)]
enum PipelineStep {
    One,
    Two,
}

impl CpuSteps {
    /// Begins `step`, deciding whether the CPU pipeline should keep going.
    ///
    /// Yields the per-step metadata while `step` is at or before the
    /// configured handoff point (`end_cpu_after`); once past it, returns
    /// `Break` so `?` in the callers unwinds the rest of the CPU pipeline.
    fn start_stage(&mut self, step: PipelineStep) -> ControlFlow<(), StepMeta> {
        if step <= self.end_cpu_after {
            ControlFlow::Continue(StepMeta { run: self.run })
        } else {
            // Past the final CPU step: hand control back to the caller.
            ControlFlow::Break(())
        }
    }
}
// A pipeline buffer with a CPU-side copy plus GPU-side resources.
//
// The `*_count` fields appear to be populated by a counting pass
// (`StepMeta::run == false`) and the `remaining_*` fields consumed by the
// executing pass — NOTE(review): inferred from the increments/decrements in
// the `impl` below; confirm against the eventual driver code.
struct Buffer<T> {
    // Accesses recorded while only counting (run == false).
    cpu_write_count: u16,
    cpu_read_count: u16,
    // Accesses still expected while actually executing (run == true).
    remaining_writes_cpu: u16,
    remaining_reads_cpu: u16,
    // CPU-side contents served to stages by `read`/`write`/`read_write`.
    cpu_content: Vec<T>,
    staging_buffer: wgpu::Buffer,
    // Only touched by a commented-out sketch in `write`; unused so far.
    staging_written: bool,

    gpu_written: bool,
    gpu_buffer: wgpu::Buffer,
    // NOTE(review): not read anywhere in this file — presumably for
    // in-flight staging uploads; confirm.
    staging_queue: Vec<wgpu::Buffer>,
}
impl<T> Buffer<T> {
    /// Borrows the CPU contents for reading.
    ///
    /// When the stage actually runs (`stage.run`), one expected read is
    /// consumed from `remaining_reads_cpu`. Otherwise this is a counting
    /// pass: the access is recorded in `cpu_read_count` and an empty slice
    /// is returned, so the caller has nothing to operate on.
    ///
    /// NOTE(review): `remaining_reads_cpu -= 1` will panic on underflow in
    /// debug builds if a run performs more reads than were counted — confirm
    /// that mismatch is meant to be a hard error.
    fn read(&mut self, stage: StepMeta) -> &[T] {
        if stage.run {
            self.remaining_reads_cpu -= 1;
            &self.cpu_content
        } else {
            self.cpu_read_count += 1;
            &[]
        }
    }
    /// Borrows the CPU contents for writing. Mirrors `read`: records the
    /// access when counting, consumes one expected write when running.
    fn write(&mut self, stage: StepMeta) -> &mut [T] {
        if stage.run {
            self.remaining_writes_cpu -= 1;
            if self.remaining_reads_cpu == 0 && self.remaining_writes_cpu == 0 {
                // Sketch of a planned optimisation: on the last CPU access,
                // write straight into the mapped staging buffer instead of
                // `cpu_content` (kept commented out in this draft).
                // self.staging_written = true;
                // return self
                //     .staging_buffer
                //     .slice(..)
                //     .get_mapped_range_mut()
                //     .deref_mut();
            }
            &mut self.cpu_content
        } else {
            self.cpu_write_count += 1;
            &mut []
        }
    }
    /// Borrows for combined read+write; counts (or consumes) both one read
    /// and one write in the respective pass.
    fn read_write(&mut self, stage: StepMeta) -> &mut [T] {
        if stage.run {
            self.remaining_reads_cpu -= 1;
            self.remaining_writes_cpu -= 1;
            &mut self.cpu_content
        } else {
            self.cpu_write_count += 1;
            self.cpu_read_count += 1;
            &mut []
        }
    }
}

// The set of buffers flowing through the modelled pipeline.
// NOTE(review): `a`/`b`/`c` look like stand-ins for the real vello buffers
// while this module is a sketch — confirm before building on the names.
struct Buffers {
    a: Buffer<u8>,
    b: Buffer<u16>,
    c: Buffer<u16>,
}

/// Miniature model of the full pipeline driver.
///
/// Drives each stage in order, stopping (via `?`) as soon as one reports
/// that the CPU portion of the pipeline has ended.
///
/// NOTE(review): all three slots currently invoke stage 1 — presumably a
/// placeholder until more stages exist; confirm.
pub fn tiny_pipeline_model(mut stages: CpuSteps, buffers: &mut Buffers) -> ControlFlow<()> {
    for _ in 0..3 {
        cpu_stage_1(&mut stages, buffers)?;
    }
    ControlFlow::Continue(())
}

/// Runs (or merely accounts for) stage 1 of the modelled pipeline.
fn cpu_stage_1(stages: &mut CpuSteps, buffers: &mut Buffers) -> ControlFlow<()> {
    // Propagates `Break` upward if the CPU portion ended before this step.
    let meta = stages.start_stage(PipelineStep::One)?;
    // Declare every buffer access up front. When `meta.run` is false these
    // calls only record the access pattern and hand back empty slices.
    let src = buffers.a.read(meta);
    let dst = buffers.b.write(meta);
    let scratch = buffers.c.read_write(meta);
    if meta.run {
        stage_1::stage_1(src, &*dst, scratch);
    }
    ControlFlow::Continue(())
}

/// Placeholder module for the real stage-1 kernel.
mod stage_1 {
    /// Stub implementation: currently a no-op.
    ///
    /// Parameters are underscore-prefixed so the stub compiles without
    /// unused-variable warnings until the body is written.
    pub fn stage_1(_a: &[u8], _b: &[u16], _c: &mut [u16]) {
        // TODO: implement the actual stage-1 computation.
    }
}

0 comments on commit 02206ff

Please sign in to comment.