Implement robust dynamic memory
This is the core logic for robust dynamic memory. There are changes to both shaders and the driver logic.

On the shader side, failure information is more useful and fine-grained. In particular, it now reports which stage failed and how much memory would have been required to make that stage succeed.
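As a concrete sketch of what the driver sees on readback (the two-word layout, names, and bit values here are illustrative assumptions, not the exact layout in this commit; the STAGE_* bits mirror constants that appear in the shader diffs below):

```rust
// Hypothetical sketch: decoding the failure words the shaders write into the
// memory buffer header. Layout and bit assignments are assumptions.

// Stage bits, mirroring the STAGE_* constants used in the shaders.
pub const STAGE_BINNING: u32 = 0x1;
pub const STAGE_TILE_ALLOC: u32 = 0x2;
pub const STAGE_PATH_COARSE: u32 = 0x4;
pub const STAGE_COARSE: u32 = 0x8;

/// Failure report read back from the memory buffer after a run.
pub struct MemoryReport {
    pub failed_stages: u32, // bitmask of stages whose allocation failed
    pub size_needed: u32,   // bytes that would have let the stage succeed
}

pub fn decode_report(words: &[u32]) -> MemoryReport {
    // Assumed convention: word 0 holds the failed-stage bits, word 1 the
    // high-water mark reported by the shader-side allocator.
    MemoryReport {
        failed_stages: words[0],
        size_needed: words[1],
    }
}
```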

On the driver side, there is a new RenderDriver abstraction which owns command buffers (and associated query pools) and runs the logic to retry and reallocate buffers when necessary. There is also a fairly significant rework of the logic that produces the config block, as that work overlaps the robust memory handling.
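A rough sketch of the retry loop, building on the MemoryReport sketch above; the struct fields and method names are hypothetical stand-ins for the real command-buffer plumbing, and the stubs exist only to make the control flow concrete:

```rust
// Hypothetical sketch of the RenderDriver retry loop. Only the control
// flow is the point; the GPU plumbing is stubbed out.
pub struct RenderDriver {
    pub mem_buf_size: u64,
    pub blend_buf_size: u64,
    // ... command buffers, query pools, bindings ...
}

#[derive(Debug)]
pub struct GpuError(pub String);

impl RenderDriver {
    pub fn run_until_complete(&mut self) -> Result<(), GpuError> {
        loop {
            self.submit_pipeline()?; // record command buffer, submit, wait
            let report = self.read_memory_report()?;
            if report.failed_stages == 0 {
                return Ok(());
            }
            // Grow the buffer to at least the reported requirement. As the
            // loose ends below note, this sizing heuristic could be smarter.
            let new_size = u64::from(report.size_needed).next_power_of_two();
            self.realloc_memory_buffer(new_size)?;
            // Loop and resubmit; with per-stage failure info, a smarter
            // driver could restart from the first failed stage only.
        }
    }

    // Stubs standing in for the real GPU plumbing.
    fn submit_pipeline(&mut self) -> Result<(), GpuError> {
        Ok(())
    }
    fn read_memory_report(&mut self) -> Result<MemoryReport, GpuError> {
        Ok(MemoryReport { failed_stages: 0, size_needed: 0 })
    }
    fn realloc_memory_buffer(&mut self, size: u64) -> Result<(), GpuError> {
        self.mem_buf_size = size;
        Ok(())
    }
}
```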

The RenderDriver abstraction may not stay. It was done this way to minimize code disruption, but arguably it should just be combined with Renderer.

Another change: the GLSL length() method on a buffer requires additional infrastructure (at least on Metal, where it needs a binding of its own), so we now pass the buffer size in as a field in the config.
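In the shaders this shows up as conf.mem_size replacing memory.length() * 4 (visible in the coarse.comp diff below). On the Rust side the config block gains a corresponding field; an illustrative fragment, with the rest of the (assumed) layout elided:

```rust
// Illustrative fragment of the config block. Only mem_size is the point
// here; the other fields and the exact layout are assumptions.
#[repr(C)]
#[derive(Clone, Copy)]
pub struct Config {
    pub mem_size: u32, // size in bytes of the memory buffer
    // ... n_elements, width_in_tiles, per-stage allocation offsets, etc. ...
}
```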

This also moves blend memory to its own buffer. This worked out well because coarse rasterization can simply report the size of the blend buffer, and it can then be reallocated without needing to rerun the pipeline. Previously, blend allocations and ptcl writes were interleaved in coarse rasterization, so a failure of the former required rerunning coarse. This should fix #83 (finally!)
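A sketch of how the driver can exploit that, extending the RenderDriver sketch above (method names again hypothetical): the final value of the blend_offset counter bumped by coarse is exactly the blend memory required, so the driver compares it against the current blend buffer and at worst reruns fine rasterization.

```rust
// Hypothetical sketch: sizing the blend buffer from coarse's atomic counter.
impl RenderDriver {
    pub fn ensure_blend_buffer(&mut self) -> Result<(), GpuError> {
        // Coarse bumps blend_offset with atomicAdd instead of allocating
        // from the shared memory buffer, so its final value is the total
        // blend memory the scene needs.
        let blend_size = u64::from(self.read_blend_offset()?);
        if blend_size > self.blend_buf_size {
            // The ptcl written by coarse stays valid, so only fine
            // rasterization needs to rerun against the new buffer.
            self.realloc_blend_buffer(blend_size)?;
        }
        Ok(())
    }

    // Stubs standing in for the real readback and reallocation.
    fn read_blend_offset(&mut self) -> Result<u32, GpuError> {
        Ok(0)
    }
    fn realloc_blend_buffer(&mut self, size: u64) -> Result<(), GpuError> {
        self.blend_buf_size = size;
        Ok(())
    }
}
```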

There are a few loose ends. The binaries haven't been updated yet (I've been testing with a hand-written test program). Gradients weren't touched, so they still have a fixed-size allocation. And the logic that calculates the new buffer size on allocation failure could be smarter.

Closes #175
raphlinus committed Jul 8, 2022
1 parent b5b75cc commit 5cec8fc
Showing 18 changed files with 866 additions and 515 deletions.
9 changes: 6 additions & 3 deletions piet-gpu/shader/backdrop.comp
@@ -45,12 +45,15 @@ shared Alloc sh_row_alloc[BACKDROP_WG];
 shared uint sh_row_width[BACKDROP_WG];
 
 void main() {
+    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
+        return;
+    }
+
     uint th_ix = gl_LocalInvocationIndex;
     uint element_ix = gl_GlobalInvocationID.x;
 
     // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
-    bool mem_ok = mem_error == NO_ERROR;
     if (gl_LocalInvocationID.y == 0) {
         if (element_ix < conf.n_elements) {
             // Possible TODO: it's not necessary to process backdrops of stroked paths.
@@ -68,7 +71,7 @@ void main() {
                 row_count = 0;
             }
             Alloc path_alloc = new_alloc(
-                path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+                path.tiles.offset, (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
             sh_row_alloc[th_ix] = path_alloc;
         }
         sh_row_count[th_ix] = row_count;
@@ -98,7 +101,7 @@ void main() {
            }
        }
        uint width = sh_row_width[el_ix];
-       if (width > 0 && mem_ok) {
+       if (width > 0) {
            // Process one row sequentially
            // Read backdrop value per tile and prefix sum it
            Alloc tiles_alloc = sh_row_alloc[el_ix];
30 changes: 9 additions & 21 deletions piet-gpu/shader/binning.comp
@@ -32,8 +32,7 @@ layout(set = 0, binding = 1) readonly buffer ConfigBuf {
 // Bitmaps are sliced (256bit into 8 (N_SLICE) 32bit submaps)
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
-shared Alloc sh_chunk_alloc[N_TILE];
-shared bool sh_alloc_failed;
+shared uint sh_chunk_offset[N_TILE];
 
 DrawMonoid load_draw_monoid(uint element_ix) {
     uint base = (conf.drawmonoid_alloc.offset >> 2) + 4 * element_ix;
@@ -84,10 +83,6 @@ void main() {
    for (uint i = 0; i < N_SLICE; i++) {
        bitmaps[i][gl_LocalInvocationID.x] = 0;
    }
-   if (gl_LocalInvocationID.x == 0) {
-       sh_alloc_failed = false;
-   }
-   barrier();
 
    // Read inputs and determine coverage of bins
    uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
@@ -148,26 +143,18 @@ void main() {
        count[i][gl_LocalInvocationID.x] = element_count;
    }
    // element_count is number of elements covering bin for this invocation.
-   Alloc chunk_alloc = new_alloc(0, 0, true);
+   uint chunk_offset = 0;
    if (element_count != 0) {
        // TODO: aggregate atomic adds (subgroup is probably fastest)
-       MallocResult chunk = malloc(element_count * BinInstance_size);
-       chunk_alloc = chunk.alloc;
-       sh_chunk_alloc[gl_LocalInvocationID.x] = chunk_alloc;
-       if (chunk.failed) {
-           sh_alloc_failed = true;
-       }
+       chunk_offset = malloc_stage(element_count * BinInstance_size, conf.mem_size, STAGE_BINNING);
+       sh_chunk_offset[gl_LocalInvocationID.x] = chunk_offset;
    }
    // Note: it might be more efficient for reading to do this in the
    // other order (each bin is a contiguous sequence of partitions)
    uint out_ix = (conf.bin_alloc.offset >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
    write_mem(conf.bin_alloc, out_ix, element_count);
-   write_mem(conf.bin_alloc, out_ix + 1, chunk_alloc.offset);
+   write_mem(conf.bin_alloc, out_ix + 1, chunk_offset);
 
    barrier();
-   if (sh_alloc_failed || mem_error != NO_ERROR) {
-       return;
-   }
 
    // Use similar strategy as Laine & Karras paper; loop over bbox of bins
    // touched by this element
@@ -181,9 +168,10 @@
            if (my_slice > 0) {
                idx += count[my_slice - 1][bin_ix];
            }
-           Alloc out_alloc = sh_chunk_alloc[bin_ix];
-           uint out_offset = out_alloc.offset + idx * BinInstance_size;
-           BinInstance_write(out_alloc, BinInstanceRef(out_offset), BinInstance(element_ix));
+           uint chunk_offset = sh_chunk_offset[bin_ix];
+           if (chunk_offset != MALLOC_FAILED) {
+               memory[(chunk_offset >> 2) + idx] = element_ix;
+           }
        }
        x++;
        if (x == x1) {
147 changes: 79 additions & 68 deletions piet-gpu/shader/coarse.comp
@@ -72,49 +72,62 @@ void write_tile_alloc(uint el_ix, Alloc a) {
 
 Alloc read_tile_alloc(uint el_ix, bool mem_ok) {
     // All memory.
-    return new_alloc(0, memory.length() * 4, mem_ok);
+    return new_alloc(0, conf.mem_size, mem_ok);
 }
 #endif
 
 // The maximum number of commands per annotated element.
 #define ANNO_COMMANDS 2
 
-// Perhaps cmd_alloc should be a global? This is a style question.
-bool alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
+// All writes to the output must be gated by mem_ok.
+bool mem_ok = true;
+
+// Perhaps cmd allocations should be a global? This is a style question.
+void alloc_cmd(inout Alloc cmd_alloc, inout CmdRef cmd_ref, inout uint cmd_limit) {
     if (cmd_ref.offset < cmd_limit) {
-        return true;
+        return;
    }
+    uint new_cmd = malloc_stage(PTCL_INITIAL_ALLOC, conf.mem_size, STAGE_COARSE);
+    if (new_cmd == MALLOC_FAILED) {
+        mem_ok = false;
+    }
-    MallocResult new_cmd = malloc(PTCL_INITIAL_ALLOC);
-    if (new_cmd.failed) {
-        return false;
+    if (mem_ok) {
+        CmdJump jump = CmdJump(new_cmd);
+        Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
    }
-    CmdJump jump = CmdJump(new_cmd.alloc.offset);
-    Cmd_Jump_write(cmd_alloc, cmd_ref, jump);
-    cmd_alloc = new_cmd.alloc;
-    cmd_ref = CmdRef(cmd_alloc.offset);
+    cmd_alloc = new_alloc(new_cmd, PTCL_INITIAL_ALLOC, true);
+    cmd_ref = CmdRef(new_cmd);
    // Reserve space for the maximum number of commands and a potential jump.
-    cmd_limit = cmd_alloc.offset + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
-    return true;
+    cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - (ANNO_COMMANDS + 1) * Cmd_size;
 }
 
 void write_fill(Alloc alloc, inout CmdRef cmd_ref, Tile tile, float linewidth) {
     if (linewidth < 0.0) {
         if (tile.tile.offset != 0) {
             CmdFill cmd_fill = CmdFill(tile.tile.offset, tile.backdrop);
-            Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
+            if (mem_ok) {
+                Cmd_Fill_write(alloc, cmd_ref, cmd_fill);
+            }
             cmd_ref.offset += 4 + CmdFill_size;
         } else {
-            Cmd_Solid_write(alloc, cmd_ref);
+            if (mem_ok) {
+                Cmd_Solid_write(alloc, cmd_ref);
+            }
             cmd_ref.offset += 4;
         }
     } else {
         CmdStroke cmd_stroke = CmdStroke(tile.tile.offset, 0.5 * linewidth);
-        Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
+        if (mem_ok) {
+            Cmd_Stroke_write(alloc, cmd_ref, cmd_stroke);
+        }
         cmd_ref.offset += 4 + CmdStroke_size;
     }
 }
 
 void main() {
+    if (!check_deps(STAGE_BINNING | STAGE_TILE_ALLOC | STAGE_PATH_COARSE)) {
+        return;
+    }
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
     uint width_in_bins = (conf.width_in_tiles + N_TILE_X - 1) / N_TILE_X;
Expand Down Expand Up @@ -161,7 +174,6 @@ void main() {
uint drawtag_start = conf.drawtag_offset >> 2;
uint drawdata_start = conf.drawdata_offset >> 2;
uint drawinfo_start = conf.drawinfo_alloc.offset >> 2;
bool mem_ok = mem_error == NO_ERROR;
while (true) {
for (uint i = 0; i < N_SLICE; i++) {
sh_bitmaps[i][th_ix] = 0;
@@ -176,7 +188,7 @@ void main() {
            uint in_ix = (conf.bin_alloc.offset >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
            count = read_mem(conf.bin_alloc, in_ix);
            uint offset = read_mem(conf.bin_alloc, in_ix + 1);
-           sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, mem_ok);
+           sh_part_elements[th_ix] = new_alloc(offset, count * BinInstance_size, true);
        }
        // prefix sum of counts
        for (uint i = 0; i < LG_N_PART_READ; i++) {
@@ -200,7 +212,7 @@ void main() {
        }
        // use binary search to find element to read
        uint ix = rd_ix + th_ix;
-       if (ix >= wr_ix && ix < ready_ix && mem_ok) {
+       if (ix >= wr_ix && ix < ready_ix) {
            uint part_ix = 0;
            for (uint i = 0; i < LG_N_PART_READ; i++) {
                uint probe = part_ix + (uint(N_PART_READ / 2) >> i);
@@ -257,7 +269,7 @@ void main() {
                uint base = path.tiles.offset - uint(dy * stride + dx) * Tile_size;
                sh_tile_base[th_ix] = base;
                Alloc path_alloc = new_alloc(path.tiles.offset,
-                   (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, mem_ok);
+                   (path.bbox.z - path.bbox.x) * (path.bbox.w - path.bbox.y) * Tile_size, true);
                write_tile_alloc(th_ix, path_alloc);
                break;
            default:
@@ -293,27 +305,25 @@ void main() {
                    uint x = sh_tile_x0[el_ix] + seq_ix % width;
                    uint y = sh_tile_y0[el_ix] + seq_ix / width;
                    bool include_tile = false;
-                   if (mem_ok) {
-                       Tile tile = Tile_read(read_tile_alloc(el_ix, mem_ok),
-                           TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
-                       bool is_clip = (tag & 1) != 0;
-                       // Always include the tile if it contains a path segment.
-                       // For draws, include the tile if it is solid.
-                       // For clips, include the tile if it is empty - this way, logic
-                       // below will suppress the drawing of inner elements.
-                       // For blends, include the tile if
-                       // (blend_mode, composition_mode) != (Normal, SrcOver)
-                       bool is_blend = false;
-                       if (is_clip) {
-                           uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
-                           uint scene_offset = memory[drawmonoid_base + 2];
-                           uint dd = drawdata_start + (scene_offset >> 2);
-                           uint blend = scene[dd];
-                           is_blend = (blend != BlendComp_clip);
-                       }
-                       include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
-                           || is_blend;
-                   }
+                   Tile tile = Tile_read(read_tile_alloc(el_ix, true),
+                       TileRef(sh_tile_base[el_ix] + (sh_tile_stride[el_ix] * y + x) * Tile_size));
+                   bool is_clip = (tag & 1) != 0;
+                   // Always include the tile if it contains a path segment.
+                   // For draws, include the tile if it is solid.
+                   // For clips, include the tile if it is empty - this way, logic
+                   // below will suppress the drawing of inner elements.
+                   // For blends, include the tile if
+                   // (blend_mode, composition_mode) != (Normal, SrcOver)
+                   bool is_blend = false;
+                   if (is_clip) {
+                       uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
+                       uint scene_offset = memory[drawmonoid_base + 2];
+                       uint dd = drawdata_start + (scene_offset >> 2);
+                       uint blend = scene[dd];
+                       is_blend = (blend != BlendComp_clip);
+                   }
+                   include_tile = tile.tile.offset != 0 || (tile.backdrop == 0) == is_clip
+                       || is_blend;
                    if (include_tile) {
                        uint el_slice = el_ix / 32;
                        uint el_mask = 1u << (el_ix & 31);
Expand All @@ -327,7 +337,7 @@ void main() {
// through the draw objects.
uint slice_ix = 0;
uint bitmap = sh_bitmaps[0][th_ix];
while (mem_ok) {
while (true) {
if (bitmap == 0) {
slice_ix++;
if (slice_ix == N_SLICE) {
@@ -347,7 +357,7 @@ void main() {
                    uint drawtag = scene[drawtag_start + element_ix];
 
                    if (clip_zero_depth == 0) {
-                       Tile tile = Tile_read(read_tile_alloc(element_ref_ix, mem_ok),
+                       Tile tile = Tile_read(read_tile_alloc(element_ref_ix, true),
                            TileRef(sh_tile_base[element_ref_ix] +
                                (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                        uint drawmonoid_base = drawmonoid_start + 4 * element_ix;
@@ -358,32 +368,30 @@ void main() {
                        switch (drawtag) {
                        case Drawtag_FillColor:
                            float linewidth = uintBitsToFloat(memory[di]);
-                           if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                               break;
-                           }
+                           alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                            write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                            uint rgba = scene[dd];
-                           Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
+                           if (mem_ok) {
+                               Cmd_Color_write(cmd_alloc, cmd_ref, CmdColor(rgba));
+                           }
                            cmd_ref.offset += 4 + CmdColor_size;
                            break;
                        case Drawtag_FillLinGradient:
-                           if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                               break;
-                           }
+                           alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                            linewidth = uintBitsToFloat(memory[di]);
                            write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                            CmdLinGrad cmd_lin;
                            cmd_lin.index = scene[dd];
                            cmd_lin.line_x = uintBitsToFloat(memory[di + 1]);
                            cmd_lin.line_y = uintBitsToFloat(memory[di + 2]);
                            cmd_lin.line_c = uintBitsToFloat(memory[di + 3]);
-                           Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
+                           if (mem_ok) {
+                               Cmd_LinGrad_write(cmd_alloc, cmd_ref, cmd_lin);
+                           }
                            cmd_ref.offset += 4 + CmdLinGrad_size;
                            break;
                        case Drawtag_FillRadGradient:
-                           if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                               break;
-                           }
+                           alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                            linewidth = uintBitsToFloat(memory[di]);
                            write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                            CmdRadGrad cmd_rad;
@@ -396,29 +404,31 @@ void main() {
                            cmd_rad.c1 = uintBitsToFloat(uvec2(memory[di + 7], memory[di + 8]));
                            cmd_rad.ra = uintBitsToFloat(memory[di + 9]);
                            cmd_rad.roff = uintBitsToFloat(memory[di + 10]);
-                           Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
+                           if (mem_ok) {
+                               Cmd_RadGrad_write(cmd_alloc, cmd_ref, cmd_rad);
+                           }
                            cmd_ref.offset += 4 + CmdRadGrad_size;
                            break;
                        case Drawtag_FillImage:
+                           alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
                            linewidth = uintBitsToFloat(memory[di]);
-                           if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                               break;
-                           }
                            write_fill(cmd_alloc, cmd_ref, tile, linewidth);
                            uint index = scene[dd];
                            uint raw1 = scene[dd + 1];
                            ivec2 offset = ivec2(int(raw1 << 16) >> 16, int(raw1) >> 16);
-                           Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
+                           if (mem_ok) {
+                               Cmd_Image_write(cmd_alloc, cmd_ref, CmdImage(index, offset));
+                           }
                            cmd_ref.offset += 4 + CmdImage_size;
                            break;
                        case Drawtag_BeginClip:
                            if (tile.tile.offset == 0 && tile.backdrop == 0) {
                                clip_zero_depth = clip_depth + 1;
                            } else {
-                               if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                                   break;
+                               alloc_cmd(cmd_alloc, cmd_ref, cmd_limit);
+                               if (mem_ok) {
+                                   Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                                }
-                               Cmd_BeginClip_write(cmd_alloc, cmd_ref);
                                cmd_ref.offset += 4;
                                render_blend_depth++;
                                max_blend_depth = max(max_blend_depth, render_blend_depth);
@@ -427,12 +437,11 @@ void main() {
                            break;
                        case Drawtag_EndClip:
                            clip_depth--;
-                           if (!alloc_cmd(cmd_alloc, cmd_ref, cmd_limit)) {
-                               break;
-                           }
                            write_fill(cmd_alloc, cmd_ref, tile, -1.0);
                            uint blend = scene[dd];
-                           Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
+                           if (mem_ok) {
+                               Cmd_EndClip_write(cmd_alloc, cmd_ref, CmdEndClip(blend));
+                           }
                            cmd_ref.offset += 4 + CmdEndClip_size;
                            render_blend_depth--;
                            break;
@@ -459,11 +468,13 @@ void main() {
                break;
            }
            if (bin_tile_x + tile_x < conf.width_in_tiles && bin_tile_y + tile_y < conf.height_in_tiles) {
-               Cmd_End_write(cmd_alloc, cmd_ref);
+               if (mem_ok) {
+                   Cmd_End_write(cmd_alloc, cmd_ref);
+               }
                if (max_blend_depth > BLEND_STACK_SPLIT) {
                    uint scratch_size = max_blend_depth * TILE_WIDTH_PX * TILE_HEIGHT_PX * CLIP_STATE_SIZE * 4;
-                   MallocResult scratch = malloc(scratch_size);
-                   alloc_write(scratch_alloc, scratch_alloc.offset, scratch.alloc);
+                   uint scratch = atomicAdd(blend_offset, scratch_size);
+                   write_mem(scratch_alloc, scratch_alloc.offset >> 2, scratch);
                }
            }
        }
Binary file added piet-gpu/shader/image.png
