From 5931f4fa85b4a65d4edbb8810f8e767da1143e14 Mon Sep 17 00:00:00 2001 From: Reuben Dunnington <4075241+rdunnington@users.noreply.github.com> Date: Mon, 1 Jul 2024 15:49:13 -0700 Subject: [PATCH] Optimization passes (#55) * New zig build asm command to generate assembly for perf analysis * Gate the debug trace and trap features behind a compile-time option. Most users probably don't care about this and it has a non-trivial impact on perf. * Fix instruction immediates regressing to 32 bytes. When they were changed to be extern structs, the ordering started to matter. Now there's a comptime check to make sure they're always 16 bytes. * Embed param/return count in function instance to avoid an extra pointer hop. * Minor optimization to use @memset to initialize locals since the default values are always 0 anyway. * Some optimization notes for the future. --- build.zig | 18 ++++++---- optimize.md | 13 +++++++ run/main.zig | 10 +++++- src/core.zig | 1 + src/definition.zig | 34 ++++-------------- src/instance.zig | 6 ++-- src/vm_stack.zig | 88 +++++++++++++++++++--------------------------- src/wasi.zig | 8 ++--- test/wasm/main.zig | 5 ++- 9 files changed, 89 insertions(+), 94 deletions(-) create mode 100644 optimize.md diff --git a/build.zig b/build.zig index d2c8563..a4340c6 100644 --- a/build.zig +++ b/build.zig @@ -12,17 +12,19 @@ const ExeOpts = struct { step_name: []const u8, description: []const u8, step_dependencies: ?[]*Build.Step = null, - should_emit_asm: bool = false, + emit_asm_step: ?*Build.Step = null, options: *Build.Step.Options, }; pub fn build(b: *Build) void { - const should_emit_asm = b.option(bool, "asm", "Emit asm for the bytebox binaries") orelse false; - const enable_metering = b.option(bool, "meter", "Enable metering") orelse false; + const enable_debug_trace = b.option(bool, "debug_trace", "Enable debug tracing feature") orelse false; + const enable_debug_trap = b.option(bool, "debug_trap", "Enable debug trap features") orelse false; const 
options = b.addOptions(); options.addOption(bool, "enable_metering", enable_metering); + options.addOption(bool, "enable_debug_trace", enable_debug_trace); + options.addOption(bool, "enable_debug_trap", enable_debug_trap); const target = b.standardTargetOptions(.{}); const optimize = b.standardOptimizeOption(.{}); @@ -45,7 +47,7 @@ pub fn build(b: *Build) void { bytebox_module.addOptions("config", options); - // exe.root_module.addImport(import.name, import.module); + const emit_asm_step: *Build.Step = b.step("asm", "Emit assembly"); const imports = [_]ModuleImport{ .{ .name = "bytebox", .module = bytebox_module }, @@ -57,7 +59,7 @@ pub fn build(b: *Build) void { .root_src = "run/main.zig", .step_name = "run", .description = "Run a wasm program", - .should_emit_asm = should_emit_asm, + .emit_asm_step = emit_asm_step, .options = options, }); @@ -146,7 +148,11 @@ fn buildExeWithRunStep(b: *Build, target: Build.ResolvedTarget, optimize: std.bu } exe.root_module.addOptions("config", opts.options); - // exe.emit_asm = if (opts.should_emit_asm) .emit else .default; + if (opts.emit_asm_step) |asm_step| { + const asm_filename = std.fmt.allocPrint(b.allocator, "{s}.asm", .{opts.exe_name}) catch unreachable; + asm_step.dependOn(&b.addInstallFile(exe.getEmittedAsm(), asm_filename).step); + } + b.installArtifact(exe); if (opts.step_dependencies) |steps| { diff --git a/optimize.md b/optimize.md new file mode 100644 index 0000000..091bd58 --- /dev/null +++ b/optimize.md @@ -0,0 +1,13 @@ +== Failed Optimizations == + +* Giving locals their own stack space separate from values. The idea here was to save + some perf on push/pop of call frames so that we wouldn't have to copy the return values + back to the appropriate place. But since the wasm calling convention is to pass params + via the stack, you'd have to copy them elsewhere anyway, defeating the point of + the optimization, which is to avoid copying values around. + +* Instruction stream. 
Instead of having an array of structs that contain opcode + immediates, + have a byte stream of opcodes and immediates where you don't have to pay for the extra memory + of the immediates if you don't need them. But it turns out that a lot of instructions + use immediates anyway and the overhead of fetching them out of the stream is more + expensive than just paying for the cache hits. Overall memory is diff --git a/run/main.zig b/run/main.zig index 3ed525f..d97524a 100644 --- a/run/main.zig +++ b/run/main.zig @@ -1,5 +1,6 @@ const std = @import("std"); const bytebox = @import("bytebox"); +const config = bytebox.config; const wasi = bytebox.wasi; const Val = bytebox.Val; @@ -105,7 +106,12 @@ fn parseCmdOpts(args: [][]const u8, env_buffer: *std.ArrayList([]const u8), dir_ arg_index += 1; if (getArgSafe(arg_index, args)) |mode_str| { if (bytebox.DebugTrace.parseMode(mode_str)) |mode| { - opts.trace = mode; + if (config.enable_debug_trace == false) { + log.err("Bytebox was not compiled with -Ddebug_trace=true. Enable this compile time flag if you want to enable tracing at runtime.", .{}); + opts.invalid_arg = mode_str; + } else { + opts.trace = mode; + } } else { opts.invalid_arg = mode_str; } @@ -169,6 +175,8 @@ fn printHelp(args: [][]const u8) void { \\ * none (default) \\ * function \\ * instruction + \\ Note that this requires bytebox to be compiled with the flag -Ddebug_trace=true, + \\ which is off by default for performance reasons. 
\\ \\ ; diff --git a/src/core.zig b/src/core.zig index d9d32e1..517a86d 100644 --- a/src/core.zig +++ b/src/core.zig @@ -4,6 +4,7 @@ const def = @import("definition.zig"); const inst = @import("instance.zig"); const vm_stack = @import("vm_stack.zig"); const vm_register = @import("vm_register.zig"); +pub const config = @import("config"); pub const wasi = @import("wasi.zig"); pub const LogLevel = common.LogLevel; diff --git a/src/definition.zig b/src/definition.zig index 672ce26..823e495 100644 --- a/src/definition.zig +++ b/src/definition.zig @@ -809,43 +809,19 @@ pub const TablePairImmediates = extern struct { pub const BlockImmediates = extern struct { block_type: BlockType, + num_returns: u16, block_value: BlockTypeValue, - num_returns: u32, continuation: u32, }; pub const IfImmediates = extern struct { block_type: BlockType, + num_returns: u16, block_value: BlockTypeValue, - num_returns: u32, else_continuation: u32, end_continuation: u32, }; -// const InstructionImmediatesTypes = enum(u8) { -// Void, -// ValType, -// ValueI32, -// ValueF32, -// ValueI64, -// ValueF64, -// ValueVec, -// Index, -// LabelId, -// MemoryOffset, -// MemoryOffsetAndLane, -// Block, -// CallIndirect, -// TablePair, -// If, -// VecShuffle16, -// }; - -pub const AlignedBytes = struct { - bytes: []align(1) const u8, - alignment: usize, -}; - pub const InstructionImmediates = extern union { Void: void, ValType: ValType, @@ -865,6 +841,10 @@ pub const InstructionImmediates = extern union { VecShuffle16: [16]u8, }; +comptime { + std.debug.assert(@sizeOf(InstructionImmediates) == 16); +} + pub const Instruction = struct { opcode: Opcode, immediate: InstructionImmediates, @@ -901,7 +881,7 @@ pub const Instruction = struct { block_value = BlockTypeValue{ .ValType = valtype }; } - const num_returns: u32 = @as(u32, @intCast(block_value.getBlocktypeReturnTypes(block_type, _module).len)); + const num_returns: u16 = @intCast(block_value.getBlocktypeReturnTypes(block_type, _module).len); return 
InstructionImmediates{ .Block = BlockImmediates{ diff --git a/src/instance.zig b/src/instance.zig index bc46bed..c4a0894 100644 --- a/src/instance.zig +++ b/src/instance.zig @@ -3,6 +3,7 @@ const AllocError = std.mem.Allocator.Error; const builtin = @import("builtin"); +const config = @import("config"); const metering = @import("metering.zig"); const common = @import("common.zig"); @@ -70,6 +71,7 @@ pub const DebugTrace = struct { }; pub fn setMode(new_mode: Mode) void { + std.debug.assert(config.enable_debug_trace == true); mode = new_mode; } @@ -86,11 +88,11 @@ pub const DebugTrace = struct { } pub fn shouldTraceFunctions() bool { - return mode == .Function; + return config.enable_debug_trace and mode == .Function; } pub fn shouldTraceInstructions() bool { - return mode == .Instruction; + return config.enable_debug_trace and mode == .Instruction; } pub fn printIndent(indent: u32) void { diff --git a/src/vm_stack.zig b/src/vm_stack.zig index 97f0e7f..3dce792 100644 --- a/src/vm_stack.zig +++ b/src/vm_stack.zig @@ -2,6 +2,8 @@ const std = @import("std"); const builtin = @import("builtin"); const assert = std.debug.assert; +const config = @import("config"); + const common = @import("common.zig"); const StableArray = common.StableArray; @@ -75,7 +77,7 @@ const metering = @import("metering.zig"); const DebugTraceStackVM = struct { fn traceInstruction(instruction_name: []const u8, pc: u32, stack: *const Stack) void { - if (DebugTrace.shouldTraceInstructions()) { + if (config.enable_debug_trace and DebugTrace.shouldTraceInstructions()) { const frame: *const CallFrame = stack.topFrame(); const name_section: *const NameCustomSection = &frame.module_instance.module_def.name_section; const module_name = name_section.getModuleName(); @@ -90,11 +92,9 @@ const FunctionInstance = struct { type_def_index: usize, def_index: usize, instructions_begin: usize, - local_types: std.ArrayList(ValType), - - fn deinit(func: *FunctionInstance) void { - func.local_types.deinit(); - } + 
num_locals: u32, + num_params: u16, + num_returns: u16, }; const Label = struct { @@ -107,7 +107,7 @@ const CallFrame = struct { func: *const FunctionInstance, module_instance: *ModuleInstance, locals: []Val, - num_returns: u32, + num_returns: u16, start_offset_values: u32, start_offset_labels: u16, }; @@ -173,7 +173,7 @@ const Stack = struct { stack.frames.len = opts.max_frames; } - fn checkExhausted(stack: *Stack, extra_values: u32) !void { + fn checkExhausted(stack: *Stack, comptime extra_values: u32) !void { if (stack.num_values + extra_values >= stack.values.len) { return error.TrapStackExhausted; } @@ -311,29 +311,25 @@ const Stack = struct { } } - fn pushFrame(stack: *Stack, func: *const FunctionInstance, module_instance: *ModuleInstance, param_types: []const ValType, all_local_types: []const ValType, num_returns: u32) !void { - const non_param_types: []const ValType = all_local_types[param_types.len..]; - + fn pushFrame(stack: *Stack, func: *const FunctionInstance, module_instance: *ModuleInstance) TrapError!void { // the stack should already be populated with the params to the function, so all that's // left to do is initialize the locals to their default values - const values_index_begin: u32 = stack.num_values - @as(u32, @intCast(param_types.len)); - const values_index_end: u32 = stack.num_values + @as(u32, @intCast(non_param_types.len)); + const values_index_begin: u32 = stack.num_values - func.num_params; + const values_index_end: u32 = stack.num_values + func.num_locals; if (stack.num_frames < stack.frames.len and values_index_end < stack.values.len) { const locals_and_params: []Val = stack.values[values_index_begin..values_index_end]; - var locals = stack.values[stack.num_values..values_index_end]; + const locals = stack.values[stack.num_values..values_index_end]; stack.num_values = values_index_end; - for (non_param_types, 0..) 
|valtype, i| { - locals[i] = Val.default(valtype); - } + @memset(std.mem.sliceAsBytes(locals), 0); stack.frames[stack.num_frames] = CallFrame{ .func = func, .module_instance = module_instance, .locals = locals_and_params, - .num_returns = num_returns, + .num_returns = func.num_returns, .start_offset_values = values_index_begin, .start_offset_labels = stack.num_labels, }; @@ -893,7 +889,7 @@ const InstructionFuncs = struct { } } - fn truncateTo(comptime T: type, value: anytype) !T { + fn truncateTo(comptime T: type, value: anytype) TrapError!T { switch (T) { i32 => {}, u32 => {}, @@ -994,7 +990,7 @@ const InstructionFuncs = struct { return @as(T, @bitCast(value)); } - fn loadArrayFromMem(comptime read_type: type, comptime out_type: type, comptime array_len: usize, store: *Store, offset_from_memarg: usize, offset_from_stack: i32) ![array_len]out_type { + fn loadArrayFromMem(comptime read_type: type, comptime out_type: type, comptime array_len: usize, store: *Store, offset_from_memarg: usize, offset_from_stack: i32) TrapError![array_len]out_type { if (offset_from_stack < 0) { return error.TrapOutOfBoundsMemoryAccess; } @@ -1021,7 +1017,7 @@ const InstructionFuncs = struct { return ret; } - fn storeInMem(value: anytype, stack: *Stack, offset_from_memarg: usize) !void { + fn storeInMem(value: anytype, stack: *Stack, offset_from_memarg: usize) TrapError!void { const offset_from_stack: i64 = stack.popIndexType(); if (offset_from_stack < 0) { return error.TrapOutOfBoundsMemoryAccess; @@ -1055,14 +1051,10 @@ const InstructionFuncs = struct { std.mem.writeInt(write_type, mem[0..byte_count], write_value, .little); } - fn call(pc: u32, stack: *Stack, module_instance: *ModuleInstance, func: *const FunctionInstance) !FuncCallData { - const functype: *const FunctionTypeDefinition = &module_instance.module_def.types.items[func.type_def_index]; - const param_types: []const ValType = functype.getParams(); - const return_types: []const ValType = functype.getReturns(); + fn call(pc: 
u32, stack: *Stack, module_instance: *ModuleInstance, func: *const FunctionInstance) TrapError!FuncCallData { const continuation: u32 = pc + 1; - - try stack.pushFrame(func, module_instance, param_types, func.local_types.items, functype.calcNumReturns()); - try stack.pushLabel(@as(u32, @intCast(return_types.len)), continuation); + try stack.pushFrame(func, module_instance); + try stack.pushLabel(func.num_returns, continuation); DebugTrace.traceFunction(module_instance, stack.num_frames, func.def_index); @@ -1072,7 +1064,7 @@ const InstructionFuncs = struct { }; } - fn callImport(pc: u32, stack: *Stack, func: *const FunctionImport) !FuncCallData { + fn callImport(pc: u32, stack: *Stack, func: *const FunctionImport) TrapError!FuncCallData { switch (func.data) { .Host => |data| { const params_len: u32 = @as(u32, @intCast(data.func_def.getParams().len)); @@ -1596,12 +1588,14 @@ const InstructionFuncs = struct { } } - if (root_stackvm.debug_state) |*debug_state| { - if (debug_state.trap_counter > 0) { - debug_state.trap_counter -= 1; - if (debug_state.trap_counter == 0) { - debug_state.pc = pc; - return error.TrapDebug; + if (config.enable_debug_trap) { + if (root_stackvm.debug_state) |*debug_state| { + if (debug_state.trap_counter > 0) { + debug_state.trap_counter -= 1; + if (debug_state.trap_counter == 0) { + debug_state.pc = pc; + return error.TrapDebug; + } } } } @@ -5258,10 +5252,6 @@ pub const StackVM = struct { pub fn deinit(vm: *VM) void { var self: *StackVM = fromVM(vm); - for (self.functions.items) |*func| { - func.deinit(); - } - self.functions.deinit(); self.stack.deinit(); @@ -5294,16 +5284,13 @@ pub const StackVM = struct { const func_type: *const FunctionTypeDefinition = &module.module_def.types.items[def_func.type_index]; const param_types: []const ValType = func_type.getParams(); - var local_types = std.ArrayList(ValType).init(vm.allocator); - try local_types.ensureTotalCapacity(param_types.len + def_func.locals.items.len); - 
local_types.appendSliceAssumeCapacity(param_types); - local_types.appendSliceAssumeCapacity(def_func.locals.items); - const f = FunctionInstance{ .type_def_index = def_func.type_index, .def_index = @as(u32, @intCast(i)), .instructions_begin = def_func.instructions_begin, - .local_types = local_types, + .num_locals = @intCast(def_func.locals.items.len), + .num_params = @intCast(param_types.len), + .num_returns = @intCast(func_type.getReturns().len), }; try self.functions.append(f); } @@ -5485,12 +5472,9 @@ pub const StackVM = struct { fn invokeInternal(self: *StackVM, module: *ModuleInstance, func_instance_index: usize, params: [*]const Val, returns: [*]Val) !void { const func: FunctionInstance = self.functions.items[func_instance_index]; const func_def: FunctionDefinition = module.module_def.functions.items[func.def_index]; - const func_type: *const FunctionTypeDefinition = &module.module_def.types.items[func.type_def_index]; - const param_types: []const ValType = func_type.getParams(); - const return_types: []const ValType = func_type.getReturns(); - const params_slice = params[0..param_types.len]; - var returns_slice = returns[0..return_types.len]; + const params_slice = params[0..func.num_params]; + var returns_slice = returns[0..func.num_returns]; // Ensure any leftover stack state doesn't pollute this invoke. Can happen if the previous invoke returned an error. 
self.stack.popAll(); @@ -5501,8 +5485,8 @@ pub const StackVM = struct { self.stack.pushValue(v); } - try self.stack.pushFrame(&func, module, param_types, func.local_types.items, func_type.calcNumReturns()); - try self.stack.pushLabel(@as(u32, @intCast(return_types.len)), @intCast(func_def.continuation)); + try self.stack.pushFrame(&func, module); + try self.stack.pushLabel(func.num_returns, @intCast(func_def.continuation)); DebugTrace.traceFunction(module, self.stack.num_frames, func.def_index); diff --git a/src/wasi.zig b/src/wasi.zig index 78ea382..8efa1a2 100644 --- a/src/wasi.zig +++ b/src/wasi.zig @@ -2400,7 +2400,6 @@ fn wasi_path_filestat_get(userdata: ?*anyopaque, module: *ModuleInstance, params if (context.fdLookup(fd_dir_wasi, &errno)) |fd_info| { if (Helpers.getMemorySlice(module, path_mem_offset, path_mem_length, &errno)) |path| { if (context.hasPathAccess(fd_info, path, &errno)) { - if (builtin.os.tag == .windows) { const dir = std.fs.Dir{ .fd = fd_info.fd, @@ -2409,14 +2408,13 @@ fn wasi_path_filestat_get(userdata: ?*anyopaque, module: *ModuleInstance, params defer file.close(); const stat: std.os.wasi.filestat_t = Helpers.filestatGetWindows(file.handle, &errno); - if (errno == .SUCCESS) { - Helpers.writeFilestatToMemory(&stat, filestat_out_mem_offset, module, &errno); - } + if (errno == .SUCCESS) { + Helpers.writeFilestatToMemory(&stat, filestat_out_mem_offset, module, &errno); + } } else |err| { errno = Errno.translateError(err); } } else { - var flags: std.posix.O = .{ .ACCMODE = .RDONLY, }; diff --git a/test/wasm/main.zig b/test/wasm/main.zig index 7847587..ae01183 100644 --- a/test/wasm/main.zig +++ b/test/wasm/main.zig @@ -1,5 +1,6 @@ const std = @import("std"); const bytebox = @import("bytebox"); +const config = bytebox.config; const ValType = bytebox.ValType; const Val = bytebox.Val; const TaggedVal = bytebox.TaggedVal; @@ -1358,7 +1359,9 @@ pub fn main() !void { print("found test filter: {s}\n", .{opts.test_filter_or_null.?}); } else if 
(strcmp("--trace", arg)) { args_index += 1; - if (bytebox.DebugTrace.parseMode(args[args_index])) |mode| { + if (config.enable_debug_trace == false) { + print("Debug tracing must be enabled at compile time -Ddebug_trace=true\n", .{}); + } else if (bytebox.DebugTrace.parseMode(args[args_index])) |mode| { bytebox.DebugTrace.setMode(mode); } else { print("got invalid trace mode '{s}', check help for allowed options", .{args[args_index]});