|
| 1 | +// evmone: Fast Ethereum Virtual Machine implementation |
| 2 | +// Copyright 2020 The evmone Authors. |
| 3 | +// SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
| 5 | +#include "synthetic_benchmarks.hpp" |
| 6 | +#include "helpers.hpp" |
| 7 | +#include "test/utils/bytecode.hpp" |
| 8 | +#include <evmc/instructions.h> |
| 9 | +#include <evmone/instruction_traits.hpp> |
| 10 | + |
| 11 | +using namespace benchmark; |
| 12 | + |
| 13 | +namespace evmone::test |
| 14 | +{ |
| 15 | +namespace |
| 16 | +{ |
/// Number of stack slots the benchmarked code may use inside the loop.
/// The loop counter permanently occupies one stack item, hence one less than 1024.
constexpr int stack_limit = 1024 - 1;
| 19 | + |
/// Selects how the generated code uses the EVM stack.
/// The numeric values are printed as part of the benchmark names.
enum class Mode
{
    /// The code keeps the stack as shallow as possible.
    min_stack = 0,

    /// The code fills the stack up to its limit.
    full_stack = 1,
};
| 25 | + |
/// Groups instructions by their EVM stack requirements.
/// The character values are printed as part of the benchmark names.
enum class InstructionCategory : char
{
    /// No-op instruction.
    nop = 'n',

    /// Nullary operator: produces a result without any stack input.
    nullop = 'a',

    /// Unary operator.
    unop = 'u',

    /// Binary operator.
    binop = 'b',

    /// PUSH instruction.
    push = 'p',

    /// DUP instruction.
    dup = 'd',

    /// SWAP instruction.
    swap = 's',

    /// None of the categories above.
    other = 'X',
};
| 38 | + |
| 39 | +constexpr InstructionCategory get_instruction_category(evmc_opcode opcode) noexcept |
| 40 | +{ |
| 41 | + const auto trait = instr::traits[opcode]; |
| 42 | + if (opcode >= OP_PUSH1 && opcode <= OP_PUSH32) |
| 43 | + return InstructionCategory::push; |
| 44 | + else if (opcode >= OP_SWAP1 && opcode <= OP_SWAP16) |
| 45 | + return InstructionCategory::swap; |
| 46 | + else if (opcode >= OP_DUP1 && opcode <= OP_DUP16) |
| 47 | + return InstructionCategory::dup; |
| 48 | + else if (trait.stack_height_required == 0 && trait.stack_height_change == 0) |
| 49 | + return InstructionCategory::nop; |
| 50 | + else if (trait.stack_height_required == 0 && trait.stack_height_change == 1) |
| 51 | + return InstructionCategory::nullop; |
| 52 | + else if (trait.stack_height_required == 1 && trait.stack_height_change == 0) |
| 53 | + return InstructionCategory::unop; |
| 54 | + else if (trait.stack_height_required == 2 && trait.stack_height_change == -1) |
| 55 | + return InstructionCategory::binop; |
| 56 | + else |
| 57 | + return InstructionCategory::other; |
| 58 | +} |
| 59 | + |
| 60 | +struct CodeParams |
| 61 | +{ |
| 62 | + evmc_opcode opcode; |
| 63 | + Mode mode; |
| 64 | +}; |
| 65 | + |
| 66 | +/// The less-than comparison operator. Needed for std::map. |
| 67 | +[[maybe_unused]] inline constexpr bool operator<(const CodeParams& a, const CodeParams& b) noexcept |
| 68 | +{ |
| 69 | + return std::tuple(a.opcode, a.mode) < std::tuple(b.opcode, b.mode); |
| 70 | +} |
| 71 | + |
| 72 | +std::string to_string(const CodeParams& params) |
| 73 | +{ |
| 74 | + return std::string{instr::traits[params.opcode].name} + '/' + |
| 75 | + static_cast<char>(get_instruction_category(params.opcode)) + |
| 76 | + std::to_string(static_cast<int>(params.mode)); |
| 77 | +} |
| 78 | + |
| 79 | +/// Generates the EVM benchmark loop inner code for the given opcode and "mode". |
| 80 | +bytecode generate_loop_inner_code(CodeParams params) |
| 81 | +{ |
| 82 | + const auto [opcode, mode] = params; |
| 83 | + const auto category = get_instruction_category(opcode); |
| 84 | + switch (mode) |
| 85 | + { |
| 86 | + case Mode::min_stack: |
| 87 | + switch (category) |
| 88 | + { |
| 89 | + case InstructionCategory::nop: |
| 90 | + // JUMPDEST JUMPDEST ... |
| 91 | + return stack_limit * 2 * bytecode{opcode}; |
| 92 | + |
| 93 | + case InstructionCategory::nullop: |
| 94 | + // CALLER POP CALLER POP ... |
| 95 | + return stack_limit * (bytecode{opcode} + OP_POP); |
| 96 | + |
| 97 | + case InstructionCategory::unop: |
| 98 | + // DUP1 NOT NOT ... POP |
| 99 | + return OP_DUP1 + stack_limit * 2 * bytecode{opcode} + OP_POP; |
| 100 | + |
| 101 | + case InstructionCategory::binop: |
| 102 | + // DUP1 DUP1 ADD DUP1 ADD DUP1 ADD ... POP |
| 103 | + return OP_DUP1 + (stack_limit - 1) * (OP_DUP1 + bytecode{opcode}) + OP_POP; |
| 104 | + |
| 105 | + case InstructionCategory::push: |
| 106 | + // PUSH1 POP PUSH1 POP ... |
| 107 | + return stack_limit * (push(opcode, {}) + OP_POP); |
| 108 | + |
| 109 | + case InstructionCategory::dup: |
| 110 | + { |
| 111 | + // The required n stack height for DUPn is provided by |
| 112 | + // duplicating the loop counter n-1 times with DUP1. |
| 113 | + const auto n = opcode - OP_DUP1 + 1; |
| 114 | + // DUP1 ... DUPn POP DUPn POP ... POP ... |
| 115 | + // \ n-1 / \ n-1 / |
| 116 | + return (n - 1) * OP_DUP1 + // Required n stack height. |
| 117 | + (stack_limit - (n - 1)) * // |
| 118 | + (bytecode{opcode} + OP_POP) + // Multiple DUPn POP pairs. |
| 119 | + (n - 1) * OP_POP; // Pop initially duplicated values. |
| 120 | + } |
| 121 | + |
| 122 | + case InstructionCategory::swap: |
| 123 | + { |
| 124 | + // The required n+1 stack height for SWAPn is provided by duplicating the loop counter |
| 125 | + // n times with DUP1. This also guarantees the loop counter remains unchanged because |
| 126 | + // it is always going to be swapped to the same value. |
| 127 | + const auto n = opcode - OP_SWAP1 + 1; |
| 128 | + // DUP1 ... SWAPn SWAPn ... POP ... |
| 129 | + // \ n / \ n / |
| 130 | + return n * OP_DUP1 + // Required n+1 stack height. |
| 131 | + stack_limit * 2 * bytecode{opcode} + // Multiple SWAPns. |
| 132 | + n * OP_POP; // Pop initially duplicated values. |
| 133 | + } |
| 134 | + |
| 135 | + default: |
| 136 | + break; |
| 137 | + } |
| 138 | + break; |
| 139 | + |
| 140 | + case Mode::full_stack: |
| 141 | + switch (category) |
| 142 | + { |
| 143 | + case InstructionCategory::nullop: |
| 144 | + // CALLER CALLER ... POP POP ... |
| 145 | + return stack_limit * opcode + stack_limit * OP_POP; |
| 146 | + |
| 147 | + case InstructionCategory::binop: |
| 148 | + // DUP1 DUP1 DUP1 ... ADD ADD ADD ... POP |
| 149 | + return stack_limit * OP_DUP1 + (stack_limit - 1) * opcode + OP_POP; |
| 150 | + |
| 151 | + case InstructionCategory::push: |
| 152 | + // PUSH1 PUSH1 PUSH1 ... POP POP POP ... |
| 153 | + return stack_limit * push(opcode, {}) + stack_limit * OP_POP; |
| 154 | + |
| 155 | + case InstructionCategory::dup: |
| 156 | + { |
| 157 | + // The required initial n stack height for DUPn is provided by |
| 158 | + // duplicating the loop counter n-1 times with DUP1. |
| 159 | + const auto n = opcode - OP_DUP1 + 1; |
| 160 | + // DUP1 ... DUPn DUPn ... POP POP ... |
| 161 | + // \ n-1 / \ S-(n-1) / \ S / |
| 162 | + return (n - 1) * OP_DUP1 + // Required n stack height. |
| 163 | + (stack_limit - (n - 1)) * bytecode{opcode} + // Fill the stack with DUPn. |
| 164 | + stack_limit * OP_POP; // Clear whole stack. |
| 165 | + } |
| 166 | + |
| 167 | + default: |
| 168 | + break; |
| 169 | + } |
| 170 | + break; |
| 171 | + } |
| 172 | + |
| 173 | + return {}; |
| 174 | +} |
| 175 | + |
| 176 | +/// Generates a benchmark loop with given inner code. |
| 177 | +/// |
| 178 | +/// This generates do-while loop with 255 iterations and it starts with PUSH1 of 255 as the loop |
| 179 | +/// counter. The while check is done as `(counter += -1) != 0`. The SUB is avoided because it |
| 180 | +/// consumes arguments in unnatural order and additional SWAP would be required. |
| 181 | +/// |
| 182 | +/// The loop counter stays on the stack top. The inner code is allowed to duplicate it, but must not |
| 183 | +/// modify it. |
| 184 | +bytecode generate_loop_v1(const bytecode& inner_code) |
| 185 | +{ |
| 186 | + const auto counter = push(255); |
| 187 | + const auto jumpdest_offset = counter.size(); |
| 188 | + return counter + OP_JUMPDEST + inner_code + // loop label + inner code |
| 189 | + push("ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff") + // -1 |
| 190 | + OP_ADD + OP_DUP1 + // counter += (-1) |
| 191 | + push(jumpdest_offset) + OP_JUMPI; // jump to jumpdest_offset if counter != 0 |
| 192 | +} |
| 193 | + |
| 194 | +/// Generates a benchmark loop with given inner code. |
| 195 | +/// |
| 196 | +/// This is improved variant of v1. It has exactly the same instructions and consumes the same |
| 197 | +/// amount of gas, but according to performed benchmarks (see "loop_v1" and "loop_v2") it runs |
| 198 | +/// faster. And we want the lowest possible loop overhead. |
| 199 | +/// The change is to set the loop counter to -255 and check `(counter += 1) != 0`. |
| 200 | +bytecode generate_loop_v2(const bytecode& inner_code) |
| 201 | +{ |
| 202 | + const auto counter = |
| 203 | + push("ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff01"); // -255 |
| 204 | + const auto jumpdest_offset = counter.size(); |
| 205 | + return counter + OP_JUMPDEST + inner_code + // loop label + inner code |
| 206 | + push(1) + OP_ADD + OP_DUP1 + // counter += 1 |
| 207 | + push(jumpdest_offset) + OP_JUMPI; // jump to jumpdest_offset if counter != 0 |
| 208 | +} |
| 209 | + |
| 210 | +bytes_view generate_code(CodeParams params) |
| 211 | +{ |
| 212 | + static std::map<CodeParams, bytecode> cache; |
| 213 | + |
| 214 | + auto& code = cache[params]; |
| 215 | + if (!code.empty()) |
| 216 | + return code; |
| 217 | + |
| 218 | + code = generate_loop_v2(generate_loop_inner_code(params)); // Cache it. |
| 219 | + return code; |
| 220 | +} |
| 221 | +} // namespace |
| 222 | + |
| 223 | +void register_synthetic_benchmarks() |
| 224 | +{ |
| 225 | + std::vector<CodeParams> params_list; |
| 226 | + |
| 227 | + // Nops & unops. |
| 228 | + for (const auto opcode : {OP_JUMPDEST, OP_ISZERO, OP_NOT}) |
| 229 | + params_list.push_back({opcode, Mode::min_stack}); |
| 230 | + |
| 231 | + // Binops. |
| 232 | + for (const auto opcode : {OP_ADD, OP_MUL, OP_SUB, OP_SIGNEXTEND, OP_LT, OP_GT, OP_SLT, OP_SGT, |
| 233 | + OP_EQ, OP_AND, OP_OR, OP_XOR, OP_BYTE, OP_SHL, OP_SHR, OP_SAR}) |
| 234 | + params_list.insert( |
| 235 | + params_list.end(), {{opcode, Mode::min_stack}, {opcode, Mode::full_stack}}); |
| 236 | + |
| 237 | + // Nullops. |
| 238 | + for (const auto opcode : {OP_ADDRESS, OP_CALLER, OP_CALLVALUE, OP_CALLDATASIZE, OP_CODESIZE, |
| 239 | + OP_RETURNDATASIZE, OP_PC, OP_MSIZE, OP_GAS}) |
| 240 | + params_list.insert( |
| 241 | + params_list.end(), {{opcode, Mode::min_stack}, {opcode, Mode::full_stack}}); |
| 242 | + |
| 243 | + // PUSH. |
| 244 | + for (auto opcode = OP_PUSH1; opcode <= OP_PUSH32; opcode = static_cast<evmc_opcode>(opcode + 1)) |
| 245 | + params_list.insert( |
| 246 | + params_list.end(), {{opcode, Mode::min_stack}, {opcode, Mode::full_stack}}); |
| 247 | + |
| 248 | + // SWAP. |
| 249 | + for (auto opcode = OP_SWAP1; opcode <= OP_SWAP16; opcode = static_cast<evmc_opcode>(opcode + 1)) |
| 250 | + params_list.insert(params_list.end(), {{opcode, Mode::min_stack}}); |
| 251 | + |
| 252 | + // DUP. |
| 253 | + for (auto opcode = OP_DUP1; opcode <= OP_DUP16; opcode = static_cast<evmc_opcode>(opcode + 1)) |
| 254 | + params_list.insert( |
| 255 | + params_list.end(), {{opcode, Mode::min_stack}, {opcode, Mode::full_stack}}); |
| 256 | + |
| 257 | + |
| 258 | + for (auto& [vm_name, vm] : registered_vms) |
| 259 | + { |
| 260 | + RegisterBenchmark((std::string{vm_name} + "/execute/synth/loop_v1").c_str(), |
| 261 | + [&vm = vm](State& state) { execute(state, vm, generate_loop_v1({})); }); |
| 262 | + RegisterBenchmark((std::string{vm_name} + "/execute/synth/loop_v2").c_str(), |
| 263 | + [&vm = vm](State& state) { execute(state, vm, generate_loop_v2({})); }); |
| 264 | + } |
| 265 | + |
| 266 | + for (const auto params : params_list) |
| 267 | + { |
| 268 | + for (auto& [vm_name, vm] : registered_vms) |
| 269 | + { |
| 270 | + RegisterBenchmark( |
| 271 | + (std::string{vm_name} + "/execute/synth/" + to_string(params)).c_str(), |
| 272 | + [&vm = vm, params](State& state) { execute(state, vm, generate_code(params)); }) |
| 273 | + ->Unit(kMicrosecond); |
| 274 | + } |
| 275 | + } |
| 276 | +} |
| 277 | +} // namespace evmone::test |
0 commit comments