From a0515cdb94e9e8d5d83084bea8d8f73c451f77f3 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 25 Nov 2023 06:20:27 +0100 Subject: [PATCH 1/8] [rtl] add "ASIC style" register file option providing a full/dedicated hardware reset --- rtl/core/neorv32_cpu_regfile.vhd | 158 +++++++++++++++++++++---------- 1 file changed, 110 insertions(+), 48 deletions(-) diff --git a/rtl/core/neorv32_cpu_regfile.vhd b/rtl/core/neorv32_cpu_regfile.vhd index e8aebb18f..45aa55753 100644 --- a/rtl/core/neorv32_cpu_regfile.vhd +++ b/rtl/core/neorv32_cpu_regfile.vhd @@ -2,16 +2,13 @@ -- # << NEORV32 CPU - General Purpose Data Register File >> # -- # ********************************************************************************************* # -- # Data register file. 32 entries (= 1024 bit) for RV32I ISA (default), 16 entries (= 512 bit) # --- # for RV32E ISA (when RISC-V "E" extension is enabled). # +-- # for RV32E ISA (when RISC-V "E" extension is enabled via "RVE_EN"). # -- # # --- # Register zero (x0) is a "normal" physical register that is set to zero by the CPU control # --- # hardware. This is not required for non-BRAM-based register files where x0 is hardwired to # --- # zero. Set to 'false' in this case. # +-- # By default the register file is coded to infer block RAM (for FPGAs), which does not provide a # +-- # dedicated hardware reset. For ASIC implementation or setup requiring a dedicated hardware # +-- # reset a single-register-based architecture can be enabled via "RST_EN". # -- # # --- # The register file uses synchronous read accesses and a *single* (multiplexed) address port # --- # for writing and reading rd/rs1 and a single read-only port for rs2. Therefore, the whole # --- # register file can be mapped to a single true-dual-port block RAM. A third and a fourth read # --- # port can be optionally enabled. # +-- # A third and a fourth read port can be optionally enabled ("RS3_EN", "RS4_EN"). 
# -- # ********************************************************************************************* # -- # BSD 3-Clause License # -- # # @@ -53,13 +50,15 @@ use neorv32.neorv32_package.all; entity neorv32_cpu_regfile is generic ( - RVE_EN : boolean; -- implement embedded RF extension? + RST_EN : boolean; -- enable dedicated hardware reset ("ASIC style") + RVE_EN : boolean; -- implement embedded RF extension RS3_EN : boolean; -- enable 3rd read port RS4_EN : boolean -- enable 4th read port ); port ( -- global control -- clk_i : in std_ulogic; -- global clock, rising edge + rstn_i : in std_ulogic; -- global reset, low-active, async ctrl_i : in ctrl_bus_t; -- main control bus -- data input -- alu_i : in std_ulogic_vector(XLEN-1 downto 0); -- ALU result @@ -84,13 +83,12 @@ architecture neorv32_cpu_regfile_rtl of neorv32_cpu_regfile is signal reg_file : reg_file_t; -- access -- - signal rf_wdata : std_ulogic_vector(XLEN-1 downto 0); -- write-back data - signal rf_we : std_ulogic; -- write enable - signal rd_zero : std_ulogic; -- writing to x0? - signal opa_addr : std_ulogic_vector(4 downto 0); -- rs1/dst address - signal opb_addr : std_ulogic_vector(4 downto 0); -- rs2 address - signal opc_addr : std_ulogic_vector(4 downto 0); -- rs3 address - signal opd_addr : std_ulogic_vector(4 downto 0); -- rs4 address + signal rf_wdata : std_ulogic_vector(XLEN-1 downto 0); -- write-back data + signal rf_we : std_ulogic; -- write enable + signal rf_we_sel : std_ulogic_vector((2**addr_bits_c)-1 downto 0); -- one-hot write enable + signal rd_zero : std_ulogic; -- writing to x0? 
+ signal opa_addr : std_ulogic_vector(4 downto 0); -- rs1/rd address + signal rs4_addr : std_ulogic_vector(4 downto 0); -- rs4 address begin @@ -108,47 +106,111 @@ begin end process wb_select; - -- Access Logic --------------------------------------------------------------------------- + -- FPGA Register File (no hardware reset) ------------------------------------------------- -- ------------------------------------------------------------------------------------------- - -- access addresses -- - opa_addr <= "00000" when (ctrl_i.rf_zero_we = '1') else -- force rd = zero - ctrl_i.rf_rd when (ctrl_i.rf_wb_en = '1') else -- rd - ctrl_i.rf_rs1; -- rs1 - opb_addr <= ctrl_i.rf_rs2; -- rs2 - opc_addr <= ctrl_i.rf_rs3; -- rs3 - opd_addr <= ctrl_i.ir_funct12(6 downto 5) & ctrl_i.ir_funct3; -- rs4: [26:25] & [14:12]; not RISC-V-standard! + register_file_fpga: + if not RST_EN generate + + -- Register zero (x0) is a "normal" physical register that is set to zero by the CPU control + -- hardware. The register file uses synchronous read accesses and a *single* multiplexed + -- address port for writing and reading rd/rs1 and a single read-only port for rs2. Therefore, + -- the whole register file can be mapped to a single true-dual-port block RAM. 
+ + rd_zero <= '1' when (ctrl_i.rf_rd = "00000") else '0'; + rf_we <= (ctrl_i.rf_wb_en and (not rd_zero)) or ctrl_i.rf_zero_we; -- never write to x0 unless explicitly forced + opa_addr <= "00000" when (ctrl_i.rf_zero_we = '1') else -- force rd = zero + ctrl_i.rf_rd when (ctrl_i.rf_wb_en = '1') else -- rd + ctrl_i.rf_rs1; -- rs1 + + register_file: process(clk_i) + begin + if rising_edge(clk_i) then + if (rf_we = '1') then + reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0)))) <= rf_wdata; + end if; + rs1_o <= reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0)))); + rs2_o <= reg_file(to_integer(unsigned(ctrl_i.rf_rs2(addr_bits_c-1 downto 0)))); + end if; + end process register_file; - -- write enable -- - rd_zero <= '1' when (ctrl_i.rf_rd = "00000") else '0'; - rf_we <= (ctrl_i.rf_wb_en and (not rd_zero)) or ctrl_i.rf_zero_we; -- do not allow writes to x0 unless explicitly forced + end generate; - -- Register File -------------------------------------------------------------------------- + -- ASIC Register File (full hardware reset) ----------------------------------------------- -- ------------------------------------------------------------------------------------------- - register_file: process(clk_i) - begin - if rising_edge(clk_i) then - if (rf_we = '1') then - reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0)))) <= rf_wdata; + register_file_asic: + if RST_EN generate + + -- write enable decoder -- + we_decode: process(ctrl_i) + begin + rf_we_sel <= (others => '0'); + if (ctrl_i.rf_wb_en = '1') then + rf_we_sel(to_integer(unsigned(ctrl_i.rf_rd(addr_bits_c-1 downto 0)))) <= '1'; end if; - rs1_o <= reg_file(to_integer(unsigned(opa_addr(addr_bits_c-1 downto 0)))); - rs2_o <= reg_file(to_integer(unsigned(opb_addr(addr_bits_c-1 downto 0)))); - - -- optional 3rd read port -- - if (RS3_EN = true) then - rs3_o <= reg_file(to_integer(unsigned(opc_addr(addr_bits_c-1 downto 0)))); - else - rs3_o <= (others => '0'); + end process 
we_decode; + + -- individual registers -- + reg_gen: + for i in 1 to (2**addr_bits_c)-1 generate + register_file: process(rstn_i, clk_i) + begin + if (rstn_i = '0') then + reg_file(i) <= (others => '0'); + elsif rising_edge(clk_i) then + if (rf_we_sel(i) = '1') then + reg_file(i) <= rf_wdata; + end if; + end if; + end process register_file; + end generate; + + reg_file(0) <= (others => '0'); -- x0 is hardwired to zero + + rf_read: process(clk_i) + begin + if rising_edge(clk_i) then + rs1_o <= reg_file(to_integer(unsigned(ctrl_i.rf_rs1(addr_bits_c-1 downto 0)))); + rs2_o <= reg_file(to_integer(unsigned(ctrl_i.rf_rs2(addr_bits_c-1 downto 0)))); end if; + end process rf_read; + + end generate; + + + -- Additional Read Ports ------------------------------------------------------------------ + -- ------------------------------------------------------------------------------------------- + rs3_enable: -- optional 3rd read port + if RS3_EN generate + rs3_read: process(clk_i) + begin + if rising_edge(clk_i) then + rs3_o <= reg_file(to_integer(unsigned(ctrl_i.rf_rs3(addr_bits_c-1 downto 0)))); + end if; + end process rs3_read; + end generate; + + rs3_disable: + if not RS3_EN generate + rs3_o <= (others => '0'); + end generate; + - -- optional 4th read port -- - if (RS4_EN = true) then - rs4_o <= reg_file(to_integer(unsigned(opd_addr(addr_bits_c-1 downto 0)))); - else - rs4_o <= (others => '0'); + rs4_enable: -- optional 4th read port + if RS4_EN generate + rs4_read: process(clk_i) + begin + if rising_edge(clk_i) then + rs4_o <= reg_file(to_integer(unsigned(rs4_addr(addr_bits_c-1 downto 0)))); end if; - end if; - end process register_file; + end process rs4_read; + rs4_addr <= ctrl_i.ir_funct12(6 downto 5) & ctrl_i.ir_funct3; -- rs4: [26:25] & [14:12]; not RISC-V-standard! 
+ end generate; + + rs4_disable: + if not RS4_EN generate + rs4_o <= (others => '0'); + end generate; end neorv32_cpu_regfile_rtl; From bd0b90e76b3b89c4e4cf906467a587149c679dbf Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 25 Nov 2023 06:22:14 +0100 Subject: [PATCH 2/8] [rtl] add new top generic "REGFILE_HW_RST" --- rtl/core/neorv32_cpu.vhd | 6 +++++- rtl/core/neorv32_package.vhd | 6 ++---- rtl/core/neorv32_top.vhd | 2 ++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/rtl/core/neorv32_cpu.vhd b/rtl/core/neorv32_cpu.vhd index ae04a37e5..eec2bd35f 100644 --- a/rtl/core/neorv32_cpu.vhd +++ b/rtl/core/neorv32_cpu.vhd @@ -64,6 +64,7 @@ entity neorv32_cpu is -- Tuning Options -- FAST_MUL_EN : boolean; -- use DSPs for M extension's multiplier FAST_SHIFT_EN : boolean; -- use barrel shifter for shift operations + REGFILE_HW_RST : boolean; -- implement full hardware reset for register file -- Physical Memory Protection (PMP) -- PMP_NUM_REGIONS : natural range 0 to 16; -- number of regions (0..16) PMP_MIN_GRANULARITY : natural; -- minimal region granularity in bytes, has to be a power of 2, min 4 bytes @@ -191,6 +192,7 @@ begin -- Tuning Options -- FAST_MUL_EN => FAST_MUL_EN, -- use DSPs for M extension's multiplier FAST_SHIFT_EN => FAST_SHIFT_EN, -- use barrel shifter for shift operations + REGFILE_HW_RST => REGFILE_HW_RST, -- implement full hardware reset for register file -- Physical memory protection (PMP) -- PMP_EN => pmp_enable_c, -- physical memory protection enabled -- Hardware Performance Monitors (HPM) -- @@ -256,13 +258,15 @@ begin -- ------------------------------------------------------------------------------------------- neorv32_cpu_regfile_inst: entity neorv32.neorv32_cpu_regfile generic map ( - RVE_EN => CPU_EXTENSION_RISCV_E, -- implement embedded RF extension? 
+ RST_EN => REGFILE_HW_RST, -- enable dedicated hardware reset ("ASIC style") + RVE_EN => CPU_EXTENSION_RISCV_E, -- implement embedded RF extension RS3_EN => regfile_rs3_en_c, -- enable 3rd read port RS4_EN => regfile_rs4_en_c -- enable 4th read port ) port map ( -- global control -- clk_i => clk_i, -- global clock, rising edge + rstn_i => rstn_i, -- global reset, low-active, async ctrl_i => ctrl, -- main control bus -- data input -- alu_i => alu_res, -- ALU result diff --git a/rtl/core/neorv32_package.vhd b/rtl/core/neorv32_package.vhd index ec806e62c..e25dd4ed5 100644 --- a/rtl/core/neorv32_package.vhd +++ b/rtl/core/neorv32_package.vhd @@ -44,9 +44,6 @@ package neorv32_package is -- Architecture Configuration ------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - -- if register x0 is implemented as a *physical register* it has to be explicitly set to zero by the CPU hardware -- - constant reset_x0_c : boolean := true; -- has to be 'true' for the default register file rtl description (BRAM-based) - -- max response time for processor-internal bus transactions -- -- = cycles after which an *unacknowledged* internal bus access will timeout triggering a bus fault exception constant bus_timeout_c : natural := 15; -- default = 15 @@ -59,7 +56,7 @@ package neorv32_package is -- Architecture Constants ----------------------------------------------------------------- -- ------------------------------------------------------------------------------------------- - constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01090105"; -- hardware version + constant hw_version_c : std_ulogic_vector(31 downto 0) := x"01090106"; -- hardware version constant archid_c : natural := 19; -- official RISC-V architecture ID constant XLEN : natural := 32; -- native data path width, do not change! 
@@ -766,6 +763,7 @@ package neorv32_package is -- Tuning Options -- FAST_MUL_EN : boolean := false; FAST_SHIFT_EN : boolean := false; + REGFILE_HW_RST : boolean := false; -- Physical Memory Protection (PMP) -- PMP_NUM_REGIONS : natural range 0 to 16 := 0; PMP_MIN_GRANULARITY : natural := 4; diff --git a/rtl/core/neorv32_top.vhd b/rtl/core/neorv32_top.vhd index b13423963..4496f76c8 100644 --- a/rtl/core/neorv32_top.vhd +++ b/rtl/core/neorv32_top.vhd @@ -72,6 +72,7 @@ entity neorv32_top is -- Tuning Options -- FAST_MUL_EN : boolean := false; -- use DSPs for M extension's multiplier FAST_SHIFT_EN : boolean := false; -- use barrel shifter for shift operations + REGFILE_HW_RST : boolean := false; -- implement full hardware reset for register file -- Physical Memory Protection (PMP) -- PMP_NUM_REGIONS : natural range 0 to 16 := 0; -- number of regions (0..16) @@ -506,6 +507,7 @@ begin -- Tuning Options -- FAST_MUL_EN => FAST_MUL_EN, FAST_SHIFT_EN => FAST_SHIFT_EN, + REGFILE_HW_RST => REGFILE_HW_RST, -- Physical Memory Protection (PMP) -- PMP_NUM_REGIONS => PMP_NUM_REGIONS, PMP_MIN_GRANULARITY => PMP_MIN_GRANULARITY, From 4e0e6fe8a4727d6dd8ee0008ad910153d5652faf Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 25 Nov 2023 06:24:38 +0100 Subject: [PATCH 3/8] add "CSR_MXISA_RFHWRST" MXISA CSR bit --- docs/datasheet/cpu_csr.adoc | 3 ++- rtl/core/neorv32_cpu_control.vhd | 12 +++++++----- sw/lib/include/neorv32_cpu_csr.h | 1 + 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/datasheet/cpu_csr.adoc b/docs/datasheet/cpu_csr.adoc index 9953c6b96..eb91ee70b 100644 --- a/docs/datasheet/cpu_csr.adoc +++ b/docs/datasheet/cpu_csr.adoc @@ -1005,7 +1005,8 @@ discover ISA sub-extensions and CPU configuration options | 11 | `CSR_MXISA_SDTRIG` | r/- | <<_sdtrig_isa_extension>> available | 19:12 | - | r/- | hardwired to zero | 20 | `CSR_MXISA_IS_SIM` | r/- | set if CPU is being **simulated** (⚠️ not guaranteed) -| 31:21 | - 
| r/- | hardwired to zero +| 28:21 | - | r/- | hardwired to zero +| 29 | `CSR_MXISA_RFHWRST` | r/- | full hardware reset of register file available when set (`REGFILE_HW_RST`) | 30 | `CSR_MXISA_FASTMUL` | r/- | fast multiplication available when set (`FAST_MUL_EN`) | 31 | `CSR_MXISA_FASTSHIFT` | r/- | fast shifts available when set (`FAST_SHIFT_EN`) |======================= diff --git a/rtl/core/neorv32_cpu_control.vhd b/rtl/core/neorv32_cpu_control.vhd index 4045b0308..3736fb9af 100644 --- a/rtl/core/neorv32_cpu_control.vhd +++ b/rtl/core/neorv32_cpu_control.vhd @@ -75,6 +75,7 @@ entity neorv32_cpu_control is -- Tuning Options -- FAST_MUL_EN : boolean; -- use DSPs for M extension's multiplier FAST_SHIFT_EN : boolean; -- use barrel shifter for shift operations + REGFILE_HW_RST : boolean; -- implement full hardware reset for register file -- Physical memory protection (PMP) -- PMP_EN : boolean; -- physical memory protection enabled -- Hardware Performance Monitors (HPM) -- @@ -611,7 +612,7 @@ begin if (execute_engine.ir(instr_funct3_msb_c) = '0') then -- beq / bne execute_engine.branch_taken <= cmp_i(cmp_equal_c) xor execute_engine.ir(instr_funct3_lsb_c); else -- blt(u) / bge(u) - execute_engine.branch_taken <= cmp_i(cmp_less_c) xor execute_engine.ir(instr_funct3_lsb_c); + execute_engine.branch_taken <= cmp_i(cmp_less_c) xor execute_engine.ir(instr_funct3_lsb_c); end if; else -- unconditional branch execute_engine.branch_taken <= '1'; @@ -649,7 +650,7 @@ begin execute_engine.pc <= execute_engine.next_pc(XLEN-1 downto 1) & '0'; end if; - -- next PC: address of next logic instruction -- + -- next PC: address of next instruction -- case execute_engine.state is when TRAP_ENTER => -- starting trap environment @@ -1015,10 +1016,10 @@ begin when BRANCHED => -- delay cycle to wait for reset of pipeline front-end (instruction fetch) -- ------------------------------------------------------------ execute_engine.state_nxt <= DISPATCH; - -- house keeping: use this state to 
(re-)initialize the register file's x0/zero register -- - if (reset_x0_c = true) then -- if x0 is a "real" register that has to be initialized to zero + -- house keeping: use this state also to (re-)initialize the register file's x0/zero register -- + if (REGFILE_HW_RST = false) then -- x0 does not provide a dedicated hardware reset ctrl_nxt.rf_mux <= rf_mux_csr_c; -- this will return 0 since csr.re_nxt is zero - ctrl_nxt.rf_zero_we <= '1'; -- allow/force write access to x0 + ctrl_nxt.rf_zero_we <= '1'; -- force write access to x0 end if; when MEM_REQ => -- trigger memory request @@ -2094,6 +2095,7 @@ begin -- misc -- csr_rdata(20) <= bool_to_ulogic_f(is_simulation_c); -- is this a simulation? -- tuning options -- + csr_rdata(29) <= bool_to_ulogic_f(REGFILE_HW_RST); -- full hardware reset of register file csr_rdata(30) <= bool_to_ulogic_f(FAST_MUL_EN); -- DSP-based multiplication (M extensions only) csr_rdata(31) <= bool_to_ulogic_f(FAST_SHIFT_EN); -- parallel logic for shifts (barrel shifters) diff --git a/sw/lib/include/neorv32_cpu_csr.h b/sw/lib/include/neorv32_cpu_csr.h index 3f62bd52f..22139e031 100644 --- a/sw/lib/include/neorv32_cpu_csr.h +++ b/sw/lib/include/neorv32_cpu_csr.h @@ -386,6 +386,7 @@ enum NEORV32_CSR_XISA_enum { CSR_MXISA_IS_SIM = 20, /**< CPU mxisa CSR (20): this might be a simulation when set (r/-)*/ // Tuning options + CSR_MXISA_RFHWRST = 29, /**< CPU mxisa CSR (29): Register file has full hardware reset (r/-)*/ CSR_MXISA_FASTMUL = 30, /**< CPU mxisa CSR (30): DSP-based multiplication (M extensions only) (r/-)*/ CSR_MXISA_FASTSHIFT = 31 /**< CPU mxisa CSR (31): parallel logic for shifts (barrel shifters) (r/-)*/ }; From 53f508b190eb038edb827ca90a228f9bac5562e0 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 25 Nov 2023 06:25:08 +0100 Subject: [PATCH 4/8] [sim] test ASIC-style register file --- sim/neorv32_tb.vhd | 1 + sim/simple/neorv32_tb.simple.vhd | 1 + 2 files changed, 2 insertions(+) diff 
--git a/sim/neorv32_tb.vhd b/sim/neorv32_tb.vhd index b41746337..6ed9b6104 100644 --- a/sim/neorv32_tb.vhd +++ b/sim/neorv32_tb.vhd @@ -239,6 +239,7 @@ begin -- Extension Options -- FAST_MUL_EN => false, -- use DSPs for M extension's multiplier FAST_SHIFT_EN => false, -- use barrel shifter for shift operations + REGFILE_HW_RST => true, -- full hardware reset -- Physical Memory Protection (PMP) -- PMP_NUM_REGIONS => 5, -- number of regions (0..16) PMP_MIN_GRANULARITY => 4, -- minimal region granularity in bytes, has to be a power of 2, min 4 bytes diff --git a/sim/simple/neorv32_tb.simple.vhd b/sim/simple/neorv32_tb.simple.vhd index 733a4956d..ad1295d9c 100644 --- a/sim/simple/neorv32_tb.simple.vhd +++ b/sim/simple/neorv32_tb.simple.vhd @@ -182,6 +182,7 @@ begin -- Extension Options -- FAST_MUL_EN => true, -- use DSPs for M extension's multiplier FAST_SHIFT_EN => true, -- use barrel shifter for shift operations + REGFILE_HW_RST => false, -- no hardware reset -- Physical Memory Protection (PMP) -- PMP_NUM_REGIONS => 5, -- number of regions (0..16) PMP_MIN_GRANULARITY => 4, -- minimal region granularity in bytes, has to be a power of 2, min 4 bytes From ee2db1c0b9c85247ce90ea1ad8911cc2275c301d Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 25 Nov 2023 06:38:33 +0100 Subject: [PATCH 5/8] [changelog] add v1.9.1.6 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa5b0ffb4..7ec843cb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ mimpid = 0x01040312 -> Version 01.04.03.12 -> v1.4.3.12 | Date (*dd.mm.yyyy*) | Version | Comment | |:-------------------:|:-------:|:--------| +| 25.11.2023 | 1.9.1.6 | :sparkles: add option for "ASIC style" register file that provides a full/dedicated hardware reset; [#736](https://github.com/stnolting/neorv32/pull/736) | | 23.11.2023 | 1.9.1.5 | clean-up & rework CPU branch logic; 
[#735](https://github.com/stnolting/neorv32/pull/735) | | 21.11.2023 | 1.9.1.4 | :bug: fix bug in handling of "misaligned instruction exception"; [#734](https://github.com/stnolting/neorv32/pull/734) | | 20.11.2023 | 1.9.1.3 | :bug: fix wiring of FPU exception flags; [#733](https://github.com/stnolting/neorv32/pull/733) | From 02b780f39b4c8db998f179e94f6a515119dfe840 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 25 Nov 2023 06:38:59 +0100 Subject: [PATCH 6/8] [rte] add regfile reset tuning-option flag --- sw/lib/source/neorv32_rte.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sw/lib/source/neorv32_rte.c b/sw/lib/source/neorv32_rte.c index d15d39b16..cade7dcac 100644 --- a/sw/lib/source/neorv32_rte.c +++ b/sw/lib/source/neorv32_rte.c @@ -481,12 +481,9 @@ void neorv32_rte_print_hw_config(void) { // CPU tuning options neorv32_uart0_printf("\nTuning options: "); - if (tmp & (1< Date: Sat, 25 Nov 2023 06:43:01 +0100 Subject: [PATCH 7/8] [docs] update processor generics list --- docs/datasheet/soc.adoc | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/docs/datasheet/soc.adoc b/docs/datasheet/soc.adoc index c9c34394e..2f73abc84 100644 --- a/docs/datasheet/soc.adoc +++ b/docs/datasheet/soc.adoc @@ -183,12 +183,6 @@ If optional modules (like CPU extensions or peripheral devices) are not enabled will not be synthesized at all. Hence, the disabled modules do not increase area and power requirements and do not impact timing. -.Configuration Check -[NOTE] -Not all configuration combinations are valid. The processor RTL code provides sanity checks to inform the user -during synthesis/simulation if an invalid combination has been detected. It is recommended to run a quick simulation -using the provided simulation/GHDL scripts to verify the configuration of the processor generics. 
- .Table Abbreviations [NOTE] The generic type "`suv(x:y)`" is an abbreviation for "`std_ulogic_vector(x downto y)`". @@ -219,9 +213,10 @@ The generic type "`suv(x:y)`" is an abbreviation for "`std_ulogic_vector(x downt | `CPU_EXTENSION_RISCV_Zihpm` | boolean | false | Enable <<_zihpm_isa_extension>> (hardware performance monitors). | `CPU_EXTENSION_RISCV_Zmmul` | boolean | false | Enable <<_zmmul_isa_extension>> (hardware-based integer multiplication). | `CPU_EXTENSION_RISCV_Zxcfu` | boolean | false | Enable NEORV32-specific <<_zxcfu_isa_extension>> (custom RISC-V instructions). -4+^| **CPU Tuning Options** -| `FAST_MUL_EN` | boolean | false | Implement fast (but large) full-parallel multipliers (trying to infer DSP blocks). -| `FAST_SHIFT_EN` | boolean | false | Implement fast (but large) full-parallel barrel shifters. +4+^| **CPU <<_architecture>> Tuning Options** +| `FAST_MUL_EN` | boolean | false | Implement fast but large full-parallel multipliers (trying to infer DSP blocks); see section <<_cpu_arithmetic_logic_unit>>. +| `FAST_SHIFT_EN` | boolean | false | Implement fast but large full-parallel barrel shifters; see section <<_cpu_arithmetic_logic_unit>>. +| `REGFILE_HW_RST` | boolean | false | Implement full hardware reset for register file (prevent inferring of BRAM); see section <<_cpu_register_file>>. 4+^| **Physical Memory Protection (<<_pmp_isa_extension>>)** | `PMP_NUM_REGIONS` | natural | 0 | Number of implemented PMP regions (0..16). | `PMP_MIN_GRANULARITY` | natural | 4 | Minimal region granularity in bytes. Has to be a power of two, min 4. 
From 0ab0191d4857e3d97163b411c6655d25914c1b80 Mon Sep 17 00:00:00 2001 From: stnolting <22944758+stnolting@users.noreply.github.com> Date: Sat, 25 Nov 2023 07:09:31 +0100 Subject: [PATCH 8/8] [docs] update CPU tuning options --- docs/datasheet/cpu.adoc | 87 +++++++++++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/docs/datasheet/cpu.adoc b/docs/datasheet/cpu.adoc index c83068cd9..6eaba0b49 100644 --- a/docs/datasheet/cpu.adoc +++ b/docs/datasheet/cpu.adoc @@ -87,44 +87,66 @@ caused by speculative execution (like Spectre or Meltdown). :sectnums: ==== CPU Register File -The data register file contains the general purpose "`x`" architecture registers. For the `rv32i` ISA there are 32 32-bit registers -and for the `rv32e` ISA there are 16 32-bit registers. Register zero (`x0`/`zero`) always read as zero and any write access to it -is discarded. +The data register file contains the general purpose architecture registers `x0` to `x31`. For the `rv32e` ISA only the lower +16 registers are implemented. Register zero (`x0`/`zero`) always reads as zero and any write access to it has no effect. +Up to four individual synchronous read ports allow fetching up to four register operands at once. The write and read accesses +are mutually exclusive as they happen in separate cycles. Hence, there is no need to consider things like "read-during-write" +behavior. -The register file is implemented as synchronous memory with synchronous read and write accesses. Register `zero` is also mapped to -a _physical memory location_ in the register file. By this, there is no need to add a further multiplexer to "insert" zero if reading -from register `zero` reducing logic requirements and shortening the critical path. Furthermore, the whole register file can be mapped -entirely to FPGA block RAM. +The register file provides two different implementation options configured via the top's `REGFILE_HW_RST` generic. 
-The memory of the register file uses two access ports: a read-only port for reading register `rs2` (second source operand) and a -read/write port for reading register `rs1` (first source operand) or for writing processing results to register `rd` (destination register). -Hence, a simple dual-port RAM can be used to implement the entire register file. From a functional point of view, read and write accesses to -the register file do never occur in the same clock cycle, so no bypass logic is required at all. +* `REGFILE_HW_RST = false` (default): In this configuration the register file is implemented as a plain memory array without a +dedicated hardware reset. This architecture allows inferring FPGA block RAM for the entire register file resulting in minimal +logic utilization and optimal timing. +* `REGFILE_HW_RST = true`: This configuration is based on individual FFs that do provide a dedicated hardware reset. +Hence, the register file cannot be mapped to FPGA block RAM. This option should only be selected if the application requires a +reset of the register file (e.g. for security reasons) or if the design shall be synthesized for an **ASIC** implementation. -.Register File Reset -[IMPORTANT] -The CPU register file does **not** provide any reset capabilities (in order to allow mapping to block RAM). -Hence, all integer registers (`x1` to `x15`/`x31`) have unknown values after a hardware reset and can still contain -sensitive data like encryption keys. +The state of this configuration generic can be checked by software via the <<_mxisa>> CSR. + +.FPGA Implementation +[WARNING] +Enabling the `REGFILE_HW_RST` option for FPGA implementation is not recommended as this will massively increase the amount +of required logic resources. + +.Implementation of the `zero` Register within FPGA Block RAM +[NOTE] +Register `zero` is also mapped to a _physical memory location_ within the register file's block RAM. 
By this, there is no need +to add a further multiplexer to "insert" zero if reading from register `zero` reducing logic requirements and shortening the +critical path. However, this also requires that the physical storage bits of register `zero` are explicitly initialized (set +to zero) by the hardware. This is done transparently by the CPU control requiring no additional processing overhead. + +.Block RAM Ports +[NOTE] +The default register file configuration uses two access ports: a read-only port for reading register `rs2` (second source operand) +and a read/write port for reading register `rs1` (first source operand) and for writing processing results to register `rd` +(destination register). Hence, a simple dual-port RAM can be used to implement the entire register file. From a functional point +of view, read and write accesses to the register file do never occur in the same clock cycle, so no bypass logic is required at all. :sectnums: ==== CPU Arithmetic Logic Unit -The arithmetic/logic unit (ALU) is used for processing data from the register file and also for memory and branch address computations. -All simple <<_i_isa_extension>> processing operations (`add`, `and`, ...) are implemented as combinatorial logic requiring only a single cycle to -complete. More sophisticated instructions (shift operations from the base ISA and all further ISA extensions) are processed by so-called -"ALU co-processors". +The arithmetic/logic unit (ALU) is used for actual data processing as well as generating memory and branch addresses. +All "simple" <<_i_isa_extension>> computational instructions (like `add` and `or`) are implemented as plain combinatorial logic +requiring only a single cycle to complete. More sophisticated instructions like shift operations or multiplications are processed +by so-called "ALU co-processors". -The co-processors are implemented as iterative units that require several cycles to complete processing. 
Besides the base ISA's shift instructions, -the co-processors are used to implement all further processing-based ISA extensions (e.g. <<_m_isa_extension>> and -<<_b_isa_extension>>). +The co-processors are implemented as iterative units that require several cycles to complete processing. Besides the base ISA's +shift instructions, the co-processors are used to implement all further processing-based ISA extensions (e.g. <<_m_isa_extension>> +and <<_b_isa_extension>>). .Multi-Cycle Execution Monitor [NOTE] The CPU control will raise an illegal instruction exception if a multi-cycle functional unit (like the <<_custom_functions_unit_cfu>>) does not complete processing in a bound amount of time (configured via the package's `monitor_mc_tmo_c` constant; default = 512 clock cycles). +.Tuning Options +[TIP] +The ALU architecture can be tuned for an application-specific area-vs-performance trade-off. The `FAST_MUL_EN` and `FAST_SHIFT_EN` +generics can be used to implement performance-optimized DSP-block multipliers and barrel shifters, respectively. See sections <<_i_isa_extension>>, +<<_b_isa_extension>> and <<_m_isa_extension>> for specific examples. + :sectnums: ==== CPU Bus Unit @@ -494,6 +516,11 @@ the following sub-extensions: | Carry-less multiply | `clmul` `clmulh` `clmulr` | 36 |======================= +.Barrel Shifter +[TIP] +Shift operations can be accelerated (at the cost of additional logic resources) by enabling the `FAST_SHIFT_EN` +configuration option that will replace the (time-variant) bit-serial shifter by a (time-constant) barrel shifter. + ==== `C` ISA Extension @@ -561,6 +588,11 @@ will clear/flush the data cache and resynchronize it with main memory. The `wfi` instruction is used to enter <<_sleep_mode>>. Executing the `wfi` instruction in user-mode will raise an illegal instruction exception if the `TW` bit of <<_mstatus>> is set. 
+.Barrel Shifter +[TIP] +Shift operations can be accelerated (at the cost of additional logic resources) by enabling the `FAST_SHIFT_EN` +configuration option that will replace the (time-variant) bit-serial shifter by a (time-constant) barrel shifter. + ==== `M` ISA Extension @@ -575,6 +607,11 @@ Hardware-accelerated integer multiplication and division operations are availabl | Division | `div` `divu` `rem` `remu` | 36 |======================= +.DSP Blocks +[TIP] +Multiplication operations can be accelerated (at the cost of additional logic resources) by enabling the `FAST_MUL_EN` +configuration option that will replace the (time-variant) bit-serial multiplier by (time-constant) FPGA DSP blocks. + ==== `U` ISA Extension @@ -852,10 +889,10 @@ defined by the NEORV32 core library (the runtime environment _RTE_) and can be u with the pre-defined RTE function. The <<_mcause>>, <<_mepc>>, <<_mtval>> and <<_mtinst>> columns show the value being written to the according CSRs when a trap is triggered: -* **I-PC** - address of interrupted instruction (instruction has _not_ been executed yet) +* **I-PC** - address of intercepted instruction (instruction has _not_ been executed yet) * **PC** - address of instruction that caused the trap (instruction has been executed) * **ADR** - bad data memory access address that caused the trap -* **INS** - the (decompressed) instruction word that caused the trap +* **INS** - the transformed/decompressed instruction word that caused the trap * **0** - zero .NEORV32 Trap Listing